// rumdl_lib/lint_context.rs

use crate::config::MarkdownFlavor;
use crate::rules::front_matter_utils::FrontMatterUtils;
use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
use crate::utils::element_cache::ElementCache;
use crate::utils::regex_cache::URL_SIMPLE_REGEX;
use pulldown_cmark::{BrokenLink, Event, LinkType, Options, Parser, Tag, TagEnd};
use regex::Regex;
use std::borrow::Cow;
use std::collections::HashMap;
use std::path::PathBuf;
use std::sync::LazyLock;
12
/// Macro for profiling sections - only active in non-WASM builds.
///
/// Evaluates `$code` and yields its value. When `$profile` is true, the
/// elapsed wall-clock time is printed to stderr as `[PROFILE] <name>: <duration>`.
#[cfg(not(target_arch = "wasm32"))]
macro_rules! profile_section {
    ($name:expr, $profile:expr, $code:expr) => {{
        // The clock is started unconditionally; only the report is gated.
        let start = std::time::Instant::now();
        let result = $code;
        if $profile {
            eprintln!("[PROFILE] {}: {:?}", $name, start.elapsed());
        }
        result
    }};
}
25
/// WASM variant of `profile_section!`: no timing, the code is simply evaluated.
#[cfg(target_arch = "wasm32")]
macro_rules! profile_section {
    ($name:expr, $profile:expr, $code:expr) => {{
        // Reference the flag so callers do not get an unused-variable warning.
        let _ = &$profile;
        $code
    }};
}
30
31// Comprehensive link pattern that captures both inline and reference links
32// Use (?s) flag to make . match newlines
33static LINK_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
34    Regex::new(
35        r#"(?sx)
36        \[((?:[^\[\]\\]|\\.)*)\]          # Link text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
37        (?:
38            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
39            |
40            \[([^\]]*)\]      # Reference ID in group 6
41        )"#
42    ).unwrap()
43});
44
45// Image pattern (similar to links but with ! prefix)
46// Use (?s) flag to make . match newlines
47static IMAGE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
48    Regex::new(
49        r#"(?sx)
50        !\[((?:[^\[\]\\]|\\.)*)\]         # Alt text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
51        (?:
52            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
53            |
54            \[([^\]]*)\]      # Reference ID in group 6
55        )"#
56    ).unwrap()
57});
58
59// Reference definition pattern
60static REF_DEF_PATTERN: LazyLock<Regex> =
61    LazyLock::new(|| Regex::new(r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#).unwrap());
62
// Bare URLs have no pattern of their own here: they use the centralized URL pattern from regex_cache
64
65// Pattern for email addresses
66static BARE_EMAIL_PATTERN: LazyLock<Regex> =
67    LazyLock::new(|| Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap());
68
69// Pattern for blockquote prefix in parse_list_blocks
70static BLOCKQUOTE_PREFIX_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*>+\s*)").unwrap());
71
72/// Pre-computed information about a line
73#[derive(Debug, Clone)]
74pub struct LineInfo {
75    /// Byte offset where this line starts in the document
76    pub byte_offset: usize,
77    /// Length of the line in bytes (without newline)
78    pub byte_len: usize,
79    /// Number of bytes of leading whitespace (for substring extraction)
80    pub indent: usize,
81    /// Visual column width of leading whitespace (with proper tab expansion)
82    /// Per CommonMark, tabs expand to the next column that is a multiple of 4.
83    /// Use this for numeric comparisons like checking for indented code blocks (>= 4).
84    pub visual_indent: usize,
85    /// Whether the line is blank (empty or only whitespace)
86    pub is_blank: bool,
87    /// Whether this line is inside a code block
88    pub in_code_block: bool,
89    /// Whether this line is inside front matter
90    pub in_front_matter: bool,
91    /// Whether this line is inside an HTML block
92    pub in_html_block: bool,
93    /// Whether this line is inside an HTML comment
94    pub in_html_comment: bool,
95    /// List item information if this line starts a list item
96    pub list_item: Option<ListItemInfo>,
97    /// Heading information if this line is a heading
98    pub heading: Option<HeadingInfo>,
99    /// Blockquote information if this line is a blockquote
100    pub blockquote: Option<BlockquoteInfo>,
101    /// Whether this line is inside a mkdocstrings autodoc block
102    pub in_mkdocstrings: bool,
103    /// Whether this line is part of an ESM import/export block (MDX only)
104    pub in_esm_block: bool,
105    /// Whether this line is a continuation of a multi-line code span from a previous line
106    pub in_code_span_continuation: bool,
107    /// Whether this line is a horizontal rule (---, ***, ___, etc.)
108    /// Pre-computed for consistent detection across all rules
109    pub is_horizontal_rule: bool,
110    /// Whether this line is inside a math block ($$ ... $$)
111    pub in_math_block: bool,
112}
113
114impl LineInfo {
115    /// Get the line content as a string slice from the source document
116    pub fn content<'a>(&self, source: &'a str) -> &'a str {
117        &source[self.byte_offset..self.byte_offset + self.byte_len]
118    }
119}
120
/// Information about a list item
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// The marker used (*, -, +, or number with . or ))
    pub marker: String,
    /// Whether it's ordered (true) or unordered (false)
    pub is_ordered: bool,
    /// The number for ordered lists (`None` otherwise)
    pub number: Option<usize>,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where content after marker starts
    pub content_column: usize,
}
135
/// Heading style type
///
/// Equality is total for this fieldless enum, so `Eq` is derived alongside
/// `PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum HeadingStyle {
    /// ATX style heading (# Heading)
    ATX,
    /// Setext style heading with = underline
    Setext1,
    /// Setext style heading with - underline
    Setext2,
}
146
147/// Parsed link information
148#[derive(Debug, Clone)]
149pub struct ParsedLink<'a> {
150    /// Line number (1-indexed)
151    pub line: usize,
152    /// Start column (0-indexed) in the line
153    pub start_col: usize,
154    /// End column (0-indexed) in the line
155    pub end_col: usize,
156    /// Byte offset in document
157    pub byte_offset: usize,
158    /// End byte offset in document
159    pub byte_end: usize,
160    /// Link text
161    pub text: Cow<'a, str>,
162    /// Link URL or reference
163    pub url: Cow<'a, str>,
164    /// Whether this is a reference link [text][ref] vs inline [text](url)
165    pub is_reference: bool,
166    /// Reference ID for reference links
167    pub reference_id: Option<Cow<'a, str>>,
168    /// Link type from pulldown-cmark
169    pub link_type: LinkType,
170}
171
/// Information about a broken link reported by pulldown-cmark
#[derive(Debug, Clone)]
pub struct BrokenLinkInfo {
    /// The reference text that couldn't be resolved
    pub reference: String,
    /// Byte span in the source document
    pub span: std::ops::Range<usize>,
}
180
/// Parsed footnote reference (e.g., `[^1]`, `[^note]`)
#[derive(Debug, Clone)]
pub struct FootnoteRef {
    /// The footnote ID (without the ^ prefix)
    pub id: String,
    /// Line number (1-indexed)
    pub line: usize,
    /// Start byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
}
193
194/// Parsed image information
195#[derive(Debug, Clone)]
196pub struct ParsedImage<'a> {
197    /// Line number (1-indexed)
198    pub line: usize,
199    /// Start column (0-indexed) in the line
200    pub start_col: usize,
201    /// End column (0-indexed) in the line
202    pub end_col: usize,
203    /// Byte offset in document
204    pub byte_offset: usize,
205    /// End byte offset in document
206    pub byte_end: usize,
207    /// Alt text
208    pub alt_text: Cow<'a, str>,
209    /// Image URL or reference
210    pub url: Cow<'a, str>,
211    /// Whether this is a reference image ![alt][ref] vs inline ![alt](url)
212    pub is_reference: bool,
213    /// Reference ID for reference images
214    pub reference_id: Option<Cow<'a, str>>,
215    /// Link type from pulldown-cmark
216    pub link_type: LinkType,
217}
218
/// Reference definition [ref]: url "title"
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// Line number (1-indexed)
    pub line: usize,
    /// Reference ID (normalized to lowercase)
    pub id: String,
    /// URL
    pub url: String,
    /// Optional title
    pub title: Option<String>,
    /// Byte offset where the reference definition starts
    pub byte_offset: usize,
    /// Byte offset where the reference definition ends
    pub byte_end: usize,
    /// Byte offset where the title starts (if present, includes quote)
    pub title_byte_start: Option<usize>,
    /// Byte offset where the title ends (if present, includes quote)
    pub title_byte_end: Option<usize>,
}
239
/// Parsed code span information
///
/// A span may cover several lines, hence the separate `line` and `end_line`.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// Line number where the code span starts (1-indexed)
    pub line: usize,
    /// Line number where the code span ends (1-indexed)
    pub end_line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Number of backticks used (1, 2, 3, etc.)
    pub backtick_count: usize,
    /// Content inside the code span (without backticks)
    pub content: String,
}
260
/// Parsed math span information (inline $...$ or display $$...$$)
#[derive(Debug, Clone)]
pub struct MathSpan {
    /// Line number where the math span starts (1-indexed)
    pub line: usize,
    /// Line number where the math span ends (1-indexed)
    pub end_line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Whether this is display math ($$...$$) vs inline ($...$)
    pub is_display: bool,
    /// Content inside the math delimiters
    pub content: String,
}
281
282/// Information about a heading
283#[derive(Debug, Clone)]
284pub struct HeadingInfo {
285    /// Heading level (1-6 for ATX, 1-2 for Setext)
286    pub level: u8,
287    /// Style of heading
288    pub style: HeadingStyle,
289    /// The heading marker (# characters or underline)
290    pub marker: String,
291    /// Column where the marker starts (0-based)
292    pub marker_column: usize,
293    /// Column where heading text starts
294    pub content_column: usize,
295    /// The heading text (without markers and without custom ID syntax)
296    pub text: String,
297    /// Custom header ID if present (e.g., from {#custom-id} syntax)
298    pub custom_id: Option<String>,
299    /// Original heading text including custom ID syntax
300    pub raw_text: String,
301    /// Whether it has a closing sequence (for ATX)
302    pub has_closing_sequence: bool,
303    /// The closing sequence if present
304    pub closing_sequence: String,
305    /// Whether this is a valid CommonMark heading (ATX headings require space after #)
306    /// False for malformed headings like `#NoSpace` that MD018 should flag
307    pub is_valid: bool,
308}
309
310/// A valid heading from a filtered iteration
311///
312/// Only includes headings that are CommonMark-compliant (have space after #).
313/// Hashtag-like patterns (`#tag`, `#123`) are excluded.
314#[derive(Debug, Clone)]
315pub struct ValidHeading<'a> {
316    /// The 1-indexed line number in the document
317    pub line_num: usize,
318    /// Reference to the heading information
319    pub heading: &'a HeadingInfo,
320    /// Reference to the full line info (for rules that need additional context)
321    pub line_info: &'a LineInfo,
322}
323
324/// Iterator over valid CommonMark headings in a document
325///
326/// Filters out malformed headings like `#NoSpace` that should be flagged by MD018
327/// but should not be processed by other heading rules.
328pub struct ValidHeadingsIter<'a> {
329    lines: &'a [LineInfo],
330    current_index: usize,
331}
332
333impl<'a> ValidHeadingsIter<'a> {
334    fn new(lines: &'a [LineInfo]) -> Self {
335        Self {
336            lines,
337            current_index: 0,
338        }
339    }
340}
341
342impl<'a> Iterator for ValidHeadingsIter<'a> {
343    type Item = ValidHeading<'a>;
344
345    fn next(&mut self) -> Option<Self::Item> {
346        while self.current_index < self.lines.len() {
347            let idx = self.current_index;
348            self.current_index += 1;
349
350            let line_info = &self.lines[idx];
351            if let Some(heading) = &line_info.heading
352                && heading.is_valid
353            {
354                return Some(ValidHeading {
355                    line_num: idx + 1, // Convert 0-indexed to 1-indexed
356                    heading,
357                    line_info,
358                });
359            }
360        }
361        None
362    }
363}
364
/// Information about a blockquote line
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Nesting level (1 for >, 2 for >>, etc.)
    pub nesting_level: usize,
    /// The indentation before the blockquote marker
    pub indent: String,
    /// Column where the first > starts (0-based)
    pub marker_column: usize,
    /// The blockquote prefix (e.g., "> ", ">> ", etc.)
    pub prefix: String,
    /// Content after the blockquote marker(s)
    pub content: String,
    /// Whether the line has no space after the marker
    pub has_no_space_after_marker: bool,
    /// Whether the line has multiple spaces after the marker
    pub has_multiple_spaces_after_marker: bool,
    /// Whether this is an empty blockquote line needing MD028 fix
    pub needs_md028_fix: bool,
}
385
/// Information about a list block
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// Line number where the list starts (1-indexed)
    pub start_line: usize,
    /// Line number where the list ends (1-indexed)
    pub end_line: usize,
    /// Whether it's ordered or unordered
    pub is_ordered: bool,
    /// The consistent marker for unordered lists (if any)
    pub marker: Option<String>,
    /// Blockquote prefix for this list (empty if not in blockquote)
    pub blockquote_prefix: String,
    /// Lines that are list items within this block
    pub item_lines: Vec<usize>,
    /// Nesting level (0 for top-level lists)
    pub nesting_level: usize,
    /// Maximum marker width seen in this block (e.g., 3 for "1. ", 4 for "10. ")
    pub max_marker_width: usize,
}
406
use std::sync::{Arc, OnceLock};

/// Map from line byte offset to list item data: (is_ordered, marker, marker_column, content_column, number)
type ListItemMap = std::collections::HashMap<usize, (bool, String, usize, usize, Option<usize>)>;
411
/// Character frequency data for fast content analysis
///
/// `Default` yields all counts at zero.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Count of # characters (headings)
    pub hash_count: usize,
    /// Count of * characters (emphasis, lists, horizontal rules)
    pub asterisk_count: usize,
    /// Count of _ characters (emphasis, horizontal rules)
    pub underscore_count: usize,
    /// Count of - characters (lists, horizontal rules, setext headings)
    pub hyphen_count: usize,
    /// Count of + characters (lists)
    pub plus_count: usize,
    /// Count of > characters (blockquotes)
    pub gt_count: usize,
    /// Count of | characters (tables)
    pub pipe_count: usize,
    /// Count of [ characters (links, images)
    pub bracket_count: usize,
    /// Count of ` characters (code spans, code blocks)
    pub backtick_count: usize,
    /// Count of < characters (HTML tags, autolinks)
    pub lt_count: usize,
    /// Count of ! characters (images)
    pub exclamation_count: usize,
    /// Count of newline characters
    pub newline_count: usize,
}
440
/// Pre-parsed HTML tag information
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Tag name (e.g., "div", "img", "br")
    pub tag_name: String,
    /// Whether it's a closing tag (`</tag>`)
    pub is_closing: bool,
    /// Whether it's self-closing (`<tag />`)
    pub is_self_closing: bool,
    /// Raw tag content
    pub raw_content: String,
}
463
/// Pre-parsed emphasis span information
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Type of emphasis ('*' or '_')
    pub marker: char,
    /// Number of markers (1 for italic, 2 for bold, 3+ for bold+italic)
    pub marker_count: usize,
    /// Content inside the emphasis
    pub content: String,
}
484
/// Pre-parsed table row information
#[derive(Debug, Clone)]
pub struct TableRow {
    /// Line number (1-indexed)
    pub line: usize,
    /// Whether this is a separator row (contains only |, -, :, and spaces)
    pub is_separator: bool,
    /// Number of columns (pipe-separated cells)
    pub column_count: usize,
    /// Alignment info from separator row; each entry is one of
    /// "left", "center", "right", "none"
    pub column_alignments: Vec<String>,
}
497
/// Pre-parsed bare URL information (not in links)
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// The URL string
    pub url: String,
    /// Type of URL ("http", "https", "ftp", "email")
    pub url_type: String,
}
516
517pub struct LintContext<'a> {
518    pub content: &'a str,
519    pub line_offsets: Vec<usize>,
520    pub code_blocks: Vec<(usize, usize)>, // Cached code block ranges (not including inline code spans)
521    pub lines: Vec<LineInfo>,             // Pre-computed line information
522    pub links: Vec<ParsedLink<'a>>,       // Pre-parsed links
523    pub images: Vec<ParsedImage<'a>>,     // Pre-parsed images
524    pub broken_links: Vec<BrokenLinkInfo>, // Broken/undefined references
525    pub footnote_refs: Vec<FootnoteRef>,  // Pre-parsed footnote references
526    pub reference_defs: Vec<ReferenceDef>, // Reference definitions
527    reference_defs_map: HashMap<String, usize>, // O(1) lookup by lowercase ID -> index in reference_defs
528    code_spans_cache: OnceLock<Arc<Vec<CodeSpan>>>, // Lazy-loaded inline code spans
529    math_spans_cache: OnceLock<Arc<Vec<MathSpan>>>, // Lazy-loaded math spans ($...$ and $$...$$)
530    pub list_blocks: Vec<ListBlock>,      // Pre-parsed list blocks
531    pub char_frequency: CharFrequency,    // Character frequency analysis
532    html_tags_cache: OnceLock<Arc<Vec<HtmlTag>>>, // Lazy-loaded HTML tags
533    emphasis_spans_cache: OnceLock<Arc<Vec<EmphasisSpan>>>, // Lazy-loaded emphasis spans
534    table_rows_cache: OnceLock<Arc<Vec<TableRow>>>, // Lazy-loaded table rows
535    bare_urls_cache: OnceLock<Arc<Vec<BareUrl>>>, // Lazy-loaded bare URLs
536    has_mixed_list_nesting_cache: OnceLock<bool>, // Cached result for mixed ordered/unordered list nesting detection
537    html_comment_ranges: Vec<crate::utils::skip_context::ByteRange>, // Pre-computed HTML comment ranges
538    pub table_blocks: Vec<crate::utils::table_utils::TableBlock>, // Pre-computed table blocks
539    pub line_index: crate::utils::range_utils::LineIndex<'a>, // Pre-computed line index for byte position calculations
540    jinja_ranges: Vec<(usize, usize)>,    // Pre-computed Jinja template ranges ({{ }}, {% %})
541    pub flavor: MarkdownFlavor,           // Markdown flavor being used
542    pub source_file: Option<PathBuf>,     // Source file path (for rules that need file context)
543}
544
/// Detailed blockquote parse result with all components
///
/// The four slices are consecutive and together cover the whole input line.
struct BlockquoteComponents<'a> {
    /// Leading spaces/tabs before the first `>`
    indent: &'a str,
    /// The run of consecutive `>` characters
    markers: &'a str,
    /// Spaces/tabs directly after the markers
    spaces_after: &'a str,
    /// Everything after `spaces_after`
    content: &'a str,
}

/// Parse blockquote prefix with detailed components using manual parsing
///
/// Returns `None` when the first character after any leading spaces/tabs is
/// not `>` (including empty and whitespace-only lines). Only a single
/// contiguous run of `>` is consumed, so for `"> > x"` the second `>` ends up
/// in `content`.
#[inline]
fn parse_blockquote_detailed(line: &str) -> Option<BlockquoteComponents<'_>> {
    let bytes = line.as_bytes();
    let is_blank = |b: u8| b == b' ' || b == b'\t';

    // Leading whitespace (indent)
    let indent_end = bytes.iter().position(|&b| !is_blank(b)).unwrap_or(bytes.len());

    // Must have at least one '>' marker
    if bytes.get(indent_end) != Some(&b'>') {
        return None;
    }

    // Run of '>' markers
    let markers_end = bytes[indent_end..]
        .iter()
        .position(|&b| b != b'>')
        .map_or(bytes.len(), |n| indent_end + n);

    // Spaces/tabs after the markers
    let spaces_end = bytes[markers_end..]
        .iter()
        .position(|&b| !is_blank(b))
        .map_or(bytes.len(), |n| markers_end + n);

    // Every split point follows only ASCII bytes, so each is a char boundary
    // and the slicing below cannot panic.
    Some(BlockquoteComponents {
        indent: &line[..indent_end],
        markers: &line[indent_end..markers_end],
        spaces_after: &line[markers_end..spaces_end],
        content: &line[spaces_end..],
    })
}
589
590impl<'a> LintContext<'a> {
591    pub fn new(content: &'a str, flavor: MarkdownFlavor, source_file: Option<PathBuf>) -> Self {
592        #[cfg(not(target_arch = "wasm32"))]
593        let profile = std::env::var("RUMDL_PROFILE_QUADRATIC").is_ok();
594        #[cfg(target_arch = "wasm32")]
595        let profile = false;
596
597        let line_offsets = profile_section!("Line offsets", profile, {
598            let mut offsets = vec![0];
599            for (i, c) in content.char_indices() {
600                if c == '\n' {
601                    offsets.push(i + 1);
602                }
603            }
604            offsets
605        });
606
607        // Detect code blocks once and cache them
608        let code_blocks = profile_section!("Code blocks", profile, CodeBlockUtils::detect_code_blocks(content));
609
610        // Pre-compute HTML comment ranges ONCE for all operations
611        let html_comment_ranges = profile_section!(
612            "HTML comment ranges",
613            profile,
614            crate::utils::skip_context::compute_html_comment_ranges(content)
615        );
616
617        // Pre-compute autodoc block ranges for MkDocs flavor (avoids O(n²) scaling)
618        let autodoc_ranges = profile_section!("Autodoc block ranges", profile, {
619            if flavor == MarkdownFlavor::MkDocs {
620                crate::utils::mkdocstrings_refs::detect_autodoc_block_ranges(content)
621            } else {
622                Vec::new()
623            }
624        });
625
626        // Pre-compute line information AND emphasis spans (without headings/blockquotes yet)
627        // Emphasis spans are captured during the same pulldown-cmark parse as list detection
628        let (mut lines, emphasis_spans) = profile_section!(
629            "Basic line info",
630            profile,
631            Self::compute_basic_line_info(
632                content,
633                &line_offsets,
634                &code_blocks,
635                flavor,
636                &html_comment_ranges,
637                &autodoc_ranges,
638            )
639        );
640
641        // Detect HTML blocks BEFORE heading detection
642        profile_section!("HTML blocks", profile, Self::detect_html_blocks(content, &mut lines));
643
644        // Detect ESM import/export blocks in MDX files BEFORE heading detection
645        profile_section!(
646            "ESM blocks",
647            profile,
648            Self::detect_esm_blocks(content, &mut lines, flavor)
649        );
650
651        // Collect link byte ranges early for heading detection (to skip lines inside link syntax)
652        let link_byte_ranges = profile_section!("Link byte ranges", profile, Self::collect_link_byte_ranges(content));
653
654        // Now detect headings and blockquotes
655        profile_section!(
656            "Headings & blockquotes",
657            profile,
658            Self::detect_headings_and_blockquotes(content, &mut lines, flavor, &html_comment_ranges, &link_byte_ranges)
659        );
660
661        // Parse code spans early so we can exclude them from link/image parsing
662        let code_spans = profile_section!("Code spans", profile, Self::parse_code_spans(content, &lines));
663
664        // Mark lines that are continuations of multi-line code spans
665        // This is needed for parse_list_blocks to correctly handle list items with multi-line code spans
666        for span in &code_spans {
667            if span.end_line > span.line {
668                // Mark lines after the first line as continuations
669                for line_num in (span.line + 1)..=span.end_line {
670                    if let Some(line_info) = lines.get_mut(line_num - 1) {
671                        line_info.in_code_span_continuation = true;
672                    }
673                }
674            }
675        }
676
677        // Parse links, images, references, and list blocks
678        let (links, broken_links, footnote_refs) = profile_section!(
679            "Links",
680            profile,
681            Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor, &html_comment_ranges)
682        );
683
684        let images = profile_section!(
685            "Images",
686            profile,
687            Self::parse_images(content, &lines, &code_blocks, &code_spans, &html_comment_ranges)
688        );
689
690        let reference_defs = profile_section!("Reference defs", profile, Self::parse_reference_defs(content, &lines));
691
692        // Build O(1) lookup map for reference definitions by lowercase ID
693        let reference_defs_map: HashMap<String, usize> = reference_defs
694            .iter()
695            .enumerate()
696            .map(|(idx, def)| (def.id.to_lowercase(), idx))
697            .collect();
698
699        let list_blocks = profile_section!("List blocks", profile, Self::parse_list_blocks(content, &lines));
700
701        // Compute character frequency for fast content analysis
702        let char_frequency = profile_section!("Char frequency", profile, Self::compute_char_frequency(content));
703
704        // Pre-compute table blocks for rules that need them (MD013, MD055, MD056, MD058, MD060)
705        let table_blocks = profile_section!(
706            "Table blocks",
707            profile,
708            crate::utils::table_utils::TableUtils::find_table_blocks_with_code_info(
709                content,
710                &code_blocks,
711                &code_spans,
712                &html_comment_ranges,
713            )
714        );
715
716        // Pre-compute LineIndex once for all rules (eliminates 46x content cloning)
717        let line_index = profile_section!(
718            "Line index",
719            profile,
720            crate::utils::range_utils::LineIndex::new(content)
721        );
722
723        // Pre-compute Jinja template ranges once for all rules (eliminates O(n×m) in MD011)
724        let jinja_ranges = profile_section!(
725            "Jinja ranges",
726            profile,
727            crate::utils::jinja_utils::find_jinja_ranges(content)
728        );
729
730        Self {
731            content,
732            line_offsets,
733            code_blocks,
734            lines,
735            links,
736            images,
737            broken_links,
738            footnote_refs,
739            reference_defs,
740            reference_defs_map,
741            code_spans_cache: OnceLock::from(Arc::new(code_spans)),
742            math_spans_cache: OnceLock::new(), // Lazy-loaded on first access
743            list_blocks,
744            char_frequency,
745            html_tags_cache: OnceLock::new(),
746            emphasis_spans_cache: OnceLock::from(Arc::new(emphasis_spans)),
747            table_rows_cache: OnceLock::new(),
748            bare_urls_cache: OnceLock::new(),
749            has_mixed_list_nesting_cache: OnceLock::new(),
750            html_comment_ranges,
751            table_blocks,
752            line_index,
753            jinja_ranges,
754            flavor,
755            source_file,
756        }
757    }
758
759    /// Get code spans - computed lazily on first access
760    pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
761        Arc::clone(
762            self.code_spans_cache
763                .get_or_init(|| Arc::new(Self::parse_code_spans(self.content, &self.lines))),
764        )
765    }
766
767    /// Get math spans - computed lazily on first access
768    pub fn math_spans(&self) -> Arc<Vec<MathSpan>> {
769        Arc::clone(
770            self.math_spans_cache
771                .get_or_init(|| Arc::new(Self::parse_math_spans(self.content, &self.lines))),
772        )
773    }
774
775    /// Check if a byte position is within a math span (inline $...$ or display $$...$$)
776    pub fn is_in_math_span(&self, byte_pos: usize) -> bool {
777        let math_spans = self.math_spans();
778        math_spans
779            .iter()
780            .any(|span| byte_pos >= span.byte_offset && byte_pos < span.byte_end)
781    }
782
783    /// Get HTML comment ranges - pre-computed during LintContext construction
784    pub fn html_comment_ranges(&self) -> &[crate::utils::skip_context::ByteRange] {
785        &self.html_comment_ranges
786    }
787
788    /// Get HTML tags - computed lazily on first access
789    pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
790        Arc::clone(self.html_tags_cache.get_or_init(|| {
791            Arc::new(Self::parse_html_tags(
792                self.content,
793                &self.lines,
794                &self.code_blocks,
795                self.flavor,
796            ))
797        }))
798    }
799
800    /// Get emphasis spans - pre-computed during construction
801    pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
802        Arc::clone(
803            self.emphasis_spans_cache
804                .get()
805                .expect("emphasis_spans_cache initialized during construction"),
806        )
807    }
808
809    /// Get table rows - computed lazily on first access
810    pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
811        Arc::clone(
812            self.table_rows_cache
813                .get_or_init(|| Arc::new(Self::parse_table_rows(self.content, &self.lines))),
814        )
815    }
816
817    /// Get bare URLs - computed lazily on first access
818    pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
819        Arc::clone(
820            self.bare_urls_cache
821                .get_or_init(|| Arc::new(Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks))),
822        )
823    }
824
825    /// Check if document has mixed ordered/unordered list nesting.
826    /// Result is cached after first computation (document-level invariant).
827    /// This is used by MD007 for smart style auto-detection.
828    pub fn has_mixed_list_nesting(&self) -> bool {
829        *self
830            .has_mixed_list_nesting_cache
831            .get_or_init(|| self.compute_mixed_list_nesting())
832    }
833
834    /// Internal computation for mixed list nesting (only called once per LintContext).
835    fn compute_mixed_list_nesting(&self) -> bool {
836        // Track parent list items by their marker position and type
837        // Using marker_column instead of indent because it works correctly
838        // for blockquoted content where indent doesn't account for the prefix
839        // Stack stores: (marker_column, is_ordered)
840        let mut stack: Vec<(usize, bool)> = Vec::new();
841        let mut last_was_blank = false;
842
843        for line_info in &self.lines {
844            // Skip non-content lines (code blocks, frontmatter, HTML comments, etc.)
845            if line_info.in_code_block
846                || line_info.in_front_matter
847                || line_info.in_mkdocstrings
848                || line_info.in_html_comment
849                || line_info.in_esm_block
850            {
851                continue;
852            }
853
854            // OPTIMIZATION: Use pre-computed is_blank instead of content().trim()
855            if line_info.is_blank {
856                last_was_blank = true;
857                continue;
858            }
859
860            if let Some(list_item) = &line_info.list_item {
861                // Normalize column 1 to column 0 (consistent with MD007 check function)
862                let current_pos = if list_item.marker_column == 1 {
863                    0
864                } else {
865                    list_item.marker_column
866                };
867
868                // If there was a blank line and this item is at root level, reset stack
869                if last_was_blank && current_pos == 0 {
870                    stack.clear();
871                }
872                last_was_blank = false;
873
874                // Pop items at same or greater position (they're siblings or deeper, not parents)
875                while let Some(&(pos, _)) = stack.last() {
876                    if pos >= current_pos {
877                        stack.pop();
878                    } else {
879                        break;
880                    }
881                }
882
883                // Check if immediate parent has different type - this is mixed nesting
884                if let Some(&(_, parent_is_ordered)) = stack.last()
885                    && parent_is_ordered != list_item.is_ordered
886                {
887                    return true; // Found mixed nesting - early exit
888                }
889
890                stack.push((current_pos, list_item.is_ordered));
891            } else {
892                // Non-list line (but not blank) - could be paragraph or other content
893                last_was_blank = false;
894            }
895        }
896
897        false
898    }
899
900    /// Map a byte offset to (line, column)
901    pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
902        match self.line_offsets.binary_search(&offset) {
903            Ok(line) => (line + 1, 1),
904            Err(line) => {
905                let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
906                (line, offset - line_start + 1)
907            }
908        }
909    }
910
911    /// Check if a position is within a code block or code span
912    pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
913        // Check code blocks first
914        if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
915            return true;
916        }
917
918        // Check inline code spans (lazy load if needed)
919        self.code_spans()
920            .iter()
921            .any(|span| pos >= span.byte_offset && pos < span.byte_end)
922    }
923
924    /// Get line information by line number (1-indexed)
925    pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
926        if line_num > 0 {
927            self.lines.get(line_num - 1)
928        } else {
929            None
930        }
931    }
932
933    /// Get byte offset for a line number (1-indexed)
934    pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
935        self.line_info(line_num).map(|info| info.byte_offset)
936    }
937
938    /// Get URL for a reference link/image by its ID (O(1) lookup via HashMap)
939    pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
940        let normalized_id = ref_id.to_lowercase();
941        self.reference_defs_map
942            .get(&normalized_id)
943            .map(|&idx| self.reference_defs[idx].url.as_str())
944    }
945
946    /// Get a reference definition by its ID (O(1) lookup via HashMap)
947    pub fn get_reference_def(&self, ref_id: &str) -> Option<&ReferenceDef> {
948        let normalized_id = ref_id.to_lowercase();
949        self.reference_defs_map
950            .get(&normalized_id)
951            .map(|&idx| &self.reference_defs[idx])
952    }
953
954    /// Check if a reference definition exists by ID (O(1) lookup via HashMap)
955    pub fn has_reference_def(&self, ref_id: &str) -> bool {
956        let normalized_id = ref_id.to_lowercase();
957        self.reference_defs_map.contains_key(&normalized_id)
958    }
959
960    /// Check if a line is part of a list block
961    pub fn is_in_list_block(&self, line_num: usize) -> bool {
962        self.list_blocks
963            .iter()
964            .any(|block| line_num >= block.start_line && line_num <= block.end_line)
965    }
966
967    /// Get the list block containing a specific line
968    pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
969        self.list_blocks
970            .iter()
971            .find(|block| line_num >= block.start_line && line_num <= block.end_line)
972    }
973
974    // Compatibility methods for DocumentStructure migration
975
976    /// Check if a line is within a code block
977    pub fn is_in_code_block(&self, line_num: usize) -> bool {
978        if line_num == 0 || line_num > self.lines.len() {
979            return false;
980        }
981        self.lines[line_num - 1].in_code_block
982    }
983
984    /// Check if a line is within front matter
985    pub fn is_in_front_matter(&self, line_num: usize) -> bool {
986        if line_num == 0 || line_num > self.lines.len() {
987            return false;
988        }
989        self.lines[line_num - 1].in_front_matter
990    }
991
992    /// Check if a line is within an HTML block
993    pub fn is_in_html_block(&self, line_num: usize) -> bool {
994        if line_num == 0 || line_num > self.lines.len() {
995            return false;
996        }
997        self.lines[line_num - 1].in_html_block
998    }
999
1000    /// Check if a line and column is within a code span
1001    pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
1002        if line_num == 0 || line_num > self.lines.len() {
1003            return false;
1004        }
1005
1006        // Use the code spans cache to check
1007        // Note: col is 1-indexed from caller, but span.start_col and span.end_col are 0-indexed
1008        // Convert col to 0-indexed for comparison
1009        let col_0indexed = if col > 0 { col - 1 } else { 0 };
1010        let code_spans = self.code_spans();
1011        code_spans.iter().any(|span| {
1012            // Check if line is within the span's line range
1013            if line_num < span.line || line_num > span.end_line {
1014                return false;
1015            }
1016
1017            if span.line == span.end_line {
1018                // Single-line span: check column bounds
1019                col_0indexed >= span.start_col && col_0indexed < span.end_col
1020            } else if line_num == span.line {
1021                // First line of multi-line span: anything after start_col is in span
1022                col_0indexed >= span.start_col
1023            } else if line_num == span.end_line {
1024                // Last line of multi-line span: anything before end_col is in span
1025                col_0indexed < span.end_col
1026            } else {
1027                // Middle line of multi-line span: entire line is in span
1028                true
1029            }
1030        })
1031    }
1032
1033    /// Check if a byte offset is within a code span
1034    #[inline]
1035    pub fn is_byte_offset_in_code_span(&self, byte_offset: usize) -> bool {
1036        let code_spans = self.code_spans();
1037        code_spans
1038            .iter()
1039            .any(|span| byte_offset >= span.byte_offset && byte_offset < span.byte_end)
1040    }
1041
1042    /// Check if a byte position is within a reference definition
1043    /// This is much faster than scanning the content with regex for each check (O(1) vs O(n))
1044    #[inline]
1045    pub fn is_in_reference_def(&self, byte_pos: usize) -> bool {
1046        self.reference_defs
1047            .iter()
1048            .any(|ref_def| byte_pos >= ref_def.byte_offset && byte_pos < ref_def.byte_end)
1049    }
1050
1051    /// Check if a byte position is within an HTML comment
1052    /// This is much faster than scanning the content with regex for each check (O(k) vs O(n))
1053    /// where k is the number of HTML comments (typically very small)
1054    #[inline]
1055    pub fn is_in_html_comment(&self, byte_pos: usize) -> bool {
1056        self.html_comment_ranges
1057            .iter()
1058            .any(|range| byte_pos >= range.start && byte_pos < range.end)
1059    }
1060
1061    /// Check if a byte position is within an HTML tag (including multiline tags)
1062    /// Uses the pre-parsed html_tags which correctly handles tags spanning multiple lines
1063    #[inline]
1064    pub fn is_in_html_tag(&self, byte_pos: usize) -> bool {
1065        self.html_tags()
1066            .iter()
1067            .any(|tag| byte_pos >= tag.byte_offset && byte_pos < tag.byte_end)
1068    }
1069
1070    /// Check if a byte position is within a Jinja template ({{ }} or {% %})
1071    pub fn is_in_jinja_range(&self, byte_pos: usize) -> bool {
1072        self.jinja_ranges
1073            .iter()
1074            .any(|(start, end)| byte_pos >= *start && byte_pos < *end)
1075    }
1076
1077    /// Check if a byte position is within a link reference definition title
1078    pub fn is_in_link_title(&self, byte_pos: usize) -> bool {
1079        self.reference_defs.iter().any(|def| {
1080            if let (Some(start), Some(end)) = (def.title_byte_start, def.title_byte_end) {
1081                byte_pos >= start && byte_pos < end
1082            } else {
1083                false
1084            }
1085        })
1086    }
1087
1088    /// Check if content has any instances of a specific character (fast)
1089    pub fn has_char(&self, ch: char) -> bool {
1090        match ch {
1091            '#' => self.char_frequency.hash_count > 0,
1092            '*' => self.char_frequency.asterisk_count > 0,
1093            '_' => self.char_frequency.underscore_count > 0,
1094            '-' => self.char_frequency.hyphen_count > 0,
1095            '+' => self.char_frequency.plus_count > 0,
1096            '>' => self.char_frequency.gt_count > 0,
1097            '|' => self.char_frequency.pipe_count > 0,
1098            '[' => self.char_frequency.bracket_count > 0,
1099            '`' => self.char_frequency.backtick_count > 0,
1100            '<' => self.char_frequency.lt_count > 0,
1101            '!' => self.char_frequency.exclamation_count > 0,
1102            '\n' => self.char_frequency.newline_count > 0,
1103            _ => self.content.contains(ch), // Fallback for other characters
1104        }
1105    }
1106
1107    /// Get count of a specific character (fast)
1108    pub fn char_count(&self, ch: char) -> usize {
1109        match ch {
1110            '#' => self.char_frequency.hash_count,
1111            '*' => self.char_frequency.asterisk_count,
1112            '_' => self.char_frequency.underscore_count,
1113            '-' => self.char_frequency.hyphen_count,
1114            '+' => self.char_frequency.plus_count,
1115            '>' => self.char_frequency.gt_count,
1116            '|' => self.char_frequency.pipe_count,
1117            '[' => self.char_frequency.bracket_count,
1118            '`' => self.char_frequency.backtick_count,
1119            '<' => self.char_frequency.lt_count,
1120            '!' => self.char_frequency.exclamation_count,
1121            '\n' => self.char_frequency.newline_count,
1122            _ => self.content.matches(ch).count(), // Fallback for other characters
1123        }
1124    }
1125
1126    /// Check if content likely contains headings (fast)
1127    pub fn likely_has_headings(&self) -> bool {
1128        self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 // Potential setext underlines
1129    }
1130
1131    /// Check if content likely contains lists (fast)
1132    pub fn likely_has_lists(&self) -> bool {
1133        self.char_frequency.asterisk_count > 0
1134            || self.char_frequency.hyphen_count > 0
1135            || self.char_frequency.plus_count > 0
1136    }
1137
1138    /// Check if content likely contains emphasis (fast)
1139    pub fn likely_has_emphasis(&self) -> bool {
1140        self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
1141    }
1142
1143    /// Check if content likely contains tables (fast)
1144    pub fn likely_has_tables(&self) -> bool {
1145        self.char_frequency.pipe_count > 2
1146    }
1147
1148    /// Check if content likely contains blockquotes (fast)
1149    pub fn likely_has_blockquotes(&self) -> bool {
1150        self.char_frequency.gt_count > 0
1151    }
1152
1153    /// Check if content likely contains code (fast)
1154    pub fn likely_has_code(&self) -> bool {
1155        self.char_frequency.backtick_count > 0
1156    }
1157
1158    /// Check if content likely contains links or images (fast)
1159    pub fn likely_has_links_or_images(&self) -> bool {
1160        self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
1161    }
1162
1163    /// Check if content likely contains HTML (fast)
1164    pub fn likely_has_html(&self) -> bool {
1165        self.char_frequency.lt_count > 0
1166    }
1167
1168    /// Get the blockquote prefix for inserting a blank line at the given line index.
1169    /// Returns the prefix without trailing content (e.g., ">" or ">>").
1170    /// This is needed because blank lines inside blockquotes must preserve the blockquote structure.
1171    /// Returns an empty string if the line is not inside a blockquote.
1172    pub fn blockquote_prefix_for_blank_line(&self, line_idx: usize) -> String {
1173        if let Some(line_info) = self.lines.get(line_idx)
1174            && let Some(ref bq) = line_info.blockquote
1175        {
1176            bq.prefix.trim_end().to_string()
1177        } else {
1178            String::new()
1179        }
1180    }
1181
1182    /// Get HTML tags on a specific line
1183    pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
1184        self.html_tags()
1185            .iter()
1186            .filter(|tag| tag.line == line_num)
1187            .cloned()
1188            .collect()
1189    }
1190
1191    /// Get emphasis spans on a specific line
1192    pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
1193        self.emphasis_spans()
1194            .iter()
1195            .filter(|span| span.line == line_num)
1196            .cloned()
1197            .collect()
1198    }
1199
1200    /// Get table rows on a specific line
1201    pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
1202        self.table_rows()
1203            .iter()
1204            .filter(|row| row.line == line_num)
1205            .cloned()
1206            .collect()
1207    }
1208
1209    /// Get bare URLs on a specific line
1210    pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
1211        self.bare_urls()
1212            .iter()
1213            .filter(|url| url.line == line_num)
1214            .cloned()
1215            .collect()
1216    }
1217
1218    /// Find the line index for a given byte offset using binary search.
1219    /// Returns (line_index, line_number, column) where:
1220    /// - line_index is the 0-based index in the lines array
1221    /// - line_number is the 1-based line number
1222    /// - column is the byte offset within that line
1223    #[inline]
1224    fn find_line_for_offset(lines: &[LineInfo], byte_offset: usize) -> (usize, usize, usize) {
1225        // Binary search to find the line containing this byte offset
1226        let idx = match lines.binary_search_by(|line| {
1227            if byte_offset < line.byte_offset {
1228                std::cmp::Ordering::Greater
1229            } else if byte_offset > line.byte_offset + line.byte_len {
1230                std::cmp::Ordering::Less
1231            } else {
1232                std::cmp::Ordering::Equal
1233            }
1234        }) {
1235            Ok(idx) => idx,
1236            Err(idx) => idx.saturating_sub(1),
1237        };
1238
1239        let line = &lines[idx];
1240        let line_num = idx + 1;
1241        let col = byte_offset.saturating_sub(line.byte_offset);
1242
1243        (idx, line_num, col)
1244    }
1245
1246    /// Check if a byte offset is within a code span using binary search
1247    #[inline]
1248    fn is_offset_in_code_span(code_spans: &[CodeSpan], offset: usize) -> bool {
1249        // Since spans are sorted by byte_offset, use partition_point for binary search
1250        let idx = code_spans.partition_point(|span| span.byte_offset <= offset);
1251
1252        // Check the span that starts at or before our offset
1253        if idx > 0 {
1254            let span = &code_spans[idx - 1];
1255            if offset >= span.byte_offset && offset < span.byte_end {
1256                return true;
1257            }
1258        }
1259
1260        false
1261    }
1262
1263    /// Collect byte ranges of all links using pulldown-cmark
1264    /// This is used to skip heading detection for lines that fall within link syntax
1265    /// (e.g., multiline links like `[text](url\n#fragment)`)
1266    fn collect_link_byte_ranges(content: &str) -> Vec<(usize, usize)> {
1267        use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
1268
1269        let mut link_ranges = Vec::new();
1270        let mut options = Options::empty();
1271        options.insert(Options::ENABLE_WIKILINKS);
1272        options.insert(Options::ENABLE_FOOTNOTES);
1273
1274        let parser = Parser::new_ext(content, options).into_offset_iter();
1275        let mut link_stack: Vec<usize> = Vec::new();
1276
1277        for (event, range) in parser {
1278            match event {
1279                Event::Start(Tag::Link { .. }) => {
1280                    link_stack.push(range.start);
1281                }
1282                Event::End(TagEnd::Link) => {
1283                    if let Some(start_pos) = link_stack.pop() {
1284                        link_ranges.push((start_pos, range.end));
1285                    }
1286                }
1287                _ => {}
1288            }
1289        }
1290
1291        link_ranges
1292    }
1293
1294    /// Parse all links in the content
1295    fn parse_links(
1296        content: &'a str,
1297        lines: &[LineInfo],
1298        code_blocks: &[(usize, usize)],
1299        code_spans: &[CodeSpan],
1300        flavor: MarkdownFlavor,
1301        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1302    ) -> (Vec<ParsedLink<'a>>, Vec<BrokenLinkInfo>, Vec<FootnoteRef>) {
1303        use crate::utils::skip_context::{is_in_html_comment_ranges, is_mkdocs_snippet_line};
1304        use std::collections::HashSet;
1305
1306        let mut links = Vec::with_capacity(content.len() / 500);
1307        let mut broken_links = Vec::new();
1308        let mut footnote_refs = Vec::new();
1309
1310        // Track byte positions of links found by pulldown-cmark
1311        let mut found_positions = HashSet::new();
1312
1313        // Use pulldown-cmark's streaming parser with BrokenLink callback
1314        // The callback captures undefined references: [text][undefined], [shortcut], [text][]
1315        // This automatically handles:
1316        // - Escaped links (won't generate events)
1317        // - Links in code blocks/spans (won't generate Link events)
1318        // - Images (generates Tag::Image instead)
1319        // - Reference resolution (dest_url is already resolved!)
1320        // - Broken references (callback is invoked)
1321        // - Wiki-links (enabled via ENABLE_WIKILINKS)
1322        let mut options = Options::empty();
1323        options.insert(Options::ENABLE_WIKILINKS);
1324        options.insert(Options::ENABLE_FOOTNOTES);
1325
1326        let parser = Parser::new_with_broken_link_callback(
1327            content,
1328            options,
1329            Some(|link: BrokenLink<'_>| {
1330                broken_links.push(BrokenLinkInfo {
1331                    reference: link.reference.to_string(),
1332                    span: link.span.clone(),
1333                });
1334                None
1335            }),
1336        )
1337        .into_offset_iter();
1338
1339        let mut link_stack: Vec<(
1340            usize,
1341            usize,
1342            pulldown_cmark::CowStr<'a>,
1343            LinkType,
1344            pulldown_cmark::CowStr<'a>,
1345        )> = Vec::new();
1346        let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1347
1348        for (event, range) in parser {
1349            match event {
1350                Event::Start(Tag::Link {
1351                    link_type,
1352                    dest_url,
1353                    id,
1354                    ..
1355                }) => {
1356                    // Link start - record position, URL, and reference ID
1357                    link_stack.push((range.start, range.end, dest_url, link_type, id));
1358                    text_chunks.clear();
1359                }
1360                Event::Text(text) if !link_stack.is_empty() => {
1361                    // Track text content with its byte range
1362                    text_chunks.push((text.to_string(), range.start, range.end));
1363                }
1364                Event::Code(code) if !link_stack.is_empty() => {
1365                    // Include inline code in link text (with backticks)
1366                    let code_text = format!("`{code}`");
1367                    text_chunks.push((code_text, range.start, range.end));
1368                }
1369                Event::End(TagEnd::Link) => {
1370                    if let Some((start_pos, _link_start_end, url, link_type, ref_id)) = link_stack.pop() {
1371                        // Skip if in HTML comment
1372                        if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1373                            text_chunks.clear();
1374                            continue;
1375                        }
1376
1377                        // Find line and column information
1378                        let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1379
1380                        // Skip if this link is on a MkDocs snippet line
1381                        if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1382                            text_chunks.clear();
1383                            continue;
1384                        }
1385
1386                        let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1387
1388                        let is_reference = matches!(
1389                            link_type,
1390                            LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1391                        );
1392
1393                        // Extract link text directly from source bytes to preserve escaping
1394                        // Text events from pulldown-cmark unescape \] → ], which breaks MD039
1395                        let link_text = if start_pos < content.len() {
1396                            let link_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1397
1398                            // Find MATCHING ] by tracking bracket depth for nested brackets
1399                            // An unescaped bracket is one NOT preceded by an odd number of backslashes
1400                            // Brackets inside code spans (between backticks) should be ignored
1401                            let mut close_pos = None;
1402                            let mut depth = 0;
1403                            let mut in_code_span = false;
1404
1405                            for (i, &byte) in link_bytes.iter().enumerate().skip(1) {
1406                                // Count preceding backslashes
1407                                let mut backslash_count = 0;
1408                                let mut j = i;
1409                                while j > 0 && link_bytes[j - 1] == b'\\' {
1410                                    backslash_count += 1;
1411                                    j -= 1;
1412                                }
1413                                let is_escaped = backslash_count % 2 != 0;
1414
1415                                // Track code spans - backticks toggle in/out of code
1416                                if byte == b'`' && !is_escaped {
1417                                    in_code_span = !in_code_span;
1418                                }
1419
1420                                // Only count brackets when NOT in a code span
1421                                if !is_escaped && !in_code_span {
1422                                    if byte == b'[' {
1423                                        depth += 1;
1424                                    } else if byte == b']' {
1425                                        if depth == 0 {
1426                                            // Found the matching closing bracket
1427                                            close_pos = Some(i);
1428                                            break;
1429                                        } else {
1430                                            depth -= 1;
1431                                        }
1432                                    }
1433                                }
1434                            }
1435
1436                            if let Some(pos) = close_pos {
1437                                Cow::Borrowed(std::str::from_utf8(&link_bytes[1..pos]).unwrap_or(""))
1438                            } else {
1439                                Cow::Borrowed("")
1440                            }
1441                        } else {
1442                            Cow::Borrowed("")
1443                        };
1444
1445                        // For reference links, use the actual reference ID from pulldown-cmark
1446                        let reference_id = if is_reference && !ref_id.is_empty() {
1447                            Some(Cow::Owned(ref_id.to_lowercase()))
1448                        } else if is_reference {
1449                            // For collapsed/shortcut references without explicit ID, use the link text
1450                            Some(Cow::Owned(link_text.to_lowercase()))
1451                        } else {
1452                            None
1453                        };
1454
1455                        // Track this position as found
1456                        found_positions.insert(start_pos);
1457
1458                        links.push(ParsedLink {
1459                            line: line_num,
1460                            start_col: col_start,
1461                            end_col: col_end,
1462                            byte_offset: start_pos,
1463                            byte_end: range.end,
1464                            text: link_text,
1465                            url: Cow::Owned(url.to_string()),
1466                            is_reference,
1467                            reference_id,
1468                            link_type,
1469                        });
1470
1471                        text_chunks.clear();
1472                    }
1473                }
1474                Event::FootnoteReference(footnote_id) => {
1475                    // Capture footnote references like [^1], [^note]
1476                    // Skip if in HTML comment
1477                    if is_in_html_comment_ranges(html_comment_ranges, range.start) {
1478                        continue;
1479                    }
1480
1481                    let (_, line_num, _) = Self::find_line_for_offset(lines, range.start);
1482                    footnote_refs.push(FootnoteRef {
1483                        id: footnote_id.to_string(),
1484                        line: line_num,
1485                        byte_offset: range.start,
1486                        byte_end: range.end,
1487                    });
1488                }
1489                _ => {}
1490            }
1491        }
1492
1493        // Also find undefined references using regex
1494        // These are patterns like [text][ref] that pulldown-cmark didn't parse as links
1495        // because the reference is undefined
1496        for cap in LINK_PATTERN.captures_iter(content) {
1497            let full_match = cap.get(0).unwrap();
1498            let match_start = full_match.start();
1499            let match_end = full_match.end();
1500
1501            // Skip if this was already found by pulldown-cmark (it's a valid link)
1502            if found_positions.contains(&match_start) {
1503                continue;
1504            }
1505
1506            // Skip if escaped
1507            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1508                continue;
1509            }
1510
1511            // Skip if it's an image
1512            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
1513                continue;
1514            }
1515
1516            // Skip if in code block
1517            if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
1518                continue;
1519            }
1520
1521            // Skip if in code span
1522            if Self::is_offset_in_code_span(code_spans, match_start) {
1523                continue;
1524            }
1525
1526            // Skip if in HTML comment
1527            if is_in_html_comment_ranges(html_comment_ranges, match_start) {
1528                continue;
1529            }
1530
1531            // Find line and column information
1532            let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1533
1534            // Skip if this link is on a MkDocs snippet line
1535            if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1536                continue;
1537            }
1538
1539            let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1540
1541            let text = cap.get(1).map_or("", |m| m.as_str());
1542
1543            // Only process reference links (group 6)
1544            if let Some(ref_id) = cap.get(6) {
1545                let ref_id_str = ref_id.as_str();
1546                let normalized_ref = if ref_id_str.is_empty() {
1547                    Cow::Owned(text.to_lowercase()) // Implicit reference
1548                } else {
1549                    Cow::Owned(ref_id_str.to_lowercase())
1550                };
1551
1552                // This is an undefined reference (pulldown-cmark didn't parse it)
1553                links.push(ParsedLink {
1554                    line: line_num,
1555                    start_col: col_start,
1556                    end_col: col_end,
1557                    byte_offset: match_start,
1558                    byte_end: match_end,
1559                    text: Cow::Borrowed(text),
1560                    url: Cow::Borrowed(""), // Empty URL indicates undefined reference
1561                    is_reference: true,
1562                    reference_id: Some(normalized_ref),
1563                    link_type: LinkType::Reference, // Undefined references are reference-style
1564                });
1565            }
1566        }
1567
1568        (links, broken_links, footnote_refs)
1569    }
1570
1571    /// Parse all images in the content
1572    fn parse_images(
1573        content: &'a str,
1574        lines: &[LineInfo],
1575        code_blocks: &[(usize, usize)],
1576        code_spans: &[CodeSpan],
1577        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1578    ) -> Vec<ParsedImage<'a>> {
1579        use crate::utils::skip_context::is_in_html_comment_ranges;
1580        use std::collections::HashSet;
1581
1582        // Pre-size based on a heuristic: images are less common than links
1583        let mut images = Vec::with_capacity(content.len() / 1000);
1584        let mut found_positions = HashSet::new();
1585
1586        // Use pulldown-cmark for parsing - more accurate and faster
1587        let parser = Parser::new(content).into_offset_iter();
1588        let mut image_stack: Vec<(usize, pulldown_cmark::CowStr<'a>, LinkType, pulldown_cmark::CowStr<'a>)> =
1589            Vec::new();
1590        let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1591
1592        for (event, range) in parser {
1593            match event {
1594                Event::Start(Tag::Image {
1595                    link_type,
1596                    dest_url,
1597                    id,
1598                    ..
1599                }) => {
1600                    image_stack.push((range.start, dest_url, link_type, id));
1601                    text_chunks.clear();
1602                }
1603                Event::Text(text) if !image_stack.is_empty() => {
1604                    text_chunks.push((text.to_string(), range.start, range.end));
1605                }
1606                Event::Code(code) if !image_stack.is_empty() => {
1607                    let code_text = format!("`{code}`");
1608                    text_chunks.push((code_text, range.start, range.end));
1609                }
1610                Event::End(TagEnd::Image) => {
1611                    if let Some((start_pos, url, link_type, ref_id)) = image_stack.pop() {
1612                        // Skip if in code block
1613                        if CodeBlockUtils::is_in_code_block(code_blocks, start_pos) {
1614                            continue;
1615                        }
1616
1617                        // Skip if in code span
1618                        if Self::is_offset_in_code_span(code_spans, start_pos) {
1619                            continue;
1620                        }
1621
1622                        // Skip if in HTML comment
1623                        if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1624                            continue;
1625                        }
1626
1627                        // Find line and column using binary search
1628                        let (_, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1629                        let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1630
1631                        let is_reference = matches!(
1632                            link_type,
1633                            LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1634                        );
1635
1636                        // Extract alt text directly from source bytes to preserve escaping
1637                        // Text events from pulldown-cmark unescape \] → ], which breaks rules that need escaping
1638                        let alt_text = if start_pos < content.len() {
1639                            let image_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1640
1641                            // Find MATCHING ] by tracking bracket depth for nested brackets
1642                            // An unescaped bracket is one NOT preceded by an odd number of backslashes
1643                            let mut close_pos = None;
1644                            let mut depth = 0;
1645
1646                            if image_bytes.len() > 2 {
1647                                for (i, &byte) in image_bytes.iter().enumerate().skip(2) {
1648                                    // Count preceding backslashes
1649                                    let mut backslash_count = 0;
1650                                    let mut j = i;
1651                                    while j > 0 && image_bytes[j - 1] == b'\\' {
1652                                        backslash_count += 1;
1653                                        j -= 1;
1654                                    }
1655                                    let is_escaped = backslash_count % 2 != 0;
1656
1657                                    if !is_escaped {
1658                                        if byte == b'[' {
1659                                            depth += 1;
1660                                        } else if byte == b']' {
1661                                            if depth == 0 {
1662                                                // Found the matching closing bracket
1663                                                close_pos = Some(i);
1664                                                break;
1665                                            } else {
1666                                                depth -= 1;
1667                                            }
1668                                        }
1669                                    }
1670                                }
1671                            }
1672
1673                            if let Some(pos) = close_pos {
1674                                Cow::Borrowed(std::str::from_utf8(&image_bytes[2..pos]).unwrap_or(""))
1675                            } else {
1676                                Cow::Borrowed("")
1677                            }
1678                        } else {
1679                            Cow::Borrowed("")
1680                        };
1681
1682                        let reference_id = if is_reference && !ref_id.is_empty() {
1683                            Some(Cow::Owned(ref_id.to_lowercase()))
1684                        } else if is_reference {
1685                            Some(Cow::Owned(alt_text.to_lowercase())) // Collapsed/shortcut references
1686                        } else {
1687                            None
1688                        };
1689
1690                        found_positions.insert(start_pos);
1691                        images.push(ParsedImage {
1692                            line: line_num,
1693                            start_col: col_start,
1694                            end_col: col_end,
1695                            byte_offset: start_pos,
1696                            byte_end: range.end,
1697                            alt_text,
1698                            url: Cow::Owned(url.to_string()),
1699                            is_reference,
1700                            reference_id,
1701                            link_type,
1702                        });
1703                    }
1704                }
1705                _ => {}
1706            }
1707        }
1708
1709        // Regex fallback for undefined references that pulldown-cmark treats as plain text
1710        for cap in IMAGE_PATTERN.captures_iter(content) {
1711            let full_match = cap.get(0).unwrap();
1712            let match_start = full_match.start();
1713            let match_end = full_match.end();
1714
1715            // Skip if already found by pulldown-cmark
1716            if found_positions.contains(&match_start) {
1717                continue;
1718            }
1719
1720            // Skip if the ! is escaped
1721            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1722                continue;
1723            }
1724
1725            // Skip if in code block, code span, or HTML comment
1726            if CodeBlockUtils::is_in_code_block(code_blocks, match_start)
1727                || Self::is_offset_in_code_span(code_spans, match_start)
1728                || is_in_html_comment_ranges(html_comment_ranges, match_start)
1729            {
1730                continue;
1731            }
1732
1733            // Only process reference images (undefined references not found by pulldown-cmark)
1734            if let Some(ref_id) = cap.get(6) {
1735                let (_, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1736                let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1737                let alt_text = cap.get(1).map_or("", |m| m.as_str());
1738                let ref_id_str = ref_id.as_str();
1739                let normalized_ref = if ref_id_str.is_empty() {
1740                    Cow::Owned(alt_text.to_lowercase())
1741                } else {
1742                    Cow::Owned(ref_id_str.to_lowercase())
1743                };
1744
1745                images.push(ParsedImage {
1746                    line: line_num,
1747                    start_col: col_start,
1748                    end_col: col_end,
1749                    byte_offset: match_start,
1750                    byte_end: match_end,
1751                    alt_text: Cow::Borrowed(alt_text),
1752                    url: Cow::Borrowed(""),
1753                    is_reference: true,
1754                    reference_id: Some(normalized_ref),
1755                    link_type: LinkType::Reference, // Undefined references are reference-style
1756                });
1757            }
1758        }
1759
1760        images
1761    }
1762
1763    /// Parse reference definitions
1764    fn parse_reference_defs(content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
1765        // Pre-size based on lines count as reference definitions are line-based
1766        let mut refs = Vec::with_capacity(lines.len() / 20); // ~1 ref per 20 lines
1767
1768        for (line_idx, line_info) in lines.iter().enumerate() {
1769            // Skip lines in code blocks
1770            if line_info.in_code_block {
1771                continue;
1772            }
1773
1774            let line = line_info.content(content);
1775            let line_num = line_idx + 1;
1776
1777            if let Some(cap) = REF_DEF_PATTERN.captures(line) {
1778                let id_raw = cap.get(1).unwrap().as_str();
1779
1780                // Skip footnote definitions - they use [^id]: syntax and are semantically
1781                // different from reference link definitions
1782                if id_raw.starts_with('^') {
1783                    continue;
1784                }
1785
1786                let id = id_raw.to_lowercase();
1787                let url = cap.get(2).unwrap().as_str().to_string();
1788                let title_match = cap.get(3).or_else(|| cap.get(4));
1789                let title = title_match.map(|m| m.as_str().to_string());
1790
1791                // Calculate byte positions
1792                // The match starts at the beginning of the line (0) and extends to the end
1793                let match_obj = cap.get(0).unwrap();
1794                let byte_offset = line_info.byte_offset + match_obj.start();
1795                let byte_end = line_info.byte_offset + match_obj.end();
1796
1797                // Calculate title byte positions (includes the quote character before content)
1798                let (title_byte_start, title_byte_end) = if let Some(m) = title_match {
1799                    // The match is the content inside quotes, so we include the quote before
1800                    let start = line_info.byte_offset + m.start().saturating_sub(1);
1801                    let end = line_info.byte_offset + m.end() + 1; // Include closing quote
1802                    (Some(start), Some(end))
1803                } else {
1804                    (None, None)
1805                };
1806
1807                refs.push(ReferenceDef {
1808                    line: line_num,
1809                    id,
1810                    url,
1811                    title,
1812                    byte_offset,
1813                    byte_end,
1814                    title_byte_start,
1815                    title_byte_end,
1816                });
1817            }
1818        }
1819
1820        refs
1821    }
1822
1823    /// Fast blockquote prefix parser - replaces regex for 5-10x speedup
1824    /// Handles nested blockquotes like `> > > content`
1825    /// Returns: Some((prefix_with_ws, content_after_prefix)) or None
1826    #[inline]
1827    fn parse_blockquote_prefix(line: &str) -> Option<(&str, &str)> {
1828        let trimmed_start = line.trim_start();
1829        if !trimmed_start.starts_with('>') {
1830            return None;
1831        }
1832
1833        // Track total prefix length to handle nested blockquotes
1834        let mut remaining = line;
1835        let mut total_prefix_len = 0;
1836
1837        loop {
1838            let trimmed = remaining.trim_start();
1839            if !trimmed.starts_with('>') {
1840                break;
1841            }
1842
1843            // Add leading whitespace + '>' to prefix
1844            let leading_ws_len = remaining.len() - trimmed.len();
1845            total_prefix_len += leading_ws_len + 1;
1846
1847            let after_gt = &trimmed[1..];
1848
1849            // Handle optional whitespace after '>' (space or tab)
1850            if let Some(stripped) = after_gt.strip_prefix(' ') {
1851                total_prefix_len += 1;
1852                remaining = stripped;
1853            } else if let Some(stripped) = after_gt.strip_prefix('\t') {
1854                total_prefix_len += 1;
1855                remaining = stripped;
1856            } else {
1857                remaining = after_gt;
1858            }
1859        }
1860
1861        Some((&line[..total_prefix_len], remaining))
1862    }
1863
1864    /// Detect list items using pulldown-cmark for CommonMark-compliant parsing.
1865    ///
1866    /// Returns a HashMap keyed by line byte offset, containing:
1867    /// `(is_ordered, marker, marker_column, content_column, number)`
1868    ///
1869    /// ## Why pulldown-cmark?
1870    /// Using pulldown-cmark instead of regex ensures we only detect actual list items,
1871    /// not lines that merely look like lists (e.g., continuation paragraphs, code blocks).
1872    /// This fixes issue #253 where continuation lines were falsely detected.
1873    ///
1874    /// ## Tab indentation quirk
1875    /// Pulldown-cmark reports nested list items at the newline character position
1876    /// when tab indentation is used. For example, in `"* Item\n\t- Nested"`,
1877    /// the nested item is reported at byte 7 (the `\n`), not byte 8 (the `\t`).
1878    /// We detect this and advance to the correct line.
1879    ///
1880    /// ## HashMap key strategy
1881    /// We use `entry().or_insert()` because pulldown-cmark may emit multiple events
1882    /// that resolve to the same line (after newline adjustment). The first event
1883    /// for each line is authoritative.
1884    /// Detect list items and emphasis spans in a single pulldown-cmark pass.
1885    /// Returns both list items (for LineInfo) and emphasis spans (for MD030).
1886    /// This avoids a separate parse for emphasis detection.
1887    fn detect_list_items_and_emphasis_with_pulldown(
1888        content: &str,
1889        line_offsets: &[usize],
1890        flavor: MarkdownFlavor,
1891        front_matter_end: usize,
1892        code_blocks: &[(usize, usize)],
1893    ) -> (ListItemMap, Vec<EmphasisSpan>) {
1894        use std::collections::HashMap;
1895
1896        let mut list_items = HashMap::new();
1897        let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
1898
1899        let mut options = Options::empty();
1900        options.insert(Options::ENABLE_TABLES);
1901        options.insert(Options::ENABLE_FOOTNOTES);
1902        options.insert(Options::ENABLE_STRIKETHROUGH);
1903        options.insert(Options::ENABLE_TASKLISTS);
1904        // Always enable GFM features for consistency with existing behavior
1905        options.insert(Options::ENABLE_GFM);
1906
1907        // Suppress unused variable warning
1908        let _ = flavor;
1909
1910        let parser = Parser::new_ext(content, options).into_offset_iter();
1911        let mut list_depth: usize = 0;
1912        let mut list_stack: Vec<bool> = Vec::new();
1913
1914        for (event, range) in parser {
1915            match event {
1916                // Capture emphasis spans (for MD030's emphasis detection)
1917                Event::Start(Tag::Emphasis) | Event::Start(Tag::Strong) => {
1918                    let marker_count = if matches!(event, Event::Start(Tag::Strong)) {
1919                        2
1920                    } else {
1921                        1
1922                    };
1923                    let match_start = range.start;
1924                    let match_end = range.end;
1925
1926                    // Skip if in code block
1927                    if !CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
1928                        // Determine marker character by looking at the content at the start
1929                        let marker = content[match_start..].chars().next().unwrap_or('*');
1930                        if marker == '*' || marker == '_' {
1931                            // Extract content between markers
1932                            let content_start = match_start + marker_count;
1933                            let content_end = if match_end >= marker_count {
1934                                match_end - marker_count
1935                            } else {
1936                                match_end
1937                            };
1938                            let content_part = if content_start < content_end && content_end <= content.len() {
1939                                &content[content_start..content_end]
1940                            } else {
1941                                ""
1942                            };
1943
1944                            // Find which line this emphasis is on using line_offsets
1945                            let line_idx = match line_offsets.binary_search(&match_start) {
1946                                Ok(idx) => idx,
1947                                Err(idx) => idx.saturating_sub(1),
1948                            };
1949                            let line_num = line_idx + 1;
1950                            let line_start = line_offsets.get(line_idx).copied().unwrap_or(0);
1951                            let col_start = match_start - line_start;
1952                            let col_end = match_end - line_start;
1953
1954                            emphasis_spans.push(EmphasisSpan {
1955                                line: line_num,
1956                                start_col: col_start,
1957                                end_col: col_end,
1958                                byte_offset: match_start,
1959                                byte_end: match_end,
1960                                marker,
1961                                marker_count,
1962                                content: content_part.to_string(),
1963                            });
1964                        }
1965                    }
1966                }
1967                Event::Start(Tag::List(start_number)) => {
1968                    list_depth += 1;
1969                    list_stack.push(start_number.is_some());
1970                }
1971                Event::End(TagEnd::List(_)) => {
1972                    list_depth = list_depth.saturating_sub(1);
1973                    list_stack.pop();
1974                }
1975                Event::Start(Tag::Item) if list_depth > 0 => {
1976                    // Get the ordered state for the CURRENT (innermost) list
1977                    let current_list_is_ordered = list_stack.last().copied().unwrap_or(false);
1978                    // Find which line this byte offset corresponds to
1979                    let item_start = range.start;
1980
1981                    // Binary search to find the line number
1982                    let mut line_idx = match line_offsets.binary_search(&item_start) {
1983                        Ok(idx) => idx,
1984                        Err(idx) => idx.saturating_sub(1),
1985                    };
1986
1987                    // Pulldown-cmark reports nested list items at the newline before the item
1988                    // when using tab indentation (e.g., "* Item\n\t- Nested").
1989                    // Advance to the actual content line in this case.
1990                    if item_start < content.len() && content.as_bytes()[item_start] == b'\n' {
1991                        line_idx += 1;
1992                    }
1993
1994                    // Skip list items in frontmatter (they are YAML/TOML syntax, not Markdown)
1995                    if front_matter_end > 0 && line_idx < front_matter_end {
1996                        continue;
1997                    }
1998
1999                    if line_idx < line_offsets.len() {
2000                        let line_start_byte = line_offsets[line_idx];
2001                        let line_end = line_offsets.get(line_idx + 1).copied().unwrap_or(content.len());
2002                        let line = &content[line_start_byte..line_end.min(content.len())];
2003
2004                        // Strip trailing newline
2005                        let line = line
2006                            .strip_suffix('\n')
2007                            .or_else(|| line.strip_suffix("\r\n"))
2008                            .unwrap_or(line);
2009
2010                        // Strip blockquote prefix if present
2011                        let blockquote_parse = Self::parse_blockquote_prefix(line);
2012                        let (blockquote_prefix_len, line_to_parse) = if let Some((prefix, content)) = blockquote_parse {
2013                            (prefix.len(), content)
2014                        } else {
2015                            (0, line)
2016                        };
2017
2018                        // Parse the list marker from the actual line
2019                        if current_list_is_ordered {
2020                            if let Some((leading_spaces, number_str, delimiter, spacing, _content)) =
2021                                Self::parse_ordered_list(line_to_parse)
2022                            {
2023                                let marker = format!("{number_str}{delimiter}");
2024                                let marker_column = blockquote_prefix_len + leading_spaces.len();
2025                                let content_column = marker_column + marker.len() + spacing.len();
2026                                let number = number_str.parse().ok();
2027
2028                                list_items.entry(line_start_byte).or_insert((
2029                                    true,
2030                                    marker,
2031                                    marker_column,
2032                                    content_column,
2033                                    number,
2034                                ));
2035                            }
2036                        } else if let Some((leading_spaces, marker, spacing, _content)) =
2037                            Self::parse_unordered_list(line_to_parse)
2038                        {
2039                            let marker_column = blockquote_prefix_len + leading_spaces.len();
2040                            let content_column = marker_column + 1 + spacing.len();
2041
2042                            list_items.entry(line_start_byte).or_insert((
2043                                false,
2044                                marker.to_string(),
2045                                marker_column,
2046                                content_column,
2047                                None,
2048                            ));
2049                        }
2050                    }
2051                }
2052                _ => {}
2053            }
2054        }
2055
2056        (list_items, emphasis_spans)
2057    }
2058
2059    /// Fast unordered list parser - replaces regex for 5-10x speedup
2060    /// Matches: ^(\s*)([-*+])([ \t]*)(.*)
2061    /// Returns: Some((leading_ws, marker, spacing, content)) or None
2062    #[inline]
2063    fn parse_unordered_list(line: &str) -> Option<(&str, char, &str, &str)> {
2064        let bytes = line.as_bytes();
2065        let mut i = 0;
2066
2067        // Skip leading whitespace
2068        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
2069            i += 1;
2070        }
2071
2072        // Check for marker
2073        if i >= bytes.len() {
2074            return None;
2075        }
2076        let marker = bytes[i] as char;
2077        if marker != '-' && marker != '*' && marker != '+' {
2078            return None;
2079        }
2080        let marker_pos = i;
2081        i += 1;
2082
2083        // Collect spacing after marker (space or tab only)
2084        let spacing_start = i;
2085        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
2086            i += 1;
2087        }
2088
2089        Some((&line[..marker_pos], marker, &line[spacing_start..i], &line[i..]))
2090    }
2091
2092    /// Fast ordered list parser - replaces regex for 5-10x speedup
2093    /// Matches: ^(\s*)(\d+)([.)])([ \t]*)(.*)
2094    /// Returns: Some((leading_ws, number_str, delimiter, spacing, content)) or None
2095    #[inline]
2096    fn parse_ordered_list(line: &str) -> Option<(&str, &str, char, &str, &str)> {
2097        let bytes = line.as_bytes();
2098        let mut i = 0;
2099
2100        // Skip leading whitespace
2101        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
2102            i += 1;
2103        }
2104
2105        // Collect digits
2106        let number_start = i;
2107        while i < bytes.len() && bytes[i].is_ascii_digit() {
2108            i += 1;
2109        }
2110        if i == number_start {
2111            return None; // No digits found
2112        }
2113
2114        // Check for delimiter
2115        if i >= bytes.len() {
2116            return None;
2117        }
2118        let delimiter = bytes[i] as char;
2119        if delimiter != '.' && delimiter != ')' {
2120            return None;
2121        }
2122        let delimiter_pos = i;
2123        i += 1;
2124
2125        // Collect spacing after delimiter (space or tab only)
2126        let spacing_start = i;
2127        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
2128            i += 1;
2129        }
2130
2131        Some((
2132            &line[..number_start],
2133            &line[number_start..delimiter_pos],
2134            delimiter,
2135            &line[spacing_start..i],
2136            &line[i..],
2137        ))
2138    }
2139
2140    /// Pre-compute which lines are in code blocks - O(m*n) where m=code_blocks, n=lines
2141    /// Returns a Vec<bool> where index i indicates if line i is in a code block
2142    fn compute_code_block_line_map(content: &str, line_offsets: &[usize], code_blocks: &[(usize, usize)]) -> Vec<bool> {
2143        let num_lines = line_offsets.len();
2144        let mut in_code_block = vec![false; num_lines];
2145
2146        // For each code block, mark all lines within it
2147        for &(start, end) in code_blocks {
2148            // Ensure we're at valid UTF-8 boundaries
2149            let safe_start = if start > 0 && !content.is_char_boundary(start) {
2150                let mut boundary = start;
2151                while boundary > 0 && !content.is_char_boundary(boundary) {
2152                    boundary -= 1;
2153                }
2154                boundary
2155            } else {
2156                start
2157            };
2158
2159            let safe_end = if end < content.len() && !content.is_char_boundary(end) {
2160                let mut boundary = end;
2161                while boundary < content.len() && !content.is_char_boundary(boundary) {
2162                    boundary += 1;
2163                }
2164                boundary
2165            } else {
2166                end.min(content.len())
2167            };
2168
2169            // Trust the code blocks detected by CodeBlockUtils::detect_code_blocks()
2170            // That function now has proper list context awareness (see code_block_utils.rs)
2171            // and correctly distinguishes between:
2172            // - Fenced code blocks (``` or ~~~)
2173            // - Indented code blocks at document level (4 spaces + blank line before)
2174            // - List continuation paragraphs (NOT code blocks, even with 4 spaces)
2175            //
2176            // We no longer need to re-validate here. The original validation logic
2177            // was causing false positives by marking list continuation paragraphs as
2178            // code blocks when they have 4 spaces of indentation.
2179
2180            // Use binary search to find the first and last line indices
2181            // line_offsets is sorted, so we can use partition_point for O(log n) lookup
2182            // Use safe_start/safe_end (UTF-8 boundaries) for consistent line mapping
2183            //
2184            // Find the line that CONTAINS safe_start: the line with the largest
2185            // start offset that is <= safe_start. partition_point gives us the
2186            // first line that starts AFTER safe_start, so we subtract 1.
2187            let first_line_after = line_offsets.partition_point(|&offset| offset <= safe_start);
2188            let first_line = first_line_after.saturating_sub(1);
2189            let last_line = line_offsets.partition_point(|&offset| offset < safe_end);
2190
2191            // Mark all lines in the range at once
2192            for flag in in_code_block.iter_mut().take(last_line).skip(first_line) {
2193                *flag = true;
2194            }
2195        }
2196
2197        in_code_block
2198    }
2199
2200    /// Pre-compute which lines are inside math blocks ($$ ... $$) - O(n) single pass
2201    /// Returns a Vec<bool> where index i indicates if line i is in a math block
2202    fn compute_math_block_line_map(content: &str, code_block_map: &[bool]) -> Vec<bool> {
2203        let content_lines: Vec<&str> = content.lines().collect();
2204        let num_lines = content_lines.len();
2205        let mut in_math_block = vec![false; num_lines];
2206
2207        let mut inside_math = false;
2208
2209        for (i, line) in content_lines.iter().enumerate() {
2210            // Skip lines that are in code blocks - math delimiters inside code are literal
2211            if code_block_map.get(i).copied().unwrap_or(false) {
2212                continue;
2213            }
2214
2215            let trimmed = line.trim();
2216
2217            // Check for math block delimiter ($$)
2218            // A line with just $$ toggles the math block state
2219            if trimmed == "$$" {
2220                if inside_math {
2221                    // Closing delimiter - this line is still part of the math block
2222                    in_math_block[i] = true;
2223                    inside_math = false;
2224                } else {
2225                    // Opening delimiter - this line starts the math block
2226                    in_math_block[i] = true;
2227                    inside_math = true;
2228                }
2229            } else if inside_math {
2230                // Content inside math block
2231                in_math_block[i] = true;
2232            }
2233        }
2234
2235        in_math_block
2236    }
2237
2238    /// Pre-compute basic line information (without headings/blockquotes)
2239    /// Also returns emphasis spans detected during the pulldown-cmark parse
2240    fn compute_basic_line_info(
2241        content: &str,
2242        line_offsets: &[usize],
2243        code_blocks: &[(usize, usize)],
2244        flavor: MarkdownFlavor,
2245        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
2246        autodoc_ranges: &[crate::utils::skip_context::ByteRange],
2247    ) -> (Vec<LineInfo>, Vec<EmphasisSpan>) {
2248        let content_lines: Vec<&str> = content.lines().collect();
2249        let mut lines = Vec::with_capacity(content_lines.len());
2250
2251        // Pre-compute which lines are in code blocks
2252        let code_block_map = Self::compute_code_block_line_map(content, line_offsets, code_blocks);
2253
2254        // Pre-compute which lines are in math blocks ($$ ... $$)
2255        let math_block_map = Self::compute_math_block_line_map(content, &code_block_map);
2256
2257        // Detect front matter boundaries FIRST, before any other parsing
2258        // Use FrontMatterUtils to detect all types of front matter (YAML, TOML, JSON, malformed)
2259        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
2260
2261        // Use pulldown-cmark to detect list items AND emphasis spans in a single pass
2262        // (context-aware, eliminates false positives)
2263        let (list_item_map, emphasis_spans) = Self::detect_list_items_and_emphasis_with_pulldown(
2264            content,
2265            line_offsets,
2266            flavor,
2267            front_matter_end,
2268            code_blocks,
2269        );
2270
2271        for (i, line) in content_lines.iter().enumerate() {
2272            let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
2273            let indent = line.len() - line.trim_start().len();
2274            // Compute visual indent with proper CommonMark tab expansion
2275            let visual_indent = ElementCache::calculate_indentation_width_default(line);
2276
2277            // Parse blockquote prefix once and reuse it (avoid redundant parsing)
2278            let blockquote_parse = Self::parse_blockquote_prefix(line);
2279
2280            // For blank detection, consider blockquote context
2281            let is_blank = if let Some((_, content)) = blockquote_parse {
2282                // In blockquote context, check if content after prefix is blank
2283                content.trim().is_empty()
2284            } else {
2285                line.trim().is_empty()
2286            };
2287
2288            // Use pre-computed map for O(1) lookup instead of O(m) iteration
2289            let in_code_block = code_block_map.get(i).copied().unwrap_or(false);
2290
2291            // Detect list items (skip if in frontmatter, in mkdocstrings block, or in HTML comment)
2292            let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
2293                && crate::utils::mkdocstrings_refs::is_within_autodoc_block_ranges(autodoc_ranges, byte_offset);
2294            // Check if the ENTIRE line is within an HTML comment (not just the line start)
2295            // This ensures content after `-->` on the same line is not incorrectly skipped
2296            let line_end_offset = byte_offset + line.len();
2297            let in_html_comment = crate::utils::skip_context::is_line_entirely_in_html_comment(
2298                html_comment_ranges,
2299                byte_offset,
2300                line_end_offset,
2301            );
2302            // Use pulldown-cmark's list detection for context-aware parsing
2303            // This eliminates false positives on continuation lines (issue #253)
2304            let list_item =
2305                list_item_map
2306                    .get(&byte_offset)
2307                    .map(
2308                        |(is_ordered, marker, marker_column, content_column, number)| ListItemInfo {
2309                            marker: marker.clone(),
2310                            is_ordered: *is_ordered,
2311                            number: *number,
2312                            marker_column: *marker_column,
2313                            content_column: *content_column,
2314                        },
2315                    );
2316
2317            // Detect horizontal rules (only outside code blocks and frontmatter)
2318            // Uses CommonMark-compliant check including leading indentation validation
2319            let in_front_matter = front_matter_end > 0 && i < front_matter_end;
2320            let is_hr = !in_code_block && !in_front_matter && is_horizontal_rule_line(line);
2321
2322            // Get math block status for this line
2323            let in_math_block = math_block_map.get(i).copied().unwrap_or(false);
2324
2325            lines.push(LineInfo {
2326                byte_offset,
2327                byte_len: line.len(),
2328                indent,
2329                visual_indent,
2330                is_blank,
2331                in_code_block,
2332                in_front_matter,
2333                in_html_block: false, // Will be populated after line creation
2334                in_html_comment,
2335                list_item,
2336                heading: None,    // Will be populated in second pass for Setext headings
2337                blockquote: None, // Will be populated after line creation
2338                in_mkdocstrings,
2339                in_esm_block: false, // Will be populated after line creation for MDX files
2340                in_code_span_continuation: false, // Will be populated after code spans are parsed
2341                is_horizontal_rule: is_hr,
2342                in_math_block,
2343            });
2344        }
2345
2346        (lines, emphasis_spans)
2347    }
2348
2349    /// Detect headings and blockquotes (called after HTML block detection)
2350    fn detect_headings_and_blockquotes(
2351        content: &str,
2352        lines: &mut [LineInfo],
2353        flavor: MarkdownFlavor,
2354        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
2355        link_byte_ranges: &[(usize, usize)],
2356    ) {
2357        // Regex for heading detection
2358        static ATX_HEADING_REGEX: LazyLock<regex::Regex> =
2359            LazyLock::new(|| regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap());
2360        static SETEXT_UNDERLINE_REGEX: LazyLock<regex::Regex> =
2361            LazyLock::new(|| regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap());
2362
2363        let content_lines: Vec<&str> = content.lines().collect();
2364
2365        // Detect front matter boundaries to skip those lines
2366        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
2367
2368        // Detect headings (including Setext which needs look-ahead) and blockquotes
2369        for i in 0..lines.len() {
2370            let line = content_lines[i];
2371
2372            // Detect blockquotes FIRST, before any skip conditions.
2373            // A line can be both a blockquote AND contain a code block inside it.
2374            // We need to know about the blockquote marker regardless of code block status.
2375            // Skip only frontmatter lines - those are never blockquotes.
2376            if !(front_matter_end > 0 && i < front_matter_end)
2377                && let Some(bq) = parse_blockquote_detailed(line)
2378            {
2379                let nesting_level = bq.markers.len();
2380                let marker_column = bq.indent.len();
2381                let prefix = format!("{}{}{}", bq.indent, bq.markers, bq.spaces_after);
2382                let has_no_space = bq.spaces_after.is_empty() && !bq.content.is_empty();
2383                let has_multiple_spaces = bq.spaces_after.chars().filter(|&c| c == ' ').count() > 1;
2384                let needs_md028_fix = bq.content.is_empty() && bq.spaces_after.is_empty();
2385
2386                lines[i].blockquote = Some(BlockquoteInfo {
2387                    nesting_level,
2388                    indent: bq.indent.to_string(),
2389                    marker_column,
2390                    prefix,
2391                    content: bq.content.to_string(),
2392                    has_no_space_after_marker: has_no_space,
2393                    has_multiple_spaces_after_marker: has_multiple_spaces,
2394                    needs_md028_fix,
2395                });
2396
2397                // Update is_horizontal_rule for blockquote content
2398                // The original detection doesn't strip blockquote prefix, so we need to check here
2399                if !lines[i].in_code_block && is_horizontal_rule_content(bq.content.trim()) {
2400                    lines[i].is_horizontal_rule = true;
2401                }
2402            }
2403
2404            // Now apply skip conditions for heading detection
2405            if lines[i].in_code_block {
2406                continue;
2407            }
2408
2409            // Skip lines in front matter
2410            if front_matter_end > 0 && i < front_matter_end {
2411                continue;
2412            }
2413
2414            // Skip lines in HTML blocks - HTML content should not be parsed as markdown
2415            if lines[i].in_html_block {
2416                continue;
2417            }
2418
2419            // Skip heading detection for blank lines
2420            if lines[i].is_blank {
2421                continue;
2422            }
2423
2424            // Check for ATX headings (but skip MkDocs snippet lines)
2425            // In MkDocs flavor, lines like "# -8<- [start:name]" are snippet markers, not headings
2426            let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
2427                crate::utils::mkdocs_snippets::is_snippet_section_start(line)
2428                    || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
2429            } else {
2430                false
2431            };
2432
2433            if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
2434                // Skip headings inside HTML comments (using pre-computed ranges for efficiency)
2435                if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset) {
2436                    continue;
2437                }
2438                // Skip lines that fall within link syntax (e.g., multiline links like `[text](url\n#fragment)`)
2439                // This prevents false positives where `#fragment` is detected as a heading
2440                let line_offset = lines[i].byte_offset;
2441                if link_byte_ranges
2442                    .iter()
2443                    .any(|&(start, end)| line_offset > start && line_offset < end)
2444                {
2445                    continue;
2446                }
2447                let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
2448                let hashes = caps.get(2).map_or("", |m| m.as_str());
2449                let spaces_after = caps.get(3).map_or("", |m| m.as_str());
2450                let rest = caps.get(4).map_or("", |m| m.as_str());
2451
2452                let level = hashes.len() as u8;
2453                let marker_column = leading_spaces.len();
2454
2455                // Check for closing sequence, but handle custom IDs that might come after
2456                let (text, has_closing, closing_seq) = {
2457                    // First check if there's a custom ID at the end
2458                    let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
2459                        // Check if this looks like a valid custom ID (ends with })
2460                        if rest[id_start..].trim_end().ends_with('}') {
2461                            // Split off the custom ID
2462                            (&rest[..id_start], &rest[id_start..])
2463                        } else {
2464                            (rest, "")
2465                        }
2466                    } else {
2467                        (rest, "")
2468                    };
2469
2470                    // Now look for closing hashes in the part before the custom ID
2471                    let trimmed_rest = rest_without_id.trim_end();
2472                    if let Some(last_hash_byte_pos) = trimmed_rest.rfind('#') {
2473                        // Find the start of the hash sequence by walking backwards
2474                        // Use char_indices to get byte positions at char boundaries
2475                        let char_positions: Vec<(usize, char)> = trimmed_rest.char_indices().collect();
2476
2477                        // Find which char index corresponds to last_hash_byte_pos
2478                        let last_hash_char_idx = char_positions
2479                            .iter()
2480                            .position(|(byte_pos, _)| *byte_pos == last_hash_byte_pos);
2481
2482                        if let Some(mut char_idx) = last_hash_char_idx {
2483                            // Walk backwards to find start of hash sequence
2484                            while char_idx > 0 && char_positions[char_idx - 1].1 == '#' {
2485                                char_idx -= 1;
2486                            }
2487
2488                            // Get the byte position of the start of hashes
2489                            let start_of_hashes = char_positions[char_idx].0;
2490
2491                            // Check if there's at least one space before the closing hashes
2492                            let has_space_before = char_idx == 0 || char_positions[char_idx - 1].1.is_whitespace();
2493
2494                            // Check if this is a valid closing sequence (all hashes to end of trimmed part)
2495                            let potential_closing = &trimmed_rest[start_of_hashes..];
2496                            let is_all_hashes = potential_closing.chars().all(|c| c == '#');
2497
2498                            if is_all_hashes && has_space_before {
2499                                // This is a closing sequence
2500                                let closing_hashes = potential_closing.to_string();
2501                                // The text is everything before the closing hashes
2502                                // Don't include the custom ID here - it will be extracted later
2503                                let text_part = if !custom_id_part.is_empty() {
2504                                    // If we have a custom ID, append it back to get the full rest
2505                                    // This allows the extract_header_id function to handle it properly
2506                                    format!("{}{}", trimmed_rest[..start_of_hashes].trim_end(), custom_id_part)
2507                                } else {
2508                                    trimmed_rest[..start_of_hashes].trim_end().to_string()
2509                                };
2510                                (text_part, true, closing_hashes)
2511                            } else {
2512                                // Not a valid closing sequence, return the full content
2513                                (rest.to_string(), false, String::new())
2514                            }
2515                        } else {
2516                            // Couldn't find char boundary, return the full content
2517                            (rest.to_string(), false, String::new())
2518                        }
2519                    } else {
2520                        // No hashes found, return the full content
2521                        (rest.to_string(), false, String::new())
2522                    }
2523                };
2524
2525                let content_column = marker_column + hashes.len() + spaces_after.len();
2526
2527                // Extract custom header ID if present
2528                let raw_text = text.trim().to_string();
2529                let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
2530
2531                // If no custom ID was found on the header line, check the next line for standalone attr-list
2532                if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
2533                    let next_line = content_lines[i + 1];
2534                    if !lines[i + 1].in_code_block
2535                        && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
2536                        && let Some(next_line_id) =
2537                            crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
2538                    {
2539                        custom_id = Some(next_line_id);
2540                    }
2541                }
2542
2543                // ATX heading is "valid" for processing by heading rules if:
2544                // 1. Has space after # (CommonMark compliant): `# Heading`
2545                // 2. Is empty (just hashes): `#`
2546                // 3. Has multiple hashes (##intro is likely intended heading, not hashtag)
2547                // 4. Content starts with uppercase (likely intended heading, not social hashtag)
2548                //
2549                // Invalid patterns (hashtag-like) are skipped by most heading rules:
2550                // - `#tag` - single # with lowercase (social hashtag)
2551                // - `#123` - single # with number (GitHub issue ref)
2552                let is_valid = !spaces_after.is_empty()
2553                    || rest.is_empty()
2554                    || level > 1
2555                    || rest.trim().chars().next().is_some_and(|c| c.is_uppercase());
2556
2557                lines[i].heading = Some(HeadingInfo {
2558                    level,
2559                    style: HeadingStyle::ATX,
2560                    marker: hashes.to_string(),
2561                    marker_column,
2562                    content_column,
2563                    text: clean_text,
2564                    custom_id,
2565                    raw_text,
2566                    has_closing_sequence: has_closing,
2567                    closing_sequence: closing_seq,
2568                    is_valid,
2569                });
2570            }
2571            // Check for Setext headings (need to look at next line)
2572            else if i + 1 < content_lines.len() && i + 1 < lines.len() {
2573                let next_line = content_lines[i + 1];
2574                if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
2575                    // Skip if next line is front matter delimiter
2576                    if front_matter_end > 0 && i < front_matter_end {
2577                        continue;
2578                    }
2579
2580                    // Skip Setext headings inside HTML comments (using pre-computed ranges for efficiency)
2581                    if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset)
2582                    {
2583                        continue;
2584                    }
2585
2586                    // Per CommonMark spec 4.3, setext heading content cannot be interpretable as:
2587                    // list item, ATX heading, block quote, thematic break, code fence, or HTML block
2588                    let content_line = line.trim();
2589
2590                    // Skip list items (-, *, +) and thematic breaks (---, ***, etc.)
2591                    if content_line.starts_with('-') || content_line.starts_with('*') || content_line.starts_with('+') {
2592                        continue;
2593                    }
2594
2595                    // Skip underscore thematic breaks (___)
2596                    if content_line.starts_with('_') {
2597                        let non_ws: String = content_line.chars().filter(|c| !c.is_whitespace()).collect();
2598                        if non_ws.len() >= 3 && non_ws.chars().all(|c| c == '_') {
2599                            continue;
2600                        }
2601                    }
2602
2603                    // Skip numbered lists (1. Item, 2. Item, etc.)
2604                    if let Some(first_char) = content_line.chars().next()
2605                        && first_char.is_ascii_digit()
2606                    {
2607                        let num_end = content_line.chars().take_while(|c| c.is_ascii_digit()).count();
2608                        if num_end < content_line.len() {
2609                            let next = content_line.chars().nth(num_end);
2610                            if next == Some('.') || next == Some(')') {
2611                                continue;
2612                            }
2613                        }
2614                    }
2615
2616                    // Skip ATX headings
2617                    if ATX_HEADING_REGEX.is_match(line) {
2618                        continue;
2619                    }
2620
2621                    // Skip blockquotes
2622                    if content_line.starts_with('>') {
2623                        continue;
2624                    }
2625
2626                    // Skip code fences
2627                    let trimmed_start = line.trim_start();
2628                    if trimmed_start.len() >= 3 {
2629                        let first_three: String = trimmed_start.chars().take(3).collect();
2630                        if first_three == "```" || first_three == "~~~" {
2631                            continue;
2632                        }
2633                    }
2634
2635                    // Skip HTML blocks
2636                    if content_line.starts_with('<') {
2637                        continue;
2638                    }
2639
2640                    let underline = next_line.trim();
2641
2642                    let level = if underline.starts_with('=') { 1 } else { 2 };
2643                    let style = if level == 1 {
2644                        HeadingStyle::Setext1
2645                    } else {
2646                        HeadingStyle::Setext2
2647                    };
2648
2649                    // Extract custom header ID if present
2650                    let raw_text = line.trim().to_string();
2651                    let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
2652
2653                    // If no custom ID was found on the header line, check the line after underline for standalone attr-list
2654                    if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
2655                        let attr_line = content_lines[i + 2];
2656                        if !lines[i + 2].in_code_block
2657                            && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
2658                            && let Some(attr_line_id) =
2659                                crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
2660                        {
2661                            custom_id = Some(attr_line_id);
2662                        }
2663                    }
2664
2665                    lines[i].heading = Some(HeadingInfo {
2666                        level,
2667                        style,
2668                        marker: underline.to_string(),
2669                        marker_column: next_line.len() - next_line.trim_start().len(),
2670                        content_column: lines[i].indent,
2671                        text: clean_text,
2672                        custom_id,
2673                        raw_text,
2674                        has_closing_sequence: false,
2675                        closing_sequence: String::new(),
2676                        is_valid: true, // Setext headings are always valid
2677                    });
2678                }
2679            }
2680        }
2681    }
2682
2683    /// Detect HTML blocks in the content
2684    fn detect_html_blocks(content: &str, lines: &mut [LineInfo]) {
2685        // HTML block elements that trigger block context
2686        // Includes HTML5 media, embedded content, and interactive elements
2687        const BLOCK_ELEMENTS: &[&str] = &[
2688            "address",
2689            "article",
2690            "aside",
2691            "audio",
2692            "blockquote",
2693            "canvas",
2694            "details",
2695            "dialog",
2696            "dd",
2697            "div",
2698            "dl",
2699            "dt",
2700            "embed",
2701            "fieldset",
2702            "figcaption",
2703            "figure",
2704            "footer",
2705            "form",
2706            "h1",
2707            "h2",
2708            "h3",
2709            "h4",
2710            "h5",
2711            "h6",
2712            "header",
2713            "hr",
2714            "iframe",
2715            "li",
2716            "main",
2717            "menu",
2718            "nav",
2719            "noscript",
2720            "object",
2721            "ol",
2722            "p",
2723            "picture",
2724            "pre",
2725            "script",
2726            "search",
2727            "section",
2728            "source",
2729            "style",
2730            "summary",
2731            "svg",
2732            "table",
2733            "tbody",
2734            "td",
2735            "template",
2736            "textarea",
2737            "tfoot",
2738            "th",
2739            "thead",
2740            "tr",
2741            "track",
2742            "ul",
2743            "video",
2744        ];
2745
2746        let mut i = 0;
2747        while i < lines.len() {
2748            // Skip if already in code block or front matter
2749            if lines[i].in_code_block || lines[i].in_front_matter {
2750                i += 1;
2751                continue;
2752            }
2753
2754            let trimmed = lines[i].content(content).trim_start();
2755
2756            // Check if line starts with an HTML tag
2757            if trimmed.starts_with('<') && trimmed.len() > 1 {
2758                // Extract tag name safely
2759                let after_bracket = &trimmed[1..];
2760                let is_closing = after_bracket.starts_with('/');
2761                let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
2762
2763                // Extract tag name (stop at space, >, /, or end of string)
2764                let tag_name = tag_start
2765                    .chars()
2766                    .take_while(|c| c.is_ascii_alphabetic() || *c == '-' || c.is_ascii_digit())
2767                    .collect::<String>()
2768                    .to_lowercase();
2769
2770                // Check if it's a block element
2771                if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
2772                    // Mark this line as in HTML block
2773                    lines[i].in_html_block = true;
2774
2775                    // For simplicity, just mark lines until we find a closing tag or reach a blank line
2776                    // This avoids complex nesting logic that might cause infinite loops
2777                    if !is_closing {
2778                        let closing_tag = format!("</{tag_name}>");
2779                        // style and script tags can contain blank lines (CSS/JS formatting)
2780                        let allow_blank_lines = tag_name == "style" || tag_name == "script";
2781                        let mut j = i + 1;
2782                        let mut found_closing_tag = false;
2783                        while j < lines.len() && j < i + 100 {
2784                            // Limit search to 100 lines
2785                            // Stop at blank lines (except for style/script tags)
2786                            if !allow_blank_lines && lines[j].is_blank {
2787                                break;
2788                            }
2789
2790                            lines[j].in_html_block = true;
2791
2792                            // Check if this line contains the closing tag
2793                            if lines[j].content(content).contains(&closing_tag) {
2794                                found_closing_tag = true;
2795                            }
2796
2797                            // After finding closing tag, continue marking lines as
2798                            // in_html_block until blank line (per CommonMark spec)
2799                            if found_closing_tag {
2800                                j += 1;
2801                                // Continue marking subsequent lines until blank
2802                                while j < lines.len() && j < i + 100 {
2803                                    if lines[j].is_blank {
2804                                        break;
2805                                    }
2806                                    lines[j].in_html_block = true;
2807                                    j += 1;
2808                                }
2809                                break;
2810                            }
2811                            j += 1;
2812                        }
2813                    }
2814                }
2815            }
2816
2817            i += 1;
2818        }
2819    }
2820
2821    /// Detect ESM import/export blocks in MDX files
2822    /// ESM blocks consist of contiguous import/export statements at the top of the file
2823    fn detect_esm_blocks(content: &str, lines: &mut [LineInfo], flavor: MarkdownFlavor) {
2824        // Only process MDX files
2825        if !flavor.supports_esm_blocks() {
2826            return;
2827        }
2828
2829        let mut in_multiline_comment = false;
2830
2831        for line in lines.iter_mut() {
2832            // Skip blank lines and HTML comments
2833            if line.is_blank || line.in_html_comment {
2834                continue;
2835            }
2836
2837            let trimmed = line.content(content).trim_start();
2838
2839            // Handle continuation of multi-line JS comments
2840            if in_multiline_comment {
2841                if trimmed.contains("*/") {
2842                    in_multiline_comment = false;
2843                }
2844                continue;
2845            }
2846
2847            // Skip single-line JS comments (// and ///)
2848            if trimmed.starts_with("//") {
2849                continue;
2850            }
2851
2852            // Handle start of multi-line JS comment
2853            if trimmed.starts_with("/*") {
2854                if !trimmed.contains("*/") {
2855                    in_multiline_comment = true;
2856                }
2857                continue;
2858            }
2859
2860            // Check if line starts with import or export
2861            if trimmed.starts_with("import ") || trimmed.starts_with("export ") {
2862                line.in_esm_block = true;
2863            } else {
2864                // Once we hit a non-ESM, non-comment line, we're done with the ESM block
2865                break;
2866            }
2867        }
2868    }
2869
2870    /// Parse all inline code spans in the content using pulldown-cmark streaming parser
2871    fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
2872        let mut code_spans = Vec::new();
2873
2874        // Quick check - if no backticks, no code spans
2875        if !content.contains('`') {
2876            return code_spans;
2877        }
2878
2879        // Use pulldown-cmark's streaming parser with byte offsets
2880        let parser = Parser::new(content).into_offset_iter();
2881
2882        for (event, range) in parser {
2883            if let Event::Code(_) = event {
2884                let start_pos = range.start;
2885                let end_pos = range.end;
2886
2887                // The range includes the backticks, extract the actual content
2888                let full_span = &content[start_pos..end_pos];
2889                let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
2890
2891                // Extract content between backticks, preserving spaces
2892                let content_start = start_pos + backtick_count;
2893                let content_end = end_pos - backtick_count;
2894                let span_content = if content_start < content_end {
2895                    content[content_start..content_end].to_string()
2896                } else {
2897                    String::new()
2898                };
2899
2900                // Use binary search to find line number - O(log n) instead of O(n)
2901                // Find the rightmost line whose byte_offset <= start_pos
2902                let line_idx = lines
2903                    .partition_point(|line| line.byte_offset <= start_pos)
2904                    .saturating_sub(1);
2905                let line_num = line_idx + 1;
2906                let byte_col_start = start_pos - lines[line_idx].byte_offset;
2907
2908                // Find end column using binary search
2909                let end_line_idx = lines
2910                    .partition_point(|line| line.byte_offset <= end_pos)
2911                    .saturating_sub(1);
2912                let byte_col_end = end_pos - lines[end_line_idx].byte_offset;
2913
2914                // Convert byte offsets to character positions for correct Unicode handling
2915                // This ensures consistency with warning.column which uses character positions
2916                let line_content = lines[line_idx].content(content);
2917                let col_start = if byte_col_start <= line_content.len() {
2918                    line_content[..byte_col_start].chars().count()
2919                } else {
2920                    line_content.chars().count()
2921                };
2922
2923                let end_line_content = lines[end_line_idx].content(content);
2924                let col_end = if byte_col_end <= end_line_content.len() {
2925                    end_line_content[..byte_col_end].chars().count()
2926                } else {
2927                    end_line_content.chars().count()
2928                };
2929
2930                code_spans.push(CodeSpan {
2931                    line: line_num,
2932                    end_line: end_line_idx + 1,
2933                    start_col: col_start,
2934                    end_col: col_end,
2935                    byte_offset: start_pos,
2936                    byte_end: end_pos,
2937                    backtick_count,
2938                    content: span_content,
2939                });
2940            }
2941        }
2942
2943        // Sort by position to ensure consistent ordering
2944        code_spans.sort_by_key(|span| span.byte_offset);
2945
2946        code_spans
2947    }
2948
2949    /// Parse all math spans (inline $...$ and display $$...$$) using pulldown-cmark
2950    fn parse_math_spans(content: &str, lines: &[LineInfo]) -> Vec<MathSpan> {
2951        let mut math_spans = Vec::new();
2952
2953        // Quick check - if no $ signs, no math spans
2954        if !content.contains('$') {
2955            return math_spans;
2956        }
2957
2958        // Use pulldown-cmark with ENABLE_MATH option
2959        let mut options = Options::empty();
2960        options.insert(Options::ENABLE_MATH);
2961        let parser = Parser::new_ext(content, options).into_offset_iter();
2962
2963        for (event, range) in parser {
2964            let (is_display, math_content) = match &event {
2965                Event::InlineMath(text) => (false, text.as_ref()),
2966                Event::DisplayMath(text) => (true, text.as_ref()),
2967                _ => continue,
2968            };
2969
2970            let start_pos = range.start;
2971            let end_pos = range.end;
2972
2973            // Use binary search to find line number - O(log n) instead of O(n)
2974            let line_idx = lines
2975                .partition_point(|line| line.byte_offset <= start_pos)
2976                .saturating_sub(1);
2977            let line_num = line_idx + 1;
2978            let byte_col_start = start_pos - lines[line_idx].byte_offset;
2979
2980            // Find end column using binary search
2981            let end_line_idx = lines
2982                .partition_point(|line| line.byte_offset <= end_pos)
2983                .saturating_sub(1);
2984            let byte_col_end = end_pos - lines[end_line_idx].byte_offset;
2985
2986            // Convert byte offsets to character positions for correct Unicode handling
2987            let line_content = lines[line_idx].content(content);
2988            let col_start = if byte_col_start <= line_content.len() {
2989                line_content[..byte_col_start].chars().count()
2990            } else {
2991                line_content.chars().count()
2992            };
2993
2994            let end_line_content = lines[end_line_idx].content(content);
2995            let col_end = if byte_col_end <= end_line_content.len() {
2996                end_line_content[..byte_col_end].chars().count()
2997            } else {
2998                end_line_content.chars().count()
2999            };
3000
3001            math_spans.push(MathSpan {
3002                line: line_num,
3003                end_line: end_line_idx + 1,
3004                start_col: col_start,
3005                end_col: col_end,
3006                byte_offset: start_pos,
3007                byte_end: end_pos,
3008                is_display,
3009                content: math_content.to_string(),
3010            });
3011        }
3012
3013        // Sort by position to ensure consistent ordering
3014        math_spans.sort_by_key(|span| span.byte_offset);
3015
3016        math_spans
3017    }
3018
3019    /// Parse all list blocks in the content (legacy line-by-line approach)
3020    ///
3021    /// Uses a forward-scanning O(n) algorithm that tracks two variables during iteration:
3022    /// - `has_list_breaking_content_since_last_item`: Set when encountering content that
3023    ///   terminates a list (headings, horizontal rules, tables, insufficiently indented content)
3024    /// - `min_continuation_for_tracking`: Minimum indentation required for content to be
3025    ///   treated as list continuation (based on the list marker width)
3026    ///
3027    /// When a new list item is encountered, we check if list-breaking content was seen
3028    /// since the last item. If so, we start a new list block.
3029    fn parse_list_blocks(content: &str, lines: &[LineInfo]) -> Vec<ListBlock> {
3030        // Minimum indentation for unordered list continuation per CommonMark spec
3031        const UNORDERED_LIST_MIN_CONTINUATION_INDENT: usize = 2;
3032
3033        /// Initialize or reset the forward-scanning tracking state.
3034        /// This helper eliminates code duplication across three initialization sites.
3035        #[inline]
3036        fn reset_tracking_state(
3037            list_item: &ListItemInfo,
3038            has_list_breaking_content: &mut bool,
3039            min_continuation: &mut usize,
3040        ) {
3041            *has_list_breaking_content = false;
3042            let marker_width = if list_item.is_ordered {
3043                list_item.marker.len() + 1 // Ordered markers need space after period/paren
3044            } else {
3045                list_item.marker.len()
3046            };
3047            *min_continuation = if list_item.is_ordered {
3048                marker_width
3049            } else {
3050                UNORDERED_LIST_MIN_CONTINUATION_INDENT
3051            };
3052        }
3053
3054        // Pre-size based on lines that could be list items
3055        let mut list_blocks = Vec::with_capacity(lines.len() / 10); // Estimate ~10% of lines might start list blocks
3056        let mut current_block: Option<ListBlock> = None;
3057        let mut last_list_item_line = 0;
3058        let mut current_indent_level = 0;
3059        let mut last_marker_width = 0;
3060
3061        // Track list-breaking content since last item (fixes O(n²) bottleneck from issue #148)
3062        let mut has_list_breaking_content_since_last_item = false;
3063        let mut min_continuation_for_tracking = 0;
3064
3065        for (line_idx, line_info) in lines.iter().enumerate() {
3066            let line_num = line_idx + 1;
3067
3068            // Enhanced code block handling using Design #3's context analysis
3069            if line_info.in_code_block {
3070                if let Some(ref mut block) = current_block {
3071                    // Calculate minimum indentation for list continuation
3072                    let min_continuation_indent =
3073                        CodeBlockUtils::calculate_min_continuation_indent(content, lines, line_idx);
3074
3075                    // Analyze code block context using the three-tier classification
3076                    let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
3077
3078                    match context {
3079                        CodeBlockContext::Indented => {
3080                            // Code block is properly indented - continues the list
3081                            block.end_line = line_num;
3082                            continue;
3083                        }
3084                        CodeBlockContext::Standalone => {
3085                            // Code block separates lists - end current block
3086                            let completed_block = current_block.take().unwrap();
3087                            list_blocks.push(completed_block);
3088                            continue;
3089                        }
3090                        CodeBlockContext::Adjacent => {
3091                            // Edge case - use conservative behavior (continue list)
3092                            block.end_line = line_num;
3093                            continue;
3094                        }
3095                    }
3096                } else {
3097                    // No current list block - skip code block lines
3098                    continue;
3099                }
3100            }
3101
3102            // Extract blockquote prefix if any
3103            let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(line_info.content(content)) {
3104                caps.get(0).unwrap().as_str().to_string()
3105            } else {
3106                String::new()
3107            };
3108
3109            // Track list-breaking content for non-list, non-blank lines (O(n) replacement for nested loop)
3110            // Skip lines that are continuations of multi-line code spans - they're part of the previous list item
3111            if let Some(ref block) = current_block
3112                && line_info.list_item.is_none()
3113                && !line_info.is_blank
3114                && !line_info.in_code_span_continuation
3115            {
3116                let line_content = line_info.content(content).trim();
3117
3118                // Check for structural separators that break lists
3119                // Note: Lazy continuation (indent=0) is valid in CommonMark and should NOT break lists.
3120                // Only lines with indent between 1 and min_continuation_for_tracking-1 break lists,
3121                // as they indicate improper indentation rather than lazy continuation.
3122                let is_lazy_continuation = line_info.indent == 0 && !line_info.is_blank;
3123
3124                // Check if blockquote context changes (different prefix than current block)
3125                // Lines within the SAME blockquote context don't break lists
3126                let blockquote_prefix_changes = blockquote_prefix.trim() != block.blockquote_prefix.trim();
3127
3128                let breaks_list = line_info.heading.is_some()
3129                    || line_content.starts_with("---")
3130                    || line_content.starts_with("***")
3131                    || line_content.starts_with("___")
3132                    || crate::utils::skip_context::is_table_line(line_content)
3133                    || blockquote_prefix_changes
3134                    || (line_info.indent > 0
3135                        && line_info.indent < min_continuation_for_tracking
3136                        && !is_lazy_continuation);
3137
3138                if breaks_list {
3139                    has_list_breaking_content_since_last_item = true;
3140                }
3141            }
3142
3143            // If this line is a code span continuation within an active list block,
3144            // extend the block's end_line to include this line (maintains list continuity)
3145            if line_info.in_code_span_continuation
3146                && line_info.list_item.is_none()
3147                && let Some(ref mut block) = current_block
3148            {
3149                block.end_line = line_num;
3150            }
3151
3152            // Extend block.end_line for regular continuation lines (non-list-item, non-blank,
3153            // properly indented lines within the list). This ensures the workaround at line 2448
3154            // works correctly when there are multiple continuation lines before a nested list item.
3155            // Also include lazy continuation lines (indent=0) per CommonMark spec.
3156            // For blockquote lines, compute effective indent after stripping the prefix
3157            let effective_continuation_indent = if let Some(ref block) = current_block {
3158                let block_bq_level = block.blockquote_prefix.chars().filter(|&c| c == '>').count();
3159                let line_content = line_info.content(content);
3160                let line_bq_level = line_content
3161                    .chars()
3162                    .take_while(|c| *c == '>' || c.is_whitespace())
3163                    .filter(|&c| c == '>')
3164                    .count();
3165                if line_bq_level > 0 && line_bq_level == block_bq_level {
3166                    // Compute indent after blockquote markers
3167                    let mut pos = 0;
3168                    let mut found_markers = 0;
3169                    for c in line_content.chars() {
3170                        pos += c.len_utf8();
3171                        if c == '>' {
3172                            found_markers += 1;
3173                            if found_markers == line_bq_level {
3174                                if line_content.get(pos..pos + 1) == Some(" ") {
3175                                    pos += 1;
3176                                }
3177                                break;
3178                            }
3179                        }
3180                    }
3181                    let after_bq = &line_content[pos..];
3182                    after_bq.len() - after_bq.trim_start().len()
3183                } else {
3184                    line_info.indent
3185                }
3186            } else {
3187                line_info.indent
3188            };
3189            let adjusted_min_continuation_for_tracking = if let Some(ref block) = current_block {
3190                let block_bq_level = block.blockquote_prefix.chars().filter(|&c| c == '>').count();
3191                if block_bq_level > 0 {
3192                    if block.is_ordered { last_marker_width } else { 2 }
3193                } else {
3194                    min_continuation_for_tracking
3195                }
3196            } else {
3197                min_continuation_for_tracking
3198            };
3199            let is_valid_continuation = effective_continuation_indent >= adjusted_min_continuation_for_tracking
3200                || (line_info.indent == 0 && !line_info.is_blank); // Lazy continuation
3201
3202            if std::env::var("RUMDL_DEBUG_LIST").is_ok() && line_info.list_item.is_none() && !line_info.is_blank {
3203                eprintln!(
3204                    "[DEBUG] Line {}: checking continuation - indent={}, min_cont={}, is_valid={}, in_code_span={}, in_code_block={}, has_block={}",
3205                    line_num,
3206                    effective_continuation_indent,
3207                    adjusted_min_continuation_for_tracking,
3208                    is_valid_continuation,
3209                    line_info.in_code_span_continuation,
3210                    line_info.in_code_block,
3211                    current_block.is_some()
3212                );
3213            }
3214
3215            if !line_info.in_code_span_continuation
3216                && line_info.list_item.is_none()
3217                && !line_info.is_blank
3218                && !line_info.in_code_block
3219                && is_valid_continuation
3220                && let Some(ref mut block) = current_block
3221            {
3222                if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3223                    eprintln!(
3224                        "[DEBUG] Line {}: extending block.end_line from {} to {}",
3225                        line_num, block.end_line, line_num
3226                    );
3227                }
3228                block.end_line = line_num;
3229            }
3230
3231            // Check if this line is a list item
3232            if let Some(list_item) = &line_info.list_item {
3233                // Calculate nesting level based on indentation
3234                let item_indent = list_item.marker_column;
3235                let nesting = item_indent / 2; // Assume 2-space indentation for nesting
3236
3237                if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3238                    eprintln!(
3239                        "[DEBUG] Line {}: list item found, marker={:?}, indent={}",
3240                        line_num, list_item.marker, item_indent
3241                    );
3242                }
3243
3244                if let Some(ref mut block) = current_block {
3245                    // Check if this continues the current block
3246                    // For nested lists, we need to check if this is a nested item (higher nesting level)
3247                    // or a continuation at the same or lower level
3248                    let is_nested = nesting > block.nesting_level;
3249                    let same_type =
3250                        (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
3251                    let same_context = block.blockquote_prefix == blockquote_prefix;
3252                    // Allow one blank line after last item, or lines immediately after block content
3253                    let reasonable_distance = line_num <= last_list_item_line + 2 || line_num == block.end_line + 1;
3254
3255                    // For unordered lists, also check marker consistency
3256                    let marker_compatible =
3257                        block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
3258
3259                    // O(1) check: Use the tracked variable instead of O(n) nested loop
3260                    // This eliminates the quadratic bottleneck from issue #148
3261                    let has_non_list_content = has_list_breaking_content_since_last_item;
3262
3263                    // A list continues if:
3264                    // 1. It's a nested item (indented more than the parent), OR
3265                    // 2. It's the same type at the same level with reasonable distance
3266                    let mut continues_list = if is_nested {
3267                        // Nested items always continue the list if they're in the same context
3268                        same_context && reasonable_distance && !has_non_list_content
3269                    } else {
3270                        // Same-level items need to match type and markers
3271                        same_type && same_context && reasonable_distance && marker_compatible && !has_non_list_content
3272                    };
3273
3274                    if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3275                        eprintln!(
3276                            "[DEBUG] Line {}: continues_list={}, is_nested={}, same_type={}, same_context={}, reasonable_distance={}, marker_compatible={}, has_non_list_content={}, last_item={}, block.end_line={}",
3277                            line_num,
3278                            continues_list,
3279                            is_nested,
3280                            same_type,
3281                            same_context,
3282                            reasonable_distance,
3283                            marker_compatible,
3284                            has_non_list_content,
3285                            last_list_item_line,
3286                            block.end_line
3287                        );
3288                    }
3289
3290                    // WORKAROUND: If items are truly consecutive (no blank lines), they MUST be in the same list
3291                    // This handles edge cases where content patterns might otherwise split lists incorrectly
3292                    if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
3293                        // Check if the previous line was a list item or a continuation of a list item
3294                        // (including lazy continuation lines)
3295                        if block.item_lines.contains(&(line_num - 1)) {
3296                            // They're consecutive list items - force them to be in the same list
3297                            continues_list = true;
3298                        } else {
3299                            // Previous line is a continuation line within this block
3300                            // (e.g., lazy continuation with indent=0)
3301                            // Since block.end_line == line_num - 1, we know line_num - 1 is part of this block
3302                            continues_list = true;
3303                        }
3304                    }
3305
3306                    if continues_list {
3307                        // Extend current block
3308                        block.end_line = line_num;
3309                        block.item_lines.push(line_num);
3310
3311                        // Update max marker width
3312                        block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
3313                            list_item.marker.len() + 1
3314                        } else {
3315                            list_item.marker.len()
3316                        });
3317
3318                        // Update marker consistency for unordered lists
3319                        if !block.is_ordered
3320                            && block.marker.is_some()
3321                            && block.marker.as_ref() != Some(&list_item.marker)
3322                        {
3323                            // Mixed markers, clear the marker field
3324                            block.marker = None;
3325                        }
3326
3327                        // Reset tracked state for issue #148 optimization
3328                        reset_tracking_state(
3329                            list_item,
3330                            &mut has_list_breaking_content_since_last_item,
3331                            &mut min_continuation_for_tracking,
3332                        );
3333                    } else {
3334                        // End current block and start a new one
3335
3336                        list_blocks.push(block.clone());
3337
3338                        *block = ListBlock {
3339                            start_line: line_num,
3340                            end_line: line_num,
3341                            is_ordered: list_item.is_ordered,
3342                            marker: if list_item.is_ordered {
3343                                None
3344                            } else {
3345                                Some(list_item.marker.clone())
3346                            },
3347                            blockquote_prefix: blockquote_prefix.clone(),
3348                            item_lines: vec![line_num],
3349                            nesting_level: nesting,
3350                            max_marker_width: if list_item.is_ordered {
3351                                list_item.marker.len() + 1
3352                            } else {
3353                                list_item.marker.len()
3354                            },
3355                        };
3356
3357                        // Initialize tracked state for new block (issue #148 optimization)
3358                        reset_tracking_state(
3359                            list_item,
3360                            &mut has_list_breaking_content_since_last_item,
3361                            &mut min_continuation_for_tracking,
3362                        );
3363                    }
3364                } else {
3365                    // Start a new block
3366                    current_block = Some(ListBlock {
3367                        start_line: line_num,
3368                        end_line: line_num,
3369                        is_ordered: list_item.is_ordered,
3370                        marker: if list_item.is_ordered {
3371                            None
3372                        } else {
3373                            Some(list_item.marker.clone())
3374                        },
3375                        blockquote_prefix,
3376                        item_lines: vec![line_num],
3377                        nesting_level: nesting,
3378                        max_marker_width: list_item.marker.len(),
3379                    });
3380
3381                    // Initialize tracked state for new block (issue #148 optimization)
3382                    reset_tracking_state(
3383                        list_item,
3384                        &mut has_list_breaking_content_since_last_item,
3385                        &mut min_continuation_for_tracking,
3386                    );
3387                }
3388
3389                last_list_item_line = line_num;
3390                current_indent_level = item_indent;
3391                last_marker_width = if list_item.is_ordered {
3392                    list_item.marker.len() + 1 // Add 1 for the space after ordered list markers
3393                } else {
3394                    list_item.marker.len()
3395                };
3396            } else if let Some(ref mut block) = current_block {
3397                // Not a list item - check if it continues the current block
3398                if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3399                    eprintln!(
3400                        "[DEBUG] Line {}: non-list-item, is_blank={}, block exists",
3401                        line_num, line_info.is_blank
3402                    );
3403                }
3404
3405                // For MD032 compatibility, we use a simple approach:
3406                // - Indented lines continue the list
3407                // - Blank lines followed by indented content continue the list
3408                // - Everything else ends the list
3409
3410                // Check if the last line in the list block ended with a backslash (hard line break)
3411                // This handles cases where list items use backslash for hard line breaks
3412                let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
3413                    lines[block.end_line - 1].content(content).trim_end().ends_with('\\')
3414                } else {
3415                    false
3416                };
3417
3418                // Calculate minimum indentation for list continuation
3419                // For ordered lists, use the last marker width (e.g., 3 for "1. ", 4 for "10. ")
3420                // For unordered lists like "- ", content starts at column 2, so continuations need at least 2 spaces
3421                let min_continuation_indent = if block.is_ordered {
3422                    current_indent_level + last_marker_width
3423                } else {
3424                    current_indent_level + 2 // Unordered lists need at least 2 spaces (e.g., "- " = 2 chars)
3425                };
3426
3427                if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
3428                    // Indented line or backslash continuation continues the list
3429                    if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3430                        eprintln!(
3431                            "[DEBUG] Line {}: indented continuation (indent={}, min={})",
3432                            line_num, line_info.indent, min_continuation_indent
3433                        );
3434                    }
3435                    block.end_line = line_num;
3436                } else if line_info.is_blank {
3437                    // Blank line - check if it's internal to the list or ending it
3438                    // We only include blank lines that are followed by more list content
3439                    if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3440                        eprintln!("[DEBUG] Line {line_num}: entering blank line handling");
3441                    }
3442                    let mut check_idx = line_idx + 1;
3443                    let mut found_continuation = false;
3444
3445                    // Skip additional blank lines
3446                    while check_idx < lines.len() && lines[check_idx].is_blank {
3447                        check_idx += 1;
3448                    }
3449
3450                    if check_idx < lines.len() {
3451                        let next_line = &lines[check_idx];
3452                        // For blockquote lines, compute indent AFTER stripping the blockquote prefix
3453                        let next_content = next_line.content(content);
3454                        // Use blockquote level (count of >) to compare, not the full prefix
3455                        // This avoids issues where the regex captures extra whitespace
3456                        let block_bq_level_for_indent = block.blockquote_prefix.chars().filter(|&c| c == '>').count();
3457                        let next_bq_level_for_indent = next_content
3458                            .chars()
3459                            .take_while(|c| *c == '>' || c.is_whitespace())
3460                            .filter(|&c| c == '>')
3461                            .count();
3462                        let effective_indent =
3463                            if next_bq_level_for_indent > 0 && next_bq_level_for_indent == block_bq_level_for_indent {
3464                                // For lines in the same blockquote context, compute indent after the blockquote marker(s)
3465                                // Find position after ">" and one space
3466                                let mut pos = 0;
3467                                let mut found_markers = 0;
3468                                for c in next_content.chars() {
3469                                    pos += c.len_utf8();
3470                                    if c == '>' {
3471                                        found_markers += 1;
3472                                        if found_markers == next_bq_level_for_indent {
3473                                            // Skip optional space after last >
3474                                            if next_content.get(pos..pos + 1) == Some(" ") {
3475                                                pos += 1;
3476                                            }
3477                                            break;
3478                                        }
3479                                    }
3480                                }
3481                                let after_blockquote_marker = &next_content[pos..];
3482                                after_blockquote_marker.len() - after_blockquote_marker.trim_start().len()
3483                            } else {
3484                                next_line.indent
3485                            };
3486                        // Also adjust min_continuation_indent for blockquote lists
3487                        // The marker_column includes blockquote prefix, so subtract it
3488                        let adjusted_min_continuation = if block_bq_level_for_indent > 0 {
3489                            // For blockquote lists, the continuation is relative to blockquote content
3490                            // current_indent_level includes blockquote prefix (2 for "> "), so use just 2 for unordered
3491                            if block.is_ordered { last_marker_width } else { 2 }
3492                        } else {
3493                            min_continuation_indent
3494                        };
3495                        // Check if followed by indented content (list continuation)
3496                        if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3497                            eprintln!(
3498                                "[DEBUG] Blank line {} checking next line {}: effective_indent={}, adjusted_min={}, next_is_list={}, in_code_block={}",
3499                                line_num,
3500                                check_idx + 1,
3501                                effective_indent,
3502                                adjusted_min_continuation,
3503                                next_line.list_item.is_some(),
3504                                next_line.in_code_block
3505                            );
3506                        }
3507                        if !next_line.in_code_block && effective_indent >= adjusted_min_continuation {
3508                            found_continuation = true;
3509                        }
3510                        // Check if followed by another list item at the same level
3511                        else if !next_line.in_code_block
3512                            && next_line.list_item.is_some()
3513                            && let Some(item) = &next_line.list_item
3514                        {
3515                            let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
3516                                .find(next_line.content(content))
3517                                .map_or(String::new(), |m| m.as_str().to_string());
3518                            if item.marker_column == current_indent_level
3519                                && item.is_ordered == block.is_ordered
3520                                && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
3521                            {
3522                                // Check if there was meaningful content between the list items (unused now)
3523                                // This variable is kept for potential future use but is currently replaced by has_structural_separators
3524                                // Pre-compute block's blockquote level for use in closures
3525                                let block_bq_level = block.blockquote_prefix.chars().filter(|&c| c == '>').count();
3526                                let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
3527                                    if let Some(between_line) = lines.get(idx) {
3528                                        let between_content = between_line.content(content);
3529                                        let trimmed = between_content.trim();
3530                                        // Skip empty lines
3531                                        if trimmed.is_empty() {
3532                                            return false;
3533                                        }
3534                                        // Check for meaningful content
3535                                        let line_indent = between_content.len() - between_content.trim_start().len();
3536
3537                                        // Check if blockquote level changed (not just if line starts with ">")
3538                                        let between_bq_prefix = BLOCKQUOTE_PREFIX_REGEX
3539                                            .find(between_content)
3540                                            .map_or(String::new(), |m| m.as_str().to_string());
3541                                        let between_bq_level = between_bq_prefix.chars().filter(|&c| c == '>').count();
3542                                        let blockquote_level_changed =
3543                                            trimmed.starts_with(">") && between_bq_level != block_bq_level;
3544
3545                                        // Structural separators (code fences, headings, etc.) are meaningful and should BREAK lists
3546                                        if trimmed.starts_with("```")
3547                                            || trimmed.starts_with("~~~")
3548                                            || trimmed.starts_with("---")
3549                                            || trimmed.starts_with("***")
3550                                            || trimmed.starts_with("___")
3551                                            || blockquote_level_changed
3552                                            || crate::utils::skip_context::is_table_line(trimmed)
3553                                            || between_line.heading.is_some()
3554                                        {
3555                                            return true; // These are structural separators - meaningful content that breaks lists
3556                                        }
3557
3558                                        // Only properly indented content continues the list
3559                                        line_indent >= min_continuation_indent
3560                                    } else {
3561                                        false
3562                                    }
3563                                });
3564
3565                                if block.is_ordered {
3566                                    // For ordered lists: don't continue if there are structural separators
3567                                    // Check if there are structural separators between the list items
3568                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
3569                                        if let Some(between_line) = lines.get(idx) {
3570                                            let between_content = between_line.content(content);
3571                                            let trimmed = between_content.trim();
3572                                            if trimmed.is_empty() {
3573                                                return false;
3574                                            }
3575                                            // Check if blockquote level changed (not just if line starts with ">")
3576                                            let between_bq_prefix = BLOCKQUOTE_PREFIX_REGEX
3577                                                .find(between_content)
3578                                                .map_or(String::new(), |m| m.as_str().to_string());
3579                                            let between_bq_level =
3580                                                between_bq_prefix.chars().filter(|&c| c == '>').count();
3581                                            let blockquote_level_changed =
3582                                                trimmed.starts_with(">") && between_bq_level != block_bq_level;
3583                                            // Check for structural separators that break lists
3584                                            trimmed.starts_with("```")
3585                                                || trimmed.starts_with("~~~")
3586                                                || trimmed.starts_with("---")
3587                                                || trimmed.starts_with("***")
3588                                                || trimmed.starts_with("___")
3589                                                || blockquote_level_changed
3590                                                || crate::utils::skip_context::is_table_line(trimmed)
3591                                                || between_line.heading.is_some()
3592                                        } else {
3593                                            false
3594                                        }
3595                                    });
3596                                    found_continuation = !has_structural_separators;
3597                                } else {
3598                                    // For unordered lists: also check for structural separators
3599                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
3600                                        if let Some(between_line) = lines.get(idx) {
3601                                            let between_content = between_line.content(content);
3602                                            let trimmed = between_content.trim();
3603                                            if trimmed.is_empty() {
3604                                                return false;
3605                                            }
3606                                            // Check if blockquote level changed (not just if line starts with ">")
3607                                            let between_bq_prefix = BLOCKQUOTE_PREFIX_REGEX
3608                                                .find(between_content)
3609                                                .map_or(String::new(), |m| m.as_str().to_string());
3610                                            let between_bq_level =
3611                                                between_bq_prefix.chars().filter(|&c| c == '>').count();
3612                                            let blockquote_level_changed =
3613                                                trimmed.starts_with(">") && between_bq_level != block_bq_level;
3614                                            // Check for structural separators that break lists
3615                                            trimmed.starts_with("```")
3616                                                || trimmed.starts_with("~~~")
3617                                                || trimmed.starts_with("---")
3618                                                || trimmed.starts_with("***")
3619                                                || trimmed.starts_with("___")
3620                                                || blockquote_level_changed
3621                                                || crate::utils::skip_context::is_table_line(trimmed)
3622                                                || between_line.heading.is_some()
3623                                        } else {
3624                                            false
3625                                        }
3626                                    });
3627                                    found_continuation = !has_structural_separators;
3628                                }
3629                            }
3630                        }
3631                    }
3632
3633                    if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3634                        eprintln!("[DEBUG] Blank line {line_num} final: found_continuation={found_continuation}");
3635                    }
3636                    if found_continuation {
3637                        // Include the blank line in the block
3638                        block.end_line = line_num;
3639                    } else {
3640                        // Blank line ends the list - don't include it
3641                        list_blocks.push(block.clone());
3642                        current_block = None;
3643                    }
3644                } else {
3645                    // Check for lazy continuation - non-indented line immediately after a list item
3646                    // But only if the line has sufficient indentation for the list type
3647                    let min_required_indent = if block.is_ordered {
3648                        current_indent_level + last_marker_width
3649                    } else {
3650                        current_indent_level + 2
3651                    };
3652
3653                    // For lazy continuation to apply, the line must either:
3654                    // 1. Have no indentation (true lazy continuation)
3655                    // 2. Have sufficient indentation for the list type
3656                    // BUT structural separators (headings, code blocks, etc.) should never be lazy continuations
3657                    let line_content = line_info.content(content).trim();
3658
3659                    // Check for table-like patterns
3660                    let looks_like_table = crate::utils::skip_context::is_table_line(line_content);
3661
3662                    // Check if blockquote level changed (not just if line starts with ">")
3663                    // Lines within the same blockquote level are NOT structural separators
3664                    let block_bq_level = block.blockquote_prefix.chars().filter(|&c| c == '>').count();
3665                    let current_bq_level = blockquote_prefix.chars().filter(|&c| c == '>').count();
3666                    let blockquote_level_changed = line_content.starts_with(">") && current_bq_level != block_bq_level;
3667
3668                    let is_structural_separator = line_info.heading.is_some()
3669                        || line_content.starts_with("```")
3670                        || line_content.starts_with("~~~")
3671                        || line_content.starts_with("---")
3672                        || line_content.starts_with("***")
3673                        || line_content.starts_with("___")
3674                        || blockquote_level_changed
3675                        || looks_like_table;
3676
3677                    // Allow lazy continuation if we're still within the same list block
3678                    // (not just immediately after a list item)
3679                    let is_lazy_continuation = !is_structural_separator
3680                        && !line_info.is_blank
3681                        && (line_info.indent == 0 || line_info.indent >= min_required_indent);
3682
3683                    if is_lazy_continuation {
3684                        // Additional check: if the line starts with uppercase and looks like a new sentence,
3685                        // it's probably not a continuation
3686                        // BUT: for blockquote lines with sufficient effective indent, always treat as continuation
3687                        let line_content_raw = line_info.content(content);
3688                        let block_bq_level_lazy = block.blockquote_prefix.chars().filter(|&c| c == '>').count();
3689                        let line_bq_level_lazy = line_content_raw
3690                            .chars()
3691                            .take_while(|c| *c == '>' || c.is_whitespace())
3692                            .filter(|&c| c == '>')
3693                            .count();
3694                        let has_proper_blockquote_indent =
3695                            if line_bq_level_lazy > 0 && line_bq_level_lazy == block_bq_level_lazy {
3696                                // Compute effective indent after blockquote markers
3697                                let mut pos = 0;
3698                                let mut found_markers = 0;
3699                                for c in line_content_raw.chars() {
3700                                    pos += c.len_utf8();
3701                                    if c == '>' {
3702                                        found_markers += 1;
3703                                        if found_markers == line_bq_level_lazy {
3704                                            if line_content_raw.get(pos..pos + 1) == Some(" ") {
3705                                                pos += 1;
3706                                            }
3707                                            break;
3708                                        }
3709                                    }
3710                                }
3711                                let after_bq = &line_content_raw[pos..];
3712                                let effective_indent_lazy = after_bq.len() - after_bq.trim_start().len();
3713                                let min_required_for_bq = if block.is_ordered { last_marker_width } else { 2 };
3714                                effective_indent_lazy >= min_required_for_bq
3715                            } else {
3716                                false
3717                            };
3718
3719                        // If it has proper blockquote indent, it's a continuation regardless of uppercase
3720                        if has_proper_blockquote_indent {
3721                            block.end_line = line_num;
3722                        } else {
3723                            let content_to_check = if !blockquote_prefix.is_empty() {
3724                                // Strip blockquote prefix to check the actual content
3725                                line_info
3726                                    .content(content)
3727                                    .strip_prefix(&blockquote_prefix)
3728                                    .unwrap_or(line_info.content(content))
3729                                    .trim()
3730                            } else {
3731                                line_info.content(content).trim()
3732                            };
3733
3734                            let starts_with_uppercase =
3735                                content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
3736
3737                            // If it starts with uppercase and the previous line ended with punctuation,
3738                            // it's likely a new paragraph, not a continuation
3739                            if starts_with_uppercase && last_list_item_line > 0 {
3740                                // This looks like a new paragraph
3741                                list_blocks.push(block.clone());
3742                                current_block = None;
3743                            } else {
3744                                // This is a lazy continuation line
3745                                block.end_line = line_num;
3746                            }
3747                        }
3748                    } else {
3749                        // Non-indented, non-blank line that's not a lazy continuation - end the block
3750                        list_blocks.push(block.clone());
3751                        current_block = None;
3752                    }
3753                }
3754            }
3755        }
3756
3757        // Don't forget the last block
3758        if let Some(block) = current_block {
3759            list_blocks.push(block);
3760        }
3761
3762        // Merge adjacent blocks that should be one
3763        merge_adjacent_list_blocks(content, &mut list_blocks, lines);
3764
3765        list_blocks
3766    }
3767
3768    /// Compute character frequency for fast content analysis
3769    fn compute_char_frequency(content: &str) -> CharFrequency {
3770        let mut frequency = CharFrequency::default();
3771
3772        for ch in content.chars() {
3773            match ch {
3774                '#' => frequency.hash_count += 1,
3775                '*' => frequency.asterisk_count += 1,
3776                '_' => frequency.underscore_count += 1,
3777                '-' => frequency.hyphen_count += 1,
3778                '+' => frequency.plus_count += 1,
3779                '>' => frequency.gt_count += 1,
3780                '|' => frequency.pipe_count += 1,
3781                '[' => frequency.bracket_count += 1,
3782                '`' => frequency.backtick_count += 1,
3783                '<' => frequency.lt_count += 1,
3784                '!' => frequency.exclamation_count += 1,
3785                '\n' => frequency.newline_count += 1,
3786                _ => {}
3787            }
3788        }
3789
3790        frequency
3791    }
3792
3793    /// Parse HTML tags in the content
3794    fn parse_html_tags(
3795        content: &str,
3796        lines: &[LineInfo],
3797        code_blocks: &[(usize, usize)],
3798        flavor: MarkdownFlavor,
3799    ) -> Vec<HtmlTag> {
3800        static HTML_TAG_REGEX: LazyLock<regex::Regex> =
3801            LazyLock::new(|| regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9-]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap());
3802
3803        let mut html_tags = Vec::with_capacity(content.matches('<').count());
3804
3805        for cap in HTML_TAG_REGEX.captures_iter(content) {
3806            let full_match = cap.get(0).unwrap();
3807            let match_start = full_match.start();
3808            let match_end = full_match.end();
3809
3810            // Skip if in code block
3811            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3812                continue;
3813            }
3814
3815            let is_closing = !cap.get(1).unwrap().as_str().is_empty();
3816            let tag_name_original = cap.get(2).unwrap().as_str();
3817            let tag_name = tag_name_original.to_lowercase();
3818            let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
3819
3820            // Skip JSX components in MDX files (tags starting with uppercase letter)
3821            // JSX components like <Chart />, <MyComponent> should not be treated as HTML
3822            if flavor.supports_jsx() && tag_name_original.chars().next().is_some_and(|c| c.is_uppercase()) {
3823                continue;
3824            }
3825
3826            // Find which line this tag is on
3827            let mut line_num = 1;
3828            let mut col_start = match_start;
3829            let mut col_end = match_end;
3830            for (idx, line_info) in lines.iter().enumerate() {
3831                if match_start >= line_info.byte_offset {
3832                    line_num = idx + 1;
3833                    col_start = match_start - line_info.byte_offset;
3834                    col_end = match_end - line_info.byte_offset;
3835                } else {
3836                    break;
3837                }
3838            }
3839
3840            html_tags.push(HtmlTag {
3841                line: line_num,
3842                start_col: col_start,
3843                end_col: col_end,
3844                byte_offset: match_start,
3845                byte_end: match_end,
3846                tag_name,
3847                is_closing,
3848                is_self_closing,
3849                raw_content: full_match.as_str().to_string(),
3850            });
3851        }
3852
3853        html_tags
3854    }
3855
3856    /// Parse table rows in the content
3857    fn parse_table_rows(content: &str, lines: &[LineInfo]) -> Vec<TableRow> {
3858        let mut table_rows = Vec::with_capacity(lines.len() / 20);
3859
3860        for (line_idx, line_info) in lines.iter().enumerate() {
3861            // Skip lines in code blocks or blank lines
3862            if line_info.in_code_block || line_info.is_blank {
3863                continue;
3864            }
3865
3866            let line = line_info.content(content);
3867            let line_num = line_idx + 1;
3868
3869            // Check if this line contains pipes (potential table row)
3870            if !line.contains('|') {
3871                continue;
3872            }
3873
3874            // Count columns by splitting on pipes
3875            let parts: Vec<&str> = line.split('|').collect();
3876            let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
3877
3878            // Check if this is a separator row
3879            let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
3880            let mut column_alignments = Vec::new();
3881
3882            if is_separator {
3883                for part in &parts[1..parts.len() - 1] {
3884                    // Skip first and last empty parts
3885                    let trimmed = part.trim();
3886                    let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
3887                        "center".to_string()
3888                    } else if trimmed.ends_with(':') {
3889                        "right".to_string()
3890                    } else if trimmed.starts_with(':') {
3891                        "left".to_string()
3892                    } else {
3893                        "none".to_string()
3894                    };
3895                    column_alignments.push(alignment);
3896                }
3897            }
3898
3899            table_rows.push(TableRow {
3900                line: line_num,
3901                is_separator,
3902                column_count,
3903                column_alignments,
3904            });
3905        }
3906
3907        table_rows
3908    }
3909
3910    /// Parse bare URLs and emails in the content
3911    fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
3912        let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
3913
3914        // Check for bare URLs (not in angle brackets or markdown links)
3915        for cap in URL_SIMPLE_REGEX.captures_iter(content) {
3916            let full_match = cap.get(0).unwrap();
3917            let match_start = full_match.start();
3918            let match_end = full_match.end();
3919
3920            // Skip if in code block
3921            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3922                continue;
3923            }
3924
3925            // Skip if already in angle brackets or markdown links
3926            let preceding_char = if match_start > 0 {
3927                content.chars().nth(match_start - 1)
3928            } else {
3929                None
3930            };
3931            let following_char = content.chars().nth(match_end);
3932
3933            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
3934                continue;
3935            }
3936            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
3937                continue;
3938            }
3939
3940            let url = full_match.as_str();
3941            let url_type = if url.starts_with("https://") {
3942                "https"
3943            } else if url.starts_with("http://") {
3944                "http"
3945            } else if url.starts_with("ftp://") {
3946                "ftp"
3947            } else {
3948                "other"
3949            };
3950
3951            // Find which line this URL is on
3952            let mut line_num = 1;
3953            let mut col_start = match_start;
3954            let mut col_end = match_end;
3955            for (idx, line_info) in lines.iter().enumerate() {
3956                if match_start >= line_info.byte_offset {
3957                    line_num = idx + 1;
3958                    col_start = match_start - line_info.byte_offset;
3959                    col_end = match_end - line_info.byte_offset;
3960                } else {
3961                    break;
3962                }
3963            }
3964
3965            bare_urls.push(BareUrl {
3966                line: line_num,
3967                start_col: col_start,
3968                end_col: col_end,
3969                byte_offset: match_start,
3970                byte_end: match_end,
3971                url: url.to_string(),
3972                url_type: url_type.to_string(),
3973            });
3974        }
3975
3976        // Check for bare email addresses
3977        for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
3978            let full_match = cap.get(0).unwrap();
3979            let match_start = full_match.start();
3980            let match_end = full_match.end();
3981
3982            // Skip if in code block
3983            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3984                continue;
3985            }
3986
3987            // Skip if already in angle brackets or markdown links
3988            let preceding_char = if match_start > 0 {
3989                content.chars().nth(match_start - 1)
3990            } else {
3991                None
3992            };
3993            let following_char = content.chars().nth(match_end);
3994
3995            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
3996                continue;
3997            }
3998            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
3999                continue;
4000            }
4001
4002            let email = full_match.as_str();
4003
4004            // Find which line this email is on
4005            let mut line_num = 1;
4006            let mut col_start = match_start;
4007            let mut col_end = match_end;
4008            for (idx, line_info) in lines.iter().enumerate() {
4009                if match_start >= line_info.byte_offset {
4010                    line_num = idx + 1;
4011                    col_start = match_start - line_info.byte_offset;
4012                    col_end = match_end - line_info.byte_offset;
4013                } else {
4014                    break;
4015                }
4016            }
4017
4018            bare_urls.push(BareUrl {
4019                line: line_num,
4020                start_col: col_start,
4021                end_col: col_end,
4022                byte_offset: match_start,
4023                byte_end: match_end,
4024                url: email.to_string(),
4025                url_type: "email".to_string(),
4026            });
4027        }
4028
4029        bare_urls
4030    }
4031
    /// Get an iterator over valid CommonMark headings
    ///
    /// This iterator filters out malformed headings like `#NoSpace` (hashtag-like patterns)
    /// that should be flagged by MD018 but should not be processed by other heading rules.
    ///
    /// The iterator borrows this context's line table; the filtering itself lives in
    /// `ValidHeadingsIter` (NOTE(review): presumably the same `is_valid` flag that
    /// `has_valid_headings` checks — confirm against `ValidHeadingsIter`).
    ///
    /// # Examples
    ///
    /// ```rust
    /// use rumdl_lib::lint_context::LintContext;
    /// use rumdl_lib::config::MarkdownFlavor;
    ///
    /// let content = "# Valid Heading\n#NoSpace\n## Another Valid";
    /// let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
    ///
    /// for heading in ctx.valid_headings() {
    ///     println!("Line {}: {} (level {})", heading.line_num, heading.heading.text, heading.heading.level);
    /// }
    /// // Only prints valid headings, skips `#NoSpace`
    /// ```
    #[must_use]
    pub fn valid_headings(&self) -> ValidHeadingsIter<'_> {
        // Pure delegation: the iterator is built directly over `self.lines`.
        ValidHeadingsIter::new(&self.lines)
    }
4055
4056    /// Check if the document contains any valid CommonMark headings
4057    ///
4058    /// Returns `true` if there is at least one heading with proper space after `#`.
4059    #[must_use]
4060    pub fn has_valid_headings(&self) -> bool {
4061        self.lines
4062            .iter()
4063            .any(|line| line.heading.as_ref().is_some_and(|h| h.is_valid))
4064    }
4065}
4066
4067/// Merge adjacent list blocks that should be treated as one
4068fn merge_adjacent_list_blocks(content: &str, list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
4069    if list_blocks.len() < 2 {
4070        return;
4071    }
4072
4073    let mut merger = ListBlockMerger::new(content, lines);
4074    *list_blocks = merger.merge(list_blocks);
4075}
4076
4077/// Helper struct to manage the complex logic of merging list blocks
4078struct ListBlockMerger<'a> {
4079    content: &'a str,
4080    lines: &'a [LineInfo],
4081}
4082
4083impl<'a> ListBlockMerger<'a> {
4084    fn new(content: &'a str, lines: &'a [LineInfo]) -> Self {
4085        Self { content, lines }
4086    }
4087
4088    fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
4089        let mut merged = Vec::with_capacity(list_blocks.len());
4090        let mut current = list_blocks[0].clone();
4091
4092        for next in list_blocks.iter().skip(1) {
4093            if self.should_merge_blocks(&current, next) {
4094                current = self.merge_two_blocks(current, next);
4095            } else {
4096                merged.push(current);
4097                current = next.clone();
4098            }
4099        }
4100
4101        merged.push(current);
4102        merged
4103    }
4104
4105    /// Determine if two adjacent list blocks should be merged
4106    fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
4107        // Basic compatibility checks
4108        if !self.blocks_are_compatible(current, next) {
4109            return false;
4110        }
4111
4112        // Check spacing and content between blocks
4113        let spacing = self.analyze_spacing_between(current, next);
4114        match spacing {
4115            BlockSpacing::Consecutive => true,
4116            BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
4117            BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
4118                self.can_merge_with_content_between(current, next)
4119            }
4120        }
4121    }
4122
4123    /// Check if blocks have compatible structure for merging
4124    fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
4125        current.is_ordered == next.is_ordered
4126            && current.blockquote_prefix == next.blockquote_prefix
4127            && current.nesting_level == next.nesting_level
4128    }
4129
4130    /// Analyze the spacing between two list blocks
4131    fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
4132        let gap = next.start_line - current.end_line;
4133
4134        match gap {
4135            1 => BlockSpacing::Consecutive,
4136            2 => BlockSpacing::SingleBlank,
4137            _ if gap > 2 => {
4138                if self.has_only_blank_lines_between(current, next) {
4139                    BlockSpacing::MultipleBlanks
4140                } else {
4141                    BlockSpacing::ContentBetween
4142                }
4143            }
4144            _ => BlockSpacing::Consecutive, // gap == 0, overlapping (shouldn't happen)
4145        }
4146    }
4147
4148    /// Check if unordered lists can be merged with a single blank line between
4149    fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
4150        // Check if there are structural separators between the blocks
4151        // If has_meaningful_content_between returns true, it means there are structural separators
4152        if has_meaningful_content_between(self.content, current, next, self.lines) {
4153            return false; // Structural separators prevent merging
4154        }
4155
4156        // Only merge unordered lists with same marker across single blank
4157        !current.is_ordered && current.marker == next.marker
4158    }
4159
4160    /// Check if ordered lists can be merged when there's content between them
4161    fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
4162        // Do not merge lists if there are structural separators between them
4163        if has_meaningful_content_between(self.content, current, next, self.lines) {
4164            return false; // Structural separators prevent merging
4165        }
4166
4167        // Only consider merging ordered lists if there's no structural content between
4168        current.is_ordered && next.is_ordered
4169    }
4170
4171    /// Check if there are only blank lines between blocks
4172    fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
4173        for line_num in (current.end_line + 1)..next.start_line {
4174            if let Some(line_info) = self.lines.get(line_num - 1)
4175                && !line_info.content(self.content).trim().is_empty()
4176            {
4177                return false;
4178            }
4179        }
4180        true
4181    }
4182
4183    /// Merge two compatible list blocks into one
4184    fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
4185        current.end_line = next.end_line;
4186        current.item_lines.extend_from_slice(&next.item_lines);
4187
4188        // Update max marker width
4189        current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
4190
4191        // Handle marker consistency for unordered lists
4192        if !current.is_ordered && self.markers_differ(&current, next) {
4193            current.marker = None; // Mixed markers
4194        }
4195
4196        current
4197    }
4198
4199    /// Check if two blocks have different markers
4200    fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
4201        current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
4202    }
4203}
4204
/// Types of spacing between list blocks
///
/// Produced by `ListBlockMerger::analyze_spacing_between` from the line gap
/// between the end of one block and the start of the next.
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    /// No gap between blocks
    Consecutive,
    /// One blank line between blocks
    SingleBlank,
    /// Multiple blank lines but no content
    MultipleBlanks,
    /// Content exists between blocks
    ContentBetween,
}
4213
4214/// Check if there's meaningful content (not just blank lines) between two list blocks
4215fn has_meaningful_content_between(content: &str, current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
4216    // Check lines between current.end_line and next.start_line
4217    for line_num in (current.end_line + 1)..next.start_line {
4218        if let Some(line_info) = lines.get(line_num - 1) {
4219            // Convert to 0-indexed
4220            let trimmed = line_info.content(content).trim();
4221
4222            // Skip empty lines
4223            if trimmed.is_empty() {
4224                continue;
4225            }
4226
4227            // Check for structural separators that should separate lists (CommonMark compliant)
4228
4229            // Headings separate lists
4230            if line_info.heading.is_some() {
4231                return true; // Has meaningful content - headings separate lists
4232            }
4233
4234            // Horizontal rules separate lists (---, ***, ___)
4235            if is_horizontal_rule(trimmed) {
4236                return true; // Has meaningful content - horizontal rules separate lists
4237            }
4238
4239            // Tables separate lists
4240            if crate::utils::skip_context::is_table_line(trimmed) {
4241                return true; // Has meaningful content - tables separate lists
4242            }
4243
4244            // Blockquotes separate lists
4245            if trimmed.starts_with('>') {
4246                return true; // Has meaningful content - blockquotes separate lists
4247            }
4248
4249            // Code block fences separate lists (unless properly indented as list content)
4250            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
4251                let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
4252
4253                // Check if this code block is properly indented as list continuation
4254                let min_continuation_indent = if current.is_ordered {
4255                    current.nesting_level + current.max_marker_width + 1 // +1 for space after marker
4256                } else {
4257                    current.nesting_level + 2
4258                };
4259
4260                if line_indent < min_continuation_indent {
4261                    // This is a standalone code block that separates lists
4262                    return true; // Has meaningful content - standalone code blocks separate lists
4263                }
4264            }
4265
4266            // Check if this line has proper indentation for list continuation
4267            let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
4268
4269            // Calculate minimum indentation needed to be list continuation
4270            let min_indent = if current.is_ordered {
4271                current.nesting_level + current.max_marker_width
4272            } else {
4273                current.nesting_level + 2
4274            };
4275
4276            // If the line is not indented enough to be list continuation, it's meaningful content
4277            if line_indent < min_indent {
4278                return true; // Has meaningful content - content not indented as list continuation
4279            }
4280
4281            // If we reach here, the line is properly indented as list continuation
4282            // Continue checking other lines
4283        }
4284    }
4285
4286    // Only blank lines or properly indented list continuation content between blocks
4287    false
4288}
4289
4290/// Check if a line is a horizontal rule (---, ***, ___) per CommonMark spec.
4291/// CommonMark rules for thematic breaks (horizontal rules):
4292/// - May have 0-3 spaces of leading indentation (but NOT tabs)
4293/// - Must have 3+ of the same character (-, *, or _)
4294/// - May have spaces between characters
4295/// - No other characters allowed
4296pub fn is_horizontal_rule_line(line: &str) -> bool {
4297    // CommonMark: HRs can have 0-3 spaces of leading indentation, not tabs
4298    let leading_spaces = line.len() - line.trim_start_matches(' ').len();
4299    if leading_spaces > 3 || line.starts_with('\t') {
4300        return false;
4301    }
4302
4303    is_horizontal_rule_content(line.trim())
4304}
4305
/// Check whether already-trimmed text has the shape of a horizontal rule.
///
/// The text must begin with `-`, `*`, or `_`, contain that same character at
/// least three times, and contain nothing else except spaces and tabs.
/// Indentation is not examined here; use `is_horizontal_rule_line` for full
/// CommonMark compliance including the indentation check.
pub fn is_horizontal_rule_content(trimmed: &str) -> bool {
    // Only a rule character may open the line; empty input is rejected too
    let marker = match trimmed.chars().next() {
        Some(ch @ ('-' | '*' | '_')) => ch,
        _ => return false,
    };

    // Count markers, tolerating interleaved spaces/tabs and rejecting anything else.
    // Inputs shorter than three bytes can never reach a count of three.
    let mut marker_count = 0usize;
    for ch in trimmed.chars() {
        if ch == marker {
            marker_count += 1;
        } else if !matches!(ch, ' ' | '\t') {
            return false;
        }
    }

    marker_count >= 3
}
4330
/// Backwards-compatible alias for `is_horizontal_rule_content`
///
/// Forwards `trimmed` unchanged and returns the same result; like the function
/// it wraps, it performs no indentation check.
pub fn is_horizontal_rule(trimmed: &str) -> bool {
    is_horizontal_rule_content(trimmed)
}
4335
/// Unit tests for this module
4337#[cfg(test)]
4338mod tests {
4339    use super::*;
4340
    /// An empty document keeps a single line offset (0), maps offset 0 to
    /// position (1, 1), and has no line entries.
    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::Standard, None);
        assert_eq!(ctx.content, "");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.lines.len(), 0);
    }
4349
    /// A single line without a trailing newline has one line offset, and
    /// byte offsets map to 1-based (line, column) pairs.
    #[test]
    fn test_single_line() {
        let ctx = LintContext::new("# Hello", MarkdownFlavor::Standard, None);
        assert_eq!(ctx.content, "# Hello");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(3), (1, 4));
    }
4358
    /// Line offsets are the byte positions where each line starts (including a
    /// blank line), and offsets inside a line map to the expected column.
    #[test]
    fn test_multi_line() {
        let content = "# Title\n\nSecond line\nThird line";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);
        // Test offset to line/col
        assert_eq!(ctx.offset_to_line_col(0), (1, 1)); // start
        assert_eq!(ctx.offset_to_line_col(8), (2, 1)); // start of blank line
        assert_eq!(ctx.offset_to_line_col(9), (3, 1)); // start of 'Second line'
        assert_eq!(ctx.offset_to_line_col(15), (3, 7)); // middle of 'Second line'
        assert_eq!(ctx.offset_to_line_col(21), (4, 1)); // start of 'Third line'
    }
4371
    /// Per-line metadata (content, byte offset, indent, blank flag) is populated
    /// for each of the seven input lines, and the line lookup helpers agree with it.
    #[test]
    fn test_line_info() {
        let content = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);

        // Test line info
        assert_eq!(ctx.lines.len(), 7);

        // Line 1: "# Title"
        let line1 = &ctx.lines[0];
        assert_eq!(line1.content(ctx.content), "# Title");
        assert_eq!(line1.byte_offset, 0);
        assert_eq!(line1.indent, 0);
        assert!(!line1.is_blank);
        assert!(!line1.in_code_block);
        assert!(line1.list_item.is_none());

        // Line 2: "    indented"
        let line2 = &ctx.lines[1];
        assert_eq!(line2.content(ctx.content), "    indented");
        assert_eq!(line2.byte_offset, 8);
        assert_eq!(line2.indent, 4);
        assert!(!line2.is_blank);

        // Line 3: "" (blank)
        let line3 = &ctx.lines[2];
        assert_eq!(line3.content(ctx.content), "");
        assert!(line3.is_blank);

        // Test helper methods (both take 1-based line numbers)
        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
        assert_eq!(ctx.line_info(1).map(|l| l.indent), Some(0));
        assert_eq!(ctx.line_info(2).map(|l| l.indent), Some(4));
    }
4407
    /// Unordered and ordered list markers are detected with their marker text,
    /// columns, and number; a plain paragraph line carries no list item.
    #[test]
    fn test_list_item_detection() {
        let content = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);

        // Line 1: "- Unordered item"
        let line1 = &ctx.lines[0];
        assert!(line1.list_item.is_some());
        let list1 = line1.list_item.as_ref().unwrap();
        assert_eq!(list1.marker, "-");
        assert!(!list1.is_ordered);
        assert_eq!(list1.marker_column, 0);
        assert_eq!(list1.content_column, 2);

        // Line 2: "  * Nested item"
        let line2 = &ctx.lines[1];
        assert!(line2.list_item.is_some());
        let list2 = line2.list_item.as_ref().unwrap();
        assert_eq!(list2.marker, "*");
        assert_eq!(list2.marker_column, 2);

        // Line 3: "1. Ordered item"
        let line3 = &ctx.lines[2];
        assert!(line3.list_item.is_some());
        let list3 = line3.list_item.as_ref().unwrap();
        assert_eq!(list3.marker, "1.");
        assert!(list3.is_ordered);
        assert_eq!(list3.number, Some(1));

        // Line 6: "Not a list"
        let line6 = &ctx.lines[5];
        assert!(line6.list_item.is_none());
    }
4441
4442    #[test]
4443    fn test_offset_to_line_col_edge_cases() {
4444        let content = "a\nb\nc";
4445        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4446        // line_offsets: [0, 2, 4]
4447        assert_eq!(ctx.offset_to_line_col(0), (1, 1)); // 'a'
4448        assert_eq!(ctx.offset_to_line_col(1), (1, 2)); // after 'a'
4449        assert_eq!(ctx.offset_to_line_col(2), (2, 1)); // 'b'
4450        assert_eq!(ctx.offset_to_line_col(3), (2, 2)); // after 'b'
4451        assert_eq!(ctx.offset_to_line_col(4), (3, 1)); // 'c'
4452        assert_eq!(ctx.offset_to_line_col(5), (3, 2)); // after 'c'
4453    }
4454
    /// In the MDX flavor, leading `import`/`export` lines are flagged as ESM
    /// blocks, while the blank line, heading, and prose after them are not.
    #[test]
    fn test_mdx_esm_blocks() {
        let content = r##"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall

In {year}, the snowfall was above average.
It was followed by a warm spring which caused
flood conditions in many of the nearby rivers.

<Chart color="#fcb32c" year={year} />
"##;

        let ctx = LintContext::new(content, MarkdownFlavor::MDX, None);

        // Check that lines 1 and 2 are marked as ESM blocks
        assert_eq!(ctx.lines.len(), 10);
        assert!(ctx.lines[0].in_esm_block, "Line 1 (import) should be in_esm_block");
        assert!(ctx.lines[1].in_esm_block, "Line 2 (export) should be in_esm_block");
        assert!(!ctx.lines[2].in_esm_block, "Line 3 (blank) should NOT be in_esm_block");
        assert!(
            !ctx.lines[3].in_esm_block,
            "Line 4 (heading) should NOT be in_esm_block"
        );
        assert!(!ctx.lines[4].in_esm_block, "Line 5 (blank) should NOT be in_esm_block");
        assert!(!ctx.lines[5].in_esm_block, "Line 6 (text) should NOT be in_esm_block");
    }
4483
    /// The same `import`/`export` lines are not flagged as ESM blocks when the
    /// document is parsed with the Standard flavor.
    #[test]
    fn test_mdx_esm_blocks_not_detected_in_standard_flavor() {
        let content = r#"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall
"#;

        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);

        // ESM blocks should NOT be detected in Standard flavor
        assert!(
            !ctx.lines[0].in_esm_block,
            "Line 1 should NOT be in_esm_block in Standard flavor"
        );
        assert!(
            !ctx.lines[1].in_esm_block,
            "Line 2 should NOT be in_esm_block in Standard flavor"
        );
    }
4504
    /// Blockquote lines whose content is heavily indented are still recorded as
    /// blockquotes, with the content stripped of the padding after `>`.
    #[test]
    fn test_blockquote_with_indented_content() {
        // Lines with `>` followed by heavily-indented content should be detected as blockquotes.
        // The content inside the blockquote may also be detected as a code block (which is correct),
        // but for MD046 purposes, we need to know the line is inside a blockquote.
        let content = r#"# Heading

>      -S socket-path
>                    More text
"#;
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);

        // Line 3 (index 2) should be detected as blockquote
        assert!(
            ctx.lines.get(2).is_some_and(|l| l.blockquote.is_some()),
            "Line 3 should be a blockquote"
        );
        // Line 4 (index 3) should also be blockquote
        assert!(
            ctx.lines.get(3).is_some_and(|l| l.blockquote.is_some()),
            "Line 4 should be a blockquote"
        );

        // Verify blockquote content is correctly parsed
        // Note: spaces_after includes the spaces between `>` and content
        let bq3 = ctx.lines.get(2).unwrap().blockquote.as_ref().unwrap();
        assert_eq!(bq3.content, "-S socket-path");
        assert_eq!(bq3.nesting_level, 1);
        // 6 spaces after the `>` marker
        assert!(bq3.has_multiple_spaces_after_marker);

        let bq4 = ctx.lines.get(3).unwrap().blockquote.as_ref().unwrap();
        assert_eq!(bq4.content, "More text");
        assert_eq!(bq4.nesting_level, 1);
    }
4540
    /// Footnote definitions (`[^id]: ...`) are excluded from `reference_defs`;
    /// only the regular reference definition is recorded, with its URL and title.
    #[test]
    fn test_footnote_definitions_not_parsed_as_reference_defs() {
        // Footnote definitions use [^id]: syntax and should NOT be parsed as reference definitions
        let content = r#"# Title

A footnote[^1].

[^1]: This is the footnote content.

[^note]: Another footnote with [link](https://example.com).

[regular]: ./path.md "A real reference definition"
"#;
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);

        // Should only have one reference definition (the regular one)
        assert_eq!(
            ctx.reference_defs.len(),
            1,
            "Footnotes should not be parsed as reference definitions"
        );

        // The only reference def should be the regular one
        assert_eq!(ctx.reference_defs[0].id, "regular");
        assert_eq!(ctx.reference_defs[0].url, "./path.md");
        assert_eq!(
            ctx.reference_defs[0].title,
            Some("A real reference definition".to_string())
        );
    }
4571
    /// A footnote whose body is an inline link must not yield a reference definition.
    #[test]
    fn test_footnote_with_inline_link_not_misidentified() {
        // Regression test for issue #286: footnote containing an inline link
        // was incorrectly parsed as a reference definition with URL "[link](url)"
        let content = r#"# Title

A footnote[^1].

[^1]: [link](https://www.google.com).
"#;
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);

        // Should have no reference definitions
        assert!(
            ctx.reference_defs.is_empty(),
            "Footnote with inline link should not create a reference definition"
        );
    }
4590
    /// Numeric, named, single-character, hyphenated, and mixed footnote IDs are all
    /// excluded, leaving only the two regular reference definitions.
    #[test]
    fn test_various_footnote_formats_excluded() {
        // Test various footnote ID formats are all excluded
        let content = r#"[^1]: Numeric footnote
[^note]: Named footnote
[^a]: Single char footnote
[^long-footnote-name]: Long named footnote
[^123abc]: Mixed alphanumeric

[ref1]: ./file1.md
[ref2]: ./file2.md
"#;
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);

        // Should only have the two regular reference definitions
        assert_eq!(
            ctx.reference_defs.len(),
            2,
            "Only regular reference definitions should be parsed"
        );

        let ids: Vec<&str> = ctx.reference_defs.iter().map(|r| r.id.as_str()).collect();
        assert!(ids.contains(&"ref1"));
        assert!(ids.contains(&"ref2"));
        // No recorded ID may look like a footnote label
        assert!(!ids.iter().any(|id| id.starts_with('^')));
    }
4617
4618    // =========================================================================
4619    // Tests for has_char and char_count methods
4620    // =========================================================================
4621
4622    #[test]
4623    fn test_has_char_tracked_characters() {
4624        // Test all 12 tracked characters
4625        let content = "# Heading\n* list item\n_emphasis_ and -hyphen-\n+ plus\n> quote\n| table |\n[link]\n`code`\n<html>\n!image";
4626        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4627
4628        // All tracked characters should be detected
4629        assert!(ctx.has_char('#'), "Should detect hash");
4630        assert!(ctx.has_char('*'), "Should detect asterisk");
4631        assert!(ctx.has_char('_'), "Should detect underscore");
4632        assert!(ctx.has_char('-'), "Should detect hyphen");
4633        assert!(ctx.has_char('+'), "Should detect plus");
4634        assert!(ctx.has_char('>'), "Should detect gt");
4635        assert!(ctx.has_char('|'), "Should detect pipe");
4636        assert!(ctx.has_char('['), "Should detect bracket");
4637        assert!(ctx.has_char('`'), "Should detect backtick");
4638        assert!(ctx.has_char('<'), "Should detect lt");
4639        assert!(ctx.has_char('!'), "Should detect exclamation");
4640        assert!(ctx.has_char('\n'), "Should detect newline");
4641    }
4642
4643    #[test]
4644    fn test_has_char_absent_characters() {
4645        let content = "Simple text without special chars";
4646        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4647
4648        // None of the tracked characters should be present
4649        assert!(!ctx.has_char('#'), "Should not detect hash");
4650        assert!(!ctx.has_char('*'), "Should not detect asterisk");
4651        assert!(!ctx.has_char('_'), "Should not detect underscore");
4652        assert!(!ctx.has_char('-'), "Should not detect hyphen");
4653        assert!(!ctx.has_char('+'), "Should not detect plus");
4654        assert!(!ctx.has_char('>'), "Should not detect gt");
4655        assert!(!ctx.has_char('|'), "Should not detect pipe");
4656        assert!(!ctx.has_char('['), "Should not detect bracket");
4657        assert!(!ctx.has_char('`'), "Should not detect backtick");
4658        assert!(!ctx.has_char('<'), "Should not detect lt");
4659        assert!(!ctx.has_char('!'), "Should not detect exclamation");
4660        // Note: single line content has no newlines
4661        assert!(!ctx.has_char('\n'), "Should not detect newline in single line");
4662    }
4663
    /// Characters outside the tracked set are still detected when present and
    /// reported absent when missing.
    #[test]
    fn test_has_char_fallback_for_untracked() {
        let content = "Text with @mention and $dollar and %percent";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);

        // Untracked characters should fall back to content.contains()
        assert!(ctx.has_char('@'), "Should detect @ via fallback");
        assert!(ctx.has_char('$'), "Should detect $ via fallback");
        assert!(ctx.has_char('%'), "Should detect % via fallback");
        assert!(!ctx.has_char('^'), "Should not detect absent ^ via fallback");
    }
4675
4676    #[test]
4677    fn test_char_count_tracked_characters() {
4678        let content = "## Heading ##\n***bold***\n__emphasis__\n---\n+++\n>> nested\n|| table ||\n[[link]]\n``code``\n<<html>>\n!!";
4679        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4680
4681        // Count each tracked character
4682        assert_eq!(ctx.char_count('#'), 4, "Should count 4 hashes");
4683        assert_eq!(ctx.char_count('*'), 6, "Should count 6 asterisks");
4684        assert_eq!(ctx.char_count('_'), 4, "Should count 4 underscores");
4685        assert_eq!(ctx.char_count('-'), 3, "Should count 3 hyphens");
4686        assert_eq!(ctx.char_count('+'), 3, "Should count 3 pluses");
4687        assert_eq!(ctx.char_count('>'), 4, "Should count 4 gt (2 nested + 2 in <<html>>)");
4688        assert_eq!(ctx.char_count('|'), 4, "Should count 4 pipes");
4689        assert_eq!(ctx.char_count('['), 2, "Should count 2 brackets");
4690        assert_eq!(ctx.char_count('`'), 4, "Should count 4 backticks");
4691        assert_eq!(ctx.char_count('<'), 2, "Should count 2 lt");
4692        assert_eq!(ctx.char_count('!'), 2, "Should count 2 exclamations");
4693        assert_eq!(ctx.char_count('\n'), 10, "Should count 10 newlines");
4694    }
4695
4696    #[test]
4697    fn test_char_count_zero_for_absent() {
4698        let content = "Plain text";
4699        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4700
4701        assert_eq!(ctx.char_count('#'), 0);
4702        assert_eq!(ctx.char_count('*'), 0);
4703        assert_eq!(ctx.char_count('_'), 0);
4704        assert_eq!(ctx.char_count('\n'), 0);
4705    }
4706
4707    #[test]
4708    fn test_char_count_fallback_for_untracked() {
4709        let content = "@@@ $$ %%%";
4710        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4711
4712        assert_eq!(ctx.char_count('@'), 3, "Should count 3 @ via fallback");
4713        assert_eq!(ctx.char_count('$'), 2, "Should count 2 $ via fallback");
4714        assert_eq!(ctx.char_count('%'), 3, "Should count 3 % via fallback");
4715        assert_eq!(ctx.char_count('^'), 0, "Should count 0 for absent char");
4716    }
4717
    /// For empty content every count is zero and no character is reported,
    /// for tracked (`#`, `*`) and untracked (`@`) characters alike.
    #[test]
    fn test_char_count_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::Standard, None);

        assert_eq!(ctx.char_count('#'), 0);
        assert_eq!(ctx.char_count('*'), 0);
        assert_eq!(ctx.char_count('@'), 0);
        assert!(!ctx.has_char('#'));
        assert!(!ctx.has_char('@'));
    }
4728
4729    // =========================================================================
4730    // Tests for is_in_html_tag method
4731    // =========================================================================
4732
    /// Byte positions from `<` through `>` of the opening and closing tags are
    /// inside a tag; the text between the tags is not.
    #[test]
    fn test_is_in_html_tag_simple() {
        let content = "<div>content</div>";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);

        // Inside opening tag
        assert!(ctx.is_in_html_tag(0), "Position 0 (<) should be in tag");
        assert!(ctx.is_in_html_tag(1), "Position 1 (d) should be in tag");
        assert!(ctx.is_in_html_tag(4), "Position 4 (>) should be in tag");

        // Outside tag (in content, which occupies bytes 5 through 11)
        assert!(!ctx.is_in_html_tag(5), "Position 5 (c) should not be in tag");
        assert!(!ctx.is_in_html_tag(10), "Position 10 (t) should not be in tag");

        // Inside closing tag
        assert!(ctx.is_in_html_tag(12), "Position 12 (<) should be in tag");
        assert!(ctx.is_in_html_tag(17), "Position 17 (>) should be in tag");
    }
4751
    /// A self-closing tag embedded in text covers exactly its own bytes; the
    /// surrounding text and spaces are outside it.
    #[test]
    fn test_is_in_html_tag_self_closing() {
        let content = "Text <br/> more text";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);

        // Before tag
        assert!(!ctx.is_in_html_tag(0), "Position 0 should not be in tag");
        assert!(!ctx.is_in_html_tag(4), "Position 4 (space) should not be in tag");

        // Inside self-closing tag
        assert!(ctx.is_in_html_tag(5), "Position 5 (<) should be in tag");
        assert!(ctx.is_in_html_tag(8), "Position 8 (/) should be in tag");
        assert!(ctx.is_in_html_tag(9), "Position 9 (>) should be in tag");

        // After tag
        assert!(!ctx.is_in_html_tag(10), "Position 10 (space) should not be in tag");
    }
4769
    /// Attribute names and quoted values count as inside the opening tag; the
    /// link text between the tags does not.
    #[test]
    fn test_is_in_html_tag_with_attributes() {
        let content = r#"<a href="url" class="link">text</a>"#;
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);

        // All positions inside opening tag with attributes
        assert!(ctx.is_in_html_tag(0), "Start of tag");
        assert!(ctx.is_in_html_tag(10), "Inside href attribute");
        assert!(ctx.is_in_html_tag(20), "Inside class attribute");
        assert!(ctx.is_in_html_tag(26), "End of opening tag");

        // Content between tags
        assert!(!ctx.is_in_html_tag(27), "Start of content");
        assert!(!ctx.is_in_html_tag(30), "End of content");

        // Closing tag
        assert!(ctx.is_in_html_tag(31), "Start of closing tag");
    }
4788
4789    #[test]
4790    fn test_is_in_html_tag_multiline() {
4791        let content = "<div\n  class=\"test\"\n>\ncontent\n</div>";
4792        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4793
4794        // Opening tag spans multiple lines
4795        assert!(ctx.is_in_html_tag(0), "Start of multiline tag");
4796        assert!(ctx.is_in_html_tag(5), "After first newline in tag");
4797        assert!(ctx.is_in_html_tag(15), "Inside attribute");
4798
4799        // After closing > of opening tag
4800        let closing_bracket_pos = content.find(">\n").unwrap();
4801        assert!(!ctx.is_in_html_tag(closing_bracket_pos + 2), "Content after tag");
4802    }
4803
4804    #[test]
4805    fn test_is_in_html_tag_no_tags() {
4806        let content = "Plain text without any HTML";
4807        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4808
4809        // No position should be in an HTML tag
4810        for i in 0..content.len() {
4811            assert!(!ctx.is_in_html_tag(i), "Position {i} should not be in tag");
4812        }
4813    }
4814
4815    // =========================================================================
4816    // Tests for is_in_jinja_range method
4817    // =========================================================================
4818
4819    #[test]
4820    fn test_is_in_jinja_range_expression() {
4821        let content = "Hello {{ name }}!";
4822        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4823
4824        // Before Jinja
4825        assert!(!ctx.is_in_jinja_range(0), "H should not be in Jinja");
4826        assert!(!ctx.is_in_jinja_range(5), "Space before Jinja should not be in Jinja");
4827
4828        // Inside Jinja expression (positions 6-15 for "{{ name }}")
4829        assert!(ctx.is_in_jinja_range(6), "First brace should be in Jinja");
4830        assert!(ctx.is_in_jinja_range(7), "Second brace should be in Jinja");
4831        assert!(ctx.is_in_jinja_range(10), "name should be in Jinja");
4832        assert!(ctx.is_in_jinja_range(14), "Closing brace should be in Jinja");
4833        assert!(ctx.is_in_jinja_range(15), "Second closing brace should be in Jinja");
4834
4835        // After Jinja
4836        assert!(!ctx.is_in_jinja_range(16), "! should not be in Jinja");
4837    }
4838
4839    #[test]
4840    fn test_is_in_jinja_range_statement() {
4841        let content = "{% if condition %}content{% endif %}";
4842        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4843
4844        // Inside opening statement
4845        assert!(ctx.is_in_jinja_range(0), "Start of Jinja statement");
4846        assert!(ctx.is_in_jinja_range(5), "condition should be in Jinja");
4847        assert!(ctx.is_in_jinja_range(17), "End of opening statement");
4848
4849        // Content between
4850        assert!(!ctx.is_in_jinja_range(18), "content should not be in Jinja");
4851
4852        // Inside closing statement
4853        assert!(ctx.is_in_jinja_range(25), "Start of endif");
4854        assert!(ctx.is_in_jinja_range(32), "endif should be in Jinja");
4855    }
4856
4857    #[test]
4858    fn test_is_in_jinja_range_multiple() {
4859        let content = "{{ a }} and {{ b }}";
4860        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4861
4862        // First Jinja expression
4863        assert!(ctx.is_in_jinja_range(0));
4864        assert!(ctx.is_in_jinja_range(3));
4865        assert!(ctx.is_in_jinja_range(6));
4866
4867        // Between expressions
4868        assert!(!ctx.is_in_jinja_range(8));
4869        assert!(!ctx.is_in_jinja_range(11));
4870
4871        // Second Jinja expression
4872        assert!(ctx.is_in_jinja_range(12));
4873        assert!(ctx.is_in_jinja_range(15));
4874        assert!(ctx.is_in_jinja_range(18));
4875    }
4876
4877    #[test]
4878    fn test_is_in_jinja_range_no_jinja() {
4879        let content = "Plain text with single braces but not Jinja";
4880        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4881
4882        // No position should be in Jinja
4883        for i in 0..content.len() {
4884            assert!(!ctx.is_in_jinja_range(i), "Position {i} should not be in Jinja");
4885        }
4886    }
4887
4888    // =========================================================================
4889    // Tests for is_in_link_title method
4890    // =========================================================================
4891
4892    #[test]
4893    fn test_is_in_link_title_with_title() {
4894        let content = r#"[ref]: https://example.com "Title text"
4895
4896Some content."#;
4897        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4898
4899        // Verify we have a reference def with title
4900        assert_eq!(ctx.reference_defs.len(), 1);
4901        let def = &ctx.reference_defs[0];
4902        assert!(def.title_byte_start.is_some());
4903        assert!(def.title_byte_end.is_some());
4904
4905        let title_start = def.title_byte_start.unwrap();
4906        let title_end = def.title_byte_end.unwrap();
4907
4908        // Before title (in URL)
4909        assert!(!ctx.is_in_link_title(10), "URL should not be in title");
4910
4911        // Inside title
4912        assert!(ctx.is_in_link_title(title_start), "Title start should be in title");
4913        assert!(
4914            ctx.is_in_link_title(title_start + 5),
4915            "Middle of title should be in title"
4916        );
4917        assert!(ctx.is_in_link_title(title_end - 1), "End of title should be in title");
4918
4919        // After title
4920        assert!(
4921            !ctx.is_in_link_title(title_end),
4922            "After title end should not be in title"
4923        );
4924    }
4925
4926    #[test]
4927    fn test_is_in_link_title_without_title() {
4928        let content = "[ref]: https://example.com\n\nSome content.";
4929        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4930
4931        // Reference def without title
4932        assert_eq!(ctx.reference_defs.len(), 1);
4933        let def = &ctx.reference_defs[0];
4934        assert!(def.title_byte_start.is_none());
4935        assert!(def.title_byte_end.is_none());
4936
4937        // No position should be in a title
4938        for i in 0..content.len() {
4939            assert!(!ctx.is_in_link_title(i), "Position {i} should not be in title");
4940        }
4941    }
4942
4943    #[test]
4944    fn test_is_in_link_title_multiple_refs() {
4945        let content = r#"[ref1]: /url1 "Title One"
4946[ref2]: /url2
4947[ref3]: /url3 "Title Three"
4948"#;
4949        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4950
4951        // Should have 3 reference defs
4952        assert_eq!(ctx.reference_defs.len(), 3);
4953
4954        // ref1 has title
4955        let ref1 = ctx.reference_defs.iter().find(|r| r.id == "ref1").unwrap();
4956        assert!(ref1.title_byte_start.is_some());
4957
4958        // ref2 has no title
4959        let ref2 = ctx.reference_defs.iter().find(|r| r.id == "ref2").unwrap();
4960        assert!(ref2.title_byte_start.is_none());
4961
4962        // ref3 has title
4963        let ref3 = ctx.reference_defs.iter().find(|r| r.id == "ref3").unwrap();
4964        assert!(ref3.title_byte_start.is_some());
4965
4966        // Check positions in ref1's title
4967        if let (Some(start), Some(end)) = (ref1.title_byte_start, ref1.title_byte_end) {
4968            assert!(ctx.is_in_link_title(start + 1));
4969            assert!(!ctx.is_in_link_title(end + 5));
4970        }
4971
4972        // Check positions in ref3's title
4973        if let (Some(start), Some(_end)) = (ref3.title_byte_start, ref3.title_byte_end) {
4974            assert!(ctx.is_in_link_title(start + 1));
4975        }
4976    }
4977
4978    #[test]
4979    fn test_is_in_link_title_single_quotes() {
4980        let content = "[ref]: /url 'Single quoted title'\n";
4981        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4982
4983        assert_eq!(ctx.reference_defs.len(), 1);
4984        let def = &ctx.reference_defs[0];
4985
4986        if let (Some(start), Some(end)) = (def.title_byte_start, def.title_byte_end) {
4987            assert!(ctx.is_in_link_title(start));
4988            assert!(ctx.is_in_link_title(start + 5));
4989            assert!(!ctx.is_in_link_title(end));
4990        }
4991    }
4992
4993    #[test]
4994    fn test_is_in_link_title_parentheses() {
4995        // Note: The reference def parser may not support parenthesized titles
4996        // This test verifies the is_in_link_title method works when titles exist
4997        let content = "[ref]: /url (Parenthesized title)\n";
4998        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
4999
5000        // Parser behavior: may or may not parse parenthesized titles
5001        // We test that is_in_link_title correctly reflects whatever was parsed
5002        if ctx.reference_defs.is_empty() {
5003            // Parser didn't recognize this as a reference def
5004            for i in 0..content.len() {
5005                assert!(!ctx.is_in_link_title(i));
5006            }
5007        } else {
5008            let def = &ctx.reference_defs[0];
5009            if let (Some(start), Some(end)) = (def.title_byte_start, def.title_byte_end) {
5010                assert!(ctx.is_in_link_title(start));
5011                assert!(ctx.is_in_link_title(start + 5));
5012                assert!(!ctx.is_in_link_title(end));
5013            } else {
5014                // Title wasn't parsed, so no position should be in title
5015                for i in 0..content.len() {
5016                    assert!(!ctx.is_in_link_title(i));
5017                }
5018            }
5019        }
5020    }
5021
5022    #[test]
5023    fn test_is_in_link_title_no_refs() {
5024        let content = "Just plain text without any reference definitions.";
5025        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
5026
5027        assert!(ctx.reference_defs.is_empty());
5028
5029        for i in 0..content.len() {
5030            assert!(!ctx.is_in_link_title(i));
5031        }
5032    }
5033
5034    // =========================================================================
5035    // Math span tests (Issue #289)
5036    // =========================================================================
5037
5038    #[test]
5039    fn test_math_spans_inline() {
5040        let content = "Text with inline math $[f](x)$ in it.";
5041        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
5042
5043        let math_spans = ctx.math_spans();
5044        assert_eq!(math_spans.len(), 1, "Should detect one inline math span");
5045
5046        let span = &math_spans[0];
5047        assert!(!span.is_display, "Should be inline math, not display");
5048        assert_eq!(span.content, "[f](x)", "Content should be extracted correctly");
5049    }
5050
5051    #[test]
5052    fn test_math_spans_display_single_line() {
5053        let content = "$$X(\\zeta) = \\mathcal Z [x](\\zeta)$$";
5054        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
5055
5056        let math_spans = ctx.math_spans();
5057        assert_eq!(math_spans.len(), 1, "Should detect one display math span");
5058
5059        let span = &math_spans[0];
5060        assert!(span.is_display, "Should be display math");
5061        assert!(
5062            span.content.contains("[x](\\zeta)"),
5063            "Content should contain the link-like pattern"
5064        );
5065    }
5066
5067    #[test]
5068    fn test_math_spans_display_multiline() {
5069        let content = "Before\n\n$$\n[x](\\zeta) = \\sum_k x(k)\n$$\n\nAfter";
5070        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
5071
5072        let math_spans = ctx.math_spans();
5073        assert_eq!(math_spans.len(), 1, "Should detect one display math span");
5074
5075        let span = &math_spans[0];
5076        assert!(span.is_display, "Should be display math");
5077    }
5078
5079    #[test]
5080    fn test_is_in_math_span() {
5081        let content = "Text $[f](x)$ more text";
5082        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
5083
5084        // Position inside the math span
5085        let math_start = content.find('$').unwrap();
5086        let math_end = content.rfind('$').unwrap() + 1;
5087
5088        assert!(
5089            ctx.is_in_math_span(math_start + 1),
5090            "Position inside math span should return true"
5091        );
5092        assert!(
5093            ctx.is_in_math_span(math_start + 3),
5094            "Position inside math span should return true"
5095        );
5096
5097        // Position outside the math span
5098        assert!(!ctx.is_in_math_span(0), "Position before math span should return false");
5099        assert!(
5100            !ctx.is_in_math_span(math_end + 1),
5101            "Position after math span should return false"
5102        );
5103    }
5104
5105    #[test]
5106    fn test_math_spans_mixed_with_code() {
5107        let content = "Math $[f](x)$ and code `[g](y)` mixed";
5108        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
5109
5110        let math_spans = ctx.math_spans();
5111        let code_spans = ctx.code_spans();
5112
5113        assert_eq!(math_spans.len(), 1, "Should have one math span");
5114        assert_eq!(code_spans.len(), 1, "Should have one code span");
5115
5116        // Verify math span content
5117        assert_eq!(math_spans[0].content, "[f](x)");
5118        // Verify code span content
5119        assert_eq!(code_spans[0].content, "[g](y)");
5120    }
5121
5122    #[test]
5123    fn test_math_spans_no_math() {
5124        let content = "Regular text without any math at all.";
5125        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
5126
5127        let math_spans = ctx.math_spans();
5128        assert!(math_spans.is_empty(), "Should have no math spans");
5129    }
5130
5131    #[test]
5132    fn test_math_spans_multiple() {
5133        let content = "First $a$ and second $b$ and display $$c$$";
5134        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
5135
5136        let math_spans = ctx.math_spans();
5137        assert_eq!(math_spans.len(), 3, "Should detect three math spans");
5138
5139        // Two inline, one display
5140        let inline_count = math_spans.iter().filter(|s| !s.is_display).count();
5141        let display_count = math_spans.iter().filter(|s| s.is_display).count();
5142
5143        assert_eq!(inline_count, 2, "Should have two inline math spans");
5144        assert_eq!(display_count, 1, "Should have one display math span");
5145    }
5146
5147    #[test]
5148    fn test_is_in_math_span_boundary_positions() {
5149        // Test exact boundary positions: $[f](x)$
5150        // Byte positions:                0123456789
5151        let content = "$[f](x)$";
5152        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
5153
5154        let math_spans = ctx.math_spans();
5155        assert_eq!(math_spans.len(), 1, "Should have one math span");
5156
5157        let span = &math_spans[0];
5158
5159        // Position at opening $ should be in span (byte 0)
5160        assert!(
5161            ctx.is_in_math_span(span.byte_offset),
5162            "Start position should be in span"
5163        );
5164
5165        // Position just inside should be in span
5166        assert!(
5167            ctx.is_in_math_span(span.byte_offset + 1),
5168            "Position after start should be in span"
5169        );
5170
5171        // Position at closing $ should be in span (exclusive end means we check byte_end - 1)
5172        assert!(
5173            ctx.is_in_math_span(span.byte_end - 1),
5174            "Position at end-1 should be in span"
5175        );
5176
5177        // Position at byte_end should NOT be in span (exclusive end)
5178        assert!(
5179            !ctx.is_in_math_span(span.byte_end),
5180            "Position at byte_end should NOT be in span (exclusive)"
5181        );
5182    }
5183
5184    #[test]
5185    fn test_math_spans_at_document_start() {
5186        let content = "$x$ text";
5187        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
5188
5189        let math_spans = ctx.math_spans();
5190        assert_eq!(math_spans.len(), 1);
5191        assert_eq!(math_spans[0].byte_offset, 0, "Math should start at byte 0");
5192    }
5193
5194    #[test]
5195    fn test_math_spans_at_document_end() {
5196        let content = "text $x$";
5197        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
5198
5199        let math_spans = ctx.math_spans();
5200        assert_eq!(math_spans.len(), 1);
5201        assert_eq!(math_spans[0].byte_end, content.len(), "Math should end at document end");
5202    }
5203
5204    #[test]
5205    fn test_math_spans_consecutive() {
5206        let content = "$a$$b$";
5207        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
5208
5209        let math_spans = ctx.math_spans();
5210        // pulldown-cmark should parse these as separate spans
5211        assert!(!math_spans.is_empty(), "Should detect at least one math span");
5212
5213        // All positions should be in some math span
5214        for i in 0..content.len() {
5215            assert!(ctx.is_in_math_span(i), "Position {i} should be in a math span");
5216        }
5217    }
5218
5219    #[test]
5220    fn test_math_spans_currency_not_math() {
5221        // Unbalanced $ should not create math spans
5222        let content = "Price is $100";
5223        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
5224
5225        let math_spans = ctx.math_spans();
5226        // pulldown-cmark requires balanced delimiters for math
5227        // $100 alone is not math
5228        assert!(
5229            math_spans.is_empty() || !math_spans.iter().any(|s| s.content.contains("100")),
5230            "Unbalanced $ should not create math span containing 100"
5231        );
5232    }
5233
5234    // =========================================================================
5235    // Tests for O(1) reference definition lookups via HashMap
5236    // =========================================================================
5237
5238    #[test]
5239    fn test_reference_lookup_o1_basic() {
5240        let content = r#"[ref1]: /url1
5241[REF2]: /url2 "Title"
5242[Ref3]: /url3
5243
5244Use [link][ref1] and [link][REF2]."#;
5245        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
5246
5247        // Verify we have 3 reference defs
5248        assert_eq!(ctx.reference_defs.len(), 3);
5249
5250        // Test get_reference_url with various cases
5251        assert_eq!(ctx.get_reference_url("ref1"), Some("/url1"));
5252        assert_eq!(ctx.get_reference_url("REF1"), Some("/url1")); // case insensitive
5253        assert_eq!(ctx.get_reference_url("Ref1"), Some("/url1")); // case insensitive
5254        assert_eq!(ctx.get_reference_url("ref2"), Some("/url2"));
5255        assert_eq!(ctx.get_reference_url("REF2"), Some("/url2"));
5256        assert_eq!(ctx.get_reference_url("ref3"), Some("/url3"));
5257        assert_eq!(ctx.get_reference_url("nonexistent"), None);
5258    }
5259
5260    #[test]
5261    fn test_reference_lookup_o1_get_reference_def() {
5262        let content = r#"[myref]: https://example.com "My Title"
5263"#;
5264        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
5265
5266        // Test get_reference_def
5267        let def = ctx.get_reference_def("myref").expect("Should find myref");
5268        assert_eq!(def.url, "https://example.com");
5269        assert_eq!(def.title.as_deref(), Some("My Title"));
5270
5271        // Case insensitive
5272        let def2 = ctx.get_reference_def("MYREF").expect("Should find MYREF");
5273        assert_eq!(def2.url, "https://example.com");
5274
5275        // Non-existent
5276        assert!(ctx.get_reference_def("nonexistent").is_none());
5277    }
5278
5279    #[test]
5280    fn test_reference_lookup_o1_has_reference_def() {
5281        let content = r#"[foo]: /foo
5282[BAR]: /bar
5283"#;
5284        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
5285
5286        // Test has_reference_def
5287        assert!(ctx.has_reference_def("foo"));
5288        assert!(ctx.has_reference_def("FOO")); // case insensitive
5289        assert!(ctx.has_reference_def("bar"));
5290        assert!(ctx.has_reference_def("Bar")); // case insensitive
5291        assert!(!ctx.has_reference_def("baz")); // doesn't exist
5292    }
5293
5294    #[test]
5295    fn test_reference_lookup_o1_empty_content() {
5296        let content = "No references here.";
5297        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
5298
5299        assert!(ctx.reference_defs.is_empty());
5300        assert_eq!(ctx.get_reference_url("anything"), None);
5301        assert!(ctx.get_reference_def("anything").is_none());
5302        assert!(!ctx.has_reference_def("anything"));
5303    }
5304
5305    #[test]
5306    fn test_reference_lookup_o1_special_characters_in_id() {
5307        let content = r#"[ref-with-dash]: /url1
5308[ref_with_underscore]: /url2
5309[ref.with.dots]: /url3
5310"#;
5311        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
5312
5313        assert_eq!(ctx.get_reference_url("ref-with-dash"), Some("/url1"));
5314        assert_eq!(ctx.get_reference_url("ref_with_underscore"), Some("/url2"));
5315        assert_eq!(ctx.get_reference_url("ref.with.dots"), Some("/url3"));
5316    }
5317
5318    #[test]
5319    fn test_reference_lookup_o1_unicode_id() {
5320        let content = r#"[日本語]: /japanese
5321[émoji]: /emoji
5322"#;
5323        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
5324
5325        assert_eq!(ctx.get_reference_url("日本語"), Some("/japanese"));
5326        assert_eq!(ctx.get_reference_url("émoji"), Some("/emoji"));
5327        assert_eq!(ctx.get_reference_url("ÉMOJI"), Some("/emoji")); // uppercase
5328    }
5329}
rumdl_lib/lint_context.rs

rumdl_lib/
lint_context.rs