rumdl_lib/
lint_context.rs

1use crate::config::MarkdownFlavor;
2use crate::rules::front_matter_utils::FrontMatterUtils;
3use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
4use pulldown_cmark::{BrokenLink, Event, LinkType, Options, Parser, Tag, TagEnd};
5use regex::Regex;
6use std::borrow::Cow;
7use std::path::PathBuf;
8use std::sync::LazyLock;
9
/// Profiling wrapper used around each pre-computation pass.
///
/// Evaluates `$code` exactly once and yields its value. When `$profile` is
/// true, the elapsed wall-clock time is written to stderr as
/// `[PROFILE] <name>: <duration>`. This variant is compiled on every target
/// except `wasm32`.
#[cfg(not(target_arch = "wasm32"))]
macro_rules! profile_section {
    ($name:expr, $profile:expr, $code:expr) => {{
        let timer = std::time::Instant::now();
        let output = $code;
        if $profile {
            eprintln!("[PROFILE] {}: {:?}", $name, timer.elapsed());
        }
        output
    }};
}
22
/// `wasm32` variant of `profile_section!`.
///
/// Expands to just `$code`: `$name` and `$profile` are accepted so call sites
/// look the same on every target, but they are never expanded, so no timing is
/// taken and nothing is printed.
23#[cfg(target_arch = "wasm32")]
24macro_rules! profile_section {
25    ($name:expr, $profile:expr, $code:expr) => {{ $code }};
26}
27
28// Comprehensive link pattern that captures both inline links (`[text](url "title")`)
// and reference links (`[text][ref]`).
29// Flags `(?sx)`: `s` makes `.` match newlines; `x` (verbose mode) lets the pattern
// carry insignificant whitespace and the trailing `#` comments seen below.
// Capture groups: 1 = text, 2 = `<url>` form, 3 = bare url, 4/5 = title, 6 = reference id.
30static LINK_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
31    Regex::new(
32        r#"(?sx)
33        \[((?:[^\[\]\\]|\\.)*)\]          # Link text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
34        (?:
35            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
36            |
37            \[([^\]]*)\]      # Reference ID in group 6
38        )"#
39    ).unwrap()
40});
41
42// Image pattern (same shape and capture groups as LINK_PATTERN, but with a leading `!`;
// group 1 is the alt text)
43// Flags `(?sx)`: `s` makes `.` match newlines; `x` enables verbose mode (see above)
44static IMAGE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
45    Regex::new(
46        r#"(?sx)
47        !\[((?:[^\[\]\\]|\\.)*)\]         # Alt text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
48        (?:
49            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
50            |
51            \[([^\]]*)\]      # Reference ID in group 6
52        )"#
53    ).unwrap()
54});
55
56// Reference definition pattern: `[id]: url "title"` with up to three leading spaces.
// `(?m)` makes `^`/`$` match at line boundaries.
// Capture groups: 1 = id, 2 = url, 3 = double-quoted title, 4 = single-quoted title.
57static REF_DEF_PATTERN: LazyLock<Regex> =
58    LazyLock::new(|| Regex::new(r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#).unwrap());
59
60// Pattern for bare URLs with an `http`, `https` or `ftp` scheme (scheme in group 1),
// followed by optional port, path, query and fragment parts
61static BARE_URL_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
62    Regex::new(
63        r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
64    ).unwrap()
65});
66
67// Pattern for email addresses (`local@domain.tld`, TLD of at least two ASCII letters)
68static BARE_EMAIL_PATTERN: LazyLock<Regex> =
69    LazyLock::new(|| Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap());
70
71// Pattern for blockquote prefix in parse_list_blocks.
// Group 1 is the whole prefix: leading whitespace, one or more `>`, trailing whitespace.
72static BLOCKQUOTE_PREFIX_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*>+\s*)").unwrap());
73
74/// Pre-computed information about a line
///
/// One entry is kept per source line in `LintContext::lines`. The struct stores
/// byte positions rather than the text itself; call `LineInfo::content` with
/// the source document to get the line's text.
75#[derive(Debug, Clone)]
76pub struct LineInfo {
77    /// Byte offset where this line starts in the document
78    pub byte_offset: usize,
79    /// Length of the line in bytes (without newline)
80    pub byte_len: usize,
81    /// Number of leading spaces/tabs
82    pub indent: usize,
83    /// Whether the line is blank (empty or only whitespace)
84    pub is_blank: bool,
85    /// Whether this line is inside a code block
86    pub in_code_block: bool,
87    /// Whether this line is inside front matter
88    pub in_front_matter: bool,
89    /// Whether this line is inside an HTML block
90    pub in_html_block: bool,
91    /// Whether this line is inside an HTML comment
92    pub in_html_comment: bool,
93    /// List item information if this line starts a list item
94    pub list_item: Option<ListItemInfo>,
95    /// Heading information if this line is a heading
96    pub heading: Option<HeadingInfo>,
97    /// Blockquote information if this line is a blockquote
98    pub blockquote: Option<BlockquoteInfo>,
99    /// Whether this line is inside a mkdocstrings autodoc block
100    pub in_mkdocstrings: bool,
101    /// Whether this line is part of an ESM import/export block (MDX only)
102    pub in_esm_block: bool,
103    /// Whether this line is a continuation of a multi-line code span from a previous line
    /// (`LintContext::new` sets this on every line after the first line of such a span)
104    pub in_code_span_continuation: bool,
105}
106
107impl LineInfo {
108    /// Get the line content as a string slice from the source document
109    pub fn content<'a>(&self, source: &'a str) -> &'a str {
110        &source[self.byte_offset..self.byte_offset + self.byte_len]
111    }
112}
113
114/// Information about a list item
///
/// Stored in `LineInfo::list_item` for lines that start a list item.
115#[derive(Debug, Clone)]
116pub struct ListItemInfo {
117    /// The marker used (*, -, +, or number with . or ))
118    pub marker: String,
119    /// Whether it's ordered (true) or unordered (false)
120    pub is_ordered: bool,
121    /// The number for ordered lists
122    pub number: Option<usize>,
123    /// Column where the marker starts (0-based)
124    pub marker_column: usize,
125    /// Column where content after marker starts
126    pub content_column: usize,
127}
128
/// Heading style type
///
/// Identifies which Markdown syntax produced a heading. The enum is fieldless,
/// so besides `PartialEq` it also derives `Eq` and `Hash`, letting it be used
/// as a map/set key and in APIs that require total equality.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum HeadingStyle {
    /// ATX style heading (# Heading)
    ATX,
    /// Setext style heading with = underline
    Setext1,
    /// Setext style heading with - underline
    Setext2,
}
139
140/// Parsed link information
///
/// Collected into `LintContext::links`. The string fields are `Cow`s bound to
/// lifetime `'a` — presumably borrowed from the source document when no
/// rewriting is needed; confirm in `parse_links`.
141#[derive(Debug, Clone)]
142pub struct ParsedLink<'a> {
143    /// Line number (1-indexed)
144    pub line: usize,
145    /// Start column (0-indexed) in the line
146    pub start_col: usize,
147    /// End column (0-indexed) in the line
148    pub end_col: usize,
149    /// Byte offset in document
150    pub byte_offset: usize,
151    /// End byte offset in document
152    pub byte_end: usize,
153    /// Link text
154    pub text: Cow<'a, str>,
155    /// Link URL or reference
156    pub url: Cow<'a, str>,
157    /// Whether this is a reference link [text][ref] vs inline [text](url)
158    pub is_reference: bool,
159    /// Reference ID for reference links
160    pub reference_id: Option<Cow<'a, str>>,
161    /// Link type from pulldown-cmark
162    pub link_type: LinkType,
163}
164
165/// Information about a broken link reported by pulldown-cmark
///
/// Collected into `LintContext::broken_links`.
166#[derive(Debug, Clone)]
167pub struct BrokenLinkInfo {
168    /// The reference text that couldn't be resolved
169    pub reference: String,
170    /// Byte span in the source document
171    pub span: std::ops::Range<usize>,
172}
173
174/// Parsed footnote reference (e.g., `[^1]`, `[^note]`)
///
/// Collected into `LintContext::footnote_refs`.
175#[derive(Debug, Clone)]
176pub struct FootnoteRef {
177    /// The footnote ID (without the ^ prefix)
178    pub id: String,
179    /// Line number (1-indexed)
180    pub line: usize,
181    /// Start byte offset in document
182    pub byte_offset: usize,
183    /// End byte offset in document
184    pub byte_end: usize,
185}
186
187/// Parsed image information
///
/// Collected into `LintContext::images`. Field layout mirrors `ParsedLink`,
/// with `alt_text` in place of the link text.
188#[derive(Debug, Clone)]
189pub struct ParsedImage<'a> {
190    /// Line number (1-indexed)
191    pub line: usize,
192    /// Start column (0-indexed) in the line
193    pub start_col: usize,
194    /// End column (0-indexed) in the line
195    pub end_col: usize,
196    /// Byte offset in document
197    pub byte_offset: usize,
198    /// End byte offset in document
199    pub byte_end: usize,
200    /// Alt text
201    pub alt_text: Cow<'a, str>,
202    /// Image URL or reference
203    pub url: Cow<'a, str>,
204    /// Whether this is a reference image ![alt][ref] vs inline ![alt](url)
205    pub is_reference: bool,
206    /// Reference ID for reference images
207    pub reference_id: Option<Cow<'a, str>>,
208    /// Link type from pulldown-cmark
209    pub link_type: LinkType,
210}
211
212/// Reference definition [ref]: url "title"
///
/// Collected into `LintContext::reference_defs`.
213#[derive(Debug, Clone)]
214pub struct ReferenceDef {
215    /// Line number (1-indexed)
216    pub line: usize,
217    /// Reference ID (normalized to lowercase)
218    pub id: String,
219    /// URL
220    pub url: String,
221    /// Optional title
222    pub title: Option<String>,
223    /// Byte offset where the reference definition starts
224    pub byte_offset: usize,
225    /// Byte offset where the reference definition ends
226    pub byte_end: usize,
227    /// Byte offset where the title starts (if present, includes quote)
228    pub title_byte_start: Option<usize>,
229    /// Byte offset where the title ends (if present, includes quote)
230    pub title_byte_end: Option<usize>,
231}
232
233/// Parsed code span information
///
/// A span with `end_line > line` covers several lines; `LintContext::new` uses
/// that to flag the following lines as code-span continuations.
234#[derive(Debug, Clone)]
235pub struct CodeSpan {
236    /// Line number where the code span starts (1-indexed)
237    pub line: usize,
238    /// Line number where the code span ends (1-indexed)
239    pub end_line: usize,
240    /// Start column (0-indexed) in the line
241    pub start_col: usize,
242    /// End column (0-indexed) in the line
243    pub end_col: usize,
244    /// Byte offset in document
245    pub byte_offset: usize,
246    /// End byte offset in document
247    pub byte_end: usize,
248    /// Number of backticks used (1, 2, 3, etc.)
249    pub backtick_count: usize,
250    /// Content inside the code span (without backticks)
251    pub content: String,
252}
253
254/// Information about a heading
///
/// Stored in `LineInfo::heading`. The `is_valid` flag is what
/// `ValidHeadingsIter` filters on.
255#[derive(Debug, Clone)]
256pub struct HeadingInfo {
257    /// Heading level (1-6 for ATX, 1-2 for Setext)
258    pub level: u8,
259    /// Style of heading
260    pub style: HeadingStyle,
261    /// The heading marker (# characters or underline)
262    pub marker: String,
263    /// Column where the marker starts (0-based)
264    pub marker_column: usize,
265    /// Column where heading text starts
266    pub content_column: usize,
267    /// The heading text (without markers and without custom ID syntax)
268    pub text: String,
269    /// Custom header ID if present (e.g., from {#custom-id} syntax)
270    pub custom_id: Option<String>,
271    /// Original heading text including custom ID syntax
272    pub raw_text: String,
273    /// Whether it has a closing sequence (for ATX)
274    pub has_closing_sequence: bool,
275    /// The closing sequence if present
276    pub closing_sequence: String,
277    /// Whether this is a valid CommonMark heading (ATX headings require space after #)
278    /// False for malformed headings like `#NoSpace` that MD018 should flag
279    pub is_valid: bool,
280}
281
282/// A valid heading from a filtered iteration
283///
284/// Only includes headings that are CommonMark-compliant (have space after #).
285/// Hashtag-like patterns (`#tag`, `#123`) are excluded.
///
/// This is the item type yielded by `ValidHeadingsIter`; both references
/// borrow from the line table for lifetime `'a`.
286#[derive(Debug, Clone)]
287pub struct ValidHeading<'a> {
288    /// The 1-indexed line number in the document
289    pub line_num: usize,
290    /// Reference to the heading information
291    pub heading: &'a HeadingInfo,
292    /// Reference to the full line info (for rules that need additional context)
293    pub line_info: &'a LineInfo,
294}
295
296/// Iterator over valid CommonMark headings in a document
297///
298/// Filters out malformed headings like `#NoSpace` that should be flagged by MD018
299/// but should not be processed by other heading rules.
300pub struct ValidHeadingsIter<'a> {
301    lines: &'a [LineInfo],
302    current_index: usize,
303}
304
305impl<'a> ValidHeadingsIter<'a> {
306    fn new(lines: &'a [LineInfo]) -> Self {
307        Self {
308            lines,
309            current_index: 0,
310        }
311    }
312}
313
314impl<'a> Iterator for ValidHeadingsIter<'a> {
315    type Item = ValidHeading<'a>;
316
317    fn next(&mut self) -> Option<Self::Item> {
318        while self.current_index < self.lines.len() {
319            let idx = self.current_index;
320            self.current_index += 1;
321
322            let line_info = &self.lines[idx];
323            if let Some(heading) = &line_info.heading
324                && heading.is_valid
325            {
326                return Some(ValidHeading {
327                    line_num: idx + 1, // Convert 0-indexed to 1-indexed
328                    heading,
329                    line_info,
330                });
331            }
332        }
333        None
334    }
335}
336
337/// Information about a blockquote line
///
/// Stored in `LineInfo::blockquote`.
338#[derive(Debug, Clone)]
339pub struct BlockquoteInfo {
340    /// Nesting level (1 for >, 2 for >>, etc.)
341    pub nesting_level: usize,
342    /// The indentation before the blockquote marker
343    pub indent: String,
344    /// Column where the first > starts (0-based)
345    pub marker_column: usize,
346    /// The blockquote prefix (e.g., "> ", ">> ", etc.)
347    pub prefix: String,
348    /// Content after the blockquote marker(s)
349    pub content: String,
350    /// Whether the line has no space after the marker
351    pub has_no_space_after_marker: bool,
352    /// Whether the line has multiple spaces after the marker
353    pub has_multiple_spaces_after_marker: bool,
354    /// Whether this is an empty blockquote line needing MD028 fix
355    pub needs_md028_fix: bool,
356}
357
358/// Information about a list block
///
/// Collected into `LintContext::list_blocks`.
359#[derive(Debug, Clone)]
360pub struct ListBlock {
361    /// Line number where the list starts (1-indexed)
362    pub start_line: usize,
363    /// Line number where the list ends (1-indexed)
364    pub end_line: usize,
365    /// Whether it's ordered or unordered
366    pub is_ordered: bool,
367    /// The consistent marker for unordered lists (if any)
368    pub marker: Option<String>,
369    /// Blockquote prefix for this list (empty if not in blockquote)
370    pub blockquote_prefix: String,
371    /// Lines that are list items within this block
    /// (presumably 1-indexed line numbers like `start_line` — TODO confirm in `parse_list_blocks`)
372    pub item_lines: Vec<usize>,
373    /// Nesting level (0 for top-level lists)
374    pub nesting_level: usize,
375    /// Maximum marker width seen in this block (e.g., 3 for "1. ", 4 for "10. ")
376    pub max_marker_width: usize,
377}
378
379use std::sync::{Arc, OnceLock};
380
381/// Character frequency data for fast content analysis
///
/// Stored in `LintContext::char_frequency`. The derived `Default` yields all
/// counters at zero.
382#[derive(Debug, Clone, Default)]
383pub struct CharFrequency {
384    /// Count of # characters (headings)
385    pub hash_count: usize,
386    /// Count of * characters (emphasis, lists, horizontal rules)
387    pub asterisk_count: usize,
388    /// Count of _ characters (emphasis, horizontal rules)
389    pub underscore_count: usize,
390    /// Count of - characters (lists, horizontal rules, setext headings)
391    pub hyphen_count: usize,
392    /// Count of + characters (lists)
393    pub plus_count: usize,
394    /// Count of > characters (blockquotes)
395    pub gt_count: usize,
396    /// Count of | characters (tables)
397    pub pipe_count: usize,
398    /// Count of [ characters (links, images)
399    pub bracket_count: usize,
400    /// Count of ` characters (code spans, code blocks)
401    pub backtick_count: usize,
402    /// Count of < characters (HTML tags, autolinks)
403    pub lt_count: usize,
404    /// Count of ! characters (images)
405    pub exclamation_count: usize,
406    /// Count of newline characters
407    pub newline_count: usize,
408}
409
410/// Pre-parsed HTML tag information
///
/// Returned (behind an `Arc`) by `LintContext::html_tags`.
411#[derive(Debug, Clone)]
412pub struct HtmlTag {
413    /// Line number (1-indexed)
414    pub line: usize,
415    /// Start column (0-indexed) in the line
416    pub start_col: usize,
417    /// End column (0-indexed) in the line
418    pub end_col: usize,
419    /// Byte offset in document
420    pub byte_offset: usize,
421    /// End byte offset in document
422    pub byte_end: usize,
423    /// Tag name (e.g., "div", "img", "br")
424    pub tag_name: String,
425    /// Whether it's a closing tag (`</tag>`)
426    pub is_closing: bool,
427    /// Whether it's self-closing (`<tag />`)
428    pub is_self_closing: bool,
429    /// Raw tag content
430    pub raw_content: String,
431}
432
433/// Pre-parsed emphasis span information
///
/// Returned (behind an `Arc`) by `LintContext::emphasis_spans`.
434#[derive(Debug, Clone)]
435pub struct EmphasisSpan {
436    /// Line number (1-indexed)
437    pub line: usize,
438    /// Start column (0-indexed) in the line
439    pub start_col: usize,
440    /// End column (0-indexed) in the line
441    pub end_col: usize,
442    /// Byte offset in document
443    pub byte_offset: usize,
444    /// End byte offset in document
445    pub byte_end: usize,
446    /// Type of emphasis ('*' or '_')
447    pub marker: char,
448    /// Number of markers (1 for italic, 2 for bold, 3+ for bold+italic)
449    pub marker_count: usize,
450    /// Content inside the emphasis
451    pub content: String,
452}
453
454/// Pre-parsed table row information
///
/// Returned (behind an `Arc`) by `LintContext::table_rows`.
455#[derive(Debug, Clone)]
456pub struct TableRow {
457    /// Line number (1-indexed)
458    pub line: usize,
459    /// Whether this is a separator row (contains only |, -, :, and spaces)
460    pub is_separator: bool,
461    /// Number of columns (pipe-separated cells)
462    pub column_count: usize,
463    /// Alignment info from separator row
    /// (one string per column; expected values are listed in the trailing comment)
464    pub column_alignments: Vec<String>, // "left", "center", "right", "none"
465}
466
467/// Pre-parsed bare URL information (not in links)
///
/// Returned (behind an `Arc`) by `LintContext::bare_urls`.
468#[derive(Debug, Clone)]
469pub struct BareUrl {
470    /// Line number (1-indexed)
471    pub line: usize,
472    /// Start column (0-indexed) in the line
473    pub start_col: usize,
474    /// End column (0-indexed) in the line
475    pub end_col: usize,
476    /// Byte offset in document
477    pub byte_offset: usize,
478    /// End byte offset in document
479    pub byte_end: usize,
480    /// The URL string
481    pub url: String,
482    /// Type of URL ("http", "https", "ftp", "email")
483    pub url_type: String,
484}
485
/// Shared, pre-parsed view of one Markdown document.
///
/// `LintContext::new` fills the public fields eagerly. The private `*_cache`
/// fields are `OnceLock`s read through the accessor methods of the same name
/// (`html_tags`, `emphasis_spans`, `table_rows`, `bare_urls`, ...), which
/// compute their value on first use.
486pub struct LintContext<'a> {
487    pub content: &'a str,
488    pub line_offsets: Vec<usize>,
489    pub code_blocks: Vec<(usize, usize)>, // Cached code block ranges (not including inline code spans)
490    pub lines: Vec<LineInfo>,             // Pre-computed line information
491    pub links: Vec<ParsedLink<'a>>,       // Pre-parsed links
492    pub images: Vec<ParsedImage<'a>>,     // Pre-parsed images
493    pub broken_links: Vec<BrokenLinkInfo>, // Broken/undefined references
494    pub footnote_refs: Vec<FootnoteRef>,  // Pre-parsed footnote references
495    pub reference_defs: Vec<ReferenceDef>, // Reference definitions
496    code_spans_cache: OnceLock<Arc<Vec<CodeSpan>>>, // Inline code spans (already populated by `new`, which parses them eagerly)
497    pub list_blocks: Vec<ListBlock>,      // Pre-parsed list blocks
498    pub char_frequency: CharFrequency,    // Character frequency analysis
499    html_tags_cache: OnceLock<Arc<Vec<HtmlTag>>>, // Lazy-loaded HTML tags
500    emphasis_spans_cache: OnceLock<Arc<Vec<EmphasisSpan>>>, // Lazy-loaded emphasis spans
501    table_rows_cache: OnceLock<Arc<Vec<TableRow>>>, // Lazy-loaded table rows
502    bare_urls_cache: OnceLock<Arc<Vec<BareUrl>>>, // Lazy-loaded bare URLs
503    has_mixed_list_nesting_cache: OnceLock<bool>, // Cached result for mixed ordered/unordered list nesting detection
504    html_comment_ranges: Vec<crate::utils::skip_context::ByteRange>, // Pre-computed HTML comment ranges
505    pub table_blocks: Vec<crate::utils::table_utils::TableBlock>, // Pre-computed table blocks
506    pub line_index: crate::utils::range_utils::LineIndex<'a>, // Pre-computed line index for byte position calculations
507    jinja_ranges: Vec<(usize, usize)>,    // Pre-computed Jinja template ranges ({{ }}, {% %})
508    pub flavor: MarkdownFlavor,           // Markdown flavor being used
509    pub source_file: Option<PathBuf>,     // Source file path (for rules that need file context)
510}
511
/// The four consecutive pieces of a blockquote line, each borrowed from the
/// input: `indent`, then `markers`, then `spaces_after`, then `content`.
struct BlockquoteComponents<'a> {
    indent: &'a str,
    markers: &'a str,
    spaces_after: &'a str,
    content: &'a str,
}

/// Split a line into blockquote components without using a regex.
///
/// The line is consumed left to right: leading spaces/tabs (`indent`), a run
/// of one or more `>` (`markers`), spaces/tabs following the markers
/// (`spaces_after`), and whatever remains (`content`). Returns `None` when the
/// first character after the indent is not `>`.
#[inline]
fn parse_blockquote_detailed(line: &str) -> Option<BlockquoteComponents<'_>> {
    const BLANKS: &[char] = &[' ', '\t'];

    // Peel each component off the front; the length difference gives the piece removed.
    let after_indent = line.trim_start_matches(BLANKS);
    let indent = &line[..line.len() - after_indent.len()];

    let after_markers = after_indent.trim_start_matches('>');
    let markers = &after_indent[..after_indent.len() - after_markers.len()];
    if markers.is_empty() {
        // No '>' directly after the indent: not a blockquote line.
        return None;
    }

    let content = after_markers.trim_start_matches(BLANKS);
    let spaces_after = &after_markers[..after_markers.len() - content.len()];

    Some(BlockquoteComponents {
        indent,
        markers,
        spaces_after,
        content,
    })
}
556
557impl<'a> LintContext<'a> {
    /// Build a lint context for `content`, running every eager pre-computation pass.
    ///
    /// The passes run in a fixed order because later passes consume the results
    /// of earlier ones: line offsets, code blocks and HTML comment ranges feed
    /// the basic line info; HTML and ESM block detection run before heading and
    /// blockquote detection; code spans are parsed before links and images so
    /// they can be excluded there.
    ///
    /// On non-WASM builds, setting the `RUMDL_PROFILE_QUADRATIC` environment
    /// variable (to any value) makes each pass print its timing to stderr via
    /// `profile_section!`.
    ///
    /// * `content` - the Markdown source; the returned context borrows it.
    /// * `flavor` - Markdown flavor, stored on the context and passed to the flavor-aware passes.
    /// * `source_file` - optional path of the file being linted, stored as-is.
558    pub fn new(content: &'a str, flavor: MarkdownFlavor, source_file: Option<PathBuf>) -> Self {
559        #[cfg(not(target_arch = "wasm32"))]
560        let profile = std::env::var("RUMDL_PROFILE_QUADRATIC").is_ok();
561        #[cfg(target_arch = "wasm32")]
562        let profile = false;
563
        // Byte offset at which each line starts: 0, plus the position after every '\n'.
564        let line_offsets = profile_section!("Line offsets", profile, {
565            let mut offsets = vec![0];
566            for (i, c) in content.char_indices() {
567                if c == '\n' {
568                    offsets.push(i + 1);
569                }
570            }
571            offsets
572        });
573
574        // Detect code blocks once and cache them
575        let code_blocks = profile_section!("Code blocks", profile, CodeBlockUtils::detect_code_blocks(content));
576
577        // Pre-compute HTML comment ranges ONCE for all operations
578        let html_comment_ranges = profile_section!(
579            "HTML comment ranges",
580            profile,
581            crate::utils::skip_context::compute_html_comment_ranges(content)
582        );
583
584        // Pre-compute autodoc block ranges for MkDocs flavor (avoids O(n²) scaling)
585        let autodoc_ranges = profile_section!("Autodoc block ranges", profile, {
586            if flavor == MarkdownFlavor::MkDocs {
587                crate::utils::mkdocstrings_refs::detect_autodoc_block_ranges(content)
588            } else {
589                Vec::new()
590            }
591        });
592
593        // Pre-compute line information (without headings/blockquotes yet)
594        let mut lines = profile_section!(
595            "Basic line info",
596            profile,
597            Self::compute_basic_line_info(
598                content,
599                &line_offsets,
600                &code_blocks,
601                flavor,
602                &html_comment_ranges,
603                &autodoc_ranges,
604            )
605        );
606
607        // Detect HTML blocks BEFORE heading detection
608        profile_section!("HTML blocks", profile, Self::detect_html_blocks(content, &mut lines));
609
610        // Detect ESM import/export blocks in MDX files BEFORE heading detection
611        profile_section!(
612            "ESM blocks",
613            profile,
614            Self::detect_esm_blocks(content, &mut lines, flavor)
615        );
616
617        // Collect link byte ranges early for heading detection (to skip lines inside link syntax)
618        let link_byte_ranges = profile_section!("Link byte ranges", profile, Self::collect_link_byte_ranges(content));
619
620        // Now detect headings and blockquotes
621        profile_section!(
622            "Headings & blockquotes",
623            profile,
624            Self::detect_headings_and_blockquotes(content, &mut lines, flavor, &html_comment_ranges, &link_byte_ranges)
625        );
626
627        // Parse code spans early so we can exclude them from link/image parsing
628        let code_spans = profile_section!("Code spans", profile, Self::parse_code_spans(content, &lines));
629
630        // Mark lines that are continuations of multi-line code spans
631        // This is needed for parse_list_blocks to correctly handle list items with multi-line code spans
632        for span in &code_spans {
633            if span.end_line > span.line {
634                // Mark lines after the first line as continuations
635                for line_num in (span.line + 1)..=span.end_line {
                    // Span line numbers are 1-indexed; `lines` is 0-indexed.
636                    if let Some(line_info) = lines.get_mut(line_num - 1) {
637                        line_info.in_code_span_continuation = true;
638                    }
639                }
640            }
641        }
642
643        // Parse links, images, references, and list blocks
644        let (links, broken_links, footnote_refs) = profile_section!(
645            "Links",
646            profile,
647            Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor, &html_comment_ranges)
648        );
649
650        let images = profile_section!(
651            "Images",
652            profile,
653            Self::parse_images(content, &lines, &code_blocks, &code_spans, &html_comment_ranges)
654        );
655
656        let reference_defs = profile_section!("Reference defs", profile, Self::parse_reference_defs(content, &lines));
657
658        let list_blocks = profile_section!("List blocks", profile, Self::parse_list_blocks(content, &lines));
659
660        // Compute character frequency for fast content analysis
661        let char_frequency = profile_section!("Char frequency", profile, Self::compute_char_frequency(content));
662
663        // Pre-compute table blocks for rules that need them (MD013, MD055, MD056, MD058, MD060)
664        let table_blocks = profile_section!(
665            "Table blocks",
666            profile,
667            crate::utils::table_utils::TableUtils::find_table_blocks_with_code_info(
668                content,
669                &code_blocks,
670                &code_spans,
671                &html_comment_ranges,
672            )
673        );
674
675        // Pre-compute LineIndex once for all rules (eliminates 46x content cloning)
676        let line_index = profile_section!(
677            "Line index",
678            profile,
679            crate::utils::range_utils::LineIndex::new(content)
680        );
681
682        // Pre-compute Jinja template ranges once for all rules (eliminates O(n×m) in MD011)
683        let jinja_ranges = profile_section!(
684            "Jinja ranges",
685            profile,
686            crate::utils::jinja_utils::find_jinja_ranges(content)
687        );
688
689        Self {
690            content,
691            line_offsets,
692            code_blocks,
693            lines,
694            links,
695            images,
696            broken_links,
697            footnote_refs,
698            reference_defs,
            // Code spans were parsed above, so this cache starts out already filled.
699            code_spans_cache: OnceLock::from(Arc::new(code_spans)),
700            list_blocks,
701            char_frequency,
            // The remaining caches start empty and are filled by their accessor on first use.
702            html_tags_cache: OnceLock::new(),
703            emphasis_spans_cache: OnceLock::new(),
704            table_rows_cache: OnceLock::new(),
705            bare_urls_cache: OnceLock::new(),
706            has_mixed_list_nesting_cache: OnceLock::new(),
707            html_comment_ranges,
708            table_blocks,
709            line_index,
710            jinja_ranges,
711            flavor,
712            source_file,
713        }
714    }
715
716    /// Get code spans - computed lazily on first access
717    pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
718        Arc::clone(
719            self.code_spans_cache
720                .get_or_init(|| Arc::new(Self::parse_code_spans(self.content, &self.lines))),
721        )
722    }
723
724    /// Get HTML comment ranges - pre-computed during LintContext construction
725    pub fn html_comment_ranges(&self) -> &[crate::utils::skip_context::ByteRange] {
726        &self.html_comment_ranges
727    }
728
729    /// Get HTML tags - computed lazily on first access
730    pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
731        Arc::clone(self.html_tags_cache.get_or_init(|| {
732            Arc::new(Self::parse_html_tags(
733                self.content,
734                &self.lines,
735                &self.code_blocks,
736                self.flavor,
737            ))
738        }))
739    }
740
741    /// Get emphasis spans - computed lazily on first access
742    pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
743        Arc::clone(
744            self.emphasis_spans_cache
745                .get_or_init(|| Arc::new(Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks))),
746        )
747    }
748
749    /// Get table rows - computed lazily on first access
750    pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
751        Arc::clone(
752            self.table_rows_cache
753                .get_or_init(|| Arc::new(Self::parse_table_rows(self.content, &self.lines))),
754        )
755    }
756
757    /// Get bare URLs - computed lazily on first access
758    pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
759        Arc::clone(
760            self.bare_urls_cache
761                .get_or_init(|| Arc::new(Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks))),
762        )
763    }
764
765    /// Check if document has mixed ordered/unordered list nesting.
766    /// Result is cached after first computation (document-level invariant).
767    /// This is used by MD007 for smart style auto-detection.
768    pub fn has_mixed_list_nesting(&self) -> bool {
769        *self
770            .has_mixed_list_nesting_cache
771            .get_or_init(|| self.compute_mixed_list_nesting())
772    }
773
774    /// Internal computation for mixed list nesting (only called once per LintContext).
775    fn compute_mixed_list_nesting(&self) -> bool {
776        // Track parent list items by their marker position and type
777        // Using marker_column instead of indent because it works correctly
778        // for blockquoted content where indent doesn't account for the prefix
779        // Stack stores: (marker_column, is_ordered)
780        let mut stack: Vec<(usize, bool)> = Vec::new();
781        let mut last_was_blank = false;
782
783        for line_info in &self.lines {
784            // Skip non-content lines (code blocks, frontmatter, HTML comments, etc.)
785            if line_info.in_code_block
786                || line_info.in_front_matter
787                || line_info.in_mkdocstrings
788                || line_info.in_html_comment
789                || line_info.in_esm_block
790            {
791                continue;
792            }
793
794            // OPTIMIZATION: Use pre-computed is_blank instead of content().trim()
795            if line_info.is_blank {
796                last_was_blank = true;
797                continue;
798            }
799
800            if let Some(list_item) = &line_info.list_item {
801                // Normalize column 1 to column 0 (consistent with MD007 check function)
802                let current_pos = if list_item.marker_column == 1 {
803                    0
804                } else {
805                    list_item.marker_column
806                };
807
808                // If there was a blank line and this item is at root level, reset stack
809                if last_was_blank && current_pos == 0 {
810                    stack.clear();
811                }
812                last_was_blank = false;
813
814                // Pop items at same or greater position (they're siblings or deeper, not parents)
815                while let Some(&(pos, _)) = stack.last() {
816                    if pos >= current_pos {
817                        stack.pop();
818                    } else {
819                        break;
820                    }
821                }
822
823                // Check if immediate parent has different type - this is mixed nesting
824                if let Some(&(_, parent_is_ordered)) = stack.last()
825                    && parent_is_ordered != list_item.is_ordered
826                {
827                    return true; // Found mixed nesting - early exit
828                }
829
830                stack.push((current_pos, list_item.is_ordered));
831            } else {
832                // Non-list line (but not blank) - could be paragraph or other content
833                last_was_blank = false;
834            }
835        }
836
837        false
838    }
839
840    /// Map a byte offset to (line, column)
841    pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
842        match self.line_offsets.binary_search(&offset) {
843            Ok(line) => (line + 1, 1),
844            Err(line) => {
845                let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
846                (line, offset - line_start + 1)
847            }
848        }
849    }
850
851    /// Check if a position is within a code block or code span
852    pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
853        // Check code blocks first
854        if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
855            return true;
856        }
857
858        // Check inline code spans (lazy load if needed)
859        self.code_spans()
860            .iter()
861            .any(|span| pos >= span.byte_offset && pos < span.byte_end)
862    }
863
864    /// Get line information by line number (1-indexed)
865    pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
866        if line_num > 0 {
867            self.lines.get(line_num - 1)
868        } else {
869            None
870        }
871    }
872
873    /// Get byte offset for a line number (1-indexed)
874    pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
875        self.line_info(line_num).map(|info| info.byte_offset)
876    }
877
878    /// Get URL for a reference link/image by its ID
879    pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
880        let normalized_id = ref_id.to_lowercase();
881        self.reference_defs
882            .iter()
883            .find(|def| def.id == normalized_id)
884            .map(|def| def.url.as_str())
885    }
886
887    /// Check if a line is part of a list block
888    pub fn is_in_list_block(&self, line_num: usize) -> bool {
889        self.list_blocks
890            .iter()
891            .any(|block| line_num >= block.start_line && line_num <= block.end_line)
892    }
893
894    /// Get the list block containing a specific line
895    pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
896        self.list_blocks
897            .iter()
898            .find(|block| line_num >= block.start_line && line_num <= block.end_line)
899    }
900
901    // Compatibility methods for DocumentStructure migration
902
903    /// Check if a line is within a code block
904    pub fn is_in_code_block(&self, line_num: usize) -> bool {
905        if line_num == 0 || line_num > self.lines.len() {
906            return false;
907        }
908        self.lines[line_num - 1].in_code_block
909    }
910
911    /// Check if a line is within front matter
912    pub fn is_in_front_matter(&self, line_num: usize) -> bool {
913        if line_num == 0 || line_num > self.lines.len() {
914            return false;
915        }
916        self.lines[line_num - 1].in_front_matter
917    }
918
919    /// Check if a line is within an HTML block
920    pub fn is_in_html_block(&self, line_num: usize) -> bool {
921        if line_num == 0 || line_num > self.lines.len() {
922            return false;
923        }
924        self.lines[line_num - 1].in_html_block
925    }
926
927    /// Check if a line and column is within a code span
928    pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
929        if line_num == 0 || line_num > self.lines.len() {
930            return false;
931        }
932
933        // Use the code spans cache to check
934        // Note: col is 1-indexed from caller, but span.start_col and span.end_col are 0-indexed
935        // Convert col to 0-indexed for comparison
936        let col_0indexed = if col > 0 { col - 1 } else { 0 };
937        let code_spans = self.code_spans();
938        code_spans.iter().any(|span| {
939            // Check if line is within the span's line range
940            if line_num < span.line || line_num > span.end_line {
941                return false;
942            }
943
944            if span.line == span.end_line {
945                // Single-line span: check column bounds
946                col_0indexed >= span.start_col && col_0indexed < span.end_col
947            } else if line_num == span.line {
948                // First line of multi-line span: anything after start_col is in span
949                col_0indexed >= span.start_col
950            } else if line_num == span.end_line {
951                // Last line of multi-line span: anything before end_col is in span
952                col_0indexed < span.end_col
953            } else {
954                // Middle line of multi-line span: entire line is in span
955                true
956            }
957        })
958    }
959
960    /// Check if a byte offset is within a code span
961    #[inline]
962    pub fn is_byte_offset_in_code_span(&self, byte_offset: usize) -> bool {
963        let code_spans = self.code_spans();
964        code_spans
965            .iter()
966            .any(|span| byte_offset >= span.byte_offset && byte_offset < span.byte_end)
967    }
968
969    /// Check if a byte position is within a reference definition
970    /// This is much faster than scanning the content with regex for each check (O(1) vs O(n))
971    #[inline]
972    pub fn is_in_reference_def(&self, byte_pos: usize) -> bool {
973        self.reference_defs
974            .iter()
975            .any(|ref_def| byte_pos >= ref_def.byte_offset && byte_pos < ref_def.byte_end)
976    }
977
978    /// Check if a byte position is within an HTML comment
979    /// This is much faster than scanning the content with regex for each check (O(k) vs O(n))
980    /// where k is the number of HTML comments (typically very small)
981    #[inline]
982    pub fn is_in_html_comment(&self, byte_pos: usize) -> bool {
983        self.html_comment_ranges
984            .iter()
985            .any(|range| byte_pos >= range.start && byte_pos < range.end)
986    }
987
988    /// Check if a byte position is within an HTML tag (including multiline tags)
989    /// Uses the pre-parsed html_tags which correctly handles tags spanning multiple lines
990    #[inline]
991    pub fn is_in_html_tag(&self, byte_pos: usize) -> bool {
992        self.html_tags()
993            .iter()
994            .any(|tag| byte_pos >= tag.byte_offset && byte_pos < tag.byte_end)
995    }
996
997    /// Check if a byte position is within a Jinja template ({{ }} or {% %})
998    pub fn is_in_jinja_range(&self, byte_pos: usize) -> bool {
999        self.jinja_ranges
1000            .iter()
1001            .any(|(start, end)| byte_pos >= *start && byte_pos < *end)
1002    }
1003
1004    /// Check if a byte position is within a link reference definition title
1005    pub fn is_in_link_title(&self, byte_pos: usize) -> bool {
1006        self.reference_defs.iter().any(|def| {
1007            if let (Some(start), Some(end)) = (def.title_byte_start, def.title_byte_end) {
1008                byte_pos >= start && byte_pos < end
1009            } else {
1010                false
1011            }
1012        })
1013    }
1014
1015    /// Check if content has any instances of a specific character (fast)
1016    pub fn has_char(&self, ch: char) -> bool {
1017        match ch {
1018            '#' => self.char_frequency.hash_count > 0,
1019            '*' => self.char_frequency.asterisk_count > 0,
1020            '_' => self.char_frequency.underscore_count > 0,
1021            '-' => self.char_frequency.hyphen_count > 0,
1022            '+' => self.char_frequency.plus_count > 0,
1023            '>' => self.char_frequency.gt_count > 0,
1024            '|' => self.char_frequency.pipe_count > 0,
1025            '[' => self.char_frequency.bracket_count > 0,
1026            '`' => self.char_frequency.backtick_count > 0,
1027            '<' => self.char_frequency.lt_count > 0,
1028            '!' => self.char_frequency.exclamation_count > 0,
1029            '\n' => self.char_frequency.newline_count > 0,
1030            _ => self.content.contains(ch), // Fallback for other characters
1031        }
1032    }
1033
1034    /// Get count of a specific character (fast)
1035    pub fn char_count(&self, ch: char) -> usize {
1036        match ch {
1037            '#' => self.char_frequency.hash_count,
1038            '*' => self.char_frequency.asterisk_count,
1039            '_' => self.char_frequency.underscore_count,
1040            '-' => self.char_frequency.hyphen_count,
1041            '+' => self.char_frequency.plus_count,
1042            '>' => self.char_frequency.gt_count,
1043            '|' => self.char_frequency.pipe_count,
1044            '[' => self.char_frequency.bracket_count,
1045            '`' => self.char_frequency.backtick_count,
1046            '<' => self.char_frequency.lt_count,
1047            '!' => self.char_frequency.exclamation_count,
1048            '\n' => self.char_frequency.newline_count,
1049            _ => self.content.matches(ch).count(), // Fallback for other characters
1050        }
1051    }
1052
1053    /// Check if content likely contains headings (fast)
1054    pub fn likely_has_headings(&self) -> bool {
1055        self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 // Potential setext underlines
1056    }
1057
1058    /// Check if content likely contains lists (fast)
1059    pub fn likely_has_lists(&self) -> bool {
1060        self.char_frequency.asterisk_count > 0
1061            || self.char_frequency.hyphen_count > 0
1062            || self.char_frequency.plus_count > 0
1063    }
1064
1065    /// Check if content likely contains emphasis (fast)
1066    pub fn likely_has_emphasis(&self) -> bool {
1067        self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
1068    }
1069
1070    /// Check if content likely contains tables (fast)
1071    pub fn likely_has_tables(&self) -> bool {
1072        self.char_frequency.pipe_count > 2
1073    }
1074
1075    /// Check if content likely contains blockquotes (fast)
1076    pub fn likely_has_blockquotes(&self) -> bool {
1077        self.char_frequency.gt_count > 0
1078    }
1079
1080    /// Check if content likely contains code (fast)
1081    pub fn likely_has_code(&self) -> bool {
1082        self.char_frequency.backtick_count > 0
1083    }
1084
1085    /// Check if content likely contains links or images (fast)
1086    pub fn likely_has_links_or_images(&self) -> bool {
1087        self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
1088    }
1089
1090    /// Check if content likely contains HTML (fast)
1091    pub fn likely_has_html(&self) -> bool {
1092        self.char_frequency.lt_count > 0
1093    }
1094
1095    /// Get HTML tags on a specific line
1096    pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
1097        self.html_tags()
1098            .iter()
1099            .filter(|tag| tag.line == line_num)
1100            .cloned()
1101            .collect()
1102    }
1103
1104    /// Get emphasis spans on a specific line
1105    pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
1106        self.emphasis_spans()
1107            .iter()
1108            .filter(|span| span.line == line_num)
1109            .cloned()
1110            .collect()
1111    }
1112
1113    /// Get table rows on a specific line
1114    pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
1115        self.table_rows()
1116            .iter()
1117            .filter(|row| row.line == line_num)
1118            .cloned()
1119            .collect()
1120    }
1121
1122    /// Get bare URLs on a specific line
1123    pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
1124        self.bare_urls()
1125            .iter()
1126            .filter(|url| url.line == line_num)
1127            .cloned()
1128            .collect()
1129    }
1130
1131    /// Find the line index for a given byte offset using binary search.
1132    /// Returns (line_index, line_number, column) where:
1133    /// - line_index is the 0-based index in the lines array
1134    /// - line_number is the 1-based line number
1135    /// - column is the byte offset within that line
1136    #[inline]
1137    fn find_line_for_offset(lines: &[LineInfo], byte_offset: usize) -> (usize, usize, usize) {
1138        // Binary search to find the line containing this byte offset
1139        let idx = match lines.binary_search_by(|line| {
1140            if byte_offset < line.byte_offset {
1141                std::cmp::Ordering::Greater
1142            } else if byte_offset > line.byte_offset + line.byte_len {
1143                std::cmp::Ordering::Less
1144            } else {
1145                std::cmp::Ordering::Equal
1146            }
1147        }) {
1148            Ok(idx) => idx,
1149            Err(idx) => idx.saturating_sub(1),
1150        };
1151
1152        let line = &lines[idx];
1153        let line_num = idx + 1;
1154        let col = byte_offset.saturating_sub(line.byte_offset);
1155
1156        (idx, line_num, col)
1157    }
1158
1159    /// Check if a byte offset is within a code span using binary search
1160    #[inline]
1161    fn is_offset_in_code_span(code_spans: &[CodeSpan], offset: usize) -> bool {
1162        // Since spans are sorted by byte_offset, use partition_point for binary search
1163        let idx = code_spans.partition_point(|span| span.byte_offset <= offset);
1164
1165        // Check the span that starts at or before our offset
1166        if idx > 0 {
1167            let span = &code_spans[idx - 1];
1168            if offset >= span.byte_offset && offset < span.byte_end {
1169                return true;
1170            }
1171        }
1172
1173        false
1174    }
1175
1176    /// Collect byte ranges of all links using pulldown-cmark
1177    /// This is used to skip heading detection for lines that fall within link syntax
1178    /// (e.g., multiline links like `[text](url\n#fragment)`)
1179    fn collect_link_byte_ranges(content: &str) -> Vec<(usize, usize)> {
1180        use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
1181
1182        let mut link_ranges = Vec::new();
1183        let mut options = Options::empty();
1184        options.insert(Options::ENABLE_WIKILINKS);
1185        options.insert(Options::ENABLE_FOOTNOTES);
1186
1187        let parser = Parser::new_ext(content, options).into_offset_iter();
1188        let mut link_stack: Vec<usize> = Vec::new();
1189
1190        for (event, range) in parser {
1191            match event {
1192                Event::Start(Tag::Link { .. }) => {
1193                    link_stack.push(range.start);
1194                }
1195                Event::End(TagEnd::Link) => {
1196                    if let Some(start_pos) = link_stack.pop() {
1197                        link_ranges.push((start_pos, range.end));
1198                    }
1199                }
1200                _ => {}
1201            }
1202        }
1203
1204        link_ranges
1205    }
1206
1207    /// Parse all links in the content
1208    fn parse_links(
1209        content: &'a str,
1210        lines: &[LineInfo],
1211        code_blocks: &[(usize, usize)],
1212        code_spans: &[CodeSpan],
1213        flavor: MarkdownFlavor,
1214        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1215    ) -> (Vec<ParsedLink<'a>>, Vec<BrokenLinkInfo>, Vec<FootnoteRef>) {
1216        use crate::utils::skip_context::{is_in_html_comment_ranges, is_mkdocs_snippet_line};
1217        use std::collections::HashSet;
1218
1219        let mut links = Vec::with_capacity(content.len() / 500);
1220        let mut broken_links = Vec::new();
1221        let mut footnote_refs = Vec::new();
1222
1223        // Track byte positions of links found by pulldown-cmark
1224        let mut found_positions = HashSet::new();
1225
1226        // Use pulldown-cmark's streaming parser with BrokenLink callback
1227        // The callback captures undefined references: [text][undefined], [shortcut], [text][]
1228        // This automatically handles:
1229        // - Escaped links (won't generate events)
1230        // - Links in code blocks/spans (won't generate Link events)
1231        // - Images (generates Tag::Image instead)
1232        // - Reference resolution (dest_url is already resolved!)
1233        // - Broken references (callback is invoked)
1234        // - Wiki-links (enabled via ENABLE_WIKILINKS)
1235        let mut options = Options::empty();
1236        options.insert(Options::ENABLE_WIKILINKS);
1237        options.insert(Options::ENABLE_FOOTNOTES);
1238
1239        let parser = Parser::new_with_broken_link_callback(
1240            content,
1241            options,
1242            Some(|link: BrokenLink<'_>| {
1243                broken_links.push(BrokenLinkInfo {
1244                    reference: link.reference.to_string(),
1245                    span: link.span.clone(),
1246                });
1247                None
1248            }),
1249        )
1250        .into_offset_iter();
1251
1252        let mut link_stack: Vec<(
1253            usize,
1254            usize,
1255            pulldown_cmark::CowStr<'a>,
1256            LinkType,
1257            pulldown_cmark::CowStr<'a>,
1258        )> = Vec::new();
1259        let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1260
1261        for (event, range) in parser {
1262            match event {
1263                Event::Start(Tag::Link {
1264                    link_type,
1265                    dest_url,
1266                    id,
1267                    ..
1268                }) => {
1269                    // Link start - record position, URL, and reference ID
1270                    link_stack.push((range.start, range.end, dest_url, link_type, id));
1271                    text_chunks.clear();
1272                }
1273                Event::Text(text) if !link_stack.is_empty() => {
1274                    // Track text content with its byte range
1275                    text_chunks.push((text.to_string(), range.start, range.end));
1276                }
1277                Event::Code(code) if !link_stack.is_empty() => {
1278                    // Include inline code in link text (with backticks)
1279                    let code_text = format!("`{code}`");
1280                    text_chunks.push((code_text, range.start, range.end));
1281                }
1282                Event::End(TagEnd::Link) => {
1283                    if let Some((start_pos, _link_start_end, url, link_type, ref_id)) = link_stack.pop() {
1284                        // Skip if in HTML comment
1285                        if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1286                            text_chunks.clear();
1287                            continue;
1288                        }
1289
1290                        // Find line and column information
1291                        let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1292
1293                        // Skip if this link is on a MkDocs snippet line
1294                        if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1295                            text_chunks.clear();
1296                            continue;
1297                        }
1298
1299                        let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1300
1301                        let is_reference = matches!(
1302                            link_type,
1303                            LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1304                        );
1305
1306                        // Extract link text directly from source bytes to preserve escaping
1307                        // Text events from pulldown-cmark unescape \] → ], which breaks MD039
1308                        let link_text = if start_pos < content.len() {
1309                            let link_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1310
1311                            // Find MATCHING ] by tracking bracket depth for nested brackets
1312                            // An unescaped bracket is one NOT preceded by an odd number of backslashes
1313                            // Brackets inside code spans (between backticks) should be ignored
1314                            let mut close_pos = None;
1315                            let mut depth = 0;
1316                            let mut in_code_span = false;
1317
1318                            for (i, &byte) in link_bytes.iter().enumerate().skip(1) {
1319                                // Count preceding backslashes
1320                                let mut backslash_count = 0;
1321                                let mut j = i;
1322                                while j > 0 && link_bytes[j - 1] == b'\\' {
1323                                    backslash_count += 1;
1324                                    j -= 1;
1325                                }
1326                                let is_escaped = backslash_count % 2 != 0;
1327
1328                                // Track code spans - backticks toggle in/out of code
1329                                if byte == b'`' && !is_escaped {
1330                                    in_code_span = !in_code_span;
1331                                }
1332
1333                                // Only count brackets when NOT in a code span
1334                                if !is_escaped && !in_code_span {
1335                                    if byte == b'[' {
1336                                        depth += 1;
1337                                    } else if byte == b']' {
1338                                        if depth == 0 {
1339                                            // Found the matching closing bracket
1340                                            close_pos = Some(i);
1341                                            break;
1342                                        } else {
1343                                            depth -= 1;
1344                                        }
1345                                    }
1346                                }
1347                            }
1348
1349                            if let Some(pos) = close_pos {
1350                                Cow::Borrowed(std::str::from_utf8(&link_bytes[1..pos]).unwrap_or(""))
1351                            } else {
1352                                Cow::Borrowed("")
1353                            }
1354                        } else {
1355                            Cow::Borrowed("")
1356                        };
1357
1358                        // For reference links, use the actual reference ID from pulldown-cmark
1359                        let reference_id = if is_reference && !ref_id.is_empty() {
1360                            Some(Cow::Owned(ref_id.to_lowercase()))
1361                        } else if is_reference {
1362                            // For collapsed/shortcut references without explicit ID, use the link text
1363                            Some(Cow::Owned(link_text.to_lowercase()))
1364                        } else {
1365                            None
1366                        };
1367
1368                        // Track this position as found
1369                        found_positions.insert(start_pos);
1370
1371                        links.push(ParsedLink {
1372                            line: line_num,
1373                            start_col: col_start,
1374                            end_col: col_end,
1375                            byte_offset: start_pos,
1376                            byte_end: range.end,
1377                            text: link_text,
1378                            url: Cow::Owned(url.to_string()),
1379                            is_reference,
1380                            reference_id,
1381                            link_type,
1382                        });
1383
1384                        text_chunks.clear();
1385                    }
1386                }
1387                Event::FootnoteReference(footnote_id) => {
1388                    // Capture footnote references like [^1], [^note]
1389                    // Skip if in HTML comment
1390                    if is_in_html_comment_ranges(html_comment_ranges, range.start) {
1391                        continue;
1392                    }
1393
1394                    let (_, line_num, _) = Self::find_line_for_offset(lines, range.start);
1395                    footnote_refs.push(FootnoteRef {
1396                        id: footnote_id.to_string(),
1397                        line: line_num,
1398                        byte_offset: range.start,
1399                        byte_end: range.end,
1400                    });
1401                }
1402                _ => {}
1403            }
1404        }
1405
1406        // Also find undefined references using regex
1407        // These are patterns like [text][ref] that pulldown-cmark didn't parse as links
1408        // because the reference is undefined
1409        for cap in LINK_PATTERN.captures_iter(content) {
1410            let full_match = cap.get(0).unwrap();
1411            let match_start = full_match.start();
1412            let match_end = full_match.end();
1413
1414            // Skip if this was already found by pulldown-cmark (it's a valid link)
1415            if found_positions.contains(&match_start) {
1416                continue;
1417            }
1418
1419            // Skip if escaped
1420            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1421                continue;
1422            }
1423
1424            // Skip if it's an image
1425            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
1426                continue;
1427            }
1428
1429            // Skip if in code block
1430            if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
1431                continue;
1432            }
1433
1434            // Skip if in code span
1435            if Self::is_offset_in_code_span(code_spans, match_start) {
1436                continue;
1437            }
1438
1439            // Skip if in HTML comment
1440            if is_in_html_comment_ranges(html_comment_ranges, match_start) {
1441                continue;
1442            }
1443
1444            // Find line and column information
1445            let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1446
1447            // Skip if this link is on a MkDocs snippet line
1448            if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1449                continue;
1450            }
1451
1452            let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1453
1454            let text = cap.get(1).map_or("", |m| m.as_str());
1455
1456            // Only process reference links (group 6)
1457            if let Some(ref_id) = cap.get(6) {
1458                let ref_id_str = ref_id.as_str();
1459                let normalized_ref = if ref_id_str.is_empty() {
1460                    Cow::Owned(text.to_lowercase()) // Implicit reference
1461                } else {
1462                    Cow::Owned(ref_id_str.to_lowercase())
1463                };
1464
1465                // This is an undefined reference (pulldown-cmark didn't parse it)
1466                links.push(ParsedLink {
1467                    line: line_num,
1468                    start_col: col_start,
1469                    end_col: col_end,
1470                    byte_offset: match_start,
1471                    byte_end: match_end,
1472                    text: Cow::Borrowed(text),
1473                    url: Cow::Borrowed(""), // Empty URL indicates undefined reference
1474                    is_reference: true,
1475                    reference_id: Some(normalized_ref),
1476                    link_type: LinkType::Reference, // Undefined references are reference-style
1477                });
1478            }
1479        }
1480
1481        (links, broken_links, footnote_refs)
1482    }
1483
1484    /// Parse all images in the content
1485    fn parse_images(
1486        content: &'a str,
1487        lines: &[LineInfo],
1488        code_blocks: &[(usize, usize)],
1489        code_spans: &[CodeSpan],
1490        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1491    ) -> Vec<ParsedImage<'a>> {
1492        use crate::utils::skip_context::is_in_html_comment_ranges;
1493        use std::collections::HashSet;
1494
1495        // Pre-size based on a heuristic: images are less common than links
1496        let mut images = Vec::with_capacity(content.len() / 1000);
1497        let mut found_positions = HashSet::new();
1498
1499        // Use pulldown-cmark for parsing - more accurate and faster
1500        let parser = Parser::new(content).into_offset_iter();
1501        let mut image_stack: Vec<(usize, pulldown_cmark::CowStr<'a>, LinkType, pulldown_cmark::CowStr<'a>)> =
1502            Vec::new();
1503        let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1504
1505        for (event, range) in parser {
1506            match event {
1507                Event::Start(Tag::Image {
1508                    link_type,
1509                    dest_url,
1510                    id,
1511                    ..
1512                }) => {
1513                    image_stack.push((range.start, dest_url, link_type, id));
1514                    text_chunks.clear();
1515                }
1516                Event::Text(text) if !image_stack.is_empty() => {
1517                    text_chunks.push((text.to_string(), range.start, range.end));
1518                }
1519                Event::Code(code) if !image_stack.is_empty() => {
1520                    let code_text = format!("`{code}`");
1521                    text_chunks.push((code_text, range.start, range.end));
1522                }
1523                Event::End(TagEnd::Image) => {
1524                    if let Some((start_pos, url, link_type, ref_id)) = image_stack.pop() {
1525                        // Skip if in code block
1526                        if CodeBlockUtils::is_in_code_block(code_blocks, start_pos) {
1527                            continue;
1528                        }
1529
1530                        // Skip if in code span
1531                        if Self::is_offset_in_code_span(code_spans, start_pos) {
1532                            continue;
1533                        }
1534
1535                        // Skip if in HTML comment
1536                        if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1537                            continue;
1538                        }
1539
1540                        // Find line and column using binary search
1541                        let (_, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1542                        let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1543
1544                        let is_reference = matches!(
1545                            link_type,
1546                            LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1547                        );
1548
1549                        // Extract alt text directly from source bytes to preserve escaping
1550                        // Text events from pulldown-cmark unescape \] → ], which breaks rules that need escaping
1551                        let alt_text = if start_pos < content.len() {
1552                            let image_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1553
1554                            // Find MATCHING ] by tracking bracket depth for nested brackets
1555                            // An unescaped bracket is one NOT preceded by an odd number of backslashes
1556                            let mut close_pos = None;
1557                            let mut depth = 0;
1558
1559                            if image_bytes.len() > 2 {
1560                                for (i, &byte) in image_bytes.iter().enumerate().skip(2) {
1561                                    // Count preceding backslashes
1562                                    let mut backslash_count = 0;
1563                                    let mut j = i;
1564                                    while j > 0 && image_bytes[j - 1] == b'\\' {
1565                                        backslash_count += 1;
1566                                        j -= 1;
1567                                    }
1568                                    let is_escaped = backslash_count % 2 != 0;
1569
1570                                    if !is_escaped {
1571                                        if byte == b'[' {
1572                                            depth += 1;
1573                                        } else if byte == b']' {
1574                                            if depth == 0 {
1575                                                // Found the matching closing bracket
1576                                                close_pos = Some(i);
1577                                                break;
1578                                            } else {
1579                                                depth -= 1;
1580                                            }
1581                                        }
1582                                    }
1583                                }
1584                            }
1585
1586                            if let Some(pos) = close_pos {
1587                                Cow::Borrowed(std::str::from_utf8(&image_bytes[2..pos]).unwrap_or(""))
1588                            } else {
1589                                Cow::Borrowed("")
1590                            }
1591                        } else {
1592                            Cow::Borrowed("")
1593                        };
1594
1595                        let reference_id = if is_reference && !ref_id.is_empty() {
1596                            Some(Cow::Owned(ref_id.to_lowercase()))
1597                        } else if is_reference {
1598                            Some(Cow::Owned(alt_text.to_lowercase())) // Collapsed/shortcut references
1599                        } else {
1600                            None
1601                        };
1602
1603                        found_positions.insert(start_pos);
1604                        images.push(ParsedImage {
1605                            line: line_num,
1606                            start_col: col_start,
1607                            end_col: col_end,
1608                            byte_offset: start_pos,
1609                            byte_end: range.end,
1610                            alt_text,
1611                            url: Cow::Owned(url.to_string()),
1612                            is_reference,
1613                            reference_id,
1614                            link_type,
1615                        });
1616                    }
1617                }
1618                _ => {}
1619            }
1620        }
1621
1622        // Regex fallback for undefined references that pulldown-cmark treats as plain text
1623        for cap in IMAGE_PATTERN.captures_iter(content) {
1624            let full_match = cap.get(0).unwrap();
1625            let match_start = full_match.start();
1626            let match_end = full_match.end();
1627
1628            // Skip if already found by pulldown-cmark
1629            if found_positions.contains(&match_start) {
1630                continue;
1631            }
1632
1633            // Skip if the ! is escaped
1634            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1635                continue;
1636            }
1637
1638            // Skip if in code block, code span, or HTML comment
1639            if CodeBlockUtils::is_in_code_block(code_blocks, match_start)
1640                || Self::is_offset_in_code_span(code_spans, match_start)
1641                || is_in_html_comment_ranges(html_comment_ranges, match_start)
1642            {
1643                continue;
1644            }
1645
1646            // Only process reference images (undefined references not found by pulldown-cmark)
1647            if let Some(ref_id) = cap.get(6) {
1648                let (_, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1649                let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1650                let alt_text = cap.get(1).map_or("", |m| m.as_str());
1651                let ref_id_str = ref_id.as_str();
1652                let normalized_ref = if ref_id_str.is_empty() {
1653                    Cow::Owned(alt_text.to_lowercase())
1654                } else {
1655                    Cow::Owned(ref_id_str.to_lowercase())
1656                };
1657
1658                images.push(ParsedImage {
1659                    line: line_num,
1660                    start_col: col_start,
1661                    end_col: col_end,
1662                    byte_offset: match_start,
1663                    byte_end: match_end,
1664                    alt_text: Cow::Borrowed(alt_text),
1665                    url: Cow::Borrowed(""),
1666                    is_reference: true,
1667                    reference_id: Some(normalized_ref),
1668                    link_type: LinkType::Reference, // Undefined references are reference-style
1669                });
1670            }
1671        }
1672
1673        images
1674    }
1675
1676    /// Parse reference definitions
1677    fn parse_reference_defs(content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
1678        // Pre-size based on lines count as reference definitions are line-based
1679        let mut refs = Vec::with_capacity(lines.len() / 20); // ~1 ref per 20 lines
1680
1681        for (line_idx, line_info) in lines.iter().enumerate() {
1682            // Skip lines in code blocks
1683            if line_info.in_code_block {
1684                continue;
1685            }
1686
1687            let line = line_info.content(content);
1688            let line_num = line_idx + 1;
1689
1690            if let Some(cap) = REF_DEF_PATTERN.captures(line) {
1691                let id = cap.get(1).unwrap().as_str().to_lowercase();
1692                let url = cap.get(2).unwrap().as_str().to_string();
1693                let title_match = cap.get(3).or_else(|| cap.get(4));
1694                let title = title_match.map(|m| m.as_str().to_string());
1695
1696                // Calculate byte positions
1697                // The match starts at the beginning of the line (0) and extends to the end
1698                let match_obj = cap.get(0).unwrap();
1699                let byte_offset = line_info.byte_offset + match_obj.start();
1700                let byte_end = line_info.byte_offset + match_obj.end();
1701
1702                // Calculate title byte positions (includes the quote character before content)
1703                let (title_byte_start, title_byte_end) = if let Some(m) = title_match {
1704                    // The match is the content inside quotes, so we include the quote before
1705                    let start = line_info.byte_offset + m.start().saturating_sub(1);
1706                    let end = line_info.byte_offset + m.end() + 1; // Include closing quote
1707                    (Some(start), Some(end))
1708                } else {
1709                    (None, None)
1710                };
1711
1712                refs.push(ReferenceDef {
1713                    line: line_num,
1714                    id,
1715                    url,
1716                    title,
1717                    byte_offset,
1718                    byte_end,
1719                    title_byte_start,
1720                    title_byte_end,
1721                });
1722            }
1723        }
1724
1725        refs
1726    }
1727
1728    /// Fast blockquote prefix parser - replaces regex for 5-10x speedup
1729    /// Handles nested blockquotes like `> > > content`
1730    /// Returns: Some((prefix_with_ws, content_after_prefix)) or None
1731    #[inline]
1732    fn parse_blockquote_prefix(line: &str) -> Option<(&str, &str)> {
1733        let trimmed_start = line.trim_start();
1734        if !trimmed_start.starts_with('>') {
1735            return None;
1736        }
1737
1738        // Track total prefix length to handle nested blockquotes
1739        let mut remaining = line;
1740        let mut total_prefix_len = 0;
1741
1742        loop {
1743            let trimmed = remaining.trim_start();
1744            if !trimmed.starts_with('>') {
1745                break;
1746            }
1747
1748            // Add leading whitespace + '>' to prefix
1749            let leading_ws_len = remaining.len() - trimmed.len();
1750            total_prefix_len += leading_ws_len + 1;
1751
1752            let after_gt = &trimmed[1..];
1753
1754            // Handle optional whitespace after '>' (space or tab)
1755            if let Some(stripped) = after_gt.strip_prefix(' ') {
1756                total_prefix_len += 1;
1757                remaining = stripped;
1758            } else if let Some(stripped) = after_gt.strip_prefix('\t') {
1759                total_prefix_len += 1;
1760                remaining = stripped;
1761            } else {
1762                remaining = after_gt;
1763            }
1764        }
1765
1766        Some((&line[..total_prefix_len], remaining))
1767    }
1768
1769    /// Fast unordered list parser - replaces regex for 5-10x speedup
1770    /// Matches: ^(\s*)([-*+])([ \t]*)(.*)
1771    /// Returns: Some((leading_ws, marker, spacing, content)) or None
1772    #[inline]
1773    fn parse_unordered_list(line: &str) -> Option<(&str, char, &str, &str)> {
1774        let bytes = line.as_bytes();
1775        let mut i = 0;
1776
1777        // Skip leading whitespace
1778        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1779            i += 1;
1780        }
1781
1782        // Check for marker
1783        if i >= bytes.len() {
1784            return None;
1785        }
1786        let marker = bytes[i] as char;
1787        if marker != '-' && marker != '*' && marker != '+' {
1788            return None;
1789        }
1790        let marker_pos = i;
1791        i += 1;
1792
1793        // Collect spacing after marker (space or tab only)
1794        let spacing_start = i;
1795        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1796            i += 1;
1797        }
1798
1799        Some((&line[..marker_pos], marker, &line[spacing_start..i], &line[i..]))
1800    }
1801
1802    /// Fast ordered list parser - replaces regex for 5-10x speedup
1803    /// Matches: ^(\s*)(\d+)([.)])([ \t]*)(.*)
1804    /// Returns: Some((leading_ws, number_str, delimiter, spacing, content)) or None
1805    #[inline]
1806    fn parse_ordered_list(line: &str) -> Option<(&str, &str, char, &str, &str)> {
1807        let bytes = line.as_bytes();
1808        let mut i = 0;
1809
1810        // Skip leading whitespace
1811        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1812            i += 1;
1813        }
1814
1815        // Collect digits
1816        let number_start = i;
1817        while i < bytes.len() && bytes[i].is_ascii_digit() {
1818            i += 1;
1819        }
1820        if i == number_start {
1821            return None; // No digits found
1822        }
1823
1824        // Check for delimiter
1825        if i >= bytes.len() {
1826            return None;
1827        }
1828        let delimiter = bytes[i] as char;
1829        if delimiter != '.' && delimiter != ')' {
1830            return None;
1831        }
1832        let delimiter_pos = i;
1833        i += 1;
1834
1835        // Collect spacing after delimiter (space or tab only)
1836        let spacing_start = i;
1837        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1838            i += 1;
1839        }
1840
1841        Some((
1842            &line[..number_start],
1843            &line[number_start..delimiter_pos],
1844            delimiter,
1845            &line[spacing_start..i],
1846            &line[i..],
1847        ))
1848    }
1849
1850    /// Pre-compute which lines are in code blocks - O(m*n) where m=code_blocks, n=lines
1851    /// Returns a Vec<bool> where index i indicates if line i is in a code block
1852    fn compute_code_block_line_map(content: &str, line_offsets: &[usize], code_blocks: &[(usize, usize)]) -> Vec<bool> {
1853        let num_lines = line_offsets.len();
1854        let mut in_code_block = vec![false; num_lines];
1855
1856        // For each code block, mark all lines within it
1857        for &(start, end) in code_blocks {
1858            // Ensure we're at valid UTF-8 boundaries
1859            let safe_start = if start > 0 && !content.is_char_boundary(start) {
1860                let mut boundary = start;
1861                while boundary > 0 && !content.is_char_boundary(boundary) {
1862                    boundary -= 1;
1863                }
1864                boundary
1865            } else {
1866                start
1867            };
1868
1869            let safe_end = if end < content.len() && !content.is_char_boundary(end) {
1870                let mut boundary = end;
1871                while boundary < content.len() && !content.is_char_boundary(boundary) {
1872                    boundary += 1;
1873                }
1874                boundary
1875            } else {
1876                end.min(content.len())
1877            };
1878
1879            // Trust the code blocks detected by CodeBlockUtils::detect_code_blocks()
1880            // That function now has proper list context awareness (see code_block_utils.rs)
1881            // and correctly distinguishes between:
1882            // - Fenced code blocks (``` or ~~~)
1883            // - Indented code blocks at document level (4 spaces + blank line before)
1884            // - List continuation paragraphs (NOT code blocks, even with 4 spaces)
1885            //
1886            // We no longer need to re-validate here. The original validation logic
1887            // was causing false positives by marking list continuation paragraphs as
1888            // code blocks when they have 4 spaces of indentation.
1889
1890            // Use binary search to find the first and last line indices
1891            // line_offsets is sorted, so we can use partition_point for O(log n) lookup
1892            // Use safe_start/safe_end (UTF-8 boundaries) for consistent line mapping
1893            //
1894            // Find the line that CONTAINS safe_start: the line with the largest
1895            // start offset that is <= safe_start. partition_point gives us the
1896            // first line that starts AFTER safe_start, so we subtract 1.
1897            let first_line_after = line_offsets.partition_point(|&offset| offset <= safe_start);
1898            let first_line = first_line_after.saturating_sub(1);
1899            let last_line = line_offsets.partition_point(|&offset| offset < safe_end);
1900
1901            // Mark all lines in the range at once
1902            for flag in in_code_block.iter_mut().take(last_line).skip(first_line) {
1903                *flag = true;
1904            }
1905        }
1906
1907        in_code_block
1908    }
1909
1910    /// Pre-compute basic line information (without headings/blockquotes)
1911    fn compute_basic_line_info(
1912        content: &str,
1913        line_offsets: &[usize],
1914        code_blocks: &[(usize, usize)],
1915        flavor: MarkdownFlavor,
1916        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1917        autodoc_ranges: &[crate::utils::skip_context::ByteRange],
1918    ) -> Vec<LineInfo> {
1919        let content_lines: Vec<&str> = content.lines().collect();
1920        let mut lines = Vec::with_capacity(content_lines.len());
1921
1922        // Pre-compute which lines are in code blocks
1923        let code_block_map = Self::compute_code_block_line_map(content, line_offsets, code_blocks);
1924
1925        // Detect front matter boundaries FIRST, before any other parsing
1926        // Use FrontMatterUtils to detect all types of front matter (YAML, TOML, JSON, malformed)
1927        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1928
1929        for (i, line) in content_lines.iter().enumerate() {
1930            let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
1931            let indent = line.len() - line.trim_start().len();
1932
1933            // Parse blockquote prefix once and reuse it (avoid redundant parsing)
1934            let blockquote_parse = Self::parse_blockquote_prefix(line);
1935
1936            // For blank detection, consider blockquote context
1937            let is_blank = if let Some((_, content)) = blockquote_parse {
1938                // In blockquote context, check if content after prefix is blank
1939                content.trim().is_empty()
1940            } else {
1941                line.trim().is_empty()
1942            };
1943
1944            // Use pre-computed map for O(1) lookup instead of O(m) iteration
1945            let in_code_block = code_block_map.get(i).copied().unwrap_or(false);
1946
1947            // Detect list items (skip if in frontmatter, in mkdocstrings block, or in HTML comment)
1948            let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
1949                && crate::utils::mkdocstrings_refs::is_within_autodoc_block_ranges(autodoc_ranges, byte_offset);
1950            // Check if the ENTIRE line is within an HTML comment (not just the line start)
1951            // This ensures content after `-->` on the same line is not incorrectly skipped
1952            let line_end_offset = byte_offset + line.len();
1953            let in_html_comment = crate::utils::skip_context::is_line_entirely_in_html_comment(
1954                html_comment_ranges,
1955                byte_offset,
1956                line_end_offset,
1957            );
1958            let list_item = if !(in_code_block
1959                || is_blank
1960                || in_mkdocstrings
1961                || in_html_comment
1962                || (front_matter_end > 0 && i < front_matter_end))
1963            {
1964                // Strip blockquote prefix if present for list detection (reuse cached result)
1965                let (line_for_list_check, blockquote_prefix_len) = if let Some((prefix, content)) = blockquote_parse {
1966                    (content, prefix.len())
1967                } else {
1968                    (&**line, 0)
1969                };
1970
1971                if let Some((leading_spaces, marker, spacing, _content)) =
1972                    Self::parse_unordered_list(line_for_list_check)
1973                {
1974                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1975                    let content_column = marker_column + 1 + spacing.len();
1976
1977                    // According to CommonMark spec, unordered list items MUST have at least one space
1978                    // after the marker (-, *, or +). Without a space, it's not a list item.
1979                    // This also naturally handles cases like:
1980                    // - *emphasis* (not a list)
1981                    // - **bold** (not a list)
1982                    // - --- (horizontal rule, not a list)
1983                    if spacing.is_empty() {
1984                        None
1985                    } else {
1986                        Some(ListItemInfo {
1987                            marker: marker.to_string(),
1988                            is_ordered: false,
1989                            number: None,
1990                            marker_column,
1991                            content_column,
1992                        })
1993                    }
1994                } else if let Some((leading_spaces, number_str, delimiter, spacing, _content)) =
1995                    Self::parse_ordered_list(line_for_list_check)
1996                {
1997                    let marker = format!("{number_str}{delimiter}");
1998                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1999                    let content_column = marker_column + marker.len() + spacing.len();
2000
2001                    // According to CommonMark spec, ordered list items MUST have at least one space
2002                    // after the marker (period or parenthesis). Without a space, it's not a list item.
2003                    if spacing.is_empty() {
2004                        None
2005                    } else {
2006                        Some(ListItemInfo {
2007                            marker,
2008                            is_ordered: true,
2009                            number: number_str.parse().ok(),
2010                            marker_column,
2011                            content_column,
2012                        })
2013                    }
2014                } else {
2015                    None
2016                }
2017            } else {
2018                None
2019            };
2020
2021            lines.push(LineInfo {
2022                byte_offset,
2023                byte_len: line.len(),
2024                indent,
2025                is_blank,
2026                in_code_block,
2027                in_front_matter: front_matter_end > 0 && i < front_matter_end,
2028                in_html_block: false, // Will be populated after line creation
2029                in_html_comment,
2030                list_item,
2031                heading: None,    // Will be populated in second pass for Setext headings
2032                blockquote: None, // Will be populated after line creation
2033                in_mkdocstrings,
2034                in_esm_block: false, // Will be populated after line creation for MDX files
2035                in_code_span_continuation: false, // Will be populated after code spans are parsed
2036            });
2037        }
2038
2039        lines
2040    }
2041
2042    /// Detect headings and blockquotes (called after HTML block detection)
2043    fn detect_headings_and_blockquotes(
2044        content: &str,
2045        lines: &mut [LineInfo],
2046        flavor: MarkdownFlavor,
2047        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
2048        link_byte_ranges: &[(usize, usize)],
2049    ) {
2050        // Regex for heading detection
2051        static ATX_HEADING_REGEX: LazyLock<regex::Regex> =
2052            LazyLock::new(|| regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap());
2053        static SETEXT_UNDERLINE_REGEX: LazyLock<regex::Regex> =
2054            LazyLock::new(|| regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap());
2055
2056        let content_lines: Vec<&str> = content.lines().collect();
2057
2058        // Detect front matter boundaries to skip those lines
2059        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
2060
2061        // Detect headings (including Setext which needs look-ahead) and blockquotes
2062        for i in 0..lines.len() {
2063            if lines[i].in_code_block {
2064                continue;
2065            }
2066
2067            // Skip lines in front matter
2068            if front_matter_end > 0 && i < front_matter_end {
2069                continue;
2070            }
2071
2072            // Skip lines in HTML blocks - HTML content should not be parsed as markdown
2073            if lines[i].in_html_block {
2074                continue;
2075            }
2076
2077            let line = content_lines[i];
2078
2079            // Check for blockquotes (even on blank lines within blockquotes)
2080            if let Some(bq) = parse_blockquote_detailed(line) {
2081                let nesting_level = bq.markers.len(); // Each '>' is one level
2082                let marker_column = bq.indent.len();
2083
2084                // Build the prefix (indentation + markers + space)
2085                let prefix = format!("{}{}{}", bq.indent, bq.markers, bq.spaces_after);
2086
2087                // Check for various blockquote issues
2088                let has_no_space = bq.spaces_after.is_empty() && !bq.content.is_empty();
2089                // Only flag multiple literal spaces, not tabs
2090                // Tabs are handled by MD010 (no-hard-tabs), matching markdownlint behavior
2091                let has_multiple_spaces = bq.spaces_after.chars().filter(|&c| c == ' ').count() > 1;
2092
2093                // Check if needs MD028 fix (empty blockquote line without proper spacing)
2094                // MD028 flags empty blockquote lines that don't have a single space after the marker
2095                // Lines like "> " or ">> " are already correct and don't need fixing
2096                let needs_md028_fix = bq.content.is_empty() && bq.spaces_after.is_empty();
2097
2098                lines[i].blockquote = Some(BlockquoteInfo {
2099                    nesting_level,
2100                    indent: bq.indent.to_string(),
2101                    marker_column,
2102                    prefix,
2103                    content: bq.content.to_string(),
2104                    has_no_space_after_marker: has_no_space,
2105                    has_multiple_spaces_after_marker: has_multiple_spaces,
2106                    needs_md028_fix,
2107                });
2108            }
2109
2110            // Skip heading detection for blank lines
2111            if lines[i].is_blank {
2112                continue;
2113            }
2114
2115            // Check for ATX headings (but skip MkDocs snippet lines)
2116            // In MkDocs flavor, lines like "# -8<- [start:name]" are snippet markers, not headings
2117            let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
2118                crate::utils::mkdocs_snippets::is_snippet_section_start(line)
2119                    || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
2120            } else {
2121                false
2122            };
2123
2124            if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
2125                // Skip headings inside HTML comments (using pre-computed ranges for efficiency)
2126                if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset) {
2127                    continue;
2128                }
2129                // Skip lines that fall within link syntax (e.g., multiline links like `[text](url\n#fragment)`)
2130                // This prevents false positives where `#fragment` is detected as a heading
2131                let line_offset = lines[i].byte_offset;
2132                if link_byte_ranges
2133                    .iter()
2134                    .any(|&(start, end)| line_offset > start && line_offset < end)
2135                {
2136                    continue;
2137                }
2138                let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
2139                let hashes = caps.get(2).map_or("", |m| m.as_str());
2140                let spaces_after = caps.get(3).map_or("", |m| m.as_str());
2141                let rest = caps.get(4).map_or("", |m| m.as_str());
2142
2143                let level = hashes.len() as u8;
2144                let marker_column = leading_spaces.len();
2145
2146                // Check for closing sequence, but handle custom IDs that might come after
2147                let (text, has_closing, closing_seq) = {
2148                    // First check if there's a custom ID at the end
2149                    let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
2150                        // Check if this looks like a valid custom ID (ends with })
2151                        if rest[id_start..].trim_end().ends_with('}') {
2152                            // Split off the custom ID
2153                            (&rest[..id_start], &rest[id_start..])
2154                        } else {
2155                            (rest, "")
2156                        }
2157                    } else {
2158                        (rest, "")
2159                    };
2160
2161                    // Now look for closing hashes in the part before the custom ID
2162                    let trimmed_rest = rest_without_id.trim_end();
2163                    if let Some(last_hash_byte_pos) = trimmed_rest.rfind('#') {
2164                        // Find the start of the hash sequence by walking backwards
2165                        // Use char_indices to get byte positions at char boundaries
2166                        let char_positions: Vec<(usize, char)> = trimmed_rest.char_indices().collect();
2167
2168                        // Find which char index corresponds to last_hash_byte_pos
2169                        let last_hash_char_idx = char_positions
2170                            .iter()
2171                            .position(|(byte_pos, _)| *byte_pos == last_hash_byte_pos);
2172
2173                        if let Some(mut char_idx) = last_hash_char_idx {
2174                            // Walk backwards to find start of hash sequence
2175                            while char_idx > 0 && char_positions[char_idx - 1].1 == '#' {
2176                                char_idx -= 1;
2177                            }
2178
2179                            // Get the byte position of the start of hashes
2180                            let start_of_hashes = char_positions[char_idx].0;
2181
2182                            // Check if there's at least one space before the closing hashes
2183                            let has_space_before = char_idx == 0 || char_positions[char_idx - 1].1.is_whitespace();
2184
2185                            // Check if this is a valid closing sequence (all hashes to end of trimmed part)
2186                            let potential_closing = &trimmed_rest[start_of_hashes..];
2187                            let is_all_hashes = potential_closing.chars().all(|c| c == '#');
2188
2189                            if is_all_hashes && has_space_before {
2190                                // This is a closing sequence
2191                                let closing_hashes = potential_closing.to_string();
2192                                // The text is everything before the closing hashes
2193                                // Don't include the custom ID here - it will be extracted later
2194                                let text_part = if !custom_id_part.is_empty() {
2195                                    // If we have a custom ID, append it back to get the full rest
2196                                    // This allows the extract_header_id function to handle it properly
2197                                    format!("{}{}", trimmed_rest[..start_of_hashes].trim_end(), custom_id_part)
2198                                } else {
2199                                    trimmed_rest[..start_of_hashes].trim_end().to_string()
2200                                };
2201                                (text_part, true, closing_hashes)
2202                            } else {
2203                                // Not a valid closing sequence, return the full content
2204                                (rest.to_string(), false, String::new())
2205                            }
2206                        } else {
2207                            // Couldn't find char boundary, return the full content
2208                            (rest.to_string(), false, String::new())
2209                        }
2210                    } else {
2211                        // No hashes found, return the full content
2212                        (rest.to_string(), false, String::new())
2213                    }
2214                };
2215
2216                let content_column = marker_column + hashes.len() + spaces_after.len();
2217
2218                // Extract custom header ID if present
2219                let raw_text = text.trim().to_string();
2220                let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
2221
2222                // If no custom ID was found on the header line, check the next line for standalone attr-list
2223                if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
2224                    let next_line = content_lines[i + 1];
2225                    if !lines[i + 1].in_code_block
2226                        && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
2227                        && let Some(next_line_id) =
2228                            crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
2229                    {
2230                        custom_id = Some(next_line_id);
2231                    }
2232                }
2233
2234                // ATX heading is "valid" for processing by heading rules if:
2235                // 1. Has space after # (CommonMark compliant): `# Heading`
2236                // 2. Is empty (just hashes): `#`
2237                // 3. Has multiple hashes (##intro is likely intended heading, not hashtag)
2238                // 4. Content starts with uppercase (likely intended heading, not social hashtag)
2239                //
2240                // Invalid patterns (hashtag-like) are skipped by most heading rules:
2241                // - `#tag` - single # with lowercase (social hashtag)
2242                // - `#123` - single # with number (GitHub issue ref)
2243                let is_valid = !spaces_after.is_empty()
2244                    || rest.is_empty()
2245                    || level > 1
2246                    || rest.trim().chars().next().is_some_and(|c| c.is_uppercase());
2247
2248                lines[i].heading = Some(HeadingInfo {
2249                    level,
2250                    style: HeadingStyle::ATX,
2251                    marker: hashes.to_string(),
2252                    marker_column,
2253                    content_column,
2254                    text: clean_text,
2255                    custom_id,
2256                    raw_text,
2257                    has_closing_sequence: has_closing,
2258                    closing_sequence: closing_seq,
2259                    is_valid,
2260                });
2261            }
2262            // Check for Setext headings (need to look at next line)
2263            else if i + 1 < content_lines.len() && i + 1 < lines.len() {
2264                let next_line = content_lines[i + 1];
2265                if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
2266                    // Skip if next line is front matter delimiter
2267                    if front_matter_end > 0 && i < front_matter_end {
2268                        continue;
2269                    }
2270
2271                    // Skip Setext headings inside HTML comments (using pre-computed ranges for efficiency)
2272                    if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset)
2273                    {
2274                        continue;
2275                    }
2276
2277                    let underline = next_line.trim();
2278
2279                    let level = if underline.starts_with('=') { 1 } else { 2 };
2280                    let style = if level == 1 {
2281                        HeadingStyle::Setext1
2282                    } else {
2283                        HeadingStyle::Setext2
2284                    };
2285
2286                    // Extract custom header ID if present
2287                    let raw_text = line.trim().to_string();
2288                    let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
2289
2290                    // If no custom ID was found on the header line, check the line after underline for standalone attr-list
2291                    if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
2292                        let attr_line = content_lines[i + 2];
2293                        if !lines[i + 2].in_code_block
2294                            && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
2295                            && let Some(attr_line_id) =
2296                                crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
2297                        {
2298                            custom_id = Some(attr_line_id);
2299                        }
2300                    }
2301
2302                    lines[i].heading = Some(HeadingInfo {
2303                        level,
2304                        style,
2305                        marker: underline.to_string(),
2306                        marker_column: next_line.len() - next_line.trim_start().len(),
2307                        content_column: lines[i].indent,
2308                        text: clean_text,
2309                        custom_id,
2310                        raw_text,
2311                        has_closing_sequence: false,
2312                        closing_sequence: String::new(),
2313                        is_valid: true, // Setext headings are always valid
2314                    });
2315                }
2316            }
2317        }
2318    }
2319
2320    /// Detect HTML blocks in the content
2321    fn detect_html_blocks(content: &str, lines: &mut [LineInfo]) {
2322        // HTML block elements that trigger block context
2323        // Includes HTML5 media, embedded content, and interactive elements
2324        const BLOCK_ELEMENTS: &[&str] = &[
2325            "address",
2326            "article",
2327            "aside",
2328            "audio",
2329            "blockquote",
2330            "canvas",
2331            "details",
2332            "dialog",
2333            "dd",
2334            "div",
2335            "dl",
2336            "dt",
2337            "embed",
2338            "fieldset",
2339            "figcaption",
2340            "figure",
2341            "footer",
2342            "form",
2343            "h1",
2344            "h2",
2345            "h3",
2346            "h4",
2347            "h5",
2348            "h6",
2349            "header",
2350            "hr",
2351            "iframe",
2352            "li",
2353            "main",
2354            "menu",
2355            "nav",
2356            "noscript",
2357            "object",
2358            "ol",
2359            "p",
2360            "picture",
2361            "pre",
2362            "script",
2363            "search",
2364            "section",
2365            "source",
2366            "style",
2367            "summary",
2368            "svg",
2369            "table",
2370            "tbody",
2371            "td",
2372            "template",
2373            "textarea",
2374            "tfoot",
2375            "th",
2376            "thead",
2377            "tr",
2378            "track",
2379            "ul",
2380            "video",
2381        ];
2382
2383        let mut i = 0;
2384        while i < lines.len() {
2385            // Skip if already in code block or front matter
2386            if lines[i].in_code_block || lines[i].in_front_matter {
2387                i += 1;
2388                continue;
2389            }
2390
2391            let trimmed = lines[i].content(content).trim_start();
2392
2393            // Check if line starts with an HTML tag
2394            if trimmed.starts_with('<') && trimmed.len() > 1 {
2395                // Extract tag name safely
2396                let after_bracket = &trimmed[1..];
2397                let is_closing = after_bracket.starts_with('/');
2398                let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
2399
2400                // Extract tag name (stop at space, >, /, or end of string)
2401                let tag_name = tag_start
2402                    .chars()
2403                    .take_while(|c| c.is_ascii_alphabetic() || *c == '-' || c.is_ascii_digit())
2404                    .collect::<String>()
2405                    .to_lowercase();
2406
2407                // Check if it's a block element
2408                if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
2409                    // Mark this line as in HTML block
2410                    lines[i].in_html_block = true;
2411
2412                    // For simplicity, just mark lines until we find a closing tag or reach a blank line
2413                    // This avoids complex nesting logic that might cause infinite loops
2414                    if !is_closing {
2415                        let closing_tag = format!("</{tag_name}>");
2416                        // style and script tags can contain blank lines (CSS/JS formatting)
2417                        let allow_blank_lines = tag_name == "style" || tag_name == "script";
2418                        let mut j = i + 1;
2419                        while j < lines.len() && j < i + 100 {
2420                            // Limit search to 100 lines
2421                            // Stop at blank lines (except for style/script tags)
2422                            if !allow_blank_lines && lines[j].is_blank {
2423                                break;
2424                            }
2425
2426                            lines[j].in_html_block = true;
2427
2428                            // Check if this line contains the closing tag
2429                            if lines[j].content(content).contains(&closing_tag) {
2430                                break;
2431                            }
2432                            j += 1;
2433                        }
2434                    }
2435                }
2436            }
2437
2438            i += 1;
2439        }
2440    }
2441
2442    /// Detect ESM import/export blocks in MDX files
2443    /// ESM blocks consist of contiguous import/export statements at the top of the file
2444    fn detect_esm_blocks(content: &str, lines: &mut [LineInfo], flavor: MarkdownFlavor) {
2445        // Only process MDX files
2446        if !flavor.supports_esm_blocks() {
2447            return;
2448        }
2449
2450        let mut in_multiline_comment = false;
2451
2452        for line in lines.iter_mut() {
2453            // Skip blank lines and HTML comments
2454            if line.is_blank || line.in_html_comment {
2455                continue;
2456            }
2457
2458            let trimmed = line.content(content).trim_start();
2459
2460            // Handle continuation of multi-line JS comments
2461            if in_multiline_comment {
2462                if trimmed.contains("*/") {
2463                    in_multiline_comment = false;
2464                }
2465                continue;
2466            }
2467
2468            // Skip single-line JS comments (// and ///)
2469            if trimmed.starts_with("//") {
2470                continue;
2471            }
2472
2473            // Handle start of multi-line JS comment
2474            if trimmed.starts_with("/*") {
2475                if !trimmed.contains("*/") {
2476                    in_multiline_comment = true;
2477                }
2478                continue;
2479            }
2480
2481            // Check if line starts with import or export
2482            if trimmed.starts_with("import ") || trimmed.starts_with("export ") {
2483                line.in_esm_block = true;
2484            } else {
2485                // Once we hit a non-ESM, non-comment line, we're done with the ESM block
2486                break;
2487            }
2488        }
2489    }
2490
2491    /// Parse all inline code spans in the content using pulldown-cmark streaming parser
2492    fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
2493        let mut code_spans = Vec::new();
2494
2495        // Quick check - if no backticks, no code spans
2496        if !content.contains('`') {
2497            return code_spans;
2498        }
2499
2500        // Use pulldown-cmark's streaming parser with byte offsets
2501        let parser = Parser::new(content).into_offset_iter();
2502
2503        for (event, range) in parser {
2504            if let Event::Code(_) = event {
2505                let start_pos = range.start;
2506                let end_pos = range.end;
2507
2508                // The range includes the backticks, extract the actual content
2509                let full_span = &content[start_pos..end_pos];
2510                let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
2511
2512                // Extract content between backticks, preserving spaces
2513                let content_start = start_pos + backtick_count;
2514                let content_end = end_pos - backtick_count;
2515                let span_content = if content_start < content_end {
2516                    content[content_start..content_end].to_string()
2517                } else {
2518                    String::new()
2519                };
2520
2521                // Use binary search to find line number - O(log n) instead of O(n)
2522                // Find the rightmost line whose byte_offset <= start_pos
2523                let line_idx = lines
2524                    .partition_point(|line| line.byte_offset <= start_pos)
2525                    .saturating_sub(1);
2526                let line_num = line_idx + 1;
2527                let byte_col_start = start_pos - lines[line_idx].byte_offset;
2528
2529                // Find end column using binary search
2530                let end_line_idx = lines
2531                    .partition_point(|line| line.byte_offset <= end_pos)
2532                    .saturating_sub(1);
2533                let byte_col_end = end_pos - lines[end_line_idx].byte_offset;
2534
2535                // Convert byte offsets to character positions for correct Unicode handling
2536                // This ensures consistency with warning.column which uses character positions
2537                let line_content = lines[line_idx].content(content);
2538                let col_start = if byte_col_start <= line_content.len() {
2539                    line_content[..byte_col_start].chars().count()
2540                } else {
2541                    line_content.chars().count()
2542                };
2543
2544                let end_line_content = lines[end_line_idx].content(content);
2545                let col_end = if byte_col_end <= end_line_content.len() {
2546                    end_line_content[..byte_col_end].chars().count()
2547                } else {
2548                    end_line_content.chars().count()
2549                };
2550
2551                code_spans.push(CodeSpan {
2552                    line: line_num,
2553                    end_line: end_line_idx + 1,
2554                    start_col: col_start,
2555                    end_col: col_end,
2556                    byte_offset: start_pos,
2557                    byte_end: end_pos,
2558                    backtick_count,
2559                    content: span_content,
2560                });
2561            }
2562        }
2563
2564        // Sort by position to ensure consistent ordering
2565        code_spans.sort_by_key(|span| span.byte_offset);
2566
2567        code_spans
2568    }
2569
2570    /// Parse all list blocks in the content (legacy line-by-line approach)
2571    ///
2572    /// Uses a forward-scanning O(n) algorithm that tracks two variables during iteration:
2573    /// - `has_list_breaking_content_since_last_item`: Set when encountering content that
2574    ///   terminates a list (headings, horizontal rules, tables, insufficiently indented content)
2575    /// - `min_continuation_for_tracking`: Minimum indentation required for content to be
2576    ///   treated as list continuation (based on the list marker width)
2577    ///
2578    /// When a new list item is encountered, we check if list-breaking content was seen
2579    /// since the last item. If so, we start a new list block.
2580    fn parse_list_blocks(content: &str, lines: &[LineInfo]) -> Vec<ListBlock> {
2581        // Minimum indentation for unordered list continuation per CommonMark spec
2582        const UNORDERED_LIST_MIN_CONTINUATION_INDENT: usize = 2;
2583
2584        /// Initialize or reset the forward-scanning tracking state.
2585        /// This helper eliminates code duplication across three initialization sites.
2586        #[inline]
2587        fn reset_tracking_state(
2588            list_item: &ListItemInfo,
2589            has_list_breaking_content: &mut bool,
2590            min_continuation: &mut usize,
2591        ) {
2592            *has_list_breaking_content = false;
2593            let marker_width = if list_item.is_ordered {
2594                list_item.marker.len() + 1 // Ordered markers need space after period/paren
2595            } else {
2596                list_item.marker.len()
2597            };
2598            *min_continuation = if list_item.is_ordered {
2599                marker_width
2600            } else {
2601                UNORDERED_LIST_MIN_CONTINUATION_INDENT
2602            };
2603        }
2604
2605        // Pre-size based on lines that could be list items
2606        let mut list_blocks = Vec::with_capacity(lines.len() / 10); // Estimate ~10% of lines might start list blocks
2607        let mut current_block: Option<ListBlock> = None;
2608        let mut last_list_item_line = 0;
2609        let mut current_indent_level = 0;
2610        let mut last_marker_width = 0;
2611
2612        // Track list-breaking content since last item (fixes O(n²) bottleneck from issue #148)
2613        let mut has_list_breaking_content_since_last_item = false;
2614        let mut min_continuation_for_tracking = 0;
2615
2616        for (line_idx, line_info) in lines.iter().enumerate() {
2617            let line_num = line_idx + 1;
2618
2619            // Enhanced code block handling using Design #3's context analysis
2620            if line_info.in_code_block {
2621                if let Some(ref mut block) = current_block {
2622                    // Calculate minimum indentation for list continuation
2623                    let min_continuation_indent =
2624                        CodeBlockUtils::calculate_min_continuation_indent(content, lines, line_idx);
2625
2626                    // Analyze code block context using the three-tier classification
2627                    let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
2628
2629                    match context {
2630                        CodeBlockContext::Indented => {
2631                            // Code block is properly indented - continues the list
2632                            block.end_line = line_num;
2633                            continue;
2634                        }
2635                        CodeBlockContext::Standalone => {
2636                            // Code block separates lists - end current block
2637                            let completed_block = current_block.take().unwrap();
2638                            list_blocks.push(completed_block);
2639                            continue;
2640                        }
2641                        CodeBlockContext::Adjacent => {
2642                            // Edge case - use conservative behavior (continue list)
2643                            block.end_line = line_num;
2644                            continue;
2645                        }
2646                    }
2647                } else {
2648                    // No current list block - skip code block lines
2649                    continue;
2650                }
2651            }
2652
2653            // Extract blockquote prefix if any
2654            let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(line_info.content(content)) {
2655                caps.get(0).unwrap().as_str().to_string()
2656            } else {
2657                String::new()
2658            };
2659
2660            // Track list-breaking content for non-list, non-blank lines (O(n) replacement for nested loop)
2661            // Skip lines that are continuations of multi-line code spans - they're part of the previous list item
2662            if current_block.is_some()
2663                && line_info.list_item.is_none()
2664                && !line_info.is_blank
2665                && !line_info.in_code_span_continuation
2666            {
2667                let line_content = line_info.content(content).trim();
2668
2669                // Check for structural separators that break lists
2670                // Note: Lazy continuation (indent=0) is valid in CommonMark and should NOT break lists.
2671                // Only lines with indent between 1 and min_continuation_for_tracking-1 break lists,
2672                // as they indicate improper indentation rather than lazy continuation.
2673                let is_lazy_continuation = line_info.indent == 0 && !line_info.is_blank;
2674                let breaks_list = line_info.heading.is_some()
2675                    || line_content.starts_with("---")
2676                    || line_content.starts_with("***")
2677                    || line_content.starts_with("___")
2678                    || crate::utils::skip_context::is_table_line(line_content)
2679                    || line_content.starts_with(">")
2680                    || (line_info.indent > 0
2681                        && line_info.indent < min_continuation_for_tracking
2682                        && !is_lazy_continuation);
2683
2684                if breaks_list {
2685                    has_list_breaking_content_since_last_item = true;
2686                }
2687            }
2688
2689            // If this line is a code span continuation within an active list block,
2690            // extend the block's end_line to include this line (maintains list continuity)
2691            if line_info.in_code_span_continuation
2692                && line_info.list_item.is_none()
2693                && let Some(ref mut block) = current_block
2694            {
2695                block.end_line = line_num;
2696            }
2697
2698            // Extend block.end_line for regular continuation lines (non-list-item, non-blank,
2699            // properly indented lines within the list). This ensures the workaround at line 2448
2700            // works correctly when there are multiple continuation lines before a nested list item.
2701            // Also include lazy continuation lines (indent=0) per CommonMark spec.
2702            let is_valid_continuation =
2703                line_info.indent >= min_continuation_for_tracking || (line_info.indent == 0 && !line_info.is_blank); // Lazy continuation
2704            if !line_info.in_code_span_continuation
2705                && line_info.list_item.is_none()
2706                && !line_info.is_blank
2707                && !line_info.in_code_block
2708                && is_valid_continuation
2709                && let Some(ref mut block) = current_block
2710            {
2711                block.end_line = line_num;
2712            }
2713
2714            // Check if this line is a list item
2715            if let Some(list_item) = &line_info.list_item {
2716                // Calculate nesting level based on indentation
2717                let item_indent = list_item.marker_column;
2718                let nesting = item_indent / 2; // Assume 2-space indentation for nesting
2719
2720                if let Some(ref mut block) = current_block {
2721                    // Check if this continues the current block
2722                    // For nested lists, we need to check if this is a nested item (higher nesting level)
2723                    // or a continuation at the same or lower level
2724                    let is_nested = nesting > block.nesting_level;
2725                    let same_type =
2726                        (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
2727                    let same_context = block.blockquote_prefix == blockquote_prefix;
2728                    // Allow one blank line after last item, or lines immediately after block content
2729                    let reasonable_distance = line_num <= last_list_item_line + 2 || line_num == block.end_line + 1;
2730
2731                    // For unordered lists, also check marker consistency
2732                    let marker_compatible =
2733                        block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
2734
2735                    // O(1) check: Use the tracked variable instead of O(n) nested loop
2736                    // This eliminates the quadratic bottleneck from issue #148
2737                    let has_non_list_content = has_list_breaking_content_since_last_item;
2738
2739                    // A list continues if:
2740                    // 1. It's a nested item (indented more than the parent), OR
2741                    // 2. It's the same type at the same level with reasonable distance
2742                    let mut continues_list = if is_nested {
2743                        // Nested items always continue the list if they're in the same context
2744                        same_context && reasonable_distance && !has_non_list_content
2745                    } else {
2746                        // Same-level items need to match type and markers
2747                        same_type && same_context && reasonable_distance && marker_compatible && !has_non_list_content
2748                    };
2749
2750                    // WORKAROUND: If items are truly consecutive (no blank lines), they MUST be in the same list
2751                    // This handles edge cases where content patterns might otherwise split lists incorrectly
2752                    if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
2753                        // Check if the previous line was a list item or a continuation of a list item
2754                        // (including lazy continuation lines)
2755                        if block.item_lines.contains(&(line_num - 1)) {
2756                            // They're consecutive list items - force them to be in the same list
2757                            continues_list = true;
2758                        } else {
2759                            // Previous line is a continuation line within this block
2760                            // (e.g., lazy continuation with indent=0)
2761                            // Since block.end_line == line_num - 1, we know line_num - 1 is part of this block
2762                            continues_list = true;
2763                        }
2764                    }
2765
2766                    if continues_list {
2767                        // Extend current block
2768                        block.end_line = line_num;
2769                        block.item_lines.push(line_num);
2770
2771                        // Update max marker width
2772                        block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
2773                            list_item.marker.len() + 1
2774                        } else {
2775                            list_item.marker.len()
2776                        });
2777
2778                        // Update marker consistency for unordered lists
2779                        if !block.is_ordered
2780                            && block.marker.is_some()
2781                            && block.marker.as_ref() != Some(&list_item.marker)
2782                        {
2783                            // Mixed markers, clear the marker field
2784                            block.marker = None;
2785                        }
2786
2787                        // Reset tracked state for issue #148 optimization
2788                        reset_tracking_state(
2789                            list_item,
2790                            &mut has_list_breaking_content_since_last_item,
2791                            &mut min_continuation_for_tracking,
2792                        );
2793                    } else {
2794                        // End current block and start a new one
2795
2796                        list_blocks.push(block.clone());
2797
2798                        *block = ListBlock {
2799                            start_line: line_num,
2800                            end_line: line_num,
2801                            is_ordered: list_item.is_ordered,
2802                            marker: if list_item.is_ordered {
2803                                None
2804                            } else {
2805                                Some(list_item.marker.clone())
2806                            },
2807                            blockquote_prefix: blockquote_prefix.clone(),
2808                            item_lines: vec![line_num],
2809                            nesting_level: nesting,
2810                            max_marker_width: if list_item.is_ordered {
2811                                list_item.marker.len() + 1
2812                            } else {
2813                                list_item.marker.len()
2814                            },
2815                        };
2816
2817                        // Initialize tracked state for new block (issue #148 optimization)
2818                        reset_tracking_state(
2819                            list_item,
2820                            &mut has_list_breaking_content_since_last_item,
2821                            &mut min_continuation_for_tracking,
2822                        );
2823                    }
2824                } else {
2825                    // Start a new block
2826                    current_block = Some(ListBlock {
2827                        start_line: line_num,
2828                        end_line: line_num,
2829                        is_ordered: list_item.is_ordered,
2830                        marker: if list_item.is_ordered {
2831                            None
2832                        } else {
2833                            Some(list_item.marker.clone())
2834                        },
2835                        blockquote_prefix,
2836                        item_lines: vec![line_num],
2837                        nesting_level: nesting,
2838                        max_marker_width: list_item.marker.len(),
2839                    });
2840
2841                    // Initialize tracked state for new block (issue #148 optimization)
2842                    reset_tracking_state(
2843                        list_item,
2844                        &mut has_list_breaking_content_since_last_item,
2845                        &mut min_continuation_for_tracking,
2846                    );
2847                }
2848
2849                last_list_item_line = line_num;
2850                current_indent_level = item_indent;
2851                last_marker_width = if list_item.is_ordered {
2852                    list_item.marker.len() + 1 // Add 1 for the space after ordered list markers
2853                } else {
2854                    list_item.marker.len()
2855                };
2856            } else if let Some(ref mut block) = current_block {
2857                // Not a list item - check if it continues the current block
2858
2859                // For MD032 compatibility, we use a simple approach:
2860                // - Indented lines continue the list
2861                // - Blank lines followed by indented content continue the list
2862                // - Everything else ends the list
2863
2864                // Check if the last line in the list block ended with a backslash (hard line break)
2865                // This handles cases where list items use backslash for hard line breaks
2866                let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
2867                    lines[block.end_line - 1].content(content).trim_end().ends_with('\\')
2868                } else {
2869                    false
2870                };
2871
2872                // Calculate minimum indentation for list continuation
2873                // For ordered lists, use the last marker width (e.g., 3 for "1. ", 4 for "10. ")
2874                // For unordered lists like "- ", content starts at column 2, so continuations need at least 2 spaces
2875                let min_continuation_indent = if block.is_ordered {
2876                    current_indent_level + last_marker_width
2877                } else {
2878                    current_indent_level + 2 // Unordered lists need at least 2 spaces (e.g., "- " = 2 chars)
2879                };
2880
2881                if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
2882                    // Indented line or backslash continuation continues the list
2883                    block.end_line = line_num;
2884                } else if line_info.is_blank {
2885                    // Blank line - check if it's internal to the list or ending it
2886                    // We only include blank lines that are followed by more list content
2887                    let mut check_idx = line_idx + 1;
2888                    let mut found_continuation = false;
2889
2890                    // Skip additional blank lines
2891                    while check_idx < lines.len() && lines[check_idx].is_blank {
2892                        check_idx += 1;
2893                    }
2894
2895                    if check_idx < lines.len() {
2896                        let next_line = &lines[check_idx];
2897                        // Check if followed by indented content (list continuation)
2898                        if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
2899                            found_continuation = true;
2900                        }
2901                        // Check if followed by another list item at the same level
2902                        else if !next_line.in_code_block
2903                            && next_line.list_item.is_some()
2904                            && let Some(item) = &next_line.list_item
2905                        {
2906                            let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
2907                                .find(next_line.content(content))
2908                                .map_or(String::new(), |m| m.as_str().to_string());
2909                            if item.marker_column == current_indent_level
2910                                && item.is_ordered == block.is_ordered
2911                                && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
2912                            {
2913                                // Check if there was meaningful content between the list items (unused now)
2914                                // This variable is kept for potential future use but is currently replaced by has_structural_separators
2915                                let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
2916                                    if let Some(between_line) = lines.get(idx) {
2917                                        let between_content = between_line.content(content);
2918                                        let trimmed = between_content.trim();
2919                                        // Skip empty lines
2920                                        if trimmed.is_empty() {
2921                                            return false;
2922                                        }
2923                                        // Check for meaningful content
2924                                        let line_indent = between_content.len() - between_content.trim_start().len();
2925
2926                                        // Structural separators (code fences, headings, etc.) are meaningful and should BREAK lists
2927                                        if trimmed.starts_with("```")
2928                                            || trimmed.starts_with("~~~")
2929                                            || trimmed.starts_with("---")
2930                                            || trimmed.starts_with("***")
2931                                            || trimmed.starts_with("___")
2932                                            || trimmed.starts_with(">")
2933                                            || crate::utils::skip_context::is_table_line(trimmed)
2934                                            || between_line.heading.is_some()
2935                                        {
2936                                            return true; // These are structural separators - meaningful content that breaks lists
2937                                        }
2938
2939                                        // Only properly indented content continues the list
2940                                        line_indent >= min_continuation_indent
2941                                    } else {
2942                                        false
2943                                    }
2944                                });
2945
2946                                if block.is_ordered {
2947                                    // For ordered lists: don't continue if there are structural separators
2948                                    // Check if there are structural separators between the list items
2949                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2950                                        if let Some(between_line) = lines.get(idx) {
2951                                            let trimmed = between_line.content(content).trim();
2952                                            if trimmed.is_empty() {
2953                                                return false;
2954                                            }
2955                                            // Check for structural separators that break lists
2956                                            trimmed.starts_with("```")
2957                                                || trimmed.starts_with("~~~")
2958                                                || trimmed.starts_with("---")
2959                                                || trimmed.starts_with("***")
2960                                                || trimmed.starts_with("___")
2961                                                || trimmed.starts_with(">")
2962                                                || crate::utils::skip_context::is_table_line(trimmed)
2963                                                || between_line.heading.is_some()
2964                                        } else {
2965                                            false
2966                                        }
2967                                    });
2968                                    found_continuation = !has_structural_separators;
2969                                } else {
2970                                    // For unordered lists: also check for structural separators
2971                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2972                                        if let Some(between_line) = lines.get(idx) {
2973                                            let trimmed = between_line.content(content).trim();
2974                                            if trimmed.is_empty() {
2975                                                return false;
2976                                            }
2977                                            // Check for structural separators that break lists
2978                                            trimmed.starts_with("```")
2979                                                || trimmed.starts_with("~~~")
2980                                                || trimmed.starts_with("---")
2981                                                || trimmed.starts_with("***")
2982                                                || trimmed.starts_with("___")
2983                                                || trimmed.starts_with(">")
2984                                                || crate::utils::skip_context::is_table_line(trimmed)
2985                                                || between_line.heading.is_some()
2986                                        } else {
2987                                            false
2988                                        }
2989                                    });
2990                                    found_continuation = !has_structural_separators;
2991                                }
2992                            }
2993                        }
2994                    }
2995
2996                    if found_continuation {
2997                        // Include the blank line in the block
2998                        block.end_line = line_num;
2999                    } else {
3000                        // Blank line ends the list - don't include it
3001                        list_blocks.push(block.clone());
3002                        current_block = None;
3003                    }
3004                } else {
3005                    // Check for lazy continuation - non-indented line immediately after a list item
3006                    // But only if the line has sufficient indentation for the list type
3007                    let min_required_indent = if block.is_ordered {
3008                        current_indent_level + last_marker_width
3009                    } else {
3010                        current_indent_level + 2
3011                    };
3012
3013                    // For lazy continuation to apply, the line must either:
3014                    // 1. Have no indentation (true lazy continuation)
3015                    // 2. Have sufficient indentation for the list type
3016                    // BUT structural separators (headings, code blocks, etc.) should never be lazy continuations
3017                    let line_content = line_info.content(content).trim();
3018
3019                    // Check for table-like patterns
3020                    let looks_like_table = crate::utils::skip_context::is_table_line(line_content);
3021
3022                    let is_structural_separator = line_info.heading.is_some()
3023                        || line_content.starts_with("```")
3024                        || line_content.starts_with("~~~")
3025                        || line_content.starts_with("---")
3026                        || line_content.starts_with("***")
3027                        || line_content.starts_with("___")
3028                        || line_content.starts_with(">")
3029                        || looks_like_table;
3030
3031                    // Allow lazy continuation if we're still within the same list block
3032                    // (not just immediately after a list item)
3033                    let is_lazy_continuation = !is_structural_separator
3034                        && !line_info.is_blank
3035                        && (line_info.indent == 0 || line_info.indent >= min_required_indent);
3036
3037                    if is_lazy_continuation {
3038                        // Additional check: if the line starts with uppercase and looks like a new sentence,
3039                        // it's probably not a continuation
3040                        let content_to_check = if !blockquote_prefix.is_empty() {
3041                            // Strip blockquote prefix to check the actual content
3042                            line_info
3043                                .content(content)
3044                                .strip_prefix(&blockquote_prefix)
3045                                .unwrap_or(line_info.content(content))
3046                                .trim()
3047                        } else {
3048                            line_info.content(content).trim()
3049                        };
3050
3051                        let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
3052
3053                        // If it starts with uppercase and the previous line ended with punctuation,
3054                        // it's likely a new paragraph, not a continuation
3055                        if starts_with_uppercase && last_list_item_line > 0 {
3056                            // This looks like a new paragraph
3057                            list_blocks.push(block.clone());
3058                            current_block = None;
3059                        } else {
3060                            // This is a lazy continuation line
3061                            block.end_line = line_num;
3062                        }
3063                    } else {
3064                        // Non-indented, non-blank line that's not a lazy continuation - end the block
3065                        list_blocks.push(block.clone());
3066                        current_block = None;
3067                    }
3068                }
3069            }
3070        }
3071
3072        // Don't forget the last block
3073        if let Some(block) = current_block {
3074            list_blocks.push(block);
3075        }
3076
3077        // Merge adjacent blocks that should be one
3078        merge_adjacent_list_blocks(content, &mut list_blocks, lines);
3079
3080        list_blocks
3081    }
3082
3083    /// Compute character frequency for fast content analysis
3084    fn compute_char_frequency(content: &str) -> CharFrequency {
3085        let mut frequency = CharFrequency::default();
3086
3087        for ch in content.chars() {
3088            match ch {
3089                '#' => frequency.hash_count += 1,
3090                '*' => frequency.asterisk_count += 1,
3091                '_' => frequency.underscore_count += 1,
3092                '-' => frequency.hyphen_count += 1,
3093                '+' => frequency.plus_count += 1,
3094                '>' => frequency.gt_count += 1,
3095                '|' => frequency.pipe_count += 1,
3096                '[' => frequency.bracket_count += 1,
3097                '`' => frequency.backtick_count += 1,
3098                '<' => frequency.lt_count += 1,
3099                '!' => frequency.exclamation_count += 1,
3100                '\n' => frequency.newline_count += 1,
3101                _ => {}
3102            }
3103        }
3104
3105        frequency
3106    }
3107
3108    /// Parse HTML tags in the content
3109    fn parse_html_tags(
3110        content: &str,
3111        lines: &[LineInfo],
3112        code_blocks: &[(usize, usize)],
3113        flavor: MarkdownFlavor,
3114    ) -> Vec<HtmlTag> {
3115        static HTML_TAG_REGEX: LazyLock<regex::Regex> =
3116            LazyLock::new(|| regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9-]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap());
3117
3118        let mut html_tags = Vec::with_capacity(content.matches('<').count());
3119
3120        for cap in HTML_TAG_REGEX.captures_iter(content) {
3121            let full_match = cap.get(0).unwrap();
3122            let match_start = full_match.start();
3123            let match_end = full_match.end();
3124
3125            // Skip if in code block
3126            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3127                continue;
3128            }
3129
3130            let is_closing = !cap.get(1).unwrap().as_str().is_empty();
3131            let tag_name_original = cap.get(2).unwrap().as_str();
3132            let tag_name = tag_name_original.to_lowercase();
3133            let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
3134
3135            // Skip JSX components in MDX files (tags starting with uppercase letter)
3136            // JSX components like <Chart />, <MyComponent> should not be treated as HTML
3137            if flavor.supports_jsx() && tag_name_original.chars().next().is_some_and(|c| c.is_uppercase()) {
3138                continue;
3139            }
3140
3141            // Find which line this tag is on
3142            let mut line_num = 1;
3143            let mut col_start = match_start;
3144            let mut col_end = match_end;
3145            for (idx, line_info) in lines.iter().enumerate() {
3146                if match_start >= line_info.byte_offset {
3147                    line_num = idx + 1;
3148                    col_start = match_start - line_info.byte_offset;
3149                    col_end = match_end - line_info.byte_offset;
3150                } else {
3151                    break;
3152                }
3153            }
3154
3155            html_tags.push(HtmlTag {
3156                line: line_num,
3157                start_col: col_start,
3158                end_col: col_end,
3159                byte_offset: match_start,
3160                byte_end: match_end,
3161                tag_name,
3162                is_closing,
3163                is_self_closing,
3164                raw_content: full_match.as_str().to_string(),
3165            });
3166        }
3167
3168        html_tags
3169    }
3170
3171    /// Parse emphasis spans in the content
3172    fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
3173        static EMPHASIS_REGEX: LazyLock<regex::Regex> =
3174            LazyLock::new(|| regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap());
3175
3176        let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
3177
3178        for cap in EMPHASIS_REGEX.captures_iter(content) {
3179            let full_match = cap.get(0).unwrap();
3180            let match_start = full_match.start();
3181            let match_end = full_match.end();
3182
3183            // Skip if in code block
3184            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3185                continue;
3186            }
3187
3188            let opening_markers = cap.get(1).unwrap().as_str();
3189            let content_part = cap.get(2).unwrap().as_str();
3190            let closing_markers = cap.get(3).unwrap().as_str();
3191
3192            // Validate matching markers
3193            if opening_markers.chars().next() != closing_markers.chars().next()
3194                || opening_markers.len() != closing_markers.len()
3195            {
3196                continue;
3197            }
3198
3199            let marker = opening_markers.chars().next().unwrap();
3200            let marker_count = opening_markers.len();
3201
3202            // Find which line this emphasis is on
3203            let mut line_num = 1;
3204            let mut col_start = match_start;
3205            let mut col_end = match_end;
3206            for (idx, line_info) in lines.iter().enumerate() {
3207                if match_start >= line_info.byte_offset {
3208                    line_num = idx + 1;
3209                    col_start = match_start - line_info.byte_offset;
3210                    col_end = match_end - line_info.byte_offset;
3211                } else {
3212                    break;
3213                }
3214            }
3215
3216            emphasis_spans.push(EmphasisSpan {
3217                line: line_num,
3218                start_col: col_start,
3219                end_col: col_end,
3220                byte_offset: match_start,
3221                byte_end: match_end,
3222                marker,
3223                marker_count,
3224                content: content_part.to_string(),
3225            });
3226        }
3227
3228        emphasis_spans
3229    }
3230
3231    /// Parse table rows in the content
3232    fn parse_table_rows(content: &str, lines: &[LineInfo]) -> Vec<TableRow> {
3233        let mut table_rows = Vec::with_capacity(lines.len() / 20);
3234
3235        for (line_idx, line_info) in lines.iter().enumerate() {
3236            // Skip lines in code blocks or blank lines
3237            if line_info.in_code_block || line_info.is_blank {
3238                continue;
3239            }
3240
3241            let line = line_info.content(content);
3242            let line_num = line_idx + 1;
3243
3244            // Check if this line contains pipes (potential table row)
3245            if !line.contains('|') {
3246                continue;
3247            }
3248
3249            // Count columns by splitting on pipes
3250            let parts: Vec<&str> = line.split('|').collect();
3251            let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
3252
3253            // Check if this is a separator row
3254            let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
3255            let mut column_alignments = Vec::new();
3256
3257            if is_separator {
3258                for part in &parts[1..parts.len() - 1] {
3259                    // Skip first and last empty parts
3260                    let trimmed = part.trim();
3261                    let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
3262                        "center".to_string()
3263                    } else if trimmed.ends_with(':') {
3264                        "right".to_string()
3265                    } else if trimmed.starts_with(':') {
3266                        "left".to_string()
3267                    } else {
3268                        "none".to_string()
3269                    };
3270                    column_alignments.push(alignment);
3271                }
3272            }
3273
3274            table_rows.push(TableRow {
3275                line: line_num,
3276                is_separator,
3277                column_count,
3278                column_alignments,
3279            });
3280        }
3281
3282        table_rows
3283    }
3284
3285    /// Parse bare URLs and emails in the content
3286    fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
3287        let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
3288
3289        // Check for bare URLs (not in angle brackets or markdown links)
3290        for cap in BARE_URL_PATTERN.captures_iter(content) {
3291            let full_match = cap.get(0).unwrap();
3292            let match_start = full_match.start();
3293            let match_end = full_match.end();
3294
3295            // Skip if in code block
3296            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3297                continue;
3298            }
3299
3300            // Skip if already in angle brackets or markdown links
3301            let preceding_char = if match_start > 0 {
3302                content.chars().nth(match_start - 1)
3303            } else {
3304                None
3305            };
3306            let following_char = content.chars().nth(match_end);
3307
3308            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
3309                continue;
3310            }
3311            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
3312                continue;
3313            }
3314
3315            let url = full_match.as_str();
3316            let url_type = if url.starts_with("https://") {
3317                "https"
3318            } else if url.starts_with("http://") {
3319                "http"
3320            } else if url.starts_with("ftp://") {
3321                "ftp"
3322            } else {
3323                "other"
3324            };
3325
3326            // Find which line this URL is on
3327            let mut line_num = 1;
3328            let mut col_start = match_start;
3329            let mut col_end = match_end;
3330            for (idx, line_info) in lines.iter().enumerate() {
3331                if match_start >= line_info.byte_offset {
3332                    line_num = idx + 1;
3333                    col_start = match_start - line_info.byte_offset;
3334                    col_end = match_end - line_info.byte_offset;
3335                } else {
3336                    break;
3337                }
3338            }
3339
3340            bare_urls.push(BareUrl {
3341                line: line_num,
3342                start_col: col_start,
3343                end_col: col_end,
3344                byte_offset: match_start,
3345                byte_end: match_end,
3346                url: url.to_string(),
3347                url_type: url_type.to_string(),
3348            });
3349        }
3350
3351        // Check for bare email addresses
3352        for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
3353            let full_match = cap.get(0).unwrap();
3354            let match_start = full_match.start();
3355            let match_end = full_match.end();
3356
3357            // Skip if in code block
3358            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3359                continue;
3360            }
3361
3362            // Skip if already in angle brackets or markdown links
3363            let preceding_char = if match_start > 0 {
3364                content.chars().nth(match_start - 1)
3365            } else {
3366                None
3367            };
3368            let following_char = content.chars().nth(match_end);
3369
3370            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
3371                continue;
3372            }
3373            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
3374                continue;
3375            }
3376
3377            let email = full_match.as_str();
3378
3379            // Find which line this email is on
3380            let mut line_num = 1;
3381            let mut col_start = match_start;
3382            let mut col_end = match_end;
3383            for (idx, line_info) in lines.iter().enumerate() {
3384                if match_start >= line_info.byte_offset {
3385                    line_num = idx + 1;
3386                    col_start = match_start - line_info.byte_offset;
3387                    col_end = match_end - line_info.byte_offset;
3388                } else {
3389                    break;
3390                }
3391            }
3392
3393            bare_urls.push(BareUrl {
3394                line: line_num,
3395                start_col: col_start,
3396                end_col: col_end,
3397                byte_offset: match_start,
3398                byte_end: match_end,
3399                url: email.to_string(),
3400                url_type: "email".to_string(),
3401            });
3402        }
3403
3404        bare_urls
3405    }
3406
3407    /// Get an iterator over valid CommonMark headings
3408    ///
3409    /// This iterator filters out malformed headings like `#NoSpace` (hashtag-like patterns)
3410    /// that should be flagged by MD018 but should not be processed by other heading rules.
3411    ///
3412    /// # Examples
3413    ///
3414    /// ```rust
3415    /// use rumdl_lib::lint_context::LintContext;
3416    /// use rumdl_lib::config::MarkdownFlavor;
3417    ///
3418    /// let content = "# Valid Heading\n#NoSpace\n## Another Valid";
3419    /// let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
3420    ///
3421    /// for heading in ctx.valid_headings() {
3422    ///     println!("Line {}: {} (level {})", heading.line_num, heading.heading.text, heading.heading.level);
3423    /// }
3424    /// // Only prints valid headings, skips `#NoSpace`
3425    /// ```
3426    #[must_use]
3427    pub fn valid_headings(&self) -> ValidHeadingsIter<'_> {
3428        ValidHeadingsIter::new(&self.lines)
3429    }
3430
3431    /// Check if the document contains any valid CommonMark headings
3432    ///
3433    /// Returns `true` if there is at least one heading with proper space after `#`.
3434    #[must_use]
3435    pub fn has_valid_headings(&self) -> bool {
3436        self.lines
3437            .iter()
3438            .any(|line| line.heading.as_ref().is_some_and(|h| h.is_valid))
3439    }
3440}
3441
3442/// Merge adjacent list blocks that should be treated as one
3443fn merge_adjacent_list_blocks(content: &str, list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
3444    if list_blocks.len() < 2 {
3445        return;
3446    }
3447
3448    let mut merger = ListBlockMerger::new(content, lines);
3449    *list_blocks = merger.merge(list_blocks);
3450}
3451
/// Helper struct to manage the complex logic of merging list blocks.
///
/// It only borrows its inputs; both references are handed on to the checks
/// that inspect what lies between two blocks.
struct ListBlockMerger<'a> {
    // Source text, forwarded to `has_meaningful_content_between` by the merge checks.
    content: &'a str,
    // Per-line information, forwarded alongside `content`.
    lines: &'a [LineInfo],
}
3457
3458impl<'a> ListBlockMerger<'a> {
3459    fn new(content: &'a str, lines: &'a [LineInfo]) -> Self {
3460        Self { content, lines }
3461    }
3462
3463    fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
3464        let mut merged = Vec::with_capacity(list_blocks.len());
3465        let mut current = list_blocks[0].clone();
3466
3467        for next in list_blocks.iter().skip(1) {
3468            if self.should_merge_blocks(&current, next) {
3469                current = self.merge_two_blocks(current, next);
3470            } else {
3471                merged.push(current);
3472                current = next.clone();
3473            }
3474        }
3475
3476        merged.push(current);
3477        merged
3478    }
3479
3480    /// Determine if two adjacent list blocks should be merged
3481    fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
3482        // Basic compatibility checks
3483        if !self.blocks_are_compatible(current, next) {
3484            return false;
3485        }
3486
3487        // Check spacing and content between blocks
3488        let spacing = self.analyze_spacing_between(current, next);
3489        match spacing {
3490            BlockSpacing::Consecutive => true,
3491            BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
3492            BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
3493                self.can_merge_with_content_between(current, next)
3494            }
3495        }
3496    }
3497
3498    /// Check if blocks have compatible structure for merging
3499    fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
3500        current.is_ordered == next.is_ordered
3501            && current.blockquote_prefix == next.blockquote_prefix
3502            && current.nesting_level == next.nesting_level
3503    }
3504
3505    /// Analyze the spacing between two list blocks
3506    fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
3507        let gap = next.start_line - current.end_line;
3508
3509        match gap {
3510            1 => BlockSpacing::Consecutive,
3511            2 => BlockSpacing::SingleBlank,
3512            _ if gap > 2 => {
3513                if self.has_only_blank_lines_between(current, next) {
3514                    BlockSpacing::MultipleBlanks
3515                } else {
3516                    BlockSpacing::ContentBetween
3517                }
3518            }
3519            _ => BlockSpacing::Consecutive, // gap == 0, overlapping (shouldn't happen)
3520        }
3521    }
3522
3523    /// Check if unordered lists can be merged with a single blank line between
3524    fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3525        // Check if there are structural separators between the blocks
3526        // If has_meaningful_content_between returns true, it means there are structural separators
3527        if has_meaningful_content_between(self.content, current, next, self.lines) {
3528            return false; // Structural separators prevent merging
3529        }
3530
3531        // Only merge unordered lists with same marker across single blank
3532        !current.is_ordered && current.marker == next.marker
3533    }
3534
3535    /// Check if ordered lists can be merged when there's content between them
3536    fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3537        // Do not merge lists if there are structural separators between them
3538        if has_meaningful_content_between(self.content, current, next, self.lines) {
3539            return false; // Structural separators prevent merging
3540        }
3541
3542        // Only consider merging ordered lists if there's no structural content between
3543        current.is_ordered && next.is_ordered
3544    }
3545
3546    /// Check if there are only blank lines between blocks
3547    fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3548        for line_num in (current.end_line + 1)..next.start_line {
3549            if let Some(line_info) = self.lines.get(line_num - 1)
3550                && !line_info.content(self.content).trim().is_empty()
3551            {
3552                return false;
3553            }
3554        }
3555        true
3556    }
3557
3558    /// Merge two compatible list blocks into one
3559    fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
3560        current.end_line = next.end_line;
3561        current.item_lines.extend_from_slice(&next.item_lines);
3562
3563        // Update max marker width
3564        current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
3565
3566        // Handle marker consistency for unordered lists
3567        if !current.is_ordered && self.markers_differ(&current, next) {
3568            current.marker = None; // Mixed markers
3569        }
3570
3571        current
3572    }
3573
3574    /// Check if two blocks have different markers
3575    fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
3576        current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
3577    }
3578}
3579
/// Classification of what separates two list blocks.
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    /// No gap between blocks.
    Consecutive,
    /// One blank line between blocks.
    SingleBlank,
    /// Multiple blank lines but no content.
    MultipleBlanks,
    /// Content exists between blocks.
    ContentBetween,
}
3588
3589/// Check if there's meaningful content (not just blank lines) between two list blocks
3590fn has_meaningful_content_between(content: &str, current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
3591    // Check lines between current.end_line and next.start_line
3592    for line_num in (current.end_line + 1)..next.start_line {
3593        if let Some(line_info) = lines.get(line_num - 1) {
3594            // Convert to 0-indexed
3595            let trimmed = line_info.content(content).trim();
3596
3597            // Skip empty lines
3598            if trimmed.is_empty() {
3599                continue;
3600            }
3601
3602            // Check for structural separators that should separate lists (CommonMark compliant)
3603
3604            // Headings separate lists
3605            if line_info.heading.is_some() {
3606                return true; // Has meaningful content - headings separate lists
3607            }
3608
3609            // Horizontal rules separate lists (---, ***, ___)
3610            if is_horizontal_rule(trimmed) {
3611                return true; // Has meaningful content - horizontal rules separate lists
3612            }
3613
3614            // Tables separate lists
3615            if crate::utils::skip_context::is_table_line(trimmed) {
3616                return true; // Has meaningful content - tables separate lists
3617            }
3618
3619            // Blockquotes separate lists
3620            if trimmed.starts_with('>') {
3621                return true; // Has meaningful content - blockquotes separate lists
3622            }
3623
3624            // Code block fences separate lists (unless properly indented as list content)
3625            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
3626                let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
3627
3628                // Check if this code block is properly indented as list continuation
3629                let min_continuation_indent = if current.is_ordered {
3630                    current.nesting_level + current.max_marker_width + 1 // +1 for space after marker
3631                } else {
3632                    current.nesting_level + 2
3633                };
3634
3635                if line_indent < min_continuation_indent {
3636                    // This is a standalone code block that separates lists
3637                    return true; // Has meaningful content - standalone code blocks separate lists
3638                }
3639            }
3640
3641            // Check if this line has proper indentation for list continuation
3642            let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
3643
3644            // Calculate minimum indentation needed to be list continuation
3645            let min_indent = if current.is_ordered {
3646                current.nesting_level + current.max_marker_width
3647            } else {
3648                current.nesting_level + 2
3649            };
3650
3651            // If the line is not indented enough to be list continuation, it's meaningful content
3652            if line_indent < min_indent {
3653                return true; // Has meaningful content - content not indented as list continuation
3654            }
3655
3656            // If we reach here, the line is properly indented as list continuation
3657            // Continue checking other lines
3658        }
3659    }
3660
3661    // Only blank lines or properly indented list continuation content between blocks
3662    false
3663}
3664
/// Check if a line is a horizontal rule (---, ***, ___).
///
/// `trimmed` is expected to have surrounding whitespace already removed. The line
/// qualifies when its first character is `-`, `*`, or `_`, every other character
/// is either that same marker or a space/tab, and the marker occurs at least
/// three times.
fn is_horizontal_rule(trimmed: &str) -> bool {
    // Fewer than three bytes can never hold three markers.
    if trimmed.len() < 3 {
        return false;
    }

    // The first character selects which marker the whole line must use.
    let marker = match trimmed.chars().next() {
        Some(c @ ('-' | '*' | '_')) => c,
        _ => return false,
    };

    // Iterate the characters directly; no intermediate `Vec<char>` is needed.
    let mut marker_count = 0usize;
    for ch in trimmed.chars() {
        if ch == marker {
            marker_count += 1;
        } else if ch != ' ' && ch != '\t' {
            // Any other character (including a different marker) disqualifies the line.
            return false;
        }
    }

    marker_count >= 3
}
3688
/// Unit tests for `LintContext`: line offsets, offset-to-position mapping,
/// per-line metadata, list item detection, and MDX ESM block detection.
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input has a single line offset, maps offset 0 to (1, 1), and has no lines.
    #[test]
    fn test_empty_content() {
        let context = LintContext::new("", MarkdownFlavor::Standard, None);

        assert_eq!(context.content, "");
        assert_eq!(context.line_offsets, vec![0]);
        assert_eq!(context.lines.len(), 0);
        assert_eq!(context.offset_to_line_col(0), (1, 1));
    }

    /// A single line without a trailing newline keeps one line offset.
    #[test]
    fn test_single_line() {
        let source = "# Hello";
        let context = LintContext::new(source, MarkdownFlavor::Standard, None);

        assert_eq!(context.content, "# Hello");
        assert_eq!(context.line_offsets, vec![0]);

        let positions = [(0, (1, 1)), (3, (1, 4))];
        for (offset, expected) in positions {
            assert_eq!(context.offset_to_line_col(offset), expected);
        }
    }

    /// Line offsets and offset-to-position mapping across several lines.
    #[test]
    fn test_multi_line() {
        let source = "# Title\n\nSecond line\nThird line";
        let context = LintContext::new(source, MarkdownFlavor::Standard, None);

        assert_eq!(context.line_offsets, vec![0, 8, 9, 21]);

        // (byte offset, expected (line, column))
        let positions = [
            (0, (1, 1)),  // start
            (8, (2, 1)),  // start of blank line
            (9, (3, 1)),  // start of 'Second line'
            (15, (3, 7)), // middle of 'Second line'
            (21, (4, 1)), // start of 'Third line'
        ];
        for (offset, expected) in positions {
            assert_eq!(context.offset_to_line_col(offset), expected);
        }
    }

    /// Per-line metadata (content, byte offset, indent, blank flag) and lookup helpers.
    #[test]
    fn test_line_info() {
        let source = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let context = LintContext::new(source, MarkdownFlavor::Standard, None);

        assert_eq!(context.lines.len(), 7);

        // Line 1: "# Title"
        let heading_line = &context.lines[0];
        assert_eq!(heading_line.content(context.content), "# Title");
        assert_eq!(heading_line.byte_offset, 0);
        assert_eq!(heading_line.indent, 0);
        assert!(!heading_line.is_blank);
        assert!(!heading_line.in_code_block);
        assert!(heading_line.list_item.is_none());

        // Line 2: "    indented"
        let indented_line = &context.lines[1];
        assert_eq!(indented_line.content(context.content), "    indented");
        assert_eq!(indented_line.byte_offset, 8);
        assert_eq!(indented_line.indent, 4);
        assert!(!indented_line.is_blank);

        // Line 3: "" (blank)
        let blank_line = &context.lines[2];
        assert_eq!(blank_line.content(context.content), "");
        assert!(blank_line.is_blank);

        // Lookup helpers take 1-indexed line numbers.
        assert_eq!(context.line_to_byte_offset(1), Some(0));
        assert_eq!(context.line_to_byte_offset(2), Some(8));
        assert_eq!(context.line_info(1).map(|info| info.indent), Some(0));
        assert_eq!(context.line_info(2).map(|info| info.indent), Some(4));
    }

    /// Unordered and ordered list items are detected; plain text is not.
    #[test]
    fn test_list_item_detection() {
        let source = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
        let context = LintContext::new(source, MarkdownFlavor::Standard, None);

        // Line 1: "- Unordered item"
        let dash_line = &context.lines[0];
        assert!(dash_line.list_item.is_some());
        let dash_item = dash_line.list_item.as_ref().unwrap();
        assert_eq!(dash_item.marker, "-");
        assert!(!dash_item.is_ordered);
        assert_eq!(dash_item.marker_column, 0);
        assert_eq!(dash_item.content_column, 2);

        // Line 2: "  * Nested item"
        let star_line = &context.lines[1];
        assert!(star_line.list_item.is_some());
        let star_item = star_line.list_item.as_ref().unwrap();
        assert_eq!(star_item.marker, "*");
        assert_eq!(star_item.marker_column, 2);

        // Line 3: "1. Ordered item"
        let numbered_line = &context.lines[2];
        assert!(numbered_line.list_item.is_some());
        let numbered_item = numbered_line.list_item.as_ref().unwrap();
        assert_eq!(numbered_item.marker, "1.");
        assert!(numbered_item.is_ordered);
        assert_eq!(numbered_item.number, Some(1));

        // Line 6: "Not a list"
        let plain_line = &context.lines[5];
        assert!(plain_line.list_item.is_none());
    }

    /// Offsets at and just past each character of a three-line document.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let source = "a\nb\nc";
        let context = LintContext::new(source, MarkdownFlavor::Standard, None);

        // line_offsets: [0, 2, 4]
        let positions = [
            (0, (1, 1)), // 'a'
            (1, (1, 2)), // after 'a'
            (2, (2, 1)), // 'b'
            (3, (2, 2)), // after 'b'
            (4, (3, 1)), // 'c'
            (5, (3, 2)), // after 'c'
        ];
        for (offset, expected) in positions {
            assert_eq!(context.offset_to_line_col(offset), expected);
        }
    }

    /// In MDX flavor, leading import/export lines are flagged as ESM blocks.
    #[test]
    fn test_mdx_esm_blocks() {
        let source = r##"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall

In {year}, the snowfall was above average.
It was followed by a warm spring which caused
flood conditions in many of the nearby rivers.

<Chart color="#fcb32c" year={year} />
"##;

        let context = LintContext::new(source, MarkdownFlavor::MDX, None);
        let lines = &context.lines;

        assert_eq!(lines.len(), 10);

        // Lines 1 and 2 are the ESM statements.
        assert!(lines[0].in_esm_block, "Line 1 (import) should be in_esm_block");
        assert!(lines[1].in_esm_block, "Line 2 (export) should be in_esm_block");

        // Everything after them is regular markdown.
        assert!(!lines[2].in_esm_block, "Line 3 (blank) should NOT be in_esm_block");
        assert!(
            !lines[3].in_esm_block,
            "Line 4 (heading) should NOT be in_esm_block"
        );
        assert!(!lines[4].in_esm_block, "Line 5 (blank) should NOT be in_esm_block");
        assert!(!lines[5].in_esm_block, "Line 6 (text) should NOT be in_esm_block");
    }

    /// In Standard flavor, import/export lines are not flagged as ESM blocks.
    #[test]
    fn test_mdx_esm_blocks_not_detected_in_standard_flavor() {
        let source = r#"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall
"#;

        let context = LintContext::new(source, MarkdownFlavor::Standard, None);
        let lines = &context.lines;

        assert!(
            !lines[0].in_esm_block,
            "Line 1 should NOT be in_esm_block in Standard flavor"
        );
        assert!(
            !lines[1].in_esm_block,
            "Line 2 should NOT be in_esm_block in Standard flavor"
        );
    }
}
rumdl_lib/lint_context.rs

rumdl_lib/
lint_context.rs