rumdl_lib/
lint_context.rs

1use crate::config::MarkdownFlavor;
2use crate::rules::front_matter_utils::FrontMatterUtils;
3use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
4use pulldown_cmark::{BrokenLink, Event, LinkType, Options, Parser, Tag, TagEnd};
5use regex::Regex;
6use std::borrow::Cow;
7use std::path::PathBuf;
8use std::sync::LazyLock;
9
/// Evaluate `$code`, yield its value, and optionally report how long it took.
///
/// * `$name` - label used in the timing report.
/// * `$profile` - boolean expression; when true the elapsed time is written to
///   stderr as `[PROFILE] <name>: <duration>`.
/// * `$code` - the expression (or block) to run; its value is the macro's value.
///
/// This is the native (non-WASM) variant, which times the section with
/// `std::time::Instant`.
#[cfg(not(target_arch = "wasm32"))]
macro_rules! profile_section {
    ($name:expr, $profile:expr, $code:expr) => {{
        let start = std::time::Instant::now();
        let result = $code;
        if $profile {
            eprintln!("[PROFILE] {}: {:?}", $name, start.elapsed());
        }
        result
    }};
}

/// WASM variant of `profile_section!`: runs `$code` without any timing.
///
/// `$name` and `$profile` are still evaluated (after `$code`, mirroring the
/// native variant) and discarded, so call sites that compute a `profile` flag
/// do not produce unused-variable warnings on this target.
#[cfg(target_arch = "wasm32")]
macro_rules! profile_section {
    ($name:expr, $profile:expr, $code:expr) => {{
        let result = $code;
        let _ = (&$name, &$profile);
        result
    }};
}
27
// Comprehensive link pattern that captures both inline links `[text](url "title")`
// and reference links `[text][ref]`.
// Flags: (?s) makes `.` match newlines; (?x) is verbose mode, so the whitespace
// and `# ...` comments inside the pattern are ignored by the regex engine.
// Capture groups: 1 = link text, 2 = URL in angle brackets, 3 = bare URL,
// 4 = double-quoted title, 5 = single-quoted title, 6 = reference ID.
// NOTE(review): the pattern itself does not exclude a preceding `!`, so it also
// matches the `[alt](url)` part of image syntax; callers presumably filter that.
static LINK_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
        \[((?:[^\[\]\\]|\\.)*)\]          # Link text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
        (?:
            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
            |
            \[([^\]]*)\]      # Reference ID in group 6
        )"#
    ).unwrap()
});
41
// Image pattern: same shape as the link pattern but requires a leading `!`.
// Flags: (?s) makes `.` match newlines; (?x) is verbose mode, so the whitespace
// and `# ...` comments inside the pattern are ignored by the regex engine.
// Capture groups: 1 = alt text, 2 = URL in angle brackets, 3 = bare URL,
// 4 = double-quoted title, 5 = single-quoted title, 6 = reference ID.
static IMAGE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
        !\[((?:[^\[\]\\]|\\.)*)\]         # Alt text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
        (?:
            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
            |
            \[([^\]]*)\]      # Reference ID in group 6
        )"#
    ).unwrap()
});
55
// Reference definition pattern: `[id]: url "title"` with up to three leading spaces.
// (?m) makes `^`/`$` match at line boundaries.
// Capture groups: 1 = reference ID, 2 = URL (a run of non-whitespace),
// 3 = double-quoted title, 4 = single-quoted title.
static REF_DEF_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#).unwrap());
59
// Pattern for bare URLs with an http, https, or ftp scheme (scheme is capture group 1).
// The URL body stops at whitespace and at any of: < > [ ] ( ) \ ' " `
// Optional port, path, query string, and fragment are matched in that order.
static BARE_URL_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
    ).unwrap()
});
66
// Pattern for email addresses: local part, `@`, domain, and a final dot followed
// by at least two ASCII letters. Unanchored, so it can match inside longer text.
static BARE_EMAIL_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap());
70
// Pattern for blockquote prefix in parse_list_blocks.
// Anchored at the start of the input; group 1 captures optional whitespace,
// one or more `>` markers, and any whitespace that follows them.
static BLOCKQUOTE_PREFIX_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*>+\s*)").unwrap());
73
/// Pre-computed information about a single line of the document.
///
/// The line text itself is not stored here; it is recovered from the source
/// document through `byte_offset` and `byte_len`.
#[derive(Debug, Clone)]
pub struct LineInfo {
    /// Byte offset where this line starts in the document
    pub byte_offset: usize,
    /// Length of the line in bytes (without newline)
    pub byte_len: usize,
    /// Number of leading spaces/tabs
    pub indent: usize,
    /// Whether the line is blank (empty or only whitespace)
    pub is_blank: bool,
    /// Whether this line is inside a code block
    pub in_code_block: bool,
    /// Whether this line is inside front matter
    pub in_front_matter: bool,
    /// Whether this line is inside an HTML block
    pub in_html_block: bool,
    /// Whether this line is inside an HTML comment
    pub in_html_comment: bool,
    /// List item information if this line starts a list item
    pub list_item: Option<ListItemInfo>,
    /// Heading information if this line is a heading
    pub heading: Option<HeadingInfo>,
    /// Blockquote information if this line is a blockquote
    pub blockquote: Option<BlockquoteInfo>,
    /// Whether this line is inside a mkdocstrings autodoc block
    pub in_mkdocstrings: bool,
    /// Whether this line is part of an ESM import/export block (MDX only)
    pub in_esm_block: bool,
    /// Whether this line is a continuation of a multi-line code span from a previous line
    /// (true for every line after the span's first line, up to and including its last)
    pub in_code_span_continuation: bool,
}
106
107impl LineInfo {
108    /// Get the line content as a string slice from the source document
109    pub fn content<'a>(&self, source: &'a str) -> &'a str {
110        &source[self.byte_offset..self.byte_offset + self.byte_len]
111    }
112}
113
/// Information about a list item, attached to the line that starts it
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// The marker used (`*`, `-`, `+`, or a number followed by `.` or `)`)
    pub marker: String,
    /// Whether it's ordered (true) or unordered (false)
    pub is_ordered: bool,
    /// The number for ordered lists
    pub number: Option<usize>,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where content after marker starts
    pub content_column: usize,
}
128
/// Heading style type.
///
/// Equality is total for this field-less enum, so it derives `Eq` alongside
/// `PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum HeadingStyle {
    /// ATX style heading (`# Heading`)
    ATX,
    /// Setext style heading with `=` underline
    Setext1,
    /// Setext style heading with `-` underline
    Setext2,
}
139
/// Parsed link information.
///
/// The string fields are `Cow`s with lifetime `'a`, so they can either borrow
/// or own their text (presumably borrowing from the source document - confirm
/// against the parser).
#[derive(Debug, Clone)]
pub struct ParsedLink<'a> {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Link text
    pub text: Cow<'a, str>,
    /// Link URL or reference
    pub url: Cow<'a, str>,
    /// Whether this is a reference link `[text][ref]` vs inline `[text](url)`
    pub is_reference: bool,
    /// Reference ID for reference links
    pub reference_id: Option<Cow<'a, str>>,
    /// Link type from pulldown-cmark
    pub link_type: LinkType,
}
164
/// Information about a broken link reported by pulldown-cmark
#[derive(Debug, Clone)]
pub struct BrokenLinkInfo {
    /// The reference text that couldn't be resolved
    pub reference: String,
    /// Byte span in the source document (half-open `start..end` range)
    pub span: std::ops::Range<usize>,
}
173
/// Parsed footnote reference (e.g., `[^1]`, `[^note]`)
#[derive(Debug, Clone)]
pub struct FootnoteRef {
    /// The footnote ID (without the `^` prefix), e.g. `1` or `note`
    pub id: String,
    /// Line number (1-indexed)
    pub line: usize,
    /// Start byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
}
186
/// Parsed image information.
///
/// Mirrors `ParsedLink` field for field, with `alt_text` in place of `text`.
#[derive(Debug, Clone)]
pub struct ParsedImage<'a> {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Alt text
    pub alt_text: Cow<'a, str>,
    /// Image URL or reference
    pub url: Cow<'a, str>,
    /// Whether this is a reference image `![alt][ref]` vs inline `![alt](url)`
    pub is_reference: bool,
    /// Reference ID for reference images
    pub reference_id: Option<Cow<'a, str>>,
    /// Link type from pulldown-cmark
    pub link_type: LinkType,
}
211
/// Reference definition `[ref]: url "title"`
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// Line number (1-indexed)
    pub line: usize,
    /// Reference ID (normalized to lowercase); lookups must lowercase their key to match
    pub id: String,
    /// URL
    pub url: String,
    /// Optional title
    pub title: Option<String>,
    /// Byte offset where the reference definition starts
    pub byte_offset: usize,
    /// Byte offset where the reference definition ends
    pub byte_end: usize,
}
228
/// Parsed code span information.
///
/// A span may cover several lines; `line` and `end_line` are equal for a
/// single-line span.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// Line number where the code span starts (1-indexed)
    pub line: usize,
    /// Line number where the code span ends (1-indexed)
    pub end_line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Number of backticks used (1, 2, 3, etc.)
    pub backtick_count: usize,
    /// Content inside the code span (without backticks)
    pub content: String,
}
249
/// Information about a heading, attached to the line that holds it
#[derive(Debug, Clone)]
pub struct HeadingInfo {
    /// Heading level (1-6 for ATX, 1-2 for Setext)
    pub level: u8,
    /// Style of heading
    pub style: HeadingStyle,
    /// The heading marker (`#` characters or underline)
    pub marker: String,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where heading text starts
    pub content_column: usize,
    /// The heading text (without markers and without custom ID syntax)
    pub text: String,
    /// Custom header ID if present (e.g., from `{#custom-id}` syntax)
    pub custom_id: Option<String>,
    /// Original heading text including custom ID syntax
    pub raw_text: String,
    /// Whether it has a closing sequence (for ATX)
    pub has_closing_sequence: bool,
    /// The closing sequence if present
    pub closing_sequence: String,
}
274
/// Information about a blockquote line
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Nesting level (1 for `>`, 2 for `>>`, etc.)
    pub nesting_level: usize,
    /// The indentation before the blockquote marker
    pub indent: String,
    /// Column where the first `>` starts (0-based)
    pub marker_column: usize,
    /// The blockquote prefix (e.g., `"> "`, `">> "`, etc.)
    pub prefix: String,
    /// Content after the blockquote marker(s)
    pub content: String,
    /// Whether the line has no space after the marker
    pub has_no_space_after_marker: bool,
    /// Whether the line has multiple spaces after the marker
    pub has_multiple_spaces_after_marker: bool,
    /// Whether this is an empty blockquote line needing MD028 fix
    pub needs_md028_fix: bool,
}
295
/// Information about a list block.
///
/// `start_line..=end_line` is treated as an inclusive range by the lookup
/// helpers on `LintContext`.
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// Line number where the list starts (1-indexed)
    pub start_line: usize,
    /// Line number where the list ends (1-indexed)
    pub end_line: usize,
    /// Whether it's ordered or unordered
    pub is_ordered: bool,
    /// The consistent marker for unordered lists (if any)
    pub marker: Option<String>,
    /// Blockquote prefix for this list (empty if not in blockquote)
    pub blockquote_prefix: String,
    /// Lines that are list items within this block
    pub item_lines: Vec<usize>,
    /// Nesting level (0 for top-level lists)
    pub nesting_level: usize,
    /// Maximum marker width seen in this block (e.g., 3 for `"1. "`, 4 for `"10. "`)
    pub max_marker_width: usize,
}
316
317use std::sync::{Arc, OnceLock};
318
/// Character frequency data for fast content analysis.
///
/// Each field counts one specific character; the parenthesised notes name the
/// Markdown constructs that character can introduce. `Default` yields all zeros.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Count of `#` characters (headings)
    pub hash_count: usize,
    /// Count of `*` characters (emphasis, lists, horizontal rules)
    pub asterisk_count: usize,
    /// Count of `_` characters (emphasis, horizontal rules)
    pub underscore_count: usize,
    /// Count of `-` characters (lists, horizontal rules, setext headings)
    pub hyphen_count: usize,
    /// Count of `+` characters (lists)
    pub plus_count: usize,
    /// Count of `>` characters (blockquotes)
    pub gt_count: usize,
    /// Count of `|` characters (tables)
    pub pipe_count: usize,
    /// Count of `[` characters (links, images)
    pub bracket_count: usize,
    /// Count of `` ` `` characters (code spans, code blocks)
    pub backtick_count: usize,
    /// Count of `<` characters (HTML tags, autolinks)
    pub lt_count: usize,
    /// Count of `!` characters (images)
    pub exclamation_count: usize,
    /// Count of newline characters
    pub newline_count: usize,
}
347
/// Pre-parsed HTML tag information
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Tag name (e.g., "div", "img", "br")
    pub tag_name: String,
    /// Whether it's a closing tag (`</tag>`)
    pub is_closing: bool,
    /// Whether it's self-closing (`<tag />`)
    pub is_self_closing: bool,
    /// Raw tag content
    pub raw_content: String,
}
370
/// Pre-parsed emphasis span information
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Type of emphasis (`'*'` or `'_'`)
    pub marker: char,
    /// Number of markers (1 for italic, 2 for bold, 3+ for bold+italic)
    pub marker_count: usize,
    /// Content inside the emphasis
    pub content: String,
}
391
/// Pre-parsed table row information
#[derive(Debug, Clone)]
pub struct TableRow {
    /// Line number (1-indexed)
    pub line: usize,
    /// Whether this is a separator row (contains only `|`, `-`, `:`, and spaces)
    pub is_separator: bool,
    /// Number of columns (pipe-separated cells)
    pub column_count: usize,
    /// Alignment info from separator row; each entry is one of
    /// "left", "center", "right", or "none"
    pub column_alignments: Vec<String>, // "left", "center", "right", "none"
}
404
/// Pre-parsed bare URL information (URLs and email addresses that are not in links)
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// The URL string
    pub url: String,
    /// Type of URL ("http", "https", "ftp", "email")
    pub url_type: String,
}
423
/// Pre-computed analysis of one Markdown document.
///
/// Built once by `LintContext::new`; most data is computed eagerly there, while
/// the `*_cache` fields are filled on first use through their accessor methods.
/// The `'a` lifetime ties the context to the borrowed document text.
pub struct LintContext<'a> {
    /// The document text being linted
    pub content: &'a str,
    /// Byte offset of the start of each line (first entry is 0)
    pub line_offsets: Vec<usize>,
    pub code_blocks: Vec<(usize, usize)>, // Cached code block ranges (not including inline code spans)
    pub lines: Vec<LineInfo>,             // Pre-computed line information
    pub links: Vec<ParsedLink<'a>>,       // Pre-parsed links
    pub images: Vec<ParsedImage<'a>>,     // Pre-parsed images
    pub broken_links: Vec<BrokenLinkInfo>, // Broken/undefined references
    pub footnote_refs: Vec<FootnoteRef>,  // Pre-parsed footnote references
    pub reference_defs: Vec<ReferenceDef>, // Reference definitions
    code_spans_cache: OnceLock<Arc<Vec<CodeSpan>>>, // Inline code spans (seeded by `new`, see `code_spans()`)
    pub list_blocks: Vec<ListBlock>,      // Pre-parsed list blocks
    pub char_frequency: CharFrequency,    // Character frequency analysis
    html_tags_cache: OnceLock<Arc<Vec<HtmlTag>>>, // Lazy-loaded HTML tags
    emphasis_spans_cache: OnceLock<Arc<Vec<EmphasisSpan>>>, // Lazy-loaded emphasis spans
    table_rows_cache: OnceLock<Arc<Vec<TableRow>>>, // Lazy-loaded table rows
    bare_urls_cache: OnceLock<Arc<Vec<BareUrl>>>, // Lazy-loaded bare URLs
    html_comment_ranges: Vec<crate::utils::skip_context::ByteRange>, // Pre-computed HTML comment ranges
    pub table_blocks: Vec<crate::utils::table_utils::TableBlock>, // Pre-computed table blocks
    pub line_index: crate::utils::range_utils::LineIndex<'a>, // Pre-computed line index for byte position calculations
    jinja_ranges: Vec<(usize, usize)>,    // Pre-computed Jinja template ranges ({{ }}, {% %})
    pub flavor: MarkdownFlavor,           // Markdown flavor being used
    pub source_file: Option<PathBuf>,     // Source file path (for rules that need file context)
}
448
/// The four consecutive pieces of a blockquote line, borrowed from the input.
///
/// Concatenating `indent`, `markers`, `spaces_after`, and `content` in that
/// order reproduces the original line.
struct BlockquoteComponents<'a> {
    indent: &'a str,
    markers: &'a str,
    spaces_after: &'a str,
    content: &'a str,
}

/// Split a line into its blockquote components without using a regex.
///
/// Leading spaces/tabs become `indent`, the following run of `>` becomes
/// `markers`, the spaces/tabs after that become `spaces_after`, and whatever
/// remains is `content`. Returns `None` when the first character after the
/// indent is not `>` (including empty and whitespace-only lines).
#[inline]
fn parse_blockquote_detailed(line: &str) -> Option<BlockquoteComponents<'_>> {
    fn is_pad(c: char) -> bool {
        c == ' ' || c == '\t'
    }

    let after_indent = line.trim_start_matches(is_pad);
    let after_markers = after_indent.trim_start_matches('>');
    if after_markers.len() == after_indent.len() {
        // Nothing was stripped, so there is no '>' marker here.
        return None;
    }
    let rest = after_markers.trim_start_matches(is_pad);

    // Each remainder is a suffix of `line`, so its length gives the split point.
    let indent_end = line.len() - after_indent.len();
    let markers_end = line.len() - after_markers.len();
    let spaces_end = line.len() - rest.len();

    Some(BlockquoteComponents {
        indent: &line[..indent_end],
        markers: &line[indent_end..markers_end],
        spaces_after: &line[markers_end..spaces_end],
        content: &line[spaces_end..],
    })
}
493
494impl<'a> LintContext<'a> {
    /// Build a lint context for `content`, running every eager analysis once.
    ///
    /// The stages run in a fixed order because later stages consume the results
    /// of earlier ones: line offsets, code blocks, HTML comment ranges and
    /// autodoc ranges feed the basic line info; HTML/ESM block detection runs
    /// before heading and blockquote detection; code spans are parsed before
    /// links and images so they can be excluded there.
    ///
    /// * `content` - the document text; the context borrows it for `'a`.
    /// * `flavor` - Markdown flavor, passed to the flavor-aware stages.
    /// * `source_file` - optional path of the document, stored as-is.
    ///
    /// On non-WASM targets, setting the `RUMDL_PROFILE_QUADRATIC` environment
    /// variable makes each stage print its elapsed time to stderr.
    pub fn new(content: &'a str, flavor: MarkdownFlavor, source_file: Option<PathBuf>) -> Self {
        #[cfg(not(target_arch = "wasm32"))]
        let profile = std::env::var("RUMDL_PROFILE_QUADRATIC").is_ok();
        #[cfg(target_arch = "wasm32")]
        let profile = false;

        // Byte offset of the start of every line: 0, then one past each '\n'
        let line_offsets = profile_section!("Line offsets", profile, {
            let mut offsets = vec![0];
            for (i, c) in content.char_indices() {
                if c == '\n' {
                    offsets.push(i + 1);
                }
            }
            offsets
        });

        // Detect code blocks once and cache them
        let code_blocks = profile_section!("Code blocks", profile, CodeBlockUtils::detect_code_blocks(content));

        // Pre-compute HTML comment ranges ONCE for all operations
        let html_comment_ranges = profile_section!(
            "HTML comment ranges",
            profile,
            crate::utils::skip_context::compute_html_comment_ranges(content)
        );

        // Pre-compute autodoc block ranges for MkDocs flavor (avoids O(n²) scaling)
        let autodoc_ranges = profile_section!("Autodoc block ranges", profile, {
            if flavor == MarkdownFlavor::MkDocs {
                crate::utils::mkdocstrings_refs::detect_autodoc_block_ranges(content)
            } else {
                Vec::new()
            }
        });

        // Pre-compute line information (without headings/blockquotes yet)
        let mut lines = profile_section!(
            "Basic line info",
            profile,
            Self::compute_basic_line_info(
                content,
                &line_offsets,
                &code_blocks,
                flavor,
                &html_comment_ranges,
                &autodoc_ranges,
            )
        );

        // Detect HTML blocks BEFORE heading detection
        profile_section!("HTML blocks", profile, Self::detect_html_blocks(content, &mut lines));

        // Detect ESM import/export blocks in MDX files BEFORE heading detection
        profile_section!(
            "ESM blocks",
            profile,
            Self::detect_esm_blocks(content, &mut lines, flavor)
        );

        // Now detect headings and blockquotes
        profile_section!(
            "Headings & blockquotes",
            profile,
            Self::detect_headings_and_blockquotes(content, &mut lines, flavor, &html_comment_ranges)
        );

        // Parse code spans early so we can exclude them from link/image parsing
        let code_spans = profile_section!("Code spans", profile, Self::parse_code_spans(content, &lines));

        // Mark lines that are continuations of multi-line code spans
        // This is needed for parse_list_blocks to correctly handle list items with multi-line code spans
        for span in &code_spans {
            if span.end_line > span.line {
                // Mark lines after the first line as continuations
                // (span line numbers are 1-indexed, `lines` is 0-indexed)
                for line_num in (span.line + 1)..=span.end_line {
                    if let Some(line_info) = lines.get_mut(line_num - 1) {
                        line_info.in_code_span_continuation = true;
                    }
                }
            }
        }

        // Parse links, images, references, and list blocks
        let (links, broken_links, footnote_refs) = profile_section!(
            "Links",
            profile,
            Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor, &html_comment_ranges)
        );

        let images = profile_section!(
            "Images",
            profile,
            Self::parse_images(content, &lines, &code_blocks, &code_spans, &html_comment_ranges)
        );

        let reference_defs = profile_section!("Reference defs", profile, Self::parse_reference_defs(content, &lines));

        let list_blocks = profile_section!("List blocks", profile, Self::parse_list_blocks(content, &lines));

        // Compute character frequency for fast content analysis
        let char_frequency = profile_section!("Char frequency", profile, Self::compute_char_frequency(content));

        // Pre-compute table blocks for rules that need them (MD013, MD055, MD056, MD058, MD060)
        let table_blocks = profile_section!(
            "Table blocks",
            profile,
            crate::utils::table_utils::TableUtils::find_table_blocks_with_code_info(
                content,
                &code_blocks,
                &code_spans,
                &html_comment_ranges,
            )
        );

        // Pre-compute LineIndex once for all rules (eliminates 46x content cloning)
        let line_index = profile_section!(
            "Line index",
            profile,
            crate::utils::range_utils::LineIndex::new(content)
        );

        // Pre-compute Jinja template ranges once for all rules (eliminates O(n×m) in MD011)
        let jinja_ranges = profile_section!(
            "Jinja ranges",
            profile,
            crate::utils::jinja_utils::find_jinja_ranges(content)
        );

        Self {
            content,
            line_offsets,
            code_blocks,
            lines,
            links,
            images,
            broken_links,
            footnote_refs,
            reference_defs,
            // Seed the cache with the spans parsed above so `code_spans()` never re-parses
            code_spans_cache: OnceLock::from(Arc::new(code_spans)),
            list_blocks,
            char_frequency,
            // The remaining caches start empty and are filled by their accessors
            html_tags_cache: OnceLock::new(),
            emphasis_spans_cache: OnceLock::new(),
            table_rows_cache: OnceLock::new(),
            bare_urls_cache: OnceLock::new(),
            html_comment_ranges,
            table_blocks,
            line_index,
            jinja_ranges,
            flavor,
            source_file,
        }
    }
648
649    /// Get code spans - computed lazily on first access
650    pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
651        Arc::clone(
652            self.code_spans_cache
653                .get_or_init(|| Arc::new(Self::parse_code_spans(self.content, &self.lines))),
654        )
655    }
656
657    /// Get HTML comment ranges - pre-computed during LintContext construction
658    pub fn html_comment_ranges(&self) -> &[crate::utils::skip_context::ByteRange] {
659        &self.html_comment_ranges
660    }
661
662    /// Get HTML tags - computed lazily on first access
663    pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
664        Arc::clone(self.html_tags_cache.get_or_init(|| {
665            Arc::new(Self::parse_html_tags(
666                self.content,
667                &self.lines,
668                &self.code_blocks,
669                self.flavor,
670            ))
671        }))
672    }
673
674    /// Get emphasis spans - computed lazily on first access
675    pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
676        Arc::clone(
677            self.emphasis_spans_cache
678                .get_or_init(|| Arc::new(Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks))),
679        )
680    }
681
682    /// Get table rows - computed lazily on first access
683    pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
684        Arc::clone(
685            self.table_rows_cache
686                .get_or_init(|| Arc::new(Self::parse_table_rows(self.content, &self.lines))),
687        )
688    }
689
690    /// Get bare URLs - computed lazily on first access
691    pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
692        Arc::clone(
693            self.bare_urls_cache
694                .get_or_init(|| Arc::new(Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks))),
695        )
696    }
697
698    /// Map a byte offset to (line, column)
699    pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
700        match self.line_offsets.binary_search(&offset) {
701            Ok(line) => (line + 1, 1),
702            Err(line) => {
703                let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
704                (line, offset - line_start + 1)
705            }
706        }
707    }
708
709    /// Check if a position is within a code block or code span
710    pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
711        // Check code blocks first
712        if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
713            return true;
714        }
715
716        // Check inline code spans (lazy load if needed)
717        self.code_spans()
718            .iter()
719            .any(|span| pos >= span.byte_offset && pos < span.byte_end)
720    }
721
722    /// Get line information by line number (1-indexed)
723    pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
724        if line_num > 0 {
725            self.lines.get(line_num - 1)
726        } else {
727            None
728        }
729    }
730
731    /// Get byte offset for a line number (1-indexed)
732    pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
733        self.line_info(line_num).map(|info| info.byte_offset)
734    }
735
736    /// Get URL for a reference link/image by its ID
737    pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
738        let normalized_id = ref_id.to_lowercase();
739        self.reference_defs
740            .iter()
741            .find(|def| def.id == normalized_id)
742            .map(|def| def.url.as_str())
743    }
744
745    /// Check if a line is part of a list block
746    pub fn is_in_list_block(&self, line_num: usize) -> bool {
747        self.list_blocks
748            .iter()
749            .any(|block| line_num >= block.start_line && line_num <= block.end_line)
750    }
751
752    /// Get the list block containing a specific line
753    pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
754        self.list_blocks
755            .iter()
756            .find(|block| line_num >= block.start_line && line_num <= block.end_line)
757    }
758
759    // Compatibility methods for DocumentStructure migration
760
761    /// Check if a line is within a code block
762    pub fn is_in_code_block(&self, line_num: usize) -> bool {
763        if line_num == 0 || line_num > self.lines.len() {
764            return false;
765        }
766        self.lines[line_num - 1].in_code_block
767    }
768
769    /// Check if a line is within front matter
770    pub fn is_in_front_matter(&self, line_num: usize) -> bool {
771        if line_num == 0 || line_num > self.lines.len() {
772            return false;
773        }
774        self.lines[line_num - 1].in_front_matter
775    }
776
777    /// Check if a line is within an HTML block
778    pub fn is_in_html_block(&self, line_num: usize) -> bool {
779        if line_num == 0 || line_num > self.lines.len() {
780            return false;
781        }
782        self.lines[line_num - 1].in_html_block
783    }
784
785    /// Check if a line and column is within a code span
786    pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
787        if line_num == 0 || line_num > self.lines.len() {
788            return false;
789        }
790
791        // Use the code spans cache to check
792        // Note: col is 1-indexed from caller, but span.start_col and span.end_col are 0-indexed
793        // Convert col to 0-indexed for comparison
794        let col_0indexed = if col > 0 { col - 1 } else { 0 };
795        let code_spans = self.code_spans();
796        code_spans.iter().any(|span| {
797            // Check if line is within the span's line range
798            if line_num < span.line || line_num > span.end_line {
799                return false;
800            }
801
802            if span.line == span.end_line {
803                // Single-line span: check column bounds
804                col_0indexed >= span.start_col && col_0indexed < span.end_col
805            } else if line_num == span.line {
806                // First line of multi-line span: anything after start_col is in span
807                col_0indexed >= span.start_col
808            } else if line_num == span.end_line {
809                // Last line of multi-line span: anything before end_col is in span
810                col_0indexed < span.end_col
811            } else {
812                // Middle line of multi-line span: entire line is in span
813                true
814            }
815        })
816    }
817
818    /// Check if a byte offset is within a code span
819    #[inline]
820    pub fn is_byte_offset_in_code_span(&self, byte_offset: usize) -> bool {
821        let code_spans = self.code_spans();
822        code_spans
823            .iter()
824            .any(|span| byte_offset >= span.byte_offset && byte_offset < span.byte_end)
825    }
826
827    /// Check if a byte position is within a reference definition
828    /// This is much faster than scanning the content with regex for each check (O(1) vs O(n))
829    #[inline]
830    pub fn is_in_reference_def(&self, byte_pos: usize) -> bool {
831        self.reference_defs
832            .iter()
833            .any(|ref_def| byte_pos >= ref_def.byte_offset && byte_pos < ref_def.byte_end)
834    }
835
836    /// Check if a byte position is within an HTML comment
837    /// This is much faster than scanning the content with regex for each check (O(k) vs O(n))
838    /// where k is the number of HTML comments (typically very small)
839    #[inline]
840    pub fn is_in_html_comment(&self, byte_pos: usize) -> bool {
841        self.html_comment_ranges
842            .iter()
843            .any(|range| byte_pos >= range.start && byte_pos < range.end)
844    }
845
846    /// Check if a byte position is within a Jinja template ({{ }} or {% %})
847    pub fn is_in_jinja_range(&self, byte_pos: usize) -> bool {
848        self.jinja_ranges
849            .iter()
850            .any(|(start, end)| byte_pos >= *start && byte_pos < *end)
851    }
852
853    /// Check if content has any instances of a specific character (fast)
854    pub fn has_char(&self, ch: char) -> bool {
855        match ch {
856            '#' => self.char_frequency.hash_count > 0,
857            '*' => self.char_frequency.asterisk_count > 0,
858            '_' => self.char_frequency.underscore_count > 0,
859            '-' => self.char_frequency.hyphen_count > 0,
860            '+' => self.char_frequency.plus_count > 0,
861            '>' => self.char_frequency.gt_count > 0,
862            '|' => self.char_frequency.pipe_count > 0,
863            '[' => self.char_frequency.bracket_count > 0,
864            '`' => self.char_frequency.backtick_count > 0,
865            '<' => self.char_frequency.lt_count > 0,
866            '!' => self.char_frequency.exclamation_count > 0,
867            '\n' => self.char_frequency.newline_count > 0,
868            _ => self.content.contains(ch), // Fallback for other characters
869        }
870    }
871
872    /// Get count of a specific character (fast)
873    pub fn char_count(&self, ch: char) -> usize {
874        match ch {
875            '#' => self.char_frequency.hash_count,
876            '*' => self.char_frequency.asterisk_count,
877            '_' => self.char_frequency.underscore_count,
878            '-' => self.char_frequency.hyphen_count,
879            '+' => self.char_frequency.plus_count,
880            '>' => self.char_frequency.gt_count,
881            '|' => self.char_frequency.pipe_count,
882            '[' => self.char_frequency.bracket_count,
883            '`' => self.char_frequency.backtick_count,
884            '<' => self.char_frequency.lt_count,
885            '!' => self.char_frequency.exclamation_count,
886            '\n' => self.char_frequency.newline_count,
887            _ => self.content.matches(ch).count(), // Fallback for other characters
888        }
889    }
890
891    /// Check if content likely contains headings (fast)
892    pub fn likely_has_headings(&self) -> bool {
893        self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 // Potential setext underlines
894    }
895
896    /// Check if content likely contains lists (fast)
897    pub fn likely_has_lists(&self) -> bool {
898        self.char_frequency.asterisk_count > 0
899            || self.char_frequency.hyphen_count > 0
900            || self.char_frequency.plus_count > 0
901    }
902
903    /// Check if content likely contains emphasis (fast)
904    pub fn likely_has_emphasis(&self) -> bool {
905        self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
906    }
907
908    /// Check if content likely contains tables (fast)
909    pub fn likely_has_tables(&self) -> bool {
910        self.char_frequency.pipe_count > 2
911    }
912
913    /// Check if content likely contains blockquotes (fast)
914    pub fn likely_has_blockquotes(&self) -> bool {
915        self.char_frequency.gt_count > 0
916    }
917
918    /// Check if content likely contains code (fast)
919    pub fn likely_has_code(&self) -> bool {
920        self.char_frequency.backtick_count > 0
921    }
922
923    /// Check if content likely contains links or images (fast)
924    pub fn likely_has_links_or_images(&self) -> bool {
925        self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
926    }
927
928    /// Check if content likely contains HTML (fast)
929    pub fn likely_has_html(&self) -> bool {
930        self.char_frequency.lt_count > 0
931    }
932
933    /// Get HTML tags on a specific line
934    pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
935        self.html_tags()
936            .iter()
937            .filter(|tag| tag.line == line_num)
938            .cloned()
939            .collect()
940    }
941
942    /// Get emphasis spans on a specific line
943    pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
944        self.emphasis_spans()
945            .iter()
946            .filter(|span| span.line == line_num)
947            .cloned()
948            .collect()
949    }
950
951    /// Get table rows on a specific line
952    pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
953        self.table_rows()
954            .iter()
955            .filter(|row| row.line == line_num)
956            .cloned()
957            .collect()
958    }
959
960    /// Get bare URLs on a specific line
961    pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
962        self.bare_urls()
963            .iter()
964            .filter(|url| url.line == line_num)
965            .cloned()
966            .collect()
967    }
968
969    /// Find the line index for a given byte offset using binary search.
970    /// Returns (line_index, line_number, column) where:
971    /// - line_index is the 0-based index in the lines array
972    /// - line_number is the 1-based line number
973    /// - column is the byte offset within that line
974    #[inline]
975    fn find_line_for_offset(lines: &[LineInfo], byte_offset: usize) -> (usize, usize, usize) {
976        // Binary search to find the line containing this byte offset
977        let idx = match lines.binary_search_by(|line| {
978            if byte_offset < line.byte_offset {
979                std::cmp::Ordering::Greater
980            } else if byte_offset > line.byte_offset + line.byte_len {
981                std::cmp::Ordering::Less
982            } else {
983                std::cmp::Ordering::Equal
984            }
985        }) {
986            Ok(idx) => idx,
987            Err(idx) => idx.saturating_sub(1),
988        };
989
990        let line = &lines[idx];
991        let line_num = idx + 1;
992        let col = byte_offset.saturating_sub(line.byte_offset);
993
994        (idx, line_num, col)
995    }
996
997    /// Check if a byte offset is within a code span using binary search
998    #[inline]
999    fn is_offset_in_code_span(code_spans: &[CodeSpan], offset: usize) -> bool {
1000        // Since spans are sorted by byte_offset, use partition_point for binary search
1001        let idx = code_spans.partition_point(|span| span.byte_offset <= offset);
1002
1003        // Check the span that starts at or before our offset
1004        if idx > 0 {
1005            let span = &code_spans[idx - 1];
1006            if offset >= span.byte_offset && offset < span.byte_end {
1007                return true;
1008            }
1009        }
1010
1011        false
1012    }
1013
1014    /// Parse all links in the content
1015    fn parse_links(
1016        content: &'a str,
1017        lines: &[LineInfo],
1018        code_blocks: &[(usize, usize)],
1019        code_spans: &[CodeSpan],
1020        flavor: MarkdownFlavor,
1021        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1022    ) -> (Vec<ParsedLink<'a>>, Vec<BrokenLinkInfo>, Vec<FootnoteRef>) {
1023        use crate::utils::skip_context::{is_in_html_comment_ranges, is_mkdocs_snippet_line};
1024        use std::collections::HashSet;
1025
1026        let mut links = Vec::with_capacity(content.len() / 500);
1027        let mut broken_links = Vec::new();
1028        let mut footnote_refs = Vec::new();
1029
1030        // Track byte positions of links found by pulldown-cmark
1031        let mut found_positions = HashSet::new();
1032
1033        // Use pulldown-cmark's streaming parser with BrokenLink callback
1034        // The callback captures undefined references: [text][undefined], [shortcut], [text][]
1035        // This automatically handles:
1036        // - Escaped links (won't generate events)
1037        // - Links in code blocks/spans (won't generate Link events)
1038        // - Images (generates Tag::Image instead)
1039        // - Reference resolution (dest_url is already resolved!)
1040        // - Broken references (callback is invoked)
1041        // - Wiki-links (enabled via ENABLE_WIKILINKS)
1042        let mut options = Options::empty();
1043        options.insert(Options::ENABLE_WIKILINKS);
1044        options.insert(Options::ENABLE_FOOTNOTES);
1045
1046        let parser = Parser::new_with_broken_link_callback(
1047            content,
1048            options,
1049            Some(|link: BrokenLink<'_>| {
1050                broken_links.push(BrokenLinkInfo {
1051                    reference: link.reference.to_string(),
1052                    span: link.span.clone(),
1053                });
1054                None
1055            }),
1056        )
1057        .into_offset_iter();
1058
1059        let mut link_stack: Vec<(
1060            usize,
1061            usize,
1062            pulldown_cmark::CowStr<'a>,
1063            LinkType,
1064            pulldown_cmark::CowStr<'a>,
1065        )> = Vec::new();
1066        let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1067
1068        for (event, range) in parser {
1069            match event {
1070                Event::Start(Tag::Link {
1071                    link_type,
1072                    dest_url,
1073                    id,
1074                    ..
1075                }) => {
1076                    // Link start - record position, URL, and reference ID
1077                    link_stack.push((range.start, range.end, dest_url, link_type, id));
1078                    text_chunks.clear();
1079                }
1080                Event::Text(text) if !link_stack.is_empty() => {
1081                    // Track text content with its byte range
1082                    text_chunks.push((text.to_string(), range.start, range.end));
1083                }
1084                Event::Code(code) if !link_stack.is_empty() => {
1085                    // Include inline code in link text (with backticks)
1086                    let code_text = format!("`{code}`");
1087                    text_chunks.push((code_text, range.start, range.end));
1088                }
1089                Event::End(TagEnd::Link) => {
1090                    if let Some((start_pos, _link_start_end, url, link_type, ref_id)) = link_stack.pop() {
1091                        // Skip if in HTML comment
1092                        if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1093                            text_chunks.clear();
1094                            continue;
1095                        }
1096
1097                        // Find line and column information
1098                        let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1099
1100                        // Skip if this link is on a MkDocs snippet line
1101                        if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1102                            text_chunks.clear();
1103                            continue;
1104                        }
1105
1106                        let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1107
1108                        let is_reference = matches!(
1109                            link_type,
1110                            LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1111                        );
1112
1113                        // Extract link text directly from source bytes to preserve escaping
1114                        // Text events from pulldown-cmark unescape \] → ], which breaks MD039
1115                        let link_text = if start_pos < content.len() {
1116                            let link_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1117
1118                            // Find MATCHING ] by tracking bracket depth for nested brackets
1119                            // An unescaped bracket is one NOT preceded by an odd number of backslashes
1120                            // Brackets inside code spans (between backticks) should be ignored
1121                            let mut close_pos = None;
1122                            let mut depth = 0;
1123                            let mut in_code_span = false;
1124
1125                            for (i, &byte) in link_bytes.iter().enumerate().skip(1) {
1126                                // Count preceding backslashes
1127                                let mut backslash_count = 0;
1128                                let mut j = i;
1129                                while j > 0 && link_bytes[j - 1] == b'\\' {
1130                                    backslash_count += 1;
1131                                    j -= 1;
1132                                }
1133                                let is_escaped = backslash_count % 2 != 0;
1134
1135                                // Track code spans - backticks toggle in/out of code
1136                                if byte == b'`' && !is_escaped {
1137                                    in_code_span = !in_code_span;
1138                                }
1139
1140                                // Only count brackets when NOT in a code span
1141                                if !is_escaped && !in_code_span {
1142                                    if byte == b'[' {
1143                                        depth += 1;
1144                                    } else if byte == b']' {
1145                                        if depth == 0 {
1146                                            // Found the matching closing bracket
1147                                            close_pos = Some(i);
1148                                            break;
1149                                        } else {
1150                                            depth -= 1;
1151                                        }
1152                                    }
1153                                }
1154                            }
1155
1156                            if let Some(pos) = close_pos {
1157                                Cow::Borrowed(std::str::from_utf8(&link_bytes[1..pos]).unwrap_or(""))
1158                            } else {
1159                                Cow::Borrowed("")
1160                            }
1161                        } else {
1162                            Cow::Borrowed("")
1163                        };
1164
1165                        // For reference links, use the actual reference ID from pulldown-cmark
1166                        let reference_id = if is_reference && !ref_id.is_empty() {
1167                            Some(Cow::Owned(ref_id.to_lowercase()))
1168                        } else if is_reference {
1169                            // For collapsed/shortcut references without explicit ID, use the link text
1170                            Some(Cow::Owned(link_text.to_lowercase()))
1171                        } else {
1172                            None
1173                        };
1174
1175                        // WORKAROUND: pulldown-cmark bug with escaped brackets
1176                        // Check for escaped image syntax: \![text](url)
1177                        // The byte_offset points to the '[', so we check 2 bytes back for '\!'
1178                        let has_escaped_bang = start_pos >= 2
1179                            && content.as_bytes().get(start_pos - 2) == Some(&b'\\')
1180                            && content.as_bytes().get(start_pos - 1) == Some(&b'!');
1181
1182                        // Check for escaped bracket: \[text](url)
1183                        // The byte_offset points to the '[', so we check 1 byte back for '\'
1184                        let has_escaped_bracket =
1185                            start_pos >= 1 && content.as_bytes().get(start_pos - 1) == Some(&b'\\');
1186
1187                        if has_escaped_bang || has_escaped_bracket {
1188                            text_chunks.clear();
1189                            continue; // Skip: this is escaped markdown, not a real link
1190                        }
1191
1192                        // Track this position as found
1193                        found_positions.insert(start_pos);
1194
1195                        links.push(ParsedLink {
1196                            line: line_num,
1197                            start_col: col_start,
1198                            end_col: col_end,
1199                            byte_offset: start_pos,
1200                            byte_end: range.end,
1201                            text: link_text,
1202                            url: Cow::Owned(url.to_string()),
1203                            is_reference,
1204                            reference_id,
1205                            link_type,
1206                        });
1207
1208                        text_chunks.clear();
1209                    }
1210                }
1211                Event::FootnoteReference(footnote_id) => {
1212                    // Capture footnote references like [^1], [^note]
1213                    // Skip if in HTML comment
1214                    if is_in_html_comment_ranges(html_comment_ranges, range.start) {
1215                        continue;
1216                    }
1217
1218                    let (_, line_num, _) = Self::find_line_for_offset(lines, range.start);
1219                    footnote_refs.push(FootnoteRef {
1220                        id: footnote_id.to_string(),
1221                        line: line_num,
1222                        byte_offset: range.start,
1223                        byte_end: range.end,
1224                    });
1225                }
1226                _ => {}
1227            }
1228        }
1229
1230        // Also find undefined references using regex
1231        // These are patterns like [text][ref] that pulldown-cmark didn't parse as links
1232        // because the reference is undefined
1233        for cap in LINK_PATTERN.captures_iter(content) {
1234            let full_match = cap.get(0).unwrap();
1235            let match_start = full_match.start();
1236            let match_end = full_match.end();
1237
1238            // Skip if this was already found by pulldown-cmark (it's a valid link)
1239            if found_positions.contains(&match_start) {
1240                continue;
1241            }
1242
1243            // Skip if escaped
1244            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1245                continue;
1246            }
1247
1248            // Skip if it's an image
1249            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
1250                continue;
1251            }
1252
1253            // Skip if in code block
1254            if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
1255                continue;
1256            }
1257
1258            // Skip if in code span
1259            if Self::is_offset_in_code_span(code_spans, match_start) {
1260                continue;
1261            }
1262
1263            // Skip if in HTML comment
1264            if is_in_html_comment_ranges(html_comment_ranges, match_start) {
1265                continue;
1266            }
1267
1268            // Find line and column information
1269            let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1270
1271            // Skip if this link is on a MkDocs snippet line
1272            if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1273                continue;
1274            }
1275
1276            let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1277
1278            let text = cap.get(1).map_or("", |m| m.as_str());
1279
1280            // Only process reference links (group 6)
1281            if let Some(ref_id) = cap.get(6) {
1282                let ref_id_str = ref_id.as_str();
1283                let normalized_ref = if ref_id_str.is_empty() {
1284                    Cow::Owned(text.to_lowercase()) // Implicit reference
1285                } else {
1286                    Cow::Owned(ref_id_str.to_lowercase())
1287                };
1288
1289                // This is an undefined reference (pulldown-cmark didn't parse it)
1290                links.push(ParsedLink {
1291                    line: line_num,
1292                    start_col: col_start,
1293                    end_col: col_end,
1294                    byte_offset: match_start,
1295                    byte_end: match_end,
1296                    text: Cow::Borrowed(text),
1297                    url: Cow::Borrowed(""), // Empty URL indicates undefined reference
1298                    is_reference: true,
1299                    reference_id: Some(normalized_ref),
1300                    link_type: LinkType::Reference, // Undefined references are reference-style
1301                });
1302            }
1303        }
1304
1305        (links, broken_links, footnote_refs)
1306    }
1307
1308    /// Parse all images in the content
1309    fn parse_images(
1310        content: &'a str,
1311        lines: &[LineInfo],
1312        code_blocks: &[(usize, usize)],
1313        code_spans: &[CodeSpan],
1314        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1315    ) -> Vec<ParsedImage<'a>> {
1316        use crate::utils::skip_context::is_in_html_comment_ranges;
1317        use std::collections::HashSet;
1318
1319        // Pre-size based on a heuristic: images are less common than links
1320        let mut images = Vec::with_capacity(content.len() / 1000);
1321        let mut found_positions = HashSet::new();
1322
1323        // Use pulldown-cmark for parsing - more accurate and faster
1324        let parser = Parser::new(content).into_offset_iter();
1325        let mut image_stack: Vec<(usize, pulldown_cmark::CowStr<'a>, LinkType, pulldown_cmark::CowStr<'a>)> =
1326            Vec::new();
1327        let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1328
1329        for (event, range) in parser {
1330            match event {
1331                Event::Start(Tag::Image {
1332                    link_type,
1333                    dest_url,
1334                    id,
1335                    ..
1336                }) => {
1337                    image_stack.push((range.start, dest_url, link_type, id));
1338                    text_chunks.clear();
1339                }
1340                Event::Text(text) if !image_stack.is_empty() => {
1341                    text_chunks.push((text.to_string(), range.start, range.end));
1342                }
1343                Event::Code(code) if !image_stack.is_empty() => {
1344                    let code_text = format!("`{code}`");
1345                    text_chunks.push((code_text, range.start, range.end));
1346                }
1347                Event::End(TagEnd::Image) => {
1348                    if let Some((start_pos, url, link_type, ref_id)) = image_stack.pop() {
1349                        // Skip if in code block
1350                        if CodeBlockUtils::is_in_code_block(code_blocks, start_pos) {
1351                            continue;
1352                        }
1353
1354                        // Skip if in code span
1355                        if Self::is_offset_in_code_span(code_spans, start_pos) {
1356                            continue;
1357                        }
1358
1359                        // Skip if in HTML comment
1360                        if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1361                            continue;
1362                        }
1363
1364                        // Find line and column using binary search
1365                        let (_, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1366                        let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1367
1368                        let is_reference = matches!(
1369                            link_type,
1370                            LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1371                        );
1372
1373                        // Extract alt text directly from source bytes to preserve escaping
1374                        // Text events from pulldown-cmark unescape \] → ], which breaks rules that need escaping
1375                        let alt_text = if start_pos < content.len() {
1376                            let image_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1377
1378                            // Find MATCHING ] by tracking bracket depth for nested brackets
1379                            // An unescaped bracket is one NOT preceded by an odd number of backslashes
1380                            let mut close_pos = None;
1381                            let mut depth = 0;
1382
1383                            if image_bytes.len() > 2 {
1384                                for (i, &byte) in image_bytes.iter().enumerate().skip(2) {
1385                                    // Count preceding backslashes
1386                                    let mut backslash_count = 0;
1387                                    let mut j = i;
1388                                    while j > 0 && image_bytes[j - 1] == b'\\' {
1389                                        backslash_count += 1;
1390                                        j -= 1;
1391                                    }
1392                                    let is_escaped = backslash_count % 2 != 0;
1393
1394                                    if !is_escaped {
1395                                        if byte == b'[' {
1396                                            depth += 1;
1397                                        } else if byte == b']' {
1398                                            if depth == 0 {
1399                                                // Found the matching closing bracket
1400                                                close_pos = Some(i);
1401                                                break;
1402                                            } else {
1403                                                depth -= 1;
1404                                            }
1405                                        }
1406                                    }
1407                                }
1408                            }
1409
1410                            if let Some(pos) = close_pos {
1411                                Cow::Borrowed(std::str::from_utf8(&image_bytes[2..pos]).unwrap_or(""))
1412                            } else {
1413                                Cow::Borrowed("")
1414                            }
1415                        } else {
1416                            Cow::Borrowed("")
1417                        };
1418
1419                        let reference_id = if is_reference && !ref_id.is_empty() {
1420                            Some(Cow::Owned(ref_id.to_lowercase()))
1421                        } else if is_reference {
1422                            Some(Cow::Owned(alt_text.to_lowercase())) // Collapsed/shortcut references
1423                        } else {
1424                            None
1425                        };
1426
1427                        found_positions.insert(start_pos);
1428                        images.push(ParsedImage {
1429                            line: line_num,
1430                            start_col: col_start,
1431                            end_col: col_end,
1432                            byte_offset: start_pos,
1433                            byte_end: range.end,
1434                            alt_text,
1435                            url: Cow::Owned(url.to_string()),
1436                            is_reference,
1437                            reference_id,
1438                            link_type,
1439                        });
1440                    }
1441                }
1442                _ => {}
1443            }
1444        }
1445
1446        // Regex fallback for undefined references that pulldown-cmark treats as plain text
1447        for cap in IMAGE_PATTERN.captures_iter(content) {
1448            let full_match = cap.get(0).unwrap();
1449            let match_start = full_match.start();
1450            let match_end = full_match.end();
1451
1452            // Skip if already found by pulldown-cmark
1453            if found_positions.contains(&match_start) {
1454                continue;
1455            }
1456
1457            // Skip if the ! is escaped
1458            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1459                continue;
1460            }
1461
1462            // Skip if in code block, code span, or HTML comment
1463            if CodeBlockUtils::is_in_code_block(code_blocks, match_start)
1464                || Self::is_offset_in_code_span(code_spans, match_start)
1465                || is_in_html_comment_ranges(html_comment_ranges, match_start)
1466            {
1467                continue;
1468            }
1469
1470            // Only process reference images (undefined references not found by pulldown-cmark)
1471            if let Some(ref_id) = cap.get(6) {
1472                let (_, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1473                let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1474                let alt_text = cap.get(1).map_or("", |m| m.as_str());
1475                let ref_id_str = ref_id.as_str();
1476                let normalized_ref = if ref_id_str.is_empty() {
1477                    Cow::Owned(alt_text.to_lowercase())
1478                } else {
1479                    Cow::Owned(ref_id_str.to_lowercase())
1480                };
1481
1482                images.push(ParsedImage {
1483                    line: line_num,
1484                    start_col: col_start,
1485                    end_col: col_end,
1486                    byte_offset: match_start,
1487                    byte_end: match_end,
1488                    alt_text: Cow::Borrowed(alt_text),
1489                    url: Cow::Borrowed(""),
1490                    is_reference: true,
1491                    reference_id: Some(normalized_ref),
1492                    link_type: LinkType::Reference, // Undefined references are reference-style
1493                });
1494            }
1495        }
1496
1497        images
1498    }
1499
1500    /// Parse reference definitions
1501    fn parse_reference_defs(content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
1502        // Pre-size based on lines count as reference definitions are line-based
1503        let mut refs = Vec::with_capacity(lines.len() / 20); // ~1 ref per 20 lines
1504
1505        for (line_idx, line_info) in lines.iter().enumerate() {
1506            // Skip lines in code blocks
1507            if line_info.in_code_block {
1508                continue;
1509            }
1510
1511            let line = line_info.content(content);
1512            let line_num = line_idx + 1;
1513
1514            if let Some(cap) = REF_DEF_PATTERN.captures(line) {
1515                let id = cap.get(1).unwrap().as_str().to_lowercase();
1516                let url = cap.get(2).unwrap().as_str().to_string();
1517                let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
1518
1519                // Calculate byte positions
1520                // The match starts at the beginning of the line (0) and extends to the end
1521                let match_obj = cap.get(0).unwrap();
1522                let byte_offset = line_info.byte_offset + match_obj.start();
1523                let byte_end = line_info.byte_offset + match_obj.end();
1524
1525                refs.push(ReferenceDef {
1526                    line: line_num,
1527                    id,
1528                    url,
1529                    title,
1530                    byte_offset,
1531                    byte_end,
1532                });
1533            }
1534        }
1535
1536        refs
1537    }
1538
1539    /// Fast blockquote prefix parser - replaces regex for 5-10x speedup
1540    /// Handles nested blockquotes like `> > > content`
1541    /// Returns: Some((prefix_with_ws, content_after_prefix)) or None
1542    #[inline]
1543    fn parse_blockquote_prefix(line: &str) -> Option<(&str, &str)> {
1544        let trimmed_start = line.trim_start();
1545        if !trimmed_start.starts_with('>') {
1546            return None;
1547        }
1548
1549        // Track total prefix length to handle nested blockquotes
1550        let mut remaining = line;
1551        let mut total_prefix_len = 0;
1552
1553        loop {
1554            let trimmed = remaining.trim_start();
1555            if !trimmed.starts_with('>') {
1556                break;
1557            }
1558
1559            // Add leading whitespace + '>' to prefix
1560            let leading_ws_len = remaining.len() - trimmed.len();
1561            total_prefix_len += leading_ws_len + 1;
1562
1563            let after_gt = &trimmed[1..];
1564
1565            // Handle optional whitespace after '>' (space or tab)
1566            if let Some(stripped) = after_gt.strip_prefix(' ') {
1567                total_prefix_len += 1;
1568                remaining = stripped;
1569            } else if let Some(stripped) = after_gt.strip_prefix('\t') {
1570                total_prefix_len += 1;
1571                remaining = stripped;
1572            } else {
1573                remaining = after_gt;
1574            }
1575        }
1576
1577        Some((&line[..total_prefix_len], remaining))
1578    }
1579
1580    /// Fast unordered list parser - replaces regex for 5-10x speedup
1581    /// Matches: ^(\s*)([-*+])([ \t]*)(.*)
1582    /// Returns: Some((leading_ws, marker, spacing, content)) or None
1583    #[inline]
1584    fn parse_unordered_list(line: &str) -> Option<(&str, char, &str, &str)> {
1585        let bytes = line.as_bytes();
1586        let mut i = 0;
1587
1588        // Skip leading whitespace
1589        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1590            i += 1;
1591        }
1592
1593        // Check for marker
1594        if i >= bytes.len() {
1595            return None;
1596        }
1597        let marker = bytes[i] as char;
1598        if marker != '-' && marker != '*' && marker != '+' {
1599            return None;
1600        }
1601        let marker_pos = i;
1602        i += 1;
1603
1604        // Collect spacing after marker (space or tab only)
1605        let spacing_start = i;
1606        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1607            i += 1;
1608        }
1609
1610        Some((&line[..marker_pos], marker, &line[spacing_start..i], &line[i..]))
1611    }
1612
1613    /// Fast ordered list parser - replaces regex for 5-10x speedup
1614    /// Matches: ^(\s*)(\d+)([.)])([ \t]*)(.*)
1615    /// Returns: Some((leading_ws, number_str, delimiter, spacing, content)) or None
1616    #[inline]
1617    fn parse_ordered_list(line: &str) -> Option<(&str, &str, char, &str, &str)> {
1618        let bytes = line.as_bytes();
1619        let mut i = 0;
1620
1621        // Skip leading whitespace
1622        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1623            i += 1;
1624        }
1625
1626        // Collect digits
1627        let number_start = i;
1628        while i < bytes.len() && bytes[i].is_ascii_digit() {
1629            i += 1;
1630        }
1631        if i == number_start {
1632            return None; // No digits found
1633        }
1634
1635        // Check for delimiter
1636        if i >= bytes.len() {
1637            return None;
1638        }
1639        let delimiter = bytes[i] as char;
1640        if delimiter != '.' && delimiter != ')' {
1641            return None;
1642        }
1643        let delimiter_pos = i;
1644        i += 1;
1645
1646        // Collect spacing after delimiter (space or tab only)
1647        let spacing_start = i;
1648        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1649            i += 1;
1650        }
1651
1652        Some((
1653            &line[..number_start],
1654            &line[number_start..delimiter_pos],
1655            delimiter,
1656            &line[spacing_start..i],
1657            &line[i..],
1658        ))
1659    }
1660
1661    /// Pre-compute which lines are in code blocks - O(m*n) where m=code_blocks, n=lines
1662    /// Returns a Vec<bool> where index i indicates if line i is in a code block
1663    fn compute_code_block_line_map(content: &str, line_offsets: &[usize], code_blocks: &[(usize, usize)]) -> Vec<bool> {
1664        let num_lines = line_offsets.len();
1665        let mut in_code_block = vec![false; num_lines];
1666
1667        // For each code block, mark all lines within it
1668        for &(start, end) in code_blocks {
1669            // Ensure we're at valid UTF-8 boundaries
1670            let safe_start = if start > 0 && !content.is_char_boundary(start) {
1671                let mut boundary = start;
1672                while boundary > 0 && !content.is_char_boundary(boundary) {
1673                    boundary -= 1;
1674                }
1675                boundary
1676            } else {
1677                start
1678            };
1679
1680            let safe_end = if end < content.len() && !content.is_char_boundary(end) {
1681                let mut boundary = end;
1682                while boundary < content.len() && !content.is_char_boundary(boundary) {
1683                    boundary += 1;
1684                }
1685                boundary
1686            } else {
1687                end.min(content.len())
1688            };
1689
1690            // Trust the code blocks detected by CodeBlockUtils::detect_code_blocks()
1691            // That function now has proper list context awareness (see code_block_utils.rs)
1692            // and correctly distinguishes between:
1693            // - Fenced code blocks (``` or ~~~)
1694            // - Indented code blocks at document level (4 spaces + blank line before)
1695            // - List continuation paragraphs (NOT code blocks, even with 4 spaces)
1696            //
1697            // We no longer need to re-validate here. The original validation logic
1698            // was causing false positives by marking list continuation paragraphs as
1699            // code blocks when they have 4 spaces of indentation.
1700
1701            // Use binary search to find the first and last line indices
1702            // line_offsets is sorted, so we can use partition_point for O(log n) lookup
1703            // Use safe_start/safe_end (UTF-8 boundaries) for consistent line mapping
1704            //
1705            // Find the line that CONTAINS safe_start: the line with the largest
1706            // start offset that is <= safe_start. partition_point gives us the
1707            // first line that starts AFTER safe_start, so we subtract 1.
1708            let first_line_after = line_offsets.partition_point(|&offset| offset <= safe_start);
1709            let first_line = first_line_after.saturating_sub(1);
1710            let last_line = line_offsets.partition_point(|&offset| offset < safe_end);
1711
1712            // Mark all lines in the range at once
1713            for flag in in_code_block.iter_mut().take(last_line).skip(first_line) {
1714                *flag = true;
1715            }
1716        }
1717
1718        in_code_block
1719    }
1720
1721    /// Pre-compute basic line information (without headings/blockquotes)
1722    fn compute_basic_line_info(
1723        content: &str,
1724        line_offsets: &[usize],
1725        code_blocks: &[(usize, usize)],
1726        flavor: MarkdownFlavor,
1727        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1728        autodoc_ranges: &[crate::utils::skip_context::ByteRange],
1729    ) -> Vec<LineInfo> {
1730        let content_lines: Vec<&str> = content.lines().collect();
1731        let mut lines = Vec::with_capacity(content_lines.len());
1732
1733        // Pre-compute which lines are in code blocks
1734        let code_block_map = Self::compute_code_block_line_map(content, line_offsets, code_blocks);
1735
1736        // Detect front matter boundaries FIRST, before any other parsing
1737        // Use FrontMatterUtils to detect all types of front matter (YAML, TOML, JSON, malformed)
1738        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1739
1740        for (i, line) in content_lines.iter().enumerate() {
1741            let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
1742            let indent = line.len() - line.trim_start().len();
1743
1744            // Parse blockquote prefix once and reuse it (avoid redundant parsing)
1745            let blockquote_parse = Self::parse_blockquote_prefix(line);
1746
1747            // For blank detection, consider blockquote context
1748            let is_blank = if let Some((_, content)) = blockquote_parse {
1749                // In blockquote context, check if content after prefix is blank
1750                content.trim().is_empty()
1751            } else {
1752                line.trim().is_empty()
1753            };
1754
1755            // Use pre-computed map for O(1) lookup instead of O(m) iteration
1756            let in_code_block = code_block_map.get(i).copied().unwrap_or(false);
1757
1758            // Detect list items (skip if in frontmatter, in mkdocstrings block, or in HTML comment)
1759            let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
1760                && crate::utils::mkdocstrings_refs::is_within_autodoc_block_ranges(autodoc_ranges, byte_offset);
1761            // Check if the ENTIRE line is within an HTML comment (not just the line start)
1762            // This ensures content after `-->` on the same line is not incorrectly skipped
1763            let line_end_offset = byte_offset + line.len();
1764            let in_html_comment = crate::utils::skip_context::is_line_entirely_in_html_comment(
1765                html_comment_ranges,
1766                byte_offset,
1767                line_end_offset,
1768            );
1769            let list_item = if !(in_code_block
1770                || is_blank
1771                || in_mkdocstrings
1772                || in_html_comment
1773                || (front_matter_end > 0 && i < front_matter_end))
1774            {
1775                // Strip blockquote prefix if present for list detection (reuse cached result)
1776                let (line_for_list_check, blockquote_prefix_len) = if let Some((prefix, content)) = blockquote_parse {
1777                    (content, prefix.len())
1778                } else {
1779                    (&**line, 0)
1780                };
1781
1782                if let Some((leading_spaces, marker, spacing, _content)) =
1783                    Self::parse_unordered_list(line_for_list_check)
1784                {
1785                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1786                    let content_column = marker_column + 1 + spacing.len();
1787
1788                    // According to CommonMark spec, unordered list items MUST have at least one space
1789                    // after the marker (-, *, or +). Without a space, it's not a list item.
1790                    // This also naturally handles cases like:
1791                    // - *emphasis* (not a list)
1792                    // - **bold** (not a list)
1793                    // - --- (horizontal rule, not a list)
1794                    if spacing.is_empty() {
1795                        None
1796                    } else {
1797                        Some(ListItemInfo {
1798                            marker: marker.to_string(),
1799                            is_ordered: false,
1800                            number: None,
1801                            marker_column,
1802                            content_column,
1803                        })
1804                    }
1805                } else if let Some((leading_spaces, number_str, delimiter, spacing, _content)) =
1806                    Self::parse_ordered_list(line_for_list_check)
1807                {
1808                    let marker = format!("{number_str}{delimiter}");
1809                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1810                    let content_column = marker_column + marker.len() + spacing.len();
1811
1812                    // According to CommonMark spec, ordered list items MUST have at least one space
1813                    // after the marker (period or parenthesis). Without a space, it's not a list item.
1814                    if spacing.is_empty() {
1815                        None
1816                    } else {
1817                        Some(ListItemInfo {
1818                            marker,
1819                            is_ordered: true,
1820                            number: number_str.parse().ok(),
1821                            marker_column,
1822                            content_column,
1823                        })
1824                    }
1825                } else {
1826                    None
1827                }
1828            } else {
1829                None
1830            };
1831
1832            lines.push(LineInfo {
1833                byte_offset,
1834                byte_len: line.len(),
1835                indent,
1836                is_blank,
1837                in_code_block,
1838                in_front_matter: front_matter_end > 0 && i < front_matter_end,
1839                in_html_block: false, // Will be populated after line creation
1840                in_html_comment,
1841                list_item,
1842                heading: None,    // Will be populated in second pass for Setext headings
1843                blockquote: None, // Will be populated after line creation
1844                in_mkdocstrings,
1845                in_esm_block: false, // Will be populated after line creation for MDX files
1846                in_code_span_continuation: false, // Will be populated after code spans are parsed
1847            });
1848        }
1849
1850        lines
1851    }
1852
1853    /// Detect headings and blockquotes (called after HTML block detection)
1854    fn detect_headings_and_blockquotes(
1855        content: &str,
1856        lines: &mut [LineInfo],
1857        flavor: MarkdownFlavor,
1858        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1859    ) {
1860        // Regex for heading detection
1861        static ATX_HEADING_REGEX: LazyLock<regex::Regex> =
1862            LazyLock::new(|| regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap());
1863        static SETEXT_UNDERLINE_REGEX: LazyLock<regex::Regex> =
1864            LazyLock::new(|| regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap());
1865
1866        let content_lines: Vec<&str> = content.lines().collect();
1867
1868        // Detect front matter boundaries to skip those lines
1869        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1870
1871        // Detect headings (including Setext which needs look-ahead) and blockquotes
1872        for i in 0..lines.len() {
1873            if lines[i].in_code_block {
1874                continue;
1875            }
1876
1877            // Skip lines in front matter
1878            if front_matter_end > 0 && i < front_matter_end {
1879                continue;
1880            }
1881
1882            // Skip lines in HTML blocks - HTML content should not be parsed as markdown
1883            if lines[i].in_html_block {
1884                continue;
1885            }
1886
1887            let line = content_lines[i];
1888
1889            // Check for blockquotes (even on blank lines within blockquotes)
1890            if let Some(bq) = parse_blockquote_detailed(line) {
1891                let nesting_level = bq.markers.len(); // Each '>' is one level
1892                let marker_column = bq.indent.len();
1893
1894                // Build the prefix (indentation + markers + space)
1895                let prefix = format!("{}{}{}", bq.indent, bq.markers, bq.spaces_after);
1896
1897                // Check for various blockquote issues
1898                let has_no_space = bq.spaces_after.is_empty() && !bq.content.is_empty();
1899                // Only flag multiple literal spaces, not tabs
1900                // Tabs are handled by MD010 (no-hard-tabs), matching markdownlint behavior
1901                let has_multiple_spaces = bq.spaces_after.chars().filter(|&c| c == ' ').count() > 1;
1902
1903                // Check if needs MD028 fix (empty blockquote line without proper spacing)
1904                // MD028 flags empty blockquote lines that don't have a single space after the marker
1905                // Lines like "> " or ">> " are already correct and don't need fixing
1906                let needs_md028_fix = bq.content.is_empty() && bq.spaces_after.is_empty();
1907
1908                lines[i].blockquote = Some(BlockquoteInfo {
1909                    nesting_level,
1910                    indent: bq.indent.to_string(),
1911                    marker_column,
1912                    prefix,
1913                    content: bq.content.to_string(),
1914                    has_no_space_after_marker: has_no_space,
1915                    has_multiple_spaces_after_marker: has_multiple_spaces,
1916                    needs_md028_fix,
1917                });
1918            }
1919
1920            // Skip heading detection for blank lines
1921            if lines[i].is_blank {
1922                continue;
1923            }
1924
1925            // Check for ATX headings (but skip MkDocs snippet lines)
1926            // In MkDocs flavor, lines like "# -8<- [start:name]" are snippet markers, not headings
1927            let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
1928                crate::utils::mkdocs_snippets::is_snippet_section_start(line)
1929                    || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
1930            } else {
1931                false
1932            };
1933
1934            if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1935                // Skip headings inside HTML comments (using pre-computed ranges for efficiency)
1936                if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset) {
1937                    continue;
1938                }
1939                let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1940                let hashes = caps.get(2).map_or("", |m| m.as_str());
1941                let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1942                let rest = caps.get(4).map_or("", |m| m.as_str());
1943
1944                let level = hashes.len() as u8;
1945                let marker_column = leading_spaces.len();
1946
1947                // Check for closing sequence, but handle custom IDs that might come after
1948                let (text, has_closing, closing_seq) = {
1949                    // First check if there's a custom ID at the end
1950                    let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1951                        // Check if this looks like a valid custom ID (ends with })
1952                        if rest[id_start..].trim_end().ends_with('}') {
1953                            // Split off the custom ID
1954                            (&rest[..id_start], &rest[id_start..])
1955                        } else {
1956                            (rest, "")
1957                        }
1958                    } else {
1959                        (rest, "")
1960                    };
1961
1962                    // Now look for closing hashes in the part before the custom ID
1963                    let trimmed_rest = rest_without_id.trim_end();
1964                    if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1965                        // Look for the start of the hash sequence
1966                        let mut start_of_hashes = last_hash_pos;
1967                        while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1968                            start_of_hashes -= 1;
1969                        }
1970
1971                        // Check if there's at least one space before the closing hashes
1972                        let has_space_before = start_of_hashes == 0
1973                            || trimmed_rest
1974                                .chars()
1975                                .nth(start_of_hashes - 1)
1976                                .is_some_and(|c| c.is_whitespace());
1977
1978                        // Check if this is a valid closing sequence (all hashes to end of trimmed part)
1979                        let potential_closing = &trimmed_rest[start_of_hashes..];
1980                        let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1981
1982                        if is_all_hashes && has_space_before {
1983                            // This is a closing sequence
1984                            let closing_hashes = potential_closing.to_string();
1985                            // The text is everything before the closing hashes
1986                            // Don't include the custom ID here - it will be extracted later
1987                            let text_part = if !custom_id_part.is_empty() {
1988                                // If we have a custom ID, append it back to get the full rest
1989                                // This allows the extract_header_id function to handle it properly
1990                                format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1991                            } else {
1992                                rest_without_id[..start_of_hashes].trim_end().to_string()
1993                            };
1994                            (text_part, true, closing_hashes)
1995                        } else {
1996                            // Not a valid closing sequence, return the full content
1997                            (rest.to_string(), false, String::new())
1998                        }
1999                    } else {
2000                        // No hashes found, return the full content
2001                        (rest.to_string(), false, String::new())
2002                    }
2003                };
2004
2005                let content_column = marker_column + hashes.len() + spaces_after.len();
2006
2007                // Extract custom header ID if present
2008                let raw_text = text.trim().to_string();
2009                let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
2010
2011                // If no custom ID was found on the header line, check the next line for standalone attr-list
2012                if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
2013                    let next_line = content_lines[i + 1];
2014                    if !lines[i + 1].in_code_block
2015                        && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
2016                        && let Some(next_line_id) =
2017                            crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
2018                    {
2019                        custom_id = Some(next_line_id);
2020                    }
2021                }
2022
2023                lines[i].heading = Some(HeadingInfo {
2024                    level,
2025                    style: HeadingStyle::ATX,
2026                    marker: hashes.to_string(),
2027                    marker_column,
2028                    content_column,
2029                    text: clean_text,
2030                    custom_id,
2031                    raw_text,
2032                    has_closing_sequence: has_closing,
2033                    closing_sequence: closing_seq,
2034                });
2035            }
2036            // Check for Setext headings (need to look at next line)
2037            else if i + 1 < content_lines.len() && i + 1 < lines.len() {
2038                let next_line = content_lines[i + 1];
2039                if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
2040                    // Skip if next line is front matter delimiter
2041                    if front_matter_end > 0 && i < front_matter_end {
2042                        continue;
2043                    }
2044
2045                    // Skip Setext headings inside HTML comments (using pre-computed ranges for efficiency)
2046                    if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset)
2047                    {
2048                        continue;
2049                    }
2050
2051                    let underline = next_line.trim();
2052
2053                    let level = if underline.starts_with('=') { 1 } else { 2 };
2054                    let style = if level == 1 {
2055                        HeadingStyle::Setext1
2056                    } else {
2057                        HeadingStyle::Setext2
2058                    };
2059
2060                    // Extract custom header ID if present
2061                    let raw_text = line.trim().to_string();
2062                    let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
2063
2064                    // If no custom ID was found on the header line, check the line after underline for standalone attr-list
2065                    if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
2066                        let attr_line = content_lines[i + 2];
2067                        if !lines[i + 2].in_code_block
2068                            && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
2069                            && let Some(attr_line_id) =
2070                                crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
2071                        {
2072                            custom_id = Some(attr_line_id);
2073                        }
2074                    }
2075
2076                    lines[i].heading = Some(HeadingInfo {
2077                        level,
2078                        style,
2079                        marker: underline.to_string(),
2080                        marker_column: next_line.len() - next_line.trim_start().len(),
2081                        content_column: lines[i].indent,
2082                        text: clean_text,
2083                        custom_id,
2084                        raw_text,
2085                        has_closing_sequence: false,
2086                        closing_sequence: String::new(),
2087                    });
2088                }
2089            }
2090        }
2091    }
2092
2093    /// Detect HTML blocks in the content
2094    fn detect_html_blocks(content: &str, lines: &mut [LineInfo]) {
2095        // HTML block elements that trigger block context
2096        // Includes HTML5 media, embedded content, and interactive elements
2097        const BLOCK_ELEMENTS: &[&str] = &[
2098            "address",
2099            "article",
2100            "aside",
2101            "audio",
2102            "blockquote",
2103            "canvas",
2104            "details",
2105            "dialog",
2106            "dd",
2107            "div",
2108            "dl",
2109            "dt",
2110            "embed",
2111            "fieldset",
2112            "figcaption",
2113            "figure",
2114            "footer",
2115            "form",
2116            "h1",
2117            "h2",
2118            "h3",
2119            "h4",
2120            "h5",
2121            "h6",
2122            "header",
2123            "hr",
2124            "iframe",
2125            "li",
2126            "main",
2127            "menu",
2128            "nav",
2129            "noscript",
2130            "object",
2131            "ol",
2132            "p",
2133            "picture",
2134            "pre",
2135            "script",
2136            "search",
2137            "section",
2138            "source",
2139            "style",
2140            "summary",
2141            "svg",
2142            "table",
2143            "tbody",
2144            "td",
2145            "template",
2146            "textarea",
2147            "tfoot",
2148            "th",
2149            "thead",
2150            "tr",
2151            "track",
2152            "ul",
2153            "video",
2154        ];
2155
2156        let mut i = 0;
2157        while i < lines.len() {
2158            // Skip if already in code block or front matter
2159            if lines[i].in_code_block || lines[i].in_front_matter {
2160                i += 1;
2161                continue;
2162            }
2163
2164            let trimmed = lines[i].content(content).trim_start();
2165
2166            // Check if line starts with an HTML tag
2167            if trimmed.starts_with('<') && trimmed.len() > 1 {
2168                // Extract tag name safely
2169                let after_bracket = &trimmed[1..];
2170                let is_closing = after_bracket.starts_with('/');
2171                let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
2172
2173                // Extract tag name (stop at space, >, /, or end of string)
2174                let tag_name = tag_start
2175                    .chars()
2176                    .take_while(|c| c.is_ascii_alphabetic() || *c == '-' || c.is_ascii_digit())
2177                    .collect::<String>()
2178                    .to_lowercase();
2179
2180                // Check if it's a block element
2181                if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
2182                    // Mark this line as in HTML block
2183                    lines[i].in_html_block = true;
2184
2185                    // For simplicity, just mark lines until we find a closing tag or reach a blank line
2186                    // This avoids complex nesting logic that might cause infinite loops
2187                    if !is_closing {
2188                        let closing_tag = format!("</{tag_name}>");
2189                        // style and script tags can contain blank lines (CSS/JS formatting)
2190                        let allow_blank_lines = tag_name == "style" || tag_name == "script";
2191                        let mut j = i + 1;
2192                        while j < lines.len() && j < i + 100 {
2193                            // Limit search to 100 lines
2194                            // Stop at blank lines (except for style/script tags)
2195                            if !allow_blank_lines && lines[j].is_blank {
2196                                break;
2197                            }
2198
2199                            lines[j].in_html_block = true;
2200
2201                            // Check if this line contains the closing tag
2202                            if lines[j].content(content).contains(&closing_tag) {
2203                                break;
2204                            }
2205                            j += 1;
2206                        }
2207                    }
2208                }
2209            }
2210
2211            i += 1;
2212        }
2213    }
2214
2215    /// Detect ESM import/export blocks in MDX files
2216    /// ESM blocks consist of contiguous import/export statements at the top of the file
2217    fn detect_esm_blocks(content: &str, lines: &mut [LineInfo], flavor: MarkdownFlavor) {
2218        // Only process MDX files
2219        if !flavor.supports_esm_blocks() {
2220            return;
2221        }
2222
2223        let mut in_multiline_comment = false;
2224
2225        for line in lines.iter_mut() {
2226            // Skip blank lines and HTML comments
2227            if line.is_blank || line.in_html_comment {
2228                continue;
2229            }
2230
2231            let trimmed = line.content(content).trim_start();
2232
2233            // Handle continuation of multi-line JS comments
2234            if in_multiline_comment {
2235                if trimmed.contains("*/") {
2236                    in_multiline_comment = false;
2237                }
2238                continue;
2239            }
2240
2241            // Skip single-line JS comments (// and ///)
2242            if trimmed.starts_with("//") {
2243                continue;
2244            }
2245
2246            // Handle start of multi-line JS comment
2247            if trimmed.starts_with("/*") {
2248                if !trimmed.contains("*/") {
2249                    in_multiline_comment = true;
2250                }
2251                continue;
2252            }
2253
2254            // Check if line starts with import or export
2255            if trimmed.starts_with("import ") || trimmed.starts_with("export ") {
2256                line.in_esm_block = true;
2257            } else {
2258                // Once we hit a non-ESM, non-comment line, we're done with the ESM block
2259                break;
2260            }
2261        }
2262    }
2263
2264    /// Parse all inline code spans in the content using pulldown-cmark streaming parser
2265    fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
2266        let mut code_spans = Vec::new();
2267
2268        // Quick check - if no backticks, no code spans
2269        if !content.contains('`') {
2270            return code_spans;
2271        }
2272
2273        // Use pulldown-cmark's streaming parser with byte offsets
2274        let parser = Parser::new(content).into_offset_iter();
2275
2276        for (event, range) in parser {
2277            if let Event::Code(_) = event {
2278                let start_pos = range.start;
2279                let end_pos = range.end;
2280
2281                // The range includes the backticks, extract the actual content
2282                let full_span = &content[start_pos..end_pos];
2283                let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
2284
2285                // Extract content between backticks, preserving spaces
2286                let content_start = start_pos + backtick_count;
2287                let content_end = end_pos - backtick_count;
2288                let span_content = if content_start < content_end {
2289                    content[content_start..content_end].to_string()
2290                } else {
2291                    String::new()
2292                };
2293
2294                // Use binary search to find line number - O(log n) instead of O(n)
2295                // Find the rightmost line whose byte_offset <= start_pos
2296                let line_idx = lines
2297                    .partition_point(|line| line.byte_offset <= start_pos)
2298                    .saturating_sub(1);
2299                let line_num = line_idx + 1;
2300                let byte_col_start = start_pos - lines[line_idx].byte_offset;
2301
2302                // Find end column using binary search
2303                let end_line_idx = lines
2304                    .partition_point(|line| line.byte_offset <= end_pos)
2305                    .saturating_sub(1);
2306                let byte_col_end = end_pos - lines[end_line_idx].byte_offset;
2307
2308                // Convert byte offsets to character positions for correct Unicode handling
2309                // This ensures consistency with warning.column which uses character positions
2310                let line_content = lines[line_idx].content(content);
2311                let col_start = if byte_col_start <= line_content.len() {
2312                    line_content[..byte_col_start].chars().count()
2313                } else {
2314                    line_content.chars().count()
2315                };
2316
2317                let end_line_content = lines[end_line_idx].content(content);
2318                let col_end = if byte_col_end <= end_line_content.len() {
2319                    end_line_content[..byte_col_end].chars().count()
2320                } else {
2321                    end_line_content.chars().count()
2322                };
2323
2324                code_spans.push(CodeSpan {
2325                    line: line_num,
2326                    end_line: end_line_idx + 1,
2327                    start_col: col_start,
2328                    end_col: col_end,
2329                    byte_offset: start_pos,
2330                    byte_end: end_pos,
2331                    backtick_count,
2332                    content: span_content,
2333                });
2334            }
2335        }
2336
2337        // Sort by position to ensure consistent ordering
2338        code_spans.sort_by_key(|span| span.byte_offset);
2339
2340        code_spans
2341    }
2342
2343    /// Parse all list blocks in the content (legacy line-by-line approach)
2344    ///
2345    /// Uses a forward-scanning O(n) algorithm that tracks two variables during iteration:
2346    /// - `has_list_breaking_content_since_last_item`: Set when encountering content that
2347    ///   terminates a list (headings, horizontal rules, tables, insufficiently indented content)
2348    /// - `min_continuation_for_tracking`: Minimum indentation required for content to be
2349    ///   treated as list continuation (based on the list marker width)
2350    ///
2351    /// When a new list item is encountered, we check if list-breaking content was seen
2352    /// since the last item. If so, we start a new list block.
2353    fn parse_list_blocks(content: &str, lines: &[LineInfo]) -> Vec<ListBlock> {
2354        // Minimum indentation for unordered list continuation per CommonMark spec
2355        const UNORDERED_LIST_MIN_CONTINUATION_INDENT: usize = 2;
2356
2357        /// Initialize or reset the forward-scanning tracking state.
2358        /// This helper eliminates code duplication across three initialization sites.
2359        #[inline]
2360        fn reset_tracking_state(
2361            list_item: &ListItemInfo,
2362            has_list_breaking_content: &mut bool,
2363            min_continuation: &mut usize,
2364        ) {
2365            *has_list_breaking_content = false;
2366            let marker_width = if list_item.is_ordered {
2367                list_item.marker.len() + 1 // Ordered markers need space after period/paren
2368            } else {
2369                list_item.marker.len()
2370            };
2371            *min_continuation = if list_item.is_ordered {
2372                marker_width
2373            } else {
2374                UNORDERED_LIST_MIN_CONTINUATION_INDENT
2375            };
2376        }
2377
2378        // Pre-size based on lines that could be list items
2379        let mut list_blocks = Vec::with_capacity(lines.len() / 10); // Estimate ~10% of lines might start list blocks
2380        let mut current_block: Option<ListBlock> = None;
2381        let mut last_list_item_line = 0;
2382        let mut current_indent_level = 0;
2383        let mut last_marker_width = 0;
2384
2385        // Track list-breaking content since last item (fixes O(n²) bottleneck from issue #148)
2386        let mut has_list_breaking_content_since_last_item = false;
2387        let mut min_continuation_for_tracking = 0;
2388
2389        for (line_idx, line_info) in lines.iter().enumerate() {
2390            let line_num = line_idx + 1;
2391
2392            // Enhanced code block handling using Design #3's context analysis
2393            if line_info.in_code_block {
2394                if let Some(ref mut block) = current_block {
2395                    // Calculate minimum indentation for list continuation
2396                    let min_continuation_indent =
2397                        CodeBlockUtils::calculate_min_continuation_indent(content, lines, line_idx);
2398
2399                    // Analyze code block context using the three-tier classification
2400                    let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
2401
2402                    match context {
2403                        CodeBlockContext::Indented => {
2404                            // Code block is properly indented - continues the list
2405                            block.end_line = line_num;
2406                            continue;
2407                        }
2408                        CodeBlockContext::Standalone => {
2409                            // Code block separates lists - end current block
2410                            let completed_block = current_block.take().unwrap();
2411                            list_blocks.push(completed_block);
2412                            continue;
2413                        }
2414                        CodeBlockContext::Adjacent => {
2415                            // Edge case - use conservative behavior (continue list)
2416                            block.end_line = line_num;
2417                            continue;
2418                        }
2419                    }
2420                } else {
2421                    // No current list block - skip code block lines
2422                    continue;
2423                }
2424            }
2425
2426            // Extract blockquote prefix if any
2427            let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(line_info.content(content)) {
2428                caps.get(0).unwrap().as_str().to_string()
2429            } else {
2430                String::new()
2431            };
2432
2433            // Track list-breaking content for non-list, non-blank lines (O(n) replacement for nested loop)
2434            // Skip lines that are continuations of multi-line code spans - they're part of the previous list item
2435            if current_block.is_some()
2436                && line_info.list_item.is_none()
2437                && !line_info.is_blank
2438                && !line_info.in_code_span_continuation
2439            {
2440                let line_content = line_info.content(content).trim();
2441
2442                // Check for structural separators that break lists
2443                // Note: Lazy continuation (indent=0) is valid in CommonMark and should NOT break lists.
2444                // Only lines with indent between 1 and min_continuation_for_tracking-1 break lists,
2445                // as they indicate improper indentation rather than lazy continuation.
2446                let is_lazy_continuation = line_info.indent == 0 && !line_info.is_blank;
2447                let breaks_list = line_info.heading.is_some()
2448                    || line_content.starts_with("---")
2449                    || line_content.starts_with("***")
2450                    || line_content.starts_with("___")
2451                    || crate::utils::skip_context::is_table_line(line_content)
2452                    || line_content.starts_with(">")
2453                    || (line_info.indent > 0
2454                        && line_info.indent < min_continuation_for_tracking
2455                        && !is_lazy_continuation);
2456
2457                if breaks_list {
2458                    has_list_breaking_content_since_last_item = true;
2459                }
2460            }
2461
2462            // If this line is a code span continuation within an active list block,
2463            // extend the block's end_line to include this line (maintains list continuity)
2464            if line_info.in_code_span_continuation
2465                && line_info.list_item.is_none()
2466                && let Some(ref mut block) = current_block
2467            {
2468                block.end_line = line_num;
2469            }
2470
2471            // Extend block.end_line for regular continuation lines (non-list-item, non-blank,
2472            // properly indented lines within the list). This ensures the workaround at line 2448
2473            // works correctly when there are multiple continuation lines before a nested list item.
2474            // Also include lazy continuation lines (indent=0) per CommonMark spec.
2475            let is_valid_continuation =
2476                line_info.indent >= min_continuation_for_tracking || (line_info.indent == 0 && !line_info.is_blank); // Lazy continuation
2477            if !line_info.in_code_span_continuation
2478                && line_info.list_item.is_none()
2479                && !line_info.is_blank
2480                && !line_info.in_code_block
2481                && is_valid_continuation
2482                && let Some(ref mut block) = current_block
2483            {
2484                block.end_line = line_num;
2485            }
2486
2487            // Check if this line is a list item
2488            if let Some(list_item) = &line_info.list_item {
2489                // Calculate nesting level based on indentation
2490                let item_indent = list_item.marker_column;
2491                let nesting = item_indent / 2; // Assume 2-space indentation for nesting
2492
2493                if let Some(ref mut block) = current_block {
2494                    // Check if this continues the current block
2495                    // For nested lists, we need to check if this is a nested item (higher nesting level)
2496                    // or a continuation at the same or lower level
2497                    let is_nested = nesting > block.nesting_level;
2498                    let same_type =
2499                        (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
2500                    let same_context = block.blockquote_prefix == blockquote_prefix;
2501                    // Allow one blank line after last item, or lines immediately after block content
2502                    let reasonable_distance = line_num <= last_list_item_line + 2 || line_num == block.end_line + 1;
2503
2504                    // For unordered lists, also check marker consistency
2505                    let marker_compatible =
2506                        block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
2507
2508                    // O(1) check: Use the tracked variable instead of O(n) nested loop
2509                    // This eliminates the quadratic bottleneck from issue #148
2510                    let has_non_list_content = has_list_breaking_content_since_last_item;
2511
2512                    // A list continues if:
2513                    // 1. It's a nested item (indented more than the parent), OR
2514                    // 2. It's the same type at the same level with reasonable distance
2515                    let mut continues_list = if is_nested {
2516                        // Nested items always continue the list if they're in the same context
2517                        same_context && reasonable_distance && !has_non_list_content
2518                    } else {
2519                        // Same-level items need to match type and markers
2520                        same_type && same_context && reasonable_distance && marker_compatible && !has_non_list_content
2521                    };
2522
2523                    // WORKAROUND: If items are truly consecutive (no blank lines), they MUST be in the same list
2524                    // This handles edge cases where content patterns might otherwise split lists incorrectly
2525                    if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
2526                        // Check if the previous line was a list item or a continuation of a list item
2527                        // (including lazy continuation lines)
2528                        if block.item_lines.contains(&(line_num - 1)) {
2529                            // They're consecutive list items - force them to be in the same list
2530                            continues_list = true;
2531                        } else {
2532                            // Previous line is a continuation line within this block
2533                            // (e.g., lazy continuation with indent=0)
2534                            // Since block.end_line == line_num - 1, we know line_num - 1 is part of this block
2535                            continues_list = true;
2536                        }
2537                    }
2538
2539                    if continues_list {
2540                        // Extend current block
2541                        block.end_line = line_num;
2542                        block.item_lines.push(line_num);
2543
2544                        // Update max marker width
2545                        block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
2546                            list_item.marker.len() + 1
2547                        } else {
2548                            list_item.marker.len()
2549                        });
2550
2551                        // Update marker consistency for unordered lists
2552                        if !block.is_ordered
2553                            && block.marker.is_some()
2554                            && block.marker.as_ref() != Some(&list_item.marker)
2555                        {
2556                            // Mixed markers, clear the marker field
2557                            block.marker = None;
2558                        }
2559
2560                        // Reset tracked state for issue #148 optimization
2561                        reset_tracking_state(
2562                            list_item,
2563                            &mut has_list_breaking_content_since_last_item,
2564                            &mut min_continuation_for_tracking,
2565                        );
2566                    } else {
2567                        // End current block and start a new one
2568
2569                        list_blocks.push(block.clone());
2570
2571                        *block = ListBlock {
2572                            start_line: line_num,
2573                            end_line: line_num,
2574                            is_ordered: list_item.is_ordered,
2575                            marker: if list_item.is_ordered {
2576                                None
2577                            } else {
2578                                Some(list_item.marker.clone())
2579                            },
2580                            blockquote_prefix: blockquote_prefix.clone(),
2581                            item_lines: vec![line_num],
2582                            nesting_level: nesting,
2583                            max_marker_width: if list_item.is_ordered {
2584                                list_item.marker.len() + 1
2585                            } else {
2586                                list_item.marker.len()
2587                            },
2588                        };
2589
2590                        // Initialize tracked state for new block (issue #148 optimization)
2591                        reset_tracking_state(
2592                            list_item,
2593                            &mut has_list_breaking_content_since_last_item,
2594                            &mut min_continuation_for_tracking,
2595                        );
2596                    }
2597                } else {
2598                    // Start a new block
2599                    current_block = Some(ListBlock {
2600                        start_line: line_num,
2601                        end_line: line_num,
2602                        is_ordered: list_item.is_ordered,
2603                        marker: if list_item.is_ordered {
2604                            None
2605                        } else {
2606                            Some(list_item.marker.clone())
2607                        },
2608                        blockquote_prefix,
2609                        item_lines: vec![line_num],
2610                        nesting_level: nesting,
2611                        max_marker_width: list_item.marker.len(),
2612                    });
2613
2614                    // Initialize tracked state for new block (issue #148 optimization)
2615                    reset_tracking_state(
2616                        list_item,
2617                        &mut has_list_breaking_content_since_last_item,
2618                        &mut min_continuation_for_tracking,
2619                    );
2620                }
2621
2622                last_list_item_line = line_num;
2623                current_indent_level = item_indent;
2624                last_marker_width = if list_item.is_ordered {
2625                    list_item.marker.len() + 1 // Add 1 for the space after ordered list markers
2626                } else {
2627                    list_item.marker.len()
2628                };
2629            } else if let Some(ref mut block) = current_block {
2630                // Not a list item - check if it continues the current block
2631
2632                // For MD032 compatibility, we use a simple approach:
2633                // - Indented lines continue the list
2634                // - Blank lines followed by indented content continue the list
2635                // - Everything else ends the list
2636
2637                // Check if the last line in the list block ended with a backslash (hard line break)
2638                // This handles cases where list items use backslash for hard line breaks
2639                let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
2640                    lines[block.end_line - 1].content(content).trim_end().ends_with('\\')
2641                } else {
2642                    false
2643                };
2644
2645                // Calculate minimum indentation for list continuation
2646                // For ordered lists, use the last marker width (e.g., 3 for "1. ", 4 for "10. ")
2647                // For unordered lists like "- ", content starts at column 2, so continuations need at least 2 spaces
2648                let min_continuation_indent = if block.is_ordered {
2649                    current_indent_level + last_marker_width
2650                } else {
2651                    current_indent_level + 2 // Unordered lists need at least 2 spaces (e.g., "- " = 2 chars)
2652                };
2653
2654                if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
2655                    // Indented line or backslash continuation continues the list
2656                    block.end_line = line_num;
2657                } else if line_info.is_blank {
2658                    // Blank line - check if it's internal to the list or ending it
2659                    // We only include blank lines that are followed by more list content
2660                    let mut check_idx = line_idx + 1;
2661                    let mut found_continuation = false;
2662
2663                    // Skip additional blank lines
2664                    while check_idx < lines.len() && lines[check_idx].is_blank {
2665                        check_idx += 1;
2666                    }
2667
2668                    if check_idx < lines.len() {
2669                        let next_line = &lines[check_idx];
2670                        // Check if followed by indented content (list continuation)
2671                        if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
2672                            found_continuation = true;
2673                        }
2674                        // Check if followed by another list item at the same level
2675                        else if !next_line.in_code_block
2676                            && next_line.list_item.is_some()
2677                            && let Some(item) = &next_line.list_item
2678                        {
2679                            let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
2680                                .find(next_line.content(content))
2681                                .map_or(String::new(), |m| m.as_str().to_string());
2682                            if item.marker_column == current_indent_level
2683                                && item.is_ordered == block.is_ordered
2684                                && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
2685                            {
2686                                // Check if there was meaningful content between the list items (unused now)
2687                                // This variable is kept for potential future use but is currently replaced by has_structural_separators
2688                                let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
2689                                    if let Some(between_line) = lines.get(idx) {
2690                                        let between_content = between_line.content(content);
2691                                        let trimmed = between_content.trim();
2692                                        // Skip empty lines
2693                                        if trimmed.is_empty() {
2694                                            return false;
2695                                        }
2696                                        // Check for meaningful content
2697                                        let line_indent = between_content.len() - between_content.trim_start().len();
2698
2699                                        // Structural separators (code fences, headings, etc.) are meaningful and should BREAK lists
2700                                        if trimmed.starts_with("```")
2701                                            || trimmed.starts_with("~~~")
2702                                            || trimmed.starts_with("---")
2703                                            || trimmed.starts_with("***")
2704                                            || trimmed.starts_with("___")
2705                                            || trimmed.starts_with(">")
2706                                            || crate::utils::skip_context::is_table_line(trimmed)
2707                                            || between_line.heading.is_some()
2708                                        {
2709                                            return true; // These are structural separators - meaningful content that breaks lists
2710                                        }
2711
2712                                        // Only properly indented content continues the list
2713                                        line_indent >= min_continuation_indent
2714                                    } else {
2715                                        false
2716                                    }
2717                                });
2718
2719                                if block.is_ordered {
2720                                    // For ordered lists: don't continue if there are structural separators
2721                                    // Check if there are structural separators between the list items
2722                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2723                                        if let Some(between_line) = lines.get(idx) {
2724                                            let trimmed = between_line.content(content).trim();
2725                                            if trimmed.is_empty() {
2726                                                return false;
2727                                            }
2728                                            // Check for structural separators that break lists
2729                                            trimmed.starts_with("```")
2730                                                || trimmed.starts_with("~~~")
2731                                                || trimmed.starts_with("---")
2732                                                || trimmed.starts_with("***")
2733                                                || trimmed.starts_with("___")
2734                                                || trimmed.starts_with(">")
2735                                                || crate::utils::skip_context::is_table_line(trimmed)
2736                                                || between_line.heading.is_some()
2737                                        } else {
2738                                            false
2739                                        }
2740                                    });
2741                                    found_continuation = !has_structural_separators;
2742                                } else {
2743                                    // For unordered lists: also check for structural separators
2744                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2745                                        if let Some(between_line) = lines.get(idx) {
2746                                            let trimmed = between_line.content(content).trim();
2747                                            if trimmed.is_empty() {
2748                                                return false;
2749                                            }
2750                                            // Check for structural separators that break lists
2751                                            trimmed.starts_with("```")
2752                                                || trimmed.starts_with("~~~")
2753                                                || trimmed.starts_with("---")
2754                                                || trimmed.starts_with("***")
2755                                                || trimmed.starts_with("___")
2756                                                || trimmed.starts_with(">")
2757                                                || crate::utils::skip_context::is_table_line(trimmed)
2758                                                || between_line.heading.is_some()
2759                                        } else {
2760                                            false
2761                                        }
2762                                    });
2763                                    found_continuation = !has_structural_separators;
2764                                }
2765                            }
2766                        }
2767                    }
2768
2769                    if found_continuation {
2770                        // Include the blank line in the block
2771                        block.end_line = line_num;
2772                    } else {
2773                        // Blank line ends the list - don't include it
2774                        list_blocks.push(block.clone());
2775                        current_block = None;
2776                    }
2777                } else {
2778                    // Check for lazy continuation - non-indented line immediately after a list item
2779                    // But only if the line has sufficient indentation for the list type
2780                    let min_required_indent = if block.is_ordered {
2781                        current_indent_level + last_marker_width
2782                    } else {
2783                        current_indent_level + 2
2784                    };
2785
2786                    // For lazy continuation to apply, the line must either:
2787                    // 1. Have no indentation (true lazy continuation)
2788                    // 2. Have sufficient indentation for the list type
2789                    // BUT structural separators (headings, code blocks, etc.) should never be lazy continuations
2790                    let line_content = line_info.content(content).trim();
2791
2792                    // Check for table-like patterns
2793                    let looks_like_table = crate::utils::skip_context::is_table_line(line_content);
2794
2795                    let is_structural_separator = line_info.heading.is_some()
2796                        || line_content.starts_with("```")
2797                        || line_content.starts_with("~~~")
2798                        || line_content.starts_with("---")
2799                        || line_content.starts_with("***")
2800                        || line_content.starts_with("___")
2801                        || line_content.starts_with(">")
2802                        || looks_like_table;
2803
2804                    // Allow lazy continuation if we're still within the same list block
2805                    // (not just immediately after a list item)
2806                    let is_lazy_continuation = !is_structural_separator
2807                        && !line_info.is_blank
2808                        && (line_info.indent == 0 || line_info.indent >= min_required_indent);
2809
2810                    if is_lazy_continuation {
2811                        // Additional check: if the line starts with uppercase and looks like a new sentence,
2812                        // it's probably not a continuation
2813                        let content_to_check = if !blockquote_prefix.is_empty() {
2814                            // Strip blockquote prefix to check the actual content
2815                            line_info
2816                                .content(content)
2817                                .strip_prefix(&blockquote_prefix)
2818                                .unwrap_or(line_info.content(content))
2819                                .trim()
2820                        } else {
2821                            line_info.content(content).trim()
2822                        };
2823
2824                        let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
2825
2826                        // If it starts with uppercase and the previous line ended with punctuation,
2827                        // it's likely a new paragraph, not a continuation
2828                        if starts_with_uppercase && last_list_item_line > 0 {
2829                            // This looks like a new paragraph
2830                            list_blocks.push(block.clone());
2831                            current_block = None;
2832                        } else {
2833                            // This is a lazy continuation line
2834                            block.end_line = line_num;
2835                        }
2836                    } else {
2837                        // Non-indented, non-blank line that's not a lazy continuation - end the block
2838                        list_blocks.push(block.clone());
2839                        current_block = None;
2840                    }
2841                }
2842            }
2843        }
2844
2845        // Don't forget the last block
2846        if let Some(block) = current_block {
2847            list_blocks.push(block);
2848        }
2849
2850        // Merge adjacent blocks that should be one
2851        merge_adjacent_list_blocks(content, &mut list_blocks, lines);
2852
2853        list_blocks
2854    }
2855
2856    /// Compute character frequency for fast content analysis
2857    fn compute_char_frequency(content: &str) -> CharFrequency {
2858        let mut frequency = CharFrequency::default();
2859
2860        for ch in content.chars() {
2861            match ch {
2862                '#' => frequency.hash_count += 1,
2863                '*' => frequency.asterisk_count += 1,
2864                '_' => frequency.underscore_count += 1,
2865                '-' => frequency.hyphen_count += 1,
2866                '+' => frequency.plus_count += 1,
2867                '>' => frequency.gt_count += 1,
2868                '|' => frequency.pipe_count += 1,
2869                '[' => frequency.bracket_count += 1,
2870                '`' => frequency.backtick_count += 1,
2871                '<' => frequency.lt_count += 1,
2872                '!' => frequency.exclamation_count += 1,
2873                '\n' => frequency.newline_count += 1,
2874                _ => {}
2875            }
2876        }
2877
2878        frequency
2879    }
2880
2881    /// Parse HTML tags in the content
2882    fn parse_html_tags(
2883        content: &str,
2884        lines: &[LineInfo],
2885        code_blocks: &[(usize, usize)],
2886        flavor: MarkdownFlavor,
2887    ) -> Vec<HtmlTag> {
2888        static HTML_TAG_REGEX: LazyLock<regex::Regex> =
2889            LazyLock::new(|| regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap());
2890
2891        let mut html_tags = Vec::with_capacity(content.matches('<').count());
2892
2893        for cap in HTML_TAG_REGEX.captures_iter(content) {
2894            let full_match = cap.get(0).unwrap();
2895            let match_start = full_match.start();
2896            let match_end = full_match.end();
2897
2898            // Skip if in code block
2899            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2900                continue;
2901            }
2902
2903            let is_closing = !cap.get(1).unwrap().as_str().is_empty();
2904            let tag_name_original = cap.get(2).unwrap().as_str();
2905            let tag_name = tag_name_original.to_lowercase();
2906            let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
2907
2908            // Skip JSX components in MDX files (tags starting with uppercase letter)
2909            // JSX components like <Chart />, <MyComponent> should not be treated as HTML
2910            if flavor.supports_jsx() && tag_name_original.chars().next().is_some_and(|c| c.is_uppercase()) {
2911                continue;
2912            }
2913
2914            // Find which line this tag is on
2915            let mut line_num = 1;
2916            let mut col_start = match_start;
2917            let mut col_end = match_end;
2918            for (idx, line_info) in lines.iter().enumerate() {
2919                if match_start >= line_info.byte_offset {
2920                    line_num = idx + 1;
2921                    col_start = match_start - line_info.byte_offset;
2922                    col_end = match_end - line_info.byte_offset;
2923                } else {
2924                    break;
2925                }
2926            }
2927
2928            html_tags.push(HtmlTag {
2929                line: line_num,
2930                start_col: col_start,
2931                end_col: col_end,
2932                byte_offset: match_start,
2933                byte_end: match_end,
2934                tag_name,
2935                is_closing,
2936                is_self_closing,
2937                raw_content: full_match.as_str().to_string(),
2938            });
2939        }
2940
2941        html_tags
2942    }
2943
2944    /// Parse emphasis spans in the content
2945    fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
2946        static EMPHASIS_REGEX: LazyLock<regex::Regex> =
2947            LazyLock::new(|| regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap());
2948
2949        let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
2950
2951        for cap in EMPHASIS_REGEX.captures_iter(content) {
2952            let full_match = cap.get(0).unwrap();
2953            let match_start = full_match.start();
2954            let match_end = full_match.end();
2955
2956            // Skip if in code block
2957            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2958                continue;
2959            }
2960
2961            let opening_markers = cap.get(1).unwrap().as_str();
2962            let content_part = cap.get(2).unwrap().as_str();
2963            let closing_markers = cap.get(3).unwrap().as_str();
2964
2965            // Validate matching markers
2966            if opening_markers.chars().next() != closing_markers.chars().next()
2967                || opening_markers.len() != closing_markers.len()
2968            {
2969                continue;
2970            }
2971
2972            let marker = opening_markers.chars().next().unwrap();
2973            let marker_count = opening_markers.len();
2974
2975            // Find which line this emphasis is on
2976            let mut line_num = 1;
2977            let mut col_start = match_start;
2978            let mut col_end = match_end;
2979            for (idx, line_info) in lines.iter().enumerate() {
2980                if match_start >= line_info.byte_offset {
2981                    line_num = idx + 1;
2982                    col_start = match_start - line_info.byte_offset;
2983                    col_end = match_end - line_info.byte_offset;
2984                } else {
2985                    break;
2986                }
2987            }
2988
2989            emphasis_spans.push(EmphasisSpan {
2990                line: line_num,
2991                start_col: col_start,
2992                end_col: col_end,
2993                byte_offset: match_start,
2994                byte_end: match_end,
2995                marker,
2996                marker_count,
2997                content: content_part.to_string(),
2998            });
2999        }
3000
3001        emphasis_spans
3002    }
3003
3004    /// Parse table rows in the content
3005    fn parse_table_rows(content: &str, lines: &[LineInfo]) -> Vec<TableRow> {
3006        let mut table_rows = Vec::with_capacity(lines.len() / 20);
3007
3008        for (line_idx, line_info) in lines.iter().enumerate() {
3009            // Skip lines in code blocks or blank lines
3010            if line_info.in_code_block || line_info.is_blank {
3011                continue;
3012            }
3013
3014            let line = line_info.content(content);
3015            let line_num = line_idx + 1;
3016
3017            // Check if this line contains pipes (potential table row)
3018            if !line.contains('|') {
3019                continue;
3020            }
3021
3022            // Count columns by splitting on pipes
3023            let parts: Vec<&str> = line.split('|').collect();
3024            let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
3025
3026            // Check if this is a separator row
3027            let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
3028            let mut column_alignments = Vec::new();
3029
3030            if is_separator {
3031                for part in &parts[1..parts.len() - 1] {
3032                    // Skip first and last empty parts
3033                    let trimmed = part.trim();
3034                    let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
3035                        "center".to_string()
3036                    } else if trimmed.ends_with(':') {
3037                        "right".to_string()
3038                    } else if trimmed.starts_with(':') {
3039                        "left".to_string()
3040                    } else {
3041                        "none".to_string()
3042                    };
3043                    column_alignments.push(alignment);
3044                }
3045            }
3046
3047            table_rows.push(TableRow {
3048                line: line_num,
3049                is_separator,
3050                column_count,
3051                column_alignments,
3052            });
3053        }
3054
3055        table_rows
3056    }
3057
3058    /// Parse bare URLs and emails in the content
3059    fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
3060        let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
3061
3062        // Check for bare URLs (not in angle brackets or markdown links)
3063        for cap in BARE_URL_PATTERN.captures_iter(content) {
3064            let full_match = cap.get(0).unwrap();
3065            let match_start = full_match.start();
3066            let match_end = full_match.end();
3067
3068            // Skip if in code block
3069            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3070                continue;
3071            }
3072
3073            // Skip if already in angle brackets or markdown links
3074            let preceding_char = if match_start > 0 {
3075                content.chars().nth(match_start - 1)
3076            } else {
3077                None
3078            };
3079            let following_char = content.chars().nth(match_end);
3080
3081            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
3082                continue;
3083            }
3084            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
3085                continue;
3086            }
3087
3088            let url = full_match.as_str();
3089            let url_type = if url.starts_with("https://") {
3090                "https"
3091            } else if url.starts_with("http://") {
3092                "http"
3093            } else if url.starts_with("ftp://") {
3094                "ftp"
3095            } else {
3096                "other"
3097            };
3098
3099            // Find which line this URL is on
3100            let mut line_num = 1;
3101            let mut col_start = match_start;
3102            let mut col_end = match_end;
3103            for (idx, line_info) in lines.iter().enumerate() {
3104                if match_start >= line_info.byte_offset {
3105                    line_num = idx + 1;
3106                    col_start = match_start - line_info.byte_offset;
3107                    col_end = match_end - line_info.byte_offset;
3108                } else {
3109                    break;
3110                }
3111            }
3112
3113            bare_urls.push(BareUrl {
3114                line: line_num,
3115                start_col: col_start,
3116                end_col: col_end,
3117                byte_offset: match_start,
3118                byte_end: match_end,
3119                url: url.to_string(),
3120                url_type: url_type.to_string(),
3121            });
3122        }
3123
3124        // Check for bare email addresses
3125        for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
3126            let full_match = cap.get(0).unwrap();
3127            let match_start = full_match.start();
3128            let match_end = full_match.end();
3129
3130            // Skip if in code block
3131            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3132                continue;
3133            }
3134
3135            // Skip if already in angle brackets or markdown links
3136            let preceding_char = if match_start > 0 {
3137                content.chars().nth(match_start - 1)
3138            } else {
3139                None
3140            };
3141            let following_char = content.chars().nth(match_end);
3142
3143            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
3144                continue;
3145            }
3146            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
3147                continue;
3148            }
3149
3150            let email = full_match.as_str();
3151
3152            // Find which line this email is on
3153            let mut line_num = 1;
3154            let mut col_start = match_start;
3155            let mut col_end = match_end;
3156            for (idx, line_info) in lines.iter().enumerate() {
3157                if match_start >= line_info.byte_offset {
3158                    line_num = idx + 1;
3159                    col_start = match_start - line_info.byte_offset;
3160                    col_end = match_end - line_info.byte_offset;
3161                } else {
3162                    break;
3163                }
3164            }
3165
3166            bare_urls.push(BareUrl {
3167                line: line_num,
3168                start_col: col_start,
3169                end_col: col_end,
3170                byte_offset: match_start,
3171                byte_end: match_end,
3172                url: email.to_string(),
3173                url_type: "email".to_string(),
3174            });
3175        }
3176
3177        bare_urls
3178    }
3179}
3180
3181/// Merge adjacent list blocks that should be treated as one
3182fn merge_adjacent_list_blocks(content: &str, list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
3183    if list_blocks.len() < 2 {
3184        return;
3185    }
3186
3187    let mut merger = ListBlockMerger::new(content, lines);
3188    *list_blocks = merger.merge(list_blocks);
3189}
3190
3191/// Helper struct to manage the complex logic of merging list blocks
3192struct ListBlockMerger<'a> {
3193    content: &'a str,
3194    lines: &'a [LineInfo],
3195}
3196
3197impl<'a> ListBlockMerger<'a> {
3198    fn new(content: &'a str, lines: &'a [LineInfo]) -> Self {
3199        Self { content, lines }
3200    }
3201
3202    fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
3203        let mut merged = Vec::with_capacity(list_blocks.len());
3204        let mut current = list_blocks[0].clone();
3205
3206        for next in list_blocks.iter().skip(1) {
3207            if self.should_merge_blocks(&current, next) {
3208                current = self.merge_two_blocks(current, next);
3209            } else {
3210                merged.push(current);
3211                current = next.clone();
3212            }
3213        }
3214
3215        merged.push(current);
3216        merged
3217    }
3218
3219    /// Determine if two adjacent list blocks should be merged
3220    fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
3221        // Basic compatibility checks
3222        if !self.blocks_are_compatible(current, next) {
3223            return false;
3224        }
3225
3226        // Check spacing and content between blocks
3227        let spacing = self.analyze_spacing_between(current, next);
3228        match spacing {
3229            BlockSpacing::Consecutive => true,
3230            BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
3231            BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
3232                self.can_merge_with_content_between(current, next)
3233            }
3234        }
3235    }
3236
3237    /// Check if blocks have compatible structure for merging
3238    fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
3239        current.is_ordered == next.is_ordered
3240            && current.blockquote_prefix == next.blockquote_prefix
3241            && current.nesting_level == next.nesting_level
3242    }
3243
3244    /// Analyze the spacing between two list blocks
3245    fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
3246        let gap = next.start_line - current.end_line;
3247
3248        match gap {
3249            1 => BlockSpacing::Consecutive,
3250            2 => BlockSpacing::SingleBlank,
3251            _ if gap > 2 => {
3252                if self.has_only_blank_lines_between(current, next) {
3253                    BlockSpacing::MultipleBlanks
3254                } else {
3255                    BlockSpacing::ContentBetween
3256                }
3257            }
3258            _ => BlockSpacing::Consecutive, // gap == 0, overlapping (shouldn't happen)
3259        }
3260    }
3261
3262    /// Check if unordered lists can be merged with a single blank line between
3263    fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3264        // Check if there are structural separators between the blocks
3265        // If has_meaningful_content_between returns true, it means there are structural separators
3266        if has_meaningful_content_between(self.content, current, next, self.lines) {
3267            return false; // Structural separators prevent merging
3268        }
3269
3270        // Only merge unordered lists with same marker across single blank
3271        !current.is_ordered && current.marker == next.marker
3272    }
3273
3274    /// Check if ordered lists can be merged when there's content between them
3275    fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3276        // Do not merge lists if there are structural separators between them
3277        if has_meaningful_content_between(self.content, current, next, self.lines) {
3278            return false; // Structural separators prevent merging
3279        }
3280
3281        // Only consider merging ordered lists if there's no structural content between
3282        current.is_ordered && next.is_ordered
3283    }
3284
3285    /// Check if there are only blank lines between blocks
3286    fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3287        for line_num in (current.end_line + 1)..next.start_line {
3288            if let Some(line_info) = self.lines.get(line_num - 1)
3289                && !line_info.content(self.content).trim().is_empty()
3290            {
3291                return false;
3292            }
3293        }
3294        true
3295    }
3296
3297    /// Merge two compatible list blocks into one
3298    fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
3299        current.end_line = next.end_line;
3300        current.item_lines.extend_from_slice(&next.item_lines);
3301
3302        // Update max marker width
3303        current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
3304
3305        // Handle marker consistency for unordered lists
3306        if !current.is_ordered && self.markers_differ(&current, next) {
3307            current.marker = None; // Mixed markers
3308        }
3309
3310        current
3311    }
3312
3313    /// Check if two blocks have different markers
3314    fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
3315        current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
3316    }
3317}
3318
/// How far apart two neighbouring list blocks are.
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    /// No gap between blocks.
    Consecutive,
    /// One blank line between blocks.
    SingleBlank,
    /// Multiple blank lines but no content.
    MultipleBlanks,
    /// Content exists between blocks.
    ContentBetween,
}
3327
3328/// Check if there's meaningful content (not just blank lines) between two list blocks
3329fn has_meaningful_content_between(content: &str, current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
3330    // Check lines between current.end_line and next.start_line
3331    for line_num in (current.end_line + 1)..next.start_line {
3332        if let Some(line_info) = lines.get(line_num - 1) {
3333            // Convert to 0-indexed
3334            let trimmed = line_info.content(content).trim();
3335
3336            // Skip empty lines
3337            if trimmed.is_empty() {
3338                continue;
3339            }
3340
3341            // Check for structural separators that should separate lists (CommonMark compliant)
3342
3343            // Headings separate lists
3344            if line_info.heading.is_some() {
3345                return true; // Has meaningful content - headings separate lists
3346            }
3347
3348            // Horizontal rules separate lists (---, ***, ___)
3349            if is_horizontal_rule(trimmed) {
3350                return true; // Has meaningful content - horizontal rules separate lists
3351            }
3352
3353            // Tables separate lists
3354            if crate::utils::skip_context::is_table_line(trimmed) {
3355                return true; // Has meaningful content - tables separate lists
3356            }
3357
3358            // Blockquotes separate lists
3359            if trimmed.starts_with('>') {
3360                return true; // Has meaningful content - blockquotes separate lists
3361            }
3362
3363            // Code block fences separate lists (unless properly indented as list content)
3364            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
3365                let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
3366
3367                // Check if this code block is properly indented as list continuation
3368                let min_continuation_indent = if current.is_ordered {
3369                    current.nesting_level + current.max_marker_width + 1 // +1 for space after marker
3370                } else {
3371                    current.nesting_level + 2
3372                };
3373
3374                if line_indent < min_continuation_indent {
3375                    // This is a standalone code block that separates lists
3376                    return true; // Has meaningful content - standalone code blocks separate lists
3377                }
3378            }
3379
3380            // Check if this line has proper indentation for list continuation
3381            let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
3382
3383            // Calculate minimum indentation needed to be list continuation
3384            let min_indent = if current.is_ordered {
3385                current.nesting_level + current.max_marker_width
3386            } else {
3387                current.nesting_level + 2
3388            };
3389
3390            // If the line is not indented enough to be list continuation, it's meaningful content
3391            if line_indent < min_indent {
3392                return true; // Has meaningful content - content not indented as list continuation
3393            }
3394
3395            // If we reach here, the line is properly indented as list continuation
3396            // Continue checking other lines
3397        }
3398    }
3399
3400    // Only blank lines or properly indented list continuation content between blocks
3401    false
3402}
3403
/// Check if a line is a horizontal rule (`---`, `***`, `___`, `- - -`, ...).
///
/// The line must start with `-`, `*` or `_`, contain at least three of that same character,
/// and contain nothing else except spaces and tabs. `trimmed` is expected to carry no leading
/// whitespace, since the first character decides which marker is counted.
fn is_horizontal_rule(trimmed: &str) -> bool {
    // Fewer than three bytes cannot hold three markers.
    if trimmed.len() < 3 {
        return false;
    }

    let marker = match trimmed.chars().next() {
        Some(c @ ('-' | '*' | '_')) => c,
        _ => return false,
    };

    // Walk the characters directly instead of collecting them into a temporary vector.
    let mut marker_count = 0usize;
    for ch in trimmed.chars() {
        if ch == marker {
            marker_count += 1;
        } else if ch != ' ' && ch != '\t' {
            return false; // Non-matching, non-whitespace character
        }
    }

    marker_count >= 3
}
3427
/// Unit tests for `LintContext`.
3429#[cfg(test)]
3430mod tests {
3431    use super::*;
3432
3433    #[test]
3434    fn test_empty_content() {
3435        let ctx = LintContext::new("", MarkdownFlavor::Standard, None);
3436        assert_eq!(ctx.content, "");
3437        assert_eq!(ctx.line_offsets, vec![0]);
3438        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
3439        assert_eq!(ctx.lines.len(), 0);
3440    }
3441
3442    #[test]
3443    fn test_single_line() {
3444        let ctx = LintContext::new("# Hello", MarkdownFlavor::Standard, None);
3445        assert_eq!(ctx.content, "# Hello");
3446        assert_eq!(ctx.line_offsets, vec![0]);
3447        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
3448        assert_eq!(ctx.offset_to_line_col(3), (1, 4));
3449    }
3450
3451    #[test]
3452    fn test_multi_line() {
3453        let content = "# Title\n\nSecond line\nThird line";
3454        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
3455        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);
3456        // Test offset to line/col
3457        assert_eq!(ctx.offset_to_line_col(0), (1, 1)); // start
3458        assert_eq!(ctx.offset_to_line_col(8), (2, 1)); // start of blank line
3459        assert_eq!(ctx.offset_to_line_col(9), (3, 1)); // start of 'Second line'
3460        assert_eq!(ctx.offset_to_line_col(15), (3, 7)); // middle of 'Second line'
3461        assert_eq!(ctx.offset_to_line_col(21), (4, 1)); // start of 'Third line'
3462    }
3463
3464    #[test]
3465    fn test_line_info() {
3466        let content = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
3467        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
3468
3469        // Test line info
3470        assert_eq!(ctx.lines.len(), 7);
3471
3472        // Line 1: "# Title"
3473        let line1 = &ctx.lines[0];
3474        assert_eq!(line1.content(ctx.content), "# Title");
3475        assert_eq!(line1.byte_offset, 0);
3476        assert_eq!(line1.indent, 0);
3477        assert!(!line1.is_blank);
3478        assert!(!line1.in_code_block);
3479        assert!(line1.list_item.is_none());
3480
3481        // Line 2: "    indented"
3482        let line2 = &ctx.lines[1];
3483        assert_eq!(line2.content(ctx.content), "    indented");
3484        assert_eq!(line2.byte_offset, 8);
3485        assert_eq!(line2.indent, 4);
3486        assert!(!line2.is_blank);
3487
3488        // Line 3: "" (blank)
3489        let line3 = &ctx.lines[2];
3490        assert_eq!(line3.content(ctx.content), "");
3491        assert!(line3.is_blank);
3492
3493        // Test helper methods
3494        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
3495        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
3496        assert_eq!(ctx.line_info(1).map(|l| l.indent), Some(0));
3497        assert_eq!(ctx.line_info(2).map(|l| l.indent), Some(4));
3498    }
3499
3500    #[test]
3501    fn test_list_item_detection() {
3502        let content = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
3503        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
3504
3505        // Line 1: "- Unordered item"
3506        let line1 = &ctx.lines[0];
3507        assert!(line1.list_item.is_some());
3508        let list1 = line1.list_item.as_ref().unwrap();
3509        assert_eq!(list1.marker, "-");
3510        assert!(!list1.is_ordered);
3511        assert_eq!(list1.marker_column, 0);
3512        assert_eq!(list1.content_column, 2);
3513
3514        // Line 2: "  * Nested item"
3515        let line2 = &ctx.lines[1];
3516        assert!(line2.list_item.is_some());
3517        let list2 = line2.list_item.as_ref().unwrap();
3518        assert_eq!(list2.marker, "*");
3519        assert_eq!(list2.marker_column, 2);
3520
3521        // Line 3: "1. Ordered item"
3522        let line3 = &ctx.lines[2];
3523        assert!(line3.list_item.is_some());
3524        let list3 = line3.list_item.as_ref().unwrap();
3525        assert_eq!(list3.marker, "1.");
3526        assert!(list3.is_ordered);
3527        assert_eq!(list3.number, Some(1));
3528
3529        // Line 6: "Not a list"
3530        let line6 = &ctx.lines[5];
3531        assert!(line6.list_item.is_none());
3532    }
3533
3534    #[test]
3535    fn test_offset_to_line_col_edge_cases() {
3536        let content = "a\nb\nc";
3537        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
3538        // line_offsets: [0, 2, 4]
3539        assert_eq!(ctx.offset_to_line_col(0), (1, 1)); // 'a'
3540        assert_eq!(ctx.offset_to_line_col(1), (1, 2)); // after 'a'
3541        assert_eq!(ctx.offset_to_line_col(2), (2, 1)); // 'b'
3542        assert_eq!(ctx.offset_to_line_col(3), (2, 2)); // after 'b'
3543        assert_eq!(ctx.offset_to_line_col(4), (3, 1)); // 'c'
3544        assert_eq!(ctx.offset_to_line_col(5), (3, 2)); // after 'c'
3545    }
3546
3547    #[test]
3548    fn test_mdx_esm_blocks() {
3549        let content = r##"import {Chart} from './snowfall.js'
3550export const year = 2023
3551
3552# Last year's snowfall
3553
3554In {year}, the snowfall was above average.
3555It was followed by a warm spring which caused
3556flood conditions in many of the nearby rivers.
3557
3558<Chart color="#fcb32c" year={year} />
3559"##;
3560
3561        let ctx = LintContext::new(content, MarkdownFlavor::MDX, None);
3562
3563        // Check that lines 1 and 2 are marked as ESM blocks
3564        assert_eq!(ctx.lines.len(), 10);
3565        assert!(ctx.lines[0].in_esm_block, "Line 1 (import) should be in_esm_block");
3566        assert!(ctx.lines[1].in_esm_block, "Line 2 (export) should be in_esm_block");
3567        assert!(!ctx.lines[2].in_esm_block, "Line 3 (blank) should NOT be in_esm_block");
3568        assert!(
3569            !ctx.lines[3].in_esm_block,
3570            "Line 4 (heading) should NOT be in_esm_block"
3571        );
3572        assert!(!ctx.lines[4].in_esm_block, "Line 5 (blank) should NOT be in_esm_block");
3573        assert!(!ctx.lines[5].in_esm_block, "Line 6 (text) should NOT be in_esm_block");
3574    }
3575
3576    #[test]
3577    fn test_mdx_esm_blocks_not_detected_in_standard_flavor() {
3578        let content = r#"import {Chart} from './snowfall.js'
3579export const year = 2023
3580
3581# Last year's snowfall
3582"#;
3583
3584        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
3585
3586        // ESM blocks should NOT be detected in Standard flavor
3587        assert!(
3588            !ctx.lines[0].in_esm_block,
3589            "Line 1 should NOT be in_esm_block in Standard flavor"
3590        );
3591        assert!(
3592            !ctx.lines[1].in_esm_block,
3593            "Line 2 should NOT be in_esm_block in Standard flavor"
3594        );
3595    }
3596}
rumdl_lib/lint_context.rs

rumdl_lib/
lint_context.rs