rumdl_lib/
lint_context.rs

1use crate::config::MarkdownFlavor;
2use crate::rules::front_matter_utils::FrontMatterUtils;
3use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
4use pulldown_cmark::{BrokenLink, Event, LinkType, Options, Parser, Tag, TagEnd};
5use regex::Regex;
6use std::borrow::Cow;
7use std::path::PathBuf;
8use std::sync::LazyLock;
9
/// Times an expression and optionally reports the duration on stderr.
///
/// Arguments: a display name, a boolean "profiling enabled" flag, and the
/// expression to evaluate. The macro evaluates to the expression's value.
/// This variant is compiled for every target except `wasm32`.
#[cfg(not(target_arch = "wasm32"))]
macro_rules! profile_section {
    ($name:expr, $profile:expr, $code:expr) => {{
        // The clock is read unconditionally; only the report is gated on the flag.
        let started_at = std::time::Instant::now();
        let value = $code;
        if $profile {
            eprintln!("[PROFILE] {}: {:?}", $name, started_at.elapsed());
        }
        value
    }};
}

/// `wasm32` variant: evaluates the expression only; the name and the
/// profiling flag are accepted for signature parity and otherwise ignored.
#[cfg(target_arch = "wasm32")]
macro_rules! profile_section {
    ($name:expr, $profile:expr, $code:expr) => {{
        $code
    }};
}
27
// Link pattern covering both inline links `[text](url "title")` and
// reference links `[text][ref]`.
//
// Flags: `s` lets `.` match newlines (so an escaped newline inside the link
// text is accepted) and `x` enables verbose mode, which is why the pattern can
// carry inline `#` comments.
//
// Capture groups:
//   1 - link text (no unescaped `[` or `]`)
//   2 - URL written in angle brackets, 3 - bare URL
//   4 - double-quoted title, 5 - single-quoted title
//   6 - reference ID
//
// The pattern starts at `[` and does not look at the preceding character, so
// on its own it does not distinguish a link from the bracket part of an image.
static LINK_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
        \[((?:[^\[\]\\]|\\.)*)\]          # Link text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
        (?:
            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
            |
            \[([^\]]*)\]      # Reference ID in group 6
        )"#
    ).unwrap()
});
41
// Image pattern: identical in shape to LINK_PATTERN but requires a leading `!`,
// i.e. `![alt](url "title")` or `![alt][ref]`.
//
// Flags: `s` lets `.` match newlines, `x` enables verbose mode with inline
// `#` comments.
//
// Capture groups:
//   1 - alt text (no unescaped `[` or `]`)
//   2 - URL written in angle brackets, 3 - bare URL
//   4 - double-quoted title, 5 - single-quoted title
//   6 - reference ID
static IMAGE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
        !\[((?:[^\[\]\\]|\\.)*)\]         # Alt text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
        (?:
            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
            |
            \[([^\]]*)\]      # Reference ID in group 6
        )"#
    ).unwrap()
});
55
// Reference definition pattern: `[label]: url "title"` on a single line.
//
// `(?m)` makes `^`/`$` match at line boundaries. Up to three leading spaces are
// allowed. Capture groups: 1 - label, 2 - URL (a run of non-whitespace),
// 3 - double-quoted title, 4 - single-quoted title.
static REF_DEF_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#).unwrap());
59
// Pattern for bare URLs with an `http`, `https` or `ftp` scheme (capture
// group 1 is the scheme). The host/path characters exclude whitespace, angle
// brackets, square brackets, parentheses, backslash, quotes and backticks;
// an optional port, path, query string and fragment may follow.
static BARE_URL_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
    ).unwrap()
});
66
// Pattern for email addresses: an ASCII local part, `@`, a domain of ASCII
// letters/digits/dots/hyphens, and a final label of at least two ASCII letters.
// The pattern is unanchored, so it finds addresses embedded in other text.
static BARE_EMAIL_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap());
70
// Pattern for blockquote prefix in parse_list_blocks.
// Anchored at the start of the input; capture group 1 is the whole prefix:
// optional leading whitespace, one or more `>` markers, optional trailing whitespace.
static BLOCKQUOTE_PREFIX_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*>+\s*)").unwrap());
73
/// Pre-computed information about a line
///
/// The line text itself is not stored; only its byte range within the source
/// document is kept. Use [`LineInfo::content`] with the original source to
/// obtain the text.
#[derive(Debug, Clone)]
pub struct LineInfo {
    /// Byte offset where this line starts in the document
    pub byte_offset: usize,
    /// Length of the line in bytes (without newline)
    pub byte_len: usize,
    /// Number of leading spaces/tabs
    pub indent: usize,
    /// Whether the line is blank (empty or only whitespace)
    pub is_blank: bool,
    /// Whether this line is inside a code block
    pub in_code_block: bool,
    /// Whether this line is inside front matter
    pub in_front_matter: bool,
    /// Whether this line is inside an HTML block
    pub in_html_block: bool,
    /// Whether this line is inside an HTML comment
    pub in_html_comment: bool,
    /// List item information if this line starts a list item
    pub list_item: Option<ListItemInfo>,
    /// Heading information if this line is a heading
    pub heading: Option<HeadingInfo>,
    /// Blockquote information if this line is a blockquote
    pub blockquote: Option<BlockquoteInfo>,
    /// Whether this line is inside a mkdocstrings autodoc block
    pub in_mkdocstrings: bool,
    /// Whether this line is part of an ESM import/export block (MDX only)
    pub in_esm_block: bool,
    /// Whether this line is a continuation of a multi-line code span from a previous line
    /// (set for every line of the span after its first line)
    pub in_code_span_continuation: bool,
}
106
107impl LineInfo {
108    /// Get the line content as a string slice from the source document
109    pub fn content<'a>(&self, source: &'a str) -> &'a str {
110        &source[self.byte_offset..self.byte_offset + self.byte_len]
111    }
112}
113
/// Information about a list item
///
/// Attached to a line through `LineInfo::list_item` when that line starts a
/// list item.
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// The marker used (*, -, +, or number with . or ))
    pub marker: String,
    /// Whether it's ordered (true) or unordered (false)
    pub is_ordered: bool,
    /// The number for ordered lists
    pub number: Option<usize>,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where content after marker starts
    pub content_column: usize,
}
128
/// Heading style type
///
/// Distinguishes the syntax a heading was written in; stored in
/// `HeadingInfo::style`.
#[derive(Debug, Clone, PartialEq)]
pub enum HeadingStyle {
    /// ATX style heading (# Heading)
    ATX,
    /// Setext style heading with = underline
    Setext1,
    /// Setext style heading with - underline
    Setext2,
}
139
/// Parsed link information
///
/// Text fields are `Cow<'a, str>` so they can either borrow from the source
/// document (lifetime `'a`) or own their data.
#[derive(Debug, Clone)]
pub struct ParsedLink<'a> {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Link text
    pub text: Cow<'a, str>,
    /// Link URL or reference
    pub url: Cow<'a, str>,
    /// Whether this is a reference link [text][ref] vs inline [text](url)
    pub is_reference: bool,
    /// Reference ID for reference links
    pub reference_id: Option<Cow<'a, str>>,
    /// Link type from pulldown-cmark
    pub link_type: LinkType,
}
164
/// Information about a broken link reported by pulldown-cmark
///
/// Collected into `LintContext::broken_links`.
#[derive(Debug, Clone)]
pub struct BrokenLinkInfo {
    /// The reference text that couldn't be resolved
    pub reference: String,
    /// Byte span in the source document
    pub span: std::ops::Range<usize>,
}
173
/// Parsed footnote reference (e.g., `[^1]`, `[^note]`)
///
/// Collected into `LintContext::footnote_refs`.
#[derive(Debug, Clone)]
pub struct FootnoteRef {
    /// The footnote ID (without the ^ prefix)
    pub id: String,
    /// Line number (1-indexed)
    pub line: usize,
    /// Start byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
}
186
/// Parsed image information
///
/// Mirrors [`ParsedLink`] field for field, with `alt_text` in place of the
/// link text. Text fields are `Cow<'a, str>` so they can either borrow from
/// the source document (lifetime `'a`) or own their data.
#[derive(Debug, Clone)]
pub struct ParsedImage<'a> {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Alt text
    pub alt_text: Cow<'a, str>,
    /// Image URL or reference
    pub url: Cow<'a, str>,
    /// Whether this is a reference image ![alt][ref] vs inline ![alt](url)
    pub is_reference: bool,
    /// Reference ID for reference images
    pub reference_id: Option<Cow<'a, str>>,
    /// Link type from pulldown-cmark
    pub link_type: LinkType,
}
211
/// Reference definition [ref]: url "title"
///
/// Collected into `LintContext::reference_defs`; looked up by `id` in
/// `LintContext::get_reference_url` and by byte range in
/// `LintContext::is_in_reference_def`.
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// Line number (1-indexed)
    pub line: usize,
    /// Reference ID (normalized to lowercase)
    pub id: String,
    /// URL
    pub url: String,
    /// Optional title
    pub title: Option<String>,
    /// Byte offset where the reference definition starts
    pub byte_offset: usize,
    /// Byte offset where the reference definition ends
    /// (treated as exclusive by `LintContext::is_in_reference_def`)
    pub byte_end: usize,
}
228
/// Parsed code span information
///
/// A span may cover several lines: `line`/`start_col` locate its start and
/// `end_line`/`end_col` its end. The byte range is treated as half-open
/// (`byte_offset..byte_end`) by the `LintContext` containment checks.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// Line number where the code span starts (1-indexed)
    pub line: usize,
    /// Line number where the code span ends (1-indexed)
    pub end_line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Number of backticks used (1, 2, 3, etc.)
    pub backtick_count: usize,
    /// Content inside the code span (without backticks)
    pub content: String,
}
249
/// Information about a heading
///
/// Attached to a line through `LineInfo::heading` when that line is a heading.
#[derive(Debug, Clone)]
pub struct HeadingInfo {
    /// Heading level (1-6 for ATX, 1-2 for Setext)
    pub level: u8,
    /// Style of heading
    pub style: HeadingStyle,
    /// The heading marker (# characters or underline)
    pub marker: String,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where heading text starts
    pub content_column: usize,
    /// The heading text (without markers and without custom ID syntax)
    pub text: String,
    /// Custom header ID if present (e.g., from {#custom-id} syntax)
    pub custom_id: Option<String>,
    /// Original heading text including custom ID syntax
    pub raw_text: String,
    /// Whether it has a closing sequence (for ATX)
    pub has_closing_sequence: bool,
    /// The closing sequence if present
    pub closing_sequence: String,
}
274
/// Information about a blockquote line
///
/// Attached to a line through `LineInfo::blockquote` when that line is a
/// blockquote line.
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Nesting level (1 for >, 2 for >>, etc.)
    pub nesting_level: usize,
    /// The indentation before the blockquote marker
    pub indent: String,
    /// Column where the first > starts (0-based)
    pub marker_column: usize,
    /// The blockquote prefix (e.g., "> ", ">> ", etc.)
    pub prefix: String,
    /// Content after the blockquote marker(s)
    pub content: String,
    /// Whether the line has no space after the marker
    pub has_no_space_after_marker: bool,
    /// Whether the line has multiple spaces after the marker
    pub has_multiple_spaces_after_marker: bool,
    /// Whether this is an empty blockquote line needing MD028 fix
    pub needs_md028_fix: bool,
}
295
/// Information about a list block
///
/// `start_line..=end_line` is the inclusive line range used by
/// `LintContext::is_in_list_block` and `LintContext::list_block_for_line`.
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// Line number where the list starts (1-indexed)
    pub start_line: usize,
    /// Line number where the list ends (1-indexed)
    pub end_line: usize,
    /// Whether it's ordered or unordered
    pub is_ordered: bool,
    /// The consistent marker for unordered lists (if any)
    pub marker: Option<String>,
    /// Blockquote prefix for this list (empty if not in blockquote)
    pub blockquote_prefix: String,
    /// Lines that are list items within this block
    pub item_lines: Vec<usize>,
    /// Nesting level (0 for top-level lists)
    pub nesting_level: usize,
    /// Maximum marker width seen in this block (e.g., 3 for "1. ", 4 for "10. ")
    pub max_marker_width: usize,
}
316
317use std::sync::{Arc, Mutex};
318
/// Character frequency data for fast content analysis
///
/// Each field counts occurrences of one Markdown-significant character. The
/// parenthesised note on each field names the constructs that character can
/// introduce. `Default` yields all-zero counts.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Count of # characters (headings)
    pub hash_count: usize,
    /// Count of * characters (emphasis, lists, horizontal rules)
    pub asterisk_count: usize,
    /// Count of _ characters (emphasis, horizontal rules)
    pub underscore_count: usize,
    /// Count of - characters (lists, horizontal rules, setext headings)
    pub hyphen_count: usize,
    /// Count of + characters (lists)
    pub plus_count: usize,
    /// Count of > characters (blockquotes)
    pub gt_count: usize,
    /// Count of | characters (tables)
    pub pipe_count: usize,
    /// Count of [ characters (links, images)
    pub bracket_count: usize,
    /// Count of ` characters (code spans, code blocks)
    pub backtick_count: usize,
    /// Count of < characters (HTML tags, autolinks)
    pub lt_count: usize,
    /// Count of ! characters (images)
    pub exclamation_count: usize,
    /// Count of newline characters
    pub newline_count: usize,
}
347
/// Pre-parsed HTML tag information
///
/// Produced on demand by `LintContext::html_tags`.
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Tag name (e.g., "div", "img", "br")
    pub tag_name: String,
    /// Whether it's a closing tag (`</tag>`)
    pub is_closing: bool,
    /// Whether it's self-closing (`<tag />`)
    pub is_self_closing: bool,
    /// Raw tag content
    pub raw_content: String,
}
370
/// Pre-parsed emphasis span information
///
/// Produced on demand by `LintContext::emphasis_spans`.
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Type of emphasis ('*' or '_')
    pub marker: char,
    /// Number of markers (1 for italic, 2 for bold, 3+ for bold+italic)
    pub marker_count: usize,
    /// Content inside the emphasis
    pub content: String,
}
391
/// Pre-parsed table row information
///
/// Produced on demand by `LintContext::table_rows`.
#[derive(Debug, Clone)]
pub struct TableRow {
    /// Line number (1-indexed)
    pub line: usize,
    /// Whether this is a separator row (contains only |, -, :, and spaces)
    pub is_separator: bool,
    /// Number of columns (pipe-separated cells)
    pub column_count: usize,
    /// Alignment info from separator row, one string per column
    pub column_alignments: Vec<String>, // "left", "center", "right", "none"
}
404
/// Pre-parsed bare URL information (not in links)
///
/// Produced on demand by `LintContext::bare_urls`.
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// The URL string
    pub url: String,
    /// Type of URL ("http", "https", "ftp", "email")
    pub url_type: String,
}
423
/// Shared, pre-parsed view of one Markdown document.
///
/// Built once by `LintContext::new`. Data that `new` computes up front is
/// stored in plain fields; data that is only parsed when requested lives in
/// `Mutex<Option<Arc<Vec<_>>>>` caches and is reached through the accessor
/// methods of the same name (`html_tags()`, `emphasis_spans()`, ...).
pub struct LintContext<'a> {
    pub content: &'a str, // The source document
    pub line_offsets: Vec<usize>, // Byte offset of the start of each line (first entry is 0)
    pub code_blocks: Vec<(usize, usize)>, // Cached code block ranges (not including inline code spans)
    pub lines: Vec<LineInfo>,             // Pre-computed line information
    pub links: Vec<ParsedLink<'a>>,       // Pre-parsed links
    pub images: Vec<ParsedImage<'a>>,     // Pre-parsed images
    pub broken_links: Vec<BrokenLinkInfo>, // Broken/undefined references
    pub footnote_refs: Vec<FootnoteRef>,  // Pre-parsed footnote references
    pub reference_defs: Vec<ReferenceDef>, // Reference definitions
    code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>, // Inline code spans; filled by `new`, re-parsed by `code_spans()` only if empty
    pub list_blocks: Vec<ListBlock>,      // Pre-parsed list blocks
    pub char_frequency: CharFrequency,    // Character frequency analysis
    html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>, // Lazy-loaded HTML tags
    emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>, // Lazy-loaded emphasis spans
    table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>, // Lazy-loaded table rows
    bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>, // Lazy-loaded bare URLs
    html_comment_ranges: Vec<crate::utils::skip_context::ByteRange>, // Pre-computed HTML comment ranges
    pub table_blocks: Vec<crate::utils::table_utils::TableBlock>, // Pre-computed table blocks
    pub line_index: crate::utils::range_utils::LineIndex<'a>, // Pre-computed line index for byte position calculations
    jinja_ranges: Vec<(usize, usize)>,    // Pre-computed Jinja template ranges ({{ }}, {% %})
    pub flavor: MarkdownFlavor,           // Markdown flavor being used
    pub source_file: Option<PathBuf>,     // Source file path (for rules that need file context)
}
448
/// The four consecutive pieces of a blockquote line, each borrowed from the
/// original line: `indent` + `markers` + `spaces_after` + `content`.
struct BlockquoteComponents<'a> {
    indent: &'a str,
    markers: &'a str,
    spaces_after: &'a str,
    content: &'a str,
}

/// Split a line into its blockquote components without using a regex.
///
/// Leading spaces/tabs become `indent`, the following run of contiguous `>`
/// characters becomes `markers`, the spaces/tabs after that become
/// `spaces_after`, and everything else is `content`. Returns `None` when the
/// first character after the indent is not `>`.
#[inline]
fn parse_blockquote_detailed(line: &str) -> Option<BlockquoteComponents<'_>> {
    let is_blank = |c: char| c == ' ' || c == '\t';

    // Peel each component off the front; the prefix length is recovered from
    // how much the remainder shrank.
    let after_indent = line.trim_start_matches(is_blank);
    let (indent, _) = line.split_at(line.len() - after_indent.len());

    let after_markers = after_indent.trim_start_matches('>');
    let (markers, _) = after_indent.split_at(after_indent.len() - after_markers.len());
    if markers.is_empty() {
        // No '>' directly after the indent: not a blockquote line.
        return None;
    }

    let content = after_markers.trim_start_matches(is_blank);
    let (spaces_after, _) = after_markers.split_at(after_markers.len() - content.len());

    Some(BlockquoteComponents {
        indent,
        markers,
        spaces_after,
        content,
    })
}
493
494impl<'a> LintContext<'a> {
    /// Build a `LintContext` for `content`.
    ///
    /// Runs every eager analysis pass once, in dependency order, and stores
    /// the results. HTML tags, emphasis spans, table rows and bare URLs are
    /// left uncomputed (their caches start as `None`); code spans are parsed
    /// here and placed in their cache up front.
    ///
    /// * `content` - the Markdown source; the context borrows it for `'a`.
    /// * `flavor` - the Markdown flavor, passed to the flavor-sensitive passes.
    /// * `source_file` - optional path of the document, stored as-is.
    ///
    /// On non-WASM targets, setting the `RUMDL_PROFILE_QUADRATIC` environment
    /// variable (to any value) makes each pass print its duration to stderr.
    pub fn new(content: &'a str, flavor: MarkdownFlavor, source_file: Option<PathBuf>) -> Self {
        // Profiling is switched on by the mere presence of the variable.
        #[cfg(not(target_arch = "wasm32"))]
        let profile = std::env::var("RUMDL_PROFILE_QUADRATIC").is_ok();
        #[cfg(target_arch = "wasm32")]
        let profile = false;

        // Byte offset of the start of every line: 0, then one past each '\n'.
        let line_offsets = profile_section!("Line offsets", profile, {
            let mut offsets = vec![0];
            for (i, c) in content.char_indices() {
                if c == '\n' {
                    offsets.push(i + 1);
                }
            }
            offsets
        });

        // Detect code blocks once and cache them
        let code_blocks = profile_section!("Code blocks", profile, CodeBlockUtils::detect_code_blocks(content));

        // Pre-compute HTML comment ranges ONCE for all operations
        let html_comment_ranges = profile_section!(
            "HTML comment ranges",
            profile,
            crate::utils::skip_context::compute_html_comment_ranges(content)
        );

        // Pre-compute autodoc block ranges for MkDocs flavor (avoids O(n²) scaling)
        let autodoc_ranges = profile_section!("Autodoc block ranges", profile, {
            if flavor == MarkdownFlavor::MkDocs {
                crate::utils::mkdocstrings_refs::detect_autodoc_block_ranges(content)
            } else {
                Vec::new()
            }
        });

        // Pre-compute line information (without headings/blockquotes yet)
        let mut lines = profile_section!(
            "Basic line info",
            profile,
            Self::compute_basic_line_info(
                content,
                &line_offsets,
                &code_blocks,
                flavor,
                &html_comment_ranges,
                &autodoc_ranges,
            )
        );

        // Detect HTML blocks BEFORE heading detection
        profile_section!("HTML blocks", profile, Self::detect_html_blocks(content, &mut lines));

        // Detect ESM import/export blocks in MDX files BEFORE heading detection
        profile_section!(
            "ESM blocks",
            profile,
            Self::detect_esm_blocks(content, &mut lines, flavor)
        );

        // Now detect headings and blockquotes
        profile_section!(
            "Headings & blockquotes",
            profile,
            Self::detect_headings_and_blockquotes(content, &mut lines, flavor, &html_comment_ranges)
        );

        // Parse code spans early so we can exclude them from link/image parsing
        let code_spans = profile_section!("Code spans", profile, Self::parse_code_spans(content, &lines));

        // Mark lines that are continuations of multi-line code spans
        // This is needed for parse_list_blocks to correctly handle list items with multi-line code spans
        for span in &code_spans {
            if span.end_line > span.line {
                // Mark lines after the first line as continuations
                // (line numbers are 1-indexed, `lines` is 0-indexed)
                for line_num in (span.line + 1)..=span.end_line {
                    if let Some(line_info) = lines.get_mut(line_num - 1) {
                        line_info.in_code_span_continuation = true;
                    }
                }
            }
        }

        // Parse links, images, references, and list blocks
        let (links, broken_links, footnote_refs) = profile_section!(
            "Links",
            profile,
            Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor, &html_comment_ranges)
        );

        let images = profile_section!(
            "Images",
            profile,
            Self::parse_images(content, &lines, &code_blocks, &code_spans, &html_comment_ranges)
        );

        let reference_defs = profile_section!("Reference defs", profile, Self::parse_reference_defs(content, &lines));

        let list_blocks = profile_section!("List blocks", profile, Self::parse_list_blocks(content, &lines));

        // Compute character frequency for fast content analysis
        let char_frequency = profile_section!("Char frequency", profile, Self::compute_char_frequency(content));

        // Pre-compute table blocks for rules that need them (MD013, MD055, MD056, MD058, MD060)
        let table_blocks = profile_section!(
            "Table blocks",
            profile,
            crate::utils::table_utils::TableUtils::find_table_blocks_with_code_info(
                content,
                &code_blocks,
                &code_spans,
                &html_comment_ranges,
            )
        );

        // Pre-compute LineIndex once for all rules (eliminates 46x content cloning)
        let line_index = profile_section!(
            "Line index",
            profile,
            crate::utils::range_utils::LineIndex::new(content)
        );

        // Pre-compute Jinja template ranges once for all rules (eliminates O(n×m) in MD011)
        let jinja_ranges = profile_section!(
            "Jinja ranges",
            profile,
            crate::utils::jinja_utils::find_jinja_ranges(content)
        );

        Self {
            content,
            line_offsets,
            code_blocks,
            lines,
            links,
            images,
            broken_links,
            footnote_refs,
            reference_defs,
            // Already parsed above, so the cache starts populated.
            code_spans_cache: Mutex::new(Some(Arc::new(code_spans))),
            list_blocks,
            char_frequency,
            // The remaining caches are filled on first access.
            html_tags_cache: Mutex::new(None),
            emphasis_spans_cache: Mutex::new(None),
            table_rows_cache: Mutex::new(None),
            bare_urls_cache: Mutex::new(None),
            html_comment_ranges,
            table_blocks,
            line_index,
            jinja_ranges,
            flavor,
            source_file,
        }
    }
648
649    /// Get code spans - computed lazily on first access
650    pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
651        let mut cache = self.code_spans_cache.lock().expect("Code spans cache mutex poisoned");
652
653        Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_code_spans(self.content, &self.lines))))
654    }
655
656    /// Get HTML comment ranges - pre-computed during LintContext construction
657    pub fn html_comment_ranges(&self) -> &[crate::utils::skip_context::ByteRange] {
658        &self.html_comment_ranges
659    }
660
661    /// Get HTML tags - computed lazily on first access
662    pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
663        let mut cache = self.html_tags_cache.lock().expect("HTML tags cache mutex poisoned");
664
665        Arc::clone(cache.get_or_insert_with(|| {
666            Arc::new(Self::parse_html_tags(
667                self.content,
668                &self.lines,
669                &self.code_blocks,
670                self.flavor,
671            ))
672        }))
673    }
674
675    /// Get emphasis spans - computed lazily on first access
676    pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
677        let mut cache = self
678            .emphasis_spans_cache
679            .lock()
680            .expect("Emphasis spans cache mutex poisoned");
681
682        Arc::clone(
683            cache.get_or_insert_with(|| {
684                Arc::new(Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks))
685            }),
686        )
687    }
688
689    /// Get table rows - computed lazily on first access
690    pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
691        let mut cache = self.table_rows_cache.lock().expect("Table rows cache mutex poisoned");
692
693        Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_table_rows(self.content, &self.lines))))
694    }
695
696    /// Get bare URLs - computed lazily on first access
697    pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
698        let mut cache = self.bare_urls_cache.lock().expect("Bare URLs cache mutex poisoned");
699
700        Arc::clone(
701            cache.get_or_insert_with(|| Arc::new(Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks))),
702        )
703    }
704
705    /// Map a byte offset to (line, column)
706    pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
707        match self.line_offsets.binary_search(&offset) {
708            Ok(line) => (line + 1, 1),
709            Err(line) => {
710                let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
711                (line, offset - line_start + 1)
712            }
713        }
714    }
715
716    /// Check if a position is within a code block or code span
717    pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
718        // Check code blocks first
719        if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
720            return true;
721        }
722
723        // Check inline code spans (lazy load if needed)
724        self.code_spans()
725            .iter()
726            .any(|span| pos >= span.byte_offset && pos < span.byte_end)
727    }
728
729    /// Get line information by line number (1-indexed)
730    pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
731        if line_num > 0 {
732            self.lines.get(line_num - 1)
733        } else {
734            None
735        }
736    }
737
738    /// Get byte offset for a line number (1-indexed)
739    pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
740        self.line_info(line_num).map(|info| info.byte_offset)
741    }
742
743    /// Get URL for a reference link/image by its ID
744    pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
745        let normalized_id = ref_id.to_lowercase();
746        self.reference_defs
747            .iter()
748            .find(|def| def.id == normalized_id)
749            .map(|def| def.url.as_str())
750    }
751
752    /// Check if a line is part of a list block
753    pub fn is_in_list_block(&self, line_num: usize) -> bool {
754        self.list_blocks
755            .iter()
756            .any(|block| line_num >= block.start_line && line_num <= block.end_line)
757    }
758
759    /// Get the list block containing a specific line
760    pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
761        self.list_blocks
762            .iter()
763            .find(|block| line_num >= block.start_line && line_num <= block.end_line)
764    }
765
766    // Compatibility methods for DocumentStructure migration
767
768    /// Check if a line is within a code block
769    pub fn is_in_code_block(&self, line_num: usize) -> bool {
770        if line_num == 0 || line_num > self.lines.len() {
771            return false;
772        }
773        self.lines[line_num - 1].in_code_block
774    }
775
776    /// Check if a line is within front matter
777    pub fn is_in_front_matter(&self, line_num: usize) -> bool {
778        if line_num == 0 || line_num > self.lines.len() {
779            return false;
780        }
781        self.lines[line_num - 1].in_front_matter
782    }
783
784    /// Check if a line is within an HTML block
785    pub fn is_in_html_block(&self, line_num: usize) -> bool {
786        if line_num == 0 || line_num > self.lines.len() {
787            return false;
788        }
789        self.lines[line_num - 1].in_html_block
790    }
791
792    /// Check if a line and column is within a code span
793    pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
794        if line_num == 0 || line_num > self.lines.len() {
795            return false;
796        }
797
798        // Use the code spans cache to check
799        // Note: col is 1-indexed from caller, but span.start_col and span.end_col are 0-indexed
800        // Convert col to 0-indexed for comparison
801        let col_0indexed = if col > 0 { col - 1 } else { 0 };
802        let code_spans = self.code_spans();
803        code_spans.iter().any(|span| {
804            // Check if line is within the span's line range
805            if line_num < span.line || line_num > span.end_line {
806                return false;
807            }
808
809            if span.line == span.end_line {
810                // Single-line span: check column bounds
811                col_0indexed >= span.start_col && col_0indexed < span.end_col
812            } else if line_num == span.line {
813                // First line of multi-line span: anything after start_col is in span
814                col_0indexed >= span.start_col
815            } else if line_num == span.end_line {
816                // Last line of multi-line span: anything before end_col is in span
817                col_0indexed < span.end_col
818            } else {
819                // Middle line of multi-line span: entire line is in span
820                true
821            }
822        })
823    }
824
825    /// Check if a byte offset is within a code span
826    #[inline]
827    pub fn is_byte_offset_in_code_span(&self, byte_offset: usize) -> bool {
828        let code_spans = self.code_spans();
829        code_spans
830            .iter()
831            .any(|span| byte_offset >= span.byte_offset && byte_offset < span.byte_end)
832    }
833
834    /// Check if a byte position is within a reference definition
835    /// This is much faster than scanning the content with regex for each check (O(1) vs O(n))
836    #[inline]
837    pub fn is_in_reference_def(&self, byte_pos: usize) -> bool {
838        self.reference_defs
839            .iter()
840            .any(|ref_def| byte_pos >= ref_def.byte_offset && byte_pos < ref_def.byte_end)
841    }
842
843    /// Check if a byte position is within an HTML comment
844    /// This is much faster than scanning the content with regex for each check (O(k) vs O(n))
845    /// where k is the number of HTML comments (typically very small)
846    #[inline]
847    pub fn is_in_html_comment(&self, byte_pos: usize) -> bool {
848        self.html_comment_ranges
849            .iter()
850            .any(|range| byte_pos >= range.start && byte_pos < range.end)
851    }
852
853    /// Check if a byte position is within a Jinja template ({{ }} or {% %})
854    pub fn is_in_jinja_range(&self, byte_pos: usize) -> bool {
855        self.jinja_ranges
856            .iter()
857            .any(|(start, end)| byte_pos >= *start && byte_pos < *end)
858    }
859
860    /// Check if content has any instances of a specific character (fast)
861    pub fn has_char(&self, ch: char) -> bool {
862        match ch {
863            '#' => self.char_frequency.hash_count > 0,
864            '*' => self.char_frequency.asterisk_count > 0,
865            '_' => self.char_frequency.underscore_count > 0,
866            '-' => self.char_frequency.hyphen_count > 0,
867            '+' => self.char_frequency.plus_count > 0,
868            '>' => self.char_frequency.gt_count > 0,
869            '|' => self.char_frequency.pipe_count > 0,
870            '[' => self.char_frequency.bracket_count > 0,
871            '`' => self.char_frequency.backtick_count > 0,
872            '<' => self.char_frequency.lt_count > 0,
873            '!' => self.char_frequency.exclamation_count > 0,
874            '\n' => self.char_frequency.newline_count > 0,
875            _ => self.content.contains(ch), // Fallback for other characters
876        }
877    }
878
879    /// Get count of a specific character (fast)
880    pub fn char_count(&self, ch: char) -> usize {
881        match ch {
882            '#' => self.char_frequency.hash_count,
883            '*' => self.char_frequency.asterisk_count,
884            '_' => self.char_frequency.underscore_count,
885            '-' => self.char_frequency.hyphen_count,
886            '+' => self.char_frequency.plus_count,
887            '>' => self.char_frequency.gt_count,
888            '|' => self.char_frequency.pipe_count,
889            '[' => self.char_frequency.bracket_count,
890            '`' => self.char_frequency.backtick_count,
891            '<' => self.char_frequency.lt_count,
892            '!' => self.char_frequency.exclamation_count,
893            '\n' => self.char_frequency.newline_count,
894            _ => self.content.matches(ch).count(), // Fallback for other characters
895        }
896    }
897
898    /// Check if content likely contains headings (fast)
899    pub fn likely_has_headings(&self) -> bool {
900        self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 // Potential setext underlines
901    }
902
903    /// Check if content likely contains lists (fast)
904    pub fn likely_has_lists(&self) -> bool {
905        self.char_frequency.asterisk_count > 0
906            || self.char_frequency.hyphen_count > 0
907            || self.char_frequency.plus_count > 0
908    }
909
910    /// Check if content likely contains emphasis (fast)
911    pub fn likely_has_emphasis(&self) -> bool {
912        self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
913    }
914
915    /// Check if content likely contains tables (fast)
916    pub fn likely_has_tables(&self) -> bool {
917        self.char_frequency.pipe_count > 2
918    }
919
920    /// Check if content likely contains blockquotes (fast)
921    pub fn likely_has_blockquotes(&self) -> bool {
922        self.char_frequency.gt_count > 0
923    }
924
925    /// Check if content likely contains code (fast)
926    pub fn likely_has_code(&self) -> bool {
927        self.char_frequency.backtick_count > 0
928    }
929
930    /// Check if content likely contains links or images (fast)
931    pub fn likely_has_links_or_images(&self) -> bool {
932        self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
933    }
934
935    /// Check if content likely contains HTML (fast)
936    pub fn likely_has_html(&self) -> bool {
937        self.char_frequency.lt_count > 0
938    }
939
940    /// Get HTML tags on a specific line
941    pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
942        self.html_tags()
943            .iter()
944            .filter(|tag| tag.line == line_num)
945            .cloned()
946            .collect()
947    }
948
949    /// Get emphasis spans on a specific line
950    pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
951        self.emphasis_spans()
952            .iter()
953            .filter(|span| span.line == line_num)
954            .cloned()
955            .collect()
956    }
957
958    /// Get table rows on a specific line
959    pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
960        self.table_rows()
961            .iter()
962            .filter(|row| row.line == line_num)
963            .cloned()
964            .collect()
965    }
966
967    /// Get bare URLs on a specific line
968    pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
969        self.bare_urls()
970            .iter()
971            .filter(|url| url.line == line_num)
972            .cloned()
973            .collect()
974    }
975
976    /// Find the line index for a given byte offset using binary search.
977    /// Returns (line_index, line_number, column) where:
978    /// - line_index is the 0-based index in the lines array
979    /// - line_number is the 1-based line number
980    /// - column is the byte offset within that line
981    #[inline]
982    fn find_line_for_offset(lines: &[LineInfo], byte_offset: usize) -> (usize, usize, usize) {
983        // Binary search to find the line containing this byte offset
984        let idx = match lines.binary_search_by(|line| {
985            if byte_offset < line.byte_offset {
986                std::cmp::Ordering::Greater
987            } else if byte_offset > line.byte_offset + line.byte_len {
988                std::cmp::Ordering::Less
989            } else {
990                std::cmp::Ordering::Equal
991            }
992        }) {
993            Ok(idx) => idx,
994            Err(idx) => idx.saturating_sub(1),
995        };
996
997        let line = &lines[idx];
998        let line_num = idx + 1;
999        let col = byte_offset.saturating_sub(line.byte_offset);
1000
1001        (idx, line_num, col)
1002    }
1003
1004    /// Check if a byte offset is within a code span using binary search
1005    #[inline]
1006    fn is_offset_in_code_span(code_spans: &[CodeSpan], offset: usize) -> bool {
1007        // Since spans are sorted by byte_offset, use partition_point for binary search
1008        let idx = code_spans.partition_point(|span| span.byte_offset <= offset);
1009
1010        // Check the span that starts at or before our offset
1011        if idx > 0 {
1012            let span = &code_spans[idx - 1];
1013            if offset >= span.byte_offset && offset < span.byte_end {
1014                return true;
1015            }
1016        }
1017
1018        false
1019    }
1020
1021    /// Parse all links in the content
1022    fn parse_links(
1023        content: &'a str,
1024        lines: &[LineInfo],
1025        code_blocks: &[(usize, usize)],
1026        code_spans: &[CodeSpan],
1027        flavor: MarkdownFlavor,
1028        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1029    ) -> (Vec<ParsedLink<'a>>, Vec<BrokenLinkInfo>, Vec<FootnoteRef>) {
1030        use crate::utils::skip_context::{is_in_html_comment_ranges, is_mkdocs_snippet_line};
1031        use std::collections::HashSet;
1032
1033        let mut links = Vec::with_capacity(content.len() / 500);
1034        let mut broken_links = Vec::new();
1035        let mut footnote_refs = Vec::new();
1036
1037        // Track byte positions of links found by pulldown-cmark
1038        let mut found_positions = HashSet::new();
1039
1040        // Use pulldown-cmark's streaming parser with BrokenLink callback
1041        // The callback captures undefined references: [text][undefined], [shortcut], [text][]
1042        // This automatically handles:
1043        // - Escaped links (won't generate events)
1044        // - Links in code blocks/spans (won't generate Link events)
1045        // - Images (generates Tag::Image instead)
1046        // - Reference resolution (dest_url is already resolved!)
1047        // - Broken references (callback is invoked)
1048        // - Wiki-links (enabled via ENABLE_WIKILINKS)
1049        let mut options = Options::empty();
1050        options.insert(Options::ENABLE_WIKILINKS);
1051        options.insert(Options::ENABLE_FOOTNOTES);
1052
1053        let parser = Parser::new_with_broken_link_callback(
1054            content,
1055            options,
1056            Some(|link: BrokenLink<'_>| {
1057                broken_links.push(BrokenLinkInfo {
1058                    reference: link.reference.to_string(),
1059                    span: link.span.clone(),
1060                });
1061                None
1062            }),
1063        )
1064        .into_offset_iter();
1065
1066        let mut link_stack: Vec<(
1067            usize,
1068            usize,
1069            pulldown_cmark::CowStr<'a>,
1070            LinkType,
1071            pulldown_cmark::CowStr<'a>,
1072        )> = Vec::new();
1073        let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1074
1075        for (event, range) in parser {
1076            match event {
1077                Event::Start(Tag::Link {
1078                    link_type,
1079                    dest_url,
1080                    id,
1081                    ..
1082                }) => {
1083                    // Link start - record position, URL, and reference ID
1084                    link_stack.push((range.start, range.end, dest_url, link_type, id));
1085                    text_chunks.clear();
1086                }
1087                Event::Text(text) if !link_stack.is_empty() => {
1088                    // Track text content with its byte range
1089                    text_chunks.push((text.to_string(), range.start, range.end));
1090                }
1091                Event::Code(code) if !link_stack.is_empty() => {
1092                    // Include inline code in link text (with backticks)
1093                    let code_text = format!("`{code}`");
1094                    text_chunks.push((code_text, range.start, range.end));
1095                }
1096                Event::End(TagEnd::Link) => {
1097                    if let Some((start_pos, _link_start_end, url, link_type, ref_id)) = link_stack.pop() {
1098                        // Skip if in HTML comment
1099                        if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1100                            text_chunks.clear();
1101                            continue;
1102                        }
1103
1104                        // Find line and column information
1105                        let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1106
1107                        // Skip if this link is on a MkDocs snippet line
1108                        if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1109                            text_chunks.clear();
1110                            continue;
1111                        }
1112
1113                        let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1114
1115                        let is_reference = matches!(
1116                            link_type,
1117                            LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1118                        );
1119
1120                        // Extract link text directly from source bytes to preserve escaping
1121                        // Text events from pulldown-cmark unescape \] → ], which breaks MD039
1122                        let link_text = if start_pos < content.len() {
1123                            let link_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1124
1125                            // Find MATCHING ] by tracking bracket depth for nested brackets
1126                            // An unescaped bracket is one NOT preceded by an odd number of backslashes
1127                            // Brackets inside code spans (between backticks) should be ignored
1128                            let mut close_pos = None;
1129                            let mut depth = 0;
1130                            let mut in_code_span = false;
1131
1132                            for (i, &byte) in link_bytes.iter().enumerate().skip(1) {
1133                                // Count preceding backslashes
1134                                let mut backslash_count = 0;
1135                                let mut j = i;
1136                                while j > 0 && link_bytes[j - 1] == b'\\' {
1137                                    backslash_count += 1;
1138                                    j -= 1;
1139                                }
1140                                let is_escaped = backslash_count % 2 != 0;
1141
1142                                // Track code spans - backticks toggle in/out of code
1143                                if byte == b'`' && !is_escaped {
1144                                    in_code_span = !in_code_span;
1145                                }
1146
1147                                // Only count brackets when NOT in a code span
1148                                if !is_escaped && !in_code_span {
1149                                    if byte == b'[' {
1150                                        depth += 1;
1151                                    } else if byte == b']' {
1152                                        if depth == 0 {
1153                                            // Found the matching closing bracket
1154                                            close_pos = Some(i);
1155                                            break;
1156                                        } else {
1157                                            depth -= 1;
1158                                        }
1159                                    }
1160                                }
1161                            }
1162
1163                            if let Some(pos) = close_pos {
1164                                Cow::Borrowed(std::str::from_utf8(&link_bytes[1..pos]).unwrap_or(""))
1165                            } else {
1166                                Cow::Borrowed("")
1167                            }
1168                        } else {
1169                            Cow::Borrowed("")
1170                        };
1171
1172                        // For reference links, use the actual reference ID from pulldown-cmark
1173                        let reference_id = if is_reference && !ref_id.is_empty() {
1174                            Some(Cow::Owned(ref_id.to_lowercase()))
1175                        } else if is_reference {
1176                            // For collapsed/shortcut references without explicit ID, use the link text
1177                            Some(Cow::Owned(link_text.to_lowercase()))
1178                        } else {
1179                            None
1180                        };
1181
1182                        // WORKAROUND: pulldown-cmark bug with escaped brackets
1183                        // Check for escaped image syntax: \![text](url)
1184                        // The byte_offset points to the '[', so we check 2 bytes back for '\!'
1185                        let has_escaped_bang = start_pos >= 2
1186                            && content.as_bytes().get(start_pos - 2) == Some(&b'\\')
1187                            && content.as_bytes().get(start_pos - 1) == Some(&b'!');
1188
1189                        // Check for escaped bracket: \[text](url)
1190                        // The byte_offset points to the '[', so we check 1 byte back for '\'
1191                        let has_escaped_bracket =
1192                            start_pos >= 1 && content.as_bytes().get(start_pos - 1) == Some(&b'\\');
1193
1194                        if has_escaped_bang || has_escaped_bracket {
1195                            text_chunks.clear();
1196                            continue; // Skip: this is escaped markdown, not a real link
1197                        }
1198
1199                        // Track this position as found
1200                        found_positions.insert(start_pos);
1201
1202                        links.push(ParsedLink {
1203                            line: line_num,
1204                            start_col: col_start,
1205                            end_col: col_end,
1206                            byte_offset: start_pos,
1207                            byte_end: range.end,
1208                            text: link_text,
1209                            url: Cow::Owned(url.to_string()),
1210                            is_reference,
1211                            reference_id,
1212                            link_type,
1213                        });
1214
1215                        text_chunks.clear();
1216                    }
1217                }
1218                Event::FootnoteReference(footnote_id) => {
1219                    // Capture footnote references like [^1], [^note]
1220                    // Skip if in HTML comment
1221                    if is_in_html_comment_ranges(html_comment_ranges, range.start) {
1222                        continue;
1223                    }
1224
1225                    let (_, line_num, _) = Self::find_line_for_offset(lines, range.start);
1226                    footnote_refs.push(FootnoteRef {
1227                        id: footnote_id.to_string(),
1228                        line: line_num,
1229                        byte_offset: range.start,
1230                        byte_end: range.end,
1231                    });
1232                }
1233                _ => {}
1234            }
1235        }
1236
1237        // Also find undefined references using regex
1238        // These are patterns like [text][ref] that pulldown-cmark didn't parse as links
1239        // because the reference is undefined
1240        for cap in LINK_PATTERN.captures_iter(content) {
1241            let full_match = cap.get(0).unwrap();
1242            let match_start = full_match.start();
1243            let match_end = full_match.end();
1244
1245            // Skip if this was already found by pulldown-cmark (it's a valid link)
1246            if found_positions.contains(&match_start) {
1247                continue;
1248            }
1249
1250            // Skip if escaped
1251            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1252                continue;
1253            }
1254
1255            // Skip if it's an image
1256            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
1257                continue;
1258            }
1259
1260            // Skip if in code block
1261            if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
1262                continue;
1263            }
1264
1265            // Skip if in code span
1266            if Self::is_offset_in_code_span(code_spans, match_start) {
1267                continue;
1268            }
1269
1270            // Skip if in HTML comment
1271            if is_in_html_comment_ranges(html_comment_ranges, match_start) {
1272                continue;
1273            }
1274
1275            // Find line and column information
1276            let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1277
1278            // Skip if this link is on a MkDocs snippet line
1279            if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1280                continue;
1281            }
1282
1283            let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1284
1285            let text = cap.get(1).map_or("", |m| m.as_str());
1286
1287            // Only process reference links (group 6)
1288            if let Some(ref_id) = cap.get(6) {
1289                let ref_id_str = ref_id.as_str();
1290                let normalized_ref = if ref_id_str.is_empty() {
1291                    Cow::Owned(text.to_lowercase()) // Implicit reference
1292                } else {
1293                    Cow::Owned(ref_id_str.to_lowercase())
1294                };
1295
1296                // This is an undefined reference (pulldown-cmark didn't parse it)
1297                links.push(ParsedLink {
1298                    line: line_num,
1299                    start_col: col_start,
1300                    end_col: col_end,
1301                    byte_offset: match_start,
1302                    byte_end: match_end,
1303                    text: Cow::Borrowed(text),
1304                    url: Cow::Borrowed(""), // Empty URL indicates undefined reference
1305                    is_reference: true,
1306                    reference_id: Some(normalized_ref),
1307                    link_type: LinkType::Reference, // Undefined references are reference-style
1308                });
1309            }
1310        }
1311
1312        (links, broken_links, footnote_refs)
1313    }
1314
1315    /// Parse all images in the content
1316    fn parse_images(
1317        content: &'a str,
1318        lines: &[LineInfo],
1319        code_blocks: &[(usize, usize)],
1320        code_spans: &[CodeSpan],
1321        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1322    ) -> Vec<ParsedImage<'a>> {
1323        use crate::utils::skip_context::is_in_html_comment_ranges;
1324        use std::collections::HashSet;
1325
1326        // Pre-size based on a heuristic: images are less common than links
1327        let mut images = Vec::with_capacity(content.len() / 1000);
1328        let mut found_positions = HashSet::new();
1329
1330        // Use pulldown-cmark for parsing - more accurate and faster
1331        let parser = Parser::new(content).into_offset_iter();
1332        let mut image_stack: Vec<(usize, pulldown_cmark::CowStr<'a>, LinkType, pulldown_cmark::CowStr<'a>)> =
1333            Vec::new();
1334        let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1335
1336        for (event, range) in parser {
1337            match event {
1338                Event::Start(Tag::Image {
1339                    link_type,
1340                    dest_url,
1341                    id,
1342                    ..
1343                }) => {
1344                    image_stack.push((range.start, dest_url, link_type, id));
1345                    text_chunks.clear();
1346                }
1347                Event::Text(text) if !image_stack.is_empty() => {
1348                    text_chunks.push((text.to_string(), range.start, range.end));
1349                }
1350                Event::Code(code) if !image_stack.is_empty() => {
1351                    let code_text = format!("`{code}`");
1352                    text_chunks.push((code_text, range.start, range.end));
1353                }
1354                Event::End(TagEnd::Image) => {
1355                    if let Some((start_pos, url, link_type, ref_id)) = image_stack.pop() {
1356                        // Skip if in code block
1357                        if CodeBlockUtils::is_in_code_block(code_blocks, start_pos) {
1358                            continue;
1359                        }
1360
1361                        // Skip if in code span
1362                        if Self::is_offset_in_code_span(code_spans, start_pos) {
1363                            continue;
1364                        }
1365
1366                        // Skip if in HTML comment
1367                        if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1368                            continue;
1369                        }
1370
1371                        // Find line and column using binary search
1372                        let (_, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1373                        let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1374
1375                        let is_reference = matches!(
1376                            link_type,
1377                            LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1378                        );
1379
1380                        // Extract alt text directly from source bytes to preserve escaping
1381                        // Text events from pulldown-cmark unescape \] → ], which breaks rules that need escaping
1382                        let alt_text = if start_pos < content.len() {
1383                            let image_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1384
1385                            // Find MATCHING ] by tracking bracket depth for nested brackets
1386                            // An unescaped bracket is one NOT preceded by an odd number of backslashes
1387                            let mut close_pos = None;
1388                            let mut depth = 0;
1389
1390                            if image_bytes.len() > 2 {
1391                                for (i, &byte) in image_bytes.iter().enumerate().skip(2) {
1392                                    // Count preceding backslashes
1393                                    let mut backslash_count = 0;
1394                                    let mut j = i;
1395                                    while j > 0 && image_bytes[j - 1] == b'\\' {
1396                                        backslash_count += 1;
1397                                        j -= 1;
1398                                    }
1399                                    let is_escaped = backslash_count % 2 != 0;
1400
1401                                    if !is_escaped {
1402                                        if byte == b'[' {
1403                                            depth += 1;
1404                                        } else if byte == b']' {
1405                                            if depth == 0 {
1406                                                // Found the matching closing bracket
1407                                                close_pos = Some(i);
1408                                                break;
1409                                            } else {
1410                                                depth -= 1;
1411                                            }
1412                                        }
1413                                    }
1414                                }
1415                            }
1416
1417                            if let Some(pos) = close_pos {
1418                                Cow::Borrowed(std::str::from_utf8(&image_bytes[2..pos]).unwrap_or(""))
1419                            } else {
1420                                Cow::Borrowed("")
1421                            }
1422                        } else {
1423                            Cow::Borrowed("")
1424                        };
1425
1426                        let reference_id = if is_reference && !ref_id.is_empty() {
1427                            Some(Cow::Owned(ref_id.to_lowercase()))
1428                        } else if is_reference {
1429                            Some(Cow::Owned(alt_text.to_lowercase())) // Collapsed/shortcut references
1430                        } else {
1431                            None
1432                        };
1433
1434                        found_positions.insert(start_pos);
1435                        images.push(ParsedImage {
1436                            line: line_num,
1437                            start_col: col_start,
1438                            end_col: col_end,
1439                            byte_offset: start_pos,
1440                            byte_end: range.end,
1441                            alt_text,
1442                            url: Cow::Owned(url.to_string()),
1443                            is_reference,
1444                            reference_id,
1445                            link_type,
1446                        });
1447                    }
1448                }
1449                _ => {}
1450            }
1451        }
1452
1453        // Regex fallback for undefined references that pulldown-cmark treats as plain text
1454        for cap in IMAGE_PATTERN.captures_iter(content) {
1455            let full_match = cap.get(0).unwrap();
1456            let match_start = full_match.start();
1457            let match_end = full_match.end();
1458
1459            // Skip if already found by pulldown-cmark
1460            if found_positions.contains(&match_start) {
1461                continue;
1462            }
1463
1464            // Skip if the ! is escaped
1465            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1466                continue;
1467            }
1468
1469            // Skip if in code block, code span, or HTML comment
1470            if CodeBlockUtils::is_in_code_block(code_blocks, match_start)
1471                || Self::is_offset_in_code_span(code_spans, match_start)
1472                || is_in_html_comment_ranges(html_comment_ranges, match_start)
1473            {
1474                continue;
1475            }
1476
1477            // Only process reference images (undefined references not found by pulldown-cmark)
1478            if let Some(ref_id) = cap.get(6) {
1479                let (_, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1480                let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1481                let alt_text = cap.get(1).map_or("", |m| m.as_str());
1482                let ref_id_str = ref_id.as_str();
1483                let normalized_ref = if ref_id_str.is_empty() {
1484                    Cow::Owned(alt_text.to_lowercase())
1485                } else {
1486                    Cow::Owned(ref_id_str.to_lowercase())
1487                };
1488
1489                images.push(ParsedImage {
1490                    line: line_num,
1491                    start_col: col_start,
1492                    end_col: col_end,
1493                    byte_offset: match_start,
1494                    byte_end: match_end,
1495                    alt_text: Cow::Borrowed(alt_text),
1496                    url: Cow::Borrowed(""),
1497                    is_reference: true,
1498                    reference_id: Some(normalized_ref),
1499                    link_type: LinkType::Reference, // Undefined references are reference-style
1500                });
1501            }
1502        }
1503
1504        images
1505    }
1506
1507    /// Parse reference definitions
1508    fn parse_reference_defs(content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
1509        // Pre-size based on lines count as reference definitions are line-based
1510        let mut refs = Vec::with_capacity(lines.len() / 20); // ~1 ref per 20 lines
1511
1512        for (line_idx, line_info) in lines.iter().enumerate() {
1513            // Skip lines in code blocks
1514            if line_info.in_code_block {
1515                continue;
1516            }
1517
1518            let line = line_info.content(content);
1519            let line_num = line_idx + 1;
1520
1521            if let Some(cap) = REF_DEF_PATTERN.captures(line) {
1522                let id = cap.get(1).unwrap().as_str().to_lowercase();
1523                let url = cap.get(2).unwrap().as_str().to_string();
1524                let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
1525
1526                // Calculate byte positions
1527                // The match starts at the beginning of the line (0) and extends to the end
1528                let match_obj = cap.get(0).unwrap();
1529                let byte_offset = line_info.byte_offset + match_obj.start();
1530                let byte_end = line_info.byte_offset + match_obj.end();
1531
1532                refs.push(ReferenceDef {
1533                    line: line_num,
1534                    id,
1535                    url,
1536                    title,
1537                    byte_offset,
1538                    byte_end,
1539                });
1540            }
1541        }
1542
1543        refs
1544    }
1545
1546    /// Fast blockquote prefix parser - replaces regex for 5-10x speedup
1547    /// Matches: ^(\s*>\s*)(.*)
1548    /// Returns: Some((prefix_with_ws, content_after_prefix)) or None
1549    #[inline]
1550    fn parse_blockquote_prefix(line: &str) -> Option<(&str, &str)> {
1551        let trimmed_start = line.trim_start();
1552        if !trimmed_start.starts_with('>') {
1553            return None;
1554        }
1555
1556        let leading_ws_len = line.len() - trimmed_start.len();
1557        let after_gt = &trimmed_start[1..];
1558        let content = after_gt.trim_start();
1559        let ws_after_gt_len = after_gt.len() - content.len();
1560        let prefix_len = leading_ws_len + 1 + ws_after_gt_len;
1561
1562        Some((&line[..prefix_len], content))
1563    }
1564
1565    /// Fast unordered list parser - replaces regex for 5-10x speedup
1566    /// Matches: ^(\s*)([-*+])([ \t]*)(.*)
1567    /// Returns: Some((leading_ws, marker, spacing, content)) or None
1568    #[inline]
1569    fn parse_unordered_list(line: &str) -> Option<(&str, char, &str, &str)> {
1570        let bytes = line.as_bytes();
1571        let mut i = 0;
1572
1573        // Skip leading whitespace
1574        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1575            i += 1;
1576        }
1577
1578        // Check for marker
1579        if i >= bytes.len() {
1580            return None;
1581        }
1582        let marker = bytes[i] as char;
1583        if marker != '-' && marker != '*' && marker != '+' {
1584            return None;
1585        }
1586        let marker_pos = i;
1587        i += 1;
1588
1589        // Collect spacing after marker (space or tab only)
1590        let spacing_start = i;
1591        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1592            i += 1;
1593        }
1594
1595        Some((&line[..marker_pos], marker, &line[spacing_start..i], &line[i..]))
1596    }
1597
1598    /// Fast ordered list parser - replaces regex for 5-10x speedup
1599    /// Matches: ^(\s*)(\d+)([.)])([ \t]*)(.*)
1600    /// Returns: Some((leading_ws, number_str, delimiter, spacing, content)) or None
1601    #[inline]
1602    fn parse_ordered_list(line: &str) -> Option<(&str, &str, char, &str, &str)> {
1603        let bytes = line.as_bytes();
1604        let mut i = 0;
1605
1606        // Skip leading whitespace
1607        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1608            i += 1;
1609        }
1610
1611        // Collect digits
1612        let number_start = i;
1613        while i < bytes.len() && bytes[i].is_ascii_digit() {
1614            i += 1;
1615        }
1616        if i == number_start {
1617            return None; // No digits found
1618        }
1619
1620        // Check for delimiter
1621        if i >= bytes.len() {
1622            return None;
1623        }
1624        let delimiter = bytes[i] as char;
1625        if delimiter != '.' && delimiter != ')' {
1626            return None;
1627        }
1628        let delimiter_pos = i;
1629        i += 1;
1630
1631        // Collect spacing after delimiter (space or tab only)
1632        let spacing_start = i;
1633        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1634            i += 1;
1635        }
1636
1637        Some((
1638            &line[..number_start],
1639            &line[number_start..delimiter_pos],
1640            delimiter,
1641            &line[spacing_start..i],
1642            &line[i..],
1643        ))
1644    }
1645
1646    /// Pre-compute which lines are in code blocks - O(m*n) where m=code_blocks, n=lines
1647    /// Returns a Vec<bool> where index i indicates if line i is in a code block
1648    fn compute_code_block_line_map(content: &str, line_offsets: &[usize], code_blocks: &[(usize, usize)]) -> Vec<bool> {
1649        let num_lines = line_offsets.len();
1650        let mut in_code_block = vec![false; num_lines];
1651
1652        // For each code block, mark all lines within it
1653        for &(start, end) in code_blocks {
1654            // Ensure we're at valid UTF-8 boundaries
1655            let safe_start = if start > 0 && !content.is_char_boundary(start) {
1656                let mut boundary = start;
1657                while boundary > 0 && !content.is_char_boundary(boundary) {
1658                    boundary -= 1;
1659                }
1660                boundary
1661            } else {
1662                start
1663            };
1664
1665            let safe_end = if end < content.len() && !content.is_char_boundary(end) {
1666                let mut boundary = end;
1667                while boundary < content.len() && !content.is_char_boundary(boundary) {
1668                    boundary += 1;
1669                }
1670                boundary
1671            } else {
1672                end.min(content.len())
1673            };
1674
1675            // Trust the code blocks detected by CodeBlockUtils::detect_code_blocks()
1676            // That function now has proper list context awareness (see code_block_utils.rs)
1677            // and correctly distinguishes between:
1678            // - Fenced code blocks (``` or ~~~)
1679            // - Indented code blocks at document level (4 spaces + blank line before)
1680            // - List continuation paragraphs (NOT code blocks, even with 4 spaces)
1681            //
1682            // We no longer need to re-validate here. The original validation logic
1683            // was causing false positives by marking list continuation paragraphs as
1684            // code blocks when they have 4 spaces of indentation.
1685
1686            // Use binary search to find the first and last line indices
1687            // line_offsets is sorted, so we can use partition_point for O(log n) lookup
1688            // Use safe_start/safe_end (UTF-8 boundaries) for consistent line mapping
1689            //
1690            // Find the line that CONTAINS safe_start: the line with the largest
1691            // start offset that is <= safe_start. partition_point gives us the
1692            // first line that starts AFTER safe_start, so we subtract 1.
1693            let first_line_after = line_offsets.partition_point(|&offset| offset <= safe_start);
1694            let first_line = first_line_after.saturating_sub(1);
1695            let last_line = line_offsets.partition_point(|&offset| offset < safe_end);
1696
1697            // Mark all lines in the range at once
1698            for flag in in_code_block.iter_mut().take(last_line).skip(first_line) {
1699                *flag = true;
1700            }
1701        }
1702
1703        in_code_block
1704    }
1705
1706    /// Pre-compute basic line information (without headings/blockquotes)
1707    fn compute_basic_line_info(
1708        content: &str,
1709        line_offsets: &[usize],
1710        code_blocks: &[(usize, usize)],
1711        flavor: MarkdownFlavor,
1712        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1713        autodoc_ranges: &[crate::utils::skip_context::ByteRange],
1714    ) -> Vec<LineInfo> {
1715        let content_lines: Vec<&str> = content.lines().collect();
1716        let mut lines = Vec::with_capacity(content_lines.len());
1717
1718        // Pre-compute which lines are in code blocks
1719        let code_block_map = Self::compute_code_block_line_map(content, line_offsets, code_blocks);
1720
1721        // Detect front matter boundaries FIRST, before any other parsing
1722        // Use FrontMatterUtils to detect all types of front matter (YAML, TOML, JSON, malformed)
1723        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1724
1725        for (i, line) in content_lines.iter().enumerate() {
1726            let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
1727            let indent = line.len() - line.trim_start().len();
1728
1729            // Parse blockquote prefix once and reuse it (avoid redundant parsing)
1730            let blockquote_parse = Self::parse_blockquote_prefix(line);
1731
1732            // For blank detection, consider blockquote context
1733            let is_blank = if let Some((_, content)) = blockquote_parse {
1734                // In blockquote context, check if content after prefix is blank
1735                content.trim().is_empty()
1736            } else {
1737                line.trim().is_empty()
1738            };
1739
1740            // Use pre-computed map for O(1) lookup instead of O(m) iteration
1741            let in_code_block = code_block_map.get(i).copied().unwrap_or(false);
1742
1743            // Detect list items (skip if in frontmatter, in mkdocstrings block, or in HTML comment)
1744            let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
1745                && crate::utils::mkdocstrings_refs::is_within_autodoc_block_ranges(autodoc_ranges, byte_offset);
1746            // Use pre-computed ranges for efficiency (O(log n) vs O(file_size))
1747            let in_html_comment =
1748                crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, byte_offset);
1749            let list_item = if !(in_code_block
1750                || is_blank
1751                || in_mkdocstrings
1752                || in_html_comment
1753                || (front_matter_end > 0 && i < front_matter_end))
1754            {
1755                // Strip blockquote prefix if present for list detection (reuse cached result)
1756                let (line_for_list_check, blockquote_prefix_len) = if let Some((prefix, content)) = blockquote_parse {
1757                    (content, prefix.len())
1758                } else {
1759                    (&**line, 0)
1760                };
1761
1762                if let Some((leading_spaces, marker, spacing, _content)) =
1763                    Self::parse_unordered_list(line_for_list_check)
1764                {
1765                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1766                    let content_column = marker_column + 1 + spacing.len();
1767
1768                    // According to CommonMark spec, unordered list items MUST have at least one space
1769                    // after the marker (-, *, or +). Without a space, it's not a list item.
1770                    // This also naturally handles cases like:
1771                    // - *emphasis* (not a list)
1772                    // - **bold** (not a list)
1773                    // - --- (horizontal rule, not a list)
1774                    if spacing.is_empty() {
1775                        None
1776                    } else {
1777                        Some(ListItemInfo {
1778                            marker: marker.to_string(),
1779                            is_ordered: false,
1780                            number: None,
1781                            marker_column,
1782                            content_column,
1783                        })
1784                    }
1785                } else if let Some((leading_spaces, number_str, delimiter, spacing, _content)) =
1786                    Self::parse_ordered_list(line_for_list_check)
1787                {
1788                    let marker = format!("{number_str}{delimiter}");
1789                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1790                    let content_column = marker_column + marker.len() + spacing.len();
1791
1792                    // According to CommonMark spec, ordered list items MUST have at least one space
1793                    // after the marker (period or parenthesis). Without a space, it's not a list item.
1794                    if spacing.is_empty() {
1795                        None
1796                    } else {
1797                        Some(ListItemInfo {
1798                            marker,
1799                            is_ordered: true,
1800                            number: number_str.parse().ok(),
1801                            marker_column,
1802                            content_column,
1803                        })
1804                    }
1805                } else {
1806                    None
1807                }
1808            } else {
1809                None
1810            };
1811
1812            lines.push(LineInfo {
1813                byte_offset,
1814                byte_len: line.len(),
1815                indent,
1816                is_blank,
1817                in_code_block,
1818                in_front_matter: front_matter_end > 0 && i < front_matter_end,
1819                in_html_block: false, // Will be populated after line creation
1820                in_html_comment,
1821                list_item,
1822                heading: None,    // Will be populated in second pass for Setext headings
1823                blockquote: None, // Will be populated after line creation
1824                in_mkdocstrings,
1825                in_esm_block: false, // Will be populated after line creation for MDX files
1826                in_code_span_continuation: false, // Will be populated after code spans are parsed
1827            });
1828        }
1829
1830        lines
1831    }
1832
1833    /// Detect headings and blockquotes (called after HTML block detection)
1834    fn detect_headings_and_blockquotes(
1835        content: &str,
1836        lines: &mut [LineInfo],
1837        flavor: MarkdownFlavor,
1838        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1839    ) {
1840        // Regex for heading detection
1841        static ATX_HEADING_REGEX: LazyLock<regex::Regex> =
1842            LazyLock::new(|| regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap());
1843        static SETEXT_UNDERLINE_REGEX: LazyLock<regex::Regex> =
1844            LazyLock::new(|| regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap());
1845
1846        let content_lines: Vec<&str> = content.lines().collect();
1847
1848        // Detect front matter boundaries to skip those lines
1849        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1850
1851        // Detect headings (including Setext which needs look-ahead) and blockquotes
1852        for i in 0..lines.len() {
1853            if lines[i].in_code_block {
1854                continue;
1855            }
1856
1857            // Skip lines in front matter
1858            if front_matter_end > 0 && i < front_matter_end {
1859                continue;
1860            }
1861
1862            // Skip lines in HTML blocks - HTML content should not be parsed as markdown
1863            if lines[i].in_html_block {
1864                continue;
1865            }
1866
1867            let line = content_lines[i];
1868
1869            // Check for blockquotes (even on blank lines within blockquotes)
1870            if let Some(bq) = parse_blockquote_detailed(line) {
1871                let nesting_level = bq.markers.len(); // Each '>' is one level
1872                let marker_column = bq.indent.len();
1873
1874                // Build the prefix (indentation + markers + space)
1875                let prefix = format!("{}{}{}", bq.indent, bq.markers, bq.spaces_after);
1876
1877                // Check for various blockquote issues
1878                let has_no_space = bq.spaces_after.is_empty() && !bq.content.is_empty();
1879                // Only flag multiple literal spaces, not tabs
1880                // Tabs are handled by MD010 (no-hard-tabs), matching markdownlint behavior
1881                let has_multiple_spaces = bq.spaces_after.chars().filter(|&c| c == ' ').count() > 1;
1882
1883                // Check if needs MD028 fix (empty blockquote line without proper spacing)
1884                // MD028 flags empty blockquote lines that don't have a single space after the marker
1885                // Lines like "> " or ">> " are already correct and don't need fixing
1886                let needs_md028_fix = bq.content.is_empty() && bq.spaces_after.is_empty();
1887
1888                lines[i].blockquote = Some(BlockquoteInfo {
1889                    nesting_level,
1890                    indent: bq.indent.to_string(),
1891                    marker_column,
1892                    prefix,
1893                    content: bq.content.to_string(),
1894                    has_no_space_after_marker: has_no_space,
1895                    has_multiple_spaces_after_marker: has_multiple_spaces,
1896                    needs_md028_fix,
1897                });
1898            }
1899
1900            // Skip heading detection for blank lines
1901            if lines[i].is_blank {
1902                continue;
1903            }
1904
1905            // Check for ATX headings (but skip MkDocs snippet lines)
1906            // In MkDocs flavor, lines like "# -8<- [start:name]" are snippet markers, not headings
1907            let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
1908                crate::utils::mkdocs_snippets::is_snippet_section_start(line)
1909                    || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
1910            } else {
1911                false
1912            };
1913
1914            if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1915                // Skip headings inside HTML comments (using pre-computed ranges for efficiency)
1916                if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset) {
1917                    continue;
1918                }
1919                let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1920                let hashes = caps.get(2).map_or("", |m| m.as_str());
1921                let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1922                let rest = caps.get(4).map_or("", |m| m.as_str());
1923
1924                let level = hashes.len() as u8;
1925                let marker_column = leading_spaces.len();
1926
1927                // Check for closing sequence, but handle custom IDs that might come after
1928                let (text, has_closing, closing_seq) = {
1929                    // First check if there's a custom ID at the end
1930                    let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1931                        // Check if this looks like a valid custom ID (ends with })
1932                        if rest[id_start..].trim_end().ends_with('}') {
1933                            // Split off the custom ID
1934                            (&rest[..id_start], &rest[id_start..])
1935                        } else {
1936                            (rest, "")
1937                        }
1938                    } else {
1939                        (rest, "")
1940                    };
1941
1942                    // Now look for closing hashes in the part before the custom ID
1943                    let trimmed_rest = rest_without_id.trim_end();
1944                    if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1945                        // Look for the start of the hash sequence
1946                        let mut start_of_hashes = last_hash_pos;
1947                        while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1948                            start_of_hashes -= 1;
1949                        }
1950
1951                        // Check if there's at least one space before the closing hashes
1952                        let has_space_before = start_of_hashes == 0
1953                            || trimmed_rest
1954                                .chars()
1955                                .nth(start_of_hashes - 1)
1956                                .is_some_and(|c| c.is_whitespace());
1957
1958                        // Check if this is a valid closing sequence (all hashes to end of trimmed part)
1959                        let potential_closing = &trimmed_rest[start_of_hashes..];
1960                        let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1961
1962                        if is_all_hashes && has_space_before {
1963                            // This is a closing sequence
1964                            let closing_hashes = potential_closing.to_string();
1965                            // The text is everything before the closing hashes
1966                            // Don't include the custom ID here - it will be extracted later
1967                            let text_part = if !custom_id_part.is_empty() {
1968                                // If we have a custom ID, append it back to get the full rest
1969                                // This allows the extract_header_id function to handle it properly
1970                                format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1971                            } else {
1972                                rest_without_id[..start_of_hashes].trim_end().to_string()
1973                            };
1974                            (text_part, true, closing_hashes)
1975                        } else {
1976                            // Not a valid closing sequence, return the full content
1977                            (rest.to_string(), false, String::new())
1978                        }
1979                    } else {
1980                        // No hashes found, return the full content
1981                        (rest.to_string(), false, String::new())
1982                    }
1983                };
1984
1985                let content_column = marker_column + hashes.len() + spaces_after.len();
1986
1987                // Extract custom header ID if present
1988                let raw_text = text.trim().to_string();
1989                let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1990
1991                // If no custom ID was found on the header line, check the next line for standalone attr-list
1992                if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1993                    let next_line = content_lines[i + 1];
1994                    if !lines[i + 1].in_code_block
1995                        && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1996                        && let Some(next_line_id) =
1997                            crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1998                    {
1999                        custom_id = Some(next_line_id);
2000                    }
2001                }
2002
2003                lines[i].heading = Some(HeadingInfo {
2004                    level,
2005                    style: HeadingStyle::ATX,
2006                    marker: hashes.to_string(),
2007                    marker_column,
2008                    content_column,
2009                    text: clean_text,
2010                    custom_id,
2011                    raw_text,
2012                    has_closing_sequence: has_closing,
2013                    closing_sequence: closing_seq,
2014                });
2015            }
2016            // Check for Setext headings (need to look at next line)
2017            else if i + 1 < content_lines.len() && i + 1 < lines.len() {
2018                let next_line = content_lines[i + 1];
2019                if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
2020                    // Skip if next line is front matter delimiter
2021                    if front_matter_end > 0 && i < front_matter_end {
2022                        continue;
2023                    }
2024
2025                    // Skip Setext headings inside HTML comments (using pre-computed ranges for efficiency)
2026                    if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset)
2027                    {
2028                        continue;
2029                    }
2030
2031                    let underline = next_line.trim();
2032
2033                    let level = if underline.starts_with('=') { 1 } else { 2 };
2034                    let style = if level == 1 {
2035                        HeadingStyle::Setext1
2036                    } else {
2037                        HeadingStyle::Setext2
2038                    };
2039
2040                    // Extract custom header ID if present
2041                    let raw_text = line.trim().to_string();
2042                    let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
2043
2044                    // If no custom ID was found on the header line, check the line after underline for standalone attr-list
2045                    if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
2046                        let attr_line = content_lines[i + 2];
2047                        if !lines[i + 2].in_code_block
2048                            && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
2049                            && let Some(attr_line_id) =
2050                                crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
2051                        {
2052                            custom_id = Some(attr_line_id);
2053                        }
2054                    }
2055
2056                    lines[i].heading = Some(HeadingInfo {
2057                        level,
2058                        style,
2059                        marker: underline.to_string(),
2060                        marker_column: next_line.len() - next_line.trim_start().len(),
2061                        content_column: lines[i].indent,
2062                        text: clean_text,
2063                        custom_id,
2064                        raw_text,
2065                        has_closing_sequence: false,
2066                        closing_sequence: String::new(),
2067                    });
2068                }
2069            }
2070        }
2071    }
2072
2073    /// Detect HTML blocks in the content
2074    fn detect_html_blocks(content: &str, lines: &mut [LineInfo]) {
2075        // HTML block elements that trigger block context
2076        const BLOCK_ELEMENTS: &[&str] = &[
2077            "address",
2078            "article",
2079            "aside",
2080            "blockquote",
2081            "details",
2082            "dialog",
2083            "dd",
2084            "div",
2085            "dl",
2086            "dt",
2087            "fieldset",
2088            "figcaption",
2089            "figure",
2090            "footer",
2091            "form",
2092            "h1",
2093            "h2",
2094            "h3",
2095            "h4",
2096            "h5",
2097            "h6",
2098            "header",
2099            "hr",
2100            "li",
2101            "main",
2102            "nav",
2103            "ol",
2104            "p",
2105            "picture",
2106            "pre",
2107            "script",
2108            "section",
2109            "style",
2110            "table",
2111            "tbody",
2112            "td",
2113            "textarea",
2114            "tfoot",
2115            "th",
2116            "thead",
2117            "tr",
2118            "ul",
2119        ];
2120
2121        let mut i = 0;
2122        while i < lines.len() {
2123            // Skip if already in code block or front matter
2124            if lines[i].in_code_block || lines[i].in_front_matter {
2125                i += 1;
2126                continue;
2127            }
2128
2129            let trimmed = lines[i].content(content).trim_start();
2130
2131            // Check if line starts with an HTML tag
2132            if trimmed.starts_with('<') && trimmed.len() > 1 {
2133                // Extract tag name safely
2134                let after_bracket = &trimmed[1..];
2135                let is_closing = after_bracket.starts_with('/');
2136                let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
2137
2138                // Extract tag name (stop at space, >, /, or end of string)
2139                let tag_name = tag_start
2140                    .chars()
2141                    .take_while(|c| c.is_ascii_alphabetic() || *c == '-' || c.is_ascii_digit())
2142                    .collect::<String>()
2143                    .to_lowercase();
2144
2145                // Check if it's a block element
2146                if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
2147                    // Mark this line as in HTML block
2148                    lines[i].in_html_block = true;
2149
2150                    // For simplicity, just mark lines until we find a closing tag or reach a blank line
2151                    // This avoids complex nesting logic that might cause infinite loops
2152                    if !is_closing {
2153                        let closing_tag = format!("</{tag_name}>");
2154                        // style and script tags can contain blank lines (CSS/JS formatting)
2155                        let allow_blank_lines = tag_name == "style" || tag_name == "script";
2156                        let mut j = i + 1;
2157                        while j < lines.len() && j < i + 100 {
2158                            // Limit search to 100 lines
2159                            // Stop at blank lines (except for style/script tags)
2160                            if !allow_blank_lines && lines[j].is_blank {
2161                                break;
2162                            }
2163
2164                            lines[j].in_html_block = true;
2165
2166                            // Check if this line contains the closing tag
2167                            if lines[j].content(content).contains(&closing_tag) {
2168                                break;
2169                            }
2170                            j += 1;
2171                        }
2172                    }
2173                }
2174            }
2175
2176            i += 1;
2177        }
2178    }
2179
2180    /// Detect ESM import/export blocks in MDX files
2181    /// ESM blocks consist of contiguous import/export statements at the top of the file
2182    fn detect_esm_blocks(content: &str, lines: &mut [LineInfo], flavor: MarkdownFlavor) {
2183        // Only process MDX files
2184        if !flavor.supports_esm_blocks() {
2185            return;
2186        }
2187
2188        for line in lines.iter_mut() {
2189            // Skip blank lines and comments at the start
2190            if line.is_blank || line.in_html_comment {
2191                continue;
2192            }
2193
2194            // Check if line starts with import or export
2195            let trimmed = line.content(content).trim_start();
2196            if trimmed.starts_with("import ") || trimmed.starts_with("export ") {
2197                line.in_esm_block = true;
2198            } else {
2199                // Once we hit a non-ESM line, we're done with the ESM block
2200                break;
2201            }
2202        }
2203    }
2204
2205    /// Parse all inline code spans in the content using pulldown-cmark streaming parser
2206    fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
2207        let mut code_spans = Vec::new();
2208
2209        // Quick check - if no backticks, no code spans
2210        if !content.contains('`') {
2211            return code_spans;
2212        }
2213
2214        // Use pulldown-cmark's streaming parser with byte offsets
2215        let parser = Parser::new(content).into_offset_iter();
2216
2217        for (event, range) in parser {
2218            if let Event::Code(_) = event {
2219                let start_pos = range.start;
2220                let end_pos = range.end;
2221
2222                // The range includes the backticks, extract the actual content
2223                let full_span = &content[start_pos..end_pos];
2224                let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
2225
2226                // Extract content between backticks, preserving spaces
2227                let content_start = start_pos + backtick_count;
2228                let content_end = end_pos - backtick_count;
2229                let span_content = if content_start < content_end {
2230                    content[content_start..content_end].to_string()
2231                } else {
2232                    String::new()
2233                };
2234
2235                // Use binary search to find line number - O(log n) instead of O(n)
2236                // Find the rightmost line whose byte_offset <= start_pos
2237                let line_idx = lines
2238                    .partition_point(|line| line.byte_offset <= start_pos)
2239                    .saturating_sub(1);
2240                let line_num = line_idx + 1;
2241                let byte_col_start = start_pos - lines[line_idx].byte_offset;
2242
2243                // Find end column using binary search
2244                let end_line_idx = lines
2245                    .partition_point(|line| line.byte_offset <= end_pos)
2246                    .saturating_sub(1);
2247                let byte_col_end = end_pos - lines[end_line_idx].byte_offset;
2248
2249                // Convert byte offsets to character positions for correct Unicode handling
2250                // This ensures consistency with warning.column which uses character positions
2251                let line_content = lines[line_idx].content(content);
2252                let col_start = if byte_col_start <= line_content.len() {
2253                    line_content[..byte_col_start].chars().count()
2254                } else {
2255                    line_content.chars().count()
2256                };
2257
2258                let end_line_content = lines[end_line_idx].content(content);
2259                let col_end = if byte_col_end <= end_line_content.len() {
2260                    end_line_content[..byte_col_end].chars().count()
2261                } else {
2262                    end_line_content.chars().count()
2263                };
2264
2265                code_spans.push(CodeSpan {
2266                    line: line_num,
2267                    end_line: end_line_idx + 1,
2268                    start_col: col_start,
2269                    end_col: col_end,
2270                    byte_offset: start_pos,
2271                    byte_end: end_pos,
2272                    backtick_count,
2273                    content: span_content,
2274                });
2275            }
2276        }
2277
2278        // Sort by position to ensure consistent ordering
2279        code_spans.sort_by_key(|span| span.byte_offset);
2280
2281        code_spans
2282    }
2283
2284    /// Parse all list blocks in the content (legacy line-by-line approach)
2285    ///
2286    /// Uses a forward-scanning O(n) algorithm that tracks two variables during iteration:
2287    /// - `has_list_breaking_content_since_last_item`: Set when encountering content that
2288    ///   terminates a list (headings, horizontal rules, tables, insufficiently indented content)
2289    /// - `min_continuation_for_tracking`: Minimum indentation required for content to be
2290    ///   treated as list continuation (based on the list marker width)
2291    ///
2292    /// When a new list item is encountered, we check if list-breaking content was seen
2293    /// since the last item. If so, we start a new list block.
2294    fn parse_list_blocks(content: &str, lines: &[LineInfo]) -> Vec<ListBlock> {
2295        // Minimum indentation for unordered list continuation per CommonMark spec
2296        const UNORDERED_LIST_MIN_CONTINUATION_INDENT: usize = 2;
2297
2298        /// Initialize or reset the forward-scanning tracking state.
2299        /// This helper eliminates code duplication across three initialization sites.
2300        #[inline]
2301        fn reset_tracking_state(
2302            list_item: &ListItemInfo,
2303            has_list_breaking_content: &mut bool,
2304            min_continuation: &mut usize,
2305        ) {
2306            *has_list_breaking_content = false;
2307            let marker_width = if list_item.is_ordered {
2308                list_item.marker.len() + 1 // Ordered markers need space after period/paren
2309            } else {
2310                list_item.marker.len()
2311            };
2312            *min_continuation = if list_item.is_ordered {
2313                marker_width
2314            } else {
2315                UNORDERED_LIST_MIN_CONTINUATION_INDENT
2316            };
2317        }
2318
2319        // Pre-size based on lines that could be list items
2320        let mut list_blocks = Vec::with_capacity(lines.len() / 10); // Estimate ~10% of lines might start list blocks
2321        let mut current_block: Option<ListBlock> = None;
2322        let mut last_list_item_line = 0;
2323        let mut current_indent_level = 0;
2324        let mut last_marker_width = 0;
2325
2326        // Track list-breaking content since last item (fixes O(n²) bottleneck from issue #148)
2327        let mut has_list_breaking_content_since_last_item = false;
2328        let mut min_continuation_for_tracking = 0;
2329
2330        for (line_idx, line_info) in lines.iter().enumerate() {
2331            let line_num = line_idx + 1;
2332
2333            // Enhanced code block handling using Design #3's context analysis
2334            if line_info.in_code_block {
2335                if let Some(ref mut block) = current_block {
2336                    // Calculate minimum indentation for list continuation
2337                    let min_continuation_indent =
2338                        CodeBlockUtils::calculate_min_continuation_indent(content, lines, line_idx);
2339
2340                    // Analyze code block context using the three-tier classification
2341                    let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
2342
2343                    match context {
2344                        CodeBlockContext::Indented => {
2345                            // Code block is properly indented - continues the list
2346                            block.end_line = line_num;
2347                            continue;
2348                        }
2349                        CodeBlockContext::Standalone => {
2350                            // Code block separates lists - end current block
2351                            let completed_block = current_block.take().unwrap();
2352                            list_blocks.push(completed_block);
2353                            continue;
2354                        }
2355                        CodeBlockContext::Adjacent => {
2356                            // Edge case - use conservative behavior (continue list)
2357                            block.end_line = line_num;
2358                            continue;
2359                        }
2360                    }
2361                } else {
2362                    // No current list block - skip code block lines
2363                    continue;
2364                }
2365            }
2366
2367            // Extract blockquote prefix if any
2368            let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(line_info.content(content)) {
2369                caps.get(0).unwrap().as_str().to_string()
2370            } else {
2371                String::new()
2372            };
2373
2374            // Track list-breaking content for non-list, non-blank lines (O(n) replacement for nested loop)
2375            // Skip lines that are continuations of multi-line code spans - they're part of the previous list item
2376            if current_block.is_some()
2377                && line_info.list_item.is_none()
2378                && !line_info.is_blank
2379                && !line_info.in_code_span_continuation
2380            {
2381                let line_content = line_info.content(content).trim();
2382
2383                // Check for structural separators that break lists
2384                // Note: Lazy continuation (indent=0) is valid in CommonMark and should NOT break lists.
2385                // Only lines with indent between 1 and min_continuation_for_tracking-1 break lists,
2386                // as they indicate improper indentation rather than lazy continuation.
2387                let is_lazy_continuation = line_info.indent == 0 && !line_info.is_blank;
2388                let breaks_list = line_info.heading.is_some()
2389                    || line_content.starts_with("---")
2390                    || line_content.starts_with("***")
2391                    || line_content.starts_with("___")
2392                    || crate::utils::skip_context::is_table_line(line_content)
2393                    || line_content.starts_with(">")
2394                    || (line_info.indent > 0
2395                        && line_info.indent < min_continuation_for_tracking
2396                        && !is_lazy_continuation);
2397
2398                if breaks_list {
2399                    has_list_breaking_content_since_last_item = true;
2400                }
2401            }
2402
2403            // If this line is a code span continuation within an active list block,
2404            // extend the block's end_line to include this line (maintains list continuity)
2405            if line_info.in_code_span_continuation
2406                && line_info.list_item.is_none()
2407                && let Some(ref mut block) = current_block
2408            {
2409                block.end_line = line_num;
2410            }
2411
2412            // Extend block.end_line for regular continuation lines (non-list-item, non-blank,
2413            // properly indented lines within the list). This ensures the workaround at line 2448
2414            // works correctly when there are multiple continuation lines before a nested list item.
2415            // Also include lazy continuation lines (indent=0) per CommonMark spec.
2416            let is_valid_continuation =
2417                line_info.indent >= min_continuation_for_tracking || (line_info.indent == 0 && !line_info.is_blank); // Lazy continuation
2418            if !line_info.in_code_span_continuation
2419                && line_info.list_item.is_none()
2420                && !line_info.is_blank
2421                && !line_info.in_code_block
2422                && is_valid_continuation
2423                && let Some(ref mut block) = current_block
2424            {
2425                block.end_line = line_num;
2426            }
2427
2428            // Check if this line is a list item
2429            if let Some(list_item) = &line_info.list_item {
2430                // Calculate nesting level based on indentation
2431                let item_indent = list_item.marker_column;
2432                let nesting = item_indent / 2; // Assume 2-space indentation for nesting
2433
2434                if let Some(ref mut block) = current_block {
2435                    // Check if this continues the current block
2436                    // For nested lists, we need to check if this is a nested item (higher nesting level)
2437                    // or a continuation at the same or lower level
2438                    let is_nested = nesting > block.nesting_level;
2439                    let same_type =
2440                        (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
2441                    let same_context = block.blockquote_prefix == blockquote_prefix;
2442                    // Allow one blank line after last item, or lines immediately after block content
2443                    let reasonable_distance = line_num <= last_list_item_line + 2 || line_num == block.end_line + 1;
2444
2445                    // For unordered lists, also check marker consistency
2446                    let marker_compatible =
2447                        block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
2448
2449                    // O(1) check: Use the tracked variable instead of O(n) nested loop
2450                    // This eliminates the quadratic bottleneck from issue #148
2451                    let has_non_list_content = has_list_breaking_content_since_last_item;
2452
2453                    // A list continues if:
2454                    // 1. It's a nested item (indented more than the parent), OR
2455                    // 2. It's the same type at the same level with reasonable distance
2456                    let mut continues_list = if is_nested {
2457                        // Nested items always continue the list if they're in the same context
2458                        same_context && reasonable_distance && !has_non_list_content
2459                    } else {
2460                        // Same-level items need to match type and markers
2461                        same_type && same_context && reasonable_distance && marker_compatible && !has_non_list_content
2462                    };
2463
2464                    // WORKAROUND: If items are truly consecutive (no blank lines), they MUST be in the same list
2465                    // This handles edge cases where content patterns might otherwise split lists incorrectly
2466                    if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
2467                        // Check if the previous line was a list item or a continuation of a list item
2468                        // (including lazy continuation lines)
2469                        if block.item_lines.contains(&(line_num - 1)) {
2470                            // They're consecutive list items - force them to be in the same list
2471                            continues_list = true;
2472                        } else {
2473                            // Previous line is a continuation line within this block
2474                            // (e.g., lazy continuation with indent=0)
2475                            // Since block.end_line == line_num - 1, we know line_num - 1 is part of this block
2476                            continues_list = true;
2477                        }
2478                    }
2479
2480                    if continues_list {
2481                        // Extend current block
2482                        block.end_line = line_num;
2483                        block.item_lines.push(line_num);
2484
2485                        // Update max marker width
2486                        block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
2487                            list_item.marker.len() + 1
2488                        } else {
2489                            list_item.marker.len()
2490                        });
2491
2492                        // Update marker consistency for unordered lists
2493                        if !block.is_ordered
2494                            && block.marker.is_some()
2495                            && block.marker.as_ref() != Some(&list_item.marker)
2496                        {
2497                            // Mixed markers, clear the marker field
2498                            block.marker = None;
2499                        }
2500
2501                        // Reset tracked state for issue #148 optimization
2502                        reset_tracking_state(
2503                            list_item,
2504                            &mut has_list_breaking_content_since_last_item,
2505                            &mut min_continuation_for_tracking,
2506                        );
2507                    } else {
2508                        // End current block and start a new one
2509
2510                        list_blocks.push(block.clone());
2511
2512                        *block = ListBlock {
2513                            start_line: line_num,
2514                            end_line: line_num,
2515                            is_ordered: list_item.is_ordered,
2516                            marker: if list_item.is_ordered {
2517                                None
2518                            } else {
2519                                Some(list_item.marker.clone())
2520                            },
2521                            blockquote_prefix: blockquote_prefix.clone(),
2522                            item_lines: vec![line_num],
2523                            nesting_level: nesting,
2524                            max_marker_width: if list_item.is_ordered {
2525                                list_item.marker.len() + 1
2526                            } else {
2527                                list_item.marker.len()
2528                            },
2529                        };
2530
2531                        // Initialize tracked state for new block (issue #148 optimization)
2532                        reset_tracking_state(
2533                            list_item,
2534                            &mut has_list_breaking_content_since_last_item,
2535                            &mut min_continuation_for_tracking,
2536                        );
2537                    }
2538                } else {
2539                    // Start a new block
2540                    current_block = Some(ListBlock {
2541                        start_line: line_num,
2542                        end_line: line_num,
2543                        is_ordered: list_item.is_ordered,
2544                        marker: if list_item.is_ordered {
2545                            None
2546                        } else {
2547                            Some(list_item.marker.clone())
2548                        },
2549                        blockquote_prefix,
2550                        item_lines: vec![line_num],
2551                        nesting_level: nesting,
2552                        max_marker_width: list_item.marker.len(),
2553                    });
2554
2555                    // Initialize tracked state for new block (issue #148 optimization)
2556                    reset_tracking_state(
2557                        list_item,
2558                        &mut has_list_breaking_content_since_last_item,
2559                        &mut min_continuation_for_tracking,
2560                    );
2561                }
2562
2563                last_list_item_line = line_num;
2564                current_indent_level = item_indent;
2565                last_marker_width = if list_item.is_ordered {
2566                    list_item.marker.len() + 1 // Add 1 for the space after ordered list markers
2567                } else {
2568                    list_item.marker.len()
2569                };
2570            } else if let Some(ref mut block) = current_block {
2571                // Not a list item - check if it continues the current block
2572
2573                // For MD032 compatibility, we use a simple approach:
2574                // - Indented lines continue the list
2575                // - Blank lines followed by indented content continue the list
2576                // - Everything else ends the list
2577
2578                // Check if the last line in the list block ended with a backslash (hard line break)
2579                // This handles cases where list items use backslash for hard line breaks
2580                let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
2581                    lines[block.end_line - 1].content(content).trim_end().ends_with('\\')
2582                } else {
2583                    false
2584                };
2585
2586                // Calculate minimum indentation for list continuation
2587                // For ordered lists, use the last marker width (e.g., 3 for "1. ", 4 for "10. ")
2588                // For unordered lists like "- ", content starts at column 2, so continuations need at least 2 spaces
2589                let min_continuation_indent = if block.is_ordered {
2590                    current_indent_level + last_marker_width
2591                } else {
2592                    current_indent_level + 2 // Unordered lists need at least 2 spaces (e.g., "- " = 2 chars)
2593                };
2594
2595                if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
2596                    // Indented line or backslash continuation continues the list
2597                    block.end_line = line_num;
2598                } else if line_info.is_blank {
2599                    // Blank line - check if it's internal to the list or ending it
2600                    // We only include blank lines that are followed by more list content
2601                    let mut check_idx = line_idx + 1;
2602                    let mut found_continuation = false;
2603
2604                    // Skip additional blank lines
2605                    while check_idx < lines.len() && lines[check_idx].is_blank {
2606                        check_idx += 1;
2607                    }
2608
2609                    if check_idx < lines.len() {
2610                        let next_line = &lines[check_idx];
2611                        // Check if followed by indented content (list continuation)
2612                        if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
2613                            found_continuation = true;
2614                        }
2615                        // Check if followed by another list item at the same level
2616                        else if !next_line.in_code_block
2617                            && next_line.list_item.is_some()
2618                            && let Some(item) = &next_line.list_item
2619                        {
2620                            let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
2621                                .find(next_line.content(content))
2622                                .map_or(String::new(), |m| m.as_str().to_string());
2623                            if item.marker_column == current_indent_level
2624                                && item.is_ordered == block.is_ordered
2625                                && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
2626                            {
2627                                // Check if there was meaningful content between the list items (unused now)
2628                                // This variable is kept for potential future use but is currently replaced by has_structural_separators
2629                                let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
2630                                    if let Some(between_line) = lines.get(idx) {
2631                                        let between_content = between_line.content(content);
2632                                        let trimmed = between_content.trim();
2633                                        // Skip empty lines
2634                                        if trimmed.is_empty() {
2635                                            return false;
2636                                        }
2637                                        // Check for meaningful content
2638                                        let line_indent = between_content.len() - between_content.trim_start().len();
2639
2640                                        // Structural separators (code fences, headings, etc.) are meaningful and should BREAK lists
2641                                        if trimmed.starts_with("```")
2642                                            || trimmed.starts_with("~~~")
2643                                            || trimmed.starts_with("---")
2644                                            || trimmed.starts_with("***")
2645                                            || trimmed.starts_with("___")
2646                                            || trimmed.starts_with(">")
2647                                            || crate::utils::skip_context::is_table_line(trimmed)
2648                                            || between_line.heading.is_some()
2649                                        {
2650                                            return true; // These are structural separators - meaningful content that breaks lists
2651                                        }
2652
2653                                        // Only properly indented content continues the list
2654                                        line_indent >= min_continuation_indent
2655                                    } else {
2656                                        false
2657                                    }
2658                                });
2659
2660                                if block.is_ordered {
2661                                    // For ordered lists: don't continue if there are structural separators
2662                                    // Check if there are structural separators between the list items
2663                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2664                                        if let Some(between_line) = lines.get(idx) {
2665                                            let trimmed = between_line.content(content).trim();
2666                                            if trimmed.is_empty() {
2667                                                return false;
2668                                            }
2669                                            // Check for structural separators that break lists
2670                                            trimmed.starts_with("```")
2671                                                || trimmed.starts_with("~~~")
2672                                                || trimmed.starts_with("---")
2673                                                || trimmed.starts_with("***")
2674                                                || trimmed.starts_with("___")
2675                                                || trimmed.starts_with(">")
2676                                                || crate::utils::skip_context::is_table_line(trimmed)
2677                                                || between_line.heading.is_some()
2678                                        } else {
2679                                            false
2680                                        }
2681                                    });
2682                                    found_continuation = !has_structural_separators;
2683                                } else {
2684                                    // For unordered lists: also check for structural separators
2685                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2686                                        if let Some(between_line) = lines.get(idx) {
2687                                            let trimmed = between_line.content(content).trim();
2688                                            if trimmed.is_empty() {
2689                                                return false;
2690                                            }
2691                                            // Check for structural separators that break lists
2692                                            trimmed.starts_with("```")
2693                                                || trimmed.starts_with("~~~")
2694                                                || trimmed.starts_with("---")
2695                                                || trimmed.starts_with("***")
2696                                                || trimmed.starts_with("___")
2697                                                || trimmed.starts_with(">")
2698                                                || crate::utils::skip_context::is_table_line(trimmed)
2699                                                || between_line.heading.is_some()
2700                                        } else {
2701                                            false
2702                                        }
2703                                    });
2704                                    found_continuation = !has_structural_separators;
2705                                }
2706                            }
2707                        }
2708                    }
2709
2710                    if found_continuation {
2711                        // Include the blank line in the block
2712                        block.end_line = line_num;
2713                    } else {
2714                        // Blank line ends the list - don't include it
2715                        list_blocks.push(block.clone());
2716                        current_block = None;
2717                    }
2718                } else {
2719                    // Check for lazy continuation - non-indented line immediately after a list item
2720                    // But only if the line has sufficient indentation for the list type
2721                    let min_required_indent = if block.is_ordered {
2722                        current_indent_level + last_marker_width
2723                    } else {
2724                        current_indent_level + 2
2725                    };
2726
2727                    // For lazy continuation to apply, the line must either:
2728                    // 1. Have no indentation (true lazy continuation)
2729                    // 2. Have sufficient indentation for the list type
2730                    // BUT structural separators (headings, code blocks, etc.) should never be lazy continuations
2731                    let line_content = line_info.content(content).trim();
2732
2733                    // Check for table-like patterns
2734                    let looks_like_table = crate::utils::skip_context::is_table_line(line_content);
2735
2736                    let is_structural_separator = line_info.heading.is_some()
2737                        || line_content.starts_with("```")
2738                        || line_content.starts_with("~~~")
2739                        || line_content.starts_with("---")
2740                        || line_content.starts_with("***")
2741                        || line_content.starts_with("___")
2742                        || line_content.starts_with(">")
2743                        || looks_like_table;
2744
2745                    // Allow lazy continuation if we're still within the same list block
2746                    // (not just immediately after a list item)
2747                    let is_lazy_continuation = !is_structural_separator
2748                        && !line_info.is_blank
2749                        && (line_info.indent == 0 || line_info.indent >= min_required_indent);
2750
2751                    if is_lazy_continuation {
2752                        // Additional check: if the line starts with uppercase and looks like a new sentence,
2753                        // it's probably not a continuation
2754                        let content_to_check = if !blockquote_prefix.is_empty() {
2755                            // Strip blockquote prefix to check the actual content
2756                            line_info
2757                                .content(content)
2758                                .strip_prefix(&blockquote_prefix)
2759                                .unwrap_or(line_info.content(content))
2760                                .trim()
2761                        } else {
2762                            line_info.content(content).trim()
2763                        };
2764
2765                        let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
2766
2767                        // If it starts with uppercase and the previous line ended with punctuation,
2768                        // it's likely a new paragraph, not a continuation
2769                        if starts_with_uppercase && last_list_item_line > 0 {
2770                            // This looks like a new paragraph
2771                            list_blocks.push(block.clone());
2772                            current_block = None;
2773                        } else {
2774                            // This is a lazy continuation line
2775                            block.end_line = line_num;
2776                        }
2777                    } else {
2778                        // Non-indented, non-blank line that's not a lazy continuation - end the block
2779                        list_blocks.push(block.clone());
2780                        current_block = None;
2781                    }
2782                }
2783            }
2784        }
2785
2786        // Don't forget the last block
2787        if let Some(block) = current_block {
2788            list_blocks.push(block);
2789        }
2790
2791        // Merge adjacent blocks that should be one
2792        merge_adjacent_list_blocks(content, &mut list_blocks, lines);
2793
2794        list_blocks
2795    }
2796
2797    /// Compute character frequency for fast content analysis
2798    fn compute_char_frequency(content: &str) -> CharFrequency {
2799        let mut frequency = CharFrequency::default();
2800
2801        for ch in content.chars() {
2802            match ch {
2803                '#' => frequency.hash_count += 1,
2804                '*' => frequency.asterisk_count += 1,
2805                '_' => frequency.underscore_count += 1,
2806                '-' => frequency.hyphen_count += 1,
2807                '+' => frequency.plus_count += 1,
2808                '>' => frequency.gt_count += 1,
2809                '|' => frequency.pipe_count += 1,
2810                '[' => frequency.bracket_count += 1,
2811                '`' => frequency.backtick_count += 1,
2812                '<' => frequency.lt_count += 1,
2813                '!' => frequency.exclamation_count += 1,
2814                '\n' => frequency.newline_count += 1,
2815                _ => {}
2816            }
2817        }
2818
2819        frequency
2820    }
2821
2822    /// Parse HTML tags in the content
2823    fn parse_html_tags(
2824        content: &str,
2825        lines: &[LineInfo],
2826        code_blocks: &[(usize, usize)],
2827        flavor: MarkdownFlavor,
2828    ) -> Vec<HtmlTag> {
2829        static HTML_TAG_REGEX: LazyLock<regex::Regex> =
2830            LazyLock::new(|| regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap());
2831
2832        let mut html_tags = Vec::with_capacity(content.matches('<').count());
2833
2834        for cap in HTML_TAG_REGEX.captures_iter(content) {
2835            let full_match = cap.get(0).unwrap();
2836            let match_start = full_match.start();
2837            let match_end = full_match.end();
2838
2839            // Skip if in code block
2840            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2841                continue;
2842            }
2843
2844            let is_closing = !cap.get(1).unwrap().as_str().is_empty();
2845            let tag_name_original = cap.get(2).unwrap().as_str();
2846            let tag_name = tag_name_original.to_lowercase();
2847            let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
2848
2849            // Skip JSX components in MDX files (tags starting with uppercase letter)
2850            // JSX components like <Chart />, <MyComponent> should not be treated as HTML
2851            if flavor.supports_jsx() && tag_name_original.chars().next().is_some_and(|c| c.is_uppercase()) {
2852                continue;
2853            }
2854
2855            // Find which line this tag is on
2856            let mut line_num = 1;
2857            let mut col_start = match_start;
2858            let mut col_end = match_end;
2859            for (idx, line_info) in lines.iter().enumerate() {
2860                if match_start >= line_info.byte_offset {
2861                    line_num = idx + 1;
2862                    col_start = match_start - line_info.byte_offset;
2863                    col_end = match_end - line_info.byte_offset;
2864                } else {
2865                    break;
2866                }
2867            }
2868
2869            html_tags.push(HtmlTag {
2870                line: line_num,
2871                start_col: col_start,
2872                end_col: col_end,
2873                byte_offset: match_start,
2874                byte_end: match_end,
2875                tag_name,
2876                is_closing,
2877                is_self_closing,
2878                raw_content: full_match.as_str().to_string(),
2879            });
2880        }
2881
2882        html_tags
2883    }
2884
2885    /// Parse emphasis spans in the content
2886    fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
2887        static EMPHASIS_REGEX: LazyLock<regex::Regex> =
2888            LazyLock::new(|| regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap());
2889
2890        let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
2891
2892        for cap in EMPHASIS_REGEX.captures_iter(content) {
2893            let full_match = cap.get(0).unwrap();
2894            let match_start = full_match.start();
2895            let match_end = full_match.end();
2896
2897            // Skip if in code block
2898            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2899                continue;
2900            }
2901
2902            let opening_markers = cap.get(1).unwrap().as_str();
2903            let content_part = cap.get(2).unwrap().as_str();
2904            let closing_markers = cap.get(3).unwrap().as_str();
2905
2906            // Validate matching markers
2907            if opening_markers.chars().next() != closing_markers.chars().next()
2908                || opening_markers.len() != closing_markers.len()
2909            {
2910                continue;
2911            }
2912
2913            let marker = opening_markers.chars().next().unwrap();
2914            let marker_count = opening_markers.len();
2915
2916            // Find which line this emphasis is on
2917            let mut line_num = 1;
2918            let mut col_start = match_start;
2919            let mut col_end = match_end;
2920            for (idx, line_info) in lines.iter().enumerate() {
2921                if match_start >= line_info.byte_offset {
2922                    line_num = idx + 1;
2923                    col_start = match_start - line_info.byte_offset;
2924                    col_end = match_end - line_info.byte_offset;
2925                } else {
2926                    break;
2927                }
2928            }
2929
2930            emphasis_spans.push(EmphasisSpan {
2931                line: line_num,
2932                start_col: col_start,
2933                end_col: col_end,
2934                byte_offset: match_start,
2935                byte_end: match_end,
2936                marker,
2937                marker_count,
2938                content: content_part.to_string(),
2939            });
2940        }
2941
2942        emphasis_spans
2943    }
2944
2945    /// Parse table rows in the content
2946    fn parse_table_rows(content: &str, lines: &[LineInfo]) -> Vec<TableRow> {
2947        let mut table_rows = Vec::with_capacity(lines.len() / 20);
2948
2949        for (line_idx, line_info) in lines.iter().enumerate() {
2950            // Skip lines in code blocks or blank lines
2951            if line_info.in_code_block || line_info.is_blank {
2952                continue;
2953            }
2954
2955            let line = line_info.content(content);
2956            let line_num = line_idx + 1;
2957
2958            // Check if this line contains pipes (potential table row)
2959            if !line.contains('|') {
2960                continue;
2961            }
2962
2963            // Count columns by splitting on pipes
2964            let parts: Vec<&str> = line.split('|').collect();
2965            let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
2966
2967            // Check if this is a separator row
2968            let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
2969            let mut column_alignments = Vec::new();
2970
2971            if is_separator {
2972                for part in &parts[1..parts.len() - 1] {
2973                    // Skip first and last empty parts
2974                    let trimmed = part.trim();
2975                    let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
2976                        "center".to_string()
2977                    } else if trimmed.ends_with(':') {
2978                        "right".to_string()
2979                    } else if trimmed.starts_with(':') {
2980                        "left".to_string()
2981                    } else {
2982                        "none".to_string()
2983                    };
2984                    column_alignments.push(alignment);
2985                }
2986            }
2987
2988            table_rows.push(TableRow {
2989                line: line_num,
2990                is_separator,
2991                column_count,
2992                column_alignments,
2993            });
2994        }
2995
2996        table_rows
2997    }
2998
2999    /// Parse bare URLs and emails in the content
3000    fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
3001        let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
3002
3003        // Check for bare URLs (not in angle brackets or markdown links)
3004        for cap in BARE_URL_PATTERN.captures_iter(content) {
3005            let full_match = cap.get(0).unwrap();
3006            let match_start = full_match.start();
3007            let match_end = full_match.end();
3008
3009            // Skip if in code block
3010            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3011                continue;
3012            }
3013
3014            // Skip if already in angle brackets or markdown links
3015            let preceding_char = if match_start > 0 {
3016                content.chars().nth(match_start - 1)
3017            } else {
3018                None
3019            };
3020            let following_char = content.chars().nth(match_end);
3021
3022            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
3023                continue;
3024            }
3025            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
3026                continue;
3027            }
3028
3029            let url = full_match.as_str();
3030            let url_type = if url.starts_with("https://") {
3031                "https"
3032            } else if url.starts_with("http://") {
3033                "http"
3034            } else if url.starts_with("ftp://") {
3035                "ftp"
3036            } else {
3037                "other"
3038            };
3039
3040            // Find which line this URL is on
3041            let mut line_num = 1;
3042            let mut col_start = match_start;
3043            let mut col_end = match_end;
3044            for (idx, line_info) in lines.iter().enumerate() {
3045                if match_start >= line_info.byte_offset {
3046                    line_num = idx + 1;
3047                    col_start = match_start - line_info.byte_offset;
3048                    col_end = match_end - line_info.byte_offset;
3049                } else {
3050                    break;
3051                }
3052            }
3053
3054            bare_urls.push(BareUrl {
3055                line: line_num,
3056                start_col: col_start,
3057                end_col: col_end,
3058                byte_offset: match_start,
3059                byte_end: match_end,
3060                url: url.to_string(),
3061                url_type: url_type.to_string(),
3062            });
3063        }
3064
3065        // Check for bare email addresses
3066        for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
3067            let full_match = cap.get(0).unwrap();
3068            let match_start = full_match.start();
3069            let match_end = full_match.end();
3070
3071            // Skip if in code block
3072            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3073                continue;
3074            }
3075
3076            // Skip if already in angle brackets or markdown links
3077            let preceding_char = if match_start > 0 {
3078                content.chars().nth(match_start - 1)
3079            } else {
3080                None
3081            };
3082            let following_char = content.chars().nth(match_end);
3083
3084            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
3085                continue;
3086            }
3087            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
3088                continue;
3089            }
3090
3091            let email = full_match.as_str();
3092
3093            // Find which line this email is on
3094            let mut line_num = 1;
3095            let mut col_start = match_start;
3096            let mut col_end = match_end;
3097            for (idx, line_info) in lines.iter().enumerate() {
3098                if match_start >= line_info.byte_offset {
3099                    line_num = idx + 1;
3100                    col_start = match_start - line_info.byte_offset;
3101                    col_end = match_end - line_info.byte_offset;
3102                } else {
3103                    break;
3104                }
3105            }
3106
3107            bare_urls.push(BareUrl {
3108                line: line_num,
3109                start_col: col_start,
3110                end_col: col_end,
3111                byte_offset: match_start,
3112                byte_end: match_end,
3113                url: email.to_string(),
3114                url_type: "email".to_string(),
3115            });
3116        }
3117
3118        bare_urls
3119    }
3120}
3121
3122/// Merge adjacent list blocks that should be treated as one
3123fn merge_adjacent_list_blocks(content: &str, list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
3124    if list_blocks.len() < 2 {
3125        return;
3126    }
3127
3128    let mut merger = ListBlockMerger::new(content, lines);
3129    *list_blocks = merger.merge(list_blocks);
3130}
3131
/// Helper struct to manage the complex logic of merging list blocks
///
/// Holds borrowed views of the document so that the merge predicates can inspect
/// the lines lying between two candidate blocks.
struct ListBlockMerger<'a> {
    // Full document text; per-line content is resolved against it
    content: &'a str,
    // Per-line metadata, indexed by (1-based line number - 1)
    lines: &'a [LineInfo],
}
3137
3138impl<'a> ListBlockMerger<'a> {
3139    fn new(content: &'a str, lines: &'a [LineInfo]) -> Self {
3140        Self { content, lines }
3141    }
3142
3143    fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
3144        let mut merged = Vec::with_capacity(list_blocks.len());
3145        let mut current = list_blocks[0].clone();
3146
3147        for next in list_blocks.iter().skip(1) {
3148            if self.should_merge_blocks(&current, next) {
3149                current = self.merge_two_blocks(current, next);
3150            } else {
3151                merged.push(current);
3152                current = next.clone();
3153            }
3154        }
3155
3156        merged.push(current);
3157        merged
3158    }
3159
3160    /// Determine if two adjacent list blocks should be merged
3161    fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
3162        // Basic compatibility checks
3163        if !self.blocks_are_compatible(current, next) {
3164            return false;
3165        }
3166
3167        // Check spacing and content between blocks
3168        let spacing = self.analyze_spacing_between(current, next);
3169        match spacing {
3170            BlockSpacing::Consecutive => true,
3171            BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
3172            BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
3173                self.can_merge_with_content_between(current, next)
3174            }
3175        }
3176    }
3177
3178    /// Check if blocks have compatible structure for merging
3179    fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
3180        current.is_ordered == next.is_ordered
3181            && current.blockquote_prefix == next.blockquote_prefix
3182            && current.nesting_level == next.nesting_level
3183    }
3184
3185    /// Analyze the spacing between two list blocks
3186    fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
3187        let gap = next.start_line - current.end_line;
3188
3189        match gap {
3190            1 => BlockSpacing::Consecutive,
3191            2 => BlockSpacing::SingleBlank,
3192            _ if gap > 2 => {
3193                if self.has_only_blank_lines_between(current, next) {
3194                    BlockSpacing::MultipleBlanks
3195                } else {
3196                    BlockSpacing::ContentBetween
3197                }
3198            }
3199            _ => BlockSpacing::Consecutive, // gap == 0, overlapping (shouldn't happen)
3200        }
3201    }
3202
3203    /// Check if unordered lists can be merged with a single blank line between
3204    fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3205        // Check if there are structural separators between the blocks
3206        // If has_meaningful_content_between returns true, it means there are structural separators
3207        if has_meaningful_content_between(self.content, current, next, self.lines) {
3208            return false; // Structural separators prevent merging
3209        }
3210
3211        // Only merge unordered lists with same marker across single blank
3212        !current.is_ordered && current.marker == next.marker
3213    }
3214
3215    /// Check if ordered lists can be merged when there's content between them
3216    fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3217        // Do not merge lists if there are structural separators between them
3218        if has_meaningful_content_between(self.content, current, next, self.lines) {
3219            return false; // Structural separators prevent merging
3220        }
3221
3222        // Only consider merging ordered lists if there's no structural content between
3223        current.is_ordered && next.is_ordered
3224    }
3225
3226    /// Check if there are only blank lines between blocks
3227    fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3228        for line_num in (current.end_line + 1)..next.start_line {
3229            if let Some(line_info) = self.lines.get(line_num - 1)
3230                && !line_info.content(self.content).trim().is_empty()
3231            {
3232                return false;
3233            }
3234        }
3235        true
3236    }
3237
3238    /// Merge two compatible list blocks into one
3239    fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
3240        current.end_line = next.end_line;
3241        current.item_lines.extend_from_slice(&next.item_lines);
3242
3243        // Update max marker width
3244        current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
3245
3246        // Handle marker consistency for unordered lists
3247        if !current.is_ordered && self.markers_differ(&current, next) {
3248            current.marker = None; // Mixed markers
3249        }
3250
3251        current
3252    }
3253
3254    /// Check if two blocks have different markers
3255    fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
3256        current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
3257    }
3258}
3259
/// Types of spacing between list blocks
///
/// Derived from the distance between the end line of one block and the start line
/// of the following block, plus a scan of the lines in between when that distance
/// exceeds a single blank line.
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    Consecutive,    // No gap between blocks (the next block starts on the following line)
    SingleBlank,    // One blank line between blocks
    MultipleBlanks, // Multiple blank lines but no content
    ContentBetween, // Content exists between blocks
}
3268
3269/// Check if there's meaningful content (not just blank lines) between two list blocks
3270fn has_meaningful_content_between(content: &str, current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
3271    // Check lines between current.end_line and next.start_line
3272    for line_num in (current.end_line + 1)..next.start_line {
3273        if let Some(line_info) = lines.get(line_num - 1) {
3274            // Convert to 0-indexed
3275            let trimmed = line_info.content(content).trim();
3276
3277            // Skip empty lines
3278            if trimmed.is_empty() {
3279                continue;
3280            }
3281
3282            // Check for structural separators that should separate lists (CommonMark compliant)
3283
3284            // Headings separate lists
3285            if line_info.heading.is_some() {
3286                return true; // Has meaningful content - headings separate lists
3287            }
3288
3289            // Horizontal rules separate lists (---, ***, ___)
3290            if is_horizontal_rule(trimmed) {
3291                return true; // Has meaningful content - horizontal rules separate lists
3292            }
3293
3294            // Tables separate lists
3295            if crate::utils::skip_context::is_table_line(trimmed) {
3296                return true; // Has meaningful content - tables separate lists
3297            }
3298
3299            // Blockquotes separate lists
3300            if trimmed.starts_with('>') {
3301                return true; // Has meaningful content - blockquotes separate lists
3302            }
3303
3304            // Code block fences separate lists (unless properly indented as list content)
3305            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
3306                let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
3307
3308                // Check if this code block is properly indented as list continuation
3309                let min_continuation_indent = if current.is_ordered {
3310                    current.nesting_level + current.max_marker_width + 1 // +1 for space after marker
3311                } else {
3312                    current.nesting_level + 2
3313                };
3314
3315                if line_indent < min_continuation_indent {
3316                    // This is a standalone code block that separates lists
3317                    return true; // Has meaningful content - standalone code blocks separate lists
3318                }
3319            }
3320
3321            // Check if this line has proper indentation for list continuation
3322            let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
3323
3324            // Calculate minimum indentation needed to be list continuation
3325            let min_indent = if current.is_ordered {
3326                current.nesting_level + current.max_marker_width
3327            } else {
3328                current.nesting_level + 2
3329            };
3330
3331            // If the line is not indented enough to be list continuation, it's meaningful content
3332            if line_indent < min_indent {
3333                return true; // Has meaningful content - content not indented as list continuation
3334            }
3335
3336            // If we reach here, the line is properly indented as list continuation
3337            // Continue checking other lines
3338        }
3339    }
3340
3341    // Only blank lines or properly indented list continuation content between blocks
3342    false
3343}
3344
/// Check if a line is a horizontal rule (---, ***, ___)
///
/// `trimmed` is expected to have its surrounding whitespace removed already. The
/// line is a rule when it starts with `-`, `*` or `_`, contains at least three of
/// that same character, and holds nothing else apart from spaces and tabs.
fn is_horizontal_rule(trimmed: &str) -> bool {
    // The first character decides which marker the whole line must consist of
    let marker = match trimmed.chars().next() {
        Some(c @ ('-' | '*' | '_')) => c,
        _ => return false,
    };

    // Walk the characters directly; no intermediate buffer is needed
    let mut marker_count = 0usize;
    for ch in trimmed.chars() {
        if ch == marker {
            marker_count += 1;
        } else if ch != ' ' && ch != '\t' {
            return false; // Non-matching, non-whitespace character
        }
    }

    marker_count >= 3
}
3368
/// Unit tests for `LintContext` construction: line offsets, offset-to-position
/// mapping, per-line metadata, list item detection and MDX ESM block marking
#[cfg(test)]
mod tests {
    use super::*;

    // Empty input: a single line offset at 0 but no `LineInfo` entries
    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::Standard, None);
        assert_eq!(ctx.content, "");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.lines.len(), 0);
    }

    // One line without a trailing newline; positions are reported 1-based
    #[test]
    fn test_single_line() {
        let ctx = LintContext::new("# Hello", MarkdownFlavor::Standard, None);
        assert_eq!(ctx.content, "# Hello");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(3), (1, 4));
    }

    // Several lines including a blank one; each offset is the byte index of a line start
    #[test]
    fn test_multi_line() {
        let content = "# Title\n\nSecond line\nThird line";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);
        // Test offset to line/col
        assert_eq!(ctx.offset_to_line_col(0), (1, 1)); // start
        assert_eq!(ctx.offset_to_line_col(8), (2, 1)); // start of blank line
        assert_eq!(ctx.offset_to_line_col(9), (3, 1)); // start of 'Second line'
        assert_eq!(ctx.offset_to_line_col(15), (3, 7)); // middle of 'Second line'
        assert_eq!(ctx.offset_to_line_col(21), (4, 1)); // start of 'Third line'
    }

    // Per-line metadata: content, byte offset, indentation and blank detection
    #[test]
    fn test_line_info() {
        let content = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);

        // Test line info
        assert_eq!(ctx.lines.len(), 7);

        // Line 1: "# Title"
        let line1 = &ctx.lines[0];
        assert_eq!(line1.content(ctx.content), "# Title");
        assert_eq!(line1.byte_offset, 0);
        assert_eq!(line1.indent, 0);
        assert!(!line1.is_blank);
        assert!(!line1.in_code_block);
        assert!(line1.list_item.is_none());

        // Line 2: "    indented"
        let line2 = &ctx.lines[1];
        assert_eq!(line2.content(ctx.content), "    indented");
        assert_eq!(line2.byte_offset, 8);
        assert_eq!(line2.indent, 4);
        assert!(!line2.is_blank);

        // Line 3: "" (blank)
        let line3 = &ctx.lines[2];
        assert_eq!(line3.content(ctx.content), "");
        assert!(line3.is_blank);

        // Test helper methods (both take 1-based line numbers)
        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
        assert_eq!(ctx.line_info(1).map(|l| l.indent), Some(0));
        assert_eq!(ctx.line_info(2).map(|l| l.indent), Some(4));
    }

    // Unordered, nested and ordered markers are recognised; plain text is not a list item
    #[test]
    fn test_list_item_detection() {
        let content = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);

        // Line 1: "- Unordered item"
        let line1 = &ctx.lines[0];
        assert!(line1.list_item.is_some());
        let list1 = line1.list_item.as_ref().unwrap();
        assert_eq!(list1.marker, "-");
        assert!(!list1.is_ordered);
        assert_eq!(list1.marker_column, 0);
        assert_eq!(list1.content_column, 2);

        // Line 2: "  * Nested item"
        let line2 = &ctx.lines[1];
        assert!(line2.list_item.is_some());
        let list2 = line2.list_item.as_ref().unwrap();
        assert_eq!(list2.marker, "*");
        assert_eq!(list2.marker_column, 2);

        // Line 3: "1. Ordered item"
        let line3 = &ctx.lines[2];
        assert!(line3.list_item.is_some());
        let list3 = line3.list_item.as_ref().unwrap();
        assert_eq!(list3.marker, "1.");
        assert!(list3.is_ordered);
        assert_eq!(list3.number, Some(1));

        // Line 6: "Not a list"
        let line6 = &ctx.lines[5];
        assert!(line6.list_item.is_none());
    }

    // Offsets at line starts, just past the last character of a line, and at end of input
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let content = "a\nb\nc";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
        // line_offsets: [0, 2, 4]
        assert_eq!(ctx.offset_to_line_col(0), (1, 1)); // 'a'
        assert_eq!(ctx.offset_to_line_col(1), (1, 2)); // after 'a'
        assert_eq!(ctx.offset_to_line_col(2), (2, 1)); // 'b'
        assert_eq!(ctx.offset_to_line_col(3), (2, 2)); // after 'b'
        assert_eq!(ctx.offset_to_line_col(4), (3, 1)); // 'c'
        assert_eq!(ctx.offset_to_line_col(5), (3, 2)); // after 'c'
    }

    // In the MDX flavor, leading import/export lines are flagged as ESM blocks
    #[test]
    fn test_mdx_esm_blocks() {
        let content = r##"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall

In {year}, the snowfall was above average.
It was followed by a warm spring which caused
flood conditions in many of the nearby rivers.

<Chart color="#fcb32c" year={year} />
"##;

        let ctx = LintContext::new(content, MarkdownFlavor::MDX, None);

        // Check that lines 1 and 2 are marked as ESM blocks
        assert_eq!(ctx.lines.len(), 10);
        assert!(ctx.lines[0].in_esm_block, "Line 1 (import) should be in_esm_block");
        assert!(ctx.lines[1].in_esm_block, "Line 2 (export) should be in_esm_block");
        assert!(!ctx.lines[2].in_esm_block, "Line 3 (blank) should NOT be in_esm_block");
        assert!(
            !ctx.lines[3].in_esm_block,
            "Line 4 (heading) should NOT be in_esm_block"
        );
        assert!(!ctx.lines[4].in_esm_block, "Line 5 (blank) should NOT be in_esm_block");
        assert!(!ctx.lines[5].in_esm_block, "Line 6 (text) should NOT be in_esm_block");
    }

    // The same import/export lines are ordinary text in the Standard flavor
    #[test]
    fn test_mdx_esm_blocks_not_detected_in_standard_flavor() {
        let content = r#"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall
"#;

        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);

        // ESM blocks should NOT be detected in Standard flavor
        assert!(
            !ctx.lines[0].in_esm_block,
            "Line 1 should NOT be in_esm_block in Standard flavor"
        );
        assert!(
            !ctx.lines[1].in_esm_block,
            "Line 2 should NOT be in_esm_block in Standard flavor"
        );
    }
}
rumdl_lib/lint_context.rs

rumdl_lib/
lint_context.rs