rumdl_lib/
lint_context.rs

1use crate::config::MarkdownFlavor;
2use crate::rules::front_matter_utils::FrontMatterUtils;
3use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
4use pulldown_cmark::{BrokenLink, Event, LinkType, Options, Parser, Tag, TagEnd};
5use regex::Regex;
6use std::borrow::Cow;
7use std::path::PathBuf;
8use std::sync::LazyLock;
9
/// Macro for profiling sections - only active in non-WASM builds.
///
/// Evaluates `$code` exactly once and yields its value. When `$profile` is
/// `true`, the elapsed wall-clock time of `$code` is printed to stderr as
/// `[PROFILE] <name>: <duration>`. The clock is only read when profiling is
/// enabled, so the disabled path adds nothing beyond evaluating the flag.
#[cfg(not(target_arch = "wasm32"))]
macro_rules! profile_section {
    ($name:expr, $profile:expr, $code:expr) => {{
        // Evaluate the flag once and skip `Instant::now()` entirely when profiling is off.
        let profiling: bool = $profile;
        let start = profiling.then(std::time::Instant::now);
        let result = $code;
        if let Some(start) = start {
            eprintln!("[PROFILE] {}: {:?}", $name, start.elapsed());
        }
        result
    }};
}
22
// WASM variant of `profile_section!`: no timing and no output. The macro simply
// expands to `$code`; the `$name` and `$profile` arguments are accepted so call
// sites stay identical across targets, but they are discarded by the expansion.
#[cfg(target_arch = "wasm32")]
macro_rules! profile_section {
    ($name:expr, $profile:expr, $code:expr) => {{ $code }};
}
27
// Comprehensive link pattern that captures both inline and reference links.
// Flags: (?s) makes `.` match newlines, (?x) enables verbose mode so the
// whitespace and `# ...` comments inside the pattern are ignored.
//
// Capture groups:
//   1 - link text (no unescaped `[` or `]`; backslash escapes are allowed)
//   2 - inline URL written in angle brackets `<...>`
//   3 - inline URL written bare (stops at the first `)`, `"` or `'`)
//   4 - title in double quotes
//   5 - title in single quotes
//   6 - reference ID of a `[text][ref]` link
// The pattern itself does not inspect a preceding `!`.
static LINK_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
        \[((?:[^\[\]\\]|\\.)*)\]          # Link text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
        (?:
            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
            |
            \[([^\]]*)\]      # Reference ID in group 6
        )"#
    ).unwrap()
});
41
// Image pattern (same shape as LINK_PATTERN but requires a leading `!`).
// Flags: (?s) makes `.` match newlines, (?x) enables verbose mode so the
// whitespace and `# ...` comments inside the pattern are ignored.
//
// Capture groups:
//   1 - alt text (no unescaped `[` or `]`; backslash escapes are allowed)
//   2 - inline URL written in angle brackets `<...>`
//   3 - inline URL written bare (stops at the first `)`, `"` or `'`)
//   4 - title in double quotes
//   5 - title in single quotes
//   6 - reference ID of a `![alt][ref]` image
static IMAGE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
        !\[((?:[^\[\]\\]|\\.)*)\]         # Alt text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
        (?:
            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
            |
            \[([^\]]*)\]      # Reference ID in group 6
        )"#
    ).unwrap()
});
55
// Reference definition pattern: `[id]: url "title"` on a line of its own.
// (?m) anchors `^`/`$` at line boundaries; up to three leading spaces are allowed.
// Groups: 1 = reference ID, 2 = URL (run of non-whitespace),
//         3 = double-quoted title, 4 = single-quoted title.
static REF_DEF_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#).unwrap());
59
// Pattern for bare URLs.
// Matches an `http`, `https` or `ftp` scheme (captured in group 1) followed by
// `://` and a host, with optional port, path, query string and fragment.
// URL characters exclude whitespace and `< > [ ] ( ) \ ' " `` ` ``.
static BARE_URL_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
    ).unwrap()
});
66
// Pattern for email addresses: `local@domain.tld`, where the final label after
// the last dot is at least two ASCII letters. No capture groups.
static BARE_EMAIL_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap());
70
// Pattern for blockquote prefix in parse_list_blocks.
// Group 1 captures the whole prefix at the start of the text: optional leading
// whitespace, one or more `>` characters, and any whitespace that follows them.
static BLOCKQUOTE_PREFIX_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*>+\s*)").unwrap());
73
/// Pre-computed information about a single line of the document.
///
/// `byte_offset` and `byte_len` are byte positions into the source text;
/// [`LineInfo::content`] uses them to slice the line back out of it.
#[derive(Debug, Clone)]
pub struct LineInfo {
    /// Byte offset where this line starts in the document
    pub byte_offset: usize,
    /// Length of the line in bytes (without newline)
    pub byte_len: usize,
    /// Number of leading spaces/tabs
    pub indent: usize,
    /// Whether the line is blank (empty or only whitespace)
    pub is_blank: bool,
    /// Whether this line is inside a code block
    pub in_code_block: bool,
    /// Whether this line is inside front matter
    pub in_front_matter: bool,
    /// Whether this line is inside an HTML block
    pub in_html_block: bool,
    /// Whether this line is inside an HTML comment
    pub in_html_comment: bool,
    /// List item information if this line starts a list item
    pub list_item: Option<ListItemInfo>,
    /// Heading information if this line is a heading
    pub heading: Option<HeadingInfo>,
    /// Blockquote information if this line is a blockquote
    pub blockquote: Option<BlockquoteInfo>,
    /// Whether this line is inside a mkdocstrings autodoc block
    pub in_mkdocstrings: bool,
    /// Whether this line is part of an ESM import/export block (MDX only)
    pub in_esm_block: bool,
    /// Whether this line is a continuation of a multi-line code span from a previous line.
    /// `LintContext::new` sets this on every line after the first line of such a span.
    pub in_code_span_continuation: bool,
}
106
107impl LineInfo {
108    /// Get the line content as a string slice from the source document
109    pub fn content<'a>(&self, source: &'a str) -> &'a str {
110        &source[self.byte_offset..self.byte_offset + self.byte_len]
111    }
112}
113
/// Information about a list item, attached to the line that starts it
/// (see `LineInfo::list_item`).
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// The marker used (*, -, +, or number with . or ))
    pub marker: String,
    /// Whether it's ordered (true) or unordered (false)
    pub is_ordered: bool,
    /// The number for ordered lists
    pub number: Option<usize>,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where content after marker starts
    pub content_column: usize,
}
128
/// Heading style type.
///
/// Equality on this field-less enum is total, so `Eq` and `Hash` are derived
/// alongside `PartialEq`; this lets the style be used as a map/set key and in
/// contexts that require full equivalence.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum HeadingStyle {
    /// ATX style heading (# Heading)
    ATX,
    /// Setext style heading with = underline
    Setext1,
    /// Setext style heading with - underline
    Setext2,
}
139
/// Parsed link information.
///
/// Text, URL and reference ID are `Cow`s tied to lifetime `'a`, so they may
/// borrow from the source text instead of owning a copy.
#[derive(Debug, Clone)]
pub struct ParsedLink<'a> {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Link text
    pub text: Cow<'a, str>,
    /// Link URL or reference
    pub url: Cow<'a, str>,
    /// Whether this is a reference link [text][ref] vs inline [text](url)
    pub is_reference: bool,
    /// Reference ID for reference links
    pub reference_id: Option<Cow<'a, str>>,
    /// Link type from pulldown-cmark
    pub link_type: LinkType,
}
164
/// Information about a broken link reported by pulldown-cmark,
/// i.e. a reference that could not be resolved.
#[derive(Debug, Clone)]
pub struct BrokenLinkInfo {
    /// The reference text that couldn't be resolved
    pub reference: String,
    /// Byte span in the source document
    pub span: std::ops::Range<usize>,
}
173
/// Parsed footnote reference (e.g., `[^1]`, `[^note]`), located by line and
/// byte range in the document.
#[derive(Debug, Clone)]
pub struct FootnoteRef {
    /// The footnote ID (without the ^ prefix)
    pub id: String,
    /// Line number (1-indexed)
    pub line: usize,
    /// Start byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
}
186
/// Parsed image information.
///
/// Mirrors [`ParsedLink`] field for field, with `alt_text` in place of the link
/// text. Alt text, URL and reference ID are `Cow`s tied to lifetime `'a`, so
/// they may borrow from the source text instead of owning a copy.
#[derive(Debug, Clone)]
pub struct ParsedImage<'a> {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Alt text
    pub alt_text: Cow<'a, str>,
    /// Image URL or reference
    pub url: Cow<'a, str>,
    /// Whether this is a reference image ![alt][ref] vs inline ![alt](url)
    pub is_reference: bool,
    /// Reference ID for reference images
    pub reference_id: Option<Cow<'a, str>>,
    /// Link type from pulldown-cmark
    pub link_type: LinkType,
}
211
/// Reference definition [ref]: url "title".
///
/// `LintContext::get_reference_url` looks definitions up by comparing `id`
/// against the lowercased query, and `LintContext::is_in_reference_def` tests
/// byte positions against `byte_offset..byte_end` (end exclusive).
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// Line number (1-indexed)
    pub line: usize,
    /// Reference ID (normalized to lowercase)
    pub id: String,
    /// URL
    pub url: String,
    /// Optional title
    pub title: Option<String>,
    /// Byte offset where the reference definition starts
    pub byte_offset: usize,
    /// Byte offset where the reference definition ends
    pub byte_end: usize,
}
228
/// Parsed code span information.
///
/// A span may cover several lines (`line..=end_line`). Lookups in `LintContext`
/// treat `byte_offset..byte_end` as a half-open range and, on a single line,
/// treat `start_col..end_col` as half-open as well.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// Line number where the code span starts (1-indexed)
    pub line: usize,
    /// Line number where the code span ends (1-indexed)
    pub end_line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Number of backticks used (1, 2, 3, etc.)
    pub backtick_count: usize,
    /// Content inside the code span (without backticks)
    pub content: String,
}
249
/// Information about a heading, attached to the heading's line
/// (see `LineInfo::heading`).
#[derive(Debug, Clone)]
pub struct HeadingInfo {
    /// Heading level (1-6 for ATX, 1-2 for Setext)
    pub level: u8,
    /// Style of heading
    pub style: HeadingStyle,
    /// The heading marker (# characters or underline)
    pub marker: String,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where heading text starts
    pub content_column: usize,
    /// The heading text (without markers and without custom ID syntax)
    pub text: String,
    /// Custom header ID if present (e.g., from {#custom-id} syntax)
    pub custom_id: Option<String>,
    /// Original heading text including custom ID syntax
    pub raw_text: String,
    /// Whether it has a closing sequence (for ATX)
    pub has_closing_sequence: bool,
    /// The closing sequence if present
    pub closing_sequence: String,
}
274
/// Information about a blockquote line, attached to that line
/// (see `LineInfo::blockquote`).
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Nesting level (1 for >, 2 for >>, etc.)
    pub nesting_level: usize,
    /// The indentation before the blockquote marker
    pub indent: String,
    /// Column where the first > starts (0-based)
    pub marker_column: usize,
    /// The blockquote prefix (e.g., "> ", ">> ", etc.)
    pub prefix: String,
    /// Content after the blockquote marker(s)
    pub content: String,
    /// Whether the line has no space after the marker
    pub has_no_space_after_marker: bool,
    /// Whether the line has multiple spaces after the marker
    pub has_multiple_spaces_after_marker: bool,
    /// Whether this is an empty blockquote line needing MD028 fix
    pub needs_md028_fix: bool,
}
295
/// Information about a list block.
///
/// `LintContext::is_in_list_block` and `LintContext::list_block_for_line`
/// treat `start_line..=end_line` as an inclusive range of 1-indexed lines.
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// Line number where the list starts (1-indexed)
    pub start_line: usize,
    /// Line number where the list ends (1-indexed)
    pub end_line: usize,
    /// Whether it's ordered or unordered
    pub is_ordered: bool,
    /// The consistent marker for unordered lists (if any)
    pub marker: Option<String>,
    /// Blockquote prefix for this list (empty if not in blockquote)
    pub blockquote_prefix: String,
    /// Lines that are list items within this block
    pub item_lines: Vec<usize>,
    /// Nesting level (0 for top-level lists)
    pub nesting_level: usize,
    /// Maximum marker width seen in this block (e.g., 3 for "1. ", 4 for "10. ")
    pub max_marker_width: usize,
}
316
317use std::sync::{Arc, Mutex};
318
/// Character frequency data for fast content analysis.
///
/// Each field counts occurrences of one character; the parenthesised notes name
/// the Markdown constructs that character is associated with.
/// `Default` yields all-zero counts.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Count of # characters (headings)
    pub hash_count: usize,
    /// Count of * characters (emphasis, lists, horizontal rules)
    pub asterisk_count: usize,
    /// Count of _ characters (emphasis, horizontal rules)
    pub underscore_count: usize,
    /// Count of - characters (lists, horizontal rules, setext headings)
    pub hyphen_count: usize,
    /// Count of + characters (lists)
    pub plus_count: usize,
    /// Count of > characters (blockquotes)
    pub gt_count: usize,
    /// Count of | characters (tables)
    pub pipe_count: usize,
    /// Count of [ characters (links, images)
    pub bracket_count: usize,
    /// Count of ` characters (code spans, code blocks)
    pub backtick_count: usize,
    /// Count of < characters (HTML tags, autolinks)
    pub lt_count: usize,
    /// Count of ! characters (images)
    pub exclamation_count: usize,
    /// Count of newline characters
    pub newline_count: usize,
}
347
/// Pre-parsed HTML tag information, as returned by `LintContext::html_tags`.
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Tag name (e.g., "div", "img", "br")
    pub tag_name: String,
    /// Whether it's a closing tag (`</tag>`)
    pub is_closing: bool,
    /// Whether it's self-closing (`<tag />`)
    pub is_self_closing: bool,
    /// Raw tag content
    pub raw_content: String,
}
370
/// Pre-parsed emphasis span information, as returned by
/// `LintContext::emphasis_spans`.
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Type of emphasis ('*' or '_')
    pub marker: char,
    /// Number of markers (1 for italic, 2 for bold, 3+ for bold+italic)
    pub marker_count: usize,
    /// Content inside the emphasis
    pub content: String,
}
391
/// Pre-parsed table row information, as returned by `LintContext::table_rows`.
#[derive(Debug, Clone)]
pub struct TableRow {
    /// Line number (1-indexed)
    pub line: usize,
    /// Whether this is a separator row (contains only |, -, :, and spaces)
    pub is_separator: bool,
    /// Number of columns (pipe-separated cells)
    pub column_count: usize,
    /// Alignment info from separator row, one string per column
    pub column_alignments: Vec<String>, // "left", "center", "right", "none"
}
404
/// Pre-parsed bare URL information (not in links), as returned by
/// `LintContext::bare_urls`.
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// The URL string
    pub url: String,
    /// Type of URL ("http", "https", "ftp", "email")
    pub url_type: String,
}
423
/// Shared, pre-parsed view of one Markdown document.
///
/// Built once by [`LintContext::new`], which borrows the source text for `'a`
/// and eagerly computes the public fields. The private `*_cache` fields hold
/// results behind a `Mutex<Option<Arc<..>>>` and are read through the accessor
/// methods (`code_spans`, `html_tags`, `emphasis_spans`, `table_rows`,
/// `bare_urls`), which fill an empty cache on first use and hand out cheap
/// `Arc` clones.
pub struct LintContext<'a> {
    pub content: &'a str,
    pub line_offsets: Vec<usize>,
    pub code_blocks: Vec<(usize, usize)>, // Cached code block ranges (not including inline code spans)
    pub lines: Vec<LineInfo>,             // Pre-computed line information
    pub links: Vec<ParsedLink<'a>>,       // Pre-parsed links
    pub images: Vec<ParsedImage<'a>>,     // Pre-parsed images
    pub broken_links: Vec<BrokenLinkInfo>, // Broken/undefined references
    pub footnote_refs: Vec<FootnoteRef>,  // Pre-parsed footnote references
    pub reference_defs: Vec<ReferenceDef>, // Reference definitions
    code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>, // Lazy-loaded inline code spans
    pub list_blocks: Vec<ListBlock>,      // Pre-parsed list blocks
    pub char_frequency: CharFrequency,    // Character frequency analysis
    html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>, // Lazy-loaded HTML tags
    emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>, // Lazy-loaded emphasis spans
    table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>, // Lazy-loaded table rows
    bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>, // Lazy-loaded bare URLs
    html_comment_ranges: Vec<crate::utils::skip_context::ByteRange>, // Pre-computed HTML comment ranges
    pub table_blocks: Vec<crate::utils::table_utils::TableBlock>, // Pre-computed table blocks
    pub line_index: crate::utils::range_utils::LineIndex<'a>, // Pre-computed line index for byte position calculations
    jinja_ranges: Vec<(usize, usize)>,    // Pre-computed Jinja template ranges ({{ }}, {% %})
    pub flavor: MarkdownFlavor,           // Markdown flavor being used
    pub source_file: Option<PathBuf>,     // Source file path (for rules that need file context)
}
448
/// The pieces of a blockquote line, each borrowed from the original line.
struct BlockquoteComponents<'a> {
    indent: &'a str,
    markers: &'a str,
    spaces_after: &'a str,
    content: &'a str,
}

/// Split a line into blockquote components by scanning its bytes by hand.
///
/// The line is read left to right as: a run of spaces/tabs (`indent`), a run of
/// one or more consecutive `>` characters (`markers`), a run of spaces/tabs
/// (`spaces_after`), and everything that remains (`content`).
/// Returns `None` when no `>` directly follows the leading whitespace.
#[inline]
fn parse_blockquote_detailed(line: &str) -> Option<BlockquoteComponents<'_>> {
    fn is_space_or_tab(byte: u8) -> bool {
        matches!(byte, b' ' | b'\t')
    }
    fn is_marker(byte: u8) -> bool {
        byte == b'>'
    }

    let raw = line.as_bytes();
    // Length of the run of bytes starting at `from` that satisfy `pred`.
    let run_len = |from: usize, pred: fn(u8) -> bool| -> usize {
        raw[from..].iter().take_while(|&&byte| pred(byte)).count()
    };

    let indent_end = run_len(0, is_space_or_tab);

    let marker_len = run_len(indent_end, is_marker);
    if marker_len == 0 {
        // Not a blockquote: at least one '>' is required after the indent.
        return None;
    }
    let markers_end = indent_end + marker_len;

    let spaces_end = markers_end + run_len(markers_end, is_space_or_tab);

    // Every split point follows an ASCII byte, so it is a valid char boundary.
    let (indent, rest) = line.split_at(indent_end);
    let (markers, rest) = rest.split_at(marker_len);
    let (spaces_after, content) = rest.split_at(spaces_end - markers_end);

    Some(BlockquoteComponents {
        indent,
        markers,
        spaces_after,
        content,
    })
}
493
494impl<'a> LintContext<'a> {
    /// Build a lint context for `content`, running every eager pre-computation once.
    ///
    /// Stages run in a fixed order because later ones consume earlier results:
    /// line offsets, code blocks, HTML comment ranges and (MkDocs flavor only)
    /// autodoc ranges feed the basic per-line info; HTML blocks and ESM blocks are
    /// then marked on `lines` before headings and blockquotes are detected; code
    /// spans are parsed next and used to flag continuation lines and to parse
    /// links, images and table blocks.
    ///
    /// The parsed code spans are stored in `code_spans_cache` immediately; the
    /// HTML tag, emphasis, table row and bare URL caches start empty.
    ///
    /// On non-WASM targets, per-stage timings are printed to stderr when the
    /// `RUMDL_PROFILE_QUADRATIC` environment variable is set.
    pub fn new(content: &'a str, flavor: MarkdownFlavor, source_file: Option<PathBuf>) -> Self {
        #[cfg(not(target_arch = "wasm32"))]
        let profile = std::env::var("RUMDL_PROFILE_QUADRATIC").is_ok();
        #[cfg(target_arch = "wasm32")]
        let profile = false;

        // Byte offset of the start of every line; the first line always starts at 0.
        let line_offsets = profile_section!("Line offsets", profile, {
            let mut offsets = vec![0];
            for (i, c) in content.char_indices() {
                if c == '\n' {
                    offsets.push(i + 1);
                }
            }
            offsets
        });

        // Detect code blocks once and cache them
        let code_blocks = profile_section!("Code blocks", profile, CodeBlockUtils::detect_code_blocks(content));

        // Pre-compute HTML comment ranges ONCE for all operations
        let html_comment_ranges = profile_section!(
            "HTML comment ranges",
            profile,
            crate::utils::skip_context::compute_html_comment_ranges(content)
        );

        // Pre-compute autodoc block ranges for MkDocs flavor (avoids O(n²) scaling)
        let autodoc_ranges = profile_section!("Autodoc block ranges", profile, {
            if flavor == MarkdownFlavor::MkDocs {
                crate::utils::mkdocstrings_refs::detect_autodoc_block_ranges(content)
            } else {
                Vec::new()
            }
        });

        // Pre-compute line information (without headings/blockquotes yet)
        let mut lines = profile_section!(
            "Basic line info",
            profile,
            Self::compute_basic_line_info(
                content,
                &line_offsets,
                &code_blocks,
                flavor,
                &html_comment_ranges,
                &autodoc_ranges,
            )
        );

        // Detect HTML blocks BEFORE heading detection
        profile_section!("HTML blocks", profile, Self::detect_html_blocks(content, &mut lines));

        // Detect ESM import/export blocks in MDX files BEFORE heading detection
        profile_section!(
            "ESM blocks",
            profile,
            Self::detect_esm_blocks(content, &mut lines, flavor)
        );

        // Now detect headings and blockquotes
        profile_section!(
            "Headings & blockquotes",
            profile,
            Self::detect_headings_and_blockquotes(content, &mut lines, flavor, &html_comment_ranges)
        );

        // Parse code spans early so we can exclude them from link/image parsing
        let code_spans = profile_section!("Code spans", profile, Self::parse_code_spans(content, &lines));

        // Mark lines that are continuations of multi-line code spans
        // This is needed for parse_list_blocks to correctly handle list items with multi-line code spans
        for span in &code_spans {
            if span.end_line > span.line {
                // Mark lines after the first line as continuations
                for line_num in (span.line + 1)..=span.end_line {
                    // Span line numbers are 1-indexed; `lines` is 0-indexed.
                    if let Some(line_info) = lines.get_mut(line_num - 1) {
                        line_info.in_code_span_continuation = true;
                    }
                }
            }
        }

        // Parse links, images, references, and list blocks
        let (links, broken_links, footnote_refs) = profile_section!(
            "Links",
            profile,
            Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor, &html_comment_ranges)
        );

        let images = profile_section!(
            "Images",
            profile,
            Self::parse_images(content, &lines, &code_blocks, &code_spans, &html_comment_ranges)
        );

        let reference_defs = profile_section!("Reference defs", profile, Self::parse_reference_defs(content, &lines));

        let list_blocks = profile_section!("List blocks", profile, Self::parse_list_blocks(content, &lines));

        // Compute character frequency for fast content analysis
        let char_frequency = profile_section!("Char frequency", profile, Self::compute_char_frequency(content));

        // Pre-compute table blocks for rules that need them (MD013, MD055, MD056, MD058, MD060)
        let table_blocks = profile_section!(
            "Table blocks",
            profile,
            crate::utils::table_utils::TableUtils::find_table_blocks_with_code_info(
                content,
                &code_blocks,
                &code_spans,
                &html_comment_ranges,
            )
        );

        // Pre-compute LineIndex once for all rules (eliminates 46x content cloning)
        let line_index = profile_section!(
            "Line index",
            profile,
            crate::utils::range_utils::LineIndex::new(content)
        );

        // Pre-compute Jinja template ranges once for all rules (eliminates O(n×m) in MD011)
        let jinja_ranges = profile_section!(
            "Jinja ranges",
            profile,
            crate::utils::jinja_utils::find_jinja_ranges(content)
        );

        Self {
            content,
            line_offsets,
            code_blocks,
            lines,
            links,
            images,
            broken_links,
            footnote_refs,
            reference_defs,
            // Code spans were already parsed above, so the cache starts populated.
            code_spans_cache: Mutex::new(Some(Arc::new(code_spans))),
            list_blocks,
            char_frequency,
            html_tags_cache: Mutex::new(None),
            emphasis_spans_cache: Mutex::new(None),
            table_rows_cache: Mutex::new(None),
            bare_urls_cache: Mutex::new(None),
            html_comment_ranges,
            table_blocks,
            line_index,
            jinja_ranges,
            flavor,
            source_file,
        }
    }
648
649    /// Get code spans - computed lazily on first access
650    pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
651        let mut cache = self.code_spans_cache.lock().expect("Code spans cache mutex poisoned");
652
653        Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_code_spans(self.content, &self.lines))))
654    }
655
656    /// Get HTML comment ranges - pre-computed during LintContext construction
657    pub fn html_comment_ranges(&self) -> &[crate::utils::skip_context::ByteRange] {
658        &self.html_comment_ranges
659    }
660
661    /// Get HTML tags - computed lazily on first access
662    pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
663        let mut cache = self.html_tags_cache.lock().expect("HTML tags cache mutex poisoned");
664
665        Arc::clone(cache.get_or_insert_with(|| {
666            Arc::new(Self::parse_html_tags(
667                self.content,
668                &self.lines,
669                &self.code_blocks,
670                self.flavor,
671            ))
672        }))
673    }
674
675    /// Get emphasis spans - computed lazily on first access
676    pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
677        let mut cache = self
678            .emphasis_spans_cache
679            .lock()
680            .expect("Emphasis spans cache mutex poisoned");
681
682        Arc::clone(
683            cache.get_or_insert_with(|| {
684                Arc::new(Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks))
685            }),
686        )
687    }
688
689    /// Get table rows - computed lazily on first access
690    pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
691        let mut cache = self.table_rows_cache.lock().expect("Table rows cache mutex poisoned");
692
693        Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_table_rows(self.content, &self.lines))))
694    }
695
696    /// Get bare URLs - computed lazily on first access
697    pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
698        let mut cache = self.bare_urls_cache.lock().expect("Bare URLs cache mutex poisoned");
699
700        Arc::clone(
701            cache.get_or_insert_with(|| Arc::new(Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks))),
702        )
703    }
704
705    /// Map a byte offset to (line, column)
706    pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
707        match self.line_offsets.binary_search(&offset) {
708            Ok(line) => (line + 1, 1),
709            Err(line) => {
710                let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
711                (line, offset - line_start + 1)
712            }
713        }
714    }
715
716    /// Check if a position is within a code block or code span
717    pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
718        // Check code blocks first
719        if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
720            return true;
721        }
722
723        // Check inline code spans (lazy load if needed)
724        self.code_spans()
725            .iter()
726            .any(|span| pos >= span.byte_offset && pos < span.byte_end)
727    }
728
729    /// Get line information by line number (1-indexed)
730    pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
731        if line_num > 0 {
732            self.lines.get(line_num - 1)
733        } else {
734            None
735        }
736    }
737
738    /// Get byte offset for a line number (1-indexed)
739    pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
740        self.line_info(line_num).map(|info| info.byte_offset)
741    }
742
743    /// Get URL for a reference link/image by its ID
744    pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
745        let normalized_id = ref_id.to_lowercase();
746        self.reference_defs
747            .iter()
748            .find(|def| def.id == normalized_id)
749            .map(|def| def.url.as_str())
750    }
751
752    /// Check if a line is part of a list block
753    pub fn is_in_list_block(&self, line_num: usize) -> bool {
754        self.list_blocks
755            .iter()
756            .any(|block| line_num >= block.start_line && line_num <= block.end_line)
757    }
758
759    /// Get the list block containing a specific line
760    pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
761        self.list_blocks
762            .iter()
763            .find(|block| line_num >= block.start_line && line_num <= block.end_line)
764    }
765
766    // Compatibility methods for DocumentStructure migration
767
768    /// Check if a line is within a code block
769    pub fn is_in_code_block(&self, line_num: usize) -> bool {
770        if line_num == 0 || line_num > self.lines.len() {
771            return false;
772        }
773        self.lines[line_num - 1].in_code_block
774    }
775
776    /// Check if a line is within front matter
777    pub fn is_in_front_matter(&self, line_num: usize) -> bool {
778        if line_num == 0 || line_num > self.lines.len() {
779            return false;
780        }
781        self.lines[line_num - 1].in_front_matter
782    }
783
784    /// Check if a line is within an HTML block
785    pub fn is_in_html_block(&self, line_num: usize) -> bool {
786        if line_num == 0 || line_num > self.lines.len() {
787            return false;
788        }
789        self.lines[line_num - 1].in_html_block
790    }
791
792    /// Check if a line and column is within a code span
793    pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
794        if line_num == 0 || line_num > self.lines.len() {
795            return false;
796        }
797
798        // Use the code spans cache to check
799        // Note: col is 1-indexed from caller, but span.start_col and span.end_col are 0-indexed
800        // Convert col to 0-indexed for comparison
801        let col_0indexed = if col > 0 { col - 1 } else { 0 };
802        let code_spans = self.code_spans();
803        code_spans.iter().any(|span| {
804            // Check if line is within the span's line range
805            if line_num < span.line || line_num > span.end_line {
806                return false;
807            }
808
809            if span.line == span.end_line {
810                // Single-line span: check column bounds
811                col_0indexed >= span.start_col && col_0indexed < span.end_col
812            } else if line_num == span.line {
813                // First line of multi-line span: anything after start_col is in span
814                col_0indexed >= span.start_col
815            } else if line_num == span.end_line {
816                // Last line of multi-line span: anything before end_col is in span
817                col_0indexed < span.end_col
818            } else {
819                // Middle line of multi-line span: entire line is in span
820                true
821            }
822        })
823    }
824
825    /// Check if a byte offset is within a code span
826    #[inline]
827    pub fn is_byte_offset_in_code_span(&self, byte_offset: usize) -> bool {
828        let code_spans = self.code_spans();
829        code_spans
830            .iter()
831            .any(|span| byte_offset >= span.byte_offset && byte_offset < span.byte_end)
832    }
833
834    /// Check if a byte position is within a reference definition
835    /// This is much faster than scanning the content with regex for each check (O(1) vs O(n))
836    #[inline]
837    pub fn is_in_reference_def(&self, byte_pos: usize) -> bool {
838        self.reference_defs
839            .iter()
840            .any(|ref_def| byte_pos >= ref_def.byte_offset && byte_pos < ref_def.byte_end)
841    }
842
843    /// Check if a byte position is within an HTML comment
844    /// This is much faster than scanning the content with regex for each check (O(k) vs O(n))
845    /// where k is the number of HTML comments (typically very small)
846    #[inline]
847    pub fn is_in_html_comment(&self, byte_pos: usize) -> bool {
848        self.html_comment_ranges
849            .iter()
850            .any(|range| byte_pos >= range.start && byte_pos < range.end)
851    }
852
853    /// Check if a byte position is within a Jinja template ({{ }} or {% %})
854    pub fn is_in_jinja_range(&self, byte_pos: usize) -> bool {
855        self.jinja_ranges
856            .iter()
857            .any(|(start, end)| byte_pos >= *start && byte_pos < *end)
858    }
859
860    /// Check if content has any instances of a specific character (fast)
861    pub fn has_char(&self, ch: char) -> bool {
862        match ch {
863            '#' => self.char_frequency.hash_count > 0,
864            '*' => self.char_frequency.asterisk_count > 0,
865            '_' => self.char_frequency.underscore_count > 0,
866            '-' => self.char_frequency.hyphen_count > 0,
867            '+' => self.char_frequency.plus_count > 0,
868            '>' => self.char_frequency.gt_count > 0,
869            '|' => self.char_frequency.pipe_count > 0,
870            '[' => self.char_frequency.bracket_count > 0,
871            '`' => self.char_frequency.backtick_count > 0,
872            '<' => self.char_frequency.lt_count > 0,
873            '!' => self.char_frequency.exclamation_count > 0,
874            '\n' => self.char_frequency.newline_count > 0,
875            _ => self.content.contains(ch), // Fallback for other characters
876        }
877    }
878
879    /// Get count of a specific character (fast)
880    pub fn char_count(&self, ch: char) -> usize {
881        match ch {
882            '#' => self.char_frequency.hash_count,
883            '*' => self.char_frequency.asterisk_count,
884            '_' => self.char_frequency.underscore_count,
885            '-' => self.char_frequency.hyphen_count,
886            '+' => self.char_frequency.plus_count,
887            '>' => self.char_frequency.gt_count,
888            '|' => self.char_frequency.pipe_count,
889            '[' => self.char_frequency.bracket_count,
890            '`' => self.char_frequency.backtick_count,
891            '<' => self.char_frequency.lt_count,
892            '!' => self.char_frequency.exclamation_count,
893            '\n' => self.char_frequency.newline_count,
894            _ => self.content.matches(ch).count(), // Fallback for other characters
895        }
896    }
897
898    /// Check if content likely contains headings (fast)
899    pub fn likely_has_headings(&self) -> bool {
900        self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 // Potential setext underlines
901    }
902
903    /// Check if content likely contains lists (fast)
904    pub fn likely_has_lists(&self) -> bool {
905        self.char_frequency.asterisk_count > 0
906            || self.char_frequency.hyphen_count > 0
907            || self.char_frequency.plus_count > 0
908    }
909
910    /// Check if content likely contains emphasis (fast)
911    pub fn likely_has_emphasis(&self) -> bool {
912        self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
913    }
914
915    /// Check if content likely contains tables (fast)
916    pub fn likely_has_tables(&self) -> bool {
917        self.char_frequency.pipe_count > 2
918    }
919
920    /// Check if content likely contains blockquotes (fast)
921    pub fn likely_has_blockquotes(&self) -> bool {
922        self.char_frequency.gt_count > 0
923    }
924
925    /// Check if content likely contains code (fast)
926    pub fn likely_has_code(&self) -> bool {
927        self.char_frequency.backtick_count > 0
928    }
929
930    /// Check if content likely contains links or images (fast)
931    pub fn likely_has_links_or_images(&self) -> bool {
932        self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
933    }
934
935    /// Check if content likely contains HTML (fast)
936    pub fn likely_has_html(&self) -> bool {
937        self.char_frequency.lt_count > 0
938    }
939
940    /// Get HTML tags on a specific line
941    pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
942        self.html_tags()
943            .iter()
944            .filter(|tag| tag.line == line_num)
945            .cloned()
946            .collect()
947    }
948
949    /// Get emphasis spans on a specific line
950    pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
951        self.emphasis_spans()
952            .iter()
953            .filter(|span| span.line == line_num)
954            .cloned()
955            .collect()
956    }
957
958    /// Get table rows on a specific line
959    pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
960        self.table_rows()
961            .iter()
962            .filter(|row| row.line == line_num)
963            .cloned()
964            .collect()
965    }
966
967    /// Get bare URLs on a specific line
968    pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
969        self.bare_urls()
970            .iter()
971            .filter(|url| url.line == line_num)
972            .cloned()
973            .collect()
974    }
975
976    /// Find the line index for a given byte offset using binary search.
977    /// Returns (line_index, line_number, column) where:
978    /// - line_index is the 0-based index in the lines array
979    /// - line_number is the 1-based line number
980    /// - column is the byte offset within that line
981    #[inline]
982    fn find_line_for_offset(lines: &[LineInfo], byte_offset: usize) -> (usize, usize, usize) {
983        // Binary search to find the line containing this byte offset
984        let idx = match lines.binary_search_by(|line| {
985            if byte_offset < line.byte_offset {
986                std::cmp::Ordering::Greater
987            } else if byte_offset > line.byte_offset + line.byte_len {
988                std::cmp::Ordering::Less
989            } else {
990                std::cmp::Ordering::Equal
991            }
992        }) {
993            Ok(idx) => idx,
994            Err(idx) => idx.saturating_sub(1),
995        };
996
997        let line = &lines[idx];
998        let line_num = idx + 1;
999        let col = byte_offset.saturating_sub(line.byte_offset);
1000
1001        (idx, line_num, col)
1002    }
1003
1004    /// Check if a byte offset is within a code span using binary search
1005    #[inline]
1006    fn is_offset_in_code_span(code_spans: &[CodeSpan], offset: usize) -> bool {
1007        // Since spans are sorted by byte_offset, use partition_point for binary search
1008        let idx = code_spans.partition_point(|span| span.byte_offset <= offset);
1009
1010        // Check the span that starts at or before our offset
1011        if idx > 0 {
1012            let span = &code_spans[idx - 1];
1013            if offset >= span.byte_offset && offset < span.byte_end {
1014                return true;
1015            }
1016        }
1017
1018        false
1019    }
1020
1021    /// Parse all links in the content
1022    fn parse_links(
1023        content: &'a str,
1024        lines: &[LineInfo],
1025        code_blocks: &[(usize, usize)],
1026        code_spans: &[CodeSpan],
1027        flavor: MarkdownFlavor,
1028        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1029    ) -> (Vec<ParsedLink<'a>>, Vec<BrokenLinkInfo>, Vec<FootnoteRef>) {
1030        use crate::utils::skip_context::{is_in_html_comment_ranges, is_mkdocs_snippet_line};
1031        use std::collections::HashSet;
1032
1033        let mut links = Vec::with_capacity(content.len() / 500);
1034        let mut broken_links = Vec::new();
1035        let mut footnote_refs = Vec::new();
1036
1037        // Track byte positions of links found by pulldown-cmark
1038        let mut found_positions = HashSet::new();
1039
1040        // Use pulldown-cmark's streaming parser with BrokenLink callback
1041        // The callback captures undefined references: [text][undefined], [shortcut], [text][]
1042        // This automatically handles:
1043        // - Escaped links (won't generate events)
1044        // - Links in code blocks/spans (won't generate Link events)
1045        // - Images (generates Tag::Image instead)
1046        // - Reference resolution (dest_url is already resolved!)
1047        // - Broken references (callback is invoked)
1048        // - Wiki-links (enabled via ENABLE_WIKILINKS)
1049        let mut options = Options::empty();
1050        options.insert(Options::ENABLE_WIKILINKS);
1051        options.insert(Options::ENABLE_FOOTNOTES);
1052
1053        let parser = Parser::new_with_broken_link_callback(
1054            content,
1055            options,
1056            Some(|link: BrokenLink<'_>| {
1057                broken_links.push(BrokenLinkInfo {
1058                    reference: link.reference.to_string(),
1059                    span: link.span.clone(),
1060                });
1061                None
1062            }),
1063        )
1064        .into_offset_iter();
1065
1066        let mut link_stack: Vec<(
1067            usize,
1068            usize,
1069            pulldown_cmark::CowStr<'a>,
1070            LinkType,
1071            pulldown_cmark::CowStr<'a>,
1072        )> = Vec::new();
1073        let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1074
1075        for (event, range) in parser {
1076            match event {
1077                Event::Start(Tag::Link {
1078                    link_type,
1079                    dest_url,
1080                    id,
1081                    ..
1082                }) => {
1083                    // Link start - record position, URL, and reference ID
1084                    link_stack.push((range.start, range.end, dest_url, link_type, id));
1085                    text_chunks.clear();
1086                }
1087                Event::Text(text) if !link_stack.is_empty() => {
1088                    // Track text content with its byte range
1089                    text_chunks.push((text.to_string(), range.start, range.end));
1090                }
1091                Event::Code(code) if !link_stack.is_empty() => {
1092                    // Include inline code in link text (with backticks)
1093                    let code_text = format!("`{code}`");
1094                    text_chunks.push((code_text, range.start, range.end));
1095                }
1096                Event::End(TagEnd::Link) => {
1097                    if let Some((start_pos, _link_start_end, url, link_type, ref_id)) = link_stack.pop() {
1098                        // Skip if in HTML comment
1099                        if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1100                            text_chunks.clear();
1101                            continue;
1102                        }
1103
1104                        // Find line and column information
1105                        let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1106
1107                        // Skip if this link is on a MkDocs snippet line
1108                        if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1109                            text_chunks.clear();
1110                            continue;
1111                        }
1112
1113                        let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1114
1115                        let is_reference = matches!(
1116                            link_type,
1117                            LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1118                        );
1119
1120                        // Extract link text directly from source bytes to preserve escaping
1121                        // Text events from pulldown-cmark unescape \] → ], which breaks MD039
1122                        let link_text = if start_pos < content.len() {
1123                            let link_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1124
1125                            // Find MATCHING ] by tracking bracket depth for nested brackets
1126                            // An unescaped bracket is one NOT preceded by an odd number of backslashes
1127                            // Brackets inside code spans (between backticks) should be ignored
1128                            let mut close_pos = None;
1129                            let mut depth = 0;
1130                            let mut in_code_span = false;
1131
1132                            for (i, &byte) in link_bytes.iter().enumerate().skip(1) {
1133                                // Count preceding backslashes
1134                                let mut backslash_count = 0;
1135                                let mut j = i;
1136                                while j > 0 && link_bytes[j - 1] == b'\\' {
1137                                    backslash_count += 1;
1138                                    j -= 1;
1139                                }
1140                                let is_escaped = backslash_count % 2 != 0;
1141
1142                                // Track code spans - backticks toggle in/out of code
1143                                if byte == b'`' && !is_escaped {
1144                                    in_code_span = !in_code_span;
1145                                }
1146
1147                                // Only count brackets when NOT in a code span
1148                                if !is_escaped && !in_code_span {
1149                                    if byte == b'[' {
1150                                        depth += 1;
1151                                    } else if byte == b']' {
1152                                        if depth == 0 {
1153                                            // Found the matching closing bracket
1154                                            close_pos = Some(i);
1155                                            break;
1156                                        } else {
1157                                            depth -= 1;
1158                                        }
1159                                    }
1160                                }
1161                            }
1162
1163                            if let Some(pos) = close_pos {
1164                                Cow::Borrowed(std::str::from_utf8(&link_bytes[1..pos]).unwrap_or(""))
1165                            } else {
1166                                Cow::Borrowed("")
1167                            }
1168                        } else {
1169                            Cow::Borrowed("")
1170                        };
1171
1172                        // For reference links, use the actual reference ID from pulldown-cmark
1173                        let reference_id = if is_reference && !ref_id.is_empty() {
1174                            Some(Cow::Owned(ref_id.to_lowercase()))
1175                        } else if is_reference {
1176                            // For collapsed/shortcut references without explicit ID, use the link text
1177                            Some(Cow::Owned(link_text.to_lowercase()))
1178                        } else {
1179                            None
1180                        };
1181
1182                        // WORKAROUND: pulldown-cmark bug with escaped brackets
1183                        // Check for escaped image syntax: \![text](url)
1184                        // The byte_offset points to the '[', so we check 2 bytes back for '\!'
1185                        let has_escaped_bang = start_pos >= 2
1186                            && content.as_bytes().get(start_pos - 2) == Some(&b'\\')
1187                            && content.as_bytes().get(start_pos - 1) == Some(&b'!');
1188
1189                        // Check for escaped bracket: \[text](url)
1190                        // The byte_offset points to the '[', so we check 1 byte back for '\'
1191                        let has_escaped_bracket =
1192                            start_pos >= 1 && content.as_bytes().get(start_pos - 1) == Some(&b'\\');
1193
1194                        if has_escaped_bang || has_escaped_bracket {
1195                            text_chunks.clear();
1196                            continue; // Skip: this is escaped markdown, not a real link
1197                        }
1198
1199                        // Track this position as found
1200                        found_positions.insert(start_pos);
1201
1202                        links.push(ParsedLink {
1203                            line: line_num,
1204                            start_col: col_start,
1205                            end_col: col_end,
1206                            byte_offset: start_pos,
1207                            byte_end: range.end,
1208                            text: link_text,
1209                            url: Cow::Owned(url.to_string()),
1210                            is_reference,
1211                            reference_id,
1212                            link_type,
1213                        });
1214
1215                        text_chunks.clear();
1216                    }
1217                }
1218                Event::FootnoteReference(footnote_id) => {
1219                    // Capture footnote references like [^1], [^note]
1220                    // Skip if in HTML comment
1221                    if is_in_html_comment_ranges(html_comment_ranges, range.start) {
1222                        continue;
1223                    }
1224
1225                    let (_, line_num, _) = Self::find_line_for_offset(lines, range.start);
1226                    footnote_refs.push(FootnoteRef {
1227                        id: footnote_id.to_string(),
1228                        line: line_num,
1229                        byte_offset: range.start,
1230                        byte_end: range.end,
1231                    });
1232                }
1233                _ => {}
1234            }
1235        }
1236
1237        // Also find undefined references using regex
1238        // These are patterns like [text][ref] that pulldown-cmark didn't parse as links
1239        // because the reference is undefined
1240        for cap in LINK_PATTERN.captures_iter(content) {
1241            let full_match = cap.get(0).unwrap();
1242            let match_start = full_match.start();
1243            let match_end = full_match.end();
1244
1245            // Skip if this was already found by pulldown-cmark (it's a valid link)
1246            if found_positions.contains(&match_start) {
1247                continue;
1248            }
1249
1250            // Skip if escaped
1251            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1252                continue;
1253            }
1254
1255            // Skip if it's an image
1256            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
1257                continue;
1258            }
1259
1260            // Skip if in code block
1261            if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
1262                continue;
1263            }
1264
1265            // Skip if in code span
1266            if Self::is_offset_in_code_span(code_spans, match_start) {
1267                continue;
1268            }
1269
1270            // Skip if in HTML comment
1271            if is_in_html_comment_ranges(html_comment_ranges, match_start) {
1272                continue;
1273            }
1274
1275            // Find line and column information
1276            let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1277
1278            // Skip if this link is on a MkDocs snippet line
1279            if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1280                continue;
1281            }
1282
1283            let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1284
1285            let text = cap.get(1).map_or("", |m| m.as_str());
1286
1287            // Only process reference links (group 6)
1288            if let Some(ref_id) = cap.get(6) {
1289                let ref_id_str = ref_id.as_str();
1290                let normalized_ref = if ref_id_str.is_empty() {
1291                    Cow::Owned(text.to_lowercase()) // Implicit reference
1292                } else {
1293                    Cow::Owned(ref_id_str.to_lowercase())
1294                };
1295
1296                // This is an undefined reference (pulldown-cmark didn't parse it)
1297                links.push(ParsedLink {
1298                    line: line_num,
1299                    start_col: col_start,
1300                    end_col: col_end,
1301                    byte_offset: match_start,
1302                    byte_end: match_end,
1303                    text: Cow::Borrowed(text),
1304                    url: Cow::Borrowed(""), // Empty URL indicates undefined reference
1305                    is_reference: true,
1306                    reference_id: Some(normalized_ref),
1307                    link_type: LinkType::Reference, // Undefined references are reference-style
1308                });
1309            }
1310        }
1311
1312        (links, broken_links, footnote_refs)
1313    }
1314
1315    /// Parse all images in the content
1316    fn parse_images(
1317        content: &'a str,
1318        lines: &[LineInfo],
1319        code_blocks: &[(usize, usize)],
1320        code_spans: &[CodeSpan],
1321        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1322    ) -> Vec<ParsedImage<'a>> {
1323        use crate::utils::skip_context::is_in_html_comment_ranges;
1324        use std::collections::HashSet;
1325
1326        // Pre-size based on a heuristic: images are less common than links
1327        let mut images = Vec::with_capacity(content.len() / 1000);
1328        let mut found_positions = HashSet::new();
1329
1330        // Use pulldown-cmark for parsing - more accurate and faster
1331        let parser = Parser::new(content).into_offset_iter();
1332        let mut image_stack: Vec<(usize, pulldown_cmark::CowStr<'a>, LinkType, pulldown_cmark::CowStr<'a>)> =
1333            Vec::new();
1334        let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1335
1336        for (event, range) in parser {
1337            match event {
1338                Event::Start(Tag::Image {
1339                    link_type,
1340                    dest_url,
1341                    id,
1342                    ..
1343                }) => {
1344                    image_stack.push((range.start, dest_url, link_type, id));
1345                    text_chunks.clear();
1346                }
1347                Event::Text(text) if !image_stack.is_empty() => {
1348                    text_chunks.push((text.to_string(), range.start, range.end));
1349                }
1350                Event::Code(code) if !image_stack.is_empty() => {
1351                    let code_text = format!("`{code}`");
1352                    text_chunks.push((code_text, range.start, range.end));
1353                }
1354                Event::End(TagEnd::Image) => {
1355                    if let Some((start_pos, url, link_type, ref_id)) = image_stack.pop() {
1356                        // Skip if in code block
1357                        if CodeBlockUtils::is_in_code_block(code_blocks, start_pos) {
1358                            continue;
1359                        }
1360
1361                        // Skip if in code span
1362                        if Self::is_offset_in_code_span(code_spans, start_pos) {
1363                            continue;
1364                        }
1365
1366                        // Skip if in HTML comment
1367                        if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1368                            continue;
1369                        }
1370
1371                        // Find line and column using binary search
1372                        let (_, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1373                        let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1374
1375                        let is_reference = matches!(
1376                            link_type,
1377                            LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1378                        );
1379
1380                        // Extract alt text directly from source bytes to preserve escaping
1381                        // Text events from pulldown-cmark unescape \] → ], which breaks rules that need escaping
1382                        let alt_text = if start_pos < content.len() {
1383                            let image_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1384
1385                            // Find MATCHING ] by tracking bracket depth for nested brackets
1386                            // An unescaped bracket is one NOT preceded by an odd number of backslashes
1387                            let mut close_pos = None;
1388                            let mut depth = 0;
1389
1390                            if image_bytes.len() > 2 {
1391                                for (i, &byte) in image_bytes.iter().enumerate().skip(2) {
1392                                    // Count preceding backslashes
1393                                    let mut backslash_count = 0;
1394                                    let mut j = i;
1395                                    while j > 0 && image_bytes[j - 1] == b'\\' {
1396                                        backslash_count += 1;
1397                                        j -= 1;
1398                                    }
1399                                    let is_escaped = backslash_count % 2 != 0;
1400
1401                                    if !is_escaped {
1402                                        if byte == b'[' {
1403                                            depth += 1;
1404                                        } else if byte == b']' {
1405                                            if depth == 0 {
1406                                                // Found the matching closing bracket
1407                                                close_pos = Some(i);
1408                                                break;
1409                                            } else {
1410                                                depth -= 1;
1411                                            }
1412                                        }
1413                                    }
1414                                }
1415                            }
1416
1417                            if let Some(pos) = close_pos {
1418                                Cow::Borrowed(std::str::from_utf8(&image_bytes[2..pos]).unwrap_or(""))
1419                            } else {
1420                                Cow::Borrowed("")
1421                            }
1422                        } else {
1423                            Cow::Borrowed("")
1424                        };
1425
1426                        let reference_id = if is_reference && !ref_id.is_empty() {
1427                            Some(Cow::Owned(ref_id.to_lowercase()))
1428                        } else if is_reference {
1429                            Some(Cow::Owned(alt_text.to_lowercase())) // Collapsed/shortcut references
1430                        } else {
1431                            None
1432                        };
1433
1434                        found_positions.insert(start_pos);
1435                        images.push(ParsedImage {
1436                            line: line_num,
1437                            start_col: col_start,
1438                            end_col: col_end,
1439                            byte_offset: start_pos,
1440                            byte_end: range.end,
1441                            alt_text,
1442                            url: Cow::Owned(url.to_string()),
1443                            is_reference,
1444                            reference_id,
1445                            link_type,
1446                        });
1447                    }
1448                }
1449                _ => {}
1450            }
1451        }
1452
1453        // Regex fallback for undefined references that pulldown-cmark treats as plain text
1454        for cap in IMAGE_PATTERN.captures_iter(content) {
1455            let full_match = cap.get(0).unwrap();
1456            let match_start = full_match.start();
1457            let match_end = full_match.end();
1458
1459            // Skip if already found by pulldown-cmark
1460            if found_positions.contains(&match_start) {
1461                continue;
1462            }
1463
1464            // Skip if the ! is escaped
1465            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1466                continue;
1467            }
1468
1469            // Skip if in code block, code span, or HTML comment
1470            if CodeBlockUtils::is_in_code_block(code_blocks, match_start)
1471                || Self::is_offset_in_code_span(code_spans, match_start)
1472                || is_in_html_comment_ranges(html_comment_ranges, match_start)
1473            {
1474                continue;
1475            }
1476
1477            // Only process reference images (undefined references not found by pulldown-cmark)
1478            if let Some(ref_id) = cap.get(6) {
1479                let (_, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1480                let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1481                let alt_text = cap.get(1).map_or("", |m| m.as_str());
1482                let ref_id_str = ref_id.as_str();
1483                let normalized_ref = if ref_id_str.is_empty() {
1484                    Cow::Owned(alt_text.to_lowercase())
1485                } else {
1486                    Cow::Owned(ref_id_str.to_lowercase())
1487                };
1488
1489                images.push(ParsedImage {
1490                    line: line_num,
1491                    start_col: col_start,
1492                    end_col: col_end,
1493                    byte_offset: match_start,
1494                    byte_end: match_end,
1495                    alt_text: Cow::Borrowed(alt_text),
1496                    url: Cow::Borrowed(""),
1497                    is_reference: true,
1498                    reference_id: Some(normalized_ref),
1499                    link_type: LinkType::Reference, // Undefined references are reference-style
1500                });
1501            }
1502        }
1503
1504        images
1505    }
1506
1507    /// Parse reference definitions
1508    fn parse_reference_defs(content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
1509        // Pre-size based on lines count as reference definitions are line-based
1510        let mut refs = Vec::with_capacity(lines.len() / 20); // ~1 ref per 20 lines
1511
1512        for (line_idx, line_info) in lines.iter().enumerate() {
1513            // Skip lines in code blocks
1514            if line_info.in_code_block {
1515                continue;
1516            }
1517
1518            let line = line_info.content(content);
1519            let line_num = line_idx + 1;
1520
1521            if let Some(cap) = REF_DEF_PATTERN.captures(line) {
1522                let id = cap.get(1).unwrap().as_str().to_lowercase();
1523                let url = cap.get(2).unwrap().as_str().to_string();
1524                let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
1525
1526                // Calculate byte positions
1527                // The match starts at the beginning of the line (0) and extends to the end
1528                let match_obj = cap.get(0).unwrap();
1529                let byte_offset = line_info.byte_offset + match_obj.start();
1530                let byte_end = line_info.byte_offset + match_obj.end();
1531
1532                refs.push(ReferenceDef {
1533                    line: line_num,
1534                    id,
1535                    url,
1536                    title,
1537                    byte_offset,
1538                    byte_end,
1539                });
1540            }
1541        }
1542
1543        refs
1544    }
1545
1546    /// Fast blockquote prefix parser - replaces regex for 5-10x speedup
1547    /// Matches: ^(\s*>\s*)(.*)
1548    /// Returns: Some((prefix_with_ws, content_after_prefix)) or None
1549    #[inline]
1550    fn parse_blockquote_prefix(line: &str) -> Option<(&str, &str)> {
1551        let trimmed_start = line.trim_start();
1552        if !trimmed_start.starts_with('>') {
1553            return None;
1554        }
1555
1556        let leading_ws_len = line.len() - trimmed_start.len();
1557        let after_gt = &trimmed_start[1..];
1558        let content = after_gt.trim_start();
1559        let ws_after_gt_len = after_gt.len() - content.len();
1560        let prefix_len = leading_ws_len + 1 + ws_after_gt_len;
1561
1562        Some((&line[..prefix_len], content))
1563    }
1564
1565    /// Fast unordered list parser - replaces regex for 5-10x speedup
1566    /// Matches: ^(\s*)([-*+])([ \t]*)(.*)
1567    /// Returns: Some((leading_ws, marker, spacing, content)) or None
1568    #[inline]
1569    fn parse_unordered_list(line: &str) -> Option<(&str, char, &str, &str)> {
1570        let bytes = line.as_bytes();
1571        let mut i = 0;
1572
1573        // Skip leading whitespace
1574        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1575            i += 1;
1576        }
1577
1578        // Check for marker
1579        if i >= bytes.len() {
1580            return None;
1581        }
1582        let marker = bytes[i] as char;
1583        if marker != '-' && marker != '*' && marker != '+' {
1584            return None;
1585        }
1586        let marker_pos = i;
1587        i += 1;
1588
1589        // Collect spacing after marker (space or tab only)
1590        let spacing_start = i;
1591        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1592            i += 1;
1593        }
1594
1595        Some((&line[..marker_pos], marker, &line[spacing_start..i], &line[i..]))
1596    }
1597
1598    /// Fast ordered list parser - replaces regex for 5-10x speedup
1599    /// Matches: ^(\s*)(\d+)([.)])([ \t]*)(.*)
1600    /// Returns: Some((leading_ws, number_str, delimiter, spacing, content)) or None
1601    #[inline]
1602    fn parse_ordered_list(line: &str) -> Option<(&str, &str, char, &str, &str)> {
1603        let bytes = line.as_bytes();
1604        let mut i = 0;
1605
1606        // Skip leading whitespace
1607        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1608            i += 1;
1609        }
1610
1611        // Collect digits
1612        let number_start = i;
1613        while i < bytes.len() && bytes[i].is_ascii_digit() {
1614            i += 1;
1615        }
1616        if i == number_start {
1617            return None; // No digits found
1618        }
1619
1620        // Check for delimiter
1621        if i >= bytes.len() {
1622            return None;
1623        }
1624        let delimiter = bytes[i] as char;
1625        if delimiter != '.' && delimiter != ')' {
1626            return None;
1627        }
1628        let delimiter_pos = i;
1629        i += 1;
1630
1631        // Collect spacing after delimiter (space or tab only)
1632        let spacing_start = i;
1633        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1634            i += 1;
1635        }
1636
1637        Some((
1638            &line[..number_start],
1639            &line[number_start..delimiter_pos],
1640            delimiter,
1641            &line[spacing_start..i],
1642            &line[i..],
1643        ))
1644    }
1645
1646    /// Pre-compute which lines are in code blocks - O(m*n) where m=code_blocks, n=lines
1647    /// Returns a Vec<bool> where index i indicates if line i is in a code block
1648    fn compute_code_block_line_map(content: &str, line_offsets: &[usize], code_blocks: &[(usize, usize)]) -> Vec<bool> {
1649        let num_lines = line_offsets.len();
1650        let mut in_code_block = vec![false; num_lines];
1651
1652        // For each code block, mark all lines within it
1653        for &(start, end) in code_blocks {
1654            // Ensure we're at valid UTF-8 boundaries
1655            let safe_start = if start > 0 && !content.is_char_boundary(start) {
1656                let mut boundary = start;
1657                while boundary > 0 && !content.is_char_boundary(boundary) {
1658                    boundary -= 1;
1659                }
1660                boundary
1661            } else {
1662                start
1663            };
1664
1665            let safe_end = if end < content.len() && !content.is_char_boundary(end) {
1666                let mut boundary = end;
1667                while boundary < content.len() && !content.is_char_boundary(boundary) {
1668                    boundary += 1;
1669                }
1670                boundary
1671            } else {
1672                end.min(content.len())
1673            };
1674
1675            // Trust the code blocks detected by CodeBlockUtils::detect_code_blocks()
1676            // That function now has proper list context awareness (see code_block_utils.rs)
1677            // and correctly distinguishes between:
1678            // - Fenced code blocks (``` or ~~~)
1679            // - Indented code blocks at document level (4 spaces + blank line before)
1680            // - List continuation paragraphs (NOT code blocks, even with 4 spaces)
1681            //
1682            // We no longer need to re-validate here. The original validation logic
1683            // was causing false positives by marking list continuation paragraphs as
1684            // code blocks when they have 4 spaces of indentation.
1685
1686            // Use binary search to find the first and last line indices
1687            // line_offsets is sorted, so we can use partition_point for O(log n) lookup
1688            // Use safe_start/safe_end (UTF-8 boundaries) for consistent line mapping
1689            //
1690            // Find the line that CONTAINS safe_start: the line with the largest
1691            // start offset that is <= safe_start. partition_point gives us the
1692            // first line that starts AFTER safe_start, so we subtract 1.
1693            let first_line_after = line_offsets.partition_point(|&offset| offset <= safe_start);
1694            let first_line = first_line_after.saturating_sub(1);
1695            let last_line = line_offsets.partition_point(|&offset| offset < safe_end);
1696
1697            // Mark all lines in the range at once
1698            for flag in in_code_block.iter_mut().take(last_line).skip(first_line) {
1699                *flag = true;
1700            }
1701        }
1702
1703        in_code_block
1704    }
1705
1706    /// Pre-compute basic line information (without headings/blockquotes)
1707    fn compute_basic_line_info(
1708        content: &str,
1709        line_offsets: &[usize],
1710        code_blocks: &[(usize, usize)],
1711        flavor: MarkdownFlavor,
1712        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1713        autodoc_ranges: &[crate::utils::skip_context::ByteRange],
1714    ) -> Vec<LineInfo> {
1715        let content_lines: Vec<&str> = content.lines().collect();
1716        let mut lines = Vec::with_capacity(content_lines.len());
1717
1718        // Pre-compute which lines are in code blocks
1719        let code_block_map = Self::compute_code_block_line_map(content, line_offsets, code_blocks);
1720
1721        // Detect front matter boundaries FIRST, before any other parsing
1722        // Use FrontMatterUtils to detect all types of front matter (YAML, TOML, JSON, malformed)
1723        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1724
1725        for (i, line) in content_lines.iter().enumerate() {
1726            let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
1727            let indent = line.len() - line.trim_start().len();
1728
1729            // Parse blockquote prefix once and reuse it (avoid redundant parsing)
1730            let blockquote_parse = Self::parse_blockquote_prefix(line);
1731
1732            // For blank detection, consider blockquote context
1733            let is_blank = if let Some((_, content)) = blockquote_parse {
1734                // In blockquote context, check if content after prefix is blank
1735                content.trim().is_empty()
1736            } else {
1737                line.trim().is_empty()
1738            };
1739
1740            // Use pre-computed map for O(1) lookup instead of O(m) iteration
1741            let in_code_block = code_block_map.get(i).copied().unwrap_or(false);
1742
1743            // Detect list items (skip if in frontmatter, in mkdocstrings block, or in HTML comment)
1744            let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
1745                && crate::utils::mkdocstrings_refs::is_within_autodoc_block_ranges(autodoc_ranges, byte_offset);
1746            // Use pre-computed ranges for efficiency (O(log n) vs O(file_size))
1747            let in_html_comment =
1748                crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, byte_offset);
1749            let list_item = if !(in_code_block
1750                || is_blank
1751                || in_mkdocstrings
1752                || in_html_comment
1753                || (front_matter_end > 0 && i < front_matter_end))
1754            {
1755                // Strip blockquote prefix if present for list detection (reuse cached result)
1756                let (line_for_list_check, blockquote_prefix_len) = if let Some((prefix, content)) = blockquote_parse {
1757                    (content, prefix.len())
1758                } else {
1759                    (&**line, 0)
1760                };
1761
1762                if let Some((leading_spaces, marker, spacing, _content)) =
1763                    Self::parse_unordered_list(line_for_list_check)
1764                {
1765                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1766                    let content_column = marker_column + 1 + spacing.len();
1767
1768                    // According to CommonMark spec, unordered list items MUST have at least one space
1769                    // after the marker (-, *, or +). Without a space, it's not a list item.
1770                    // This also naturally handles cases like:
1771                    // - *emphasis* (not a list)
1772                    // - **bold** (not a list)
1773                    // - --- (horizontal rule, not a list)
1774                    if spacing.is_empty() {
1775                        None
1776                    } else {
1777                        Some(ListItemInfo {
1778                            marker: marker.to_string(),
1779                            is_ordered: false,
1780                            number: None,
1781                            marker_column,
1782                            content_column,
1783                        })
1784                    }
1785                } else if let Some((leading_spaces, number_str, delimiter, spacing, _content)) =
1786                    Self::parse_ordered_list(line_for_list_check)
1787                {
1788                    let marker = format!("{number_str}{delimiter}");
1789                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1790                    let content_column = marker_column + marker.len() + spacing.len();
1791
1792                    // According to CommonMark spec, ordered list items MUST have at least one space
1793                    // after the marker (period or parenthesis). Without a space, it's not a list item.
1794                    if spacing.is_empty() {
1795                        None
1796                    } else {
1797                        Some(ListItemInfo {
1798                            marker,
1799                            is_ordered: true,
1800                            number: number_str.parse().ok(),
1801                            marker_column,
1802                            content_column,
1803                        })
1804                    }
1805                } else {
1806                    None
1807                }
1808            } else {
1809                None
1810            };
1811
1812            lines.push(LineInfo {
1813                byte_offset,
1814                byte_len: line.len(),
1815                indent,
1816                is_blank,
1817                in_code_block,
1818                in_front_matter: front_matter_end > 0 && i < front_matter_end,
1819                in_html_block: false, // Will be populated after line creation
1820                in_html_comment,
1821                list_item,
1822                heading: None,    // Will be populated in second pass for Setext headings
1823                blockquote: None, // Will be populated after line creation
1824                in_mkdocstrings,
1825                in_esm_block: false, // Will be populated after line creation for MDX files
1826                in_code_span_continuation: false, // Will be populated after code spans are parsed
1827            });
1828        }
1829
1830        lines
1831    }
1832
1833    /// Detect headings and blockquotes (called after HTML block detection)
1834    fn detect_headings_and_blockquotes(
1835        content: &str,
1836        lines: &mut [LineInfo],
1837        flavor: MarkdownFlavor,
1838        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1839    ) {
1840        // Regex for heading detection
1841        static ATX_HEADING_REGEX: LazyLock<regex::Regex> =
1842            LazyLock::new(|| regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap());
1843        static SETEXT_UNDERLINE_REGEX: LazyLock<regex::Regex> =
1844            LazyLock::new(|| regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap());
1845
1846        let content_lines: Vec<&str> = content.lines().collect();
1847
1848        // Detect front matter boundaries to skip those lines
1849        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1850
1851        // Detect headings (including Setext which needs look-ahead) and blockquotes
1852        for i in 0..lines.len() {
1853            if lines[i].in_code_block {
1854                continue;
1855            }
1856
1857            // Skip lines in front matter
1858            if front_matter_end > 0 && i < front_matter_end {
1859                continue;
1860            }
1861
1862            // Skip lines in HTML blocks - HTML content should not be parsed as markdown
1863            if lines[i].in_html_block {
1864                continue;
1865            }
1866
1867            let line = content_lines[i];
1868
1869            // Check for blockquotes (even on blank lines within blockquotes)
1870            if let Some(bq) = parse_blockquote_detailed(line) {
1871                let nesting_level = bq.markers.len(); // Each '>' is one level
1872                let marker_column = bq.indent.len();
1873
1874                // Build the prefix (indentation + markers + space)
1875                let prefix = format!("{}{}{}", bq.indent, bq.markers, bq.spaces_after);
1876
1877                // Check for various blockquote issues
1878                let has_no_space = bq.spaces_after.is_empty() && !bq.content.is_empty();
1879                // Only flag multiple literal spaces, not tabs
1880                // Tabs are handled by MD010 (no-hard-tabs), matching markdownlint behavior
1881                let has_multiple_spaces = bq.spaces_after.chars().filter(|&c| c == ' ').count() > 1;
1882
1883                // Check if needs MD028 fix (empty blockquote line without proper spacing)
1884                // MD028 flags empty blockquote lines that don't have a single space after the marker
1885                // Lines like "> " or ">> " are already correct and don't need fixing
1886                let needs_md028_fix = bq.content.is_empty() && bq.spaces_after.is_empty();
1887
1888                lines[i].blockquote = Some(BlockquoteInfo {
1889                    nesting_level,
1890                    indent: bq.indent.to_string(),
1891                    marker_column,
1892                    prefix,
1893                    content: bq.content.to_string(),
1894                    has_no_space_after_marker: has_no_space,
1895                    has_multiple_spaces_after_marker: has_multiple_spaces,
1896                    needs_md028_fix,
1897                });
1898            }
1899
1900            // Skip heading detection for blank lines
1901            if lines[i].is_blank {
1902                continue;
1903            }
1904
1905            // Check for ATX headings (but skip MkDocs snippet lines)
1906            // In MkDocs flavor, lines like "# -8<- [start:name]" are snippet markers, not headings
1907            let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
1908                crate::utils::mkdocs_snippets::is_snippet_section_start(line)
1909                    || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
1910            } else {
1911                false
1912            };
1913
1914            if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1915                // Skip headings inside HTML comments (using pre-computed ranges for efficiency)
1916                if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset) {
1917                    continue;
1918                }
1919                let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1920                let hashes = caps.get(2).map_or("", |m| m.as_str());
1921                let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1922                let rest = caps.get(4).map_or("", |m| m.as_str());
1923
1924                let level = hashes.len() as u8;
1925                let marker_column = leading_spaces.len();
1926
1927                // Check for closing sequence, but handle custom IDs that might come after
1928                let (text, has_closing, closing_seq) = {
1929                    // First check if there's a custom ID at the end
1930                    let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1931                        // Check if this looks like a valid custom ID (ends with })
1932                        if rest[id_start..].trim_end().ends_with('}') {
1933                            // Split off the custom ID
1934                            (&rest[..id_start], &rest[id_start..])
1935                        } else {
1936                            (rest, "")
1937                        }
1938                    } else {
1939                        (rest, "")
1940                    };
1941
1942                    // Now look for closing hashes in the part before the custom ID
1943                    let trimmed_rest = rest_without_id.trim_end();
1944                    if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1945                        // Look for the start of the hash sequence
1946                        let mut start_of_hashes = last_hash_pos;
1947                        while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1948                            start_of_hashes -= 1;
1949                        }
1950
1951                        // Check if there's at least one space before the closing hashes
1952                        let has_space_before = start_of_hashes == 0
1953                            || trimmed_rest
1954                                .chars()
1955                                .nth(start_of_hashes - 1)
1956                                .is_some_and(|c| c.is_whitespace());
1957
1958                        // Check if this is a valid closing sequence (all hashes to end of trimmed part)
1959                        let potential_closing = &trimmed_rest[start_of_hashes..];
1960                        let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1961
1962                        if is_all_hashes && has_space_before {
1963                            // This is a closing sequence
1964                            let closing_hashes = potential_closing.to_string();
1965                            // The text is everything before the closing hashes
1966                            // Don't include the custom ID here - it will be extracted later
1967                            let text_part = if !custom_id_part.is_empty() {
1968                                // If we have a custom ID, append it back to get the full rest
1969                                // This allows the extract_header_id function to handle it properly
1970                                format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1971                            } else {
1972                                rest_without_id[..start_of_hashes].trim_end().to_string()
1973                            };
1974                            (text_part, true, closing_hashes)
1975                        } else {
1976                            // Not a valid closing sequence, return the full content
1977                            (rest.to_string(), false, String::new())
1978                        }
1979                    } else {
1980                        // No hashes found, return the full content
1981                        (rest.to_string(), false, String::new())
1982                    }
1983                };
1984
1985                let content_column = marker_column + hashes.len() + spaces_after.len();
1986
1987                // Extract custom header ID if present
1988                let raw_text = text.trim().to_string();
1989                let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1990
1991                // If no custom ID was found on the header line, check the next line for standalone attr-list
1992                if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1993                    let next_line = content_lines[i + 1];
1994                    if !lines[i + 1].in_code_block
1995                        && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1996                        && let Some(next_line_id) =
1997                            crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1998                    {
1999                        custom_id = Some(next_line_id);
2000                    }
2001                }
2002
2003                lines[i].heading = Some(HeadingInfo {
2004                    level,
2005                    style: HeadingStyle::ATX,
2006                    marker: hashes.to_string(),
2007                    marker_column,
2008                    content_column,
2009                    text: clean_text,
2010                    custom_id,
2011                    raw_text,
2012                    has_closing_sequence: has_closing,
2013                    closing_sequence: closing_seq,
2014                });
2015            }
2016            // Check for Setext headings (need to look at next line)
2017            else if i + 1 < content_lines.len() && i + 1 < lines.len() {
2018                let next_line = content_lines[i + 1];
2019                if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
2020                    // Skip if next line is front matter delimiter
2021                    if front_matter_end > 0 && i < front_matter_end {
2022                        continue;
2023                    }
2024
2025                    // Skip Setext headings inside HTML comments (using pre-computed ranges for efficiency)
2026                    if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset)
2027                    {
2028                        continue;
2029                    }
2030
2031                    let underline = next_line.trim();
2032
2033                    let level = if underline.starts_with('=') { 1 } else { 2 };
2034                    let style = if level == 1 {
2035                        HeadingStyle::Setext1
2036                    } else {
2037                        HeadingStyle::Setext2
2038                    };
2039
2040                    // Extract custom header ID if present
2041                    let raw_text = line.trim().to_string();
2042                    let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
2043
2044                    // If no custom ID was found on the header line, check the line after underline for standalone attr-list
2045                    if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
2046                        let attr_line = content_lines[i + 2];
2047                        if !lines[i + 2].in_code_block
2048                            && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
2049                            && let Some(attr_line_id) =
2050                                crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
2051                        {
2052                            custom_id = Some(attr_line_id);
2053                        }
2054                    }
2055
2056                    lines[i].heading = Some(HeadingInfo {
2057                        level,
2058                        style,
2059                        marker: underline.to_string(),
2060                        marker_column: next_line.len() - next_line.trim_start().len(),
2061                        content_column: lines[i].indent,
2062                        text: clean_text,
2063                        custom_id,
2064                        raw_text,
2065                        has_closing_sequence: false,
2066                        closing_sequence: String::new(),
2067                    });
2068                }
2069            }
2070        }
2071    }
2072
2073    /// Detect HTML blocks in the content
2074    fn detect_html_blocks(content: &str, lines: &mut [LineInfo]) {
2075        // HTML block elements that trigger block context
2076        const BLOCK_ELEMENTS: &[&str] = &[
2077            "address",
2078            "article",
2079            "aside",
2080            "blockquote",
2081            "details",
2082            "dialog",
2083            "dd",
2084            "div",
2085            "dl",
2086            "dt",
2087            "fieldset",
2088            "figcaption",
2089            "figure",
2090            "footer",
2091            "form",
2092            "h1",
2093            "h2",
2094            "h3",
2095            "h4",
2096            "h5",
2097            "h6",
2098            "header",
2099            "hr",
2100            "li",
2101            "main",
2102            "nav",
2103            "ol",
2104            "p",
2105            "picture",
2106            "pre",
2107            "script",
2108            "section",
2109            "style",
2110            "table",
2111            "tbody",
2112            "td",
2113            "textarea",
2114            "tfoot",
2115            "th",
2116            "thead",
2117            "tr",
2118            "ul",
2119        ];
2120
2121        let mut i = 0;
2122        while i < lines.len() {
2123            // Skip if already in code block or front matter
2124            if lines[i].in_code_block || lines[i].in_front_matter {
2125                i += 1;
2126                continue;
2127            }
2128
2129            let trimmed = lines[i].content(content).trim_start();
2130
2131            // Check if line starts with an HTML tag
2132            if trimmed.starts_with('<') && trimmed.len() > 1 {
2133                // Extract tag name safely
2134                let after_bracket = &trimmed[1..];
2135                let is_closing = after_bracket.starts_with('/');
2136                let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
2137
2138                // Extract tag name (stop at space, >, /, or end of string)
2139                let tag_name = tag_start
2140                    .chars()
2141                    .take_while(|c| c.is_ascii_alphabetic() || *c == '-' || c.is_ascii_digit())
2142                    .collect::<String>()
2143                    .to_lowercase();
2144
2145                // Check if it's a block element
2146                if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
2147                    // Mark this line as in HTML block
2148                    lines[i].in_html_block = true;
2149
2150                    // For simplicity, just mark lines until we find a closing tag or reach a blank line
2151                    // This avoids complex nesting logic that might cause infinite loops
2152                    if !is_closing {
2153                        let closing_tag = format!("</{tag_name}>");
2154                        // style and script tags can contain blank lines (CSS/JS formatting)
2155                        let allow_blank_lines = tag_name == "style" || tag_name == "script";
2156                        let mut j = i + 1;
2157                        while j < lines.len() && j < i + 100 {
2158                            // Limit search to 100 lines
2159                            // Stop at blank lines (except for style/script tags)
2160                            if !allow_blank_lines && lines[j].is_blank {
2161                                break;
2162                            }
2163
2164                            lines[j].in_html_block = true;
2165
2166                            // Check if this line contains the closing tag
2167                            if lines[j].content(content).contains(&closing_tag) {
2168                                break;
2169                            }
2170                            j += 1;
2171                        }
2172                    }
2173                }
2174            }
2175
2176            i += 1;
2177        }
2178    }
2179
2180    /// Detect ESM import/export blocks in MDX files
2181    /// ESM blocks consist of contiguous import/export statements at the top of the file
2182    fn detect_esm_blocks(content: &str, lines: &mut [LineInfo], flavor: MarkdownFlavor) {
2183        // Only process MDX files
2184        if !flavor.supports_esm_blocks() {
2185            return;
2186        }
2187
2188        for line in lines.iter_mut() {
2189            // Skip blank lines and comments at the start
2190            if line.is_blank || line.in_html_comment {
2191                continue;
2192            }
2193
2194            // Check if line starts with import or export
2195            let trimmed = line.content(content).trim_start();
2196            if trimmed.starts_with("import ") || trimmed.starts_with("export ") {
2197                line.in_esm_block = true;
2198            } else {
2199                // Once we hit a non-ESM line, we're done with the ESM block
2200                break;
2201            }
2202        }
2203    }
2204
2205    /// Parse all inline code spans in the content using pulldown-cmark streaming parser
2206    fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
2207        let mut code_spans = Vec::new();
2208
2209        // Quick check - if no backticks, no code spans
2210        if !content.contains('`') {
2211            return code_spans;
2212        }
2213
2214        // Use pulldown-cmark's streaming parser with byte offsets
2215        let parser = Parser::new(content).into_offset_iter();
2216
2217        for (event, range) in parser {
2218            if let Event::Code(_) = event {
2219                let start_pos = range.start;
2220                let end_pos = range.end;
2221
2222                // The range includes the backticks, extract the actual content
2223                let full_span = &content[start_pos..end_pos];
2224                let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
2225
2226                // Extract content between backticks, preserving spaces
2227                let content_start = start_pos + backtick_count;
2228                let content_end = end_pos - backtick_count;
2229                let span_content = if content_start < content_end {
2230                    content[content_start..content_end].to_string()
2231                } else {
2232                    String::new()
2233                };
2234
2235                // Use binary search to find line number - O(log n) instead of O(n)
2236                // Find the rightmost line whose byte_offset <= start_pos
2237                let line_idx = lines
2238                    .partition_point(|line| line.byte_offset <= start_pos)
2239                    .saturating_sub(1);
2240                let line_num = line_idx + 1;
2241                let byte_col_start = start_pos - lines[line_idx].byte_offset;
2242
2243                // Find end column using binary search
2244                let end_line_idx = lines
2245                    .partition_point(|line| line.byte_offset <= end_pos)
2246                    .saturating_sub(1);
2247                let byte_col_end = end_pos - lines[end_line_idx].byte_offset;
2248
2249                // Convert byte offsets to character positions for correct Unicode handling
2250                // This ensures consistency with warning.column which uses character positions
2251                let line_content = lines[line_idx].content(content);
2252                let col_start = if byte_col_start <= line_content.len() {
2253                    line_content[..byte_col_start].chars().count()
2254                } else {
2255                    line_content.chars().count()
2256                };
2257
2258                let end_line_content = lines[end_line_idx].content(content);
2259                let col_end = if byte_col_end <= end_line_content.len() {
2260                    end_line_content[..byte_col_end].chars().count()
2261                } else {
2262                    end_line_content.chars().count()
2263                };
2264
2265                code_spans.push(CodeSpan {
2266                    line: line_num,
2267                    end_line: end_line_idx + 1,
2268                    start_col: col_start,
2269                    end_col: col_end,
2270                    byte_offset: start_pos,
2271                    byte_end: end_pos,
2272                    backtick_count,
2273                    content: span_content,
2274                });
2275            }
2276        }
2277
2278        // Sort by position to ensure consistent ordering
2279        code_spans.sort_by_key(|span| span.byte_offset);
2280
2281        code_spans
2282    }
2283
2284    /// Parse all list blocks in the content (legacy line-by-line approach)
2285    ///
2286    /// Uses a forward-scanning O(n) algorithm that tracks two variables during iteration:
2287    /// - `has_list_breaking_content_since_last_item`: Set when encountering content that
2288    ///   terminates a list (headings, horizontal rules, tables, insufficiently indented content)
2289    /// - `min_continuation_for_tracking`: Minimum indentation required for content to be
2290    ///   treated as list continuation (based on the list marker width)
2291    ///
2292    /// When a new list item is encountered, we check if list-breaking content was seen
2293    /// since the last item. If so, we start a new list block.
2294    fn parse_list_blocks(content: &str, lines: &[LineInfo]) -> Vec<ListBlock> {
2295        // Minimum indentation for unordered list continuation per CommonMark spec
2296        const UNORDERED_LIST_MIN_CONTINUATION_INDENT: usize = 2;
2297
2298        /// Initialize or reset the forward-scanning tracking state.
2299        /// This helper eliminates code duplication across three initialization sites.
2300        #[inline]
2301        fn reset_tracking_state(
2302            list_item: &ListItemInfo,
2303            has_list_breaking_content: &mut bool,
2304            min_continuation: &mut usize,
2305        ) {
2306            *has_list_breaking_content = false;
2307            let marker_width = if list_item.is_ordered {
2308                list_item.marker.len() + 1 // Ordered markers need space after period/paren
2309            } else {
2310                list_item.marker.len()
2311            };
2312            *min_continuation = if list_item.is_ordered {
2313                marker_width
2314            } else {
2315                UNORDERED_LIST_MIN_CONTINUATION_INDENT
2316            };
2317        }
2318
2319        // Pre-size based on lines that could be list items
2320        let mut list_blocks = Vec::with_capacity(lines.len() / 10); // Estimate ~10% of lines might start list blocks
2321        let mut current_block: Option<ListBlock> = None;
2322        let mut last_list_item_line = 0;
2323        let mut current_indent_level = 0;
2324        let mut last_marker_width = 0;
2325
2326        // Track list-breaking content since last item (fixes O(n²) bottleneck from issue #148)
2327        let mut has_list_breaking_content_since_last_item = false;
2328        let mut min_continuation_for_tracking = 0;
2329
2330        for (line_idx, line_info) in lines.iter().enumerate() {
2331            let line_num = line_idx + 1;
2332
2333            // Enhanced code block handling using Design #3's context analysis
2334            if line_info.in_code_block {
2335                if let Some(ref mut block) = current_block {
2336                    // Calculate minimum indentation for list continuation
2337                    let min_continuation_indent =
2338                        CodeBlockUtils::calculate_min_continuation_indent(content, lines, line_idx);
2339
2340                    // Analyze code block context using the three-tier classification
2341                    let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
2342
2343                    match context {
2344                        CodeBlockContext::Indented => {
2345                            // Code block is properly indented - continues the list
2346                            block.end_line = line_num;
2347                            continue;
2348                        }
2349                        CodeBlockContext::Standalone => {
2350                            // Code block separates lists - end current block
2351                            let completed_block = current_block.take().unwrap();
2352                            list_blocks.push(completed_block);
2353                            continue;
2354                        }
2355                        CodeBlockContext::Adjacent => {
2356                            // Edge case - use conservative behavior (continue list)
2357                            block.end_line = line_num;
2358                            continue;
2359                        }
2360                    }
2361                } else {
2362                    // No current list block - skip code block lines
2363                    continue;
2364                }
2365            }
2366
2367            // Extract blockquote prefix if any
2368            let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(line_info.content(content)) {
2369                caps.get(0).unwrap().as_str().to_string()
2370            } else {
2371                String::new()
2372            };
2373
2374            // Track list-breaking content for non-list, non-blank lines (O(n) replacement for nested loop)
2375            // Skip lines that are continuations of multi-line code spans - they're part of the previous list item
2376            if current_block.is_some()
2377                && line_info.list_item.is_none()
2378                && !line_info.is_blank
2379                && !line_info.in_code_span_continuation
2380            {
2381                let line_content = line_info.content(content).trim();
2382
2383                // Count pipes outside of inline code spans (to avoid confusing `||` for table)
2384                let pipes_outside_code = {
2385                    let mut count = 0;
2386                    let mut in_code = false;
2387                    for ch in line_content.chars() {
2388                        if ch == '`' {
2389                            in_code = !in_code;
2390                        } else if ch == '|' && !in_code {
2391                            count += 1;
2392                        }
2393                    }
2394                    count
2395                };
2396
2397                // Check for structural separators that break lists
2398                let breaks_list = line_info.heading.is_some()
2399                    || line_content.starts_with("---")
2400                    || line_content.starts_with("***")
2401                    || line_content.starts_with("___")
2402                    || (pipes_outside_code > 0
2403                        && !line_content.contains("](")
2404                        && !line_content.contains("http")
2405                        && (pipes_outside_code > 1 || line_content.starts_with('|') || line_content.ends_with('|')))
2406                    || line_content.starts_with(">")
2407                    || (line_info.indent < min_continuation_for_tracking);
2408
2409                if breaks_list {
2410                    has_list_breaking_content_since_last_item = true;
2411                }
2412            }
2413
2414            // If this line is a code span continuation within an active list block,
2415            // extend the block's end_line to include this line (maintains list continuity)
2416            if line_info.in_code_span_continuation
2417                && line_info.list_item.is_none()
2418                && let Some(ref mut block) = current_block
2419            {
2420                block.end_line = line_num;
2421            }
2422
2423            // Extend block.end_line for regular continuation lines (non-list-item, non-blank,
2424            // properly indented lines within the list). This ensures the workaround at line 2448
2425            // works correctly when there are multiple continuation lines before a nested list item.
2426            if !line_info.in_code_span_continuation
2427                && line_info.list_item.is_none()
2428                && !line_info.is_blank
2429                && !line_info.in_code_block
2430                && line_info.indent >= min_continuation_for_tracking
2431                && let Some(ref mut block) = current_block
2432            {
2433                block.end_line = line_num;
2434            }
2435
2436            // Check if this line is a list item
2437            if let Some(list_item) = &line_info.list_item {
2438                // Calculate nesting level based on indentation
2439                let item_indent = list_item.marker_column;
2440                let nesting = item_indent / 2; // Assume 2-space indentation for nesting
2441
2442                if let Some(ref mut block) = current_block {
2443                    // Check if this continues the current block
2444                    // For nested lists, we need to check if this is a nested item (higher nesting level)
2445                    // or a continuation at the same or lower level
2446                    let is_nested = nesting > block.nesting_level;
2447                    let same_type =
2448                        (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
2449                    let same_context = block.blockquote_prefix == blockquote_prefix;
2450                    // Allow one blank line after last item, or lines immediately after block content
2451                    let reasonable_distance = line_num <= last_list_item_line + 2 || line_num == block.end_line + 1;
2452
2453                    // For unordered lists, also check marker consistency
2454                    let marker_compatible =
2455                        block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
2456
2457                    // O(1) check: Use the tracked variable instead of O(n) nested loop
2458                    // This eliminates the quadratic bottleneck from issue #148
2459                    let has_non_list_content = has_list_breaking_content_since_last_item;
2460
2461                    // A list continues if:
2462                    // 1. It's a nested item (indented more than the parent), OR
2463                    // 2. It's the same type at the same level with reasonable distance
2464                    let mut continues_list = if is_nested {
2465                        // Nested items always continue the list if they're in the same context
2466                        same_context && reasonable_distance && !has_non_list_content
2467                    } else {
2468                        // Same-level items need to match type and markers
2469                        same_type && same_context && reasonable_distance && marker_compatible && !has_non_list_content
2470                    };
2471
2472                    // WORKAROUND: If items are truly consecutive (no blank lines), they MUST be in the same list
2473                    // This handles edge cases where content patterns might otherwise split lists incorrectly
2474                    if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
2475                        // Check if the previous line was a list item
2476                        if block.item_lines.contains(&(line_num - 1)) {
2477                            // They're consecutive list items - force them to be in the same list
2478                            continues_list = true;
2479                        }
2480                    }
2481
2482                    if continues_list {
2483                        // Extend current block
2484                        block.end_line = line_num;
2485                        block.item_lines.push(line_num);
2486
2487                        // Update max marker width
2488                        block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
2489                            list_item.marker.len() + 1
2490                        } else {
2491                            list_item.marker.len()
2492                        });
2493
2494                        // Update marker consistency for unordered lists
2495                        if !block.is_ordered
2496                            && block.marker.is_some()
2497                            && block.marker.as_ref() != Some(&list_item.marker)
2498                        {
2499                            // Mixed markers, clear the marker field
2500                            block.marker = None;
2501                        }
2502
2503                        // Reset tracked state for issue #148 optimization
2504                        reset_tracking_state(
2505                            list_item,
2506                            &mut has_list_breaking_content_since_last_item,
2507                            &mut min_continuation_for_tracking,
2508                        );
2509                    } else {
2510                        // End current block and start a new one
2511
2512                        list_blocks.push(block.clone());
2513
2514                        *block = ListBlock {
2515                            start_line: line_num,
2516                            end_line: line_num,
2517                            is_ordered: list_item.is_ordered,
2518                            marker: if list_item.is_ordered {
2519                                None
2520                            } else {
2521                                Some(list_item.marker.clone())
2522                            },
2523                            blockquote_prefix: blockquote_prefix.clone(),
2524                            item_lines: vec![line_num],
2525                            nesting_level: nesting,
2526                            max_marker_width: if list_item.is_ordered {
2527                                list_item.marker.len() + 1
2528                            } else {
2529                                list_item.marker.len()
2530                            },
2531                        };
2532
2533                        // Initialize tracked state for new block (issue #148 optimization)
2534                        reset_tracking_state(
2535                            list_item,
2536                            &mut has_list_breaking_content_since_last_item,
2537                            &mut min_continuation_for_tracking,
2538                        );
2539                    }
2540                } else {
2541                    // Start a new block
2542                    current_block = Some(ListBlock {
2543                        start_line: line_num,
2544                        end_line: line_num,
2545                        is_ordered: list_item.is_ordered,
2546                        marker: if list_item.is_ordered {
2547                            None
2548                        } else {
2549                            Some(list_item.marker.clone())
2550                        },
2551                        blockquote_prefix,
2552                        item_lines: vec![line_num],
2553                        nesting_level: nesting,
2554                        max_marker_width: list_item.marker.len(),
2555                    });
2556
2557                    // Initialize tracked state for new block (issue #148 optimization)
2558                    reset_tracking_state(
2559                        list_item,
2560                        &mut has_list_breaking_content_since_last_item,
2561                        &mut min_continuation_for_tracking,
2562                    );
2563                }
2564
2565                last_list_item_line = line_num;
2566                current_indent_level = item_indent;
2567                last_marker_width = if list_item.is_ordered {
2568                    list_item.marker.len() + 1 // Add 1 for the space after ordered list markers
2569                } else {
2570                    list_item.marker.len()
2571                };
2572            } else if let Some(ref mut block) = current_block {
2573                // Not a list item - check if it continues the current block
2574
2575                // For MD032 compatibility, we use a simple approach:
2576                // - Indented lines continue the list
2577                // - Blank lines followed by indented content continue the list
2578                // - Everything else ends the list
2579
2580                // Check if the last line in the list block ended with a backslash (hard line break)
2581                // This handles cases where list items use backslash for hard line breaks
2582                let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
2583                    lines[block.end_line - 1].content(content).trim_end().ends_with('\\')
2584                } else {
2585                    false
2586                };
2587
2588                // Calculate minimum indentation for list continuation
2589                // For ordered lists, use the last marker width (e.g., 3 for "1. ", 4 for "10. ")
2590                // For unordered lists like "- ", content starts at column 2, so continuations need at least 2 spaces
2591                let min_continuation_indent = if block.is_ordered {
2592                    current_indent_level + last_marker_width
2593                } else {
2594                    current_indent_level + 2 // Unordered lists need at least 2 spaces (e.g., "- " = 2 chars)
2595                };
2596
2597                if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
2598                    // Indented line or backslash continuation continues the list
2599                    block.end_line = line_num;
2600                } else if line_info.is_blank {
2601                    // Blank line - check if it's internal to the list or ending it
2602                    // We only include blank lines that are followed by more list content
2603                    let mut check_idx = line_idx + 1;
2604                    let mut found_continuation = false;
2605
2606                    // Skip additional blank lines
2607                    while check_idx < lines.len() && lines[check_idx].is_blank {
2608                        check_idx += 1;
2609                    }
2610
2611                    if check_idx < lines.len() {
2612                        let next_line = &lines[check_idx];
2613                        // Check if followed by indented content (list continuation)
2614                        if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
2615                            found_continuation = true;
2616                        }
2617                        // Check if followed by another list item at the same level
2618                        else if !next_line.in_code_block
2619                            && next_line.list_item.is_some()
2620                            && let Some(item) = &next_line.list_item
2621                        {
2622                            let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
2623                                .find(next_line.content(content))
2624                                .map_or(String::new(), |m| m.as_str().to_string());
2625                            if item.marker_column == current_indent_level
2626                                && item.is_ordered == block.is_ordered
2627                                && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
2628                            {
2629                                // Check if there was meaningful content between the list items (unused now)
2630                                // This variable is kept for potential future use but is currently replaced by has_structural_separators
2631                                let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
2632                                    if let Some(between_line) = lines.get(idx) {
2633                                        let between_content = between_line.content(content);
2634                                        let trimmed = between_content.trim();
2635                                        // Skip empty lines
2636                                        if trimmed.is_empty() {
2637                                            return false;
2638                                        }
2639                                        // Check for meaningful content
2640                                        let line_indent = between_content.len() - between_content.trim_start().len();
2641
2642                                        // Structural separators (code fences, headings, etc.) are meaningful and should BREAK lists
2643                                        if trimmed.starts_with("```")
2644                                            || trimmed.starts_with("~~~")
2645                                            || trimmed.starts_with("---")
2646                                            || trimmed.starts_with("***")
2647                                            || trimmed.starts_with("___")
2648                                            || trimmed.starts_with(">")
2649                                            || trimmed.contains('|') // Tables
2650                                            || between_line.heading.is_some()
2651                                        {
2652                                            return true; // These are structural separators - meaningful content that breaks lists
2653                                        }
2654
2655                                        // Only properly indented content continues the list
2656                                        line_indent >= min_continuation_indent
2657                                    } else {
2658                                        false
2659                                    }
2660                                });
2661
2662                                if block.is_ordered {
2663                                    // For ordered lists: don't continue if there are structural separators
2664                                    // Check if there are structural separators between the list items
2665                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2666                                        if let Some(between_line) = lines.get(idx) {
2667                                            let trimmed = between_line.content(content).trim();
2668                                            if trimmed.is_empty() {
2669                                                return false;
2670                                            }
2671                                            // Check for structural separators that break lists
2672                                            trimmed.starts_with("```")
2673                                                || trimmed.starts_with("~~~")
2674                                                || trimmed.starts_with("---")
2675                                                || trimmed.starts_with("***")
2676                                                || trimmed.starts_with("___")
2677                                                || trimmed.starts_with(">")
2678                                                || trimmed.contains('|') // Tables
2679                                                || between_line.heading.is_some()
2680                                        } else {
2681                                            false
2682                                        }
2683                                    });
2684                                    found_continuation = !has_structural_separators;
2685                                } else {
2686                                    // For unordered lists: also check for structural separators
2687                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2688                                        if let Some(between_line) = lines.get(idx) {
2689                                            let trimmed = between_line.content(content).trim();
2690                                            if trimmed.is_empty() {
2691                                                return false;
2692                                            }
2693                                            // Check for structural separators that break lists
2694                                            trimmed.starts_with("```")
2695                                                || trimmed.starts_with("~~~")
2696                                                || trimmed.starts_with("---")
2697                                                || trimmed.starts_with("***")
2698                                                || trimmed.starts_with("___")
2699                                                || trimmed.starts_with(">")
2700                                                || trimmed.contains('|') // Tables
2701                                                || between_line.heading.is_some()
2702                                        } else {
2703                                            false
2704                                        }
2705                                    });
2706                                    found_continuation = !has_structural_separators;
2707                                }
2708                            }
2709                        }
2710                    }
2711
2712                    if found_continuation {
2713                        // Include the blank line in the block
2714                        block.end_line = line_num;
2715                    } else {
2716                        // Blank line ends the list - don't include it
2717                        list_blocks.push(block.clone());
2718                        current_block = None;
2719                    }
2720                } else {
2721                    // Check for lazy continuation - non-indented line immediately after a list item
2722                    // But only if the line has sufficient indentation for the list type
2723                    let min_required_indent = if block.is_ordered {
2724                        current_indent_level + last_marker_width
2725                    } else {
2726                        current_indent_level + 2
2727                    };
2728
2729                    // For lazy continuation to apply, the line must either:
2730                    // 1. Have no indentation (true lazy continuation)
2731                    // 2. Have sufficient indentation for the list type
2732                    // BUT structural separators (headings, code blocks, etc.) should never be lazy continuations
2733                    let line_content = line_info.content(content).trim();
2734                    let is_structural_separator = line_info.heading.is_some()
2735                        || line_content.starts_with("```")
2736                        || line_content.starts_with("~~~")
2737                        || line_content.starts_with("---")
2738                        || line_content.starts_with("***")
2739                        || line_content.starts_with("___")
2740                        || line_content.starts_with(">")
2741                        || (line_content.contains('|')
2742                            && !line_content.contains("](")
2743                            && !line_content.contains("http")
2744                            && (line_content.matches('|').count() > 1
2745                                || line_content.starts_with('|')
2746                                || line_content.ends_with('|'))); // Tables
2747
2748                    // Allow lazy continuation if we're still within the same list block
2749                    // (not just immediately after a list item)
2750                    let is_lazy_continuation = !is_structural_separator
2751                        && !line_info.is_blank
2752                        && (line_info.indent == 0 || line_info.indent >= min_required_indent);
2753
2754                    if is_lazy_continuation {
2755                        // Additional check: if the line starts with uppercase and looks like a new sentence,
2756                        // it's probably not a continuation
2757                        let content_to_check = if !blockquote_prefix.is_empty() {
2758                            // Strip blockquote prefix to check the actual content
2759                            line_info
2760                                .content(content)
2761                                .strip_prefix(&blockquote_prefix)
2762                                .unwrap_or(line_info.content(content))
2763                                .trim()
2764                        } else {
2765                            line_info.content(content).trim()
2766                        };
2767
2768                        let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
2769
2770                        // If it starts with uppercase and the previous line ended with punctuation,
2771                        // it's likely a new paragraph, not a continuation
2772                        if starts_with_uppercase && last_list_item_line > 0 {
2773                            // This looks like a new paragraph
2774                            list_blocks.push(block.clone());
2775                            current_block = None;
2776                        } else {
2777                            // This is a lazy continuation line
2778                            block.end_line = line_num;
2779                        }
2780                    } else {
2781                        // Non-indented, non-blank line that's not a lazy continuation - end the block
2782                        list_blocks.push(block.clone());
2783                        current_block = None;
2784                    }
2785                }
2786            }
2787        }
2788
2789        // Don't forget the last block
2790        if let Some(block) = current_block {
2791            list_blocks.push(block);
2792        }
2793
2794        // Merge adjacent blocks that should be one
2795        merge_adjacent_list_blocks(content, &mut list_blocks, lines);
2796
2797        list_blocks
2798    }
2799
2800    /// Compute character frequency for fast content analysis
2801    fn compute_char_frequency(content: &str) -> CharFrequency {
2802        let mut frequency = CharFrequency::default();
2803
2804        for ch in content.chars() {
2805            match ch {
2806                '#' => frequency.hash_count += 1,
2807                '*' => frequency.asterisk_count += 1,
2808                '_' => frequency.underscore_count += 1,
2809                '-' => frequency.hyphen_count += 1,
2810                '+' => frequency.plus_count += 1,
2811                '>' => frequency.gt_count += 1,
2812                '|' => frequency.pipe_count += 1,
2813                '[' => frequency.bracket_count += 1,
2814                '`' => frequency.backtick_count += 1,
2815                '<' => frequency.lt_count += 1,
2816                '!' => frequency.exclamation_count += 1,
2817                '\n' => frequency.newline_count += 1,
2818                _ => {}
2819            }
2820        }
2821
2822        frequency
2823    }
2824
2825    /// Parse HTML tags in the content
2826    fn parse_html_tags(
2827        content: &str,
2828        lines: &[LineInfo],
2829        code_blocks: &[(usize, usize)],
2830        flavor: MarkdownFlavor,
2831    ) -> Vec<HtmlTag> {
2832        static HTML_TAG_REGEX: LazyLock<regex::Regex> =
2833            LazyLock::new(|| regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap());
2834
2835        let mut html_tags = Vec::with_capacity(content.matches('<').count());
2836
2837        for cap in HTML_TAG_REGEX.captures_iter(content) {
2838            let full_match = cap.get(0).unwrap();
2839            let match_start = full_match.start();
2840            let match_end = full_match.end();
2841
2842            // Skip if in code block
2843            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2844                continue;
2845            }
2846
2847            let is_closing = !cap.get(1).unwrap().as_str().is_empty();
2848            let tag_name_original = cap.get(2).unwrap().as_str();
2849            let tag_name = tag_name_original.to_lowercase();
2850            let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
2851
2852            // Skip JSX components in MDX files (tags starting with uppercase letter)
2853            // JSX components like <Chart />, <MyComponent> should not be treated as HTML
2854            if flavor.supports_jsx() && tag_name_original.chars().next().is_some_and(|c| c.is_uppercase()) {
2855                continue;
2856            }
2857
2858            // Find which line this tag is on
2859            let mut line_num = 1;
2860            let mut col_start = match_start;
2861            let mut col_end = match_end;
2862            for (idx, line_info) in lines.iter().enumerate() {
2863                if match_start >= line_info.byte_offset {
2864                    line_num = idx + 1;
2865                    col_start = match_start - line_info.byte_offset;
2866                    col_end = match_end - line_info.byte_offset;
2867                } else {
2868                    break;
2869                }
2870            }
2871
2872            html_tags.push(HtmlTag {
2873                line: line_num,
2874                start_col: col_start,
2875                end_col: col_end,
2876                byte_offset: match_start,
2877                byte_end: match_end,
2878                tag_name,
2879                is_closing,
2880                is_self_closing,
2881                raw_content: full_match.as_str().to_string(),
2882            });
2883        }
2884
2885        html_tags
2886    }
2887
2888    /// Parse emphasis spans in the content
2889    fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
2890        static EMPHASIS_REGEX: LazyLock<regex::Regex> =
2891            LazyLock::new(|| regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap());
2892
2893        let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
2894
2895        for cap in EMPHASIS_REGEX.captures_iter(content) {
2896            let full_match = cap.get(0).unwrap();
2897            let match_start = full_match.start();
2898            let match_end = full_match.end();
2899
2900            // Skip if in code block
2901            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2902                continue;
2903            }
2904
2905            let opening_markers = cap.get(1).unwrap().as_str();
2906            let content_part = cap.get(2).unwrap().as_str();
2907            let closing_markers = cap.get(3).unwrap().as_str();
2908
2909            // Validate matching markers
2910            if opening_markers.chars().next() != closing_markers.chars().next()
2911                || opening_markers.len() != closing_markers.len()
2912            {
2913                continue;
2914            }
2915
2916            let marker = opening_markers.chars().next().unwrap();
2917            let marker_count = opening_markers.len();
2918
2919            // Find which line this emphasis is on
2920            let mut line_num = 1;
2921            let mut col_start = match_start;
2922            let mut col_end = match_end;
2923            for (idx, line_info) in lines.iter().enumerate() {
2924                if match_start >= line_info.byte_offset {
2925                    line_num = idx + 1;
2926                    col_start = match_start - line_info.byte_offset;
2927                    col_end = match_end - line_info.byte_offset;
2928                } else {
2929                    break;
2930                }
2931            }
2932
2933            emphasis_spans.push(EmphasisSpan {
2934                line: line_num,
2935                start_col: col_start,
2936                end_col: col_end,
2937                byte_offset: match_start,
2938                byte_end: match_end,
2939                marker,
2940                marker_count,
2941                content: content_part.to_string(),
2942            });
2943        }
2944
2945        emphasis_spans
2946    }
2947
2948    /// Parse table rows in the content
2949    fn parse_table_rows(content: &str, lines: &[LineInfo]) -> Vec<TableRow> {
2950        let mut table_rows = Vec::with_capacity(lines.len() / 20);
2951
2952        for (line_idx, line_info) in lines.iter().enumerate() {
2953            // Skip lines in code blocks or blank lines
2954            if line_info.in_code_block || line_info.is_blank {
2955                continue;
2956            }
2957
2958            let line = line_info.content(content);
2959            let line_num = line_idx + 1;
2960
2961            // Check if this line contains pipes (potential table row)
2962            if !line.contains('|') {
2963                continue;
2964            }
2965
2966            // Count columns by splitting on pipes
2967            let parts: Vec<&str> = line.split('|').collect();
2968            let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
2969
2970            // Check if this is a separator row
2971            let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
2972            let mut column_alignments = Vec::new();
2973
2974            if is_separator {
2975                for part in &parts[1..parts.len() - 1] {
2976                    // Skip first and last empty parts
2977                    let trimmed = part.trim();
2978                    let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
2979                        "center".to_string()
2980                    } else if trimmed.ends_with(':') {
2981                        "right".to_string()
2982                    } else if trimmed.starts_with(':') {
2983                        "left".to_string()
2984                    } else {
2985                        "none".to_string()
2986                    };
2987                    column_alignments.push(alignment);
2988                }
2989            }
2990
2991            table_rows.push(TableRow {
2992                line: line_num,
2993                is_separator,
2994                column_count,
2995                column_alignments,
2996            });
2997        }
2998
2999        table_rows
3000    }
3001
3002    /// Parse bare URLs and emails in the content
3003    fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
3004        let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
3005
3006        // Check for bare URLs (not in angle brackets or markdown links)
3007        for cap in BARE_URL_PATTERN.captures_iter(content) {
3008            let full_match = cap.get(0).unwrap();
3009            let match_start = full_match.start();
3010            let match_end = full_match.end();
3011
3012            // Skip if in code block
3013            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3014                continue;
3015            }
3016
3017            // Skip if already in angle brackets or markdown links
3018            let preceding_char = if match_start > 0 {
3019                content.chars().nth(match_start - 1)
3020            } else {
3021                None
3022            };
3023            let following_char = content.chars().nth(match_end);
3024
3025            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
3026                continue;
3027            }
3028            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
3029                continue;
3030            }
3031
3032            let url = full_match.as_str();
3033            let url_type = if url.starts_with("https://") {
3034                "https"
3035            } else if url.starts_with("http://") {
3036                "http"
3037            } else if url.starts_with("ftp://") {
3038                "ftp"
3039            } else {
3040                "other"
3041            };
3042
3043            // Find which line this URL is on
3044            let mut line_num = 1;
3045            let mut col_start = match_start;
3046            let mut col_end = match_end;
3047            for (idx, line_info) in lines.iter().enumerate() {
3048                if match_start >= line_info.byte_offset {
3049                    line_num = idx + 1;
3050                    col_start = match_start - line_info.byte_offset;
3051                    col_end = match_end - line_info.byte_offset;
3052                } else {
3053                    break;
3054                }
3055            }
3056
3057            bare_urls.push(BareUrl {
3058                line: line_num,
3059                start_col: col_start,
3060                end_col: col_end,
3061                byte_offset: match_start,
3062                byte_end: match_end,
3063                url: url.to_string(),
3064                url_type: url_type.to_string(),
3065            });
3066        }
3067
3068        // Check for bare email addresses
3069        for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
3070            let full_match = cap.get(0).unwrap();
3071            let match_start = full_match.start();
3072            let match_end = full_match.end();
3073
3074            // Skip if in code block
3075            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3076                continue;
3077            }
3078
3079            // Skip if already in angle brackets or markdown links
3080            let preceding_char = if match_start > 0 {
3081                content.chars().nth(match_start - 1)
3082            } else {
3083                None
3084            };
3085            let following_char = content.chars().nth(match_end);
3086
3087            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
3088                continue;
3089            }
3090            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
3091                continue;
3092            }
3093
3094            let email = full_match.as_str();
3095
3096            // Find which line this email is on
3097            let mut line_num = 1;
3098            let mut col_start = match_start;
3099            let mut col_end = match_end;
3100            for (idx, line_info) in lines.iter().enumerate() {
3101                if match_start >= line_info.byte_offset {
3102                    line_num = idx + 1;
3103                    col_start = match_start - line_info.byte_offset;
3104                    col_end = match_end - line_info.byte_offset;
3105                } else {
3106                    break;
3107                }
3108            }
3109
3110            bare_urls.push(BareUrl {
3111                line: line_num,
3112                start_col: col_start,
3113                end_col: col_end,
3114                byte_offset: match_start,
3115                byte_end: match_end,
3116                url: email.to_string(),
3117                url_type: "email".to_string(),
3118            });
3119        }
3120
3121        bare_urls
3122    }
3123}
3124
3125/// Merge adjacent list blocks that should be treated as one
3126fn merge_adjacent_list_blocks(content: &str, list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
3127    if list_blocks.len() < 2 {
3128        return;
3129    }
3130
3131    let mut merger = ListBlockMerger::new(content, lines);
3132    *list_blocks = merger.merge(list_blocks);
3133}
3134
/// Helper struct to manage the complex logic of merging list blocks
///
/// Borrows the document text and its per-line metadata for the duration of a
/// merge pass; it owns no data of its own.
struct ListBlockMerger<'a> {
    /// Full document text; passed to `LineInfo::content` to obtain a line's text.
    content: &'a str,
    /// Per-line metadata, indexed by `line_number - 1`.
    lines: &'a [LineInfo],
}
3140
3141impl<'a> ListBlockMerger<'a> {
3142    fn new(content: &'a str, lines: &'a [LineInfo]) -> Self {
3143        Self { content, lines }
3144    }
3145
3146    fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
3147        let mut merged = Vec::with_capacity(list_blocks.len());
3148        let mut current = list_blocks[0].clone();
3149
3150        for next in list_blocks.iter().skip(1) {
3151            if self.should_merge_blocks(&current, next) {
3152                current = self.merge_two_blocks(current, next);
3153            } else {
3154                merged.push(current);
3155                current = next.clone();
3156            }
3157        }
3158
3159        merged.push(current);
3160        merged
3161    }
3162
3163    /// Determine if two adjacent list blocks should be merged
3164    fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
3165        // Basic compatibility checks
3166        if !self.blocks_are_compatible(current, next) {
3167            return false;
3168        }
3169
3170        // Check spacing and content between blocks
3171        let spacing = self.analyze_spacing_between(current, next);
3172        match spacing {
3173            BlockSpacing::Consecutive => true,
3174            BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
3175            BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
3176                self.can_merge_with_content_between(current, next)
3177            }
3178        }
3179    }
3180
3181    /// Check if blocks have compatible structure for merging
3182    fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
3183        current.is_ordered == next.is_ordered
3184            && current.blockquote_prefix == next.blockquote_prefix
3185            && current.nesting_level == next.nesting_level
3186    }
3187
3188    /// Analyze the spacing between two list blocks
3189    fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
3190        let gap = next.start_line - current.end_line;
3191
3192        match gap {
3193            1 => BlockSpacing::Consecutive,
3194            2 => BlockSpacing::SingleBlank,
3195            _ if gap > 2 => {
3196                if self.has_only_blank_lines_between(current, next) {
3197                    BlockSpacing::MultipleBlanks
3198                } else {
3199                    BlockSpacing::ContentBetween
3200                }
3201            }
3202            _ => BlockSpacing::Consecutive, // gap == 0, overlapping (shouldn't happen)
3203        }
3204    }
3205
3206    /// Check if unordered lists can be merged with a single blank line between
3207    fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3208        // Check if there are structural separators between the blocks
3209        // If has_meaningful_content_between returns true, it means there are structural separators
3210        if has_meaningful_content_between(self.content, current, next, self.lines) {
3211            return false; // Structural separators prevent merging
3212        }
3213
3214        // Only merge unordered lists with same marker across single blank
3215        !current.is_ordered && current.marker == next.marker
3216    }
3217
3218    /// Check if ordered lists can be merged when there's content between them
3219    fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3220        // Do not merge lists if there are structural separators between them
3221        if has_meaningful_content_between(self.content, current, next, self.lines) {
3222            return false; // Structural separators prevent merging
3223        }
3224
3225        // Only consider merging ordered lists if there's no structural content between
3226        current.is_ordered && next.is_ordered
3227    }
3228
3229    /// Check if there are only blank lines between blocks
3230    fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3231        for line_num in (current.end_line + 1)..next.start_line {
3232            if let Some(line_info) = self.lines.get(line_num - 1)
3233                && !line_info.content(self.content).trim().is_empty()
3234            {
3235                return false;
3236            }
3237        }
3238        true
3239    }
3240
3241    /// Merge two compatible list blocks into one
3242    fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
3243        current.end_line = next.end_line;
3244        current.item_lines.extend_from_slice(&next.item_lines);
3245
3246        // Update max marker width
3247        current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
3248
3249        // Handle marker consistency for unordered lists
3250        if !current.is_ordered && self.markers_differ(&current, next) {
3251            current.marker = None; // Mixed markers
3252        }
3253
3254        current
3255    }
3256
3257    /// Check if two blocks have different markers
3258    fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
3259        current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
3260    }
3261}
3262
/// Types of spacing between list blocks
///
/// Produced by `ListBlockMerger::analyze_spacing_between` from the line-number
/// distance between the end of one block and the start of the next.
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    /// No gap between blocks: the next block starts on the line right after the
    /// current one ends.
    Consecutive,
    /// Exactly one line lies between the blocks (expected to be blank; the
    /// classifier does not inspect it).
    SingleBlank,
    /// More than one line lies between the blocks and all of them are blank.
    MultipleBlanks,
    /// More than one line lies between the blocks and at least one is non-blank.
    ContentBetween,
}
3271
3272/// Check if there's meaningful content (not just blank lines) between two list blocks
3273fn has_meaningful_content_between(content: &str, current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
3274    // Check lines between current.end_line and next.start_line
3275    for line_num in (current.end_line + 1)..next.start_line {
3276        if let Some(line_info) = lines.get(line_num - 1) {
3277            // Convert to 0-indexed
3278            let trimmed = line_info.content(content).trim();
3279
3280            // Skip empty lines
3281            if trimmed.is_empty() {
3282                continue;
3283            }
3284
3285            // Check for structural separators that should separate lists (CommonMark compliant)
3286
3287            // Headings separate lists
3288            if line_info.heading.is_some() {
3289                return true; // Has meaningful content - headings separate lists
3290            }
3291
3292            // Horizontal rules separate lists (---, ***, ___)
3293            if is_horizontal_rule(trimmed) {
3294                return true; // Has meaningful content - horizontal rules separate lists
3295            }
3296
3297            // Tables separate lists (lines containing | but not in URLs or code)
3298            // Simple heuristic: tables typically have | at start/end or multiple |
3299            if trimmed.contains('|') && trimmed.len() > 1 {
3300                // Don't treat URLs with | as tables
3301                if !trimmed.contains("](") && !trimmed.contains("http") {
3302                    // More robust check: tables usually have multiple | or | at edges
3303                    let pipe_count = trimmed.matches('|').count();
3304                    if pipe_count > 1 || trimmed.starts_with('|') || trimmed.ends_with('|') {
3305                        return true; // Has meaningful content - tables separate lists
3306                    }
3307                }
3308            }
3309
3310            // Blockquotes separate lists
3311            if trimmed.starts_with('>') {
3312                return true; // Has meaningful content - blockquotes separate lists
3313            }
3314
3315            // Code block fences separate lists (unless properly indented as list content)
3316            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
3317                let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
3318
3319                // Check if this code block is properly indented as list continuation
3320                let min_continuation_indent = if current.is_ordered {
3321                    current.nesting_level + current.max_marker_width + 1 // +1 for space after marker
3322                } else {
3323                    current.nesting_level + 2
3324                };
3325
3326                if line_indent < min_continuation_indent {
3327                    // This is a standalone code block that separates lists
3328                    return true; // Has meaningful content - standalone code blocks separate lists
3329                }
3330            }
3331
3332            // Check if this line has proper indentation for list continuation
3333            let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
3334
3335            // Calculate minimum indentation needed to be list continuation
3336            let min_indent = if current.is_ordered {
3337                current.nesting_level + current.max_marker_width
3338            } else {
3339                current.nesting_level + 2
3340            };
3341
3342            // If the line is not indented enough to be list continuation, it's meaningful content
3343            if line_indent < min_indent {
3344                return true; // Has meaningful content - content not indented as list continuation
3345            }
3346
3347            // If we reach here, the line is properly indented as list continuation
3348            // Continue checking other lines
3349        }
3350    }
3351
3352    // Only blank lines or properly indented list continuation content between blocks
3353    false
3354}
3355
/// Decide whether an already-trimmed line is a horizontal rule.
///
/// The line must begin with `-`, `*` or `_`, contain at least three occurrences of
/// that same character, and otherwise consist only of spaces and tabs
/// (e.g. `---`, `* * *`, `_____`).
fn is_horizontal_rule(trimmed: &str) -> bool {
    // The first character picks the rule marker; anything else disqualifies the line.
    let marker = match trimmed.chars().next() {
        Some(c @ ('-' | '*' | '_')) => c,
        _ => return false,
    };

    let only_marker_and_blanks = trimmed.chars().all(|c| c == marker || c == ' ' || c == '\t');

    only_marker_and_blanks && trimmed.chars().filter(|&c| c == marker).count() >= 3
}
3379
/// Unit tests for `LintContext` construction: line offsets, per-line info,
/// list item detection, and MDX ESM block detection.
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input yields a single line offset and no `LineInfo` entries.
    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::Standard, None);
        assert_eq!(ctx.content, "");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.lines.len(), 0);
    }

    /// A single line without a trailing newline; line/column results are 1-based.
    #[test]
    fn test_single_line() {
        let ctx = LintContext::new("# Hello", MarkdownFlavor::Standard, None);
        assert_eq!(ctx.content, "# Hello");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(3), (1, 4));
    }

    /// Line offsets point at the first byte of each line, including blank lines.
    #[test]
    fn test_multi_line() {
        let content = "# Title\n\nSecond line\nThird line";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);
        // Test offset to line/col
        assert_eq!(ctx.offset_to_line_col(0), (1, 1)); // start
        assert_eq!(ctx.offset_to_line_col(8), (2, 1)); // start of blank line
        assert_eq!(ctx.offset_to_line_col(9), (3, 1)); // start of 'Second line'
        assert_eq!(ctx.offset_to_line_col(15), (3, 7)); // middle of 'Second line'
        assert_eq!(ctx.offset_to_line_col(21), (4, 1)); // start of 'Third line'
    }

    /// Per-line metadata: content, byte offset, indentation, and blank detection.
    #[test]
    fn test_line_info() {
        let content = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);

        // Test line info
        assert_eq!(ctx.lines.len(), 7);

        // Line 1: "# Title"
        let line1 = &ctx.lines[0];
        assert_eq!(line1.content(ctx.content), "# Title");
        assert_eq!(line1.byte_offset, 0);
        assert_eq!(line1.indent, 0);
        assert!(!line1.is_blank);
        assert!(!line1.in_code_block);
        assert!(line1.list_item.is_none());

        // Line 2: "    indented"
        let line2 = &ctx.lines[1];
        assert_eq!(line2.content(ctx.content), "    indented");
        assert_eq!(line2.byte_offset, 8);
        assert_eq!(line2.indent, 4);
        assert!(!line2.is_blank);

        // Line 3: "" (blank)
        let line3 = &ctx.lines[2];
        assert_eq!(line3.content(ctx.content), "");
        assert!(line3.is_blank);

        // Test helper methods
        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
        assert_eq!(ctx.line_info(1).map(|l| l.indent), Some(0));
        assert_eq!(ctx.line_info(2).map(|l| l.indent), Some(4));
    }

    /// Ordered and unordered list markers are detected along with their columns.
    #[test]
    fn test_list_item_detection() {
        let content = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);

        // Line 1: "- Unordered item"
        let line1 = &ctx.lines[0];
        assert!(line1.list_item.is_some());
        let list1 = line1.list_item.as_ref().unwrap();
        assert_eq!(list1.marker, "-");
        assert!(!list1.is_ordered);
        assert_eq!(list1.marker_column, 0);
        assert_eq!(list1.content_column, 2);

        // Line 2: "  * Nested item"
        let line2 = &ctx.lines[1];
        assert!(line2.list_item.is_some());
        let list2 = line2.list_item.as_ref().unwrap();
        assert_eq!(list2.marker, "*");
        assert_eq!(list2.marker_column, 2);

        // Line 3: "1. Ordered item"
        let line3 = &ctx.lines[2];
        assert!(line3.list_item.is_some());
        let list3 = line3.list_item.as_ref().unwrap();
        assert_eq!(list3.marker, "1.");
        assert!(list3.is_ordered);
        assert_eq!(list3.number, Some(1));

        // Line 6: "Not a list"
        let line6 = &ctx.lines[5];
        assert!(line6.list_item.is_none());
    }

    /// Offsets at line starts, just past a line's last character, and at end of input.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let content = "a\nb\nc";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
        // line_offsets: [0, 2, 4]
        assert_eq!(ctx.offset_to_line_col(0), (1, 1)); // 'a'
        assert_eq!(ctx.offset_to_line_col(1), (1, 2)); // after 'a'
        assert_eq!(ctx.offset_to_line_col(2), (2, 1)); // 'b'
        assert_eq!(ctx.offset_to_line_col(3), (2, 2)); // after 'b'
        assert_eq!(ctx.offset_to_line_col(4), (3, 1)); // 'c'
        assert_eq!(ctx.offset_to_line_col(5), (3, 2)); // after 'c'
    }

    /// In MDX flavor, leading `import` / `export` lines are flagged as ESM blocks
    /// while the lines that follow are not.
    #[test]
    fn test_mdx_esm_blocks() {
        let content = r##"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall

In {year}, the snowfall was above average.
It was followed by a warm spring which caused
flood conditions in many of the nearby rivers.

<Chart color="#fcb32c" year={year} />
"##;

        let ctx = LintContext::new(content, MarkdownFlavor::MDX, None);

        // Check that lines 1 and 2 are marked as ESM blocks
        assert_eq!(ctx.lines.len(), 10);
        assert!(ctx.lines[0].in_esm_block, "Line 1 (import) should be in_esm_block");
        assert!(ctx.lines[1].in_esm_block, "Line 2 (export) should be in_esm_block");
        assert!(!ctx.lines[2].in_esm_block, "Line 3 (blank) should NOT be in_esm_block");
        assert!(
            !ctx.lines[3].in_esm_block,
            "Line 4 (heading) should NOT be in_esm_block"
        );
        assert!(!ctx.lines[4].in_esm_block, "Line 5 (blank) should NOT be in_esm_block");
        assert!(!ctx.lines[5].in_esm_block, "Line 6 (text) should NOT be in_esm_block");
    }

    /// The same `import` / `export` lines are plain content in Standard flavor.
    #[test]
    fn test_mdx_esm_blocks_not_detected_in_standard_flavor() {
        let content = r#"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall
"#;

        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);

        // ESM blocks should NOT be detected in Standard flavor
        assert!(
            !ctx.lines[0].in_esm_block,
            "Line 1 should NOT be in_esm_block in Standard flavor"
        );
        assert!(
            !ctx.lines[1].in_esm_block,
            "Line 2 should NOT be in_esm_block in Standard flavor"
        );
    }
}
rumdl_lib/lint_context.rs

rumdl_lib/
lint_context.rs