rumdl_lib/
lint_context.rs

1use crate::config::MarkdownFlavor;
2use crate::rules::front_matter_utils::FrontMatterUtils;
3use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
4use pulldown_cmark::{BrokenLink, Event, LinkType, Options, Parser, Tag, TagEnd};
5use regex::Regex;
6use std::borrow::Cow;
7use std::path::PathBuf;
8use std::sync::LazyLock;
9
/// Evaluate `$code` and yield its value, optionally reporting how long it took.
///
/// Native (non-WASM) variant: a timer is started before `$code` runs; when
/// `$profile` is true the elapsed time is printed to stderr, labelled with `$name`.
#[cfg(not(target_arch = "wasm32"))]
macro_rules! profile_section {
    ($name:expr, $profile:expr, $code:expr) => {{
        let timer = std::time::Instant::now();
        let value = $code;
        if $profile {
            eprintln!("[PROFILE] {}: {:?}", $name, timer.elapsed());
        }
        value
    }};
}

/// WASM variant: no timing is performed; `$code` is simply evaluated.
#[cfg(target_arch = "wasm32")]
macro_rules! profile_section {
    ($name:expr, $profile:expr, $code:expr) => {{ $code }};
}
27
// Comprehensive link pattern that captures both inline and reference links.
// Flags: (?s) makes `.` match newlines; (?x) enables verbose mode, so the
// whitespace and `#` comments inside the pattern are ignored by the regex engine.
// Capture groups:
//   1   - link text (no unescaped `[`/`]`; backslash escapes are allowed)
//   2/3 - inline URL, either `<angle-bracketed>` (2) or bare (3)
//   4/5 - optional inline title, double-quoted (4) or single-quoted (5)
//   6   - reference ID of a `[text][ref]` link
static LINK_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
        \[((?:[^\[\]\\]|\\.)*)\]          # Link text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
        (?:
            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
            |
            \[([^\]]*)\]      # Reference ID in group 6
        )"#
    ).unwrap()
});
41
// Image pattern: same shape as LINK_PATTERN but requires a leading `!`.
// Flags: (?s) makes `.` match newlines; (?x) enables verbose mode, so the
// whitespace and `#` comments inside the pattern are ignored by the regex engine.
// Capture groups:
//   1   - alt text (no unescaped `[`/`]`; backslash escapes are allowed)
//   2/3 - inline URL, either `<angle-bracketed>` (2) or bare (3)
//   4/5 - optional inline title, double-quoted (4) or single-quoted (5)
//   6   - reference ID of a `![alt][ref]` image
static IMAGE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
        !\[((?:[^\[\]\\]|\\.)*)\]         # Alt text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
        (?:
            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
            |
            \[([^\]]*)\]      # Reference ID in group 6
        )"#
    ).unwrap()
});
55
// Reference definition pattern: `[id]: url "title"` on a single line.
// (?m) makes `^`/`$` match at line boundaries; up to 3 leading spaces are allowed.
// Capture groups: 1 = reference ID, 2 = URL (a run of non-whitespace),
// 3 = double-quoted title, 4 = single-quoted title (the title is optional).
static REF_DEF_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#).unwrap());
59
// Pattern for bare URLs.
// Matches an `http`, `https` or `ftp` scheme (capture group 1) followed by `://`
// and a host, then an optional `:port`, `/path`, `?query` and `#fragment`.
// Each part stops at whitespace or any of: < > [ ] ( ) \ ' " `
static BARE_URL_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
    ).unwrap()
});
66
// Pattern for email addresses: `local@domain.tld`, where the final label
// after the last dot consists of at least two ASCII letters.
// The pattern is unanchored, so it finds addresses anywhere in the searched text.
static BARE_EMAIL_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap());
70
// Pattern for blockquote prefix in parse_list_blocks.
// Anchored at the start of the input; capture group 1 is the whole prefix:
// optional whitespace, one or more `>`, then optional whitespace.
static BLOCKQUOTE_PREFIX_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*>+\s*)").unwrap());
73
74/// Pre-computed information about a line
75#[derive(Debug, Clone)]
76pub struct LineInfo {
77    /// Byte offset where this line starts in the document
78    pub byte_offset: usize,
79    /// Length of the line in bytes (without newline)
80    pub byte_len: usize,
81    /// Number of leading spaces/tabs
82    pub indent: usize,
83    /// Whether the line is blank (empty or only whitespace)
84    pub is_blank: bool,
85    /// Whether this line is inside a code block
86    pub in_code_block: bool,
87    /// Whether this line is inside front matter
88    pub in_front_matter: bool,
89    /// Whether this line is inside an HTML block
90    pub in_html_block: bool,
91    /// Whether this line is inside an HTML comment
92    pub in_html_comment: bool,
93    /// List item information if this line starts a list item
94    pub list_item: Option<ListItemInfo>,
95    /// Heading information if this line is a heading
96    pub heading: Option<HeadingInfo>,
97    /// Blockquote information if this line is a blockquote
98    pub blockquote: Option<BlockquoteInfo>,
99    /// Whether this line is inside a mkdocstrings autodoc block
100    pub in_mkdocstrings: bool,
101    /// Whether this line is part of an ESM import/export block (MDX only)
102    pub in_esm_block: bool,
103    /// Whether this line is a continuation of a multi-line code span from a previous line
104    pub in_code_span_continuation: bool,
105}
106
107impl LineInfo {
108    /// Get the line content as a string slice from the source document
109    pub fn content<'a>(&self, source: &'a str) -> &'a str {
110        &source[self.byte_offset..self.byte_offset + self.byte_len]
111    }
112}
113
/// Information about a list item
///
/// Stored in `LineInfo::list_item` for the line on which the item starts.
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// The marker used (*, -, +, or number with . or ))
    pub marker: String,
    /// Whether it's ordered (true) or unordered (false)
    pub is_ordered: bool,
    /// The number for ordered lists (`None` otherwise)
    pub number: Option<usize>,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where content after marker starts
    pub content_column: usize,
}
128
/// Heading style type
///
/// Distinguishes `#`-prefixed (ATX) headings from underlined (Setext) headings;
/// the two Setext variants record which underline character was used.
#[derive(Debug, Clone, PartialEq)]
pub enum HeadingStyle {
    /// ATX style heading (# Heading)
    ATX,
    /// Setext style heading with = underline
    Setext1,
    /// Setext style heading with - underline
    Setext2,
}
139
/// Parsed link information
///
/// Text fields are `Cow<'a, str>` so they can borrow from the source document
/// (lifetime `'a`) or own their data.
#[derive(Debug, Clone)]
pub struct ParsedLink<'a> {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Start byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Link text
    pub text: Cow<'a, str>,
    /// Link URL or reference
    pub url: Cow<'a, str>,
    /// Whether this is a reference link `[text][ref]` vs inline `[text](url)`
    pub is_reference: bool,
    /// Reference ID for reference links
    pub reference_id: Option<Cow<'a, str>>,
    /// Link type from pulldown-cmark
    pub link_type: LinkType,
}
164
/// Information about a broken link reported by pulldown-cmark
///
/// "Broken" here means a reference that could not be resolved.
#[derive(Debug, Clone)]
pub struct BrokenLinkInfo {
    /// The reference text that couldn't be resolved
    pub reference: String,
    /// Byte span (start..end) in the source document
    pub span: std::ops::Range<usize>,
}
173
/// Parsed footnote reference (e.g., `[^1]`, `[^note]`)
///
/// Records where the reference occurs in the document and which footnote it names.
#[derive(Debug, Clone)]
pub struct FootnoteRef {
    /// The footnote ID (without the ^ prefix)
    pub id: String,
    /// Line number (1-indexed)
    pub line: usize,
    /// Start byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
}
186
/// Parsed image information
///
/// Mirrors `ParsedLink`, with `alt_text` in place of the link text. Text fields
/// are `Cow<'a, str>` so they can borrow from the source document or own their data.
#[derive(Debug, Clone)]
pub struct ParsedImage<'a> {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Start byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Alt text
    pub alt_text: Cow<'a, str>,
    /// Image URL or reference
    pub url: Cow<'a, str>,
    /// Whether this is a reference image `![alt][ref]` vs inline `![alt](url)`
    pub is_reference: bool,
    /// Reference ID for reference images
    pub reference_id: Option<Cow<'a, str>>,
    /// Link type from pulldown-cmark
    pub link_type: LinkType,
}
211
/// Reference definition `[ref]: url "title"`
///
/// All strings are owned copies; byte offsets refer to the source document.
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// Line number (1-indexed)
    pub line: usize,
    /// Reference ID (normalized to lowercase)
    pub id: String,
    /// URL
    pub url: String,
    /// Optional title
    pub title: Option<String>,
    /// Byte offset where the reference definition starts
    pub byte_offset: usize,
    /// Byte offset where the reference definition ends
    pub byte_end: usize,
    /// Byte offset where the title starts (if present, includes quote)
    pub title_byte_start: Option<usize>,
    /// Byte offset where the title ends (if present, includes quote)
    pub title_byte_end: Option<usize>,
}
232
/// Parsed code span information
///
/// A span may cover several lines: `line`/`start_col` locate its start and
/// `end_line`/`end_col` its end.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// Line number where the code span starts (1-indexed)
    pub line: usize,
    /// Line number where the code span ends (1-indexed)
    pub end_line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Start byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Number of backticks used (1, 2, 3, etc.)
    pub backtick_count: usize,
    /// Content inside the code span (without backticks)
    pub content: String,
}
253
/// Information about a heading
///
/// Stored in `LineInfo::heading` for lines that are headings. Malformed
/// headings are represented too, with `is_valid` set to false.
#[derive(Debug, Clone)]
pub struct HeadingInfo {
    /// Heading level (1-6 for ATX, 1-2 for Setext)
    pub level: u8,
    /// Style of heading
    pub style: HeadingStyle,
    /// The heading marker (# characters or underline)
    pub marker: String,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where heading text starts
    pub content_column: usize,
    /// The heading text (without markers and without custom ID syntax)
    pub text: String,
    /// Custom header ID if present (e.g., from {#custom-id} syntax)
    pub custom_id: Option<String>,
    /// Original heading text including custom ID syntax
    pub raw_text: String,
    /// Whether it has a closing sequence (for ATX)
    pub has_closing_sequence: bool,
    /// The closing sequence if present
    pub closing_sequence: String,
    /// Whether this is a valid CommonMark heading (ATX headings require space after #)
    /// False for malformed headings like `#NoSpace` that MD018 should flag
    pub is_valid: bool,
}
281
/// A valid heading from a filtered iteration
///
/// Only includes headings that are CommonMark-compliant (have space after #).
/// Hashtag-like patterns (`#tag`, `#123`) are excluded.
///
/// This is the item type yielded by `ValidHeadingsIter`; both references
/// borrow from the slice of `LineInfo` being iterated.
#[derive(Debug, Clone)]
pub struct ValidHeading<'a> {
    /// The 1-indexed line number in the document
    pub line_num: usize,
    /// Reference to the heading information
    pub heading: &'a HeadingInfo,
    /// Reference to the full line info (for rules that need additional context)
    pub line_info: &'a LineInfo,
}
295
296/// Iterator over valid CommonMark headings in a document
297///
298/// Filters out malformed headings like `#NoSpace` that should be flagged by MD018
299/// but should not be processed by other heading rules.
300pub struct ValidHeadingsIter<'a> {
301    lines: &'a [LineInfo],
302    current_index: usize,
303}
304
305impl<'a> ValidHeadingsIter<'a> {
306    fn new(lines: &'a [LineInfo]) -> Self {
307        Self {
308            lines,
309            current_index: 0,
310        }
311    }
312}
313
314impl<'a> Iterator for ValidHeadingsIter<'a> {
315    type Item = ValidHeading<'a>;
316
317    fn next(&mut self) -> Option<Self::Item> {
318        while self.current_index < self.lines.len() {
319            let idx = self.current_index;
320            self.current_index += 1;
321
322            let line_info = &self.lines[idx];
323            if let Some(heading) = &line_info.heading
324                && heading.is_valid
325            {
326                return Some(ValidHeading {
327                    line_num: idx + 1, // Convert 0-indexed to 1-indexed
328                    heading,
329                    line_info,
330                });
331            }
332        }
333        None
334    }
335}
336
/// Information about a blockquote line
///
/// Stored in `LineInfo::blockquote` for lines that are blockquotes. Besides the
/// parsed pieces of the line, it carries pre-computed spacing flags.
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Nesting level (1 for >, 2 for >>, etc.)
    pub nesting_level: usize,
    /// The indentation before the blockquote marker
    pub indent: String,
    /// Column where the first > starts (0-based)
    pub marker_column: usize,
    /// The blockquote prefix (e.g., "> ", ">> ", etc.)
    pub prefix: String,
    /// Content after the blockquote marker(s)
    pub content: String,
    /// Whether the line has no space after the marker
    pub has_no_space_after_marker: bool,
    /// Whether the line has multiple spaces after the marker
    pub has_multiple_spaces_after_marker: bool,
    /// Whether this is an empty blockquote line needing MD028 fix
    pub needs_md028_fix: bool,
}
357
/// Information about a list block
///
/// Describes a run of lines (`start_line..=end_line`, 1-indexed) that form one
/// list, together with the lines inside it that are list items.
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// Line number where the list starts (1-indexed)
    pub start_line: usize,
    /// Line number where the list ends (1-indexed)
    pub end_line: usize,
    /// Whether it's ordered or unordered
    pub is_ordered: bool,
    /// The consistent marker for unordered lists (if any)
    pub marker: Option<String>,
    /// Blockquote prefix for this list (empty if not in blockquote)
    pub blockquote_prefix: String,
    /// Lines that are list items within this block
    pub item_lines: Vec<usize>,
    /// Nesting level (0 for top-level lists)
    pub nesting_level: usize,
    /// Maximum marker width seen in this block (e.g., 3 for "1. ", 4 for "10. ")
    pub max_marker_width: usize,
}
378
379use std::sync::{Arc, OnceLock};
380
/// Character frequency data for fast content analysis
///
/// Each field counts occurrences of one Markdown-significant character; the
/// parenthesised notes name the constructs that character is associated with.
/// `Default` yields all-zero counts.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Count of # characters (headings)
    pub hash_count: usize,
    /// Count of * characters (emphasis, lists, horizontal rules)
    pub asterisk_count: usize,
    /// Count of _ characters (emphasis, horizontal rules)
    pub underscore_count: usize,
    /// Count of - characters (lists, horizontal rules, setext headings)
    pub hyphen_count: usize,
    /// Count of + characters (lists)
    pub plus_count: usize,
    /// Count of > characters (blockquotes)
    pub gt_count: usize,
    /// Count of | characters (tables)
    pub pipe_count: usize,
    /// Count of [ characters (links, images)
    pub bracket_count: usize,
    /// Count of ` characters (code spans, code blocks)
    pub backtick_count: usize,
    /// Count of < characters (HTML tags, autolinks)
    pub lt_count: usize,
    /// Count of ! characters (images)
    pub exclamation_count: usize,
    /// Count of newline characters
    pub newline_count: usize,
}
409
/// Pre-parsed HTML tag information
///
/// One entry per tag occurrence, with its position in the document and basic
/// classification (opening, closing, or self-closing).
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Start byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Tag name (e.g., "div", "img", "br")
    pub tag_name: String,
    /// Whether it's a closing tag (`</tag>`)
    pub is_closing: bool,
    /// Whether it's self-closing (`<tag />`)
    pub is_self_closing: bool,
    /// Raw tag content
    pub raw_content: String,
}
432
/// Pre-parsed emphasis span information
///
/// Records where an emphasis run sits in the document, which marker character
/// it uses, and how many markers delimit it.
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Start byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Type of emphasis ('*' or '_')
    pub marker: char,
    /// Number of markers (1 for italic, 2 for bold, 3+ for bold+italic)
    pub marker_count: usize,
    /// Content inside the emphasis
    pub content: String,
}
453
/// Pre-parsed table row information
///
/// Describes a single table line: whether it is the separator row, and how
/// many columns it has.
#[derive(Debug, Clone)]
pub struct TableRow {
    /// Line number (1-indexed)
    pub line: usize,
    /// Whether this is a separator row (contains only |, -, :, and spaces)
    pub is_separator: bool,
    /// Number of columns (pipe-separated cells)
    pub column_count: usize,
    /// Alignment info from separator row, one string per column
    pub column_alignments: Vec<String>, // "left", "center", "right", "none"
}
466
/// Pre-parsed bare URL information (not in links)
///
/// Despite the name, `url_type` may be "email", so an entry can describe an
/// email address as well as a URL.
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Start byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// The URL string
    pub url: String,
    /// Type of URL ("http", "https", "ftp", "email")
    pub url_type: String,
}
485
/// Pre-parsed view of one Markdown document, built once by `LintContext::new`
///
/// Holds the borrowed source text plus structures derived from it. Public
/// fields are filled in during construction; the private `*_cache` fields are
/// `OnceLock`s read through the accessor methods of the same name.
pub struct LintContext<'a> {
    pub content: &'a str,                 // The source document text
    pub line_offsets: Vec<usize>,         // Byte offset at which each line starts (first entry is 0)
    pub code_blocks: Vec<(usize, usize)>, // Cached code block ranges (not including inline code spans)
    pub lines: Vec<LineInfo>,             // Pre-computed line information
    pub links: Vec<ParsedLink<'a>>,       // Pre-parsed links
    pub images: Vec<ParsedImage<'a>>,     // Pre-parsed images
    pub broken_links: Vec<BrokenLinkInfo>, // Broken/undefined references
    pub footnote_refs: Vec<FootnoteRef>,  // Pre-parsed footnote references
    pub reference_defs: Vec<ReferenceDef>, // Reference definitions
    code_spans_cache: OnceLock<Arc<Vec<CodeSpan>>>, // Inline code spans (`new` fills this in eagerly)
    pub list_blocks: Vec<ListBlock>,      // Pre-parsed list blocks
    pub char_frequency: CharFrequency,    // Character frequency analysis
    html_tags_cache: OnceLock<Arc<Vec<HtmlTag>>>, // Lazy-loaded HTML tags
    emphasis_spans_cache: OnceLock<Arc<Vec<EmphasisSpan>>>, // Lazy-loaded emphasis spans
    table_rows_cache: OnceLock<Arc<Vec<TableRow>>>, // Lazy-loaded table rows
    bare_urls_cache: OnceLock<Arc<Vec<BareUrl>>>, // Lazy-loaded bare URLs
    html_comment_ranges: Vec<crate::utils::skip_context::ByteRange>, // Pre-computed HTML comment ranges
    pub table_blocks: Vec<crate::utils::table_utils::TableBlock>, // Pre-computed table blocks
    pub line_index: crate::utils::range_utils::LineIndex<'a>, // Pre-computed line index for byte position calculations
    jinja_ranges: Vec<(usize, usize)>,    // Pre-computed Jinja template ranges ({{ }}, {% %})
    pub flavor: MarkdownFlavor,           // Markdown flavor being used
    pub source_file: Option<PathBuf>,     // Source file path (for rules that need file context)
}
510
/// The four consecutive pieces of a blockquote line, borrowed from that line
struct BlockquoteComponents<'a> {
    /// Leading spaces/tabs before the first `>`
    indent: &'a str,
    /// The run of consecutive `>` characters
    markers: &'a str,
    /// Spaces/tabs immediately after the markers
    spaces_after: &'a str,
    /// Everything that follows `spaces_after`
    content: &'a str,
}

/// Split a line into blockquote components without using a regex
///
/// Returns `None` unless the first character after any leading spaces/tabs is
/// `>`. Only an unbroken run of `>` counts as markers, so in `> > text` the
/// second `>` is part of `content`.
#[inline]
fn parse_blockquote_detailed(line: &str) -> Option<BlockquoteComponents<'_>> {
    fn is_blank(c: char) -> bool {
        c == ' ' || c == '\t'
    }

    // Peel the pieces off the front one at a time; each remainder is a suffix
    // of the previous one, so lengths give the split points.
    let after_indent = line.trim_start_matches(is_blank);
    let after_markers = after_indent.trim_start_matches('>');

    let marker_len = after_indent.len() - after_markers.len();
    if marker_len == 0 {
        // No '>' at the expected position: not a blockquote line
        return None;
    }

    let content = after_markers.trim_start_matches(is_blank);
    let indent_len = line.len() - after_indent.len();
    let spaces_len = after_markers.len() - content.len();

    Some(BlockquoteComponents {
        indent: &line[..indent_len],
        markers: &after_indent[..marker_len],
        spaces_after: &after_markers[..spaces_len],
        content,
    })
}
555
556impl<'a> LintContext<'a> {
    /// Build a `LintContext` by pre-parsing `content` once
    ///
    /// * `content` - the document text; the context borrows it for `'a`
    /// * `flavor` - Markdown flavor, passed to the flavor-aware parsing stages
    /// * `source_file` - optional path of the document, stored as-is
    ///
    /// The stages run in a fixed order because later ones consume the results
    /// of earlier ones (e.g. HTML/ESM block detection precedes heading
    /// detection, and code spans are parsed before links and images).
    ///
    /// On non-WASM targets, setting the `RUMDL_PROFILE_QUADRATIC` environment
    /// variable makes each stage print its elapsed time to stderr.
    pub fn new(content: &'a str, flavor: MarkdownFlavor, source_file: Option<PathBuf>) -> Self {
        // Profiling is enabled by the mere presence of the variable
        #[cfg(not(target_arch = "wasm32"))]
        let profile = std::env::var("RUMDL_PROFILE_QUADRATIC").is_ok();
        #[cfg(target_arch = "wasm32")]
        let profile = false;

        // Byte offset of the start of every line: 0, then one past each '\n'
        let line_offsets = profile_section!("Line offsets", profile, {
            let mut offsets = vec![0];
            for (i, c) in content.char_indices() {
                if c == '\n' {
                    offsets.push(i + 1);
                }
            }
            offsets
        });

        // Detect code blocks once and cache them
        let code_blocks = profile_section!("Code blocks", profile, CodeBlockUtils::detect_code_blocks(content));

        // Pre-compute HTML comment ranges ONCE for all operations
        let html_comment_ranges = profile_section!(
            "HTML comment ranges",
            profile,
            crate::utils::skip_context::compute_html_comment_ranges(content)
        );

        // Pre-compute autodoc block ranges for MkDocs flavor (avoids O(n²) scaling)
        let autodoc_ranges = profile_section!("Autodoc block ranges", profile, {
            if flavor == MarkdownFlavor::MkDocs {
                crate::utils::mkdocstrings_refs::detect_autodoc_block_ranges(content)
            } else {
                Vec::new()
            }
        });

        // Pre-compute line information (without headings/blockquotes yet)
        let mut lines = profile_section!(
            "Basic line info",
            profile,
            Self::compute_basic_line_info(
                content,
                &line_offsets,
                &code_blocks,
                flavor,
                &html_comment_ranges,
                &autodoc_ranges,
            )
        );

        // Detect HTML blocks BEFORE heading detection
        profile_section!("HTML blocks", profile, Self::detect_html_blocks(content, &mut lines));

        // Detect ESM import/export blocks in MDX files BEFORE heading detection
        profile_section!(
            "ESM blocks",
            profile,
            Self::detect_esm_blocks(content, &mut lines, flavor)
        );

        // Collect link byte ranges early for heading detection (to skip lines inside link syntax)
        let link_byte_ranges = profile_section!("Link byte ranges", profile, Self::collect_link_byte_ranges(content));

        // Now detect headings and blockquotes
        profile_section!(
            "Headings & blockquotes",
            profile,
            Self::detect_headings_and_blockquotes(content, &mut lines, flavor, &html_comment_ranges, &link_byte_ranges)
        );

        // Parse code spans early so we can exclude them from link/image parsing
        let code_spans = profile_section!("Code spans", profile, Self::parse_code_spans(content, &lines));

        // Mark lines that are continuations of multi-line code spans
        // This is needed for parse_list_blocks to correctly handle list items with multi-line code spans
        for span in &code_spans {
            if span.end_line > span.line {
                // Mark lines after the first line as continuations
                for line_num in (span.line + 1)..=span.end_line {
                    // line_num is 1-indexed; `lines` is 0-indexed
                    if let Some(line_info) = lines.get_mut(line_num - 1) {
                        line_info.in_code_span_continuation = true;
                    }
                }
            }
        }

        // Parse links, images, references, and list blocks
        let (links, broken_links, footnote_refs) = profile_section!(
            "Links",
            profile,
            Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor, &html_comment_ranges)
        );

        let images = profile_section!(
            "Images",
            profile,
            Self::parse_images(content, &lines, &code_blocks, &code_spans, &html_comment_ranges)
        );

        let reference_defs = profile_section!("Reference defs", profile, Self::parse_reference_defs(content, &lines));

        let list_blocks = profile_section!("List blocks", profile, Self::parse_list_blocks(content, &lines));

        // Compute character frequency for fast content analysis
        let char_frequency = profile_section!("Char frequency", profile, Self::compute_char_frequency(content));

        // Pre-compute table blocks for rules that need them (MD013, MD055, MD056, MD058, MD060)
        let table_blocks = profile_section!(
            "Table blocks",
            profile,
            crate::utils::table_utils::TableUtils::find_table_blocks_with_code_info(
                content,
                &code_blocks,
                &code_spans,
                &html_comment_ranges,
            )
        );

        // Pre-compute LineIndex once for all rules (eliminates 46x content cloning)
        let line_index = profile_section!(
            "Line index",
            profile,
            crate::utils::range_utils::LineIndex::new(content)
        );

        // Pre-compute Jinja template ranges once for all rules (eliminates O(n×m) in MD011)
        let jinja_ranges = profile_section!(
            "Jinja ranges",
            profile,
            crate::utils::jinja_utils::find_jinja_ranges(content)
        );

        Self {
            content,
            line_offsets,
            code_blocks,
            lines,
            links,
            images,
            broken_links,
            footnote_refs,
            reference_defs,
            // Code spans were already parsed above, so seed the cache instead of leaving it empty
            code_spans_cache: OnceLock::from(Arc::new(code_spans)),
            list_blocks,
            char_frequency,
            // The remaining caches start empty and are filled by their accessors on first use
            html_tags_cache: OnceLock::new(),
            emphasis_spans_cache: OnceLock::new(),
            table_rows_cache: OnceLock::new(),
            bare_urls_cache: OnceLock::new(),
            html_comment_ranges,
            table_blocks,
            line_index,
            jinja_ranges,
            flavor,
            source_file,
        }
    }
713
714    /// Get code spans - computed lazily on first access
715    pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
716        Arc::clone(
717            self.code_spans_cache
718                .get_or_init(|| Arc::new(Self::parse_code_spans(self.content, &self.lines))),
719        )
720    }
721
722    /// Get HTML comment ranges - pre-computed during LintContext construction
723    pub fn html_comment_ranges(&self) -> &[crate::utils::skip_context::ByteRange] {
724        &self.html_comment_ranges
725    }
726
727    /// Get HTML tags - computed lazily on first access
728    pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
729        Arc::clone(self.html_tags_cache.get_or_init(|| {
730            Arc::new(Self::parse_html_tags(
731                self.content,
732                &self.lines,
733                &self.code_blocks,
734                self.flavor,
735            ))
736        }))
737    }
738
739    /// Get emphasis spans - computed lazily on first access
740    pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
741        Arc::clone(
742            self.emphasis_spans_cache
743                .get_or_init(|| Arc::new(Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks))),
744        )
745    }
746
747    /// Get table rows - computed lazily on first access
748    pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
749        Arc::clone(
750            self.table_rows_cache
751                .get_or_init(|| Arc::new(Self::parse_table_rows(self.content, &self.lines))),
752        )
753    }
754
755    /// Get bare URLs - computed lazily on first access
756    pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
757        Arc::clone(
758            self.bare_urls_cache
759                .get_or_init(|| Arc::new(Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks))),
760        )
761    }
762
763    /// Map a byte offset to (line, column)
764    pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
765        match self.line_offsets.binary_search(&offset) {
766            Ok(line) => (line + 1, 1),
767            Err(line) => {
768                let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
769                (line, offset - line_start + 1)
770            }
771        }
772    }
773
774    /// Check if a position is within a code block or code span
775    pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
776        // Check code blocks first
777        if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
778            return true;
779        }
780
781        // Check inline code spans (lazy load if needed)
782        self.code_spans()
783            .iter()
784            .any(|span| pos >= span.byte_offset && pos < span.byte_end)
785    }
786
787    /// Get line information by line number (1-indexed)
788    pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
789        if line_num > 0 {
790            self.lines.get(line_num - 1)
791        } else {
792            None
793        }
794    }
795
796    /// Get byte offset for a line number (1-indexed)
797    pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
798        self.line_info(line_num).map(|info| info.byte_offset)
799    }
800
801    /// Get URL for a reference link/image by its ID
802    pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
803        let normalized_id = ref_id.to_lowercase();
804        self.reference_defs
805            .iter()
806            .find(|def| def.id == normalized_id)
807            .map(|def| def.url.as_str())
808    }
809
810    /// Check if a line is part of a list block
811    pub fn is_in_list_block(&self, line_num: usize) -> bool {
812        self.list_blocks
813            .iter()
814            .any(|block| line_num >= block.start_line && line_num <= block.end_line)
815    }
816
817    /// Get the list block containing a specific line
818    pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
819        self.list_blocks
820            .iter()
821            .find(|block| line_num >= block.start_line && line_num <= block.end_line)
822    }
823
824    // Compatibility methods for DocumentStructure migration
825
826    /// Check if a line is within a code block
827    pub fn is_in_code_block(&self, line_num: usize) -> bool {
828        if line_num == 0 || line_num > self.lines.len() {
829            return false;
830        }
831        self.lines[line_num - 1].in_code_block
832    }
833
834    /// Check if a line is within front matter
835    pub fn is_in_front_matter(&self, line_num: usize) -> bool {
836        if line_num == 0 || line_num > self.lines.len() {
837            return false;
838        }
839        self.lines[line_num - 1].in_front_matter
840    }
841
842    /// Check if a line is within an HTML block
843    pub fn is_in_html_block(&self, line_num: usize) -> bool {
844        if line_num == 0 || line_num > self.lines.len() {
845            return false;
846        }
847        self.lines[line_num - 1].in_html_block
848    }
849
850    /// Check if a line and column is within a code span
851    pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
852        if line_num == 0 || line_num > self.lines.len() {
853            return false;
854        }
855
856        // Use the code spans cache to check
857        // Note: col is 1-indexed from caller, but span.start_col and span.end_col are 0-indexed
858        // Convert col to 0-indexed for comparison
859        let col_0indexed = if col > 0 { col - 1 } else { 0 };
860        let code_spans = self.code_spans();
861        code_spans.iter().any(|span| {
862            // Check if line is within the span's line range
863            if line_num < span.line || line_num > span.end_line {
864                return false;
865            }
866
867            if span.line == span.end_line {
868                // Single-line span: check column bounds
869                col_0indexed >= span.start_col && col_0indexed < span.end_col
870            } else if line_num == span.line {
871                // First line of multi-line span: anything after start_col is in span
872                col_0indexed >= span.start_col
873            } else if line_num == span.end_line {
874                // Last line of multi-line span: anything before end_col is in span
875                col_0indexed < span.end_col
876            } else {
877                // Middle line of multi-line span: entire line is in span
878                true
879            }
880        })
881    }
882
883    /// Check if a byte offset is within a code span
884    #[inline]
885    pub fn is_byte_offset_in_code_span(&self, byte_offset: usize) -> bool {
886        let code_spans = self.code_spans();
887        code_spans
888            .iter()
889            .any(|span| byte_offset >= span.byte_offset && byte_offset < span.byte_end)
890    }
891
892    /// Check if a byte position is within a reference definition
893    /// This is much faster than scanning the content with regex for each check (O(1) vs O(n))
894    #[inline]
895    pub fn is_in_reference_def(&self, byte_pos: usize) -> bool {
896        self.reference_defs
897            .iter()
898            .any(|ref_def| byte_pos >= ref_def.byte_offset && byte_pos < ref_def.byte_end)
899    }
900
901    /// Check if a byte position is within an HTML comment
902    /// This is much faster than scanning the content with regex for each check (O(k) vs O(n))
903    /// where k is the number of HTML comments (typically very small)
904    #[inline]
905    pub fn is_in_html_comment(&self, byte_pos: usize) -> bool {
906        self.html_comment_ranges
907            .iter()
908            .any(|range| byte_pos >= range.start && byte_pos < range.end)
909    }
910
911    /// Check if a byte position is within an HTML tag (including multiline tags)
912    /// Uses the pre-parsed html_tags which correctly handles tags spanning multiple lines
913    #[inline]
914    pub fn is_in_html_tag(&self, byte_pos: usize) -> bool {
915        self.html_tags()
916            .iter()
917            .any(|tag| byte_pos >= tag.byte_offset && byte_pos < tag.byte_end)
918    }
919
920    /// Check if a byte position is within a Jinja template ({{ }} or {% %})
921    pub fn is_in_jinja_range(&self, byte_pos: usize) -> bool {
922        self.jinja_ranges
923            .iter()
924            .any(|(start, end)| byte_pos >= *start && byte_pos < *end)
925    }
926
927    /// Check if a byte position is within a link reference definition title
928    pub fn is_in_link_title(&self, byte_pos: usize) -> bool {
929        self.reference_defs.iter().any(|def| {
930            if let (Some(start), Some(end)) = (def.title_byte_start, def.title_byte_end) {
931                byte_pos >= start && byte_pos < end
932            } else {
933                false
934            }
935        })
936    }
937
938    /// Check if content has any instances of a specific character (fast)
939    pub fn has_char(&self, ch: char) -> bool {
940        match ch {
941            '#' => self.char_frequency.hash_count > 0,
942            '*' => self.char_frequency.asterisk_count > 0,
943            '_' => self.char_frequency.underscore_count > 0,
944            '-' => self.char_frequency.hyphen_count > 0,
945            '+' => self.char_frequency.plus_count > 0,
946            '>' => self.char_frequency.gt_count > 0,
947            '|' => self.char_frequency.pipe_count > 0,
948            '[' => self.char_frequency.bracket_count > 0,
949            '`' => self.char_frequency.backtick_count > 0,
950            '<' => self.char_frequency.lt_count > 0,
951            '!' => self.char_frequency.exclamation_count > 0,
952            '\n' => self.char_frequency.newline_count > 0,
953            _ => self.content.contains(ch), // Fallback for other characters
954        }
955    }
956
957    /// Get count of a specific character (fast)
958    pub fn char_count(&self, ch: char) -> usize {
959        match ch {
960            '#' => self.char_frequency.hash_count,
961            '*' => self.char_frequency.asterisk_count,
962            '_' => self.char_frequency.underscore_count,
963            '-' => self.char_frequency.hyphen_count,
964            '+' => self.char_frequency.plus_count,
965            '>' => self.char_frequency.gt_count,
966            '|' => self.char_frequency.pipe_count,
967            '[' => self.char_frequency.bracket_count,
968            '`' => self.char_frequency.backtick_count,
969            '<' => self.char_frequency.lt_count,
970            '!' => self.char_frequency.exclamation_count,
971            '\n' => self.char_frequency.newline_count,
972            _ => self.content.matches(ch).count(), // Fallback for other characters
973        }
974    }
975
976    /// Check if content likely contains headings (fast)
977    pub fn likely_has_headings(&self) -> bool {
978        self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 // Potential setext underlines
979    }
980
981    /// Check if content likely contains lists (fast)
982    pub fn likely_has_lists(&self) -> bool {
983        self.char_frequency.asterisk_count > 0
984            || self.char_frequency.hyphen_count > 0
985            || self.char_frequency.plus_count > 0
986    }
987
988    /// Check if content likely contains emphasis (fast)
989    pub fn likely_has_emphasis(&self) -> bool {
990        self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
991    }
992
993    /// Check if content likely contains tables (fast)
994    pub fn likely_has_tables(&self) -> bool {
995        self.char_frequency.pipe_count > 2
996    }
997
998    /// Check if content likely contains blockquotes (fast)
999    pub fn likely_has_blockquotes(&self) -> bool {
1000        self.char_frequency.gt_count > 0
1001    }
1002
1003    /// Check if content likely contains code (fast)
1004    pub fn likely_has_code(&self) -> bool {
1005        self.char_frequency.backtick_count > 0
1006    }
1007
1008    /// Check if content likely contains links or images (fast)
1009    pub fn likely_has_links_or_images(&self) -> bool {
1010        self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
1011    }
1012
1013    /// Check if content likely contains HTML (fast)
1014    pub fn likely_has_html(&self) -> bool {
1015        self.char_frequency.lt_count > 0
1016    }
1017
1018    /// Get HTML tags on a specific line
1019    pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
1020        self.html_tags()
1021            .iter()
1022            .filter(|tag| tag.line == line_num)
1023            .cloned()
1024            .collect()
1025    }
1026
1027    /// Get emphasis spans on a specific line
1028    pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
1029        self.emphasis_spans()
1030            .iter()
1031            .filter(|span| span.line == line_num)
1032            .cloned()
1033            .collect()
1034    }
1035
1036    /// Get table rows on a specific line
1037    pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
1038        self.table_rows()
1039            .iter()
1040            .filter(|row| row.line == line_num)
1041            .cloned()
1042            .collect()
1043    }
1044
1045    /// Get bare URLs on a specific line
1046    pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
1047        self.bare_urls()
1048            .iter()
1049            .filter(|url| url.line == line_num)
1050            .cloned()
1051            .collect()
1052    }
1053
1054    /// Find the line index for a given byte offset using binary search.
1055    /// Returns (line_index, line_number, column) where:
1056    /// - line_index is the 0-based index in the lines array
1057    /// - line_number is the 1-based line number
1058    /// - column is the byte offset within that line
1059    #[inline]
1060    fn find_line_for_offset(lines: &[LineInfo], byte_offset: usize) -> (usize, usize, usize) {
1061        // Binary search to find the line containing this byte offset
1062        let idx = match lines.binary_search_by(|line| {
1063            if byte_offset < line.byte_offset {
1064                std::cmp::Ordering::Greater
1065            } else if byte_offset > line.byte_offset + line.byte_len {
1066                std::cmp::Ordering::Less
1067            } else {
1068                std::cmp::Ordering::Equal
1069            }
1070        }) {
1071            Ok(idx) => idx,
1072            Err(idx) => idx.saturating_sub(1),
1073        };
1074
1075        let line = &lines[idx];
1076        let line_num = idx + 1;
1077        let col = byte_offset.saturating_sub(line.byte_offset);
1078
1079        (idx, line_num, col)
1080    }
1081
1082    /// Check if a byte offset is within a code span using binary search
1083    #[inline]
1084    fn is_offset_in_code_span(code_spans: &[CodeSpan], offset: usize) -> bool {
1085        // Since spans are sorted by byte_offset, use partition_point for binary search
1086        let idx = code_spans.partition_point(|span| span.byte_offset <= offset);
1087
1088        // Check the span that starts at or before our offset
1089        if idx > 0 {
1090            let span = &code_spans[idx - 1];
1091            if offset >= span.byte_offset && offset < span.byte_end {
1092                return true;
1093            }
1094        }
1095
1096        false
1097    }
1098
1099    /// Collect byte ranges of all links using pulldown-cmark
1100    /// This is used to skip heading detection for lines that fall within link syntax
1101    /// (e.g., multiline links like `[text](url\n#fragment)`)
1102    fn collect_link_byte_ranges(content: &str) -> Vec<(usize, usize)> {
1103        use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
1104
1105        let mut link_ranges = Vec::new();
1106        let mut options = Options::empty();
1107        options.insert(Options::ENABLE_WIKILINKS);
1108        options.insert(Options::ENABLE_FOOTNOTES);
1109
1110        let parser = Parser::new_ext(content, options).into_offset_iter();
1111        let mut link_stack: Vec<usize> = Vec::new();
1112
1113        for (event, range) in parser {
1114            match event {
1115                Event::Start(Tag::Link { .. }) => {
1116                    link_stack.push(range.start);
1117                }
1118                Event::End(TagEnd::Link) => {
1119                    if let Some(start_pos) = link_stack.pop() {
1120                        link_ranges.push((start_pos, range.end));
1121                    }
1122                }
1123                _ => {}
1124            }
1125        }
1126
1127        link_ranges
1128    }
1129
1130    /// Parse all links in the content
1131    fn parse_links(
1132        content: &'a str,
1133        lines: &[LineInfo],
1134        code_blocks: &[(usize, usize)],
1135        code_spans: &[CodeSpan],
1136        flavor: MarkdownFlavor,
1137        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1138    ) -> (Vec<ParsedLink<'a>>, Vec<BrokenLinkInfo>, Vec<FootnoteRef>) {
1139        use crate::utils::skip_context::{is_in_html_comment_ranges, is_mkdocs_snippet_line};
1140        use std::collections::HashSet;
1141
1142        let mut links = Vec::with_capacity(content.len() / 500);
1143        let mut broken_links = Vec::new();
1144        let mut footnote_refs = Vec::new();
1145
1146        // Track byte positions of links found by pulldown-cmark
1147        let mut found_positions = HashSet::new();
1148
1149        // Use pulldown-cmark's streaming parser with BrokenLink callback
1150        // The callback captures undefined references: [text][undefined], [shortcut], [text][]
1151        // This automatically handles:
1152        // - Escaped links (won't generate events)
1153        // - Links in code blocks/spans (won't generate Link events)
1154        // - Images (generates Tag::Image instead)
1155        // - Reference resolution (dest_url is already resolved!)
1156        // - Broken references (callback is invoked)
1157        // - Wiki-links (enabled via ENABLE_WIKILINKS)
1158        let mut options = Options::empty();
1159        options.insert(Options::ENABLE_WIKILINKS);
1160        options.insert(Options::ENABLE_FOOTNOTES);
1161
1162        let parser = Parser::new_with_broken_link_callback(
1163            content,
1164            options,
1165            Some(|link: BrokenLink<'_>| {
1166                broken_links.push(BrokenLinkInfo {
1167                    reference: link.reference.to_string(),
1168                    span: link.span.clone(),
1169                });
1170                None
1171            }),
1172        )
1173        .into_offset_iter();
1174
1175        let mut link_stack: Vec<(
1176            usize,
1177            usize,
1178            pulldown_cmark::CowStr<'a>,
1179            LinkType,
1180            pulldown_cmark::CowStr<'a>,
1181        )> = Vec::new();
1182        let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1183
1184        for (event, range) in parser {
1185            match event {
1186                Event::Start(Tag::Link {
1187                    link_type,
1188                    dest_url,
1189                    id,
1190                    ..
1191                }) => {
1192                    // Link start - record position, URL, and reference ID
1193                    link_stack.push((range.start, range.end, dest_url, link_type, id));
1194                    text_chunks.clear();
1195                }
1196                Event::Text(text) if !link_stack.is_empty() => {
1197                    // Track text content with its byte range
1198                    text_chunks.push((text.to_string(), range.start, range.end));
1199                }
1200                Event::Code(code) if !link_stack.is_empty() => {
1201                    // Include inline code in link text (with backticks)
1202                    let code_text = format!("`{code}`");
1203                    text_chunks.push((code_text, range.start, range.end));
1204                }
1205                Event::End(TagEnd::Link) => {
1206                    if let Some((start_pos, _link_start_end, url, link_type, ref_id)) = link_stack.pop() {
1207                        // Skip if in HTML comment
1208                        if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1209                            text_chunks.clear();
1210                            continue;
1211                        }
1212
1213                        // Find line and column information
1214                        let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1215
1216                        // Skip if this link is on a MkDocs snippet line
1217                        if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1218                            text_chunks.clear();
1219                            continue;
1220                        }
1221
1222                        let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1223
1224                        let is_reference = matches!(
1225                            link_type,
1226                            LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1227                        );
1228
1229                        // Extract link text directly from source bytes to preserve escaping
1230                        // Text events from pulldown-cmark unescape \] → ], which breaks MD039
1231                        let link_text = if start_pos < content.len() {
1232                            let link_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1233
1234                            // Find MATCHING ] by tracking bracket depth for nested brackets
1235                            // An unescaped bracket is one NOT preceded by an odd number of backslashes
1236                            // Brackets inside code spans (between backticks) should be ignored
1237                            let mut close_pos = None;
1238                            let mut depth = 0;
1239                            let mut in_code_span = false;
1240
1241                            for (i, &byte) in link_bytes.iter().enumerate().skip(1) {
1242                                // Count preceding backslashes
1243                                let mut backslash_count = 0;
1244                                let mut j = i;
1245                                while j > 0 && link_bytes[j - 1] == b'\\' {
1246                                    backslash_count += 1;
1247                                    j -= 1;
1248                                }
1249                                let is_escaped = backslash_count % 2 != 0;
1250
1251                                // Track code spans - backticks toggle in/out of code
1252                                if byte == b'`' && !is_escaped {
1253                                    in_code_span = !in_code_span;
1254                                }
1255
1256                                // Only count brackets when NOT in a code span
1257                                if !is_escaped && !in_code_span {
1258                                    if byte == b'[' {
1259                                        depth += 1;
1260                                    } else if byte == b']' {
1261                                        if depth == 0 {
1262                                            // Found the matching closing bracket
1263                                            close_pos = Some(i);
1264                                            break;
1265                                        } else {
1266                                            depth -= 1;
1267                                        }
1268                                    }
1269                                }
1270                            }
1271
1272                            if let Some(pos) = close_pos {
1273                                Cow::Borrowed(std::str::from_utf8(&link_bytes[1..pos]).unwrap_or(""))
1274                            } else {
1275                                Cow::Borrowed("")
1276                            }
1277                        } else {
1278                            Cow::Borrowed("")
1279                        };
1280
1281                        // For reference links, use the actual reference ID from pulldown-cmark
1282                        let reference_id = if is_reference && !ref_id.is_empty() {
1283                            Some(Cow::Owned(ref_id.to_lowercase()))
1284                        } else if is_reference {
1285                            // For collapsed/shortcut references without explicit ID, use the link text
1286                            Some(Cow::Owned(link_text.to_lowercase()))
1287                        } else {
1288                            None
1289                        };
1290
1291                        // WORKAROUND: pulldown-cmark bug with escaped brackets
1292                        // Check for escaped image syntax: \![text](url)
1293                        // The byte_offset points to the '[', so we check 2 bytes back for '\!'
1294                        let has_escaped_bang = start_pos >= 2
1295                            && content.as_bytes().get(start_pos - 2) == Some(&b'\\')
1296                            && content.as_bytes().get(start_pos - 1) == Some(&b'!');
1297
1298                        // Check for escaped bracket: \[text](url)
1299                        // The byte_offset points to the '[', so we check 1 byte back for '\'
1300                        let has_escaped_bracket =
1301                            start_pos >= 1 && content.as_bytes().get(start_pos - 1) == Some(&b'\\');
1302
1303                        if has_escaped_bang || has_escaped_bracket {
1304                            text_chunks.clear();
1305                            continue; // Skip: this is escaped markdown, not a real link
1306                        }
1307
1308                        // Track this position as found
1309                        found_positions.insert(start_pos);
1310
1311                        links.push(ParsedLink {
1312                            line: line_num,
1313                            start_col: col_start,
1314                            end_col: col_end,
1315                            byte_offset: start_pos,
1316                            byte_end: range.end,
1317                            text: link_text,
1318                            url: Cow::Owned(url.to_string()),
1319                            is_reference,
1320                            reference_id,
1321                            link_type,
1322                        });
1323
1324                        text_chunks.clear();
1325                    }
1326                }
1327                Event::FootnoteReference(footnote_id) => {
1328                    // Capture footnote references like [^1], [^note]
1329                    // Skip if in HTML comment
1330                    if is_in_html_comment_ranges(html_comment_ranges, range.start) {
1331                        continue;
1332                    }
1333
1334                    let (_, line_num, _) = Self::find_line_for_offset(lines, range.start);
1335                    footnote_refs.push(FootnoteRef {
1336                        id: footnote_id.to_string(),
1337                        line: line_num,
1338                        byte_offset: range.start,
1339                        byte_end: range.end,
1340                    });
1341                }
1342                _ => {}
1343            }
1344        }
1345
1346        // Also find undefined references using regex
1347        // These are patterns like [text][ref] that pulldown-cmark didn't parse as links
1348        // because the reference is undefined
1349        for cap in LINK_PATTERN.captures_iter(content) {
1350            let full_match = cap.get(0).unwrap();
1351            let match_start = full_match.start();
1352            let match_end = full_match.end();
1353
1354            // Skip if this was already found by pulldown-cmark (it's a valid link)
1355            if found_positions.contains(&match_start) {
1356                continue;
1357            }
1358
1359            // Skip if escaped
1360            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1361                continue;
1362            }
1363
1364            // Skip if it's an image
1365            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
1366                continue;
1367            }
1368
1369            // Skip if in code block
1370            if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
1371                continue;
1372            }
1373
1374            // Skip if in code span
1375            if Self::is_offset_in_code_span(code_spans, match_start) {
1376                continue;
1377            }
1378
1379            // Skip if in HTML comment
1380            if is_in_html_comment_ranges(html_comment_ranges, match_start) {
1381                continue;
1382            }
1383
1384            // Find line and column information
1385            let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1386
1387            // Skip if this link is on a MkDocs snippet line
1388            if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1389                continue;
1390            }
1391
1392            let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1393
1394            let text = cap.get(1).map_or("", |m| m.as_str());
1395
1396            // Only process reference links (group 6)
1397            if let Some(ref_id) = cap.get(6) {
1398                let ref_id_str = ref_id.as_str();
1399                let normalized_ref = if ref_id_str.is_empty() {
1400                    Cow::Owned(text.to_lowercase()) // Implicit reference
1401                } else {
1402                    Cow::Owned(ref_id_str.to_lowercase())
1403                };
1404
1405                // This is an undefined reference (pulldown-cmark didn't parse it)
1406                links.push(ParsedLink {
1407                    line: line_num,
1408                    start_col: col_start,
1409                    end_col: col_end,
1410                    byte_offset: match_start,
1411                    byte_end: match_end,
1412                    text: Cow::Borrowed(text),
1413                    url: Cow::Borrowed(""), // Empty URL indicates undefined reference
1414                    is_reference: true,
1415                    reference_id: Some(normalized_ref),
1416                    link_type: LinkType::Reference, // Undefined references are reference-style
1417                });
1418            }
1419        }
1420
1421        (links, broken_links, footnote_refs)
1422    }
1423
1424    /// Parse all images in the content
1425    fn parse_images(
1426        content: &'a str,
1427        lines: &[LineInfo],
1428        code_blocks: &[(usize, usize)],
1429        code_spans: &[CodeSpan],
1430        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1431    ) -> Vec<ParsedImage<'a>> {
1432        use crate::utils::skip_context::is_in_html_comment_ranges;
1433        use std::collections::HashSet;
1434
1435        // Pre-size based on a heuristic: images are less common than links
1436        let mut images = Vec::with_capacity(content.len() / 1000);
1437        let mut found_positions = HashSet::new();
1438
1439        // Use pulldown-cmark for parsing - more accurate and faster
1440        let parser = Parser::new(content).into_offset_iter();
1441        let mut image_stack: Vec<(usize, pulldown_cmark::CowStr<'a>, LinkType, pulldown_cmark::CowStr<'a>)> =
1442            Vec::new();
1443        let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1444
1445        for (event, range) in parser {
1446            match event {
1447                Event::Start(Tag::Image {
1448                    link_type,
1449                    dest_url,
1450                    id,
1451                    ..
1452                }) => {
1453                    image_stack.push((range.start, dest_url, link_type, id));
1454                    text_chunks.clear();
1455                }
1456                Event::Text(text) if !image_stack.is_empty() => {
1457                    text_chunks.push((text.to_string(), range.start, range.end));
1458                }
1459                Event::Code(code) if !image_stack.is_empty() => {
1460                    let code_text = format!("`{code}`");
1461                    text_chunks.push((code_text, range.start, range.end));
1462                }
1463                Event::End(TagEnd::Image) => {
1464                    if let Some((start_pos, url, link_type, ref_id)) = image_stack.pop() {
1465                        // Skip if in code block
1466                        if CodeBlockUtils::is_in_code_block(code_blocks, start_pos) {
1467                            continue;
1468                        }
1469
1470                        // Skip if in code span
1471                        if Self::is_offset_in_code_span(code_spans, start_pos) {
1472                            continue;
1473                        }
1474
1475                        // Skip if in HTML comment
1476                        if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1477                            continue;
1478                        }
1479
1480                        // Find line and column using binary search
1481                        let (_, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1482                        let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1483
1484                        let is_reference = matches!(
1485                            link_type,
1486                            LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1487                        );
1488
1489                        // Extract alt text directly from source bytes to preserve escaping
1490                        // Text events from pulldown-cmark unescape \] → ], which breaks rules that need escaping
1491                        let alt_text = if start_pos < content.len() {
1492                            let image_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1493
1494                            // Find MATCHING ] by tracking bracket depth for nested brackets
1495                            // An unescaped bracket is one NOT preceded by an odd number of backslashes
1496                            let mut close_pos = None;
1497                            let mut depth = 0;
1498
1499                            if image_bytes.len() > 2 {
1500                                for (i, &byte) in image_bytes.iter().enumerate().skip(2) {
1501                                    // Count preceding backslashes
1502                                    let mut backslash_count = 0;
1503                                    let mut j = i;
1504                                    while j > 0 && image_bytes[j - 1] == b'\\' {
1505                                        backslash_count += 1;
1506                                        j -= 1;
1507                                    }
1508                                    let is_escaped = backslash_count % 2 != 0;
1509
1510                                    if !is_escaped {
1511                                        if byte == b'[' {
1512                                            depth += 1;
1513                                        } else if byte == b']' {
1514                                            if depth == 0 {
1515                                                // Found the matching closing bracket
1516                                                close_pos = Some(i);
1517                                                break;
1518                                            } else {
1519                                                depth -= 1;
1520                                            }
1521                                        }
1522                                    }
1523                                }
1524                            }
1525
1526                            if let Some(pos) = close_pos {
1527                                Cow::Borrowed(std::str::from_utf8(&image_bytes[2..pos]).unwrap_or(""))
1528                            } else {
1529                                Cow::Borrowed("")
1530                            }
1531                        } else {
1532                            Cow::Borrowed("")
1533                        };
1534
1535                        let reference_id = if is_reference && !ref_id.is_empty() {
1536                            Some(Cow::Owned(ref_id.to_lowercase()))
1537                        } else if is_reference {
1538                            Some(Cow::Owned(alt_text.to_lowercase())) // Collapsed/shortcut references
1539                        } else {
1540                            None
1541                        };
1542
1543                        found_positions.insert(start_pos);
1544                        images.push(ParsedImage {
1545                            line: line_num,
1546                            start_col: col_start,
1547                            end_col: col_end,
1548                            byte_offset: start_pos,
1549                            byte_end: range.end,
1550                            alt_text,
1551                            url: Cow::Owned(url.to_string()),
1552                            is_reference,
1553                            reference_id,
1554                            link_type,
1555                        });
1556                    }
1557                }
1558                _ => {}
1559            }
1560        }
1561
1562        // Regex fallback for undefined references that pulldown-cmark treats as plain text
1563        for cap in IMAGE_PATTERN.captures_iter(content) {
1564            let full_match = cap.get(0).unwrap();
1565            let match_start = full_match.start();
1566            let match_end = full_match.end();
1567
1568            // Skip if already found by pulldown-cmark
1569            if found_positions.contains(&match_start) {
1570                continue;
1571            }
1572
1573            // Skip if the ! is escaped
1574            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1575                continue;
1576            }
1577
1578            // Skip if in code block, code span, or HTML comment
1579            if CodeBlockUtils::is_in_code_block(code_blocks, match_start)
1580                || Self::is_offset_in_code_span(code_spans, match_start)
1581                || is_in_html_comment_ranges(html_comment_ranges, match_start)
1582            {
1583                continue;
1584            }
1585
1586            // Only process reference images (undefined references not found by pulldown-cmark)
1587            if let Some(ref_id) = cap.get(6) {
1588                let (_, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1589                let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1590                let alt_text = cap.get(1).map_or("", |m| m.as_str());
1591                let ref_id_str = ref_id.as_str();
1592                let normalized_ref = if ref_id_str.is_empty() {
1593                    Cow::Owned(alt_text.to_lowercase())
1594                } else {
1595                    Cow::Owned(ref_id_str.to_lowercase())
1596                };
1597
1598                images.push(ParsedImage {
1599                    line: line_num,
1600                    start_col: col_start,
1601                    end_col: col_end,
1602                    byte_offset: match_start,
1603                    byte_end: match_end,
1604                    alt_text: Cow::Borrowed(alt_text),
1605                    url: Cow::Borrowed(""),
1606                    is_reference: true,
1607                    reference_id: Some(normalized_ref),
1608                    link_type: LinkType::Reference, // Undefined references are reference-style
1609                });
1610            }
1611        }
1612
1613        images
1614    }
1615
1616    /// Parse reference definitions
1617    fn parse_reference_defs(content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
1618        // Pre-size based on lines count as reference definitions are line-based
1619        let mut refs = Vec::with_capacity(lines.len() / 20); // ~1 ref per 20 lines
1620
1621        for (line_idx, line_info) in lines.iter().enumerate() {
1622            // Skip lines in code blocks
1623            if line_info.in_code_block {
1624                continue;
1625            }
1626
1627            let line = line_info.content(content);
1628            let line_num = line_idx + 1;
1629
1630            if let Some(cap) = REF_DEF_PATTERN.captures(line) {
1631                let id = cap.get(1).unwrap().as_str().to_lowercase();
1632                let url = cap.get(2).unwrap().as_str().to_string();
1633                let title_match = cap.get(3).or_else(|| cap.get(4));
1634                let title = title_match.map(|m| m.as_str().to_string());
1635
1636                // Calculate byte positions
1637                // The match starts at the beginning of the line (0) and extends to the end
1638                let match_obj = cap.get(0).unwrap();
1639                let byte_offset = line_info.byte_offset + match_obj.start();
1640                let byte_end = line_info.byte_offset + match_obj.end();
1641
1642                // Calculate title byte positions (includes the quote character before content)
1643                let (title_byte_start, title_byte_end) = if let Some(m) = title_match {
1644                    // The match is the content inside quotes, so we include the quote before
1645                    let start = line_info.byte_offset + m.start().saturating_sub(1);
1646                    let end = line_info.byte_offset + m.end() + 1; // Include closing quote
1647                    (Some(start), Some(end))
1648                } else {
1649                    (None, None)
1650                };
1651
1652                refs.push(ReferenceDef {
1653                    line: line_num,
1654                    id,
1655                    url,
1656                    title,
1657                    byte_offset,
1658                    byte_end,
1659                    title_byte_start,
1660                    title_byte_end,
1661                });
1662            }
1663        }
1664
1665        refs
1666    }
1667
/// Hand-written blockquote prefix scanner (used instead of a regex).
///
/// Consumes a run of `>` markers, each optionally preceded by whitespace and optionally
/// followed by a single space or tab, so nested quotes such as `> > > content` are
/// handled in one call.
///
/// Returns `Some((prefix, rest))`, where `prefix` is the leading slice of `line` covering
/// every consumed marker and whitespace byte and `rest` is what follows it. Returns
/// `None` when the first non-whitespace character of `line` is not `>`.
#[inline]
fn parse_blockquote_prefix(line: &str) -> Option<(&str, &str)> {
    let mut rest = line;
    let mut prefix_len = 0;

    // Each pass consumes: leading whitespace, one '>', and at most one space/tab.
    while let Some(after_marker) = rest.trim_start().strip_prefix('>') {
        // Bytes dropped so far in this pass = leading whitespace + the '>' itself.
        prefix_len += rest.len() - after_marker.len();

        rest = match after_marker.strip_prefix(|c: char| c == ' ' || c == '\t') {
            Some(tail) => {
                prefix_len += 1;
                tail
            }
            None => after_marker,
        };
    }

    // Nothing consumed means the line never started with a '>' marker.
    if prefix_len == 0 {
        None
    } else {
        Some((&line[..prefix_len], rest))
    }
}
1708
/// Hand-written unordered list marker scanner (used instead of a regex).
///
/// Accepts: optional leading spaces/tabs, one of `-`, `*` or `+`, then any run of
/// spaces/tabs. Only space and tab count as whitespace here.
///
/// Returns `Some((leading_ws, marker, spacing, content))`, or `None` when the first
/// character after the leading whitespace is not a list marker (or the line ends
/// there). `spacing` may be empty; the caller decides whether that is acceptable.
#[inline]
fn parse_unordered_list(line: &str) -> Option<(&str, char, &str, &str)> {
    let is_blank = |c: char| c == ' ' || c == '\t';

    let after_indent = line.trim_start_matches(is_blank);
    let leading_ws = &line[..line.len() - after_indent.len()];

    let marker = after_indent
        .chars()
        .next()
        .filter(|c| matches!(c, '-' | '*' | '+'))?;

    // The marker is ASCII, so skipping exactly one byte stays on a char boundary.
    let after_marker = &after_indent[1..];
    let content = after_marker.trim_start_matches(is_blank);
    let spacing = &after_marker[..after_marker.len() - content.len()];

    Some((leading_ws, marker, spacing, content))
}
1741
/// Hand-written ordered list marker scanner (used instead of a regex).
///
/// Accepts: optional leading spaces/tabs, one or more ASCII digits, a `.` or `)`
/// delimiter, then any run of spaces/tabs. Only space and tab count as whitespace here.
///
/// Returns `Some((leading_ws, number_str, delimiter, spacing, content))`, or `None`
/// when there are no digits or the digits are not followed by a delimiter. `spacing`
/// may be empty; the caller decides whether that is acceptable.
#[inline]
fn parse_ordered_list(line: &str) -> Option<(&str, &str, char, &str, &str)> {
    let is_blank = |c: char| c == ' ' || c == '\t';

    let after_indent = line.trim_start_matches(is_blank);
    let leading_ws = &line[..line.len() - after_indent.len()];

    let after_digits = after_indent.trim_start_matches(|c: char| c.is_ascii_digit());
    let number_str = &after_indent[..after_indent.len() - after_digits.len()];
    if number_str.is_empty() {
        return None;
    }

    let delimiter = after_digits
        .chars()
        .next()
        .filter(|c| matches!(c, '.' | ')'))?;

    // The delimiter is ASCII, so skipping exactly one byte stays on a char boundary.
    let after_delimiter = &after_digits[1..];
    let content = after_delimiter.trim_start_matches(is_blank);
    let spacing = &after_delimiter[..after_delimiter.len() - content.len()];

    Some((leading_ws, number_str, delimiter, spacing, content))
}
1789
/// Build a per-line flag vector telling whether each line lies inside a code block.
///
/// `line_offsets` holds the starting byte offset of every line and must be sorted
/// ascending, because line lookup uses binary search (`partition_point`) once per block
/// boundary. `code_blocks` holds `(start, end)` byte ranges; a range is trusted as-is
/// and is not re-validated here.
///
/// Returns a `Vec<bool>` with one entry per element of `line_offsets`; entry `i` is
/// `true` when line `i` is covered by at least one code block range.
fn compute_code_block_line_map(content: &str, line_offsets: &[usize], code_blocks: &[(usize, usize)]) -> Vec<bool> {
    let total_len = content.len();
    let mut flags = vec![false; line_offsets.len()];

    for &(start, end) in code_blocks {
        // Snap the start backwards to the nearest UTF-8 character boundary.
        let mut lo = start;
        while lo > 0 && !content.is_char_boundary(lo) {
            lo -= 1;
        }

        // Clamp the end to the content length, then snap forwards to a boundary.
        let mut hi = end.min(total_len);
        while hi < total_len && !content.is_char_boundary(hi) {
            hi += 1;
        }

        // The line containing `lo` is the last one starting at or before it:
        // partition_point yields the first line starting after `lo`, so step back one.
        let first = line_offsets.partition_point(|&off| off <= lo).saturating_sub(1);
        // Exclusive upper bound: the first line that starts at or after `hi`.
        let past_last = line_offsets.partition_point(|&off| off < hi);

        // An inverted range yields first >= past_last and marks nothing.
        if first < past_last {
            flags[first..past_last].fill(true);
        }
    }

    flags
}
1849
1850    /// Pre-compute basic line information (without headings/blockquotes)
1851    fn compute_basic_line_info(
1852        content: &str,
1853        line_offsets: &[usize],
1854        code_blocks: &[(usize, usize)],
1855        flavor: MarkdownFlavor,
1856        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1857        autodoc_ranges: &[crate::utils::skip_context::ByteRange],
1858    ) -> Vec<LineInfo> {
1859        let content_lines: Vec<&str> = content.lines().collect();
1860        let mut lines = Vec::with_capacity(content_lines.len());
1861
1862        // Pre-compute which lines are in code blocks
1863        let code_block_map = Self::compute_code_block_line_map(content, line_offsets, code_blocks);
1864
1865        // Detect front matter boundaries FIRST, before any other parsing
1866        // Use FrontMatterUtils to detect all types of front matter (YAML, TOML, JSON, malformed)
1867        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1868
1869        for (i, line) in content_lines.iter().enumerate() {
1870            let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
1871            let indent = line.len() - line.trim_start().len();
1872
1873            // Parse blockquote prefix once and reuse it (avoid redundant parsing)
1874            let blockquote_parse = Self::parse_blockquote_prefix(line);
1875
1876            // For blank detection, consider blockquote context
1877            let is_blank = if let Some((_, content)) = blockquote_parse {
1878                // In blockquote context, check if content after prefix is blank
1879                content.trim().is_empty()
1880            } else {
1881                line.trim().is_empty()
1882            };
1883
1884            // Use pre-computed map for O(1) lookup instead of O(m) iteration
1885            let in_code_block = code_block_map.get(i).copied().unwrap_or(false);
1886
1887            // Detect list items (skip if in frontmatter, in mkdocstrings block, or in HTML comment)
1888            let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
1889                && crate::utils::mkdocstrings_refs::is_within_autodoc_block_ranges(autodoc_ranges, byte_offset);
1890            // Check if the ENTIRE line is within an HTML comment (not just the line start)
1891            // This ensures content after `-->` on the same line is not incorrectly skipped
1892            let line_end_offset = byte_offset + line.len();
1893            let in_html_comment = crate::utils::skip_context::is_line_entirely_in_html_comment(
1894                html_comment_ranges,
1895                byte_offset,
1896                line_end_offset,
1897            );
1898            let list_item = if !(in_code_block
1899                || is_blank
1900                || in_mkdocstrings
1901                || in_html_comment
1902                || (front_matter_end > 0 && i < front_matter_end))
1903            {
1904                // Strip blockquote prefix if present for list detection (reuse cached result)
1905                let (line_for_list_check, blockquote_prefix_len) = if let Some((prefix, content)) = blockquote_parse {
1906                    (content, prefix.len())
1907                } else {
1908                    (&**line, 0)
1909                };
1910
1911                if let Some((leading_spaces, marker, spacing, _content)) =
1912                    Self::parse_unordered_list(line_for_list_check)
1913                {
1914                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1915                    let content_column = marker_column + 1 + spacing.len();
1916
1917                    // According to CommonMark spec, unordered list items MUST have at least one space
1918                    // after the marker (-, *, or +). Without a space, it's not a list item.
1919                    // This also naturally handles cases like:
1920                    // - *emphasis* (not a list)
1921                    // - **bold** (not a list)
1922                    // - --- (horizontal rule, not a list)
1923                    if spacing.is_empty() {
1924                        None
1925                    } else {
1926                        Some(ListItemInfo {
1927                            marker: marker.to_string(),
1928                            is_ordered: false,
1929                            number: None,
1930                            marker_column,
1931                            content_column,
1932                        })
1933                    }
1934                } else if let Some((leading_spaces, number_str, delimiter, spacing, _content)) =
1935                    Self::parse_ordered_list(line_for_list_check)
1936                {
1937                    let marker = format!("{number_str}{delimiter}");
1938                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1939                    let content_column = marker_column + marker.len() + spacing.len();
1940
1941                    // According to CommonMark spec, ordered list items MUST have at least one space
1942                    // after the marker (period or parenthesis). Without a space, it's not a list item.
1943                    if spacing.is_empty() {
1944                        None
1945                    } else {
1946                        Some(ListItemInfo {
1947                            marker,
1948                            is_ordered: true,
1949                            number: number_str.parse().ok(),
1950                            marker_column,
1951                            content_column,
1952                        })
1953                    }
1954                } else {
1955                    None
1956                }
1957            } else {
1958                None
1959            };
1960
1961            lines.push(LineInfo {
1962                byte_offset,
1963                byte_len: line.len(),
1964                indent,
1965                is_blank,
1966                in_code_block,
1967                in_front_matter: front_matter_end > 0 && i < front_matter_end,
1968                in_html_block: false, // Will be populated after line creation
1969                in_html_comment,
1970                list_item,
1971                heading: None,    // Will be populated in second pass for Setext headings
1972                blockquote: None, // Will be populated after line creation
1973                in_mkdocstrings,
1974                in_esm_block: false, // Will be populated after line creation for MDX files
1975                in_code_span_continuation: false, // Will be populated after code spans are parsed
1976            });
1977        }
1978
1979        lines
1980    }
1981
1982    /// Detect headings and blockquotes (called after HTML block detection)
1983    fn detect_headings_and_blockquotes(
1984        content: &str,
1985        lines: &mut [LineInfo],
1986        flavor: MarkdownFlavor,
1987        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1988        link_byte_ranges: &[(usize, usize)],
1989    ) {
1990        // Regex for heading detection
1991        static ATX_HEADING_REGEX: LazyLock<regex::Regex> =
1992            LazyLock::new(|| regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap());
1993        static SETEXT_UNDERLINE_REGEX: LazyLock<regex::Regex> =
1994            LazyLock::new(|| regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap());
1995
1996        let content_lines: Vec<&str> = content.lines().collect();
1997
1998        // Detect front matter boundaries to skip those lines
1999        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
2000
2001        // Detect headings (including Setext which needs look-ahead) and blockquotes
2002        for i in 0..lines.len() {
2003            if lines[i].in_code_block {
2004                continue;
2005            }
2006
2007            // Skip lines in front matter
2008            if front_matter_end > 0 && i < front_matter_end {
2009                continue;
2010            }
2011
2012            // Skip lines in HTML blocks - HTML content should not be parsed as markdown
2013            if lines[i].in_html_block {
2014                continue;
2015            }
2016
2017            let line = content_lines[i];
2018
2019            // Check for blockquotes (even on blank lines within blockquotes)
2020            if let Some(bq) = parse_blockquote_detailed(line) {
2021                let nesting_level = bq.markers.len(); // Each '>' is one level
2022                let marker_column = bq.indent.len();
2023
2024                // Build the prefix (indentation + markers + space)
2025                let prefix = format!("{}{}{}", bq.indent, bq.markers, bq.spaces_after);
2026
2027                // Check for various blockquote issues
2028                let has_no_space = bq.spaces_after.is_empty() && !bq.content.is_empty();
2029                // Only flag multiple literal spaces, not tabs
2030                // Tabs are handled by MD010 (no-hard-tabs), matching markdownlint behavior
2031                let has_multiple_spaces = bq.spaces_after.chars().filter(|&c| c == ' ').count() > 1;
2032
2033                // Check if needs MD028 fix (empty blockquote line without proper spacing)
2034                // MD028 flags empty blockquote lines that don't have a single space after the marker
2035                // Lines like "> " or ">> " are already correct and don't need fixing
2036                let needs_md028_fix = bq.content.is_empty() && bq.spaces_after.is_empty();
2037
2038                lines[i].blockquote = Some(BlockquoteInfo {
2039                    nesting_level,
2040                    indent: bq.indent.to_string(),
2041                    marker_column,
2042                    prefix,
2043                    content: bq.content.to_string(),
2044                    has_no_space_after_marker: has_no_space,
2045                    has_multiple_spaces_after_marker: has_multiple_spaces,
2046                    needs_md028_fix,
2047                });
2048            }
2049
2050            // Skip heading detection for blank lines
2051            if lines[i].is_blank {
2052                continue;
2053            }
2054
2055            // Check for ATX headings (but skip MkDocs snippet lines)
2056            // In MkDocs flavor, lines like "# -8<- [start:name]" are snippet markers, not headings
2057            let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
2058                crate::utils::mkdocs_snippets::is_snippet_section_start(line)
2059                    || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
2060            } else {
2061                false
2062            };
2063
2064            if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
2065                // Skip headings inside HTML comments (using pre-computed ranges for efficiency)
2066                if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset) {
2067                    continue;
2068                }
2069                // Skip lines that fall within link syntax (e.g., multiline links like `[text](url\n#fragment)`)
2070                // This prevents false positives where `#fragment` is detected as a heading
2071                let line_offset = lines[i].byte_offset;
2072                if link_byte_ranges
2073                    .iter()
2074                    .any(|&(start, end)| line_offset > start && line_offset < end)
2075                {
2076                    continue;
2077                }
2078                let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
2079                let hashes = caps.get(2).map_or("", |m| m.as_str());
2080                let spaces_after = caps.get(3).map_or("", |m| m.as_str());
2081                let rest = caps.get(4).map_or("", |m| m.as_str());
2082
2083                let level = hashes.len() as u8;
2084                let marker_column = leading_spaces.len();
2085
2086                // Check for closing sequence, but handle custom IDs that might come after
2087                let (text, has_closing, closing_seq) = {
2088                    // First check if there's a custom ID at the end
2089                    let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
2090                        // Check if this looks like a valid custom ID (ends with })
2091                        if rest[id_start..].trim_end().ends_with('}') {
2092                            // Split off the custom ID
2093                            (&rest[..id_start], &rest[id_start..])
2094                        } else {
2095                            (rest, "")
2096                        }
2097                    } else {
2098                        (rest, "")
2099                    };
2100
2101                    // Now look for closing hashes in the part before the custom ID
2102                    let trimmed_rest = rest_without_id.trim_end();
2103                    if let Some(last_hash_byte_pos) = trimmed_rest.rfind('#') {
2104                        // Find the start of the hash sequence by walking backwards
2105                        // Use char_indices to get byte positions at char boundaries
2106                        let char_positions: Vec<(usize, char)> = trimmed_rest.char_indices().collect();
2107
2108                        // Find which char index corresponds to last_hash_byte_pos
2109                        let last_hash_char_idx = char_positions
2110                            .iter()
2111                            .position(|(byte_pos, _)| *byte_pos == last_hash_byte_pos);
2112
2113                        if let Some(mut char_idx) = last_hash_char_idx {
2114                            // Walk backwards to find start of hash sequence
2115                            while char_idx > 0 && char_positions[char_idx - 1].1 == '#' {
2116                                char_idx -= 1;
2117                            }
2118
2119                            // Get the byte position of the start of hashes
2120                            let start_of_hashes = char_positions[char_idx].0;
2121
2122                            // Check if there's at least one space before the closing hashes
2123                            let has_space_before = char_idx == 0 || char_positions[char_idx - 1].1.is_whitespace();
2124
2125                            // Check if this is a valid closing sequence (all hashes to end of trimmed part)
2126                            let potential_closing = &trimmed_rest[start_of_hashes..];
2127                            let is_all_hashes = potential_closing.chars().all(|c| c == '#');
2128
2129                            if is_all_hashes && has_space_before {
2130                                // This is a closing sequence
2131                                let closing_hashes = potential_closing.to_string();
2132                                // The text is everything before the closing hashes
2133                                // Don't include the custom ID here - it will be extracted later
2134                                let text_part = if !custom_id_part.is_empty() {
2135                                    // If we have a custom ID, append it back to get the full rest
2136                                    // This allows the extract_header_id function to handle it properly
2137                                    format!("{}{}", trimmed_rest[..start_of_hashes].trim_end(), custom_id_part)
2138                                } else {
2139                                    trimmed_rest[..start_of_hashes].trim_end().to_string()
2140                                };
2141                                (text_part, true, closing_hashes)
2142                            } else {
2143                                // Not a valid closing sequence, return the full content
2144                                (rest.to_string(), false, String::new())
2145                            }
2146                        } else {
2147                            // Couldn't find char boundary, return the full content
2148                            (rest.to_string(), false, String::new())
2149                        }
2150                    } else {
2151                        // No hashes found, return the full content
2152                        (rest.to_string(), false, String::new())
2153                    }
2154                };
2155
2156                let content_column = marker_column + hashes.len() + spaces_after.len();
2157
2158                // Extract custom header ID if present
2159                let raw_text = text.trim().to_string();
2160                let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
2161
2162                // If no custom ID was found on the header line, check the next line for standalone attr-list
2163                if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
2164                    let next_line = content_lines[i + 1];
2165                    if !lines[i + 1].in_code_block
2166                        && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
2167                        && let Some(next_line_id) =
2168                            crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
2169                    {
2170                        custom_id = Some(next_line_id);
2171                    }
2172                }
2173
2174                // ATX heading is "valid" for processing by heading rules if:
2175                // 1. Has space after # (CommonMark compliant): `# Heading`
2176                // 2. Is empty (just hashes): `#`
2177                // 3. Has multiple hashes (##intro is likely intended heading, not hashtag)
2178                // 4. Content starts with uppercase (likely intended heading, not social hashtag)
2179                //
2180                // Invalid patterns (hashtag-like) are skipped by most heading rules:
2181                // - `#tag` - single # with lowercase (social hashtag)
2182                // - `#123` - single # with number (GitHub issue ref)
2183                let is_valid = !spaces_after.is_empty()
2184                    || rest.is_empty()
2185                    || level > 1
2186                    || rest.trim().chars().next().is_some_and(|c| c.is_uppercase());
2187
2188                lines[i].heading = Some(HeadingInfo {
2189                    level,
2190                    style: HeadingStyle::ATX,
2191                    marker: hashes.to_string(),
2192                    marker_column,
2193                    content_column,
2194                    text: clean_text,
2195                    custom_id,
2196                    raw_text,
2197                    has_closing_sequence: has_closing,
2198                    closing_sequence: closing_seq,
2199                    is_valid,
2200                });
2201            }
2202            // Check for Setext headings (need to look at next line)
2203            else if i + 1 < content_lines.len() && i + 1 < lines.len() {
2204                let next_line = content_lines[i + 1];
2205                if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
2206                    // Skip if next line is front matter delimiter
2207                    if front_matter_end > 0 && i < front_matter_end {
2208                        continue;
2209                    }
2210
2211                    // Skip Setext headings inside HTML comments (using pre-computed ranges for efficiency)
2212                    if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset)
2213                    {
2214                        continue;
2215                    }
2216
2217                    let underline = next_line.trim();
2218
2219                    let level = if underline.starts_with('=') { 1 } else { 2 };
2220                    let style = if level == 1 {
2221                        HeadingStyle::Setext1
2222                    } else {
2223                        HeadingStyle::Setext2
2224                    };
2225
2226                    // Extract custom header ID if present
2227                    let raw_text = line.trim().to_string();
2228                    let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
2229
2230                    // If no custom ID was found on the header line, check the line after underline for standalone attr-list
2231                    if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
2232                        let attr_line = content_lines[i + 2];
2233                        if !lines[i + 2].in_code_block
2234                            && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
2235                            && let Some(attr_line_id) =
2236                                crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
2237                        {
2238                            custom_id = Some(attr_line_id);
2239                        }
2240                    }
2241
2242                    lines[i].heading = Some(HeadingInfo {
2243                        level,
2244                        style,
2245                        marker: underline.to_string(),
2246                        marker_column: next_line.len() - next_line.trim_start().len(),
2247                        content_column: lines[i].indent,
2248                        text: clean_text,
2249                        custom_id,
2250                        raw_text,
2251                        has_closing_sequence: false,
2252                        closing_sequence: String::new(),
2253                        is_valid: true, // Setext headings are always valid
2254                    });
2255                }
2256            }
2257        }
2258    }
2259
2260    /// Detect HTML blocks in the content
2261    fn detect_html_blocks(content: &str, lines: &mut [LineInfo]) {
2262        // HTML block elements that trigger block context
2263        // Includes HTML5 media, embedded content, and interactive elements
2264        const BLOCK_ELEMENTS: &[&str] = &[
2265            "address",
2266            "article",
2267            "aside",
2268            "audio",
2269            "blockquote",
2270            "canvas",
2271            "details",
2272            "dialog",
2273            "dd",
2274            "div",
2275            "dl",
2276            "dt",
2277            "embed",
2278            "fieldset",
2279            "figcaption",
2280            "figure",
2281            "footer",
2282            "form",
2283            "h1",
2284            "h2",
2285            "h3",
2286            "h4",
2287            "h5",
2288            "h6",
2289            "header",
2290            "hr",
2291            "iframe",
2292            "li",
2293            "main",
2294            "menu",
2295            "nav",
2296            "noscript",
2297            "object",
2298            "ol",
2299            "p",
2300            "picture",
2301            "pre",
2302            "script",
2303            "search",
2304            "section",
2305            "source",
2306            "style",
2307            "summary",
2308            "svg",
2309            "table",
2310            "tbody",
2311            "td",
2312            "template",
2313            "textarea",
2314            "tfoot",
2315            "th",
2316            "thead",
2317            "tr",
2318            "track",
2319            "ul",
2320            "video",
2321        ];
2322
2323        let mut i = 0;
2324        while i < lines.len() {
2325            // Skip if already in code block or front matter
2326            if lines[i].in_code_block || lines[i].in_front_matter {
2327                i += 1;
2328                continue;
2329            }
2330
2331            let trimmed = lines[i].content(content).trim_start();
2332
2333            // Check if line starts with an HTML tag
2334            if trimmed.starts_with('<') && trimmed.len() > 1 {
2335                // Extract tag name safely
2336                let after_bracket = &trimmed[1..];
2337                let is_closing = after_bracket.starts_with('/');
2338                let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
2339
2340                // Extract tag name (stop at space, >, /, or end of string)
2341                let tag_name = tag_start
2342                    .chars()
2343                    .take_while(|c| c.is_ascii_alphabetic() || *c == '-' || c.is_ascii_digit())
2344                    .collect::<String>()
2345                    .to_lowercase();
2346
2347                // Check if it's a block element
2348                if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
2349                    // Mark this line as in HTML block
2350                    lines[i].in_html_block = true;
2351
2352                    // For simplicity, just mark lines until we find a closing tag or reach a blank line
2353                    // This avoids complex nesting logic that might cause infinite loops
2354                    if !is_closing {
2355                        let closing_tag = format!("</{tag_name}>");
2356                        // style and script tags can contain blank lines (CSS/JS formatting)
2357                        let allow_blank_lines = tag_name == "style" || tag_name == "script";
2358                        let mut j = i + 1;
2359                        while j < lines.len() && j < i + 100 {
2360                            // Limit search to 100 lines
2361                            // Stop at blank lines (except for style/script tags)
2362                            if !allow_blank_lines && lines[j].is_blank {
2363                                break;
2364                            }
2365
2366                            lines[j].in_html_block = true;
2367
2368                            // Check if this line contains the closing tag
2369                            if lines[j].content(content).contains(&closing_tag) {
2370                                break;
2371                            }
2372                            j += 1;
2373                        }
2374                    }
2375                }
2376            }
2377
2378            i += 1;
2379        }
2380    }
2381
2382    /// Detect ESM import/export blocks in MDX files
2383    /// ESM blocks consist of contiguous import/export statements at the top of the file
2384    fn detect_esm_blocks(content: &str, lines: &mut [LineInfo], flavor: MarkdownFlavor) {
2385        // Only process MDX files
2386        if !flavor.supports_esm_blocks() {
2387            return;
2388        }
2389
2390        let mut in_multiline_comment = false;
2391
2392        for line in lines.iter_mut() {
2393            // Skip blank lines and HTML comments
2394            if line.is_blank || line.in_html_comment {
2395                continue;
2396            }
2397
2398            let trimmed = line.content(content).trim_start();
2399
2400            // Handle continuation of multi-line JS comments
2401            if in_multiline_comment {
2402                if trimmed.contains("*/") {
2403                    in_multiline_comment = false;
2404                }
2405                continue;
2406            }
2407
2408            // Skip single-line JS comments (// and ///)
2409            if trimmed.starts_with("//") {
2410                continue;
2411            }
2412
2413            // Handle start of multi-line JS comment
2414            if trimmed.starts_with("/*") {
2415                if !trimmed.contains("*/") {
2416                    in_multiline_comment = true;
2417                }
2418                continue;
2419            }
2420
2421            // Check if line starts with import or export
2422            if trimmed.starts_with("import ") || trimmed.starts_with("export ") {
2423                line.in_esm_block = true;
2424            } else {
2425                // Once we hit a non-ESM, non-comment line, we're done with the ESM block
2426                break;
2427            }
2428        }
2429    }
2430
2431    /// Parse all inline code spans in the content using pulldown-cmark streaming parser
2432    fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
2433        let mut code_spans = Vec::new();
2434
2435        // Quick check - if no backticks, no code spans
2436        if !content.contains('`') {
2437            return code_spans;
2438        }
2439
2440        // Use pulldown-cmark's streaming parser with byte offsets
2441        let parser = Parser::new(content).into_offset_iter();
2442
2443        for (event, range) in parser {
2444            if let Event::Code(_) = event {
2445                let start_pos = range.start;
2446                let end_pos = range.end;
2447
2448                // The range includes the backticks, extract the actual content
2449                let full_span = &content[start_pos..end_pos];
2450                let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
2451
2452                // Extract content between backticks, preserving spaces
2453                let content_start = start_pos + backtick_count;
2454                let content_end = end_pos - backtick_count;
2455                let span_content = if content_start < content_end {
2456                    content[content_start..content_end].to_string()
2457                } else {
2458                    String::new()
2459                };
2460
2461                // Use binary search to find line number - O(log n) instead of O(n)
2462                // Find the rightmost line whose byte_offset <= start_pos
2463                let line_idx = lines
2464                    .partition_point(|line| line.byte_offset <= start_pos)
2465                    .saturating_sub(1);
2466                let line_num = line_idx + 1;
2467                let byte_col_start = start_pos - lines[line_idx].byte_offset;
2468
2469                // Find end column using binary search
2470                let end_line_idx = lines
2471                    .partition_point(|line| line.byte_offset <= end_pos)
2472                    .saturating_sub(1);
2473                let byte_col_end = end_pos - lines[end_line_idx].byte_offset;
2474
2475                // Convert byte offsets to character positions for correct Unicode handling
2476                // This ensures consistency with warning.column which uses character positions
2477                let line_content = lines[line_idx].content(content);
2478                let col_start = if byte_col_start <= line_content.len() {
2479                    line_content[..byte_col_start].chars().count()
2480                } else {
2481                    line_content.chars().count()
2482                };
2483
2484                let end_line_content = lines[end_line_idx].content(content);
2485                let col_end = if byte_col_end <= end_line_content.len() {
2486                    end_line_content[..byte_col_end].chars().count()
2487                } else {
2488                    end_line_content.chars().count()
2489                };
2490
2491                code_spans.push(CodeSpan {
2492                    line: line_num,
2493                    end_line: end_line_idx + 1,
2494                    start_col: col_start,
2495                    end_col: col_end,
2496                    byte_offset: start_pos,
2497                    byte_end: end_pos,
2498                    backtick_count,
2499                    content: span_content,
2500                });
2501            }
2502        }
2503
2504        // Sort by position to ensure consistent ordering
2505        code_spans.sort_by_key(|span| span.byte_offset);
2506
2507        code_spans
2508    }
2509
2510    /// Parse all list blocks in the content (legacy line-by-line approach)
2511    ///
2512    /// Uses a forward-scanning O(n) algorithm that tracks two variables during iteration:
2513    /// - `has_list_breaking_content_since_last_item`: Set when encountering content that
2514    ///   terminates a list (headings, horizontal rules, tables, insufficiently indented content)
2515    /// - `min_continuation_for_tracking`: Minimum indentation required for content to be
2516    ///   treated as list continuation (based on the list marker width)
2517    ///
2518    /// When a new list item is encountered, we check if list-breaking content was seen
2519    /// since the last item. If so, we start a new list block.
2520    fn parse_list_blocks(content: &str, lines: &[LineInfo]) -> Vec<ListBlock> {
2521        // Minimum indentation for unordered list continuation per CommonMark spec
2522        const UNORDERED_LIST_MIN_CONTINUATION_INDENT: usize = 2;
2523
2524        /// Initialize or reset the forward-scanning tracking state.
2525        /// This helper eliminates code duplication across three initialization sites.
2526        #[inline]
2527        fn reset_tracking_state(
2528            list_item: &ListItemInfo,
2529            has_list_breaking_content: &mut bool,
2530            min_continuation: &mut usize,
2531        ) {
2532            *has_list_breaking_content = false;
2533            let marker_width = if list_item.is_ordered {
2534                list_item.marker.len() + 1 // Ordered markers need space after period/paren
2535            } else {
2536                list_item.marker.len()
2537            };
2538            *min_continuation = if list_item.is_ordered {
2539                marker_width
2540            } else {
2541                UNORDERED_LIST_MIN_CONTINUATION_INDENT
2542            };
2543        }
2544
2545        // Pre-size based on lines that could be list items
2546        let mut list_blocks = Vec::with_capacity(lines.len() / 10); // Estimate ~10% of lines might start list blocks
2547        let mut current_block: Option<ListBlock> = None;
2548        let mut last_list_item_line = 0;
2549        let mut current_indent_level = 0;
2550        let mut last_marker_width = 0;
2551
2552        // Track list-breaking content since last item (fixes O(n²) bottleneck from issue #148)
2553        let mut has_list_breaking_content_since_last_item = false;
2554        let mut min_continuation_for_tracking = 0;
2555
2556        for (line_idx, line_info) in lines.iter().enumerate() {
2557            let line_num = line_idx + 1;
2558
2559            // Enhanced code block handling using Design #3's context analysis
2560            if line_info.in_code_block {
2561                if let Some(ref mut block) = current_block {
2562                    // Calculate minimum indentation for list continuation
2563                    let min_continuation_indent =
2564                        CodeBlockUtils::calculate_min_continuation_indent(content, lines, line_idx);
2565
2566                    // Analyze code block context using the three-tier classification
2567                    let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
2568
2569                    match context {
2570                        CodeBlockContext::Indented => {
2571                            // Code block is properly indented - continues the list
2572                            block.end_line = line_num;
2573                            continue;
2574                        }
2575                        CodeBlockContext::Standalone => {
2576                            // Code block separates lists - end current block
2577                            let completed_block = current_block.take().unwrap();
2578                            list_blocks.push(completed_block);
2579                            continue;
2580                        }
2581                        CodeBlockContext::Adjacent => {
2582                            // Edge case - use conservative behavior (continue list)
2583                            block.end_line = line_num;
2584                            continue;
2585                        }
2586                    }
2587                } else {
2588                    // No current list block - skip code block lines
2589                    continue;
2590                }
2591            }
2592
2593            // Extract blockquote prefix if any
2594            let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(line_info.content(content)) {
2595                caps.get(0).unwrap().as_str().to_string()
2596            } else {
2597                String::new()
2598            };
2599
2600            // Track list-breaking content for non-list, non-blank lines (O(n) replacement for nested loop)
2601            // Skip lines that are continuations of multi-line code spans - they're part of the previous list item
2602            if current_block.is_some()
2603                && line_info.list_item.is_none()
2604                && !line_info.is_blank
2605                && !line_info.in_code_span_continuation
2606            {
2607                let line_content = line_info.content(content).trim();
2608
2609                // Check for structural separators that break lists
2610                // Note: Lazy continuation (indent=0) is valid in CommonMark and should NOT break lists.
2611                // Only lines with indent between 1 and min_continuation_for_tracking-1 break lists,
2612                // as they indicate improper indentation rather than lazy continuation.
2613                let is_lazy_continuation = line_info.indent == 0 && !line_info.is_blank;
2614                let breaks_list = line_info.heading.is_some()
2615                    || line_content.starts_with("---")
2616                    || line_content.starts_with("***")
2617                    || line_content.starts_with("___")
2618                    || crate::utils::skip_context::is_table_line(line_content)
2619                    || line_content.starts_with(">")
2620                    || (line_info.indent > 0
2621                        && line_info.indent < min_continuation_for_tracking
2622                        && !is_lazy_continuation);
2623
2624                if breaks_list {
2625                    has_list_breaking_content_since_last_item = true;
2626                }
2627            }
2628
2629            // If this line is a code span continuation within an active list block,
2630            // extend the block's end_line to include this line (maintains list continuity)
2631            if line_info.in_code_span_continuation
2632                && line_info.list_item.is_none()
2633                && let Some(ref mut block) = current_block
2634            {
2635                block.end_line = line_num;
2636            }
2637
2638            // Extend block.end_line for regular continuation lines (non-list-item, non-blank,
2639            // properly indented lines within the list). This ensures the workaround at line 2448
2640            // works correctly when there are multiple continuation lines before a nested list item.
2641            // Also include lazy continuation lines (indent=0) per CommonMark spec.
2642            let is_valid_continuation =
2643                line_info.indent >= min_continuation_for_tracking || (line_info.indent == 0 && !line_info.is_blank); // Lazy continuation
2644            if !line_info.in_code_span_continuation
2645                && line_info.list_item.is_none()
2646                && !line_info.is_blank
2647                && !line_info.in_code_block
2648                && is_valid_continuation
2649                && let Some(ref mut block) = current_block
2650            {
2651                block.end_line = line_num;
2652            }
2653
2654            // Check if this line is a list item
2655            if let Some(list_item) = &line_info.list_item {
2656                // Calculate nesting level based on indentation
2657                let item_indent = list_item.marker_column;
2658                let nesting = item_indent / 2; // Assume 2-space indentation for nesting
2659
2660                if let Some(ref mut block) = current_block {
2661                    // Check if this continues the current block
2662                    // For nested lists, we need to check if this is a nested item (higher nesting level)
2663                    // or a continuation at the same or lower level
2664                    let is_nested = nesting > block.nesting_level;
2665                    let same_type =
2666                        (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
2667                    let same_context = block.blockquote_prefix == blockquote_prefix;
2668                    // Allow one blank line after last item, or lines immediately after block content
2669                    let reasonable_distance = line_num <= last_list_item_line + 2 || line_num == block.end_line + 1;
2670
2671                    // For unordered lists, also check marker consistency
2672                    let marker_compatible =
2673                        block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
2674
2675                    // O(1) check: Use the tracked variable instead of O(n) nested loop
2676                    // This eliminates the quadratic bottleneck from issue #148
2677                    let has_non_list_content = has_list_breaking_content_since_last_item;
2678
2679                    // A list continues if:
2680                    // 1. It's a nested item (indented more than the parent), OR
2681                    // 2. It's the same type at the same level with reasonable distance
2682                    let mut continues_list = if is_nested {
2683                        // Nested items always continue the list if they're in the same context
2684                        same_context && reasonable_distance && !has_non_list_content
2685                    } else {
2686                        // Same-level items need to match type and markers
2687                        same_type && same_context && reasonable_distance && marker_compatible && !has_non_list_content
2688                    };
2689
2690                    // WORKAROUND: If items are truly consecutive (no blank lines), they MUST be in the same list
2691                    // This handles edge cases where content patterns might otherwise split lists incorrectly
2692                    if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
2693                        // Check if the previous line was a list item or a continuation of a list item
2694                        // (including lazy continuation lines)
2695                        if block.item_lines.contains(&(line_num - 1)) {
2696                            // They're consecutive list items - force them to be in the same list
2697                            continues_list = true;
2698                        } else {
2699                            // Previous line is a continuation line within this block
2700                            // (e.g., lazy continuation with indent=0)
2701                            // Since block.end_line == line_num - 1, we know line_num - 1 is part of this block
2702                            continues_list = true;
2703                        }
2704                    }
2705
2706                    if continues_list {
2707                        // Extend current block
2708                        block.end_line = line_num;
2709                        block.item_lines.push(line_num);
2710
2711                        // Update max marker width
2712                        block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
2713                            list_item.marker.len() + 1
2714                        } else {
2715                            list_item.marker.len()
2716                        });
2717
2718                        // Update marker consistency for unordered lists
2719                        if !block.is_ordered
2720                            && block.marker.is_some()
2721                            && block.marker.as_ref() != Some(&list_item.marker)
2722                        {
2723                            // Mixed markers, clear the marker field
2724                            block.marker = None;
2725                        }
2726
2727                        // Reset tracked state for issue #148 optimization
2728                        reset_tracking_state(
2729                            list_item,
2730                            &mut has_list_breaking_content_since_last_item,
2731                            &mut min_continuation_for_tracking,
2732                        );
2733                    } else {
2734                        // End current block and start a new one
2735
2736                        list_blocks.push(block.clone());
2737
2738                        *block = ListBlock {
2739                            start_line: line_num,
2740                            end_line: line_num,
2741                            is_ordered: list_item.is_ordered,
2742                            marker: if list_item.is_ordered {
2743                                None
2744                            } else {
2745                                Some(list_item.marker.clone())
2746                            },
2747                            blockquote_prefix: blockquote_prefix.clone(),
2748                            item_lines: vec![line_num],
2749                            nesting_level: nesting,
2750                            max_marker_width: if list_item.is_ordered {
2751                                list_item.marker.len() + 1
2752                            } else {
2753                                list_item.marker.len()
2754                            },
2755                        };
2756
2757                        // Initialize tracked state for new block (issue #148 optimization)
2758                        reset_tracking_state(
2759                            list_item,
2760                            &mut has_list_breaking_content_since_last_item,
2761                            &mut min_continuation_for_tracking,
2762                        );
2763                    }
2764                } else {
2765                    // Start a new block
2766                    current_block = Some(ListBlock {
2767                        start_line: line_num,
2768                        end_line: line_num,
2769                        is_ordered: list_item.is_ordered,
2770                        marker: if list_item.is_ordered {
2771                            None
2772                        } else {
2773                            Some(list_item.marker.clone())
2774                        },
2775                        blockquote_prefix,
2776                        item_lines: vec![line_num],
2777                        nesting_level: nesting,
2778                        max_marker_width: list_item.marker.len(),
2779                    });
2780
2781                    // Initialize tracked state for new block (issue #148 optimization)
2782                    reset_tracking_state(
2783                        list_item,
2784                        &mut has_list_breaking_content_since_last_item,
2785                        &mut min_continuation_for_tracking,
2786                    );
2787                }
2788
2789                last_list_item_line = line_num;
2790                current_indent_level = item_indent;
2791                last_marker_width = if list_item.is_ordered {
2792                    list_item.marker.len() + 1 // Add 1 for the space after ordered list markers
2793                } else {
2794                    list_item.marker.len()
2795                };
2796            } else if let Some(ref mut block) = current_block {
2797                // Not a list item - check if it continues the current block
2798
2799                // For MD032 compatibility, we use a simple approach:
2800                // - Indented lines continue the list
2801                // - Blank lines followed by indented content continue the list
2802                // - Everything else ends the list
2803
2804                // Check if the last line in the list block ended with a backslash (hard line break)
2805                // This handles cases where list items use backslash for hard line breaks
2806                let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
2807                    lines[block.end_line - 1].content(content).trim_end().ends_with('\\')
2808                } else {
2809                    false
2810                };
2811
2812                // Calculate minimum indentation for list continuation
2813                // For ordered lists, use the last marker width (e.g., 3 for "1. ", 4 for "10. ")
2814                // For unordered lists like "- ", content starts at column 2, so continuations need at least 2 spaces
2815                let min_continuation_indent = if block.is_ordered {
2816                    current_indent_level + last_marker_width
2817                } else {
2818                    current_indent_level + 2 // Unordered lists need at least 2 spaces (e.g., "- " = 2 chars)
2819                };
2820
2821                if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
2822                    // Indented line or backslash continuation continues the list
2823                    block.end_line = line_num;
2824                } else if line_info.is_blank {
2825                    // Blank line - check if it's internal to the list or ending it
2826                    // We only include blank lines that are followed by more list content
2827                    let mut check_idx = line_idx + 1;
2828                    let mut found_continuation = false;
2829
2830                    // Skip additional blank lines
2831                    while check_idx < lines.len() && lines[check_idx].is_blank {
2832                        check_idx += 1;
2833                    }
2834
2835                    if check_idx < lines.len() {
2836                        let next_line = &lines[check_idx];
2837                        // Check if followed by indented content (list continuation)
2838                        if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
2839                            found_continuation = true;
2840                        }
2841                        // Check if followed by another list item at the same level
2842                        else if !next_line.in_code_block
2843                            && next_line.list_item.is_some()
2844                            && let Some(item) = &next_line.list_item
2845                        {
2846                            let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
2847                                .find(next_line.content(content))
2848                                .map_or(String::new(), |m| m.as_str().to_string());
2849                            if item.marker_column == current_indent_level
2850                                && item.is_ordered == block.is_ordered
2851                                && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
2852                            {
2853                                // Check if there was meaningful content between the list items (unused now)
2854                                // This variable is kept for potential future use but is currently replaced by has_structural_separators
2855                                let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
2856                                    if let Some(between_line) = lines.get(idx) {
2857                                        let between_content = between_line.content(content);
2858                                        let trimmed = between_content.trim();
2859                                        // Skip empty lines
2860                                        if trimmed.is_empty() {
2861                                            return false;
2862                                        }
2863                                        // Check for meaningful content
2864                                        let line_indent = between_content.len() - between_content.trim_start().len();
2865
2866                                        // Structural separators (code fences, headings, etc.) are meaningful and should BREAK lists
2867                                        if trimmed.starts_with("```")
2868                                            || trimmed.starts_with("~~~")
2869                                            || trimmed.starts_with("---")
2870                                            || trimmed.starts_with("***")
2871                                            || trimmed.starts_with("___")
2872                                            || trimmed.starts_with(">")
2873                                            || crate::utils::skip_context::is_table_line(trimmed)
2874                                            || between_line.heading.is_some()
2875                                        {
2876                                            return true; // These are structural separators - meaningful content that breaks lists
2877                                        }
2878
2879                                        // Only properly indented content continues the list
2880                                        line_indent >= min_continuation_indent
2881                                    } else {
2882                                        false
2883                                    }
2884                                });
2885
2886                                if block.is_ordered {
2887                                    // For ordered lists: don't continue if there are structural separators
2888                                    // Check if there are structural separators between the list items
2889                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2890                                        if let Some(between_line) = lines.get(idx) {
2891                                            let trimmed = between_line.content(content).trim();
2892                                            if trimmed.is_empty() {
2893                                                return false;
2894                                            }
2895                                            // Check for structural separators that break lists
2896                                            trimmed.starts_with("```")
2897                                                || trimmed.starts_with("~~~")
2898                                                || trimmed.starts_with("---")
2899                                                || trimmed.starts_with("***")
2900                                                || trimmed.starts_with("___")
2901                                                || trimmed.starts_with(">")
2902                                                || crate::utils::skip_context::is_table_line(trimmed)
2903                                                || between_line.heading.is_some()
2904                                        } else {
2905                                            false
2906                                        }
2907                                    });
2908                                    found_continuation = !has_structural_separators;
2909                                } else {
2910                                    // For unordered lists: also check for structural separators
2911                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2912                                        if let Some(between_line) = lines.get(idx) {
2913                                            let trimmed = between_line.content(content).trim();
2914                                            if trimmed.is_empty() {
2915                                                return false;
2916                                            }
2917                                            // Check for structural separators that break lists
2918                                            trimmed.starts_with("```")
2919                                                || trimmed.starts_with("~~~")
2920                                                || trimmed.starts_with("---")
2921                                                || trimmed.starts_with("***")
2922                                                || trimmed.starts_with("___")
2923                                                || trimmed.starts_with(">")
2924                                                || crate::utils::skip_context::is_table_line(trimmed)
2925                                                || between_line.heading.is_some()
2926                                        } else {
2927                                            false
2928                                        }
2929                                    });
2930                                    found_continuation = !has_structural_separators;
2931                                }
2932                            }
2933                        }
2934                    }
2935
2936                    if found_continuation {
2937                        // Include the blank line in the block
2938                        block.end_line = line_num;
2939                    } else {
2940                        // Blank line ends the list - don't include it
2941                        list_blocks.push(block.clone());
2942                        current_block = None;
2943                    }
2944                } else {
2945                    // Check for lazy continuation - non-indented line immediately after a list item
2946                    // But only if the line has sufficient indentation for the list type
2947                    let min_required_indent = if block.is_ordered {
2948                        current_indent_level + last_marker_width
2949                    } else {
2950                        current_indent_level + 2
2951                    };
2952
2953                    // For lazy continuation to apply, the line must either:
2954                    // 1. Have no indentation (true lazy continuation)
2955                    // 2. Have sufficient indentation for the list type
2956                    // BUT structural separators (headings, code blocks, etc.) should never be lazy continuations
2957                    let line_content = line_info.content(content).trim();
2958
2959                    // Check for table-like patterns
2960                    let looks_like_table = crate::utils::skip_context::is_table_line(line_content);
2961
2962                    let is_structural_separator = line_info.heading.is_some()
2963                        || line_content.starts_with("```")
2964                        || line_content.starts_with("~~~")
2965                        || line_content.starts_with("---")
2966                        || line_content.starts_with("***")
2967                        || line_content.starts_with("___")
2968                        || line_content.starts_with(">")
2969                        || looks_like_table;
2970
2971                    // Allow lazy continuation if we're still within the same list block
2972                    // (not just immediately after a list item)
2973                    let is_lazy_continuation = !is_structural_separator
2974                        && !line_info.is_blank
2975                        && (line_info.indent == 0 || line_info.indent >= min_required_indent);
2976
2977                    if is_lazy_continuation {
2978                        // Additional check: if the line starts with uppercase and looks like a new sentence,
2979                        // it's probably not a continuation
2980                        let content_to_check = if !blockquote_prefix.is_empty() {
2981                            // Strip blockquote prefix to check the actual content
2982                            line_info
2983                                .content(content)
2984                                .strip_prefix(&blockquote_prefix)
2985                                .unwrap_or(line_info.content(content))
2986                                .trim()
2987                        } else {
2988                            line_info.content(content).trim()
2989                        };
2990
2991                        let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
2992
2993                        // If it starts with uppercase and the previous line ended with punctuation,
2994                        // it's likely a new paragraph, not a continuation
2995                        if starts_with_uppercase && last_list_item_line > 0 {
2996                            // This looks like a new paragraph
2997                            list_blocks.push(block.clone());
2998                            current_block = None;
2999                        } else {
3000                            // This is a lazy continuation line
3001                            block.end_line = line_num;
3002                        }
3003                    } else {
3004                        // Non-indented, non-blank line that's not a lazy continuation - end the block
3005                        list_blocks.push(block.clone());
3006                        current_block = None;
3007                    }
3008                }
3009            }
3010        }
3011
3012        // Don't forget the last block
3013        if let Some(block) = current_block {
3014            list_blocks.push(block);
3015        }
3016
3017        // Merge adjacent blocks that should be one
3018        merge_adjacent_list_blocks(content, &mut list_blocks, lines);
3019
3020        list_blocks
3021    }
3022
3023    /// Compute character frequency for fast content analysis
3024    fn compute_char_frequency(content: &str) -> CharFrequency {
3025        let mut frequency = CharFrequency::default();
3026
3027        for ch in content.chars() {
3028            match ch {
3029                '#' => frequency.hash_count += 1,
3030                '*' => frequency.asterisk_count += 1,
3031                '_' => frequency.underscore_count += 1,
3032                '-' => frequency.hyphen_count += 1,
3033                '+' => frequency.plus_count += 1,
3034                '>' => frequency.gt_count += 1,
3035                '|' => frequency.pipe_count += 1,
3036                '[' => frequency.bracket_count += 1,
3037                '`' => frequency.backtick_count += 1,
3038                '<' => frequency.lt_count += 1,
3039                '!' => frequency.exclamation_count += 1,
3040                '\n' => frequency.newline_count += 1,
3041                _ => {}
3042            }
3043        }
3044
3045        frequency
3046    }
3047
3048    /// Parse HTML tags in the content
3049    fn parse_html_tags(
3050        content: &str,
3051        lines: &[LineInfo],
3052        code_blocks: &[(usize, usize)],
3053        flavor: MarkdownFlavor,
3054    ) -> Vec<HtmlTag> {
3055        static HTML_TAG_REGEX: LazyLock<regex::Regex> =
3056            LazyLock::new(|| regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9-]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap());
3057
3058        let mut html_tags = Vec::with_capacity(content.matches('<').count());
3059
3060        for cap in HTML_TAG_REGEX.captures_iter(content) {
3061            let full_match = cap.get(0).unwrap();
3062            let match_start = full_match.start();
3063            let match_end = full_match.end();
3064
3065            // Skip if in code block
3066            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3067                continue;
3068            }
3069
3070            let is_closing = !cap.get(1).unwrap().as_str().is_empty();
3071            let tag_name_original = cap.get(2).unwrap().as_str();
3072            let tag_name = tag_name_original.to_lowercase();
3073            let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
3074
3075            // Skip JSX components in MDX files (tags starting with uppercase letter)
3076            // JSX components like <Chart />, <MyComponent> should not be treated as HTML
3077            if flavor.supports_jsx() && tag_name_original.chars().next().is_some_and(|c| c.is_uppercase()) {
3078                continue;
3079            }
3080
3081            // Find which line this tag is on
3082            let mut line_num = 1;
3083            let mut col_start = match_start;
3084            let mut col_end = match_end;
3085            for (idx, line_info) in lines.iter().enumerate() {
3086                if match_start >= line_info.byte_offset {
3087                    line_num = idx + 1;
3088                    col_start = match_start - line_info.byte_offset;
3089                    col_end = match_end - line_info.byte_offset;
3090                } else {
3091                    break;
3092                }
3093            }
3094
3095            html_tags.push(HtmlTag {
3096                line: line_num,
3097                start_col: col_start,
3098                end_col: col_end,
3099                byte_offset: match_start,
3100                byte_end: match_end,
3101                tag_name,
3102                is_closing,
3103                is_self_closing,
3104                raw_content: full_match.as_str().to_string(),
3105            });
3106        }
3107
3108        html_tags
3109    }
3110
3111    /// Parse emphasis spans in the content
3112    fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
3113        static EMPHASIS_REGEX: LazyLock<regex::Regex> =
3114            LazyLock::new(|| regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap());
3115
3116        let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
3117
3118        for cap in EMPHASIS_REGEX.captures_iter(content) {
3119            let full_match = cap.get(0).unwrap();
3120            let match_start = full_match.start();
3121            let match_end = full_match.end();
3122
3123            // Skip if in code block
3124            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3125                continue;
3126            }
3127
3128            let opening_markers = cap.get(1).unwrap().as_str();
3129            let content_part = cap.get(2).unwrap().as_str();
3130            let closing_markers = cap.get(3).unwrap().as_str();
3131
3132            // Validate matching markers
3133            if opening_markers.chars().next() != closing_markers.chars().next()
3134                || opening_markers.len() != closing_markers.len()
3135            {
3136                continue;
3137            }
3138
3139            let marker = opening_markers.chars().next().unwrap();
3140            let marker_count = opening_markers.len();
3141
3142            // Find which line this emphasis is on
3143            let mut line_num = 1;
3144            let mut col_start = match_start;
3145            let mut col_end = match_end;
3146            for (idx, line_info) in lines.iter().enumerate() {
3147                if match_start >= line_info.byte_offset {
3148                    line_num = idx + 1;
3149                    col_start = match_start - line_info.byte_offset;
3150                    col_end = match_end - line_info.byte_offset;
3151                } else {
3152                    break;
3153                }
3154            }
3155
3156            emphasis_spans.push(EmphasisSpan {
3157                line: line_num,
3158                start_col: col_start,
3159                end_col: col_end,
3160                byte_offset: match_start,
3161                byte_end: match_end,
3162                marker,
3163                marker_count,
3164                content: content_part.to_string(),
3165            });
3166        }
3167
3168        emphasis_spans
3169    }
3170
3171    /// Parse table rows in the content
3172    fn parse_table_rows(content: &str, lines: &[LineInfo]) -> Vec<TableRow> {
3173        let mut table_rows = Vec::with_capacity(lines.len() / 20);
3174
3175        for (line_idx, line_info) in lines.iter().enumerate() {
3176            // Skip lines in code blocks or blank lines
3177            if line_info.in_code_block || line_info.is_blank {
3178                continue;
3179            }
3180
3181            let line = line_info.content(content);
3182            let line_num = line_idx + 1;
3183
3184            // Check if this line contains pipes (potential table row)
3185            if !line.contains('|') {
3186                continue;
3187            }
3188
3189            // Count columns by splitting on pipes
3190            let parts: Vec<&str> = line.split('|').collect();
3191            let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
3192
3193            // Check if this is a separator row
3194            let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
3195            let mut column_alignments = Vec::new();
3196
3197            if is_separator {
3198                for part in &parts[1..parts.len() - 1] {
3199                    // Skip first and last empty parts
3200                    let trimmed = part.trim();
3201                    let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
3202                        "center".to_string()
3203                    } else if trimmed.ends_with(':') {
3204                        "right".to_string()
3205                    } else if trimmed.starts_with(':') {
3206                        "left".to_string()
3207                    } else {
3208                        "none".to_string()
3209                    };
3210                    column_alignments.push(alignment);
3211                }
3212            }
3213
3214            table_rows.push(TableRow {
3215                line: line_num,
3216                is_separator,
3217                column_count,
3218                column_alignments,
3219            });
3220        }
3221
3222        table_rows
3223    }
3224
3225    /// Parse bare URLs and emails in the content
3226    fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
3227        let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
3228
3229        // Check for bare URLs (not in angle brackets or markdown links)
3230        for cap in BARE_URL_PATTERN.captures_iter(content) {
3231            let full_match = cap.get(0).unwrap();
3232            let match_start = full_match.start();
3233            let match_end = full_match.end();
3234
3235            // Skip if in code block
3236            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3237                continue;
3238            }
3239
3240            // Skip if already in angle brackets or markdown links
3241            let preceding_char = if match_start > 0 {
3242                content.chars().nth(match_start - 1)
3243            } else {
3244                None
3245            };
3246            let following_char = content.chars().nth(match_end);
3247
3248            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
3249                continue;
3250            }
3251            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
3252                continue;
3253            }
3254
3255            let url = full_match.as_str();
3256            let url_type = if url.starts_with("https://") {
3257                "https"
3258            } else if url.starts_with("http://") {
3259                "http"
3260            } else if url.starts_with("ftp://") {
3261                "ftp"
3262            } else {
3263                "other"
3264            };
3265
3266            // Find which line this URL is on
3267            let mut line_num = 1;
3268            let mut col_start = match_start;
3269            let mut col_end = match_end;
3270            for (idx, line_info) in lines.iter().enumerate() {
3271                if match_start >= line_info.byte_offset {
3272                    line_num = idx + 1;
3273                    col_start = match_start - line_info.byte_offset;
3274                    col_end = match_end - line_info.byte_offset;
3275                } else {
3276                    break;
3277                }
3278            }
3279
3280            bare_urls.push(BareUrl {
3281                line: line_num,
3282                start_col: col_start,
3283                end_col: col_end,
3284                byte_offset: match_start,
3285                byte_end: match_end,
3286                url: url.to_string(),
3287                url_type: url_type.to_string(),
3288            });
3289        }
3290
3291        // Check for bare email addresses
3292        for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
3293            let full_match = cap.get(0).unwrap();
3294            let match_start = full_match.start();
3295            let match_end = full_match.end();
3296
3297            // Skip if in code block
3298            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3299                continue;
3300            }
3301
3302            // Skip if already in angle brackets or markdown links
3303            let preceding_char = if match_start > 0 {
3304                content.chars().nth(match_start - 1)
3305            } else {
3306                None
3307            };
3308            let following_char = content.chars().nth(match_end);
3309
3310            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
3311                continue;
3312            }
3313            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
3314                continue;
3315            }
3316
3317            let email = full_match.as_str();
3318
3319            // Find which line this email is on
3320            let mut line_num = 1;
3321            let mut col_start = match_start;
3322            let mut col_end = match_end;
3323            for (idx, line_info) in lines.iter().enumerate() {
3324                if match_start >= line_info.byte_offset {
3325                    line_num = idx + 1;
3326                    col_start = match_start - line_info.byte_offset;
3327                    col_end = match_end - line_info.byte_offset;
3328                } else {
3329                    break;
3330                }
3331            }
3332
3333            bare_urls.push(BareUrl {
3334                line: line_num,
3335                start_col: col_start,
3336                end_col: col_end,
3337                byte_offset: match_start,
3338                byte_end: match_end,
3339                url: email.to_string(),
3340                url_type: "email".to_string(),
3341            });
3342        }
3343
3344        bare_urls
3345    }
3346
3347    /// Get an iterator over valid CommonMark headings
3348    ///
3349    /// This iterator filters out malformed headings like `#NoSpace` (hashtag-like patterns)
3350    /// that should be flagged by MD018 but should not be processed by other heading rules.
3351    ///
3352    /// # Examples
3353    ///
3354    /// ```rust
3355    /// use rumdl_lib::lint_context::LintContext;
3356    /// use rumdl_lib::config::MarkdownFlavor;
3357    ///
3358    /// let content = "# Valid Heading\n#NoSpace\n## Another Valid";
3359    /// let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
3360    ///
3361    /// for heading in ctx.valid_headings() {
3362    ///     println!("Line {}: {} (level {})", heading.line_num, heading.heading.text, heading.heading.level);
3363    /// }
3364    /// // Only prints valid headings, skips `#NoSpace`
3365    /// ```
3366    #[must_use]
3367    pub fn valid_headings(&self) -> ValidHeadingsIter<'_> {
3368        ValidHeadingsIter::new(&self.lines)
3369    }
3370
3371    /// Check if the document contains any valid CommonMark headings
3372    ///
3373    /// Returns `true` if there is at least one heading with proper space after `#`.
3374    #[must_use]
3375    pub fn has_valid_headings(&self) -> bool {
3376        self.lines
3377            .iter()
3378            .any(|line| line.heading.as_ref().is_some_and(|h| h.is_valid))
3379    }
3380}
3381
3382/// Merge adjacent list blocks that should be treated as one
3383fn merge_adjacent_list_blocks(content: &str, list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
3384    if list_blocks.len() < 2 {
3385        return;
3386    }
3387
3388    let mut merger = ListBlockMerger::new(content, lines);
3389    *list_blocks = merger.merge(list_blocks);
3390}
3391
3392/// Helper struct to manage the complex logic of merging list blocks
3393struct ListBlockMerger<'a> {
3394    content: &'a str,
3395    lines: &'a [LineInfo],
3396}
3397
3398impl<'a> ListBlockMerger<'a> {
3399    fn new(content: &'a str, lines: &'a [LineInfo]) -> Self {
3400        Self { content, lines }
3401    }
3402
3403    fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
3404        let mut merged = Vec::with_capacity(list_blocks.len());
3405        let mut current = list_blocks[0].clone();
3406
3407        for next in list_blocks.iter().skip(1) {
3408            if self.should_merge_blocks(&current, next) {
3409                current = self.merge_two_blocks(current, next);
3410            } else {
3411                merged.push(current);
3412                current = next.clone();
3413            }
3414        }
3415
3416        merged.push(current);
3417        merged
3418    }
3419
3420    /// Determine if two adjacent list blocks should be merged
3421    fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
3422        // Basic compatibility checks
3423        if !self.blocks_are_compatible(current, next) {
3424            return false;
3425        }
3426
3427        // Check spacing and content between blocks
3428        let spacing = self.analyze_spacing_between(current, next);
3429        match spacing {
3430            BlockSpacing::Consecutive => true,
3431            BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
3432            BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
3433                self.can_merge_with_content_between(current, next)
3434            }
3435        }
3436    }
3437
3438    /// Check if blocks have compatible structure for merging
3439    fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
3440        current.is_ordered == next.is_ordered
3441            && current.blockquote_prefix == next.blockquote_prefix
3442            && current.nesting_level == next.nesting_level
3443    }
3444
3445    /// Analyze the spacing between two list blocks
3446    fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
3447        let gap = next.start_line - current.end_line;
3448
3449        match gap {
3450            1 => BlockSpacing::Consecutive,
3451            2 => BlockSpacing::SingleBlank,
3452            _ if gap > 2 => {
3453                if self.has_only_blank_lines_between(current, next) {
3454                    BlockSpacing::MultipleBlanks
3455                } else {
3456                    BlockSpacing::ContentBetween
3457                }
3458            }
3459            _ => BlockSpacing::Consecutive, // gap == 0, overlapping (shouldn't happen)
3460        }
3461    }
3462
3463    /// Check if unordered lists can be merged with a single blank line between
3464    fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3465        // Check if there are structural separators between the blocks
3466        // If has_meaningful_content_between returns true, it means there are structural separators
3467        if has_meaningful_content_between(self.content, current, next, self.lines) {
3468            return false; // Structural separators prevent merging
3469        }
3470
3471        // Only merge unordered lists with same marker across single blank
3472        !current.is_ordered && current.marker == next.marker
3473    }
3474
3475    /// Check if ordered lists can be merged when there's content between them
3476    fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3477        // Do not merge lists if there are structural separators between them
3478        if has_meaningful_content_between(self.content, current, next, self.lines) {
3479            return false; // Structural separators prevent merging
3480        }
3481
3482        // Only consider merging ordered lists if there's no structural content between
3483        current.is_ordered && next.is_ordered
3484    }
3485
3486    /// Check if there are only blank lines between blocks
3487    fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3488        for line_num in (current.end_line + 1)..next.start_line {
3489            if let Some(line_info) = self.lines.get(line_num - 1)
3490                && !line_info.content(self.content).trim().is_empty()
3491            {
3492                return false;
3493            }
3494        }
3495        true
3496    }
3497
3498    /// Merge two compatible list blocks into one
3499    fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
3500        current.end_line = next.end_line;
3501        current.item_lines.extend_from_slice(&next.item_lines);
3502
3503        // Update max marker width
3504        current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
3505
3506        // Handle marker consistency for unordered lists
3507        if !current.is_ordered && self.markers_differ(&current, next) {
3508            current.marker = None; // Mixed markers
3509        }
3510
3511        current
3512    }
3513
3514    /// Check if two blocks have different markers
3515    fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
3516        current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
3517    }
3518}
3519
/// Classification of the gap separating two list blocks
#[derive(PartialEq, Debug)]
enum BlockSpacing {
    /// No gap between blocks
    Consecutive,
    /// One blank line between blocks
    SingleBlank,
    /// Multiple blank lines but no content
    MultipleBlanks,
    /// Content exists between blocks
    ContentBetween,
}
3528
3529/// Check if there's meaningful content (not just blank lines) between two list blocks
3530fn has_meaningful_content_between(content: &str, current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
3531    // Check lines between current.end_line and next.start_line
3532    for line_num in (current.end_line + 1)..next.start_line {
3533        if let Some(line_info) = lines.get(line_num - 1) {
3534            // Convert to 0-indexed
3535            let trimmed = line_info.content(content).trim();
3536
3537            // Skip empty lines
3538            if trimmed.is_empty() {
3539                continue;
3540            }
3541
3542            // Check for structural separators that should separate lists (CommonMark compliant)
3543
3544            // Headings separate lists
3545            if line_info.heading.is_some() {
3546                return true; // Has meaningful content - headings separate lists
3547            }
3548
3549            // Horizontal rules separate lists (---, ***, ___)
3550            if is_horizontal_rule(trimmed) {
3551                return true; // Has meaningful content - horizontal rules separate lists
3552            }
3553
3554            // Tables separate lists
3555            if crate::utils::skip_context::is_table_line(trimmed) {
3556                return true; // Has meaningful content - tables separate lists
3557            }
3558
3559            // Blockquotes separate lists
3560            if trimmed.starts_with('>') {
3561                return true; // Has meaningful content - blockquotes separate lists
3562            }
3563
3564            // Code block fences separate lists (unless properly indented as list content)
3565            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
3566                let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
3567
3568                // Check if this code block is properly indented as list continuation
3569                let min_continuation_indent = if current.is_ordered {
3570                    current.nesting_level + current.max_marker_width + 1 // +1 for space after marker
3571                } else {
3572                    current.nesting_level + 2
3573                };
3574
3575                if line_indent < min_continuation_indent {
3576                    // This is a standalone code block that separates lists
3577                    return true; // Has meaningful content - standalone code blocks separate lists
3578                }
3579            }
3580
3581            // Check if this line has proper indentation for list continuation
3582            let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
3583
3584            // Calculate minimum indentation needed to be list continuation
3585            let min_indent = if current.is_ordered {
3586                current.nesting_level + current.max_marker_width
3587            } else {
3588                current.nesting_level + 2
3589            };
3590
3591            // If the line is not indented enough to be list continuation, it's meaningful content
3592            if line_indent < min_indent {
3593                return true; // Has meaningful content - content not indented as list continuation
3594            }
3595
3596            // If we reach here, the line is properly indented as list continuation
3597            // Continue checking other lines
3598        }
3599    }
3600
3601    // Only blank lines or properly indented list continuation content between blocks
3602    false
3603}
3604
/// Check if a line is a horizontal rule (`---`, `***`, `___`, `- - -`, ...)
///
/// `trimmed` is expected to have no surrounding whitespace. The line qualifies when its
/// first character is `-`, `*` or `_`, every other character is either that same marker,
/// a space or a tab, and the marker occurs at least three times. The markers do not have
/// to be adjacent: `* * *` is accepted.
///
/// Returns `false` for the empty string and for any line mixing different markers or
/// containing other characters.
fn is_horizontal_rule(trimmed: &str) -> bool {
    // The first character fixes which marker the whole line must use.
    let marker = match trimmed.chars().next() {
        Some(ch @ ('-' | '*' | '_')) => ch,
        _ => return false,
    };

    // Single pass over the string, no intermediate Vec<char>.
    let mut marker_count = 0usize;
    for ch in trimmed.chars() {
        match ch {
            c if c == marker => marker_count += 1,
            ' ' | '\t' => {}
            _ => return false, // Non-matching, non-whitespace character
        }
    }

    marker_count >= 3
}
3628
/// Unit tests for `LintContext`: line offsets, offset-to-line/column mapping,
/// per-line info, list item detection, and MDX ESM block flags
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input still yields a single line offset but no `LineInfo` entries.
    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::Standard, None);
        assert_eq!(ctx.content, "");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.lines.len(), 0);
    }

    /// A single line without a trailing newline maps offsets to 1-based (line, column).
    #[test]
    fn test_single_line() {
        let ctx = LintContext::new("# Hello", MarkdownFlavor::Standard, None);
        assert_eq!(ctx.content, "# Hello");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(3), (1, 4));
    }

    /// Line offsets are the byte positions at which each line starts, including blank lines.
    #[test]
    fn test_multi_line() {
        let content = "# Title\n\nSecond line\nThird line";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
        // "# Title\n" is 8 bytes, the blank line 1 byte, "Second line\n" 12 bytes
        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);
        // Test offset to line/col
        assert_eq!(ctx.offset_to_line_col(0), (1, 1)); // start
        assert_eq!(ctx.offset_to_line_col(8), (2, 1)); // start of blank line
        assert_eq!(ctx.offset_to_line_col(9), (3, 1)); // start of 'Second line'
        assert_eq!(ctx.offset_to_line_col(15), (3, 7)); // middle of 'Second line'
        assert_eq!(ctx.offset_to_line_col(21), (4, 1)); // start of 'Third line'
    }

    /// Per-line metadata (content, byte offset, indent, blank flag) for the first three lines.
    ///
    /// The fenced code block at the end of the input only contributes to the line count
    /// here; its lines are not inspected individually.
    #[test]
    fn test_line_info() {
        let content = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);

        // Test line info
        assert_eq!(ctx.lines.len(), 7);

        // Line 1: "# Title"
        let line1 = &ctx.lines[0];
        assert_eq!(line1.content(ctx.content), "# Title");
        assert_eq!(line1.byte_offset, 0);
        assert_eq!(line1.indent, 0);
        assert!(!line1.is_blank);
        assert!(!line1.in_code_block);
        assert!(line1.list_item.is_none());

        // Line 2: "    indented"
        let line2 = &ctx.lines[1];
        assert_eq!(line2.content(ctx.content), "    indented");
        assert_eq!(line2.byte_offset, 8);
        assert_eq!(line2.indent, 4);
        assert!(!line2.is_blank);

        // Line 3: "" (blank)
        let line3 = &ctx.lines[2];
        assert_eq!(line3.content(ctx.content), "");
        assert!(line3.is_blank);

        // Test helper methods (both take 1-based line numbers)
        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
        assert_eq!(ctx.line_info(1).map(|l| l.indent), Some(0));
        assert_eq!(ctx.line_info(2).map(|l| l.indent), Some(4));
    }

    /// List item detection for unordered and ordered markers.
    ///
    /// Lines 4 ("   2) Nested ordered") and 5 (blank) are part of the input but are not
    /// asserted on.
    #[test]
    fn test_list_item_detection() {
        let content = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);

        // Line 1: "- Unordered item"
        let line1 = &ctx.lines[0];
        assert!(line1.list_item.is_some());
        let list1 = line1.list_item.as_ref().unwrap();
        assert_eq!(list1.marker, "-");
        assert!(!list1.is_ordered);
        assert_eq!(list1.marker_column, 0);
        assert_eq!(list1.content_column, 2);

        // Line 2: "  * Nested item"
        let line2 = &ctx.lines[1];
        assert!(line2.list_item.is_some());
        let list2 = line2.list_item.as_ref().unwrap();
        assert_eq!(list2.marker, "*");
        assert_eq!(list2.marker_column, 2);

        // Line 3: "1. Ordered item"
        let line3 = &ctx.lines[2];
        assert!(line3.list_item.is_some());
        let list3 = line3.list_item.as_ref().unwrap();
        assert_eq!(list3.marker, "1.");
        assert!(list3.is_ordered);
        assert_eq!(list3.number, Some(1));

        // Line 6: "Not a list"
        let line6 = &ctx.lines[5];
        assert!(line6.list_item.is_none());
    }

    /// Offsets that land on newline bytes and at the very end of the content.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let content = "a\nb\nc";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
        // line_offsets: [0, 2, 4]
        assert_eq!(ctx.offset_to_line_col(0), (1, 1)); // 'a'
        assert_eq!(ctx.offset_to_line_col(1), (1, 2)); // after 'a' (the newline byte)
        assert_eq!(ctx.offset_to_line_col(2), (2, 1)); // 'b'
        assert_eq!(ctx.offset_to_line_col(3), (2, 2)); // after 'b' (the newline byte)
        assert_eq!(ctx.offset_to_line_col(4), (3, 1)); // 'c'
        assert_eq!(ctx.offset_to_line_col(5), (3, 2)); // after 'c' (offset == content length)
    }

    /// In the MDX flavor, leading `import`/`export` lines are flagged as ESM blocks.
    #[test]
    fn test_mdx_esm_blocks() {
        // `r##` is required because the content contains the sequence `"#`
        let content = r##"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall

In {year}, the snowfall was above average.
It was followed by a warm spring which caused
flood conditions in many of the nearby rivers.

<Chart color="#fcb32c" year={year} />
"##;

        let ctx = LintContext::new(content, MarkdownFlavor::MDX, None);

        // Lines 1 and 2 must be marked as ESM blocks; lines 3-6 must not be.
        // Lines 7-10 are only covered by the line count.
        assert_eq!(ctx.lines.len(), 10);
        assert!(ctx.lines[0].in_esm_block, "Line 1 (import) should be in_esm_block");
        assert!(ctx.lines[1].in_esm_block, "Line 2 (export) should be in_esm_block");
        assert!(!ctx.lines[2].in_esm_block, "Line 3 (blank) should NOT be in_esm_block");
        assert!(
            !ctx.lines[3].in_esm_block,
            "Line 4 (heading) should NOT be in_esm_block"
        );
        assert!(!ctx.lines[4].in_esm_block, "Line 5 (blank) should NOT be in_esm_block");
        assert!(!ctx.lines[5].in_esm_block, "Line 6 (text) should NOT be in_esm_block");
    }

    /// The same `import`/`export` lines are not flagged when the flavor is Standard.
    #[test]
    fn test_mdx_esm_blocks_not_detected_in_standard_flavor() {
        let content = r#"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall
"#;

        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);

        // ESM blocks should NOT be detected in Standard flavor
        assert!(
            !ctx.lines[0].in_esm_block,
            "Line 1 should NOT be in_esm_block in Standard flavor"
        );
        assert!(
            !ctx.lines[1].in_esm_block,
            "Line 2 should NOT be in_esm_block in Standard flavor"
        );
    }
}
rumdl_lib/lint_context.rs

rumdl_lib/
lint_context.rs