rumdl_lib/
lint_context.rs

1use crate::config::MarkdownFlavor;
2use crate::rules::front_matter_utils::FrontMatterUtils;
3use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
4use pulldown_cmark::{BrokenLink, Event, LinkType, Options, Parser, Tag, TagEnd};
5use regex::Regex;
6use std::borrow::Cow;
7use std::sync::LazyLock;
8
/// Evaluate `$code` and hand back its value, optionally reporting how long it took.
///
/// Non-WASM variant: a timer is started before `$code` runs; when `$profile` is true
/// the elapsed time is written to stderr, labelled with `$name`.
#[cfg(not(target_arch = "wasm32"))]
macro_rules! profile_section {
    ($name:expr, $profile:expr, $code:expr) => {{
        let timer = std::time::Instant::now();
        let value = $code;
        if $profile {
            eprintln!("[PROFILE] {}: {:?}", $name, timer.elapsed());
        }
        value
    }};
}

/// WASM variant: expands to `$code` alone; `$name` and `$profile` are not evaluated.
#[cfg(target_arch = "wasm32")]
macro_rules! profile_section {
    ($name:expr, $profile:expr, $code:expr) => {{ $code }};
}
26
// Link pattern covering both inline links `[text](url "title")` and reference
// links `[text][ref]`.
//
// Flags: `s` lets `.` (used in the `\\.` escape alternative) match a newline;
// `x` enables the free-spacing layout with `#` comments inside the pattern.
//
// Capture groups:
//   1 - link text (no unescaped brackets)
//   2 - URL written in angle brackets, 3 - bare URL
//   4 - double-quoted title, 5 - single-quoted title
//   6 - reference ID of a reference link
//
// NOTE(review): the pattern itself does not reject a leading `!`, so it would also
// match the bracket part of image syntax - presumably callers filter that; confirm.
static LINK_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
        \[((?:[^\[\]\\]|\\.)*)\]          # Link text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
        (?:
            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
            |
            \[([^\]]*)\]      # Reference ID in group 6
        )"#
    ).unwrap()
});
40
// Image pattern: same shape as the link pattern but requires a leading `!`,
// i.e. `![alt](url "title")` or `![alt][ref]`.
//
// Flags: `s` lets `.` match a newline; `x` enables the free-spacing layout with
// `#` comments inside the pattern.
//
// Capture groups:
//   1 - alt text (no unescaped brackets)
//   2 - URL written in angle brackets, 3 - bare URL
//   4 - double-quoted title, 5 - single-quoted title
//   6 - reference ID of a reference image
static IMAGE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
        !\[((?:[^\[\]\\]|\\.)*)\]         # Alt text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
        (?:
            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
            |
            \[([^\]]*)\]      # Reference ID in group 6
        )"#
    ).unwrap()
});
54
// Reference definition pattern: `[id]: url "title"` on a line of its own.
//
// Multi-line mode (`m`) makes `^`/`$` match at line boundaries; up to three
// leading spaces are allowed before the opening bracket.
//
// Capture groups: 1 - reference ID, 2 - URL (run of non-whitespace),
// 3 - double-quoted title, 4 - single-quoted title.
static REF_DEF_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#).unwrap());
58
// Pattern for bare URLs.
//
// Group 1 captures the scheme (`http`, `https` or `ftp`). After `://` the match
// continues over characters that are not whitespace, angle/square brackets,
// parentheses, backslashes, quotes or backticks, with optional port, path,
// query string and fragment parts.
static BARE_URL_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
    ).unwrap()
});
65
// Pattern for email addresses: a local part of ASCII letters, digits and `._%+-`,
// an `@`, a domain of ASCII letters, digits, `.` and `-`, then a final dot followed
// by at least two ASCII letters. The pattern is unanchored and has no capture groups.
static BARE_EMAIL_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap());
69
// Pattern for blockquote prefix in parse_list_blocks.
// Anchored at the start of the input; group 1 captures optional leading whitespace,
// one or more `>` markers, and any whitespace that follows them.
static BLOCKQUOTE_PREFIX_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*>+\s*)").unwrap());
72
/// Pre-computed information about a single line of the document.
///
/// The line text itself is not stored; it is recovered from the source document
/// through `byte_offset` and `byte_len` (see [`LineInfo::content`]).
#[derive(Debug, Clone)]
pub struct LineInfo {
    /// Byte offset where this line starts in the document
    pub byte_offset: usize,
    /// Length of the line in bytes (without newline)
    pub byte_len: usize,
    /// Number of leading spaces/tabs
    pub indent: usize,
    /// Whether the line is blank (empty or only whitespace)
    pub is_blank: bool,
    /// Whether this line is inside a code block
    pub in_code_block: bool,
    /// Whether this line is inside front matter
    pub in_front_matter: bool,
    /// Whether this line is inside an HTML block
    pub in_html_block: bool,
    /// Whether this line is inside an HTML comment
    pub in_html_comment: bool,
    /// List item information if this line starts a list item (`None` otherwise)
    pub list_item: Option<ListItemInfo>,
    /// Heading information if this line is a heading (`None` otherwise)
    pub heading: Option<HeadingInfo>,
    /// Blockquote information if this line is a blockquote (`None` otherwise)
    pub blockquote: Option<BlockquoteInfo>,
    /// Whether this line is inside a mkdocstrings autodoc block
    pub in_mkdocstrings: bool,
    /// Whether this line is part of an ESM import/export block (MDX only)
    pub in_esm_block: bool,
    /// Whether this line is a continuation of a multi-line code span from a previous line
    /// (i.e. any line of the span after its first line)
    pub in_code_span_continuation: bool,
}
105
106impl LineInfo {
107    /// Get the line content as a string slice from the source document
108    pub fn content<'a>(&self, source: &'a str) -> &'a str {
109        &source[self.byte_offset..self.byte_offset + self.byte_len]
110    }
111}
112
/// Information about a list item, attached to the line on which the item starts.
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// The marker used (*, -, +, or number with . or ))
    pub marker: String,
    /// Whether it's ordered (true) or unordered (false)
    pub is_ordered: bool,
    /// The number for ordered lists (presumably `None` for unordered items — confirm)
    pub number: Option<usize>,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where content after marker starts
    pub content_column: usize,
}
127
/// Heading style type.
///
/// Equality is total for this fieldless enum, so `Eq` is derived alongside
/// `PartialEq`; this allows the type to be used wherever an `Eq` bound is required.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum HeadingStyle {
    /// ATX style heading (# Heading)
    ATX,
    /// Setext style heading with = underline
    Setext1,
    /// Setext style heading with - underline
    Setext2,
}
138
/// Parsed link information.
///
/// Text fields are `Cow<'a, str>`, so they may either borrow from the source
/// document or own their data.
#[derive(Debug, Clone)]
pub struct ParsedLink<'a> {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document where the link starts
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Link text
    pub text: Cow<'a, str>,
    /// Link URL or reference
    pub url: Cow<'a, str>,
    /// Whether this is a reference link [text][ref] vs inline [text](url)
    pub is_reference: bool,
    /// Reference ID for reference links
    pub reference_id: Option<Cow<'a, str>>,
    /// Link type from pulldown-cmark
    pub link_type: LinkType,
}
163
/// Information about a broken link reported by pulldown-cmark,
/// i.e. a reference that could not be resolved to a definition.
#[derive(Debug, Clone)]
pub struct BrokenLinkInfo {
    /// The reference text that couldn't be resolved
    pub reference: String,
    /// Byte span in the source document
    pub span: std::ops::Range<usize>,
}
172
/// Parsed footnote reference (e.g., `[^1]`, `[^note]`).
///
/// Only the position and the ID are recorded.
#[derive(Debug, Clone)]
pub struct FootnoteRef {
    /// The footnote ID (without the ^ prefix)
    pub id: String,
    /// Line number (1-indexed)
    pub line: usize,
    /// Start byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
}
185
/// Parsed image information.
///
/// Mirrors [`ParsedLink`] field for field, with `alt_text` in place of the link text.
#[derive(Debug, Clone)]
pub struct ParsedImage<'a> {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document where the image starts
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Alt text
    pub alt_text: Cow<'a, str>,
    /// Image URL or reference
    pub url: Cow<'a, str>,
    /// Whether this is a reference image ![alt][ref] vs inline ![alt](url)
    pub is_reference: bool,
    /// Reference ID for reference images
    pub reference_id: Option<Cow<'a, str>>,
    /// Link type from pulldown-cmark
    pub link_type: LinkType,
}
210
/// Reference definition `[ref]: url "title"`.
///
/// `LintContext::get_reference_url` looks definitions up by comparing a lowercased
/// query against `id`; `LintContext::is_in_reference_def` treats
/// `byte_offset..byte_end` as a half-open byte range.
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// Line number (1-indexed)
    pub line: usize,
    /// Reference ID (normalized to lowercase)
    pub id: String,
    /// URL
    pub url: String,
    /// Optional title
    pub title: Option<String>,
    /// Byte offset where the reference definition starts
    pub byte_offset: usize,
    /// Byte offset where the reference definition ends (exclusive in range checks)
    pub byte_end: usize,
}
227
/// Parsed code span information.
///
/// A span may cover several lines: `line`/`start_col` locate its start and
/// `end_line`/`end_col` its end. Byte-based membership checks in `LintContext`
/// treat `byte_offset..byte_end` as a half-open range.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// Line number where the code span starts (1-indexed)
    pub line: usize,
    /// Line number where the code span ends (1-indexed)
    pub end_line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line (exclusive in column checks)
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Number of backticks used (1, 2, 3, etc.)
    pub backtick_count: usize,
    /// Content inside the code span (without backticks)
    pub content: String,
}
248
/// Information about a heading, attached to the line that carries it.
#[derive(Debug, Clone)]
pub struct HeadingInfo {
    /// Heading level (1-6 for ATX, 1-2 for Setext)
    pub level: u8,
    /// Style of heading (ATX or one of the two Setext underline styles)
    pub style: HeadingStyle,
    /// The heading marker (# characters or underline)
    pub marker: String,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where heading text starts
    pub content_column: usize,
    /// The heading text (without markers and without custom ID syntax)
    pub text: String,
    /// Custom header ID if present (e.g., from {#custom-id} syntax)
    pub custom_id: Option<String>,
    /// Original heading text including custom ID syntax
    pub raw_text: String,
    /// Whether it has a closing sequence (for ATX)
    pub has_closing_sequence: bool,
    /// The closing sequence if present
    /// (presumably empty when `has_closing_sequence` is false — confirm)
    pub closing_sequence: String,
}
273
/// Information about a blockquote line: its prefix broken into parts, plus
/// flags describing the spacing after the marker.
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Nesting level (1 for >, 2 for >>, etc.)
    pub nesting_level: usize,
    /// The indentation before the blockquote marker
    pub indent: String,
    /// Column where the first > starts (0-based)
    pub marker_column: usize,
    /// The blockquote prefix (e.g., "> ", ">> ", etc.)
    pub prefix: String,
    /// Content after the blockquote marker(s)
    pub content: String,
    /// Whether the line has no space after the marker
    pub has_no_space_after_marker: bool,
    /// Whether the line has multiple spaces after the marker
    pub has_multiple_spaces_after_marker: bool,
    /// Whether this is an empty blockquote line needing MD028 fix
    pub needs_md028_fix: bool,
}
294
/// Information about a list block.
///
/// `LintContext::is_in_list_block` and `LintContext::list_block_for_line` treat
/// `start_line..=end_line` as an inclusive line range.
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// Line number where the list starts (1-indexed)
    pub start_line: usize,
    /// Line number where the list ends (1-indexed, inclusive)
    pub end_line: usize,
    /// Whether it's ordered or unordered
    pub is_ordered: bool,
    /// The consistent marker for unordered lists (if any)
    pub marker: Option<String>,
    /// Blockquote prefix for this list (empty if not in blockquote)
    pub blockquote_prefix: String,
    /// Lines that are list items within this block
    pub item_lines: Vec<usize>,
    /// Nesting level (0 for top-level lists)
    pub nesting_level: usize,
    /// Maximum marker width seen in this block (e.g., 3 for "1. ", 4 for "10. ")
    pub max_marker_width: usize,
}
315
316use std::sync::{Arc, Mutex};
317
/// Character frequency data for fast content analysis.
///
/// Each field counts occurrences of one Markdown-significant character; the
/// parenthesised notes name the constructs that character typically introduces.
/// `Default` yields all-zero counts.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Count of # characters (headings)
    pub hash_count: usize,
    /// Count of * characters (emphasis, lists, horizontal rules)
    pub asterisk_count: usize,
    /// Count of _ characters (emphasis, horizontal rules)
    pub underscore_count: usize,
    /// Count of - characters (lists, horizontal rules, setext headings)
    pub hyphen_count: usize,
    /// Count of + characters (lists)
    pub plus_count: usize,
    /// Count of > characters (blockquotes)
    pub gt_count: usize,
    /// Count of | characters (tables)
    pub pipe_count: usize,
    /// Count of [ characters (links, images)
    pub bracket_count: usize,
    /// Count of ` characters (code spans, code blocks)
    pub backtick_count: usize,
    /// Count of < characters (HTML tags, autolinks)
    pub lt_count: usize,
    /// Count of ! characters (images)
    pub exclamation_count: usize,
    /// Count of newline characters
    pub newline_count: usize,
}
346
/// Pre-parsed HTML tag information: where the tag sits in the document and
/// what kind of tag it is.
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document where the tag starts
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Tag name (e.g., "div", "img", "br")
    pub tag_name: String,
    /// Whether it's a closing tag (`</tag>`)
    pub is_closing: bool,
    /// Whether it's self-closing (`<tag />`)
    pub is_self_closing: bool,
    /// Raw tag content
    pub raw_content: String,
}
369
/// Pre-parsed emphasis span information: position, marker character and run
/// length, and the emphasised content.
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document where the span starts
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Type of emphasis ('*' or '_')
    pub marker: char,
    /// Number of markers (1 for italic, 2 for bold, 3+ for bold+italic)
    pub marker_count: usize,
    /// Content inside the emphasis
    pub content: String,
}
390
/// Pre-parsed table row information.
#[derive(Debug, Clone)]
pub struct TableRow {
    /// Line number (1-indexed)
    pub line: usize,
    /// Whether this is a separator row (contains only |, -, :, and spaces)
    pub is_separator: bool,
    /// Number of columns (pipe-separated cells)
    pub column_count: usize,
    /// Alignment info from separator row; each entry is one of the strings
    /// "left", "center", "right" or "none"
    pub column_alignments: Vec<String>, // "left", "center", "right", "none"
}
403
/// Pre-parsed bare URL information (not in links).
///
/// Despite the name, `url_type` may also be "email", so entries can describe
/// bare email addresses as well as URLs.
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document where the URL starts
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// The URL string
    pub url: String,
    /// Type of URL ("http", "https", "ftp", "email")
    pub url_type: String,
}
422
/// Shared, pre-parsed view of one Markdown document.
///
/// `LintContext::new` fills the public fields eagerly. The private `*_cache` fields
/// hold results behind a `Mutex` and are read through the accessor methods of the
/// same name (`code_spans()`, `html_tags()`, ...), which fill an empty cache on
/// first use.
pub struct LintContext<'a> {
    /// The document text this context was built from
    pub content: &'a str,
    /// Byte offset at which each line starts (`new` always puts 0 first)
    pub line_offsets: Vec<usize>,
    /// Cached code block ranges (not including inline code spans)
    pub code_blocks: Vec<(usize, usize)>, // Cached code block ranges (not including inline code spans)
    /// Pre-computed line information; index `n - 1` describes line `n`
    pub lines: Vec<LineInfo>,             // Pre-computed line information
    /// Pre-parsed links
    pub links: Vec<ParsedLink<'a>>,       // Pre-parsed links
    /// Pre-parsed images
    pub images: Vec<ParsedImage<'a>>,     // Pre-parsed images
    /// Broken/undefined references
    pub broken_links: Vec<BrokenLinkInfo>, // Broken/undefined references
    /// Pre-parsed footnote references
    pub footnote_refs: Vec<FootnoteRef>,  // Pre-parsed footnote references
    /// Reference definitions
    pub reference_defs: Vec<ReferenceDef>, // Reference definitions
    // Inline code spans. Although the accessor can parse on demand, `new` already
    // stores the parsed spans here, so the cache starts out populated.
    code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>, // Lazy-loaded inline code spans
    /// Pre-parsed list blocks
    pub list_blocks: Vec<ListBlock>,      // Pre-parsed list blocks
    /// Character frequency analysis
    pub char_frequency: CharFrequency,    // Character frequency analysis
    // Starts as `None`; filled by `html_tags()`.
    html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>, // Lazy-loaded HTML tags
    // Starts as `None`; filled by `emphasis_spans()`.
    emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>, // Lazy-loaded emphasis spans
    // Starts as `None`; filled by `table_rows()`.
    table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>, // Lazy-loaded table rows
    // Starts as `None`; filled by `bare_urls()`.
    bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>, // Lazy-loaded bare URLs
    // Byte ranges of HTML comments, computed once in `new`.
    html_comment_ranges: Vec<crate::utils::skip_context::ByteRange>, // Pre-computed HTML comment ranges
    /// Pre-computed table blocks
    pub table_blocks: Vec<crate::utils::table_utils::TableBlock>, // Pre-computed table blocks
    /// Pre-computed line index for byte position calculations
    pub line_index: crate::utils::range_utils::LineIndex<'a>, // Pre-computed line index for byte position calculations
    // `(start, end)` byte ranges of Jinja templates; `end` is exclusive in range checks.
    jinja_ranges: Vec<(usize, usize)>,    // Pre-computed Jinja template ranges ({{ }}, {% %})
    /// Markdown flavor being used
    pub flavor: MarkdownFlavor,           // Markdown flavor being used
}
446
/// The four consecutive pieces of a blockquote line, each borrowed from the input.
struct BlockquoteComponents<'a> {
    /// Spaces/tabs before the first `>`
    indent: &'a str,
    /// The uninterrupted run of `>` characters
    markers: &'a str,
    /// Spaces/tabs directly after the marker run
    spaces_after: &'a str,
    /// Everything that follows
    content: &'a str,
}

/// Split `line` into indent, `>` markers, trailing whitespace and content.
///
/// Returns `None` when the first character after the leading spaces/tabs is not `>`
/// (which includes empty and whitespace-only lines). Only one contiguous run of `>`
/// is treated as markers; a later `>` separated by a space ends up in `content`.
#[inline]
fn parse_blockquote_detailed(line: &str) -> Option<BlockquoteComponents<'_>> {
    fn is_space_or_tab(c: char) -> bool {
        matches!(c, ' ' | '\t')
    }

    // Peel the line apart from the left; each value is what remains after one component.
    let after_indent = line.trim_start_matches(is_space_or_tab);
    let after_markers = after_indent.trim_start_matches('>');
    if after_markers.len() == after_indent.len() {
        // Nothing was stripped, so there is no `>` marker here.
        return None;
    }
    let content = after_markers.trim_start_matches(is_space_or_tab);

    // Convert the remaining lengths back into boundaries within `line`.
    let indent_end = line.len() - after_indent.len();
    let markers_end = line.len() - after_markers.len();
    let spaces_end = line.len() - content.len();

    Some(BlockquoteComponents {
        indent: &line[..indent_end],
        markers: &line[indent_end..markers_end],
        spaces_after: &line[markers_end..spaces_end],
        content,
    })
}
491
492impl<'a> LintContext<'a> {
493    pub fn new(content: &'a str, flavor: MarkdownFlavor) -> Self {
494        #[cfg(not(target_arch = "wasm32"))]
495        let profile = std::env::var("RUMDL_PROFILE_QUADRATIC").is_ok();
496        #[cfg(target_arch = "wasm32")]
497        let profile = false;
498
499        let line_offsets = profile_section!("Line offsets", profile, {
500            let mut offsets = vec![0];
501            for (i, c) in content.char_indices() {
502                if c == '\n' {
503                    offsets.push(i + 1);
504                }
505            }
506            offsets
507        });
508
509        // Detect code blocks once and cache them
510        let code_blocks = profile_section!("Code blocks", profile, CodeBlockUtils::detect_code_blocks(content));
511
512        // Pre-compute HTML comment ranges ONCE for all operations
513        let html_comment_ranges = profile_section!(
514            "HTML comment ranges",
515            profile,
516            crate::utils::skip_context::compute_html_comment_ranges(content)
517        );
518
519        // Pre-compute autodoc block ranges for MkDocs flavor (avoids O(n²) scaling)
520        let autodoc_ranges = profile_section!("Autodoc block ranges", profile, {
521            if flavor == MarkdownFlavor::MkDocs {
522                crate::utils::mkdocstrings_refs::detect_autodoc_block_ranges(content)
523            } else {
524                Vec::new()
525            }
526        });
527
528        // Pre-compute line information (without headings/blockquotes yet)
529        let mut lines = profile_section!(
530            "Basic line info",
531            profile,
532            Self::compute_basic_line_info(
533                content,
534                &line_offsets,
535                &code_blocks,
536                flavor,
537                &html_comment_ranges,
538                &autodoc_ranges,
539            )
540        );
541
542        // Detect HTML blocks BEFORE heading detection
543        profile_section!("HTML blocks", profile, Self::detect_html_blocks(content, &mut lines));
544
545        // Detect ESM import/export blocks in MDX files BEFORE heading detection
546        profile_section!(
547            "ESM blocks",
548            profile,
549            Self::detect_esm_blocks(content, &mut lines, flavor)
550        );
551
552        // Now detect headings and blockquotes
553        profile_section!(
554            "Headings & blockquotes",
555            profile,
556            Self::detect_headings_and_blockquotes(content, &mut lines, flavor, &html_comment_ranges)
557        );
558
559        // Parse code spans early so we can exclude them from link/image parsing
560        let code_spans = profile_section!("Code spans", profile, Self::parse_code_spans(content, &lines));
561
562        // Mark lines that are continuations of multi-line code spans
563        // This is needed for parse_list_blocks to correctly handle list items with multi-line code spans
564        for span in &code_spans {
565            if span.end_line > span.line {
566                // Mark lines after the first line as continuations
567                for line_num in (span.line + 1)..=span.end_line {
568                    if let Some(line_info) = lines.get_mut(line_num - 1) {
569                        line_info.in_code_span_continuation = true;
570                    }
571                }
572            }
573        }
574
575        // Parse links, images, references, and list blocks
576        let (links, broken_links, footnote_refs) = profile_section!(
577            "Links",
578            profile,
579            Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor, &html_comment_ranges)
580        );
581
582        let images = profile_section!(
583            "Images",
584            profile,
585            Self::parse_images(content, &lines, &code_blocks, &code_spans, &html_comment_ranges)
586        );
587
588        let reference_defs = profile_section!("Reference defs", profile, Self::parse_reference_defs(content, &lines));
589
590        let list_blocks = profile_section!("List blocks", profile, Self::parse_list_blocks(content, &lines));
591
592        // Compute character frequency for fast content analysis
593        let char_frequency = profile_section!("Char frequency", profile, Self::compute_char_frequency(content));
594
595        // Pre-compute table blocks for rules that need them (MD013, MD055, MD056, MD058, MD060)
596        let table_blocks = profile_section!(
597            "Table blocks",
598            profile,
599            crate::utils::table_utils::TableUtils::find_table_blocks_with_code_info(
600                content,
601                &code_blocks,
602                &code_spans,
603                &html_comment_ranges,
604            )
605        );
606
607        // Pre-compute LineIndex once for all rules (eliminates 46x content cloning)
608        let line_index = profile_section!(
609            "Line index",
610            profile,
611            crate::utils::range_utils::LineIndex::new(content)
612        );
613
614        // Pre-compute Jinja template ranges once for all rules (eliminates O(n×m) in MD011)
615        let jinja_ranges = profile_section!(
616            "Jinja ranges",
617            profile,
618            crate::utils::jinja_utils::find_jinja_ranges(content)
619        );
620
621        Self {
622            content,
623            line_offsets,
624            code_blocks,
625            lines,
626            links,
627            images,
628            broken_links,
629            footnote_refs,
630            reference_defs,
631            code_spans_cache: Mutex::new(Some(Arc::new(code_spans))),
632            list_blocks,
633            char_frequency,
634            html_tags_cache: Mutex::new(None),
635            emphasis_spans_cache: Mutex::new(None),
636            table_rows_cache: Mutex::new(None),
637            bare_urls_cache: Mutex::new(None),
638            html_comment_ranges,
639            table_blocks,
640            line_index,
641            jinja_ranges,
642            flavor,
643        }
644    }
645
646    /// Get code spans - computed lazily on first access
647    pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
648        let mut cache = self.code_spans_cache.lock().expect("Code spans cache mutex poisoned");
649
650        Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_code_spans(self.content, &self.lines))))
651    }
652
653    /// Get HTML comment ranges - pre-computed during LintContext construction
654    pub fn html_comment_ranges(&self) -> &[crate::utils::skip_context::ByteRange] {
655        &self.html_comment_ranges
656    }
657
658    /// Get HTML tags - computed lazily on first access
659    pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
660        let mut cache = self.html_tags_cache.lock().expect("HTML tags cache mutex poisoned");
661
662        Arc::clone(cache.get_or_insert_with(|| {
663            Arc::new(Self::parse_html_tags(
664                self.content,
665                &self.lines,
666                &self.code_blocks,
667                self.flavor,
668            ))
669        }))
670    }
671
672    /// Get emphasis spans - computed lazily on first access
673    pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
674        let mut cache = self
675            .emphasis_spans_cache
676            .lock()
677            .expect("Emphasis spans cache mutex poisoned");
678
679        Arc::clone(
680            cache.get_or_insert_with(|| {
681                Arc::new(Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks))
682            }),
683        )
684    }
685
686    /// Get table rows - computed lazily on first access
687    pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
688        let mut cache = self.table_rows_cache.lock().expect("Table rows cache mutex poisoned");
689
690        Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_table_rows(self.content, &self.lines))))
691    }
692
693    /// Get bare URLs - computed lazily on first access
694    pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
695        let mut cache = self.bare_urls_cache.lock().expect("Bare URLs cache mutex poisoned");
696
697        Arc::clone(
698            cache.get_or_insert_with(|| Arc::new(Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks))),
699        )
700    }
701
702    /// Map a byte offset to (line, column)
703    pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
704        match self.line_offsets.binary_search(&offset) {
705            Ok(line) => (line + 1, 1),
706            Err(line) => {
707                let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
708                (line, offset - line_start + 1)
709            }
710        }
711    }
712
713    /// Check if a position is within a code block or code span
714    pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
715        // Check code blocks first
716        if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
717            return true;
718        }
719
720        // Check inline code spans (lazy load if needed)
721        self.code_spans()
722            .iter()
723            .any(|span| pos >= span.byte_offset && pos < span.byte_end)
724    }
725
726    /// Get line information by line number (1-indexed)
727    pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
728        if line_num > 0 {
729            self.lines.get(line_num - 1)
730        } else {
731            None
732        }
733    }
734
735    /// Get byte offset for a line number (1-indexed)
736    pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
737        self.line_info(line_num).map(|info| info.byte_offset)
738    }
739
740    /// Get URL for a reference link/image by its ID
741    pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
742        let normalized_id = ref_id.to_lowercase();
743        self.reference_defs
744            .iter()
745            .find(|def| def.id == normalized_id)
746            .map(|def| def.url.as_str())
747    }
748
749    /// Check if a line is part of a list block
750    pub fn is_in_list_block(&self, line_num: usize) -> bool {
751        self.list_blocks
752            .iter()
753            .any(|block| line_num >= block.start_line && line_num <= block.end_line)
754    }
755
756    /// Get the list block containing a specific line
757    pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
758        self.list_blocks
759            .iter()
760            .find(|block| line_num >= block.start_line && line_num <= block.end_line)
761    }
762
763    // Compatibility methods for DocumentStructure migration
764
765    /// Check if a line is within a code block
766    pub fn is_in_code_block(&self, line_num: usize) -> bool {
767        if line_num == 0 || line_num > self.lines.len() {
768            return false;
769        }
770        self.lines[line_num - 1].in_code_block
771    }
772
773    /// Check if a line is within front matter
774    pub fn is_in_front_matter(&self, line_num: usize) -> bool {
775        if line_num == 0 || line_num > self.lines.len() {
776            return false;
777        }
778        self.lines[line_num - 1].in_front_matter
779    }
780
781    /// Check if a line is within an HTML block
782    pub fn is_in_html_block(&self, line_num: usize) -> bool {
783        if line_num == 0 || line_num > self.lines.len() {
784            return false;
785        }
786        self.lines[line_num - 1].in_html_block
787    }
788
789    /// Check if a line and column is within a code span
790    pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
791        if line_num == 0 || line_num > self.lines.len() {
792            return false;
793        }
794
795        // Use the code spans cache to check
796        // Note: col is 1-indexed from caller, but span.start_col and span.end_col are 0-indexed
797        // Convert col to 0-indexed for comparison
798        let col_0indexed = if col > 0 { col - 1 } else { 0 };
799        let code_spans = self.code_spans();
800        code_spans.iter().any(|span| {
801            // Check if line is within the span's line range
802            if line_num < span.line || line_num > span.end_line {
803                return false;
804            }
805
806            if span.line == span.end_line {
807                // Single-line span: check column bounds
808                col_0indexed >= span.start_col && col_0indexed < span.end_col
809            } else if line_num == span.line {
810                // First line of multi-line span: anything after start_col is in span
811                col_0indexed >= span.start_col
812            } else if line_num == span.end_line {
813                // Last line of multi-line span: anything before end_col is in span
814                col_0indexed < span.end_col
815            } else {
816                // Middle line of multi-line span: entire line is in span
817                true
818            }
819        })
820    }
821
822    /// Check if a byte offset is within a code span
823    #[inline]
824    pub fn is_byte_offset_in_code_span(&self, byte_offset: usize) -> bool {
825        let code_spans = self.code_spans();
826        code_spans
827            .iter()
828            .any(|span| byte_offset >= span.byte_offset && byte_offset < span.byte_end)
829    }
830
831    /// Check if a byte position is within a reference definition
832    /// This is much faster than scanning the content with regex for each check (O(1) vs O(n))
833    #[inline]
834    pub fn is_in_reference_def(&self, byte_pos: usize) -> bool {
835        self.reference_defs
836            .iter()
837            .any(|ref_def| byte_pos >= ref_def.byte_offset && byte_pos < ref_def.byte_end)
838    }
839
840    /// Check if a byte position is within an HTML comment
841    /// This is much faster than scanning the content with regex for each check (O(k) vs O(n))
842    /// where k is the number of HTML comments (typically very small)
843    #[inline]
844    pub fn is_in_html_comment(&self, byte_pos: usize) -> bool {
845        self.html_comment_ranges
846            .iter()
847            .any(|range| byte_pos >= range.start && byte_pos < range.end)
848    }
849
850    /// Check if a byte position is within a Jinja template ({{ }} or {% %})
851    pub fn is_in_jinja_range(&self, byte_pos: usize) -> bool {
852        self.jinja_ranges
853            .iter()
854            .any(|(start, end)| byte_pos >= *start && byte_pos < *end)
855    }
856
857    /// Check if content has any instances of a specific character (fast)
858    pub fn has_char(&self, ch: char) -> bool {
859        match ch {
860            '#' => self.char_frequency.hash_count > 0,
861            '*' => self.char_frequency.asterisk_count > 0,
862            '_' => self.char_frequency.underscore_count > 0,
863            '-' => self.char_frequency.hyphen_count > 0,
864            '+' => self.char_frequency.plus_count > 0,
865            '>' => self.char_frequency.gt_count > 0,
866            '|' => self.char_frequency.pipe_count > 0,
867            '[' => self.char_frequency.bracket_count > 0,
868            '`' => self.char_frequency.backtick_count > 0,
869            '<' => self.char_frequency.lt_count > 0,
870            '!' => self.char_frequency.exclamation_count > 0,
871            '\n' => self.char_frequency.newline_count > 0,
872            _ => self.content.contains(ch), // Fallback for other characters
873        }
874    }
875
876    /// Get count of a specific character (fast)
877    pub fn char_count(&self, ch: char) -> usize {
878        match ch {
879            '#' => self.char_frequency.hash_count,
880            '*' => self.char_frequency.asterisk_count,
881            '_' => self.char_frequency.underscore_count,
882            '-' => self.char_frequency.hyphen_count,
883            '+' => self.char_frequency.plus_count,
884            '>' => self.char_frequency.gt_count,
885            '|' => self.char_frequency.pipe_count,
886            '[' => self.char_frequency.bracket_count,
887            '`' => self.char_frequency.backtick_count,
888            '<' => self.char_frequency.lt_count,
889            '!' => self.char_frequency.exclamation_count,
890            '\n' => self.char_frequency.newline_count,
891            _ => self.content.matches(ch).count(), // Fallback for other characters
892        }
893    }
894
895    /// Check if content likely contains headings (fast)
896    pub fn likely_has_headings(&self) -> bool {
897        self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 // Potential setext underlines
898    }
899
900    /// Check if content likely contains lists (fast)
901    pub fn likely_has_lists(&self) -> bool {
902        self.char_frequency.asterisk_count > 0
903            || self.char_frequency.hyphen_count > 0
904            || self.char_frequency.plus_count > 0
905    }
906
907    /// Check if content likely contains emphasis (fast)
908    pub fn likely_has_emphasis(&self) -> bool {
909        self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
910    }
911
912    /// Check if content likely contains tables (fast)
913    pub fn likely_has_tables(&self) -> bool {
914        self.char_frequency.pipe_count > 2
915    }
916
917    /// Check if content likely contains blockquotes (fast)
918    pub fn likely_has_blockquotes(&self) -> bool {
919        self.char_frequency.gt_count > 0
920    }
921
922    /// Check if content likely contains code (fast)
923    pub fn likely_has_code(&self) -> bool {
924        self.char_frequency.backtick_count > 0
925    }
926
927    /// Check if content likely contains links or images (fast)
928    pub fn likely_has_links_or_images(&self) -> bool {
929        self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
930    }
931
932    /// Check if content likely contains HTML (fast)
933    pub fn likely_has_html(&self) -> bool {
934        self.char_frequency.lt_count > 0
935    }
936
937    /// Get HTML tags on a specific line
938    pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
939        self.html_tags()
940            .iter()
941            .filter(|tag| tag.line == line_num)
942            .cloned()
943            .collect()
944    }
945
946    /// Get emphasis spans on a specific line
947    pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
948        self.emphasis_spans()
949            .iter()
950            .filter(|span| span.line == line_num)
951            .cloned()
952            .collect()
953    }
954
955    /// Get table rows on a specific line
956    pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
957        self.table_rows()
958            .iter()
959            .filter(|row| row.line == line_num)
960            .cloned()
961            .collect()
962    }
963
964    /// Get bare URLs on a specific line
965    pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
966        self.bare_urls()
967            .iter()
968            .filter(|url| url.line == line_num)
969            .cloned()
970            .collect()
971    }
972
973    /// Find the line index for a given byte offset using binary search.
974    /// Returns (line_index, line_number, column) where:
975    /// - line_index is the 0-based index in the lines array
976    /// - line_number is the 1-based line number
977    /// - column is the byte offset within that line
978    #[inline]
979    fn find_line_for_offset(lines: &[LineInfo], byte_offset: usize) -> (usize, usize, usize) {
980        // Binary search to find the line containing this byte offset
981        let idx = match lines.binary_search_by(|line| {
982            if byte_offset < line.byte_offset {
983                std::cmp::Ordering::Greater
984            } else if byte_offset > line.byte_offset + line.byte_len {
985                std::cmp::Ordering::Less
986            } else {
987                std::cmp::Ordering::Equal
988            }
989        }) {
990            Ok(idx) => idx,
991            Err(idx) => idx.saturating_sub(1),
992        };
993
994        let line = &lines[idx];
995        let line_num = idx + 1;
996        let col = byte_offset.saturating_sub(line.byte_offset);
997
998        (idx, line_num, col)
999    }
1000
1001    /// Check if a byte offset is within a code span using binary search
1002    #[inline]
1003    fn is_offset_in_code_span(code_spans: &[CodeSpan], offset: usize) -> bool {
1004        // Since spans are sorted by byte_offset, use partition_point for binary search
1005        let idx = code_spans.partition_point(|span| span.byte_offset <= offset);
1006
1007        // Check the span that starts at or before our offset
1008        if idx > 0 {
1009            let span = &code_spans[idx - 1];
1010            if offset >= span.byte_offset && offset < span.byte_end {
1011                return true;
1012            }
1013        }
1014
1015        false
1016    }
1017
1018    /// Parse all links in the content
1019    fn parse_links(
1020        content: &'a str,
1021        lines: &[LineInfo],
1022        code_blocks: &[(usize, usize)],
1023        code_spans: &[CodeSpan],
1024        flavor: MarkdownFlavor,
1025        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1026    ) -> (Vec<ParsedLink<'a>>, Vec<BrokenLinkInfo>, Vec<FootnoteRef>) {
1027        use crate::utils::skip_context::{is_in_html_comment_ranges, is_mkdocs_snippet_line};
1028        use std::collections::HashSet;
1029
1030        let mut links = Vec::with_capacity(content.len() / 500);
1031        let mut broken_links = Vec::new();
1032        let mut footnote_refs = Vec::new();
1033
1034        // Track byte positions of links found by pulldown-cmark
1035        let mut found_positions = HashSet::new();
1036
1037        // Use pulldown-cmark's streaming parser with BrokenLink callback
1038        // The callback captures undefined references: [text][undefined], [shortcut], [text][]
1039        // This automatically handles:
1040        // - Escaped links (won't generate events)
1041        // - Links in code blocks/spans (won't generate Link events)
1042        // - Images (generates Tag::Image instead)
1043        // - Reference resolution (dest_url is already resolved!)
1044        // - Broken references (callback is invoked)
1045        // - Wiki-links (enabled via ENABLE_WIKILINKS)
1046        let mut options = Options::empty();
1047        options.insert(Options::ENABLE_WIKILINKS);
1048        options.insert(Options::ENABLE_FOOTNOTES);
1049
1050        let parser = Parser::new_with_broken_link_callback(
1051            content,
1052            options,
1053            Some(|link: BrokenLink<'_>| {
1054                broken_links.push(BrokenLinkInfo {
1055                    reference: link.reference.to_string(),
1056                    span: link.span.clone(),
1057                });
1058                None
1059            }),
1060        )
1061        .into_offset_iter();
1062
1063        let mut link_stack: Vec<(
1064            usize,
1065            usize,
1066            pulldown_cmark::CowStr<'a>,
1067            LinkType,
1068            pulldown_cmark::CowStr<'a>,
1069        )> = Vec::new();
1070        let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1071
1072        for (event, range) in parser {
1073            match event {
1074                Event::Start(Tag::Link {
1075                    link_type,
1076                    dest_url,
1077                    id,
1078                    ..
1079                }) => {
1080                    // Link start - record position, URL, and reference ID
1081                    link_stack.push((range.start, range.end, dest_url, link_type, id));
1082                    text_chunks.clear();
1083                }
1084                Event::Text(text) if !link_stack.is_empty() => {
1085                    // Track text content with its byte range
1086                    text_chunks.push((text.to_string(), range.start, range.end));
1087                }
1088                Event::Code(code) if !link_stack.is_empty() => {
1089                    // Include inline code in link text (with backticks)
1090                    let code_text = format!("`{code}`");
1091                    text_chunks.push((code_text, range.start, range.end));
1092                }
1093                Event::End(TagEnd::Link) => {
1094                    if let Some((start_pos, _link_start_end, url, link_type, ref_id)) = link_stack.pop() {
1095                        // Skip if in HTML comment
1096                        if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1097                            text_chunks.clear();
1098                            continue;
1099                        }
1100
1101                        // Find line and column information
1102                        let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1103
1104                        // Skip if this link is on a MkDocs snippet line
1105                        if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1106                            text_chunks.clear();
1107                            continue;
1108                        }
1109
1110                        let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1111
1112                        let is_reference = matches!(
1113                            link_type,
1114                            LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1115                        );
1116
1117                        // Extract link text directly from source bytes to preserve escaping
1118                        // Text events from pulldown-cmark unescape \] → ], which breaks MD039
1119                        let link_text = if start_pos < content.len() {
1120                            let link_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1121
1122                            // Find MATCHING ] by tracking bracket depth for nested brackets
1123                            // An unescaped bracket is one NOT preceded by an odd number of backslashes
1124                            // Brackets inside code spans (between backticks) should be ignored
1125                            let mut close_pos = None;
1126                            let mut depth = 0;
1127                            let mut in_code_span = false;
1128
1129                            for (i, &byte) in link_bytes.iter().enumerate().skip(1) {
1130                                // Count preceding backslashes
1131                                let mut backslash_count = 0;
1132                                let mut j = i;
1133                                while j > 0 && link_bytes[j - 1] == b'\\' {
1134                                    backslash_count += 1;
1135                                    j -= 1;
1136                                }
1137                                let is_escaped = backslash_count % 2 != 0;
1138
1139                                // Track code spans - backticks toggle in/out of code
1140                                if byte == b'`' && !is_escaped {
1141                                    in_code_span = !in_code_span;
1142                                }
1143
1144                                // Only count brackets when NOT in a code span
1145                                if !is_escaped && !in_code_span {
1146                                    if byte == b'[' {
1147                                        depth += 1;
1148                                    } else if byte == b']' {
1149                                        if depth == 0 {
1150                                            // Found the matching closing bracket
1151                                            close_pos = Some(i);
1152                                            break;
1153                                        } else {
1154                                            depth -= 1;
1155                                        }
1156                                    }
1157                                }
1158                            }
1159
1160                            if let Some(pos) = close_pos {
1161                                Cow::Borrowed(std::str::from_utf8(&link_bytes[1..pos]).unwrap_or(""))
1162                            } else {
1163                                Cow::Borrowed("")
1164                            }
1165                        } else {
1166                            Cow::Borrowed("")
1167                        };
1168
1169                        // For reference links, use the actual reference ID from pulldown-cmark
1170                        let reference_id = if is_reference && !ref_id.is_empty() {
1171                            Some(Cow::Owned(ref_id.to_lowercase()))
1172                        } else if is_reference {
1173                            // For collapsed/shortcut references without explicit ID, use the link text
1174                            Some(Cow::Owned(link_text.to_lowercase()))
1175                        } else {
1176                            None
1177                        };
1178
1179                        // WORKAROUND: pulldown-cmark bug with escaped brackets
1180                        // Check for escaped image syntax: \![text](url)
1181                        // The byte_offset points to the '[', so we check 2 bytes back for '\!'
1182                        let has_escaped_bang = start_pos >= 2
1183                            && content.as_bytes().get(start_pos - 2) == Some(&b'\\')
1184                            && content.as_bytes().get(start_pos - 1) == Some(&b'!');
1185
1186                        // Check for escaped bracket: \[text](url)
1187                        // The byte_offset points to the '[', so we check 1 byte back for '\'
1188                        let has_escaped_bracket =
1189                            start_pos >= 1 && content.as_bytes().get(start_pos - 1) == Some(&b'\\');
1190
1191                        if has_escaped_bang || has_escaped_bracket {
1192                            text_chunks.clear();
1193                            continue; // Skip: this is escaped markdown, not a real link
1194                        }
1195
1196                        // Track this position as found
1197                        found_positions.insert(start_pos);
1198
1199                        links.push(ParsedLink {
1200                            line: line_num,
1201                            start_col: col_start,
1202                            end_col: col_end,
1203                            byte_offset: start_pos,
1204                            byte_end: range.end,
1205                            text: link_text,
1206                            url: Cow::Owned(url.to_string()),
1207                            is_reference,
1208                            reference_id,
1209                            link_type,
1210                        });
1211
1212                        text_chunks.clear();
1213                    }
1214                }
1215                Event::FootnoteReference(footnote_id) => {
1216                    // Capture footnote references like [^1], [^note]
1217                    // Skip if in HTML comment
1218                    if is_in_html_comment_ranges(html_comment_ranges, range.start) {
1219                        continue;
1220                    }
1221
1222                    let (_, line_num, _) = Self::find_line_for_offset(lines, range.start);
1223                    footnote_refs.push(FootnoteRef {
1224                        id: footnote_id.to_string(),
1225                        line: line_num,
1226                        byte_offset: range.start,
1227                        byte_end: range.end,
1228                    });
1229                }
1230                _ => {}
1231            }
1232        }
1233
1234        // Also find undefined references using regex
1235        // These are patterns like [text][ref] that pulldown-cmark didn't parse as links
1236        // because the reference is undefined
1237        for cap in LINK_PATTERN.captures_iter(content) {
1238            let full_match = cap.get(0).unwrap();
1239            let match_start = full_match.start();
1240            let match_end = full_match.end();
1241
1242            // Skip if this was already found by pulldown-cmark (it's a valid link)
1243            if found_positions.contains(&match_start) {
1244                continue;
1245            }
1246
1247            // Skip if escaped
1248            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1249                continue;
1250            }
1251
1252            // Skip if it's an image
1253            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
1254                continue;
1255            }
1256
1257            // Skip if in code block
1258            if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
1259                continue;
1260            }
1261
1262            // Skip if in code span
1263            if Self::is_offset_in_code_span(code_spans, match_start) {
1264                continue;
1265            }
1266
1267            // Skip if in HTML comment
1268            if is_in_html_comment_ranges(html_comment_ranges, match_start) {
1269                continue;
1270            }
1271
1272            // Find line and column information
1273            let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1274
1275            // Skip if this link is on a MkDocs snippet line
1276            if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1277                continue;
1278            }
1279
1280            let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1281
1282            let text = cap.get(1).map_or("", |m| m.as_str());
1283
1284            // Only process reference links (group 6)
1285            if let Some(ref_id) = cap.get(6) {
1286                let ref_id_str = ref_id.as_str();
1287                let normalized_ref = if ref_id_str.is_empty() {
1288                    Cow::Owned(text.to_lowercase()) // Implicit reference
1289                } else {
1290                    Cow::Owned(ref_id_str.to_lowercase())
1291                };
1292
1293                // This is an undefined reference (pulldown-cmark didn't parse it)
1294                links.push(ParsedLink {
1295                    line: line_num,
1296                    start_col: col_start,
1297                    end_col: col_end,
1298                    byte_offset: match_start,
1299                    byte_end: match_end,
1300                    text: Cow::Borrowed(text),
1301                    url: Cow::Borrowed(""), // Empty URL indicates undefined reference
1302                    is_reference: true,
1303                    reference_id: Some(normalized_ref),
1304                    link_type: LinkType::Reference, // Undefined references are reference-style
1305                });
1306            }
1307        }
1308
1309        (links, broken_links, footnote_refs)
1310    }
1311
1312    /// Parse all images in the content
1313    fn parse_images(
1314        content: &'a str,
1315        lines: &[LineInfo],
1316        code_blocks: &[(usize, usize)],
1317        code_spans: &[CodeSpan],
1318        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1319    ) -> Vec<ParsedImage<'a>> {
1320        use crate::utils::skip_context::is_in_html_comment_ranges;
1321        use std::collections::HashSet;
1322
1323        // Pre-size based on a heuristic: images are less common than links
1324        let mut images = Vec::with_capacity(content.len() / 1000);
1325        let mut found_positions = HashSet::new();
1326
1327        // Use pulldown-cmark for parsing - more accurate and faster
1328        let parser = Parser::new(content).into_offset_iter();
1329        let mut image_stack: Vec<(usize, pulldown_cmark::CowStr<'a>, LinkType, pulldown_cmark::CowStr<'a>)> =
1330            Vec::new();
1331        let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1332
1333        for (event, range) in parser {
1334            match event {
1335                Event::Start(Tag::Image {
1336                    link_type,
1337                    dest_url,
1338                    id,
1339                    ..
1340                }) => {
1341                    image_stack.push((range.start, dest_url, link_type, id));
1342                    text_chunks.clear();
1343                }
1344                Event::Text(text) if !image_stack.is_empty() => {
1345                    text_chunks.push((text.to_string(), range.start, range.end));
1346                }
1347                Event::Code(code) if !image_stack.is_empty() => {
1348                    let code_text = format!("`{code}`");
1349                    text_chunks.push((code_text, range.start, range.end));
1350                }
1351                Event::End(TagEnd::Image) => {
1352                    if let Some((start_pos, url, link_type, ref_id)) = image_stack.pop() {
1353                        // Skip if in code block
1354                        if CodeBlockUtils::is_in_code_block(code_blocks, start_pos) {
1355                            continue;
1356                        }
1357
1358                        // Skip if in code span
1359                        if Self::is_offset_in_code_span(code_spans, start_pos) {
1360                            continue;
1361                        }
1362
1363                        // Skip if in HTML comment
1364                        if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1365                            continue;
1366                        }
1367
1368                        // Find line and column using binary search
1369                        let (_, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1370                        let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1371
1372                        let is_reference = matches!(
1373                            link_type,
1374                            LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1375                        );
1376
1377                        // Extract alt text directly from source bytes to preserve escaping
1378                        // Text events from pulldown-cmark unescape \] → ], which breaks rules that need escaping
1379                        let alt_text = if start_pos < content.len() {
1380                            let image_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1381
1382                            // Find MATCHING ] by tracking bracket depth for nested brackets
1383                            // An unescaped bracket is one NOT preceded by an odd number of backslashes
1384                            let mut close_pos = None;
1385                            let mut depth = 0;
1386
1387                            if image_bytes.len() > 2 {
1388                                for (i, &byte) in image_bytes.iter().enumerate().skip(2) {
1389                                    // Count preceding backslashes
1390                                    let mut backslash_count = 0;
1391                                    let mut j = i;
1392                                    while j > 0 && image_bytes[j - 1] == b'\\' {
1393                                        backslash_count += 1;
1394                                        j -= 1;
1395                                    }
1396                                    let is_escaped = backslash_count % 2 != 0;
1397
1398                                    if !is_escaped {
1399                                        if byte == b'[' {
1400                                            depth += 1;
1401                                        } else if byte == b']' {
1402                                            if depth == 0 {
1403                                                // Found the matching closing bracket
1404                                                close_pos = Some(i);
1405                                                break;
1406                                            } else {
1407                                                depth -= 1;
1408                                            }
1409                                        }
1410                                    }
1411                                }
1412                            }
1413
1414                            if let Some(pos) = close_pos {
1415                                Cow::Borrowed(std::str::from_utf8(&image_bytes[2..pos]).unwrap_or(""))
1416                            } else {
1417                                Cow::Borrowed("")
1418                            }
1419                        } else {
1420                            Cow::Borrowed("")
1421                        };
1422
1423                        let reference_id = if is_reference && !ref_id.is_empty() {
1424                            Some(Cow::Owned(ref_id.to_lowercase()))
1425                        } else if is_reference {
1426                            Some(Cow::Owned(alt_text.to_lowercase())) // Collapsed/shortcut references
1427                        } else {
1428                            None
1429                        };
1430
1431                        found_positions.insert(start_pos);
1432                        images.push(ParsedImage {
1433                            line: line_num,
1434                            start_col: col_start,
1435                            end_col: col_end,
1436                            byte_offset: start_pos,
1437                            byte_end: range.end,
1438                            alt_text,
1439                            url: Cow::Owned(url.to_string()),
1440                            is_reference,
1441                            reference_id,
1442                            link_type,
1443                        });
1444                    }
1445                }
1446                _ => {}
1447            }
1448        }
1449
1450        // Regex fallback for undefined references that pulldown-cmark treats as plain text
1451        for cap in IMAGE_PATTERN.captures_iter(content) {
1452            let full_match = cap.get(0).unwrap();
1453            let match_start = full_match.start();
1454            let match_end = full_match.end();
1455
1456            // Skip if already found by pulldown-cmark
1457            if found_positions.contains(&match_start) {
1458                continue;
1459            }
1460
1461            // Skip if the ! is escaped
1462            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1463                continue;
1464            }
1465
1466            // Skip if in code block, code span, or HTML comment
1467            if CodeBlockUtils::is_in_code_block(code_blocks, match_start)
1468                || Self::is_offset_in_code_span(code_spans, match_start)
1469                || is_in_html_comment_ranges(html_comment_ranges, match_start)
1470            {
1471                continue;
1472            }
1473
1474            // Only process reference images (undefined references not found by pulldown-cmark)
1475            if let Some(ref_id) = cap.get(6) {
1476                let (_, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1477                let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1478                let alt_text = cap.get(1).map_or("", |m| m.as_str());
1479                let ref_id_str = ref_id.as_str();
1480                let normalized_ref = if ref_id_str.is_empty() {
1481                    Cow::Owned(alt_text.to_lowercase())
1482                } else {
1483                    Cow::Owned(ref_id_str.to_lowercase())
1484                };
1485
1486                images.push(ParsedImage {
1487                    line: line_num,
1488                    start_col: col_start,
1489                    end_col: col_end,
1490                    byte_offset: match_start,
1491                    byte_end: match_end,
1492                    alt_text: Cow::Borrowed(alt_text),
1493                    url: Cow::Borrowed(""),
1494                    is_reference: true,
1495                    reference_id: Some(normalized_ref),
1496                    link_type: LinkType::Reference, // Undefined references are reference-style
1497                });
1498            }
1499        }
1500
1501        images
1502    }
1503
1504    /// Parse reference definitions
1505    fn parse_reference_defs(content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
1506        // Pre-size based on lines count as reference definitions are line-based
1507        let mut refs = Vec::with_capacity(lines.len() / 20); // ~1 ref per 20 lines
1508
1509        for (line_idx, line_info) in lines.iter().enumerate() {
1510            // Skip lines in code blocks
1511            if line_info.in_code_block {
1512                continue;
1513            }
1514
1515            let line = line_info.content(content);
1516            let line_num = line_idx + 1;
1517
1518            if let Some(cap) = REF_DEF_PATTERN.captures(line) {
1519                let id = cap.get(1).unwrap().as_str().to_lowercase();
1520                let url = cap.get(2).unwrap().as_str().to_string();
1521                let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
1522
1523                // Calculate byte positions
1524                // The match starts at the beginning of the line (0) and extends to the end
1525                let match_obj = cap.get(0).unwrap();
1526                let byte_offset = line_info.byte_offset + match_obj.start();
1527                let byte_end = line_info.byte_offset + match_obj.end();
1528
1529                refs.push(ReferenceDef {
1530                    line: line_num,
1531                    id,
1532                    url,
1533                    title,
1534                    byte_offset,
1535                    byte_end,
1536                });
1537            }
1538        }
1539
1540        refs
1541    }
1542
1543    /// Fast blockquote prefix parser - replaces regex for 5-10x speedup
1544    /// Matches: ^(\s*>\s*)(.*)
1545    /// Returns: Some((prefix_with_ws, content_after_prefix)) or None
1546    #[inline]
1547    fn parse_blockquote_prefix(line: &str) -> Option<(&str, &str)> {
1548        let trimmed_start = line.trim_start();
1549        if !trimmed_start.starts_with('>') {
1550            return None;
1551        }
1552
1553        let leading_ws_len = line.len() - trimmed_start.len();
1554        let after_gt = &trimmed_start[1..];
1555        let content = after_gt.trim_start();
1556        let ws_after_gt_len = after_gt.len() - content.len();
1557        let prefix_len = leading_ws_len + 1 + ws_after_gt_len;
1558
1559        Some((&line[..prefix_len], content))
1560    }
1561
1562    /// Fast unordered list parser - replaces regex for 5-10x speedup
1563    /// Matches: ^(\s*)([-*+])([ \t]*)(.*)
1564    /// Returns: Some((leading_ws, marker, spacing, content)) or None
1565    #[inline]
1566    fn parse_unordered_list(line: &str) -> Option<(&str, char, &str, &str)> {
1567        let bytes = line.as_bytes();
1568        let mut i = 0;
1569
1570        // Skip leading whitespace
1571        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1572            i += 1;
1573        }
1574
1575        // Check for marker
1576        if i >= bytes.len() {
1577            return None;
1578        }
1579        let marker = bytes[i] as char;
1580        if marker != '-' && marker != '*' && marker != '+' {
1581            return None;
1582        }
1583        let marker_pos = i;
1584        i += 1;
1585
1586        // Collect spacing after marker (space or tab only)
1587        let spacing_start = i;
1588        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1589            i += 1;
1590        }
1591
1592        Some((&line[..marker_pos], marker, &line[spacing_start..i], &line[i..]))
1593    }
1594
1595    /// Fast ordered list parser - replaces regex for 5-10x speedup
1596    /// Matches: ^(\s*)(\d+)([.)])([ \t]*)(.*)
1597    /// Returns: Some((leading_ws, number_str, delimiter, spacing, content)) or None
1598    #[inline]
1599    fn parse_ordered_list(line: &str) -> Option<(&str, &str, char, &str, &str)> {
1600        let bytes = line.as_bytes();
1601        let mut i = 0;
1602
1603        // Skip leading whitespace
1604        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1605            i += 1;
1606        }
1607
1608        // Collect digits
1609        let number_start = i;
1610        while i < bytes.len() && bytes[i].is_ascii_digit() {
1611            i += 1;
1612        }
1613        if i == number_start {
1614            return None; // No digits found
1615        }
1616
1617        // Check for delimiter
1618        if i >= bytes.len() {
1619            return None;
1620        }
1621        let delimiter = bytes[i] as char;
1622        if delimiter != '.' && delimiter != ')' {
1623            return None;
1624        }
1625        let delimiter_pos = i;
1626        i += 1;
1627
1628        // Collect spacing after delimiter (space or tab only)
1629        let spacing_start = i;
1630        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1631            i += 1;
1632        }
1633
1634        Some((
1635            &line[..number_start],
1636            &line[number_start..delimiter_pos],
1637            delimiter,
1638            &line[spacing_start..i],
1639            &line[i..],
1640        ))
1641    }
1642
1643    /// Pre-compute which lines are in code blocks - O(m*n) where m=code_blocks, n=lines
1644    /// Returns a Vec<bool> where index i indicates if line i is in a code block
1645    fn compute_code_block_line_map(content: &str, line_offsets: &[usize], code_blocks: &[(usize, usize)]) -> Vec<bool> {
1646        let num_lines = line_offsets.len();
1647        let mut in_code_block = vec![false; num_lines];
1648
1649        // For each code block, mark all lines within it
1650        for &(start, end) in code_blocks {
1651            // Ensure we're at valid UTF-8 boundaries
1652            let safe_start = if start > 0 && !content.is_char_boundary(start) {
1653                let mut boundary = start;
1654                while boundary > 0 && !content.is_char_boundary(boundary) {
1655                    boundary -= 1;
1656                }
1657                boundary
1658            } else {
1659                start
1660            };
1661
1662            let safe_end = if end < content.len() && !content.is_char_boundary(end) {
1663                let mut boundary = end;
1664                while boundary < content.len() && !content.is_char_boundary(boundary) {
1665                    boundary += 1;
1666                }
1667                boundary
1668            } else {
1669                end.min(content.len())
1670            };
1671
1672            // Trust the code blocks detected by CodeBlockUtils::detect_code_blocks()
1673            // That function now has proper list context awareness (see code_block_utils.rs)
1674            // and correctly distinguishes between:
1675            // - Fenced code blocks (``` or ~~~)
1676            // - Indented code blocks at document level (4 spaces + blank line before)
1677            // - List continuation paragraphs (NOT code blocks, even with 4 spaces)
1678            //
1679            // We no longer need to re-validate here. The original validation logic
1680            // was causing false positives by marking list continuation paragraphs as
1681            // code blocks when they have 4 spaces of indentation.
1682
1683            // Use binary search to find the first and last line indices
1684            // line_offsets is sorted, so we can use partition_point for O(log n) lookup
1685            // Use safe_start/safe_end (UTF-8 boundaries) for consistent line mapping
1686            //
1687            // Find the line that CONTAINS safe_start: the line with the largest
1688            // start offset that is <= safe_start. partition_point gives us the
1689            // first line that starts AFTER safe_start, so we subtract 1.
1690            let first_line_after = line_offsets.partition_point(|&offset| offset <= safe_start);
1691            let first_line = first_line_after.saturating_sub(1);
1692            let last_line = line_offsets.partition_point(|&offset| offset < safe_end);
1693
1694            // Mark all lines in the range at once
1695            for flag in in_code_block.iter_mut().take(last_line).skip(first_line) {
1696                *flag = true;
1697            }
1698        }
1699
1700        in_code_block
1701    }
1702
1703    /// Pre-compute basic line information (without headings/blockquotes)
1704    fn compute_basic_line_info(
1705        content: &str,
1706        line_offsets: &[usize],
1707        code_blocks: &[(usize, usize)],
1708        flavor: MarkdownFlavor,
1709        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1710        autodoc_ranges: &[crate::utils::skip_context::ByteRange],
1711    ) -> Vec<LineInfo> {
1712        let content_lines: Vec<&str> = content.lines().collect();
1713        let mut lines = Vec::with_capacity(content_lines.len());
1714
1715        // Pre-compute which lines are in code blocks
1716        let code_block_map = Self::compute_code_block_line_map(content, line_offsets, code_blocks);
1717
1718        // Detect front matter boundaries FIRST, before any other parsing
1719        // Use FrontMatterUtils to detect all types of front matter (YAML, TOML, JSON, malformed)
1720        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1721
1722        for (i, line) in content_lines.iter().enumerate() {
1723            let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
1724            let indent = line.len() - line.trim_start().len();
1725
1726            // Parse blockquote prefix once and reuse it (avoid redundant parsing)
1727            let blockquote_parse = Self::parse_blockquote_prefix(line);
1728
1729            // For blank detection, consider blockquote context
1730            let is_blank = if let Some((_, content)) = blockquote_parse {
1731                // In blockquote context, check if content after prefix is blank
1732                content.trim().is_empty()
1733            } else {
1734                line.trim().is_empty()
1735            };
1736
1737            // Use pre-computed map for O(1) lookup instead of O(m) iteration
1738            let in_code_block = code_block_map.get(i).copied().unwrap_or(false);
1739
1740            // Detect list items (skip if in frontmatter, in mkdocstrings block, or in HTML comment)
1741            let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
1742                && crate::utils::mkdocstrings_refs::is_within_autodoc_block_ranges(autodoc_ranges, byte_offset);
1743            // Use pre-computed ranges for efficiency (O(log n) vs O(file_size))
1744            let in_html_comment =
1745                crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, byte_offset);
1746            let list_item = if !(in_code_block
1747                || is_blank
1748                || in_mkdocstrings
1749                || in_html_comment
1750                || (front_matter_end > 0 && i < front_matter_end))
1751            {
1752                // Strip blockquote prefix if present for list detection (reuse cached result)
1753                let (line_for_list_check, blockquote_prefix_len) = if let Some((prefix, content)) = blockquote_parse {
1754                    (content, prefix.len())
1755                } else {
1756                    (&**line, 0)
1757                };
1758
1759                if let Some((leading_spaces, marker, spacing, _content)) =
1760                    Self::parse_unordered_list(line_for_list_check)
1761                {
1762                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1763                    let content_column = marker_column + 1 + spacing.len();
1764
1765                    // According to CommonMark spec, unordered list items MUST have at least one space
1766                    // after the marker (-, *, or +). Without a space, it's not a list item.
1767                    // This also naturally handles cases like:
1768                    // - *emphasis* (not a list)
1769                    // - **bold** (not a list)
1770                    // - --- (horizontal rule, not a list)
1771                    if spacing.is_empty() {
1772                        None
1773                    } else {
1774                        Some(ListItemInfo {
1775                            marker: marker.to_string(),
1776                            is_ordered: false,
1777                            number: None,
1778                            marker_column,
1779                            content_column,
1780                        })
1781                    }
1782                } else if let Some((leading_spaces, number_str, delimiter, spacing, _content)) =
1783                    Self::parse_ordered_list(line_for_list_check)
1784                {
1785                    let marker = format!("{number_str}{delimiter}");
1786                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1787                    let content_column = marker_column + marker.len() + spacing.len();
1788
1789                    // According to CommonMark spec, ordered list items MUST have at least one space
1790                    // after the marker (period or parenthesis). Without a space, it's not a list item.
1791                    if spacing.is_empty() {
1792                        None
1793                    } else {
1794                        Some(ListItemInfo {
1795                            marker,
1796                            is_ordered: true,
1797                            number: number_str.parse().ok(),
1798                            marker_column,
1799                            content_column,
1800                        })
1801                    }
1802                } else {
1803                    None
1804                }
1805            } else {
1806                None
1807            };
1808
1809            lines.push(LineInfo {
1810                byte_offset,
1811                byte_len: line.len(),
1812                indent,
1813                is_blank,
1814                in_code_block,
1815                in_front_matter: front_matter_end > 0 && i < front_matter_end,
1816                in_html_block: false, // Will be populated after line creation
1817                in_html_comment,
1818                list_item,
1819                heading: None,    // Will be populated in second pass for Setext headings
1820                blockquote: None, // Will be populated after line creation
1821                in_mkdocstrings,
1822                in_esm_block: false, // Will be populated after line creation for MDX files
1823                in_code_span_continuation: false, // Will be populated after code spans are parsed
1824            });
1825        }
1826
1827        lines
1828    }
1829
1830    /// Detect headings and blockquotes (called after HTML block detection)
1831    fn detect_headings_and_blockquotes(
1832        content: &str,
1833        lines: &mut [LineInfo],
1834        flavor: MarkdownFlavor,
1835        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1836    ) {
1837        // Regex for heading detection
1838        static ATX_HEADING_REGEX: LazyLock<regex::Regex> =
1839            LazyLock::new(|| regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap());
1840        static SETEXT_UNDERLINE_REGEX: LazyLock<regex::Regex> =
1841            LazyLock::new(|| regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap());
1842
1843        let content_lines: Vec<&str> = content.lines().collect();
1844
1845        // Detect front matter boundaries to skip those lines
1846        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1847
1848        // Detect headings (including Setext which needs look-ahead) and blockquotes
1849        for i in 0..lines.len() {
1850            if lines[i].in_code_block {
1851                continue;
1852            }
1853
1854            // Skip lines in front matter
1855            if front_matter_end > 0 && i < front_matter_end {
1856                continue;
1857            }
1858
1859            // Skip lines in HTML blocks - HTML content should not be parsed as markdown
1860            if lines[i].in_html_block {
1861                continue;
1862            }
1863
1864            let line = content_lines[i];
1865
1866            // Check for blockquotes (even on blank lines within blockquotes)
1867            if let Some(bq) = parse_blockquote_detailed(line) {
1868                let nesting_level = bq.markers.len(); // Each '>' is one level
1869                let marker_column = bq.indent.len();
1870
1871                // Build the prefix (indentation + markers + space)
1872                let prefix = format!("{}{}{}", bq.indent, bq.markers, bq.spaces_after);
1873
1874                // Check for various blockquote issues
1875                let has_no_space = bq.spaces_after.is_empty() && !bq.content.is_empty();
1876                // Only flag multiple literal spaces, not tabs
1877                // Tabs are handled by MD010 (no-hard-tabs), matching markdownlint behavior
1878                let has_multiple_spaces = bq.spaces_after.chars().filter(|&c| c == ' ').count() > 1;
1879
1880                // Check if needs MD028 fix (empty blockquote line without proper spacing)
1881                // MD028 flags empty blockquote lines that don't have a single space after the marker
1882                // Lines like "> " or ">> " are already correct and don't need fixing
1883                let needs_md028_fix = bq.content.is_empty() && bq.spaces_after.is_empty();
1884
1885                lines[i].blockquote = Some(BlockquoteInfo {
1886                    nesting_level,
1887                    indent: bq.indent.to_string(),
1888                    marker_column,
1889                    prefix,
1890                    content: bq.content.to_string(),
1891                    has_no_space_after_marker: has_no_space,
1892                    has_multiple_spaces_after_marker: has_multiple_spaces,
1893                    needs_md028_fix,
1894                });
1895            }
1896
1897            // Skip heading detection for blank lines
1898            if lines[i].is_blank {
1899                continue;
1900            }
1901
1902            // Check for ATX headings (but skip MkDocs snippet lines)
1903            // In MkDocs flavor, lines like "# -8<- [start:name]" are snippet markers, not headings
1904            let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
1905                crate::utils::mkdocs_snippets::is_snippet_section_start(line)
1906                    || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
1907            } else {
1908                false
1909            };
1910
1911            if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1912                // Skip headings inside HTML comments (using pre-computed ranges for efficiency)
1913                if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset) {
1914                    continue;
1915                }
1916                let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1917                let hashes = caps.get(2).map_or("", |m| m.as_str());
1918                let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1919                let rest = caps.get(4).map_or("", |m| m.as_str());
1920
1921                let level = hashes.len() as u8;
1922                let marker_column = leading_spaces.len();
1923
1924                // Check for closing sequence, but handle custom IDs that might come after
1925                let (text, has_closing, closing_seq) = {
1926                    // First check if there's a custom ID at the end
1927                    let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1928                        // Check if this looks like a valid custom ID (ends with })
1929                        if rest[id_start..].trim_end().ends_with('}') {
1930                            // Split off the custom ID
1931                            (&rest[..id_start], &rest[id_start..])
1932                        } else {
1933                            (rest, "")
1934                        }
1935                    } else {
1936                        (rest, "")
1937                    };
1938
1939                    // Now look for closing hashes in the part before the custom ID
1940                    let trimmed_rest = rest_without_id.trim_end();
1941                    if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1942                        // Look for the start of the hash sequence
1943                        let mut start_of_hashes = last_hash_pos;
1944                        while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1945                            start_of_hashes -= 1;
1946                        }
1947
1948                        // Check if there's at least one space before the closing hashes
1949                        let has_space_before = start_of_hashes == 0
1950                            || trimmed_rest
1951                                .chars()
1952                                .nth(start_of_hashes - 1)
1953                                .is_some_and(|c| c.is_whitespace());
1954
1955                        // Check if this is a valid closing sequence (all hashes to end of trimmed part)
1956                        let potential_closing = &trimmed_rest[start_of_hashes..];
1957                        let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1958
1959                        if is_all_hashes && has_space_before {
1960                            // This is a closing sequence
1961                            let closing_hashes = potential_closing.to_string();
1962                            // The text is everything before the closing hashes
1963                            // Don't include the custom ID here - it will be extracted later
1964                            let text_part = if !custom_id_part.is_empty() {
1965                                // If we have a custom ID, append it back to get the full rest
1966                                // This allows the extract_header_id function to handle it properly
1967                                format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1968                            } else {
1969                                rest_without_id[..start_of_hashes].trim_end().to_string()
1970                            };
1971                            (text_part, true, closing_hashes)
1972                        } else {
1973                            // Not a valid closing sequence, return the full content
1974                            (rest.to_string(), false, String::new())
1975                        }
1976                    } else {
1977                        // No hashes found, return the full content
1978                        (rest.to_string(), false, String::new())
1979                    }
1980                };
1981
1982                let content_column = marker_column + hashes.len() + spaces_after.len();
1983
1984                // Extract custom header ID if present
1985                let raw_text = text.trim().to_string();
1986                let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1987
1988                // If no custom ID was found on the header line, check the next line for standalone attr-list
1989                if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1990                    let next_line = content_lines[i + 1];
1991                    if !lines[i + 1].in_code_block
1992                        && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1993                        && let Some(next_line_id) =
1994                            crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1995                    {
1996                        custom_id = Some(next_line_id);
1997                    }
1998                }
1999
2000                lines[i].heading = Some(HeadingInfo {
2001                    level,
2002                    style: HeadingStyle::ATX,
2003                    marker: hashes.to_string(),
2004                    marker_column,
2005                    content_column,
2006                    text: clean_text,
2007                    custom_id,
2008                    raw_text,
2009                    has_closing_sequence: has_closing,
2010                    closing_sequence: closing_seq,
2011                });
2012            }
2013            // Check for Setext headings (need to look at next line)
2014            else if i + 1 < content_lines.len() && i + 1 < lines.len() {
2015                let next_line = content_lines[i + 1];
2016                if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
2017                    // Skip if next line is front matter delimiter
2018                    if front_matter_end > 0 && i < front_matter_end {
2019                        continue;
2020                    }
2021
2022                    // Skip Setext headings inside HTML comments (using pre-computed ranges for efficiency)
2023                    if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset)
2024                    {
2025                        continue;
2026                    }
2027
2028                    let underline = next_line.trim();
2029
2030                    // Skip if the underline looks like YAML delimiter (exactly 3 or more dashes)
2031                    // YAML uses exactly `---` while Setext headings typically use longer underlines
2032                    if underline == "---" {
2033                        continue;
2034                    }
2035
2036                    // Skip if the current line looks like YAML key-value syntax
2037                    let current_line_trimmed = line.trim();
2038                    if current_line_trimmed.contains(':')
2039                        && !current_line_trimmed.starts_with('#')
2040                        && !current_line_trimmed.contains('[')
2041                        && !current_line_trimmed.contains("](")
2042                    {
2043                        // This looks like "key: value" which suggests YAML, not a heading
2044                        continue;
2045                    }
2046
2047                    let level = if underline.starts_with('=') { 1 } else { 2 };
2048                    let style = if level == 1 {
2049                        HeadingStyle::Setext1
2050                    } else {
2051                        HeadingStyle::Setext2
2052                    };
2053
2054                    // Extract custom header ID if present
2055                    let raw_text = line.trim().to_string();
2056                    let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
2057
2058                    // If no custom ID was found on the header line, check the line after underline for standalone attr-list
2059                    if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
2060                        let attr_line = content_lines[i + 2];
2061                        if !lines[i + 2].in_code_block
2062                            && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
2063                            && let Some(attr_line_id) =
2064                                crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
2065                        {
2066                            custom_id = Some(attr_line_id);
2067                        }
2068                    }
2069
2070                    lines[i].heading = Some(HeadingInfo {
2071                        level,
2072                        style,
2073                        marker: underline.to_string(),
2074                        marker_column: next_line.len() - next_line.trim_start().len(),
2075                        content_column: lines[i].indent,
2076                        text: clean_text,
2077                        custom_id,
2078                        raw_text,
2079                        has_closing_sequence: false,
2080                        closing_sequence: String::new(),
2081                    });
2082                }
2083            }
2084        }
2085    }
2086
2087    /// Detect HTML blocks in the content
2088    fn detect_html_blocks(content: &str, lines: &mut [LineInfo]) {
2089        // HTML block elements that trigger block context
2090        const BLOCK_ELEMENTS: &[&str] = &[
2091            "address",
2092            "article",
2093            "aside",
2094            "blockquote",
2095            "details",
2096            "dialog",
2097            "dd",
2098            "div",
2099            "dl",
2100            "dt",
2101            "fieldset",
2102            "figcaption",
2103            "figure",
2104            "footer",
2105            "form",
2106            "h1",
2107            "h2",
2108            "h3",
2109            "h4",
2110            "h5",
2111            "h6",
2112            "header",
2113            "hr",
2114            "li",
2115            "main",
2116            "nav",
2117            "ol",
2118            "p",
2119            "picture",
2120            "pre",
2121            "script",
2122            "section",
2123            "style",
2124            "table",
2125            "tbody",
2126            "td",
2127            "textarea",
2128            "tfoot",
2129            "th",
2130            "thead",
2131            "tr",
2132            "ul",
2133        ];
2134
2135        let mut i = 0;
2136        while i < lines.len() {
2137            // Skip if already in code block or front matter
2138            if lines[i].in_code_block || lines[i].in_front_matter {
2139                i += 1;
2140                continue;
2141            }
2142
2143            let trimmed = lines[i].content(content).trim_start();
2144
2145            // Check if line starts with an HTML tag
2146            if trimmed.starts_with('<') && trimmed.len() > 1 {
2147                // Extract tag name safely
2148                let after_bracket = &trimmed[1..];
2149                let is_closing = after_bracket.starts_with('/');
2150                let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
2151
2152                // Extract tag name (stop at space, >, /, or end of string)
2153                let tag_name = tag_start
2154                    .chars()
2155                    .take_while(|c| c.is_ascii_alphabetic() || *c == '-' || c.is_ascii_digit())
2156                    .collect::<String>()
2157                    .to_lowercase();
2158
2159                // Check if it's a block element
2160                if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
2161                    // Mark this line as in HTML block
2162                    lines[i].in_html_block = true;
2163
2164                    // For simplicity, just mark lines until we find a closing tag or reach a blank line
2165                    // This avoids complex nesting logic that might cause infinite loops
2166                    if !is_closing {
2167                        let closing_tag = format!("</{tag_name}>");
2168                        // style and script tags can contain blank lines (CSS/JS formatting)
2169                        let allow_blank_lines = tag_name == "style" || tag_name == "script";
2170                        let mut j = i + 1;
2171                        while j < lines.len() && j < i + 100 {
2172                            // Limit search to 100 lines
2173                            // Stop at blank lines (except for style/script tags)
2174                            if !allow_blank_lines && lines[j].is_blank {
2175                                break;
2176                            }
2177
2178                            lines[j].in_html_block = true;
2179
2180                            // Check if this line contains the closing tag
2181                            if lines[j].content(content).contains(&closing_tag) {
2182                                break;
2183                            }
2184                            j += 1;
2185                        }
2186                    }
2187                }
2188            }
2189
2190            i += 1;
2191        }
2192    }
2193
2194    /// Detect ESM import/export blocks in MDX files
2195    /// ESM blocks consist of contiguous import/export statements at the top of the file
2196    fn detect_esm_blocks(content: &str, lines: &mut [LineInfo], flavor: MarkdownFlavor) {
2197        // Only process MDX files
2198        if !flavor.supports_esm_blocks() {
2199            return;
2200        }
2201
2202        for line in lines.iter_mut() {
2203            // Skip blank lines and comments at the start
2204            if line.is_blank || line.in_html_comment {
2205                continue;
2206            }
2207
2208            // Check if line starts with import or export
2209            let trimmed = line.content(content).trim_start();
2210            if trimmed.starts_with("import ") || trimmed.starts_with("export ") {
2211                line.in_esm_block = true;
2212            } else {
2213                // Once we hit a non-ESM line, we're done with the ESM block
2214                break;
2215            }
2216        }
2217    }
2218
2219    /// Parse all inline code spans in the content using pulldown-cmark streaming parser
2220    fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
2221        let mut code_spans = Vec::new();
2222
2223        // Quick check - if no backticks, no code spans
2224        if !content.contains('`') {
2225            return code_spans;
2226        }
2227
2228        // Use pulldown-cmark's streaming parser with byte offsets
2229        let parser = Parser::new(content).into_offset_iter();
2230
2231        for (event, range) in parser {
2232            if let Event::Code(_) = event {
2233                let start_pos = range.start;
2234                let end_pos = range.end;
2235
2236                // The range includes the backticks, extract the actual content
2237                let full_span = &content[start_pos..end_pos];
2238                let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
2239
2240                // Extract content between backticks, preserving spaces
2241                let content_start = start_pos + backtick_count;
2242                let content_end = end_pos - backtick_count;
2243                let span_content = if content_start < content_end {
2244                    content[content_start..content_end].to_string()
2245                } else {
2246                    String::new()
2247                };
2248
2249                // Use binary search to find line number - O(log n) instead of O(n)
2250                // Find the rightmost line whose byte_offset <= start_pos
2251                let line_idx = lines
2252                    .partition_point(|line| line.byte_offset <= start_pos)
2253                    .saturating_sub(1);
2254                let line_num = line_idx + 1;
2255                let byte_col_start = start_pos - lines[line_idx].byte_offset;
2256
2257                // Find end column using binary search
2258                let end_line_idx = lines
2259                    .partition_point(|line| line.byte_offset <= end_pos)
2260                    .saturating_sub(1);
2261                let byte_col_end = end_pos - lines[end_line_idx].byte_offset;
2262
2263                // Convert byte offsets to character positions for correct Unicode handling
2264                // This ensures consistency with warning.column which uses character positions
2265                let line_content = lines[line_idx].content(content);
2266                let col_start = if byte_col_start <= line_content.len() {
2267                    line_content[..byte_col_start].chars().count()
2268                } else {
2269                    line_content.chars().count()
2270                };
2271
2272                let end_line_content = lines[end_line_idx].content(content);
2273                let col_end = if byte_col_end <= end_line_content.len() {
2274                    end_line_content[..byte_col_end].chars().count()
2275                } else {
2276                    end_line_content.chars().count()
2277                };
2278
2279                code_spans.push(CodeSpan {
2280                    line: line_num,
2281                    end_line: end_line_idx + 1,
2282                    start_col: col_start,
2283                    end_col: col_end,
2284                    byte_offset: start_pos,
2285                    byte_end: end_pos,
2286                    backtick_count,
2287                    content: span_content,
2288                });
2289            }
2290        }
2291
2292        // Sort by position to ensure consistent ordering
2293        code_spans.sort_by_key(|span| span.byte_offset);
2294
2295        code_spans
2296    }
2297
2298    /// Parse all list blocks in the content (legacy line-by-line approach)
2299    ///
2300    /// Uses a forward-scanning O(n) algorithm that tracks two variables during iteration:
2301    /// - `has_list_breaking_content_since_last_item`: Set when encountering content that
2302    ///   terminates a list (headings, horizontal rules, tables, insufficiently indented content)
2303    /// - `min_continuation_for_tracking`: Minimum indentation required for content to be
2304    ///   treated as list continuation (based on the list marker width)
2305    ///
2306    /// When a new list item is encountered, we check if list-breaking content was seen
2307    /// since the last item. If so, we start a new list block.
2308    fn parse_list_blocks(content: &str, lines: &[LineInfo]) -> Vec<ListBlock> {
2309        // Minimum indentation for unordered list continuation per CommonMark spec
2310        const UNORDERED_LIST_MIN_CONTINUATION_INDENT: usize = 2;
2311
2312        /// Initialize or reset the forward-scanning tracking state.
2313        /// This helper eliminates code duplication across three initialization sites.
2314        #[inline]
2315        fn reset_tracking_state(
2316            list_item: &ListItemInfo,
2317            has_list_breaking_content: &mut bool,
2318            min_continuation: &mut usize,
2319        ) {
2320            *has_list_breaking_content = false;
2321            let marker_width = if list_item.is_ordered {
2322                list_item.marker.len() + 1 // Ordered markers need space after period/paren
2323            } else {
2324                list_item.marker.len()
2325            };
2326            *min_continuation = if list_item.is_ordered {
2327                marker_width
2328            } else {
2329                UNORDERED_LIST_MIN_CONTINUATION_INDENT
2330            };
2331        }
2332
2333        // Pre-size based on lines that could be list items
2334        let mut list_blocks = Vec::with_capacity(lines.len() / 10); // Estimate ~10% of lines might start list blocks
2335        let mut current_block: Option<ListBlock> = None;
2336        let mut last_list_item_line = 0;
2337        let mut current_indent_level = 0;
2338        let mut last_marker_width = 0;
2339
2340        // Track list-breaking content since last item (fixes O(n²) bottleneck from issue #148)
2341        let mut has_list_breaking_content_since_last_item = false;
2342        let mut min_continuation_for_tracking = 0;
2343
2344        for (line_idx, line_info) in lines.iter().enumerate() {
2345            let line_num = line_idx + 1;
2346
2347            // Enhanced code block handling using Design #3's context analysis
2348            if line_info.in_code_block {
2349                if let Some(ref mut block) = current_block {
2350                    // Calculate minimum indentation for list continuation
2351                    let min_continuation_indent =
2352                        CodeBlockUtils::calculate_min_continuation_indent(content, lines, line_idx);
2353
2354                    // Analyze code block context using the three-tier classification
2355                    let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
2356
2357                    match context {
2358                        CodeBlockContext::Indented => {
2359                            // Code block is properly indented - continues the list
2360                            block.end_line = line_num;
2361                            continue;
2362                        }
2363                        CodeBlockContext::Standalone => {
2364                            // Code block separates lists - end current block
2365                            let completed_block = current_block.take().unwrap();
2366                            list_blocks.push(completed_block);
2367                            continue;
2368                        }
2369                        CodeBlockContext::Adjacent => {
2370                            // Edge case - use conservative behavior (continue list)
2371                            block.end_line = line_num;
2372                            continue;
2373                        }
2374                    }
2375                } else {
2376                    // No current list block - skip code block lines
2377                    continue;
2378                }
2379            }
2380
2381            // Extract blockquote prefix if any
2382            let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(line_info.content(content)) {
2383                caps.get(0).unwrap().as_str().to_string()
2384            } else {
2385                String::new()
2386            };
2387
2388            // Track list-breaking content for non-list, non-blank lines (O(n) replacement for nested loop)
2389            // Skip lines that are continuations of multi-line code spans - they're part of the previous list item
2390            if current_block.is_some()
2391                && line_info.list_item.is_none()
2392                && !line_info.is_blank
2393                && !line_info.in_code_span_continuation
2394            {
2395                let line_content = line_info.content(content).trim();
2396
2397                // Check for structural separators that break lists
2398                let breaks_list = line_info.heading.is_some()
2399                    || line_content.starts_with("---")
2400                    || line_content.starts_with("***")
2401                    || line_content.starts_with("___")
2402                    || (line_content.contains('|')
2403                        && !line_content.contains("](")
2404                        && !line_content.contains("http")
2405                        && (line_content.matches('|').count() > 1
2406                            || line_content.starts_with('|')
2407                            || line_content.ends_with('|')))
2408                    || line_content.starts_with(">")
2409                    || (line_info.indent < min_continuation_for_tracking);
2410
2411                if breaks_list {
2412                    has_list_breaking_content_since_last_item = true;
2413                }
2414            }
2415
2416            // If this line is a code span continuation within an active list block,
2417            // extend the block's end_line to include this line (maintains list continuity)
2418            if line_info.in_code_span_continuation
2419                && line_info.list_item.is_none()
2420                && let Some(ref mut block) = current_block
2421            {
2422                block.end_line = line_num;
2423            }
2424
2425            // Check if this line is a list item
2426            if let Some(list_item) = &line_info.list_item {
2427                // Calculate nesting level based on indentation
2428                let item_indent = list_item.marker_column;
2429                let nesting = item_indent / 2; // Assume 2-space indentation for nesting
2430
2431                if let Some(ref mut block) = current_block {
2432                    // Check if this continues the current block
2433                    // For nested lists, we need to check if this is a nested item (higher nesting level)
2434                    // or a continuation at the same or lower level
2435                    let is_nested = nesting > block.nesting_level;
2436                    let same_type =
2437                        (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
2438                    let same_context = block.blockquote_prefix == blockquote_prefix;
2439                    let reasonable_distance = line_num <= last_list_item_line + 2; // Allow one blank line
2440
2441                    // For unordered lists, also check marker consistency
2442                    let marker_compatible =
2443                        block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
2444
2445                    // O(1) check: Use the tracked variable instead of O(n) nested loop
2446                    // This eliminates the quadratic bottleneck from issue #148
2447                    let has_non_list_content = has_list_breaking_content_since_last_item;
2448
2449                    // A list continues if:
2450                    // 1. It's a nested item (indented more than the parent), OR
2451                    // 2. It's the same type at the same level with reasonable distance
2452                    let mut continues_list = if is_nested {
2453                        // Nested items always continue the list if they're in the same context
2454                        same_context && reasonable_distance && !has_non_list_content
2455                    } else {
2456                        // Same-level items need to match type and markers
2457                        same_type && same_context && reasonable_distance && marker_compatible && !has_non_list_content
2458                    };
2459
2460                    // WORKAROUND: If items are truly consecutive (no blank lines), they MUST be in the same list
2461                    // This handles edge cases where content patterns might otherwise split lists incorrectly
2462                    if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
2463                        // Check if the previous line was a list item
2464                        if block.item_lines.contains(&(line_num - 1)) {
2465                            // They're consecutive list items - force them to be in the same list
2466                            continues_list = true;
2467                        }
2468                    }
2469
2470                    if continues_list {
2471                        // Extend current block
2472                        block.end_line = line_num;
2473                        block.item_lines.push(line_num);
2474
2475                        // Update max marker width
2476                        block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
2477                            list_item.marker.len() + 1
2478                        } else {
2479                            list_item.marker.len()
2480                        });
2481
2482                        // Update marker consistency for unordered lists
2483                        if !block.is_ordered
2484                            && block.marker.is_some()
2485                            && block.marker.as_ref() != Some(&list_item.marker)
2486                        {
2487                            // Mixed markers, clear the marker field
2488                            block.marker = None;
2489                        }
2490
2491                        // Reset tracked state for issue #148 optimization
2492                        reset_tracking_state(
2493                            list_item,
2494                            &mut has_list_breaking_content_since_last_item,
2495                            &mut min_continuation_for_tracking,
2496                        );
2497                    } else {
2498                        // End current block and start a new one
2499
2500                        list_blocks.push(block.clone());
2501
2502                        *block = ListBlock {
2503                            start_line: line_num,
2504                            end_line: line_num,
2505                            is_ordered: list_item.is_ordered,
2506                            marker: if list_item.is_ordered {
2507                                None
2508                            } else {
2509                                Some(list_item.marker.clone())
2510                            },
2511                            blockquote_prefix: blockquote_prefix.clone(),
2512                            item_lines: vec![line_num],
2513                            nesting_level: nesting,
2514                            max_marker_width: if list_item.is_ordered {
2515                                list_item.marker.len() + 1
2516                            } else {
2517                                list_item.marker.len()
2518                            },
2519                        };
2520
2521                        // Initialize tracked state for new block (issue #148 optimization)
2522                        reset_tracking_state(
2523                            list_item,
2524                            &mut has_list_breaking_content_since_last_item,
2525                            &mut min_continuation_for_tracking,
2526                        );
2527                    }
2528                } else {
2529                    // Start a new block
2530                    current_block = Some(ListBlock {
2531                        start_line: line_num,
2532                        end_line: line_num,
2533                        is_ordered: list_item.is_ordered,
2534                        marker: if list_item.is_ordered {
2535                            None
2536                        } else {
2537                            Some(list_item.marker.clone())
2538                        },
2539                        blockquote_prefix,
2540                        item_lines: vec![line_num],
2541                        nesting_level: nesting,
2542                        max_marker_width: list_item.marker.len(),
2543                    });
2544
2545                    // Initialize tracked state for new block (issue #148 optimization)
2546                    reset_tracking_state(
2547                        list_item,
2548                        &mut has_list_breaking_content_since_last_item,
2549                        &mut min_continuation_for_tracking,
2550                    );
2551                }
2552
2553                last_list_item_line = line_num;
2554                current_indent_level = item_indent;
2555                last_marker_width = if list_item.is_ordered {
2556                    list_item.marker.len() + 1 // Add 1 for the space after ordered list markers
2557                } else {
2558                    list_item.marker.len()
2559                };
2560            } else if let Some(ref mut block) = current_block {
2561                // Not a list item - check if it continues the current block
2562
2563                // For MD032 compatibility, we use a simple approach:
2564                // - Indented lines continue the list
2565                // - Blank lines followed by indented content continue the list
2566                // - Everything else ends the list
2567
2568                // Check if the last line in the list block ended with a backslash (hard line break)
2569                // This handles cases where list items use backslash for hard line breaks
2570                let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
2571                    lines[block.end_line - 1].content(content).trim_end().ends_with('\\')
2572                } else {
2573                    false
2574                };
2575
2576                // Calculate minimum indentation for list continuation
2577                // For ordered lists, use the last marker width (e.g., 3 for "1. ", 4 for "10. ")
2578                // For unordered lists like "- ", content starts at column 2, so continuations need at least 2 spaces
2579                let min_continuation_indent = if block.is_ordered {
2580                    current_indent_level + last_marker_width
2581                } else {
2582                    current_indent_level + 2 // Unordered lists need at least 2 spaces (e.g., "- " = 2 chars)
2583                };
2584
2585                if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
2586                    // Indented line or backslash continuation continues the list
2587                    block.end_line = line_num;
2588                } else if line_info.is_blank {
2589                    // Blank line - check if it's internal to the list or ending it
2590                    // We only include blank lines that are followed by more list content
2591                    let mut check_idx = line_idx + 1;
2592                    let mut found_continuation = false;
2593
2594                    // Skip additional blank lines
2595                    while check_idx < lines.len() && lines[check_idx].is_blank {
2596                        check_idx += 1;
2597                    }
2598
2599                    if check_idx < lines.len() {
2600                        let next_line = &lines[check_idx];
2601                        // Check if followed by indented content (list continuation)
2602                        if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
2603                            found_continuation = true;
2604                        }
2605                        // Check if followed by another list item at the same level
2606                        else if !next_line.in_code_block
2607                            && next_line.list_item.is_some()
2608                            && let Some(item) = &next_line.list_item
2609                        {
2610                            let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
2611                                .find(next_line.content(content))
2612                                .map_or(String::new(), |m| m.as_str().to_string());
2613                            if item.marker_column == current_indent_level
2614                                && item.is_ordered == block.is_ordered
2615                                && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
2616                            {
2617                                // Check if there was meaningful content between the list items (unused now)
2618                                // This variable is kept for potential future use but is currently replaced by has_structural_separators
2619                                let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
2620                                    if let Some(between_line) = lines.get(idx) {
2621                                        let between_content = between_line.content(content);
2622                                        let trimmed = between_content.trim();
2623                                        // Skip empty lines
2624                                        if trimmed.is_empty() {
2625                                            return false;
2626                                        }
2627                                        // Check for meaningful content
2628                                        let line_indent = between_content.len() - between_content.trim_start().len();
2629
2630                                        // Structural separators (code fences, headings, etc.) are meaningful and should BREAK lists
2631                                        if trimmed.starts_with("```")
2632                                            || trimmed.starts_with("~~~")
2633                                            || trimmed.starts_with("---")
2634                                            || trimmed.starts_with("***")
2635                                            || trimmed.starts_with("___")
2636                                            || trimmed.starts_with(">")
2637                                            || trimmed.contains('|') // Tables
2638                                            || between_line.heading.is_some()
2639                                        {
2640                                            return true; // These are structural separators - meaningful content that breaks lists
2641                                        }
2642
2643                                        // Only properly indented content continues the list
2644                                        line_indent >= min_continuation_indent
2645                                    } else {
2646                                        false
2647                                    }
2648                                });
2649
2650                                if block.is_ordered {
2651                                    // For ordered lists: don't continue if there are structural separators
2652                                    // Check if there are structural separators between the list items
2653                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2654                                        if let Some(between_line) = lines.get(idx) {
2655                                            let trimmed = between_line.content(content).trim();
2656                                            if trimmed.is_empty() {
2657                                                return false;
2658                                            }
2659                                            // Check for structural separators that break lists
2660                                            trimmed.starts_with("```")
2661                                                || trimmed.starts_with("~~~")
2662                                                || trimmed.starts_with("---")
2663                                                || trimmed.starts_with("***")
2664                                                || trimmed.starts_with("___")
2665                                                || trimmed.starts_with(">")
2666                                                || trimmed.contains('|') // Tables
2667                                                || between_line.heading.is_some()
2668                                        } else {
2669                                            false
2670                                        }
2671                                    });
2672                                    found_continuation = !has_structural_separators;
2673                                } else {
2674                                    // For unordered lists: also check for structural separators
2675                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2676                                        if let Some(between_line) = lines.get(idx) {
2677                                            let trimmed = between_line.content(content).trim();
2678                                            if trimmed.is_empty() {
2679                                                return false;
2680                                            }
2681                                            // Check for structural separators that break lists
2682                                            trimmed.starts_with("```")
2683                                                || trimmed.starts_with("~~~")
2684                                                || trimmed.starts_with("---")
2685                                                || trimmed.starts_with("***")
2686                                                || trimmed.starts_with("___")
2687                                                || trimmed.starts_with(">")
2688                                                || trimmed.contains('|') // Tables
2689                                                || between_line.heading.is_some()
2690                                        } else {
2691                                            false
2692                                        }
2693                                    });
2694                                    found_continuation = !has_structural_separators;
2695                                }
2696                            }
2697                        }
2698                    }
2699
2700                    if found_continuation {
2701                        // Include the blank line in the block
2702                        block.end_line = line_num;
2703                    } else {
2704                        // Blank line ends the list - don't include it
2705                        list_blocks.push(block.clone());
2706                        current_block = None;
2707                    }
2708                } else {
2709                    // Check for lazy continuation - non-indented line immediately after a list item
2710                    // But only if the line has sufficient indentation for the list type
2711                    let min_required_indent = if block.is_ordered {
2712                        current_indent_level + last_marker_width
2713                    } else {
2714                        current_indent_level + 2
2715                    };
2716
2717                    // For lazy continuation to apply, the line must either:
2718                    // 1. Have no indentation (true lazy continuation)
2719                    // 2. Have sufficient indentation for the list type
2720                    // BUT structural separators (headings, code blocks, etc.) should never be lazy continuations
2721                    let line_content = line_info.content(content).trim();
2722                    let is_structural_separator = line_info.heading.is_some()
2723                        || line_content.starts_with("```")
2724                        || line_content.starts_with("~~~")
2725                        || line_content.starts_with("---")
2726                        || line_content.starts_with("***")
2727                        || line_content.starts_with("___")
2728                        || line_content.starts_with(">")
2729                        || (line_content.contains('|')
2730                            && !line_content.contains("](")
2731                            && !line_content.contains("http")
2732                            && (line_content.matches('|').count() > 1
2733                                || line_content.starts_with('|')
2734                                || line_content.ends_with('|'))); // Tables
2735
2736                    // Allow lazy continuation if we're still within the same list block
2737                    // (not just immediately after a list item)
2738                    let is_lazy_continuation = !is_structural_separator
2739                        && !line_info.is_blank
2740                        && (line_info.indent == 0 || line_info.indent >= min_required_indent);
2741
2742                    if is_lazy_continuation {
2743                        // Additional check: if the line starts with uppercase and looks like a new sentence,
2744                        // it's probably not a continuation
2745                        let content_to_check = if !blockquote_prefix.is_empty() {
2746                            // Strip blockquote prefix to check the actual content
2747                            line_info
2748                                .content(content)
2749                                .strip_prefix(&blockquote_prefix)
2750                                .unwrap_or(line_info.content(content))
2751                                .trim()
2752                        } else {
2753                            line_info.content(content).trim()
2754                        };
2755
2756                        let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
2757
2758                        // If it starts with uppercase and the previous line ended with punctuation,
2759                        // it's likely a new paragraph, not a continuation
2760                        if starts_with_uppercase && last_list_item_line > 0 {
2761                            // This looks like a new paragraph
2762                            list_blocks.push(block.clone());
2763                            current_block = None;
2764                        } else {
2765                            // This is a lazy continuation line
2766                            block.end_line = line_num;
2767                        }
2768                    } else {
2769                        // Non-indented, non-blank line that's not a lazy continuation - end the block
2770                        list_blocks.push(block.clone());
2771                        current_block = None;
2772                    }
2773                }
2774            }
2775        }
2776
2777        // Don't forget the last block
2778        if let Some(block) = current_block {
2779            list_blocks.push(block);
2780        }
2781
2782        // Merge adjacent blocks that should be one
2783        merge_adjacent_list_blocks(content, &mut list_blocks, lines);
2784
2785        list_blocks
2786    }
2787
2788    /// Compute character frequency for fast content analysis
2789    fn compute_char_frequency(content: &str) -> CharFrequency {
2790        let mut frequency = CharFrequency::default();
2791
2792        for ch in content.chars() {
2793            match ch {
2794                '#' => frequency.hash_count += 1,
2795                '*' => frequency.asterisk_count += 1,
2796                '_' => frequency.underscore_count += 1,
2797                '-' => frequency.hyphen_count += 1,
2798                '+' => frequency.plus_count += 1,
2799                '>' => frequency.gt_count += 1,
2800                '|' => frequency.pipe_count += 1,
2801                '[' => frequency.bracket_count += 1,
2802                '`' => frequency.backtick_count += 1,
2803                '<' => frequency.lt_count += 1,
2804                '!' => frequency.exclamation_count += 1,
2805                '\n' => frequency.newline_count += 1,
2806                _ => {}
2807            }
2808        }
2809
2810        frequency
2811    }
2812
2813    /// Parse HTML tags in the content
2814    fn parse_html_tags(
2815        content: &str,
2816        lines: &[LineInfo],
2817        code_blocks: &[(usize, usize)],
2818        flavor: MarkdownFlavor,
2819    ) -> Vec<HtmlTag> {
2820        static HTML_TAG_REGEX: LazyLock<regex::Regex> =
2821            LazyLock::new(|| regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap());
2822
2823        let mut html_tags = Vec::with_capacity(content.matches('<').count());
2824
2825        for cap in HTML_TAG_REGEX.captures_iter(content) {
2826            let full_match = cap.get(0).unwrap();
2827            let match_start = full_match.start();
2828            let match_end = full_match.end();
2829
2830            // Skip if in code block
2831            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2832                continue;
2833            }
2834
2835            let is_closing = !cap.get(1).unwrap().as_str().is_empty();
2836            let tag_name_original = cap.get(2).unwrap().as_str();
2837            let tag_name = tag_name_original.to_lowercase();
2838            let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
2839
2840            // Skip JSX components in MDX files (tags starting with uppercase letter)
2841            // JSX components like <Chart />, <MyComponent> should not be treated as HTML
2842            if flavor.supports_jsx() && tag_name_original.chars().next().is_some_and(|c| c.is_uppercase()) {
2843                continue;
2844            }
2845
2846            // Find which line this tag is on
2847            let mut line_num = 1;
2848            let mut col_start = match_start;
2849            let mut col_end = match_end;
2850            for (idx, line_info) in lines.iter().enumerate() {
2851                if match_start >= line_info.byte_offset {
2852                    line_num = idx + 1;
2853                    col_start = match_start - line_info.byte_offset;
2854                    col_end = match_end - line_info.byte_offset;
2855                } else {
2856                    break;
2857                }
2858            }
2859
2860            html_tags.push(HtmlTag {
2861                line: line_num,
2862                start_col: col_start,
2863                end_col: col_end,
2864                byte_offset: match_start,
2865                byte_end: match_end,
2866                tag_name,
2867                is_closing,
2868                is_self_closing,
2869                raw_content: full_match.as_str().to_string(),
2870            });
2871        }
2872
2873        html_tags
2874    }
2875
2876    /// Parse emphasis spans in the content
2877    fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
2878        static EMPHASIS_REGEX: LazyLock<regex::Regex> =
2879            LazyLock::new(|| regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap());
2880
2881        let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
2882
2883        for cap in EMPHASIS_REGEX.captures_iter(content) {
2884            let full_match = cap.get(0).unwrap();
2885            let match_start = full_match.start();
2886            let match_end = full_match.end();
2887
2888            // Skip if in code block
2889            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2890                continue;
2891            }
2892
2893            let opening_markers = cap.get(1).unwrap().as_str();
2894            let content_part = cap.get(2).unwrap().as_str();
2895            let closing_markers = cap.get(3).unwrap().as_str();
2896
2897            // Validate matching markers
2898            if opening_markers.chars().next() != closing_markers.chars().next()
2899                || opening_markers.len() != closing_markers.len()
2900            {
2901                continue;
2902            }
2903
2904            let marker = opening_markers.chars().next().unwrap();
2905            let marker_count = opening_markers.len();
2906
2907            // Find which line this emphasis is on
2908            let mut line_num = 1;
2909            let mut col_start = match_start;
2910            let mut col_end = match_end;
2911            for (idx, line_info) in lines.iter().enumerate() {
2912                if match_start >= line_info.byte_offset {
2913                    line_num = idx + 1;
2914                    col_start = match_start - line_info.byte_offset;
2915                    col_end = match_end - line_info.byte_offset;
2916                } else {
2917                    break;
2918                }
2919            }
2920
2921            emphasis_spans.push(EmphasisSpan {
2922                line: line_num,
2923                start_col: col_start,
2924                end_col: col_end,
2925                byte_offset: match_start,
2926                byte_end: match_end,
2927                marker,
2928                marker_count,
2929                content: content_part.to_string(),
2930            });
2931        }
2932
2933        emphasis_spans
2934    }
2935
2936    /// Parse table rows in the content
2937    fn parse_table_rows(content: &str, lines: &[LineInfo]) -> Vec<TableRow> {
2938        let mut table_rows = Vec::with_capacity(lines.len() / 20);
2939
2940        for (line_idx, line_info) in lines.iter().enumerate() {
2941            // Skip lines in code blocks or blank lines
2942            if line_info.in_code_block || line_info.is_blank {
2943                continue;
2944            }
2945
2946            let line = line_info.content(content);
2947            let line_num = line_idx + 1;
2948
2949            // Check if this line contains pipes (potential table row)
2950            if !line.contains('|') {
2951                continue;
2952            }
2953
2954            // Count columns by splitting on pipes
2955            let parts: Vec<&str> = line.split('|').collect();
2956            let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
2957
2958            // Check if this is a separator row
2959            let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
2960            let mut column_alignments = Vec::new();
2961
2962            if is_separator {
2963                for part in &parts[1..parts.len() - 1] {
2964                    // Skip first and last empty parts
2965                    let trimmed = part.trim();
2966                    let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
2967                        "center".to_string()
2968                    } else if trimmed.ends_with(':') {
2969                        "right".to_string()
2970                    } else if trimmed.starts_with(':') {
2971                        "left".to_string()
2972                    } else {
2973                        "none".to_string()
2974                    };
2975                    column_alignments.push(alignment);
2976                }
2977            }
2978
2979            table_rows.push(TableRow {
2980                line: line_num,
2981                is_separator,
2982                column_count,
2983                column_alignments,
2984            });
2985        }
2986
2987        table_rows
2988    }
2989
2990    /// Parse bare URLs and emails in the content
2991    fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
2992        let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
2993
2994        // Check for bare URLs (not in angle brackets or markdown links)
2995        for cap in BARE_URL_PATTERN.captures_iter(content) {
2996            let full_match = cap.get(0).unwrap();
2997            let match_start = full_match.start();
2998            let match_end = full_match.end();
2999
3000            // Skip if in code block
3001            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3002                continue;
3003            }
3004
3005            // Skip if already in angle brackets or markdown links
3006            let preceding_char = if match_start > 0 {
3007                content.chars().nth(match_start - 1)
3008            } else {
3009                None
3010            };
3011            let following_char = content.chars().nth(match_end);
3012
3013            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
3014                continue;
3015            }
3016            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
3017                continue;
3018            }
3019
3020            let url = full_match.as_str();
3021            let url_type = if url.starts_with("https://") {
3022                "https"
3023            } else if url.starts_with("http://") {
3024                "http"
3025            } else if url.starts_with("ftp://") {
3026                "ftp"
3027            } else {
3028                "other"
3029            };
3030
3031            // Find which line this URL is on
3032            let mut line_num = 1;
3033            let mut col_start = match_start;
3034            let mut col_end = match_end;
3035            for (idx, line_info) in lines.iter().enumerate() {
3036                if match_start >= line_info.byte_offset {
3037                    line_num = idx + 1;
3038                    col_start = match_start - line_info.byte_offset;
3039                    col_end = match_end - line_info.byte_offset;
3040                } else {
3041                    break;
3042                }
3043            }
3044
3045            bare_urls.push(BareUrl {
3046                line: line_num,
3047                start_col: col_start,
3048                end_col: col_end,
3049                byte_offset: match_start,
3050                byte_end: match_end,
3051                url: url.to_string(),
3052                url_type: url_type.to_string(),
3053            });
3054        }
3055
3056        // Check for bare email addresses
3057        for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
3058            let full_match = cap.get(0).unwrap();
3059            let match_start = full_match.start();
3060            let match_end = full_match.end();
3061
3062            // Skip if in code block
3063            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3064                continue;
3065            }
3066
3067            // Skip if already in angle brackets or markdown links
3068            let preceding_char = if match_start > 0 {
3069                content.chars().nth(match_start - 1)
3070            } else {
3071                None
3072            };
3073            let following_char = content.chars().nth(match_end);
3074
3075            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
3076                continue;
3077            }
3078            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
3079                continue;
3080            }
3081
3082            let email = full_match.as_str();
3083
3084            // Find which line this email is on
3085            let mut line_num = 1;
3086            let mut col_start = match_start;
3087            let mut col_end = match_end;
3088            for (idx, line_info) in lines.iter().enumerate() {
3089                if match_start >= line_info.byte_offset {
3090                    line_num = idx + 1;
3091                    col_start = match_start - line_info.byte_offset;
3092                    col_end = match_end - line_info.byte_offset;
3093                } else {
3094                    break;
3095                }
3096            }
3097
3098            bare_urls.push(BareUrl {
3099                line: line_num,
3100                start_col: col_start,
3101                end_col: col_end,
3102                byte_offset: match_start,
3103                byte_end: match_end,
3104                url: email.to_string(),
3105                url_type: "email".to_string(),
3106            });
3107        }
3108
3109        bare_urls
3110    }
3111}
3112
3113/// Merge adjacent list blocks that should be treated as one
3114fn merge_adjacent_list_blocks(content: &str, list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
3115    if list_blocks.len() < 2 {
3116        return;
3117    }
3118
3119    let mut merger = ListBlockMerger::new(content, lines);
3120    *list_blocks = merger.merge(list_blocks);
3121}
3122
/// Helper struct to manage the complex logic of merging list blocks
///
/// Borrows the document text and its per-line metadata; it owns no data of its own.
struct ListBlockMerger<'a> {
    /// Full document text, handed to `LineInfo::content` to read the text of a line.
    content: &'a str,
    /// Per-line metadata; the methods look up 1-based line `n` at index `n - 1`.
    lines: &'a [LineInfo],
}
3128
3129impl<'a> ListBlockMerger<'a> {
3130    fn new(content: &'a str, lines: &'a [LineInfo]) -> Self {
3131        Self { content, lines }
3132    }
3133
3134    fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
3135        let mut merged = Vec::with_capacity(list_blocks.len());
3136        let mut current = list_blocks[0].clone();
3137
3138        for next in list_blocks.iter().skip(1) {
3139            if self.should_merge_blocks(&current, next) {
3140                current = self.merge_two_blocks(current, next);
3141            } else {
3142                merged.push(current);
3143                current = next.clone();
3144            }
3145        }
3146
3147        merged.push(current);
3148        merged
3149    }
3150
3151    /// Determine if two adjacent list blocks should be merged
3152    fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
3153        // Basic compatibility checks
3154        if !self.blocks_are_compatible(current, next) {
3155            return false;
3156        }
3157
3158        // Check spacing and content between blocks
3159        let spacing = self.analyze_spacing_between(current, next);
3160        match spacing {
3161            BlockSpacing::Consecutive => true,
3162            BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
3163            BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
3164                self.can_merge_with_content_between(current, next)
3165            }
3166        }
3167    }
3168
3169    /// Check if blocks have compatible structure for merging
3170    fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
3171        current.is_ordered == next.is_ordered
3172            && current.blockquote_prefix == next.blockquote_prefix
3173            && current.nesting_level == next.nesting_level
3174    }
3175
3176    /// Analyze the spacing between two list blocks
3177    fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
3178        let gap = next.start_line - current.end_line;
3179
3180        match gap {
3181            1 => BlockSpacing::Consecutive,
3182            2 => BlockSpacing::SingleBlank,
3183            _ if gap > 2 => {
3184                if self.has_only_blank_lines_between(current, next) {
3185                    BlockSpacing::MultipleBlanks
3186                } else {
3187                    BlockSpacing::ContentBetween
3188                }
3189            }
3190            _ => BlockSpacing::Consecutive, // gap == 0, overlapping (shouldn't happen)
3191        }
3192    }
3193
3194    /// Check if unordered lists can be merged with a single blank line between
3195    fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3196        // Check if there are structural separators between the blocks
3197        // If has_meaningful_content_between returns true, it means there are structural separators
3198        if has_meaningful_content_between(self.content, current, next, self.lines) {
3199            return false; // Structural separators prevent merging
3200        }
3201
3202        // Only merge unordered lists with same marker across single blank
3203        !current.is_ordered && current.marker == next.marker
3204    }
3205
3206    /// Check if ordered lists can be merged when there's content between them
3207    fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3208        // Do not merge lists if there are structural separators between them
3209        if has_meaningful_content_between(self.content, current, next, self.lines) {
3210            return false; // Structural separators prevent merging
3211        }
3212
3213        // Only consider merging ordered lists if there's no structural content between
3214        current.is_ordered && next.is_ordered
3215    }
3216
3217    /// Check if there are only blank lines between blocks
3218    fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3219        for line_num in (current.end_line + 1)..next.start_line {
3220            if let Some(line_info) = self.lines.get(line_num - 1)
3221                && !line_info.content(self.content).trim().is_empty()
3222            {
3223                return false;
3224            }
3225        }
3226        true
3227    }
3228
3229    /// Merge two compatible list blocks into one
3230    fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
3231        current.end_line = next.end_line;
3232        current.item_lines.extend_from_slice(&next.item_lines);
3233
3234        // Update max marker width
3235        current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
3236
3237        // Handle marker consistency for unordered lists
3238        if !current.is_ordered && self.markers_differ(&current, next) {
3239            current.marker = None; // Mixed markers
3240        }
3241
3242        current
3243    }
3244
3245    /// Check if two blocks have different markers
3246    fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
3247        current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
3248    }
3249}
3250
/// Types of spacing between list blocks
///
/// Produced by `ListBlockMerger::analyze_spacing_between` from the difference between the
/// next block's `start_line` and the current block's `end_line`.
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    // The next block starts on the line right after the current one ends (difference of 1).
    // Also the fallback for overlapping blocks, which is not expected to occur.
    Consecutive,    // No gap between blocks
    // Difference of exactly 2; classified by the gap alone, the line in between is not inspected here.
    SingleBlank,    // One blank line between blocks
    // Difference greater than 2 and every intervening line trims to empty.
    MultipleBlanks, // Multiple blank lines but no content
    // Difference greater than 2 and at least one intervening line is non-blank.
    ContentBetween, // Content exists between blocks
}
3259
3260/// Check if there's meaningful content (not just blank lines) between two list blocks
3261fn has_meaningful_content_between(content: &str, current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
3262    // Check lines between current.end_line and next.start_line
3263    for line_num in (current.end_line + 1)..next.start_line {
3264        if let Some(line_info) = lines.get(line_num - 1) {
3265            // Convert to 0-indexed
3266            let trimmed = line_info.content(content).trim();
3267
3268            // Skip empty lines
3269            if trimmed.is_empty() {
3270                continue;
3271            }
3272
3273            // Check for structural separators that should separate lists (CommonMark compliant)
3274
3275            // Headings separate lists
3276            if line_info.heading.is_some() {
3277                return true; // Has meaningful content - headings separate lists
3278            }
3279
3280            // Horizontal rules separate lists (---, ***, ___)
3281            if is_horizontal_rule(trimmed) {
3282                return true; // Has meaningful content - horizontal rules separate lists
3283            }
3284
3285            // Tables separate lists (lines containing | but not in URLs or code)
3286            // Simple heuristic: tables typically have | at start/end or multiple |
3287            if trimmed.contains('|') && trimmed.len() > 1 {
3288                // Don't treat URLs with | as tables
3289                if !trimmed.contains("](") && !trimmed.contains("http") {
3290                    // More robust check: tables usually have multiple | or | at edges
3291                    let pipe_count = trimmed.matches('|').count();
3292                    if pipe_count > 1 || trimmed.starts_with('|') || trimmed.ends_with('|') {
3293                        return true; // Has meaningful content - tables separate lists
3294                    }
3295                }
3296            }
3297
3298            // Blockquotes separate lists
3299            if trimmed.starts_with('>') {
3300                return true; // Has meaningful content - blockquotes separate lists
3301            }
3302
3303            // Code block fences separate lists (unless properly indented as list content)
3304            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
3305                let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
3306
3307                // Check if this code block is properly indented as list continuation
3308                let min_continuation_indent = if current.is_ordered {
3309                    current.nesting_level + current.max_marker_width + 1 // +1 for space after marker
3310                } else {
3311                    current.nesting_level + 2
3312                };
3313
3314                if line_indent < min_continuation_indent {
3315                    // This is a standalone code block that separates lists
3316                    return true; // Has meaningful content - standalone code blocks separate lists
3317                }
3318            }
3319
3320            // Check if this line has proper indentation for list continuation
3321            let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
3322
3323            // Calculate minimum indentation needed to be list continuation
3324            let min_indent = if current.is_ordered {
3325                current.nesting_level + current.max_marker_width
3326            } else {
3327                current.nesting_level + 2
3328            };
3329
3330            // If the line is not indented enough to be list continuation, it's meaningful content
3331            if line_indent < min_indent {
3332                return true; // Has meaningful content - content not indented as list continuation
3333            }
3334
3335            // If we reach here, the line is properly indented as list continuation
3336            // Continue checking other lines
3337        }
3338    }
3339
3340    // Only blank lines or properly indented list continuation content between blocks
3341    false
3342}
3343
/// Check if a line is a horizontal rule (---, ***, ___)
///
/// `trimmed` is expected to have surrounding whitespace removed already. The line qualifies
/// when it starts with `-`, `*` or `_`, contains at least three of that same character, and
/// contains nothing else except spaces and tabs (so `- - -` qualifies, `--- text` does not).
fn is_horizontal_rule(trimmed: &str) -> bool {
    let mut chars = trimmed.chars();

    // The first character fixes which marker the whole line must be made of.
    let marker = match chars.next() {
        Some(c) if matches!(c, '-' | '*' | '_') => c,
        _ => return false,
    };

    // Single pass over the rest of the line; no intermediate buffer is needed.
    let mut marker_count = 1usize;
    for ch in chars {
        if ch == marker {
            marker_count += 1;
        } else if ch != ' ' && ch != '\t' {
            return false; // Non-matching, non-whitespace character
        }
    }

    marker_count >= 3
}
3367
/// Unit tests for `LintContext` construction: line offsets, offset-to-position mapping,
/// per-line metadata, list item detection, and MDX ESM block detection.
#[cfg(test)]
mod tests {
    use super::*;

    // Empty input: a single line offset at 0, no `lines` entries, and offset 0 maps to (1, 1).
    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.lines.len(), 0);
    }

    // Single line without a trailing newline: columns reported by `offset_to_line_col` are 1-based.
    #[test]
    fn test_single_line() {
        let ctx = LintContext::new("# Hello", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "# Hello");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(3), (1, 4));
    }

    // Multiple lines including a blank one: each line offset is the byte index just past the
    // preceding newline.
    #[test]
    fn test_multi_line() {
        let content = "# Title\n\nSecond line\nThird line";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);
        // Test offset to line/col
        assert_eq!(ctx.offset_to_line_col(0), (1, 1)); // start
        assert_eq!(ctx.offset_to_line_col(8), (2, 1)); // start of blank line
        assert_eq!(ctx.offset_to_line_col(9), (3, 1)); // start of 'Second line'
        assert_eq!(ctx.offset_to_line_col(15), (3, 7)); // middle of 'Second line'
        assert_eq!(ctx.offset_to_line_col(21), (4, 1)); // start of 'Third line'
    }

    // Per-line metadata (content, byte offset, indent, blank flag) and the 1-based lookup helpers.
    #[test]
    fn test_line_info() {
        let content = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        // Test line info
        assert_eq!(ctx.lines.len(), 7);

        // Line 1: "# Title"
        let line1 = &ctx.lines[0];
        assert_eq!(line1.content(ctx.content), "# Title");
        assert_eq!(line1.byte_offset, 0);
        assert_eq!(line1.indent, 0);
        assert!(!line1.is_blank);
        assert!(!line1.in_code_block);
        assert!(line1.list_item.is_none());

        // Line 2: "    indented"
        let line2 = &ctx.lines[1];
        assert_eq!(line2.content(ctx.content), "    indented");
        assert_eq!(line2.byte_offset, 8);
        assert_eq!(line2.indent, 4);
        assert!(!line2.is_blank);

        // Line 3: "" (blank)
        let line3 = &ctx.lines[2];
        assert_eq!(line3.content(ctx.content), "");
        assert!(line3.is_blank);

        // Test helper methods
        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
        assert_eq!(ctx.line_info(1).map(|l| l.indent), Some(0));
        assert_eq!(ctx.line_info(2).map(|l| l.indent), Some(4));
    }

    // List item metadata for unordered, nested, and ordered markers, and its absence on plain text.
    #[test]
    fn test_list_item_detection() {
        let content = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        // Line 1: "- Unordered item"
        let line1 = &ctx.lines[0];
        assert!(line1.list_item.is_some());
        let list1 = line1.list_item.as_ref().unwrap();
        assert_eq!(list1.marker, "-");
        assert!(!list1.is_ordered);
        assert_eq!(list1.marker_column, 0);
        assert_eq!(list1.content_column, 2);

        // Line 2: "  * Nested item"
        let line2 = &ctx.lines[1];
        assert!(line2.list_item.is_some());
        let list2 = line2.list_item.as_ref().unwrap();
        assert_eq!(list2.marker, "*");
        assert_eq!(list2.marker_column, 2);

        // Line 3: "1. Ordered item"
        let line3 = &ctx.lines[2];
        assert!(line3.list_item.is_some());
        let list3 = line3.list_item.as_ref().unwrap();
        assert_eq!(list3.marker, "1.");
        assert!(list3.is_ordered);
        assert_eq!(list3.number, Some(1));

        // Line 6: "Not a list"
        let line6 = &ctx.lines[5];
        assert!(line6.list_item.is_none());
    }

    // Offsets on newline characters and one past the final character still map to a position
    // on the line they follow.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let content = "a\nb\nc";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        // line_offsets: [0, 2, 4]
        assert_eq!(ctx.offset_to_line_col(0), (1, 1)); // 'a'
        assert_eq!(ctx.offset_to_line_col(1), (1, 2)); // after 'a'
        assert_eq!(ctx.offset_to_line_col(2), (2, 1)); // 'b'
        assert_eq!(ctx.offset_to_line_col(3), (2, 2)); // after 'b'
        assert_eq!(ctx.offset_to_line_col(4), (3, 1)); // 'c'
        assert_eq!(ctx.offset_to_line_col(5), (3, 2)); // after 'c'
    }

    // In the MDX flavor, the leading `import`/`export` lines are flagged `in_esm_block`;
    // the blank, heading, and text lines that follow are not.
    #[test]
    fn test_mdx_esm_blocks() {
        let content = r##"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall

In {year}, the snowfall was above average.
It was followed by a warm spring which caused
flood conditions in many of the nearby rivers.

<Chart color="#fcb32c" year={year} />
"##;

        let ctx = LintContext::new(content, MarkdownFlavor::MDX);

        // Check that lines 1 and 2 are marked as ESM blocks
        assert_eq!(ctx.lines.len(), 10);
        assert!(ctx.lines[0].in_esm_block, "Line 1 (import) should be in_esm_block");
        assert!(ctx.lines[1].in_esm_block, "Line 2 (export) should be in_esm_block");
        assert!(!ctx.lines[2].in_esm_block, "Line 3 (blank) should NOT be in_esm_block");
        assert!(
            !ctx.lines[3].in_esm_block,
            "Line 4 (heading) should NOT be in_esm_block"
        );
        assert!(!ctx.lines[4].in_esm_block, "Line 5 (blank) should NOT be in_esm_block");
        assert!(!ctx.lines[5].in_esm_block, "Line 6 (text) should NOT be in_esm_block");
    }

    // The same `import`/`export` lines are not flagged when the flavor is Standard.
    #[test]
    fn test_mdx_esm_blocks_not_detected_in_standard_flavor() {
        let content = r#"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall
"#;

        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        // ESM blocks should NOT be detected in Standard flavor
        assert!(
            !ctx.lines[0].in_esm_block,
            "Line 1 should NOT be in_esm_block in Standard flavor"
        );
        assert!(
            !ctx.lines[1].in_esm_block,
            "Line 2 should NOT be in_esm_block in Standard flavor"
        );
    }
}
rumdl_lib/lint_context.rs

rumdl_lib/
lint_context.rs