rumdl_lib/
lint_context.rs

1use crate::config::MarkdownFlavor;
2use crate::rules::front_matter_utils::FrontMatterUtils;
3use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
4use pulldown_cmark::{BrokenLink, Event, LinkType, Options, Parser, Tag, TagEnd};
5use regex::Regex;
6use std::borrow::Cow;
7use std::sync::LazyLock;
8
/// Evaluates `$code` and, when `$profile` is true, prints the elapsed time to
/// stderr under the label `$name`. This variant is compiled for non-WASM targets.
///
/// The timer is started unconditionally, `$profile` is evaluated after `$code`,
/// and the macro expands to the value produced by `$code`.
#[cfg(not(target_arch = "wasm32"))]
macro_rules! profile_section {
    ($name:expr, $profile:expr, $code:expr) => {{
        let timer = std::time::Instant::now();
        let value = $code;
        if $profile {
            eprintln!("[PROFILE] {}: {:?}", $name, timer.elapsed());
        }
        value
    }};
}
21
/// WASM variant of `profile_section!`: no timing is performed, the macro simply
/// expands to `$code`. `$name` and `$profile` are accepted but not used.
#[cfg(target_arch = "wasm32")]
macro_rules! profile_section {
    ($name:expr, $profile:expr, $code:expr) => {{ $code }};
}
26
// Comprehensive link pattern that captures both inline and reference links.
// Flags: (?s) makes `.` match newlines (so an escaped newline is accepted in the link text);
// (?x) enables verbose mode, so whitespace and `#` comments inside the pattern are ignored.
// Capture groups: 1 = link text, 2 = <angle-bracketed> URL, 3 = bare URL,
// 4 = "double-quoted" title, 5 = 'single-quoted' title, 6 = reference ID.
static LINK_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
        \[((?:[^\[\]\\]|\\.)*)\]          # Link text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
        (?:
            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
            |
            \[([^\]]*)\]      # Reference ID in group 6
        )"#
    ).unwrap()
});
40
// Image pattern (same shape as the link pattern, but requiring a leading `!`).
// Flags: (?s) makes `.` match newlines; (?x) enables verbose mode, so whitespace and
// `#` comments inside the pattern are ignored.
// Capture groups: 1 = alt text, 2 = <angle-bracketed> URL, 3 = bare URL,
// 4 = "double-quoted" title, 5 = 'single-quoted' title, 6 = reference ID.
static IMAGE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
        !\[((?:[^\[\]\\]|\\.)*)\]         # Alt text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
        (?:
            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
            |
            \[([^\]]*)\]      # Reference ID in group 6
        )"#
    ).unwrap()
});
54
// Reference definition pattern: `[id]: url "title"` on a single line.
// (?m) makes `^`/`$` match at line boundaries; up to 3 leading spaces are allowed.
// Capture groups: 1 = reference ID, 2 = URL (no whitespace),
// 3 = "double-quoted" title, 4 = 'single-quoted' title.
static REF_DEF_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#).unwrap());
58
// Pattern for bare URLs with an http, https or ftp scheme (scheme in group 1).
// Whitespace, angle/square brackets, parentheses, backslashes, quotes and backticks
// are excluded from every URL component, so the match stops at any of them.
static BARE_URL_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
    ).unwrap()
});
65
// Pattern for email addresses: ASCII local part, `@`, domain, and a final
// alphabetic label of at least two letters. No capture groups.
static BARE_EMAIL_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap());
69
// Pattern for blockquote prefix in parse_list_blocks.
// Group 1 captures optional leading whitespace, one or more `>` and any whitespace after them.
static BLOCKQUOTE_PREFIX_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*>+\s*)").unwrap());
72
/// Pre-computed information about a single line of the source document
#[derive(Debug, Clone)]
pub struct LineInfo {
    /// Byte offset where this line starts in the document
    pub byte_offset: usize,
    /// Length of the line in bytes (without newline); together with `byte_offset`
    /// it delimits the slice returned by [`LineInfo::content`]
    pub byte_len: usize,
    /// Number of leading spaces/tabs
    pub indent: usize,
    /// Whether the line is blank (empty or only whitespace)
    pub is_blank: bool,
    /// Whether this line is inside a code block
    pub in_code_block: bool,
    /// Whether this line is inside front matter
    pub in_front_matter: bool,
    /// Whether this line is inside an HTML block
    pub in_html_block: bool,
    /// Whether this line is inside an HTML comment
    pub in_html_comment: bool,
    /// List item information if this line starts a list item
    pub list_item: Option<ListItemInfo>,
    /// Heading information if this line is a heading
    pub heading: Option<HeadingInfo>,
    /// Blockquote information if this line is a blockquote
    pub blockquote: Option<BlockquoteInfo>,
    /// Whether this line is inside a mkdocstrings autodoc block
    pub in_mkdocstrings: bool,
    /// Whether this line is part of an ESM import/export block (MDX only)
    pub in_esm_block: bool,
}
103
104impl LineInfo {
105    /// Get the line content as a string slice from the source document
106    pub fn content<'a>(&self, source: &'a str) -> &'a str {
107        &source[self.byte_offset..self.byte_offset + self.byte_len]
108    }
109}
110
/// Information about a list item (stored on the line that starts the item)
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// The marker used (*, -, +, or number with . or ))
    pub marker: String,
    /// Whether it's ordered (true) or unordered (false)
    pub is_ordered: bool,
    /// The number for ordered lists (`None` is presumably used for unordered items — confirm against the parser)
    pub number: Option<usize>,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where content after marker starts
    pub content_column: usize,
}
125
/// Heading style type: which Markdown syntax was used to write a heading
#[derive(Debug, Clone, PartialEq)]
pub enum HeadingStyle {
    /// ATX style heading (# Heading)
    ATX,
    /// Setext style heading with = underline
    Setext1,
    /// Setext style heading with - underline
    Setext2,
}
136
/// Parsed link information
///
/// String fields are `Cow<'a, str>`, so they may either borrow or own their text.
#[derive(Debug, Clone)]
pub struct ParsedLink<'a> {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Link text
    pub text: Cow<'a, str>,
    /// Link URL or reference
    pub url: Cow<'a, str>,
    /// Whether this is a reference link [text][ref] vs inline [text](url)
    pub is_reference: bool,
    /// Reference ID for reference links
    pub reference_id: Option<Cow<'a, str>>,
    /// Link type from pulldown-cmark
    pub link_type: LinkType,
}
161
/// Information about a broken link reported by pulldown-cmark
/// (a reference that could not be resolved)
#[derive(Debug, Clone)]
pub struct BrokenLinkInfo {
    /// The reference text that couldn't be resolved
    pub reference: String,
    /// Byte span in the source document
    pub span: std::ops::Range<usize>,
}
170
/// Parsed footnote reference (e.g., `[^1]`, `[^note]`)
#[derive(Debug, Clone)]
pub struct FootnoteRef {
    /// The footnote ID (without the ^ prefix), e.g. `1` or `note`
    pub id: String,
    /// Line number (1-indexed)
    pub line: usize,
    /// Start byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
}
183
/// Parsed image information
///
/// Mirrors [`ParsedLink`], with `alt_text` in place of the link text.
#[derive(Debug, Clone)]
pub struct ParsedImage<'a> {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Alt text
    pub alt_text: Cow<'a, str>,
    /// Image URL or reference
    pub url: Cow<'a, str>,
    /// Whether this is a reference image ![alt][ref] vs inline ![alt](url)
    pub is_reference: bool,
    /// Reference ID for reference images
    pub reference_id: Option<Cow<'a, str>>,
    /// Link type from pulldown-cmark
    pub link_type: LinkType,
}
208
/// Reference definition [ref]: url "title"
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// Line number (1-indexed)
    pub line: usize,
    /// Reference ID (normalized to lowercase); lookups such as
    /// `LintContext::get_reference_url` lowercase their key before comparing
    pub id: String,
    /// URL
    pub url: String,
    /// Optional title
    pub title: Option<String>,
    /// Byte offset where the reference definition starts
    pub byte_offset: usize,
    /// Byte offset where the reference definition ends (treated as exclusive by
    /// `LintContext::is_in_reference_def`)
    pub byte_end: usize,
}
225
/// Parsed code span information (inline code delimited by backticks)
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line (treated as exclusive by `LintContext::is_in_code_span`)
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document (treated as exclusive by `LintContext::is_in_code_block_or_span`)
    pub byte_end: usize,
    /// Number of backticks used (1, 2, 3, etc.)
    pub backtick_count: usize,
    /// Content inside the code span (without backticks)
    pub content: String,
}
244
/// Information about a heading (stored on the line that carries the heading)
#[derive(Debug, Clone)]
pub struct HeadingInfo {
    /// Heading level (1-6 for ATX, 1-2 for Setext)
    pub level: u8,
    /// Style of heading
    pub style: HeadingStyle,
    /// The heading marker (# characters or underline)
    pub marker: String,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where heading text starts
    pub content_column: usize,
    /// The heading text (without markers and without custom ID syntax)
    pub text: String,
    /// Custom header ID if present (e.g., from {#custom-id} syntax)
    pub custom_id: Option<String>,
    /// Original heading text including custom ID syntax
    pub raw_text: String,
    /// Whether it has a closing sequence (for ATX)
    pub has_closing_sequence: bool,
    /// The closing sequence if present (presumably empty otherwise — confirm against the parser)
    pub closing_sequence: String,
}
269
/// Information about a blockquote line
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Nesting level (1 for >, 2 for >>, etc.)
    pub nesting_level: usize,
    /// The indentation before the blockquote marker
    pub indent: String,
    /// Column where the first > starts (0-based)
    pub marker_column: usize,
    /// The blockquote prefix (e.g., "> ", ">> ", etc.)
    pub prefix: String,
    /// Content after the blockquote marker(s)
    pub content: String,
    /// Whether the line has no space after the marker
    pub has_no_space_after_marker: bool,
    /// Whether the line has multiple spaces after the marker
    pub has_multiple_spaces_after_marker: bool,
    /// Whether this is an empty blockquote line needing MD028 fix
    pub needs_md028_fix: bool,
}
290
/// Information about a list block (a run of lines forming one list)
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// Line number where the list starts (1-indexed)
    pub start_line: usize,
    /// Line number where the list ends (1-indexed, inclusive — see `LintContext::is_in_list_block`)
    pub end_line: usize,
    /// Whether it's ordered or unordered
    pub is_ordered: bool,
    /// The consistent marker for unordered lists (if any)
    pub marker: Option<String>,
    /// Blockquote prefix for this list (empty if not in blockquote)
    pub blockquote_prefix: String,
    /// Lines that are list items within this block
    pub item_lines: Vec<usize>,
    /// Nesting level (0 for top-level lists)
    pub nesting_level: usize,
    /// Maximum marker width seen in this block (e.g., 3 for "1. ", 4 for "10. ")
    pub max_marker_width: usize,
}
311
312use std::sync::{Arc, Mutex};
313
/// Character frequency data for fast content analysis
///
/// These counters back `LintContext::has_char`, `LintContext::char_count` and the
/// `likely_has_*` pre-filters. `Default` yields all-zero counts.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Count of # characters (headings)
    pub hash_count: usize,
    /// Count of * characters (emphasis, lists, horizontal rules)
    pub asterisk_count: usize,
    /// Count of _ characters (emphasis, horizontal rules)
    pub underscore_count: usize,
    /// Count of - characters (lists, horizontal rules, setext headings)
    pub hyphen_count: usize,
    /// Count of + characters (lists)
    pub plus_count: usize,
    /// Count of > characters (blockquotes)
    pub gt_count: usize,
    /// Count of | characters (tables)
    pub pipe_count: usize,
    /// Count of [ characters (links, images)
    pub bracket_count: usize,
    /// Count of ` characters (code spans, code blocks)
    pub backtick_count: usize,
    /// Count of < characters (HTML tags, autolinks)
    pub lt_count: usize,
    /// Count of ! characters (images)
    pub exclamation_count: usize,
    /// Count of newline characters
    pub newline_count: usize,
}
342
/// Pre-parsed HTML tag information (one entry per tag occurrence)
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Tag name (e.g., "div", "img", "br")
    pub tag_name: String,
    /// Whether it's a closing tag (`</tag>`)
    pub is_closing: bool,
    /// Whether it's self-closing (`<tag />`)
    pub is_self_closing: bool,
    /// Raw tag content
    pub raw_content: String,
}
365
/// Pre-parsed emphasis span information (italic/bold runs delimited by `*` or `_`)
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Type of emphasis ('*' or '_')
    pub marker: char,
    /// Number of markers (1 for italic, 2 for bold, 3+ for bold+italic)
    pub marker_count: usize,
    /// Content inside the emphasis
    pub content: String,
}
386
/// Pre-parsed table row information
#[derive(Debug, Clone)]
pub struct TableRow {
    /// Line number (1-indexed)
    pub line: usize,
    /// Whether this is a separator row (contains only |, -, :, and spaces)
    pub is_separator: bool,
    /// Number of columns (pipe-separated cells)
    pub column_count: usize,
    /// Alignment info from separator row, one string per column
    pub column_alignments: Vec<String>, // "left", "center", "right", "none"
}
399
/// Pre-parsed bare URL information (not in links)
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// The URL string
    pub url: String,
    /// Type of URL ("http", "https", "ftp", "email"); "email" marks a bare
    /// email address rather than a URL scheme
    pub url_type: String,
}
418
/// Pre-computed analysis of a single Markdown document.
///
/// Built once by `LintContext::new`, which borrows `content` for `'a`.
/// Public fields hold data computed during construction; the private
/// `Mutex<Option<Arc<..>>>` fields are caches exposed through accessor
/// methods (`code_spans`, `html_tags`, `emphasis_spans`, `table_rows`,
/// `bare_urls`) that fill them on demand.
pub struct LintContext<'a> {
    pub content: &'a str,
    pub line_offsets: Vec<usize>,
    pub code_blocks: Vec<(usize, usize)>, // Cached code block ranges (not including inline code spans)
    pub lines: Vec<LineInfo>,             // Pre-computed line information
    pub links: Vec<ParsedLink<'a>>,       // Pre-parsed links
    pub images: Vec<ParsedImage<'a>>,     // Pre-parsed images
    pub broken_links: Vec<BrokenLinkInfo>, // Broken/undefined references
    pub footnote_refs: Vec<FootnoteRef>,  // Pre-parsed footnote references
    pub reference_defs: Vec<ReferenceDef>, // Reference definitions
    code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>, // Lazy-loaded inline code spans
    pub list_blocks: Vec<ListBlock>,      // Pre-parsed list blocks
    pub char_frequency: CharFrequency,    // Character frequency analysis
    html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>, // Lazy-loaded HTML tags
    emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>, // Lazy-loaded emphasis spans
    table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>, // Lazy-loaded table rows
    bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>, // Lazy-loaded bare URLs
    html_comment_ranges: Vec<crate::utils::skip_context::ByteRange>, // Pre-computed HTML comment ranges
    pub table_blocks: Vec<crate::utils::table_utils::TableBlock>, // Pre-computed table blocks
    pub line_index: crate::utils::range_utils::LineIndex<'a>, // Pre-computed line index for byte position calculations
    jinja_ranges: Vec<(usize, usize)>,    // Pre-computed Jinja template ranges ({{ }}, {% %})
    pub flavor: MarkdownFlavor,           // Markdown flavor being used
}
442
/// Borrowed pieces of a blockquote line, as produced by `parse_blockquote_detailed`
struct BlockquoteComponents<'a> {
    /// Leading spaces/tabs before the first `>`
    indent: &'a str,
    /// The contiguous run of `>` characters
    markers: &'a str,
    /// Spaces/tabs directly after the markers
    spaces_after: &'a str,
    /// Everything that follows `spaces_after`
    content: &'a str,
}

/// Split a line into blockquote indent, `>` markers, trailing whitespace and content.
///
/// Returns `None` unless the first character after any leading spaces/tabs is `>`.
/// Only a contiguous run of `>` counts as markers, so in `> > x` the second `>`
/// ends up in `content`.
#[inline]
fn parse_blockquote_detailed(line: &str) -> Option<BlockquoteComponents<'_>> {
    fn is_space_or_tab(c: char) -> bool {
        c == ' ' || c == '\t'
    }

    // Peel the three prefixes off one after another; each remainder is a suffix of `line`.
    let after_indent = line.trim_start_matches(is_space_or_tab);
    let after_markers = after_indent.trim_start_matches('>');

    // At least one '>' is required
    let marker_len = after_indent.len() - after_markers.len();
    if marker_len == 0 {
        return None;
    }

    let content = after_markers.trim_start_matches(is_space_or_tab);
    let indent_len = line.len() - after_indent.len();
    let spaces_len = after_markers.len() - content.len();

    Some(BlockquoteComponents {
        indent: &line[..indent_len],
        markers: &after_indent[..marker_len],
        spaces_after: &after_markers[..spaces_len],
        content,
    })
}
487
488impl<'a> LintContext<'a> {
    /// Build a `LintContext` for `content`, running every eager analysis once.
    ///
    /// The steps run in a fixed order because later ones consume earlier results:
    /// line offsets, code blocks, HTML comment ranges and (MkDocs flavor only)
    /// autodoc ranges feed the basic line info; HTML-block and ESM-block detection
    /// update `lines` before headings/blockquotes are detected; code spans are
    /// parsed before links, images and table blocks so those can take them into account.
    ///
    /// On non-WASM targets, per-step timings are printed to stderr when the
    /// `RUMDL_PROFILE_QUADRATIC` environment variable is set.
    pub fn new(content: &'a str, flavor: MarkdownFlavor) -> Self {
        #[cfg(not(target_arch = "wasm32"))]
        let profile = std::env::var("RUMDL_PROFILE_QUADRATIC").is_ok();
        #[cfg(target_arch = "wasm32")]
        let profile = false;

        // Byte offset at which each line starts; the first line always starts at 0
        let line_offsets = profile_section!("Line offsets", profile, {
            let mut offsets = vec![0];
            for (i, c) in content.char_indices() {
                if c == '\n' {
                    offsets.push(i + 1);
                }
            }
            offsets
        });

        // Detect code blocks once and cache them
        let code_blocks = profile_section!("Code blocks", profile, CodeBlockUtils::detect_code_blocks(content));

        // Pre-compute HTML comment ranges ONCE for all operations
        let html_comment_ranges = profile_section!(
            "HTML comment ranges",
            profile,
            crate::utils::skip_context::compute_html_comment_ranges(content)
        );

        // Pre-compute autodoc block ranges for MkDocs flavor (avoids O(n²) scaling)
        let autodoc_ranges = profile_section!("Autodoc block ranges", profile, {
            if flavor == MarkdownFlavor::MkDocs {
                crate::utils::mkdocstrings_refs::detect_autodoc_block_ranges(content)
            } else {
                Vec::new()
            }
        });

        // Pre-compute line information (without headings/blockquotes yet)
        let mut lines = profile_section!(
            "Basic line info",
            profile,
            Self::compute_basic_line_info(
                content,
                &line_offsets,
                &code_blocks,
                flavor,
                &html_comment_ranges,
                &autodoc_ranges,
            )
        );

        // Detect HTML blocks BEFORE heading detection
        profile_section!("HTML blocks", profile, Self::detect_html_blocks(content, &mut lines));

        // Detect ESM import/export blocks in MDX files BEFORE heading detection
        profile_section!(
            "ESM blocks",
            profile,
            Self::detect_esm_blocks(content, &mut lines, flavor)
        );

        // Now detect headings and blockquotes
        profile_section!(
            "Headings & blockquotes",
            profile,
            Self::detect_headings_and_blockquotes(content, &mut lines, flavor, &html_comment_ranges)
        );

        // Parse code spans early so we can exclude them from link/image parsing
        let code_spans = profile_section!("Code spans", profile, Self::parse_code_spans(content, &lines));

        // Parse links, images, references, and list blocks
        let (links, broken_links, footnote_refs) = profile_section!(
            "Links",
            profile,
            Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor, &html_comment_ranges)
        );

        let images = profile_section!(
            "Images",
            profile,
            Self::parse_images(content, &lines, &code_blocks, &code_spans, &html_comment_ranges)
        );

        let reference_defs = profile_section!("Reference defs", profile, Self::parse_reference_defs(content, &lines));

        let list_blocks = profile_section!("List blocks", profile, Self::parse_list_blocks(content, &lines));

        // Compute character frequency for fast content analysis
        let char_frequency = profile_section!("Char frequency", profile, Self::compute_char_frequency(content));

        // Pre-compute table blocks for rules that need them (MD013, MD055, MD056, MD058, MD060)
        let table_blocks = profile_section!(
            "Table blocks",
            profile,
            crate::utils::table_utils::TableUtils::find_table_blocks_with_code_info(
                content,
                &code_blocks,
                &code_spans,
                &html_comment_ranges,
            )
        );

        // Pre-compute LineIndex once for all rules (eliminates 46x content cloning)
        let line_index = profile_section!(
            "Line index",
            profile,
            crate::utils::range_utils::LineIndex::new(content)
        );

        // Pre-compute Jinja template ranges once for all rules (eliminates O(n×m) in MD011)
        let jinja_ranges = profile_section!(
            "Jinja ranges",
            profile,
            crate::utils::jinja_utils::find_jinja_ranges(content)
        );

        Self {
            content,
            line_offsets,
            code_blocks,
            lines,
            links,
            images,
            broken_links,
            footnote_refs,
            reference_defs,
            // Seeded with the spans parsed above, so `code_spans()` does not need to re-parse
            code_spans_cache: Mutex::new(Some(Arc::new(code_spans))),
            list_blocks,
            char_frequency,
            // The remaining caches start empty and are filled by their accessors
            html_tags_cache: Mutex::new(None),
            emphasis_spans_cache: Mutex::new(None),
            table_rows_cache: Mutex::new(None),
            bare_urls_cache: Mutex::new(None),
            html_comment_ranges,
            table_blocks,
            line_index,
            jinja_ranges,
            flavor,
        }
    }
628
629    /// Get code spans - computed lazily on first access
630    pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
631        let mut cache = self.code_spans_cache.lock().expect("Code spans cache mutex poisoned");
632
633        Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_code_spans(self.content, &self.lines))))
634    }
635
636    /// Get HTML comment ranges - pre-computed during LintContext construction
637    pub fn html_comment_ranges(&self) -> &[crate::utils::skip_context::ByteRange] {
638        &self.html_comment_ranges
639    }
640
641    /// Get HTML tags - computed lazily on first access
642    pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
643        let mut cache = self.html_tags_cache.lock().expect("HTML tags cache mutex poisoned");
644
645        Arc::clone(cache.get_or_insert_with(|| {
646            Arc::new(Self::parse_html_tags(
647                self.content,
648                &self.lines,
649                &self.code_blocks,
650                self.flavor,
651            ))
652        }))
653    }
654
655    /// Get emphasis spans - computed lazily on first access
656    pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
657        let mut cache = self
658            .emphasis_spans_cache
659            .lock()
660            .expect("Emphasis spans cache mutex poisoned");
661
662        Arc::clone(
663            cache.get_or_insert_with(|| {
664                Arc::new(Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks))
665            }),
666        )
667    }
668
669    /// Get table rows - computed lazily on first access
670    pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
671        let mut cache = self.table_rows_cache.lock().expect("Table rows cache mutex poisoned");
672
673        Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_table_rows(self.content, &self.lines))))
674    }
675
676    /// Get bare URLs - computed lazily on first access
677    pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
678        let mut cache = self.bare_urls_cache.lock().expect("Bare URLs cache mutex poisoned");
679
680        Arc::clone(
681            cache.get_or_insert_with(|| Arc::new(Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks))),
682        )
683    }
684
685    /// Map a byte offset to (line, column)
686    pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
687        match self.line_offsets.binary_search(&offset) {
688            Ok(line) => (line + 1, 1),
689            Err(line) => {
690                let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
691                (line, offset - line_start + 1)
692            }
693        }
694    }
695
696    /// Check if a position is within a code block or code span
697    pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
698        // Check code blocks first
699        if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
700            return true;
701        }
702
703        // Check inline code spans (lazy load if needed)
704        self.code_spans()
705            .iter()
706            .any(|span| pos >= span.byte_offset && pos < span.byte_end)
707    }
708
709    /// Get line information by line number (1-indexed)
710    pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
711        if line_num > 0 {
712            self.lines.get(line_num - 1)
713        } else {
714            None
715        }
716    }
717
718    /// Get byte offset for a line number (1-indexed)
719    pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
720        self.line_info(line_num).map(|info| info.byte_offset)
721    }
722
723    /// Get URL for a reference link/image by its ID
724    pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
725        let normalized_id = ref_id.to_lowercase();
726        self.reference_defs
727            .iter()
728            .find(|def| def.id == normalized_id)
729            .map(|def| def.url.as_str())
730    }
731
732    /// Check if a line is part of a list block
733    pub fn is_in_list_block(&self, line_num: usize) -> bool {
734        self.list_blocks
735            .iter()
736            .any(|block| line_num >= block.start_line && line_num <= block.end_line)
737    }
738
739    /// Get the list block containing a specific line
740    pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
741        self.list_blocks
742            .iter()
743            .find(|block| line_num >= block.start_line && line_num <= block.end_line)
744    }
745
746    // Compatibility methods for DocumentStructure migration
747
748    /// Check if a line is within a code block
749    pub fn is_in_code_block(&self, line_num: usize) -> bool {
750        if line_num == 0 || line_num > self.lines.len() {
751            return false;
752        }
753        self.lines[line_num - 1].in_code_block
754    }
755
756    /// Check if a line is within front matter
757    pub fn is_in_front_matter(&self, line_num: usize) -> bool {
758        if line_num == 0 || line_num > self.lines.len() {
759            return false;
760        }
761        self.lines[line_num - 1].in_front_matter
762    }
763
764    /// Check if a line is within an HTML block
765    pub fn is_in_html_block(&self, line_num: usize) -> bool {
766        if line_num == 0 || line_num > self.lines.len() {
767            return false;
768        }
769        self.lines[line_num - 1].in_html_block
770    }
771
772    /// Check if a line and column is within a code span
773    pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
774        if line_num == 0 || line_num > self.lines.len() {
775            return false;
776        }
777
778        // Use the code spans cache to check
779        // Note: col is 1-indexed from caller, but span.start_col and span.end_col are 0-indexed
780        // Convert col to 0-indexed for comparison
781        let col_0indexed = if col > 0 { col - 1 } else { 0 };
782        let code_spans = self.code_spans();
783        code_spans
784            .iter()
785            .any(|span| span.line == line_num && col_0indexed >= span.start_col && col_0indexed < span.end_col)
786    }
787
788    /// Check if a byte position is within a reference definition
789    /// This is much faster than scanning the content with regex for each check (O(1) vs O(n))
790    #[inline]
791    pub fn is_in_reference_def(&self, byte_pos: usize) -> bool {
792        self.reference_defs
793            .iter()
794            .any(|ref_def| byte_pos >= ref_def.byte_offset && byte_pos < ref_def.byte_end)
795    }
796
797    /// Check if a byte position is within an HTML comment
798    /// This is much faster than scanning the content with regex for each check (O(k) vs O(n))
799    /// where k is the number of HTML comments (typically very small)
800    #[inline]
801    pub fn is_in_html_comment(&self, byte_pos: usize) -> bool {
802        self.html_comment_ranges
803            .iter()
804            .any(|range| byte_pos >= range.start && byte_pos < range.end)
805    }
806
807    /// Check if a byte position is within a Jinja template ({{ }} or {% %})
808    pub fn is_in_jinja_range(&self, byte_pos: usize) -> bool {
809        self.jinja_ranges
810            .iter()
811            .any(|(start, end)| byte_pos >= *start && byte_pos < *end)
812    }
813
814    /// Check if content has any instances of a specific character (fast)
815    pub fn has_char(&self, ch: char) -> bool {
816        match ch {
817            '#' => self.char_frequency.hash_count > 0,
818            '*' => self.char_frequency.asterisk_count > 0,
819            '_' => self.char_frequency.underscore_count > 0,
820            '-' => self.char_frequency.hyphen_count > 0,
821            '+' => self.char_frequency.plus_count > 0,
822            '>' => self.char_frequency.gt_count > 0,
823            '|' => self.char_frequency.pipe_count > 0,
824            '[' => self.char_frequency.bracket_count > 0,
825            '`' => self.char_frequency.backtick_count > 0,
826            '<' => self.char_frequency.lt_count > 0,
827            '!' => self.char_frequency.exclamation_count > 0,
828            '\n' => self.char_frequency.newline_count > 0,
829            _ => self.content.contains(ch), // Fallback for other characters
830        }
831    }
832
833    /// Get count of a specific character (fast)
834    pub fn char_count(&self, ch: char) -> usize {
835        match ch {
836            '#' => self.char_frequency.hash_count,
837            '*' => self.char_frequency.asterisk_count,
838            '_' => self.char_frequency.underscore_count,
839            '-' => self.char_frequency.hyphen_count,
840            '+' => self.char_frequency.plus_count,
841            '>' => self.char_frequency.gt_count,
842            '|' => self.char_frequency.pipe_count,
843            '[' => self.char_frequency.bracket_count,
844            '`' => self.char_frequency.backtick_count,
845            '<' => self.char_frequency.lt_count,
846            '!' => self.char_frequency.exclamation_count,
847            '\n' => self.char_frequency.newline_count,
848            _ => self.content.matches(ch).count(), // Fallback for other characters
849        }
850    }
851
852    /// Check if content likely contains headings (fast)
853    pub fn likely_has_headings(&self) -> bool {
854        self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 // Potential setext underlines
855    }
856
857    /// Check if content likely contains lists (fast)
858    pub fn likely_has_lists(&self) -> bool {
859        self.char_frequency.asterisk_count > 0
860            || self.char_frequency.hyphen_count > 0
861            || self.char_frequency.plus_count > 0
862    }
863
864    /// Check if content likely contains emphasis (fast)
865    pub fn likely_has_emphasis(&self) -> bool {
866        self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
867    }
868
869    /// Check if content likely contains tables (fast)
870    pub fn likely_has_tables(&self) -> bool {
871        self.char_frequency.pipe_count > 2
872    }
873
874    /// Check if content likely contains blockquotes (fast)
875    pub fn likely_has_blockquotes(&self) -> bool {
876        self.char_frequency.gt_count > 0
877    }
878
879    /// Check if content likely contains code (fast)
880    pub fn likely_has_code(&self) -> bool {
881        self.char_frequency.backtick_count > 0
882    }
883
884    /// Check if content likely contains links or images (fast)
885    pub fn likely_has_links_or_images(&self) -> bool {
886        self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
887    }
888
889    /// Check if content likely contains HTML (fast)
890    pub fn likely_has_html(&self) -> bool {
891        self.char_frequency.lt_count > 0
892    }
893
894    /// Get HTML tags on a specific line
895    pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
896        self.html_tags()
897            .iter()
898            .filter(|tag| tag.line == line_num)
899            .cloned()
900            .collect()
901    }
902
903    /// Get emphasis spans on a specific line
904    pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
905        self.emphasis_spans()
906            .iter()
907            .filter(|span| span.line == line_num)
908            .cloned()
909            .collect()
910    }
911
912    /// Get table rows on a specific line
913    pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
914        self.table_rows()
915            .iter()
916            .filter(|row| row.line == line_num)
917            .cloned()
918            .collect()
919    }
920
921    /// Get bare URLs on a specific line
922    pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
923        self.bare_urls()
924            .iter()
925            .filter(|url| url.line == line_num)
926            .cloned()
927            .collect()
928    }
929
930    /// Find the line index for a given byte offset using binary search.
931    /// Returns (line_index, line_number, column) where:
932    /// - line_index is the 0-based index in the lines array
933    /// - line_number is the 1-based line number
934    /// - column is the byte offset within that line
935    #[inline]
936    fn find_line_for_offset(lines: &[LineInfo], byte_offset: usize) -> (usize, usize, usize) {
937        // Binary search to find the line containing this byte offset
938        let idx = match lines.binary_search_by(|line| {
939            if byte_offset < line.byte_offset {
940                std::cmp::Ordering::Greater
941            } else if byte_offset > line.byte_offset + line.byte_len {
942                std::cmp::Ordering::Less
943            } else {
944                std::cmp::Ordering::Equal
945            }
946        }) {
947            Ok(idx) => idx,
948            Err(idx) => idx.saturating_sub(1),
949        };
950
951        let line = &lines[idx];
952        let line_num = idx + 1;
953        let col = byte_offset.saturating_sub(line.byte_offset);
954
955        (idx, line_num, col)
956    }
957
958    /// Check if a byte offset is within a code span using binary search
959    #[inline]
960    fn is_offset_in_code_span(code_spans: &[CodeSpan], offset: usize) -> bool {
961        // Since spans are sorted by byte_offset, use partition_point for binary search
962        let idx = code_spans.partition_point(|span| span.byte_offset <= offset);
963
964        // Check the span that starts at or before our offset
965        if idx > 0 {
966            let span = &code_spans[idx - 1];
967            if offset >= span.byte_offset && offset < span.byte_end {
968                return true;
969            }
970        }
971
972        false
973    }
974
975    /// Parse all links in the content
976    fn parse_links(
977        content: &'a str,
978        lines: &[LineInfo],
979        code_blocks: &[(usize, usize)],
980        code_spans: &[CodeSpan],
981        flavor: MarkdownFlavor,
982        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
983    ) -> (Vec<ParsedLink<'a>>, Vec<BrokenLinkInfo>, Vec<FootnoteRef>) {
984        use crate::utils::skip_context::{is_in_html_comment_ranges, is_mkdocs_snippet_line};
985        use std::collections::HashSet;
986
987        let mut links = Vec::with_capacity(content.len() / 500);
988        let mut broken_links = Vec::new();
989        let mut footnote_refs = Vec::new();
990
991        // Track byte positions of links found by pulldown-cmark
992        let mut found_positions = HashSet::new();
993
994        // Use pulldown-cmark's streaming parser with BrokenLink callback
995        // The callback captures undefined references: [text][undefined], [shortcut], [text][]
996        // This automatically handles:
997        // - Escaped links (won't generate events)
998        // - Links in code blocks/spans (won't generate Link events)
999        // - Images (generates Tag::Image instead)
1000        // - Reference resolution (dest_url is already resolved!)
1001        // - Broken references (callback is invoked)
1002        // - Wiki-links (enabled via ENABLE_WIKILINKS)
1003        let mut options = Options::empty();
1004        options.insert(Options::ENABLE_WIKILINKS);
1005        options.insert(Options::ENABLE_FOOTNOTES);
1006
1007        let parser = Parser::new_with_broken_link_callback(
1008            content,
1009            options,
1010            Some(|link: BrokenLink<'_>| {
1011                broken_links.push(BrokenLinkInfo {
1012                    reference: link.reference.to_string(),
1013                    span: link.span.clone(),
1014                });
1015                None
1016            }),
1017        )
1018        .into_offset_iter();
1019
1020        let mut link_stack: Vec<(
1021            usize,
1022            usize,
1023            pulldown_cmark::CowStr<'a>,
1024            LinkType,
1025            pulldown_cmark::CowStr<'a>,
1026        )> = Vec::new();
1027        let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1028
1029        for (event, range) in parser {
1030            match event {
1031                Event::Start(Tag::Link {
1032                    link_type,
1033                    dest_url,
1034                    id,
1035                    ..
1036                }) => {
1037                    // Link start - record position, URL, and reference ID
1038                    link_stack.push((range.start, range.end, dest_url, link_type, id));
1039                    text_chunks.clear();
1040                }
1041                Event::Text(text) if !link_stack.is_empty() => {
1042                    // Track text content with its byte range
1043                    text_chunks.push((text.to_string(), range.start, range.end));
1044                }
1045                Event::Code(code) if !link_stack.is_empty() => {
1046                    // Include inline code in link text (with backticks)
1047                    let code_text = format!("`{code}`");
1048                    text_chunks.push((code_text, range.start, range.end));
1049                }
1050                Event::End(TagEnd::Link) => {
1051                    if let Some((start_pos, _link_start_end, url, link_type, ref_id)) = link_stack.pop() {
1052                        // Skip if in HTML comment
1053                        if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1054                            text_chunks.clear();
1055                            continue;
1056                        }
1057
1058                        // Find line and column information
1059                        let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1060
1061                        // Skip if this link is on a MkDocs snippet line
1062                        if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1063                            text_chunks.clear();
1064                            continue;
1065                        }
1066
1067                        let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1068
1069                        let is_reference = matches!(
1070                            link_type,
1071                            LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1072                        );
1073
1074                        // Extract link text directly from source bytes to preserve escaping
1075                        // Text events from pulldown-cmark unescape \] → ], which breaks MD039
1076                        let link_text = if start_pos < content.len() {
1077                            let link_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1078
1079                            // Find MATCHING ] by tracking bracket depth for nested brackets
1080                            // An unescaped bracket is one NOT preceded by an odd number of backslashes
1081                            // Brackets inside code spans (between backticks) should be ignored
1082                            let mut close_pos = None;
1083                            let mut depth = 0;
1084                            let mut in_code_span = false;
1085
1086                            for (i, &byte) in link_bytes.iter().enumerate().skip(1) {
1087                                // Count preceding backslashes
1088                                let mut backslash_count = 0;
1089                                let mut j = i;
1090                                while j > 0 && link_bytes[j - 1] == b'\\' {
1091                                    backslash_count += 1;
1092                                    j -= 1;
1093                                }
1094                                let is_escaped = backslash_count % 2 != 0;
1095
1096                                // Track code spans - backticks toggle in/out of code
1097                                if byte == b'`' && !is_escaped {
1098                                    in_code_span = !in_code_span;
1099                                }
1100
1101                                // Only count brackets when NOT in a code span
1102                                if !is_escaped && !in_code_span {
1103                                    if byte == b'[' {
1104                                        depth += 1;
1105                                    } else if byte == b']' {
1106                                        if depth == 0 {
1107                                            // Found the matching closing bracket
1108                                            close_pos = Some(i);
1109                                            break;
1110                                        } else {
1111                                            depth -= 1;
1112                                        }
1113                                    }
1114                                }
1115                            }
1116
1117                            if let Some(pos) = close_pos {
1118                                Cow::Borrowed(std::str::from_utf8(&link_bytes[1..pos]).unwrap_or(""))
1119                            } else {
1120                                Cow::Borrowed("")
1121                            }
1122                        } else {
1123                            Cow::Borrowed("")
1124                        };
1125
1126                        // For reference links, use the actual reference ID from pulldown-cmark
1127                        let reference_id = if is_reference && !ref_id.is_empty() {
1128                            Some(Cow::Owned(ref_id.to_lowercase()))
1129                        } else if is_reference {
1130                            // For collapsed/shortcut references without explicit ID, use the link text
1131                            Some(Cow::Owned(link_text.to_lowercase()))
1132                        } else {
1133                            None
1134                        };
1135
1136                        // WORKAROUND: pulldown-cmark bug with escaped brackets
1137                        // Check for escaped image syntax: \![text](url)
1138                        // The byte_offset points to the '[', so we check 2 bytes back for '\!'
1139                        let has_escaped_bang = start_pos >= 2
1140                            && content.as_bytes().get(start_pos - 2) == Some(&b'\\')
1141                            && content.as_bytes().get(start_pos - 1) == Some(&b'!');
1142
1143                        // Check for escaped bracket: \[text](url)
1144                        // The byte_offset points to the '[', so we check 1 byte back for '\'
1145                        let has_escaped_bracket =
1146                            start_pos >= 1 && content.as_bytes().get(start_pos - 1) == Some(&b'\\');
1147
1148                        if has_escaped_bang || has_escaped_bracket {
1149                            text_chunks.clear();
1150                            continue; // Skip: this is escaped markdown, not a real link
1151                        }
1152
1153                        // Track this position as found
1154                        found_positions.insert(start_pos);
1155
1156                        links.push(ParsedLink {
1157                            line: line_num,
1158                            start_col: col_start,
1159                            end_col: col_end,
1160                            byte_offset: start_pos,
1161                            byte_end: range.end,
1162                            text: link_text,
1163                            url: Cow::Owned(url.to_string()),
1164                            is_reference,
1165                            reference_id,
1166                            link_type,
1167                        });
1168
1169                        text_chunks.clear();
1170                    }
1171                }
1172                Event::FootnoteReference(footnote_id) => {
1173                    // Capture footnote references like [^1], [^note]
1174                    // Skip if in HTML comment
1175                    if is_in_html_comment_ranges(html_comment_ranges, range.start) {
1176                        continue;
1177                    }
1178
1179                    let (_, line_num, _) = Self::find_line_for_offset(lines, range.start);
1180                    footnote_refs.push(FootnoteRef {
1181                        id: footnote_id.to_string(),
1182                        line: line_num,
1183                        byte_offset: range.start,
1184                        byte_end: range.end,
1185                    });
1186                }
1187                _ => {}
1188            }
1189        }
1190
1191        // Also find undefined references using regex
1192        // These are patterns like [text][ref] that pulldown-cmark didn't parse as links
1193        // because the reference is undefined
1194        for cap in LINK_PATTERN.captures_iter(content) {
1195            let full_match = cap.get(0).unwrap();
1196            let match_start = full_match.start();
1197            let match_end = full_match.end();
1198
1199            // Skip if this was already found by pulldown-cmark (it's a valid link)
1200            if found_positions.contains(&match_start) {
1201                continue;
1202            }
1203
1204            // Skip if escaped
1205            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1206                continue;
1207            }
1208
1209            // Skip if it's an image
1210            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
1211                continue;
1212            }
1213
1214            // Skip if in code block
1215            if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
1216                continue;
1217            }
1218
1219            // Skip if in code span
1220            if Self::is_offset_in_code_span(code_spans, match_start) {
1221                continue;
1222            }
1223
1224            // Skip if in HTML comment
1225            if is_in_html_comment_ranges(html_comment_ranges, match_start) {
1226                continue;
1227            }
1228
1229            // Find line and column information
1230            let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1231
1232            // Skip if this link is on a MkDocs snippet line
1233            if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1234                continue;
1235            }
1236
1237            let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1238
1239            let text = cap.get(1).map_or("", |m| m.as_str());
1240
1241            // Only process reference links (group 6)
1242            if let Some(ref_id) = cap.get(6) {
1243                let ref_id_str = ref_id.as_str();
1244                let normalized_ref = if ref_id_str.is_empty() {
1245                    Cow::Owned(text.to_lowercase()) // Implicit reference
1246                } else {
1247                    Cow::Owned(ref_id_str.to_lowercase())
1248                };
1249
1250                // This is an undefined reference (pulldown-cmark didn't parse it)
1251                links.push(ParsedLink {
1252                    line: line_num,
1253                    start_col: col_start,
1254                    end_col: col_end,
1255                    byte_offset: match_start,
1256                    byte_end: match_end,
1257                    text: Cow::Borrowed(text),
1258                    url: Cow::Borrowed(""), // Empty URL indicates undefined reference
1259                    is_reference: true,
1260                    reference_id: Some(normalized_ref),
1261                    link_type: LinkType::Reference, // Undefined references are reference-style
1262                });
1263            }
1264        }
1265
1266        (links, broken_links, footnote_refs)
1267    }
1268
1269    /// Parse all images in the content
1270    fn parse_images(
1271        content: &'a str,
1272        lines: &[LineInfo],
1273        code_blocks: &[(usize, usize)],
1274        code_spans: &[CodeSpan],
1275        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1276    ) -> Vec<ParsedImage<'a>> {
1277        use crate::utils::skip_context::is_in_html_comment_ranges;
1278        use std::collections::HashSet;
1279
1280        // Pre-size based on a heuristic: images are less common than links
1281        let mut images = Vec::with_capacity(content.len() / 1000);
1282        let mut found_positions = HashSet::new();
1283
1284        // Use pulldown-cmark for parsing - more accurate and faster
1285        let parser = Parser::new(content).into_offset_iter();
1286        let mut image_stack: Vec<(usize, pulldown_cmark::CowStr<'a>, LinkType, pulldown_cmark::CowStr<'a>)> =
1287            Vec::new();
1288        let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1289
1290        for (event, range) in parser {
1291            match event {
1292                Event::Start(Tag::Image {
1293                    link_type,
1294                    dest_url,
1295                    id,
1296                    ..
1297                }) => {
1298                    image_stack.push((range.start, dest_url, link_type, id));
1299                    text_chunks.clear();
1300                }
1301                Event::Text(text) if !image_stack.is_empty() => {
1302                    text_chunks.push((text.to_string(), range.start, range.end));
1303                }
1304                Event::Code(code) if !image_stack.is_empty() => {
1305                    let code_text = format!("`{code}`");
1306                    text_chunks.push((code_text, range.start, range.end));
1307                }
1308                Event::End(TagEnd::Image) => {
1309                    if let Some((start_pos, url, link_type, ref_id)) = image_stack.pop() {
1310                        // Skip if in code block
1311                        if CodeBlockUtils::is_in_code_block(code_blocks, start_pos) {
1312                            continue;
1313                        }
1314
1315                        // Skip if in code span
1316                        if Self::is_offset_in_code_span(code_spans, start_pos) {
1317                            continue;
1318                        }
1319
1320                        // Skip if in HTML comment
1321                        if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1322                            continue;
1323                        }
1324
1325                        // Find line and column using binary search
1326                        let (_, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1327                        let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1328
1329                        let is_reference = matches!(
1330                            link_type,
1331                            LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1332                        );
1333
1334                        // Extract alt text directly from source bytes to preserve escaping
1335                        // Text events from pulldown-cmark unescape \] → ], which breaks rules that need escaping
1336                        let alt_text = if start_pos < content.len() {
1337                            let image_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1338
1339                            // Find MATCHING ] by tracking bracket depth for nested brackets
1340                            // An unescaped bracket is one NOT preceded by an odd number of backslashes
1341                            let mut close_pos = None;
1342                            let mut depth = 0;
1343
1344                            if image_bytes.len() > 2 {
1345                                for (i, &byte) in image_bytes.iter().enumerate().skip(2) {
1346                                    // Count preceding backslashes
1347                                    let mut backslash_count = 0;
1348                                    let mut j = i;
1349                                    while j > 0 && image_bytes[j - 1] == b'\\' {
1350                                        backslash_count += 1;
1351                                        j -= 1;
1352                                    }
1353                                    let is_escaped = backslash_count % 2 != 0;
1354
1355                                    if !is_escaped {
1356                                        if byte == b'[' {
1357                                            depth += 1;
1358                                        } else if byte == b']' {
1359                                            if depth == 0 {
1360                                                // Found the matching closing bracket
1361                                                close_pos = Some(i);
1362                                                break;
1363                                            } else {
1364                                                depth -= 1;
1365                                            }
1366                                        }
1367                                    }
1368                                }
1369                            }
1370
1371                            if let Some(pos) = close_pos {
1372                                Cow::Borrowed(std::str::from_utf8(&image_bytes[2..pos]).unwrap_or(""))
1373                            } else {
1374                                Cow::Borrowed("")
1375                            }
1376                        } else {
1377                            Cow::Borrowed("")
1378                        };
1379
1380                        let reference_id = if is_reference && !ref_id.is_empty() {
1381                            Some(Cow::Owned(ref_id.to_lowercase()))
1382                        } else if is_reference {
1383                            Some(Cow::Owned(alt_text.to_lowercase())) // Collapsed/shortcut references
1384                        } else {
1385                            None
1386                        };
1387
1388                        found_positions.insert(start_pos);
1389                        images.push(ParsedImage {
1390                            line: line_num,
1391                            start_col: col_start,
1392                            end_col: col_end,
1393                            byte_offset: start_pos,
1394                            byte_end: range.end,
1395                            alt_text,
1396                            url: Cow::Owned(url.to_string()),
1397                            is_reference,
1398                            reference_id,
1399                            link_type,
1400                        });
1401                    }
1402                }
1403                _ => {}
1404            }
1405        }
1406
1407        // Regex fallback for undefined references that pulldown-cmark treats as plain text
1408        for cap in IMAGE_PATTERN.captures_iter(content) {
1409            let full_match = cap.get(0).unwrap();
1410            let match_start = full_match.start();
1411            let match_end = full_match.end();
1412
1413            // Skip if already found by pulldown-cmark
1414            if found_positions.contains(&match_start) {
1415                continue;
1416            }
1417
1418            // Skip if the ! is escaped
1419            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1420                continue;
1421            }
1422
1423            // Skip if in code block, code span, or HTML comment
1424            if CodeBlockUtils::is_in_code_block(code_blocks, match_start)
1425                || Self::is_offset_in_code_span(code_spans, match_start)
1426                || is_in_html_comment_ranges(html_comment_ranges, match_start)
1427            {
1428                continue;
1429            }
1430
1431            // Only process reference images (undefined references not found by pulldown-cmark)
1432            if let Some(ref_id) = cap.get(6) {
1433                let (_, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1434                let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1435                let alt_text = cap.get(1).map_or("", |m| m.as_str());
1436                let ref_id_str = ref_id.as_str();
1437                let normalized_ref = if ref_id_str.is_empty() {
1438                    Cow::Owned(alt_text.to_lowercase())
1439                } else {
1440                    Cow::Owned(ref_id_str.to_lowercase())
1441                };
1442
1443                images.push(ParsedImage {
1444                    line: line_num,
1445                    start_col: col_start,
1446                    end_col: col_end,
1447                    byte_offset: match_start,
1448                    byte_end: match_end,
1449                    alt_text: Cow::Borrowed(alt_text),
1450                    url: Cow::Borrowed(""),
1451                    is_reference: true,
1452                    reference_id: Some(normalized_ref),
1453                    link_type: LinkType::Reference, // Undefined references are reference-style
1454                });
1455            }
1456        }
1457
1458        images
1459    }
1460
1461    /// Parse reference definitions
1462    fn parse_reference_defs(content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
1463        // Pre-size based on lines count as reference definitions are line-based
1464        let mut refs = Vec::with_capacity(lines.len() / 20); // ~1 ref per 20 lines
1465
1466        for (line_idx, line_info) in lines.iter().enumerate() {
1467            // Skip lines in code blocks
1468            if line_info.in_code_block {
1469                continue;
1470            }
1471
1472            let line = line_info.content(content);
1473            let line_num = line_idx + 1;
1474
1475            if let Some(cap) = REF_DEF_PATTERN.captures(line) {
1476                let id = cap.get(1).unwrap().as_str().to_lowercase();
1477                let url = cap.get(2).unwrap().as_str().to_string();
1478                let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
1479
1480                // Calculate byte positions
1481                // The match starts at the beginning of the line (0) and extends to the end
1482                let match_obj = cap.get(0).unwrap();
1483                let byte_offset = line_info.byte_offset + match_obj.start();
1484                let byte_end = line_info.byte_offset + match_obj.end();
1485
1486                refs.push(ReferenceDef {
1487                    line: line_num,
1488                    id,
1489                    url,
1490                    title,
1491                    byte_offset,
1492                    byte_end,
1493                });
1494            }
1495        }
1496
1497        refs
1498    }
1499
1500    /// Fast blockquote prefix parser - replaces regex for 5-10x speedup
1501    /// Matches: ^(\s*>\s*)(.*)
1502    /// Returns: Some((prefix_with_ws, content_after_prefix)) or None
1503    #[inline]
1504    fn parse_blockquote_prefix(line: &str) -> Option<(&str, &str)> {
1505        let trimmed_start = line.trim_start();
1506        if !trimmed_start.starts_with('>') {
1507            return None;
1508        }
1509
1510        let leading_ws_len = line.len() - trimmed_start.len();
1511        let after_gt = &trimmed_start[1..];
1512        let content = after_gt.trim_start();
1513        let ws_after_gt_len = after_gt.len() - content.len();
1514        let prefix_len = leading_ws_len + 1 + ws_after_gt_len;
1515
1516        Some((&line[..prefix_len], content))
1517    }
1518
1519    /// Fast unordered list parser - replaces regex for 5-10x speedup
1520    /// Matches: ^(\s*)([-*+])([ \t]*)(.*)
1521    /// Returns: Some((leading_ws, marker, spacing, content)) or None
1522    #[inline]
1523    fn parse_unordered_list(line: &str) -> Option<(&str, char, &str, &str)> {
1524        let bytes = line.as_bytes();
1525        let mut i = 0;
1526
1527        // Skip leading whitespace
1528        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1529            i += 1;
1530        }
1531
1532        // Check for marker
1533        if i >= bytes.len() {
1534            return None;
1535        }
1536        let marker = bytes[i] as char;
1537        if marker != '-' && marker != '*' && marker != '+' {
1538            return None;
1539        }
1540        let marker_pos = i;
1541        i += 1;
1542
1543        // Collect spacing after marker (space or tab only)
1544        let spacing_start = i;
1545        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1546            i += 1;
1547        }
1548
1549        Some((&line[..marker_pos], marker, &line[spacing_start..i], &line[i..]))
1550    }
1551
1552    /// Fast ordered list parser - replaces regex for 5-10x speedup
1553    /// Matches: ^(\s*)(\d+)([.)])([ \t]*)(.*)
1554    /// Returns: Some((leading_ws, number_str, delimiter, spacing, content)) or None
1555    #[inline]
1556    fn parse_ordered_list(line: &str) -> Option<(&str, &str, char, &str, &str)> {
1557        let bytes = line.as_bytes();
1558        let mut i = 0;
1559
1560        // Skip leading whitespace
1561        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1562            i += 1;
1563        }
1564
1565        // Collect digits
1566        let number_start = i;
1567        while i < bytes.len() && bytes[i].is_ascii_digit() {
1568            i += 1;
1569        }
1570        if i == number_start {
1571            return None; // No digits found
1572        }
1573
1574        // Check for delimiter
1575        if i >= bytes.len() {
1576            return None;
1577        }
1578        let delimiter = bytes[i] as char;
1579        if delimiter != '.' && delimiter != ')' {
1580            return None;
1581        }
1582        let delimiter_pos = i;
1583        i += 1;
1584
1585        // Collect spacing after delimiter (space or tab only)
1586        let spacing_start = i;
1587        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1588            i += 1;
1589        }
1590
1591        Some((
1592            &line[..number_start],
1593            &line[number_start..delimiter_pos],
1594            delimiter,
1595            &line[spacing_start..i],
1596            &line[i..],
1597        ))
1598    }
1599
1600    /// Pre-compute which lines are in code blocks - O(m*n) where m=code_blocks, n=lines
1601    /// Returns a Vec<bool> where index i indicates if line i is in a code block
1602    fn compute_code_block_line_map(content: &str, line_offsets: &[usize], code_blocks: &[(usize, usize)]) -> Vec<bool> {
1603        let num_lines = line_offsets.len();
1604        let mut in_code_block = vec![false; num_lines];
1605
1606        // For each code block, mark all lines within it
1607        for &(start, end) in code_blocks {
1608            // Ensure we're at valid UTF-8 boundaries
1609            let safe_start = if start > 0 && !content.is_char_boundary(start) {
1610                let mut boundary = start;
1611                while boundary > 0 && !content.is_char_boundary(boundary) {
1612                    boundary -= 1;
1613                }
1614                boundary
1615            } else {
1616                start
1617            };
1618
1619            let safe_end = if end < content.len() && !content.is_char_boundary(end) {
1620                let mut boundary = end;
1621                while boundary < content.len() && !content.is_char_boundary(boundary) {
1622                    boundary += 1;
1623                }
1624                boundary
1625            } else {
1626                end.min(content.len())
1627            };
1628
1629            // Trust the code blocks detected by CodeBlockUtils::detect_code_blocks()
1630            // That function now has proper list context awareness (see code_block_utils.rs)
1631            // and correctly distinguishes between:
1632            // - Fenced code blocks (``` or ~~~)
1633            // - Indented code blocks at document level (4 spaces + blank line before)
1634            // - List continuation paragraphs (NOT code blocks, even with 4 spaces)
1635            //
1636            // We no longer need to re-validate here. The original validation logic
1637            // was causing false positives by marking list continuation paragraphs as
1638            // code blocks when they have 4 spaces of indentation.
1639
1640            // Use binary search to find the first and last line indices
1641            // line_offsets is sorted, so we can use partition_point for O(log n) lookup
1642            // Use safe_start/safe_end (UTF-8 boundaries) for consistent line mapping
1643            //
1644            // Find the line that CONTAINS safe_start: the line with the largest
1645            // start offset that is <= safe_start. partition_point gives us the
1646            // first line that starts AFTER safe_start, so we subtract 1.
1647            let first_line_after = line_offsets.partition_point(|&offset| offset <= safe_start);
1648            let first_line = first_line_after.saturating_sub(1);
1649            let last_line = line_offsets.partition_point(|&offset| offset < safe_end);
1650
1651            // Mark all lines in the range at once
1652            for flag in in_code_block.iter_mut().take(last_line).skip(first_line) {
1653                *flag = true;
1654            }
1655        }
1656
1657        in_code_block
1658    }
1659
1660    /// Pre-compute basic line information (without headings/blockquotes)
1661    fn compute_basic_line_info(
1662        content: &str,
1663        line_offsets: &[usize],
1664        code_blocks: &[(usize, usize)],
1665        flavor: MarkdownFlavor,
1666        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1667        autodoc_ranges: &[crate::utils::skip_context::ByteRange],
1668    ) -> Vec<LineInfo> {
1669        let content_lines: Vec<&str> = content.lines().collect();
1670        let mut lines = Vec::with_capacity(content_lines.len());
1671
1672        // Pre-compute which lines are in code blocks
1673        let code_block_map = Self::compute_code_block_line_map(content, line_offsets, code_blocks);
1674
1675        // Detect front matter boundaries FIRST, before any other parsing
1676        // Use FrontMatterUtils to detect all types of front matter (YAML, TOML, JSON, malformed)
1677        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1678
1679        for (i, line) in content_lines.iter().enumerate() {
1680            let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
1681            let indent = line.len() - line.trim_start().len();
1682
1683            // Parse blockquote prefix once and reuse it (avoid redundant parsing)
1684            let blockquote_parse = Self::parse_blockquote_prefix(line);
1685
1686            // For blank detection, consider blockquote context
1687            let is_blank = if let Some((_, content)) = blockquote_parse {
1688                // In blockquote context, check if content after prefix is blank
1689                content.trim().is_empty()
1690            } else {
1691                line.trim().is_empty()
1692            };
1693
1694            // Use pre-computed map for O(1) lookup instead of O(m) iteration
1695            let in_code_block = code_block_map.get(i).copied().unwrap_or(false);
1696
1697            // Detect list items (skip if in frontmatter, in mkdocstrings block, or in HTML comment)
1698            let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
1699                && crate::utils::mkdocstrings_refs::is_within_autodoc_block_ranges(autodoc_ranges, byte_offset);
1700            // Use pre-computed ranges for efficiency (O(log n) vs O(file_size))
1701            let in_html_comment =
1702                crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, byte_offset);
1703            let list_item = if !(in_code_block
1704                || is_blank
1705                || in_mkdocstrings
1706                || in_html_comment
1707                || (front_matter_end > 0 && i < front_matter_end))
1708            {
1709                // Strip blockquote prefix if present for list detection (reuse cached result)
1710                let (line_for_list_check, blockquote_prefix_len) = if let Some((prefix, content)) = blockquote_parse {
1711                    (content, prefix.len())
1712                } else {
1713                    (&**line, 0)
1714                };
1715
1716                if let Some((leading_spaces, marker, spacing, _content)) =
1717                    Self::parse_unordered_list(line_for_list_check)
1718                {
1719                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1720                    let content_column = marker_column + 1 + spacing.len();
1721
1722                    // According to CommonMark spec, unordered list items MUST have at least one space
1723                    // after the marker (-, *, or +). Without a space, it's not a list item.
1724                    // This also naturally handles cases like:
1725                    // - *emphasis* (not a list)
1726                    // - **bold** (not a list)
1727                    // - --- (horizontal rule, not a list)
1728                    if spacing.is_empty() {
1729                        None
1730                    } else {
1731                        Some(ListItemInfo {
1732                            marker: marker.to_string(),
1733                            is_ordered: false,
1734                            number: None,
1735                            marker_column,
1736                            content_column,
1737                        })
1738                    }
1739                } else if let Some((leading_spaces, number_str, delimiter, spacing, _content)) =
1740                    Self::parse_ordered_list(line_for_list_check)
1741                {
1742                    let marker = format!("{number_str}{delimiter}");
1743                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1744                    let content_column = marker_column + marker.len() + spacing.len();
1745
1746                    // According to CommonMark spec, ordered list items MUST have at least one space
1747                    // after the marker (period or parenthesis). Without a space, it's not a list item.
1748                    if spacing.is_empty() {
1749                        None
1750                    } else {
1751                        Some(ListItemInfo {
1752                            marker,
1753                            is_ordered: true,
1754                            number: number_str.parse().ok(),
1755                            marker_column,
1756                            content_column,
1757                        })
1758                    }
1759                } else {
1760                    None
1761                }
1762            } else {
1763                None
1764            };
1765
1766            lines.push(LineInfo {
1767                byte_offset,
1768                byte_len: line.len(),
1769                indent,
1770                is_blank,
1771                in_code_block,
1772                in_front_matter: front_matter_end > 0 && i < front_matter_end,
1773                in_html_block: false, // Will be populated after line creation
1774                in_html_comment,
1775                list_item,
1776                heading: None,    // Will be populated in second pass for Setext headings
1777                blockquote: None, // Will be populated after line creation
1778                in_mkdocstrings,
1779                in_esm_block: false, // Will be populated after line creation for MDX files
1780            });
1781        }
1782
1783        lines
1784    }
1785
1786    /// Detect headings and blockquotes (called after HTML block detection)
1787    fn detect_headings_and_blockquotes(
1788        content: &str,
1789        lines: &mut [LineInfo],
1790        flavor: MarkdownFlavor,
1791        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1792    ) {
1793        // Regex for heading detection
1794        static ATX_HEADING_REGEX: LazyLock<regex::Regex> =
1795            LazyLock::new(|| regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap());
1796        static SETEXT_UNDERLINE_REGEX: LazyLock<regex::Regex> =
1797            LazyLock::new(|| regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap());
1798
1799        let content_lines: Vec<&str> = content.lines().collect();
1800
1801        // Detect front matter boundaries to skip those lines
1802        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1803
1804        // Detect headings (including Setext which needs look-ahead) and blockquotes
1805        for i in 0..lines.len() {
1806            if lines[i].in_code_block {
1807                continue;
1808            }
1809
1810            // Skip lines in front matter
1811            if front_matter_end > 0 && i < front_matter_end {
1812                continue;
1813            }
1814
1815            // Skip lines in HTML blocks - HTML content should not be parsed as markdown
1816            if lines[i].in_html_block {
1817                continue;
1818            }
1819
1820            let line = content_lines[i];
1821
1822            // Check for blockquotes (even on blank lines within blockquotes)
1823            if let Some(bq) = parse_blockquote_detailed(line) {
1824                let nesting_level = bq.markers.len(); // Each '>' is one level
1825                let marker_column = bq.indent.len();
1826
1827                // Build the prefix (indentation + markers + space)
1828                let prefix = format!("{}{}{}", bq.indent, bq.markers, bq.spaces_after);
1829
1830                // Check for various blockquote issues
1831                let has_no_space = bq.spaces_after.is_empty() && !bq.content.is_empty();
1832                // Only flag multiple literal spaces, not tabs
1833                // Tabs are handled by MD010 (no-hard-tabs), matching markdownlint behavior
1834                let has_multiple_spaces = bq.spaces_after.chars().filter(|&c| c == ' ').count() > 1;
1835
1836                // Check if needs MD028 fix (empty blockquote line without proper spacing)
1837                // MD028 flags empty blockquote lines that don't have a single space after the marker
1838                // Lines like "> " or ">> " are already correct and don't need fixing
1839                let needs_md028_fix = bq.content.is_empty() && bq.spaces_after.is_empty();
1840
1841                lines[i].blockquote = Some(BlockquoteInfo {
1842                    nesting_level,
1843                    indent: bq.indent.to_string(),
1844                    marker_column,
1845                    prefix,
1846                    content: bq.content.to_string(),
1847                    has_no_space_after_marker: has_no_space,
1848                    has_multiple_spaces_after_marker: has_multiple_spaces,
1849                    needs_md028_fix,
1850                });
1851            }
1852
1853            // Skip heading detection for blank lines
1854            if lines[i].is_blank {
1855                continue;
1856            }
1857
1858            // Check for ATX headings (but skip MkDocs snippet lines)
1859            // In MkDocs flavor, lines like "# -8<- [start:name]" are snippet markers, not headings
1860            let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
1861                crate::utils::mkdocs_snippets::is_snippet_section_start(line)
1862                    || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
1863            } else {
1864                false
1865            };
1866
1867            if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1868                // Skip headings inside HTML comments (using pre-computed ranges for efficiency)
1869                if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset) {
1870                    continue;
1871                }
1872                let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1873                let hashes = caps.get(2).map_or("", |m| m.as_str());
1874                let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1875                let rest = caps.get(4).map_or("", |m| m.as_str());
1876
1877                let level = hashes.len() as u8;
1878                let marker_column = leading_spaces.len();
1879
1880                // Check for closing sequence, but handle custom IDs that might come after
1881                let (text, has_closing, closing_seq) = {
1882                    // First check if there's a custom ID at the end
1883                    let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1884                        // Check if this looks like a valid custom ID (ends with })
1885                        if rest[id_start..].trim_end().ends_with('}') {
1886                            // Split off the custom ID
1887                            (&rest[..id_start], &rest[id_start..])
1888                        } else {
1889                            (rest, "")
1890                        }
1891                    } else {
1892                        (rest, "")
1893                    };
1894
1895                    // Now look for closing hashes in the part before the custom ID
1896                    let trimmed_rest = rest_without_id.trim_end();
1897                    if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1898                        // Look for the start of the hash sequence
1899                        let mut start_of_hashes = last_hash_pos;
1900                        while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1901                            start_of_hashes -= 1;
1902                        }
1903
1904                        // Check if there's at least one space before the closing hashes
1905                        let has_space_before = start_of_hashes == 0
1906                            || trimmed_rest
1907                                .chars()
1908                                .nth(start_of_hashes - 1)
1909                                .is_some_and(|c| c.is_whitespace());
1910
1911                        // Check if this is a valid closing sequence (all hashes to end of trimmed part)
1912                        let potential_closing = &trimmed_rest[start_of_hashes..];
1913                        let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1914
1915                        if is_all_hashes && has_space_before {
1916                            // This is a closing sequence
1917                            let closing_hashes = potential_closing.to_string();
1918                            // The text is everything before the closing hashes
1919                            // Don't include the custom ID here - it will be extracted later
1920                            let text_part = if !custom_id_part.is_empty() {
1921                                // If we have a custom ID, append it back to get the full rest
1922                                // This allows the extract_header_id function to handle it properly
1923                                format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1924                            } else {
1925                                rest_without_id[..start_of_hashes].trim_end().to_string()
1926                            };
1927                            (text_part, true, closing_hashes)
1928                        } else {
1929                            // Not a valid closing sequence, return the full content
1930                            (rest.to_string(), false, String::new())
1931                        }
1932                    } else {
1933                        // No hashes found, return the full content
1934                        (rest.to_string(), false, String::new())
1935                    }
1936                };
1937
1938                let content_column = marker_column + hashes.len() + spaces_after.len();
1939
1940                // Extract custom header ID if present
1941                let raw_text = text.trim().to_string();
1942                let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1943
1944                // If no custom ID was found on the header line, check the next line for standalone attr-list
1945                if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1946                    let next_line = content_lines[i + 1];
1947                    if !lines[i + 1].in_code_block
1948                        && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1949                        && let Some(next_line_id) =
1950                            crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1951                    {
1952                        custom_id = Some(next_line_id);
1953                    }
1954                }
1955
1956                lines[i].heading = Some(HeadingInfo {
1957                    level,
1958                    style: HeadingStyle::ATX,
1959                    marker: hashes.to_string(),
1960                    marker_column,
1961                    content_column,
1962                    text: clean_text,
1963                    custom_id,
1964                    raw_text,
1965                    has_closing_sequence: has_closing,
1966                    closing_sequence: closing_seq,
1967                });
1968            }
1969            // Check for Setext headings (need to look at next line)
1970            else if i + 1 < content_lines.len() && i + 1 < lines.len() {
1971                let next_line = content_lines[i + 1];
1972                if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
1973                    // Skip if next line is front matter delimiter
1974                    if front_matter_end > 0 && i < front_matter_end {
1975                        continue;
1976                    }
1977
1978                    // Skip Setext headings inside HTML comments (using pre-computed ranges for efficiency)
1979                    if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset)
1980                    {
1981                        continue;
1982                    }
1983
1984                    let underline = next_line.trim();
1985
1986                    // Skip if the underline looks like YAML delimiter (exactly 3 or more dashes)
1987                    // YAML uses exactly `---` while Setext headings typically use longer underlines
1988                    if underline == "---" {
1989                        continue;
1990                    }
1991
1992                    // Skip if the current line looks like YAML key-value syntax
1993                    let current_line_trimmed = line.trim();
1994                    if current_line_trimmed.contains(':')
1995                        && !current_line_trimmed.starts_with('#')
1996                        && !current_line_trimmed.contains('[')
1997                        && !current_line_trimmed.contains("](")
1998                    {
1999                        // This looks like "key: value" which suggests YAML, not a heading
2000                        continue;
2001                    }
2002
2003                    let level = if underline.starts_with('=') { 1 } else { 2 };
2004                    let style = if level == 1 {
2005                        HeadingStyle::Setext1
2006                    } else {
2007                        HeadingStyle::Setext2
2008                    };
2009
2010                    // Extract custom header ID if present
2011                    let raw_text = line.trim().to_string();
2012                    let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
2013
2014                    // If no custom ID was found on the header line, check the line after underline for standalone attr-list
2015                    if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
2016                        let attr_line = content_lines[i + 2];
2017                        if !lines[i + 2].in_code_block
2018                            && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
2019                            && let Some(attr_line_id) =
2020                                crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
2021                        {
2022                            custom_id = Some(attr_line_id);
2023                        }
2024                    }
2025
2026                    lines[i].heading = Some(HeadingInfo {
2027                        level,
2028                        style,
2029                        marker: underline.to_string(),
2030                        marker_column: next_line.len() - next_line.trim_start().len(),
2031                        content_column: lines[i].indent,
2032                        text: clean_text,
2033                        custom_id,
2034                        raw_text,
2035                        has_closing_sequence: false,
2036                        closing_sequence: String::new(),
2037                    });
2038                }
2039            }
2040        }
2041    }
2042
2043    /// Detect HTML blocks in the content
2044    fn detect_html_blocks(content: &str, lines: &mut [LineInfo]) {
2045        // HTML block elements that trigger block context
2046        const BLOCK_ELEMENTS: &[&str] = &[
2047            "address",
2048            "article",
2049            "aside",
2050            "blockquote",
2051            "details",
2052            "dialog",
2053            "dd",
2054            "div",
2055            "dl",
2056            "dt",
2057            "fieldset",
2058            "figcaption",
2059            "figure",
2060            "footer",
2061            "form",
2062            "h1",
2063            "h2",
2064            "h3",
2065            "h4",
2066            "h5",
2067            "h6",
2068            "header",
2069            "hr",
2070            "li",
2071            "main",
2072            "nav",
2073            "ol",
2074            "p",
2075            "picture",
2076            "pre",
2077            "script",
2078            "section",
2079            "style",
2080            "table",
2081            "tbody",
2082            "td",
2083            "textarea",
2084            "tfoot",
2085            "th",
2086            "thead",
2087            "tr",
2088            "ul",
2089        ];
2090
2091        let mut i = 0;
2092        while i < lines.len() {
2093            // Skip if already in code block or front matter
2094            if lines[i].in_code_block || lines[i].in_front_matter {
2095                i += 1;
2096                continue;
2097            }
2098
2099            let trimmed = lines[i].content(content).trim_start();
2100
2101            // Check if line starts with an HTML tag
2102            if trimmed.starts_with('<') && trimmed.len() > 1 {
2103                // Extract tag name safely
2104                let after_bracket = &trimmed[1..];
2105                let is_closing = after_bracket.starts_with('/');
2106                let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
2107
2108                // Extract tag name (stop at space, >, /, or end of string)
2109                let tag_name = tag_start
2110                    .chars()
2111                    .take_while(|c| c.is_ascii_alphabetic() || *c == '-' || c.is_ascii_digit())
2112                    .collect::<String>()
2113                    .to_lowercase();
2114
2115                // Check if it's a block element
2116                if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
2117                    // Mark this line as in HTML block
2118                    lines[i].in_html_block = true;
2119
2120                    // For simplicity, just mark lines until we find a closing tag or reach a blank line
2121                    // This avoids complex nesting logic that might cause infinite loops
2122                    if !is_closing {
2123                        let closing_tag = format!("</{tag_name}>");
2124                        // style and script tags can contain blank lines (CSS/JS formatting)
2125                        let allow_blank_lines = tag_name == "style" || tag_name == "script";
2126                        let mut j = i + 1;
2127                        while j < lines.len() && j < i + 100 {
2128                            // Limit search to 100 lines
2129                            // Stop at blank lines (except for style/script tags)
2130                            if !allow_blank_lines && lines[j].is_blank {
2131                                break;
2132                            }
2133
2134                            lines[j].in_html_block = true;
2135
2136                            // Check if this line contains the closing tag
2137                            if lines[j].content(content).contains(&closing_tag) {
2138                                break;
2139                            }
2140                            j += 1;
2141                        }
2142                    }
2143                }
2144            }
2145
2146            i += 1;
2147        }
2148    }
2149
2150    /// Detect ESM import/export blocks in MDX files
2151    /// ESM blocks consist of contiguous import/export statements at the top of the file
2152    fn detect_esm_blocks(content: &str, lines: &mut [LineInfo], flavor: MarkdownFlavor) {
2153        // Only process MDX files
2154        if !flavor.supports_esm_blocks() {
2155            return;
2156        }
2157
2158        for line in lines.iter_mut() {
2159            // Skip blank lines and comments at the start
2160            if line.is_blank || line.in_html_comment {
2161                continue;
2162            }
2163
2164            // Check if line starts with import or export
2165            let trimmed = line.content(content).trim_start();
2166            if trimmed.starts_with("import ") || trimmed.starts_with("export ") {
2167                line.in_esm_block = true;
2168            } else {
2169                // Once we hit a non-ESM line, we're done with the ESM block
2170                break;
2171            }
2172        }
2173    }
2174
2175    /// Parse all inline code spans in the content using pulldown-cmark streaming parser
2176    fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
2177        let mut code_spans = Vec::new();
2178
2179        // Quick check - if no backticks, no code spans
2180        if !content.contains('`') {
2181            return code_spans;
2182        }
2183
2184        // Use pulldown-cmark's streaming parser with byte offsets
2185        let parser = Parser::new(content).into_offset_iter();
2186
2187        for (event, range) in parser {
2188            if let Event::Code(_) = event {
2189                let start_pos = range.start;
2190                let end_pos = range.end;
2191
2192                // The range includes the backticks, extract the actual content
2193                let full_span = &content[start_pos..end_pos];
2194                let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
2195
2196                // Extract content between backticks, preserving spaces
2197                let content_start = start_pos + backtick_count;
2198                let content_end = end_pos - backtick_count;
2199                let span_content = if content_start < content_end {
2200                    content[content_start..content_end].to_string()
2201                } else {
2202                    String::new()
2203                };
2204
2205                // Use binary search to find line number - O(log n) instead of O(n)
2206                // Find the rightmost line whose byte_offset <= start_pos
2207                let line_idx = lines
2208                    .partition_point(|line| line.byte_offset <= start_pos)
2209                    .saturating_sub(1);
2210                let line_num = line_idx + 1;
2211                let col_start = start_pos - lines[line_idx].byte_offset;
2212
2213                // Find end column using binary search
2214                let end_line_idx = lines
2215                    .partition_point(|line| line.byte_offset <= end_pos)
2216                    .saturating_sub(1);
2217                let col_end = end_pos - lines[end_line_idx].byte_offset;
2218
2219                code_spans.push(CodeSpan {
2220                    line: line_num,
2221                    start_col: col_start,
2222                    end_col: col_end,
2223                    byte_offset: start_pos,
2224                    byte_end: end_pos,
2225                    backtick_count,
2226                    content: span_content,
2227                });
2228            }
2229        }
2230
2231        // Sort by position to ensure consistent ordering
2232        code_spans.sort_by_key(|span| span.byte_offset);
2233
2234        code_spans
2235    }
2236
2237    /// Parse all list blocks in the content (legacy line-by-line approach)
2238    ///
2239    /// Uses a forward-scanning O(n) algorithm that tracks two variables during iteration:
2240    /// - `has_list_breaking_content_since_last_item`: Set when encountering content that
2241    ///   terminates a list (headings, horizontal rules, tables, insufficiently indented content)
2242    /// - `min_continuation_for_tracking`: Minimum indentation required for content to be
2243    ///   treated as list continuation (based on the list marker width)
2244    ///
2245    /// When a new list item is encountered, we check if list-breaking content was seen
2246    /// since the last item. If so, we start a new list block.
2247    fn parse_list_blocks(content: &str, lines: &[LineInfo]) -> Vec<ListBlock> {
2248        // Minimum indentation for unordered list continuation per CommonMark spec
2249        const UNORDERED_LIST_MIN_CONTINUATION_INDENT: usize = 2;
2250
2251        /// Initialize or reset the forward-scanning tracking state.
2252        /// This helper eliminates code duplication across three initialization sites.
2253        #[inline]
2254        fn reset_tracking_state(
2255            list_item: &ListItemInfo,
2256            has_list_breaking_content: &mut bool,
2257            min_continuation: &mut usize,
2258        ) {
2259            *has_list_breaking_content = false;
2260            let marker_width = if list_item.is_ordered {
2261                list_item.marker.len() + 1 // Ordered markers need space after period/paren
2262            } else {
2263                list_item.marker.len()
2264            };
2265            *min_continuation = if list_item.is_ordered {
2266                marker_width
2267            } else {
2268                UNORDERED_LIST_MIN_CONTINUATION_INDENT
2269            };
2270        }
2271
2272        // Pre-size based on lines that could be list items
2273        let mut list_blocks = Vec::with_capacity(lines.len() / 10); // Estimate ~10% of lines might start list blocks
2274        let mut current_block: Option<ListBlock> = None;
2275        let mut last_list_item_line = 0;
2276        let mut current_indent_level = 0;
2277        let mut last_marker_width = 0;
2278
2279        // Track list-breaking content since last item (fixes O(n²) bottleneck from issue #148)
2280        let mut has_list_breaking_content_since_last_item = false;
2281        let mut min_continuation_for_tracking = 0;
2282
2283        for (line_idx, line_info) in lines.iter().enumerate() {
2284            let line_num = line_idx + 1;
2285
2286            // Enhanced code block handling using Design #3's context analysis
2287            if line_info.in_code_block {
2288                if let Some(ref mut block) = current_block {
2289                    // Calculate minimum indentation for list continuation
2290                    let min_continuation_indent =
2291                        CodeBlockUtils::calculate_min_continuation_indent(content, lines, line_idx);
2292
2293                    // Analyze code block context using the three-tier classification
2294                    let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
2295
2296                    match context {
2297                        CodeBlockContext::Indented => {
2298                            // Code block is properly indented - continues the list
2299                            block.end_line = line_num;
2300                            continue;
2301                        }
2302                        CodeBlockContext::Standalone => {
2303                            // Code block separates lists - end current block
2304                            let completed_block = current_block.take().unwrap();
2305                            list_blocks.push(completed_block);
2306                            continue;
2307                        }
2308                        CodeBlockContext::Adjacent => {
2309                            // Edge case - use conservative behavior (continue list)
2310                            block.end_line = line_num;
2311                            continue;
2312                        }
2313                    }
2314                } else {
2315                    // No current list block - skip code block lines
2316                    continue;
2317                }
2318            }
2319
2320            // Extract blockquote prefix if any
2321            let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(line_info.content(content)) {
2322                caps.get(0).unwrap().as_str().to_string()
2323            } else {
2324                String::new()
2325            };
2326
2327            // Track list-breaking content for non-list, non-blank lines (O(n) replacement for nested loop)
2328            if current_block.is_some() && line_info.list_item.is_none() && !line_info.is_blank {
2329                let line_content = line_info.content(content).trim();
2330
2331                // Check for structural separators that break lists
2332                let breaks_list = line_info.heading.is_some()
2333                    || line_content.starts_with("---")
2334                    || line_content.starts_with("***")
2335                    || line_content.starts_with("___")
2336                    || (line_content.contains('|')
2337                        && !line_content.contains("](")
2338                        && !line_content.contains("http")
2339                        && (line_content.matches('|').count() > 1
2340                            || line_content.starts_with('|')
2341                            || line_content.ends_with('|')))
2342                    || line_content.starts_with(">")
2343                    || (line_info.indent < min_continuation_for_tracking);
2344
2345                if breaks_list {
2346                    has_list_breaking_content_since_last_item = true;
2347                }
2348            }
2349
2350            // Check if this line is a list item
2351            if let Some(list_item) = &line_info.list_item {
2352                // Calculate nesting level based on indentation
2353                let item_indent = list_item.marker_column;
2354                let nesting = item_indent / 2; // Assume 2-space indentation for nesting
2355
2356                if let Some(ref mut block) = current_block {
2357                    // Check if this continues the current block
2358                    // For nested lists, we need to check if this is a nested item (higher nesting level)
2359                    // or a continuation at the same or lower level
2360                    let is_nested = nesting > block.nesting_level;
2361                    let same_type =
2362                        (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
2363                    let same_context = block.blockquote_prefix == blockquote_prefix;
2364                    let reasonable_distance = line_num <= last_list_item_line + 2; // Allow one blank line
2365
2366                    // For unordered lists, also check marker consistency
2367                    let marker_compatible =
2368                        block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
2369
2370                    // O(1) check: Use the tracked variable instead of O(n) nested loop
2371                    // This eliminates the quadratic bottleneck from issue #148
2372                    let has_non_list_content = has_list_breaking_content_since_last_item;
2373
2374                    // A list continues if:
2375                    // 1. It's a nested item (indented more than the parent), OR
2376                    // 2. It's the same type at the same level with reasonable distance
2377                    let mut continues_list = if is_nested {
2378                        // Nested items always continue the list if they're in the same context
2379                        same_context && reasonable_distance && !has_non_list_content
2380                    } else {
2381                        // Same-level items need to match type and markers
2382                        same_type && same_context && reasonable_distance && marker_compatible && !has_non_list_content
2383                    };
2384
2385                    // WORKAROUND: If items are truly consecutive (no blank lines), they MUST be in the same list
2386                    // This handles edge cases where content patterns might otherwise split lists incorrectly
2387                    if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
2388                        // Check if the previous line was a list item
2389                        if block.item_lines.contains(&(line_num - 1)) {
2390                            // They're consecutive list items - force them to be in the same list
2391                            continues_list = true;
2392                        }
2393                    }
2394
2395                    if continues_list {
2396                        // Extend current block
2397                        block.end_line = line_num;
2398                        block.item_lines.push(line_num);
2399
2400                        // Update max marker width
2401                        block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
2402                            list_item.marker.len() + 1
2403                        } else {
2404                            list_item.marker.len()
2405                        });
2406
2407                        // Update marker consistency for unordered lists
2408                        if !block.is_ordered
2409                            && block.marker.is_some()
2410                            && block.marker.as_ref() != Some(&list_item.marker)
2411                        {
2412                            // Mixed markers, clear the marker field
2413                            block.marker = None;
2414                        }
2415
2416                        // Reset tracked state for issue #148 optimization
2417                        reset_tracking_state(
2418                            list_item,
2419                            &mut has_list_breaking_content_since_last_item,
2420                            &mut min_continuation_for_tracking,
2421                        );
2422                    } else {
2423                        // End current block and start a new one
2424
2425                        list_blocks.push(block.clone());
2426
2427                        *block = ListBlock {
2428                            start_line: line_num,
2429                            end_line: line_num,
2430                            is_ordered: list_item.is_ordered,
2431                            marker: if list_item.is_ordered {
2432                                None
2433                            } else {
2434                                Some(list_item.marker.clone())
2435                            },
2436                            blockquote_prefix: blockquote_prefix.clone(),
2437                            item_lines: vec![line_num],
2438                            nesting_level: nesting,
2439                            max_marker_width: if list_item.is_ordered {
2440                                list_item.marker.len() + 1
2441                            } else {
2442                                list_item.marker.len()
2443                            },
2444                        };
2445
2446                        // Initialize tracked state for new block (issue #148 optimization)
2447                        reset_tracking_state(
2448                            list_item,
2449                            &mut has_list_breaking_content_since_last_item,
2450                            &mut min_continuation_for_tracking,
2451                        );
2452                    }
2453                } else {
2454                    // Start a new block
2455                    current_block = Some(ListBlock {
2456                        start_line: line_num,
2457                        end_line: line_num,
2458                        is_ordered: list_item.is_ordered,
2459                        marker: if list_item.is_ordered {
2460                            None
2461                        } else {
2462                            Some(list_item.marker.clone())
2463                        },
2464                        blockquote_prefix,
2465                        item_lines: vec![line_num],
2466                        nesting_level: nesting,
2467                        max_marker_width: list_item.marker.len(),
2468                    });
2469
2470                    // Initialize tracked state for new block (issue #148 optimization)
2471                    reset_tracking_state(
2472                        list_item,
2473                        &mut has_list_breaking_content_since_last_item,
2474                        &mut min_continuation_for_tracking,
2475                    );
2476                }
2477
2478                last_list_item_line = line_num;
2479                current_indent_level = item_indent;
2480                last_marker_width = if list_item.is_ordered {
2481                    list_item.marker.len() + 1 // Add 1 for the space after ordered list markers
2482                } else {
2483                    list_item.marker.len()
2484                };
2485            } else if let Some(ref mut block) = current_block {
2486                // Not a list item - check if it continues the current block
2487
2488                // For MD032 compatibility, we use a simple approach:
2489                // - Indented lines continue the list
2490                // - Blank lines followed by indented content continue the list
2491                // - Everything else ends the list
2492
2493                // Check if the last line in the list block ended with a backslash (hard line break)
2494                // This handles cases where list items use backslash for hard line breaks
2495                let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
2496                    lines[block.end_line - 1].content(content).trim_end().ends_with('\\')
2497                } else {
2498                    false
2499                };
2500
2501                // Calculate minimum indentation for list continuation
2502                // For ordered lists, use the last marker width (e.g., 3 for "1. ", 4 for "10. ")
2503                // For unordered lists like "- ", content starts at column 2, so continuations need at least 2 spaces
2504                let min_continuation_indent = if block.is_ordered {
2505                    current_indent_level + last_marker_width
2506                } else {
2507                    current_indent_level + 2 // Unordered lists need at least 2 spaces (e.g., "- " = 2 chars)
2508                };
2509
2510                if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
2511                    // Indented line or backslash continuation continues the list
2512                    block.end_line = line_num;
2513                } else if line_info.is_blank {
2514                    // Blank line - check if it's internal to the list or ending it
2515                    // We only include blank lines that are followed by more list content
2516                    let mut check_idx = line_idx + 1;
2517                    let mut found_continuation = false;
2518
2519                    // Skip additional blank lines
2520                    while check_idx < lines.len() && lines[check_idx].is_blank {
2521                        check_idx += 1;
2522                    }
2523
2524                    if check_idx < lines.len() {
2525                        let next_line = &lines[check_idx];
2526                        // Check if followed by indented content (list continuation)
2527                        if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
2528                            found_continuation = true;
2529                        }
2530                        // Check if followed by another list item at the same level
2531                        else if !next_line.in_code_block
2532                            && next_line.list_item.is_some()
2533                            && let Some(item) = &next_line.list_item
2534                        {
2535                            let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
2536                                .find(next_line.content(content))
2537                                .map_or(String::new(), |m| m.as_str().to_string());
2538                            if item.marker_column == current_indent_level
2539                                && item.is_ordered == block.is_ordered
2540                                && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
2541                            {
2542                                // Check if there was meaningful content between the list items (unused now)
2543                                // This variable is kept for potential future use but is currently replaced by has_structural_separators
2544                                let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
2545                                    if let Some(between_line) = lines.get(idx) {
2546                                        let between_content = between_line.content(content);
2547                                        let trimmed = between_content.trim();
2548                                        // Skip empty lines
2549                                        if trimmed.is_empty() {
2550                                            return false;
2551                                        }
2552                                        // Check for meaningful content
2553                                        let line_indent = between_content.len() - between_content.trim_start().len();
2554
2555                                        // Structural separators (code fences, headings, etc.) are meaningful and should BREAK lists
2556                                        if trimmed.starts_with("```")
2557                                            || trimmed.starts_with("~~~")
2558                                            || trimmed.starts_with("---")
2559                                            || trimmed.starts_with("***")
2560                                            || trimmed.starts_with("___")
2561                                            || trimmed.starts_with(">")
2562                                            || trimmed.contains('|') // Tables
2563                                            || between_line.heading.is_some()
2564                                        {
2565                                            return true; // These are structural separators - meaningful content that breaks lists
2566                                        }
2567
2568                                        // Only properly indented content continues the list
2569                                        line_indent >= min_continuation_indent
2570                                    } else {
2571                                        false
2572                                    }
2573                                });
2574
2575                                if block.is_ordered {
2576                                    // For ordered lists: don't continue if there are structural separators
2577                                    // Check if there are structural separators between the list items
2578                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2579                                        if let Some(between_line) = lines.get(idx) {
2580                                            let trimmed = between_line.content(content).trim();
2581                                            if trimmed.is_empty() {
2582                                                return false;
2583                                            }
2584                                            // Check for structural separators that break lists
2585                                            trimmed.starts_with("```")
2586                                                || trimmed.starts_with("~~~")
2587                                                || trimmed.starts_with("---")
2588                                                || trimmed.starts_with("***")
2589                                                || trimmed.starts_with("___")
2590                                                || trimmed.starts_with(">")
2591                                                || trimmed.contains('|') // Tables
2592                                                || between_line.heading.is_some()
2593                                        } else {
2594                                            false
2595                                        }
2596                                    });
2597                                    found_continuation = !has_structural_separators;
2598                                } else {
2599                                    // For unordered lists: also check for structural separators
2600                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2601                                        if let Some(between_line) = lines.get(idx) {
2602                                            let trimmed = between_line.content(content).trim();
2603                                            if trimmed.is_empty() {
2604                                                return false;
2605                                            }
2606                                            // Check for structural separators that break lists
2607                                            trimmed.starts_with("```")
2608                                                || trimmed.starts_with("~~~")
2609                                                || trimmed.starts_with("---")
2610                                                || trimmed.starts_with("***")
2611                                                || trimmed.starts_with("___")
2612                                                || trimmed.starts_with(">")
2613                                                || trimmed.contains('|') // Tables
2614                                                || between_line.heading.is_some()
2615                                        } else {
2616                                            false
2617                                        }
2618                                    });
2619                                    found_continuation = !has_structural_separators;
2620                                }
2621                            }
2622                        }
2623                    }
2624
2625                    if found_continuation {
2626                        // Include the blank line in the block
2627                        block.end_line = line_num;
2628                    } else {
2629                        // Blank line ends the list - don't include it
2630                        list_blocks.push(block.clone());
2631                        current_block = None;
2632                    }
2633                } else {
2634                    // Check for lazy continuation - non-indented line immediately after a list item
2635                    // But only if the line has sufficient indentation for the list type
2636                    let min_required_indent = if block.is_ordered {
2637                        current_indent_level + last_marker_width
2638                    } else {
2639                        current_indent_level + 2
2640                    };
2641
2642                    // For lazy continuation to apply, the line must either:
2643                    // 1. Have no indentation (true lazy continuation)
2644                    // 2. Have sufficient indentation for the list type
2645                    // BUT structural separators (headings, code blocks, etc.) should never be lazy continuations
2646                    let line_content = line_info.content(content).trim();
2647                    let is_structural_separator = line_info.heading.is_some()
2648                        || line_content.starts_with("```")
2649                        || line_content.starts_with("~~~")
2650                        || line_content.starts_with("---")
2651                        || line_content.starts_with("***")
2652                        || line_content.starts_with("___")
2653                        || line_content.starts_with(">")
2654                        || (line_content.contains('|')
2655                            && !line_content.contains("](")
2656                            && !line_content.contains("http")
2657                            && (line_content.matches('|').count() > 1
2658                                || line_content.starts_with('|')
2659                                || line_content.ends_with('|'))); // Tables
2660
2661                    // Allow lazy continuation if we're still within the same list block
2662                    // (not just immediately after a list item)
2663                    let is_lazy_continuation = !is_structural_separator
2664                        && !line_info.is_blank
2665                        && (line_info.indent == 0 || line_info.indent >= min_required_indent);
2666
2667                    if is_lazy_continuation {
2668                        // Additional check: if the line starts with uppercase and looks like a new sentence,
2669                        // it's probably not a continuation
2670                        let content_to_check = if !blockquote_prefix.is_empty() {
2671                            // Strip blockquote prefix to check the actual content
2672                            line_info
2673                                .content(content)
2674                                .strip_prefix(&blockquote_prefix)
2675                                .unwrap_or(line_info.content(content))
2676                                .trim()
2677                        } else {
2678                            line_info.content(content).trim()
2679                        };
2680
2681                        let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
2682
2683                        // If it starts with uppercase and the previous line ended with punctuation,
2684                        // it's likely a new paragraph, not a continuation
2685                        if starts_with_uppercase && last_list_item_line > 0 {
2686                            // This looks like a new paragraph
2687                            list_blocks.push(block.clone());
2688                            current_block = None;
2689                        } else {
2690                            // This is a lazy continuation line
2691                            block.end_line = line_num;
2692                        }
2693                    } else {
2694                        // Non-indented, non-blank line that's not a lazy continuation - end the block
2695                        list_blocks.push(block.clone());
2696                        current_block = None;
2697                    }
2698                }
2699            }
2700        }
2701
2702        // Don't forget the last block
2703        if let Some(block) = current_block {
2704            list_blocks.push(block);
2705        }
2706
2707        // Merge adjacent blocks that should be one
2708        merge_adjacent_list_blocks(content, &mut list_blocks, lines);
2709
2710        list_blocks
2711    }
2712
2713    /// Compute character frequency for fast content analysis
2714    fn compute_char_frequency(content: &str) -> CharFrequency {
2715        let mut frequency = CharFrequency::default();
2716
2717        for ch in content.chars() {
2718            match ch {
2719                '#' => frequency.hash_count += 1,
2720                '*' => frequency.asterisk_count += 1,
2721                '_' => frequency.underscore_count += 1,
2722                '-' => frequency.hyphen_count += 1,
2723                '+' => frequency.plus_count += 1,
2724                '>' => frequency.gt_count += 1,
2725                '|' => frequency.pipe_count += 1,
2726                '[' => frequency.bracket_count += 1,
2727                '`' => frequency.backtick_count += 1,
2728                '<' => frequency.lt_count += 1,
2729                '!' => frequency.exclamation_count += 1,
2730                '\n' => frequency.newline_count += 1,
2731                _ => {}
2732            }
2733        }
2734
2735        frequency
2736    }
2737
2738    /// Parse HTML tags in the content
2739    fn parse_html_tags(
2740        content: &str,
2741        lines: &[LineInfo],
2742        code_blocks: &[(usize, usize)],
2743        flavor: MarkdownFlavor,
2744    ) -> Vec<HtmlTag> {
2745        static HTML_TAG_REGEX: LazyLock<regex::Regex> =
2746            LazyLock::new(|| regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap());
2747
2748        let mut html_tags = Vec::with_capacity(content.matches('<').count());
2749
2750        for cap in HTML_TAG_REGEX.captures_iter(content) {
2751            let full_match = cap.get(0).unwrap();
2752            let match_start = full_match.start();
2753            let match_end = full_match.end();
2754
2755            // Skip if in code block
2756            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2757                continue;
2758            }
2759
2760            let is_closing = !cap.get(1).unwrap().as_str().is_empty();
2761            let tag_name_original = cap.get(2).unwrap().as_str();
2762            let tag_name = tag_name_original.to_lowercase();
2763            let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
2764
2765            // Skip JSX components in MDX files (tags starting with uppercase letter)
2766            // JSX components like <Chart />, <MyComponent> should not be treated as HTML
2767            if flavor.supports_jsx() && tag_name_original.chars().next().is_some_and(|c| c.is_uppercase()) {
2768                continue;
2769            }
2770
2771            // Find which line this tag is on
2772            let mut line_num = 1;
2773            let mut col_start = match_start;
2774            let mut col_end = match_end;
2775            for (idx, line_info) in lines.iter().enumerate() {
2776                if match_start >= line_info.byte_offset {
2777                    line_num = idx + 1;
2778                    col_start = match_start - line_info.byte_offset;
2779                    col_end = match_end - line_info.byte_offset;
2780                } else {
2781                    break;
2782                }
2783            }
2784
2785            html_tags.push(HtmlTag {
2786                line: line_num,
2787                start_col: col_start,
2788                end_col: col_end,
2789                byte_offset: match_start,
2790                byte_end: match_end,
2791                tag_name,
2792                is_closing,
2793                is_self_closing,
2794                raw_content: full_match.as_str().to_string(),
2795            });
2796        }
2797
2798        html_tags
2799    }
2800
2801    /// Parse emphasis spans in the content
2802    fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
2803        static EMPHASIS_REGEX: LazyLock<regex::Regex> =
2804            LazyLock::new(|| regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap());
2805
2806        let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
2807
2808        for cap in EMPHASIS_REGEX.captures_iter(content) {
2809            let full_match = cap.get(0).unwrap();
2810            let match_start = full_match.start();
2811            let match_end = full_match.end();
2812
2813            // Skip if in code block
2814            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2815                continue;
2816            }
2817
2818            let opening_markers = cap.get(1).unwrap().as_str();
2819            let content_part = cap.get(2).unwrap().as_str();
2820            let closing_markers = cap.get(3).unwrap().as_str();
2821
2822            // Validate matching markers
2823            if opening_markers.chars().next() != closing_markers.chars().next()
2824                || opening_markers.len() != closing_markers.len()
2825            {
2826                continue;
2827            }
2828
2829            let marker = opening_markers.chars().next().unwrap();
2830            let marker_count = opening_markers.len();
2831
2832            // Find which line this emphasis is on
2833            let mut line_num = 1;
2834            let mut col_start = match_start;
2835            let mut col_end = match_end;
2836            for (idx, line_info) in lines.iter().enumerate() {
2837                if match_start >= line_info.byte_offset {
2838                    line_num = idx + 1;
2839                    col_start = match_start - line_info.byte_offset;
2840                    col_end = match_end - line_info.byte_offset;
2841                } else {
2842                    break;
2843                }
2844            }
2845
2846            emphasis_spans.push(EmphasisSpan {
2847                line: line_num,
2848                start_col: col_start,
2849                end_col: col_end,
2850                byte_offset: match_start,
2851                byte_end: match_end,
2852                marker,
2853                marker_count,
2854                content: content_part.to_string(),
2855            });
2856        }
2857
2858        emphasis_spans
2859    }
2860
2861    /// Parse table rows in the content
2862    fn parse_table_rows(content: &str, lines: &[LineInfo]) -> Vec<TableRow> {
2863        let mut table_rows = Vec::with_capacity(lines.len() / 20);
2864
2865        for (line_idx, line_info) in lines.iter().enumerate() {
2866            // Skip lines in code blocks or blank lines
2867            if line_info.in_code_block || line_info.is_blank {
2868                continue;
2869            }
2870
2871            let line = line_info.content(content);
2872            let line_num = line_idx + 1;
2873
2874            // Check if this line contains pipes (potential table row)
2875            if !line.contains('|') {
2876                continue;
2877            }
2878
2879            // Count columns by splitting on pipes
2880            let parts: Vec<&str> = line.split('|').collect();
2881            let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
2882
2883            // Check if this is a separator row
2884            let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
2885            let mut column_alignments = Vec::new();
2886
2887            if is_separator {
2888                for part in &parts[1..parts.len() - 1] {
2889                    // Skip first and last empty parts
2890                    let trimmed = part.trim();
2891                    let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
2892                        "center".to_string()
2893                    } else if trimmed.ends_with(':') {
2894                        "right".to_string()
2895                    } else if trimmed.starts_with(':') {
2896                        "left".to_string()
2897                    } else {
2898                        "none".to_string()
2899                    };
2900                    column_alignments.push(alignment);
2901                }
2902            }
2903
2904            table_rows.push(TableRow {
2905                line: line_num,
2906                is_separator,
2907                column_count,
2908                column_alignments,
2909            });
2910        }
2911
2912        table_rows
2913    }
2914
2915    /// Parse bare URLs and emails in the content
2916    fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
2917        let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
2918
2919        // Check for bare URLs (not in angle brackets or markdown links)
2920        for cap in BARE_URL_PATTERN.captures_iter(content) {
2921            let full_match = cap.get(0).unwrap();
2922            let match_start = full_match.start();
2923            let match_end = full_match.end();
2924
2925            // Skip if in code block
2926            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2927                continue;
2928            }
2929
2930            // Skip if already in angle brackets or markdown links
2931            let preceding_char = if match_start > 0 {
2932                content.chars().nth(match_start - 1)
2933            } else {
2934                None
2935            };
2936            let following_char = content.chars().nth(match_end);
2937
2938            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2939                continue;
2940            }
2941            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2942                continue;
2943            }
2944
2945            let url = full_match.as_str();
2946            let url_type = if url.starts_with("https://") {
2947                "https"
2948            } else if url.starts_with("http://") {
2949                "http"
2950            } else if url.starts_with("ftp://") {
2951                "ftp"
2952            } else {
2953                "other"
2954            };
2955
2956            // Find which line this URL is on
2957            let mut line_num = 1;
2958            let mut col_start = match_start;
2959            let mut col_end = match_end;
2960            for (idx, line_info) in lines.iter().enumerate() {
2961                if match_start >= line_info.byte_offset {
2962                    line_num = idx + 1;
2963                    col_start = match_start - line_info.byte_offset;
2964                    col_end = match_end - line_info.byte_offset;
2965                } else {
2966                    break;
2967                }
2968            }
2969
2970            bare_urls.push(BareUrl {
2971                line: line_num,
2972                start_col: col_start,
2973                end_col: col_end,
2974                byte_offset: match_start,
2975                byte_end: match_end,
2976                url: url.to_string(),
2977                url_type: url_type.to_string(),
2978            });
2979        }
2980
2981        // Check for bare email addresses
2982        for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
2983            let full_match = cap.get(0).unwrap();
2984            let match_start = full_match.start();
2985            let match_end = full_match.end();
2986
2987            // Skip if in code block
2988            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2989                continue;
2990            }
2991
2992            // Skip if already in angle brackets or markdown links
2993            let preceding_char = if match_start > 0 {
2994                content.chars().nth(match_start - 1)
2995            } else {
2996                None
2997            };
2998            let following_char = content.chars().nth(match_end);
2999
3000            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
3001                continue;
3002            }
3003            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
3004                continue;
3005            }
3006
3007            let email = full_match.as_str();
3008
3009            // Find which line this email is on
3010            let mut line_num = 1;
3011            let mut col_start = match_start;
3012            let mut col_end = match_end;
3013            for (idx, line_info) in lines.iter().enumerate() {
3014                if match_start >= line_info.byte_offset {
3015                    line_num = idx + 1;
3016                    col_start = match_start - line_info.byte_offset;
3017                    col_end = match_end - line_info.byte_offset;
3018                } else {
3019                    break;
3020                }
3021            }
3022
3023            bare_urls.push(BareUrl {
3024                line: line_num,
3025                start_col: col_start,
3026                end_col: col_end,
3027                byte_offset: match_start,
3028                byte_end: match_end,
3029                url: email.to_string(),
3030                url_type: "email".to_string(),
3031            });
3032        }
3033
3034        bare_urls
3035    }
3036}
3037
3038/// Merge adjacent list blocks that should be treated as one
3039fn merge_adjacent_list_blocks(content: &str, list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
3040    if list_blocks.len() < 2 {
3041        return;
3042    }
3043
3044    let mut merger = ListBlockMerger::new(content, lines);
3045    *list_blocks = merger.merge(list_blocks);
3046}
3047
3048/// Helper struct to manage the complex logic of merging list blocks
3049struct ListBlockMerger<'a> {
3050    content: &'a str,
3051    lines: &'a [LineInfo],
3052}
3053
3054impl<'a> ListBlockMerger<'a> {
3055    fn new(content: &'a str, lines: &'a [LineInfo]) -> Self {
3056        Self { content, lines }
3057    }
3058
3059    fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
3060        let mut merged = Vec::with_capacity(list_blocks.len());
3061        let mut current = list_blocks[0].clone();
3062
3063        for next in list_blocks.iter().skip(1) {
3064            if self.should_merge_blocks(&current, next) {
3065                current = self.merge_two_blocks(current, next);
3066            } else {
3067                merged.push(current);
3068                current = next.clone();
3069            }
3070        }
3071
3072        merged.push(current);
3073        merged
3074    }
3075
3076    /// Determine if two adjacent list blocks should be merged
3077    fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
3078        // Basic compatibility checks
3079        if !self.blocks_are_compatible(current, next) {
3080            return false;
3081        }
3082
3083        // Check spacing and content between blocks
3084        let spacing = self.analyze_spacing_between(current, next);
3085        match spacing {
3086            BlockSpacing::Consecutive => true,
3087            BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
3088            BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
3089                self.can_merge_with_content_between(current, next)
3090            }
3091        }
3092    }
3093
3094    /// Check if blocks have compatible structure for merging
3095    fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
3096        current.is_ordered == next.is_ordered
3097            && current.blockquote_prefix == next.blockquote_prefix
3098            && current.nesting_level == next.nesting_level
3099    }
3100
3101    /// Analyze the spacing between two list blocks
3102    fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
3103        let gap = next.start_line - current.end_line;
3104
3105        match gap {
3106            1 => BlockSpacing::Consecutive,
3107            2 => BlockSpacing::SingleBlank,
3108            _ if gap > 2 => {
3109                if self.has_only_blank_lines_between(current, next) {
3110                    BlockSpacing::MultipleBlanks
3111                } else {
3112                    BlockSpacing::ContentBetween
3113                }
3114            }
3115            _ => BlockSpacing::Consecutive, // gap == 0, overlapping (shouldn't happen)
3116        }
3117    }
3118
3119    /// Check if unordered lists can be merged with a single blank line between
3120    fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3121        // Check if there are structural separators between the blocks
3122        // If has_meaningful_content_between returns true, it means there are structural separators
3123        if has_meaningful_content_between(self.content, current, next, self.lines) {
3124            return false; // Structural separators prevent merging
3125        }
3126
3127        // Only merge unordered lists with same marker across single blank
3128        !current.is_ordered && current.marker == next.marker
3129    }
3130
3131    /// Check if ordered lists can be merged when there's content between them
3132    fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3133        // Do not merge lists if there are structural separators between them
3134        if has_meaningful_content_between(self.content, current, next, self.lines) {
3135            return false; // Structural separators prevent merging
3136        }
3137
3138        // Only consider merging ordered lists if there's no structural content between
3139        current.is_ordered && next.is_ordered
3140    }
3141
3142    /// Check if there are only blank lines between blocks
3143    fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3144        for line_num in (current.end_line + 1)..next.start_line {
3145            if let Some(line_info) = self.lines.get(line_num - 1)
3146                && !line_info.content(self.content).trim().is_empty()
3147            {
3148                return false;
3149            }
3150        }
3151        true
3152    }
3153
3154    /// Merge two compatible list blocks into one
3155    fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
3156        current.end_line = next.end_line;
3157        current.item_lines.extend_from_slice(&next.item_lines);
3158
3159        // Update max marker width
3160        current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
3161
3162        // Handle marker consistency for unordered lists
3163        if !current.is_ordered && self.markers_differ(&current, next) {
3164            current.marker = None; // Mixed markers
3165        }
3166
3167        current
3168    }
3169
3170    /// Check if two blocks have different markers
3171    fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
3172        current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
3173    }
3174}
3175
/// Classification of the gap between two neighbouring list blocks.
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    /// No gap between blocks.
    Consecutive,
    /// One blank line between blocks.
    SingleBlank,
    /// Multiple blank lines but no content.
    MultipleBlanks,
    /// Content exists between blocks.
    ContentBetween,
}
3184
3185/// Check if there's meaningful content (not just blank lines) between two list blocks
3186fn has_meaningful_content_between(content: &str, current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
3187    // Check lines between current.end_line and next.start_line
3188    for line_num in (current.end_line + 1)..next.start_line {
3189        if let Some(line_info) = lines.get(line_num - 1) {
3190            // Convert to 0-indexed
3191            let trimmed = line_info.content(content).trim();
3192
3193            // Skip empty lines
3194            if trimmed.is_empty() {
3195                continue;
3196            }
3197
3198            // Check for structural separators that should separate lists (CommonMark compliant)
3199
3200            // Headings separate lists
3201            if line_info.heading.is_some() {
3202                return true; // Has meaningful content - headings separate lists
3203            }
3204
3205            // Horizontal rules separate lists (---, ***, ___)
3206            if is_horizontal_rule(trimmed) {
3207                return true; // Has meaningful content - horizontal rules separate lists
3208            }
3209
3210            // Tables separate lists (lines containing | but not in URLs or code)
3211            // Simple heuristic: tables typically have | at start/end or multiple |
3212            if trimmed.contains('|') && trimmed.len() > 1 {
3213                // Don't treat URLs with | as tables
3214                if !trimmed.contains("](") && !trimmed.contains("http") {
3215                    // More robust check: tables usually have multiple | or | at edges
3216                    let pipe_count = trimmed.matches('|').count();
3217                    if pipe_count > 1 || trimmed.starts_with('|') || trimmed.ends_with('|') {
3218                        return true; // Has meaningful content - tables separate lists
3219                    }
3220                }
3221            }
3222
3223            // Blockquotes separate lists
3224            if trimmed.starts_with('>') {
3225                return true; // Has meaningful content - blockquotes separate lists
3226            }
3227
3228            // Code block fences separate lists (unless properly indented as list content)
3229            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
3230                let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
3231
3232                // Check if this code block is properly indented as list continuation
3233                let min_continuation_indent = if current.is_ordered {
3234                    current.nesting_level + current.max_marker_width + 1 // +1 for space after marker
3235                } else {
3236                    current.nesting_level + 2
3237                };
3238
3239                if line_indent < min_continuation_indent {
3240                    // This is a standalone code block that separates lists
3241                    return true; // Has meaningful content - standalone code blocks separate lists
3242                }
3243            }
3244
3245            // Check if this line has proper indentation for list continuation
3246            let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
3247
3248            // Calculate minimum indentation needed to be list continuation
3249            let min_indent = if current.is_ordered {
3250                current.nesting_level + current.max_marker_width
3251            } else {
3252                current.nesting_level + 2
3253            };
3254
3255            // If the line is not indented enough to be list continuation, it's meaningful content
3256            if line_indent < min_indent {
3257                return true; // Has meaningful content - content not indented as list continuation
3258            }
3259
3260            // If we reach here, the line is properly indented as list continuation
3261            // Continue checking other lines
3262        }
3263    }
3264
3265    // Only blank lines or properly indented list continuation content between blocks
3266    false
3267}
3268
/// Decide whether an already-trimmed line is a horizontal rule.
///
/// The line must start with `-`, `*` or `_`, contain that same character at
/// least three times, and contain nothing else except spaces and tabs
/// (so `---`, `* * *` and `___` all qualify, while `--` and `-*-` do not).
fn is_horizontal_rule(trimmed: &str) -> bool {
    // Fewer than three bytes cannot hold three markers.
    if trimmed.len() < 3 {
        return false;
    }

    // The first character selects which marker the rest of the line must use.
    let marker = match trimmed.chars().next() {
        Some(symbol @ ('-' | '*' | '_')) => symbol,
        _ => return false,
    };

    let only_markers_and_blanks = trimmed
        .chars()
        .all(|symbol| symbol == marker || matches!(symbol, ' ' | '\t'));

    only_markers_and_blanks && trimmed.chars().filter(|&symbol| symbol == marker).count() >= 3
}
3292
/// Unit tests for `LintContext`: offset/line bookkeeping, per-line metadata,
/// list-item detection and MDX ESM block handling.
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input still has one line offset but produces no line entries.
    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::Standard);

        assert_eq!(ctx.content, "");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert!(ctx.lines.is_empty());
    }

    /// A document without newlines is a single line starting at offset 0.
    #[test]
    fn test_single_line() {
        let ctx = LintContext::new("# Hello", MarkdownFlavor::Standard);

        assert_eq!(ctx.content, "# Hello");
        assert_eq!(ctx.line_offsets, vec![0]);
        for (offset, expected) in [(0, (1, 1)), (3, (1, 4))] {
            assert_eq!(ctx.offset_to_line_col(offset), expected);
        }
    }

    /// Line offsets and offset-to-position mapping across several lines.
    #[test]
    fn test_multi_line() {
        let ctx = LintContext::new("# Title\n\nSecond line\nThird line", MarkdownFlavor::Standard);

        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);

        let expectations = [
            (0, (1, 1)),  // start
            (8, (2, 1)),  // start of blank line
            (9, (3, 1)),  // start of 'Second line'
            (15, (3, 7)), // middle of 'Second line'
            (21, (4, 1)), // start of 'Third line'
        ];
        for (offset, expected) in expectations {
            assert_eq!(ctx.offset_to_line_col(offset), expected);
        }
    }

    /// Per-line metadata (content, byte offset, indent, flags) and the
    /// line-number based helper accessors.
    #[test]
    fn test_line_info() {
        let source = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let ctx = LintContext::new(source, MarkdownFlavor::Standard);

        assert_eq!(ctx.lines.len(), 7);

        // Line 1: "# Title"
        let heading = &ctx.lines[0];
        assert_eq!(heading.content(ctx.content), "# Title");
        assert_eq!(heading.byte_offset, 0);
        assert_eq!(heading.indent, 0);
        assert!(!heading.is_blank);
        assert!(!heading.in_code_block);
        assert!(heading.list_item.is_none());

        // Line 2: "    indented"
        let indented = &ctx.lines[1];
        assert_eq!(indented.content(ctx.content), "    indented");
        assert_eq!(indented.byte_offset, 8);
        assert_eq!(indented.indent, 4);
        assert!(!indented.is_blank);

        // Line 3: "" (blank)
        let blank = &ctx.lines[2];
        assert_eq!(blank.content(ctx.content), "");
        assert!(blank.is_blank);

        // Helper methods take 1-based line numbers
        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
        assert_eq!(ctx.line_info(1).map(|info| info.indent), Some(0));
        assert_eq!(ctx.line_info(2).map(|info| info.indent), Some(4));
    }

    /// Unordered and ordered list markers are detected; plain text is not.
    #[test]
    fn test_list_item_detection() {
        let source = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
        let ctx = LintContext::new(source, MarkdownFlavor::Standard);

        // Line 1: "- Unordered item"
        let dash_item = ctx.lines[0]
            .list_item
            .as_ref()
            .expect("line 1 should be a list item");
        assert_eq!(dash_item.marker, "-");
        assert!(!dash_item.is_ordered);
        assert_eq!(dash_item.marker_column, 0);
        assert_eq!(dash_item.content_column, 2);

        // Line 2: "  * Nested item"
        let star_item = ctx.lines[1]
            .list_item
            .as_ref()
            .expect("line 2 should be a list item");
        assert_eq!(star_item.marker, "*");
        assert_eq!(star_item.marker_column, 2);

        // Line 3: "1. Ordered item"
        let ordered_item = ctx.lines[2]
            .list_item
            .as_ref()
            .expect("line 3 should be a list item");
        assert_eq!(ordered_item.marker, "1.");
        assert!(ordered_item.is_ordered);
        assert_eq!(ordered_item.number, Some(1));

        // Line 6: "Not a list"
        assert!(ctx.lines[5].list_item.is_none());
    }

    /// Offsets at and just past each character of three one-character lines.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let ctx = LintContext::new("a\nb\nc", MarkdownFlavor::Standard);

        // line_offsets: [0, 2, 4]
        let expectations = [
            (0, (1, 1)), // 'a'
            (1, (1, 2)), // after 'a'
            (2, (2, 1)), // 'b'
            (3, (2, 2)), // after 'b'
            (4, (3, 1)), // 'c'
            (5, (3, 2)), // after 'c'
        ];
        for (offset, expected) in expectations {
            assert_eq!(ctx.offset_to_line_col(offset), expected);
        }
    }

    /// In the MDX flavor, leading `import`/`export` lines are flagged as ESM
    /// blocks while the following Markdown is not.
    #[test]
    fn test_mdx_esm_blocks() {
        let source = r##"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall

In {year}, the snowfall was above average.
It was followed by a warm spring which caused
flood conditions in many of the nearby rivers.

<Chart color="#fcb32c" year={year} />
"##;

        let ctx = LintContext::new(source, MarkdownFlavor::MDX);
        let lines = &ctx.lines;

        // Check that lines 1 and 2 are marked as ESM blocks
        assert_eq!(lines.len(), 10);
        assert!(lines[0].in_esm_block, "Line 1 (import) should be in_esm_block");
        assert!(lines[1].in_esm_block, "Line 2 (export) should be in_esm_block");
        assert!(!lines[2].in_esm_block, "Line 3 (blank) should NOT be in_esm_block");
        assert!(!lines[3].in_esm_block, "Line 4 (heading) should NOT be in_esm_block");
        assert!(!lines[4].in_esm_block, "Line 5 (blank) should NOT be in_esm_block");
        assert!(!lines[5].in_esm_block, "Line 6 (text) should NOT be in_esm_block");
    }

    /// The Standard flavor never flags `import`/`export` lines as ESM blocks.
    #[test]
    fn test_mdx_esm_blocks_not_detected_in_standard_flavor() {
        let source = r#"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall
"#;

        let ctx = LintContext::new(source, MarkdownFlavor::Standard);
        let lines = &ctx.lines;

        // ESM blocks should NOT be detected in Standard flavor
        assert!(
            !lines[0].in_esm_block,
            "Line 1 should NOT be in_esm_block in Standard flavor"
        );
        assert!(
            !lines[1].in_esm_block,
            "Line 2 should NOT be in_esm_block in Standard flavor"
        );
    }
}
rumdl_lib/lint_context.rs

rumdl_lib/
lint_context.rs