rumdl_lib/
lint_context.rs

1use crate::config::MarkdownFlavor;
2use crate::rules::front_matter_utils::FrontMatterUtils;
3use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
4use pulldown_cmark::{BrokenLink, Event, LinkType, Options, Parser, Tag, TagEnd};
5use regex::Regex;
6use std::borrow::Cow;
7use std::sync::LazyLock;
8
/// Runs an expression and, when profiling is requested, reports how long it took.
///
/// Takes a label, a boolean "profiling enabled" expression, and the expression to run;
/// the macro evaluates to the value of that expression. When the flag is true the
/// elapsed time is written to stderr as `[PROFILE] <label>: <duration>`.
/// This timing variant is compiled for every target except `wasm32`.
#[cfg(not(target_arch = "wasm32"))]
macro_rules! profile_section {
    ($name:expr, $profile:expr, $code:expr) => {{
        let timer = std::time::Instant::now();
        let outcome = $code;
        if $profile {
            eprintln!("[PROFILE] {}: {:?}", $name, timer.elapsed());
        }
        outcome
    }};
}

/// `wasm32` variant: expands to the expression alone; the label and the flag are not expanded.
#[cfg(target_arch = "wasm32")]
macro_rules! profile_section {
    ($name:expr, $profile:expr, $code:expr) => {{ $code }};
}
26
27// Comprehensive link pattern that captures both inline and reference links
28// Use (?s) flag to make . match newlines
29static LINK_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
30    Regex::new(
31        r#"(?sx)
32        \[((?:[^\[\]\\]|\\.)*)\]          # Link text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
33        (?:
34            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
35            |
36            \[([^\]]*)\]      # Reference ID in group 6
37        )"#
38    ).unwrap()
39});
40
41// Image pattern (similar to links but with ! prefix)
42// Use (?s) flag to make . match newlines
43static IMAGE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
44    Regex::new(
45        r#"(?sx)
46        !\[((?:[^\[\]\\]|\\.)*)\]         # Alt text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
47        (?:
48            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
49            |
50            \[([^\]]*)\]      # Reference ID in group 6
51        )"#
52    ).unwrap()
53});
54
55// Reference definition pattern
56static REF_DEF_PATTERN: LazyLock<Regex> =
57    LazyLock::new(|| Regex::new(r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#).unwrap());
58
59// Pattern for bare URLs
60static BARE_URL_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
61    Regex::new(
62        r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
63    ).unwrap()
64});
65
66// Pattern for email addresses
67static BARE_EMAIL_PATTERN: LazyLock<Regex> =
68    LazyLock::new(|| Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap());
69
70// Pattern for blockquote prefix in parse_list_blocks
71static BLOCKQUOTE_PREFIX_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*>+\s*)").unwrap());
72
73/// Pre-computed information about a line
74#[derive(Debug, Clone)]
75pub struct LineInfo {
76    /// Byte offset where this line starts in the document
77    pub byte_offset: usize,
78    /// Length of the line in bytes (without newline)
79    pub byte_len: usize,
80    /// Number of leading spaces/tabs
81    pub indent: usize,
82    /// Whether the line is blank (empty or only whitespace)
83    pub is_blank: bool,
84    /// Whether this line is inside a code block
85    pub in_code_block: bool,
86    /// Whether this line is inside front matter
87    pub in_front_matter: bool,
88    /// Whether this line is inside an HTML block
89    pub in_html_block: bool,
90    /// Whether this line is inside an HTML comment
91    pub in_html_comment: bool,
92    /// List item information if this line starts a list item
93    pub list_item: Option<ListItemInfo>,
94    /// Heading information if this line is a heading
95    pub heading: Option<HeadingInfo>,
96    /// Blockquote information if this line is a blockquote
97    pub blockquote: Option<BlockquoteInfo>,
98    /// Whether this line is inside a mkdocstrings autodoc block
99    pub in_mkdocstrings: bool,
100    /// Whether this line is part of an ESM import/export block (MDX only)
101    pub in_esm_block: bool,
102}
103
104impl LineInfo {
105    /// Get the line content as a string slice from the source document
106    pub fn content<'a>(&self, source: &'a str) -> &'a str {
107        &source[self.byte_offset..self.byte_offset + self.byte_len]
108    }
109}
110
/// Details of a single list item marker found on a line.
#[derive(Clone, Debug)]
pub struct ListItemInfo {
    /// Marker text: `*`, `-`, `+`, or a number followed by `.` or `)`
    pub marker: String,
    /// `true` for ordered items, `false` for unordered ones
    pub is_ordered: bool,
    /// Item number, for ordered lists
    pub number: Option<usize>,
    /// 0-based column at which the marker begins
    pub marker_column: usize,
    /// Column at which the content following the marker begins
    pub content_column: usize,
}
125
/// Syntactic style a heading was written in.
///
/// Equality on these unit variants is total, so `Eq` is derived alongside
/// `PartialEq`; this lets the type be used wherever a full equivalence is required.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum HeadingStyle {
    /// ATX style heading (`# Heading`)
    ATX,
    /// Setext style heading underlined with `=`
    Setext1,
    /// Setext style heading underlined with `-`
    Setext2,
}
136
137/// Parsed link information
138#[derive(Debug, Clone)]
139pub struct ParsedLink<'a> {
140    /// Line number (1-indexed)
141    pub line: usize,
142    /// Start column (0-indexed) in the line
143    pub start_col: usize,
144    /// End column (0-indexed) in the line
145    pub end_col: usize,
146    /// Byte offset in document
147    pub byte_offset: usize,
148    /// End byte offset in document
149    pub byte_end: usize,
150    /// Link text
151    pub text: Cow<'a, str>,
152    /// Link URL or reference
153    pub url: Cow<'a, str>,
154    /// Whether this is a reference link [text][ref] vs inline [text](url)
155    pub is_reference: bool,
156    /// Reference ID for reference links
157    pub reference_id: Option<Cow<'a, str>>,
158    /// Link type from pulldown-cmark
159    pub link_type: LinkType,
160}
161
/// A broken link as reported by pulldown-cmark.
#[derive(Clone, Debug)]
pub struct BrokenLinkInfo {
    /// Reference text that could not be resolved
    pub reference: String,
    /// Byte span within the source document
    pub span: std::ops::Range<usize>,
}
170
/// A footnote reference such as `[^1]` or `[^note]`.
#[derive(Clone, Debug)]
pub struct FootnoteRef {
    /// Footnote ID, without the `^` prefix
    pub id: String,
    /// 1-indexed line number
    pub line: usize,
    /// Byte offset at which the reference starts
    pub byte_offset: usize,
    /// Byte offset at which the reference ends
    pub byte_end: usize,
}
183
184/// Parsed image information
185#[derive(Debug, Clone)]
186pub struct ParsedImage<'a> {
187    /// Line number (1-indexed)
188    pub line: usize,
189    /// Start column (0-indexed) in the line
190    pub start_col: usize,
191    /// End column (0-indexed) in the line
192    pub end_col: usize,
193    /// Byte offset in document
194    pub byte_offset: usize,
195    /// End byte offset in document
196    pub byte_end: usize,
197    /// Alt text
198    pub alt_text: Cow<'a, str>,
199    /// Image URL or reference
200    pub url: Cow<'a, str>,
201    /// Whether this is a reference image ![alt][ref] vs inline ![alt](url)
202    pub is_reference: bool,
203    /// Reference ID for reference images
204    pub reference_id: Option<Cow<'a, str>>,
205    /// Link type from pulldown-cmark
206    pub link_type: LinkType,
207}
208
/// A reference definition of the form `[ref]: url "title"`.
#[derive(Clone, Debug)]
pub struct ReferenceDef {
    /// 1-indexed line number
    pub line: usize,
    /// Reference ID, normalized to lowercase
    pub id: String,
    /// Target URL
    pub url: String,
    /// Title, when one is given
    pub title: Option<String>,
    /// Byte offset at which the definition starts
    pub byte_offset: usize,
    /// Byte offset at which the definition ends
    pub byte_end: usize,
}
225
/// An inline code span located in the document.
#[derive(Clone, Debug)]
pub struct CodeSpan {
    /// 1-indexed line number
    pub line: usize,
    /// 0-indexed start column within the line
    pub start_col: usize,
    /// 0-indexed end column within the line
    pub end_col: usize,
    /// Byte offset of the span in the document
    pub byte_offset: usize,
    /// Byte offset at which the span ends
    pub byte_end: usize,
    /// How many backticks delimit the span (1, 2, 3, ...)
    pub backtick_count: usize,
    /// Text inside the span, backticks excluded
    pub content: String,
}
244
245/// Information about a heading
246#[derive(Debug, Clone)]
247pub struct HeadingInfo {
248    /// Heading level (1-6 for ATX, 1-2 for Setext)
249    pub level: u8,
250    /// Style of heading
251    pub style: HeadingStyle,
252    /// The heading marker (# characters or underline)
253    pub marker: String,
254    /// Column where the marker starts (0-based)
255    pub marker_column: usize,
256    /// Column where heading text starts
257    pub content_column: usize,
258    /// The heading text (without markers and without custom ID syntax)
259    pub text: String,
260    /// Custom header ID if present (e.g., from {#custom-id} syntax)
261    pub custom_id: Option<String>,
262    /// Original heading text including custom ID syntax
263    pub raw_text: String,
264    /// Whether it has a closing sequence (for ATX)
265    pub has_closing_sequence: bool,
266    /// The closing sequence if present
267    pub closing_sequence: String,
268}
269
/// Details of a blockquote line.
#[derive(Clone, Debug)]
pub struct BlockquoteInfo {
    /// Depth of nesting (1 for `>`, 2 for `>>`, and so on)
    pub nesting_level: usize,
    /// Indentation preceding the blockquote marker
    pub indent: String,
    /// 0-based column of the first `>`
    pub marker_column: usize,
    /// The blockquote prefix (e.g. "> ", ">> ")
    pub prefix: String,
    /// Text following the blockquote marker(s)
    pub content: String,
    /// True when no space follows the marker
    pub has_no_space_after_marker: bool,
    /// True when more than one space follows the marker
    pub has_multiple_spaces_after_marker: bool,
    /// True when this is an empty blockquote line needing the MD028 fix
    pub needs_md028_fix: bool,
}
290
/// A contiguous list in the document.
#[derive(Clone, Debug)]
pub struct ListBlock {
    /// 1-indexed line on which the list begins
    pub start_line: usize,
    /// 1-indexed line on which the list ends
    pub end_line: usize,
    /// Ordered (`true`) or unordered (`false`)
    pub is_ordered: bool,
    /// The consistent marker for unordered lists, if any
    pub marker: Option<String>,
    /// Blockquote prefix of the list (empty when not inside a blockquote)
    pub blockquote_prefix: String,
    /// Lines within the block that are list items
    pub item_lines: Vec<usize>,
    /// Nesting depth (0 for top-level lists)
    pub nesting_level: usize,
    /// Widest marker seen in the block (e.g. 3 for "1. ", 4 for "10. ")
    pub max_marker_width: usize,
}
311
312use std::sync::{Arc, Mutex};
313
/// Per-character tallies used for cheap "could this document contain X?" checks.
#[derive(Clone, Debug, Default)]
pub struct CharFrequency {
    /// Number of `#` characters (headings)
    pub hash_count: usize,
    /// Number of `*` characters (emphasis, lists, horizontal rules)
    pub asterisk_count: usize,
    /// Number of `_` characters (emphasis, horizontal rules)
    pub underscore_count: usize,
    /// Number of `-` characters (lists, horizontal rules, setext headings)
    pub hyphen_count: usize,
    /// Number of `+` characters (lists)
    pub plus_count: usize,
    /// Number of `>` characters (blockquotes)
    pub gt_count: usize,
    /// Number of `|` characters (tables)
    pub pipe_count: usize,
    /// Number of `[` characters (links, images)
    pub bracket_count: usize,
    /// Number of backtick characters (code spans, code blocks)
    pub backtick_count: usize,
    /// Number of `<` characters (HTML tags, autolinks)
    pub lt_count: usize,
    /// Number of `!` characters (images)
    pub exclamation_count: usize,
    /// Number of newline characters
    pub newline_count: usize,
}
342
/// An HTML tag located in the document.
#[derive(Clone, Debug)]
pub struct HtmlTag {
    /// 1-indexed line number
    pub line: usize,
    /// 0-indexed start column within the line
    pub start_col: usize,
    /// 0-indexed end column within the line
    pub end_col: usize,
    /// Byte offset of the tag in the document
    pub byte_offset: usize,
    /// Byte offset at which the tag ends
    pub byte_end: usize,
    /// Name of the tag (e.g. "div", "img", "br")
    pub tag_name: String,
    /// True for a closing tag (`</tag>`)
    pub is_closing: bool,
    /// True for a self-closing tag (`<tag />`)
    pub is_self_closing: bool,
    /// The raw tag content
    pub raw_content: String,
}
365
/// An emphasis span located in the document.
#[derive(Clone, Debug)]
pub struct EmphasisSpan {
    /// 1-indexed line number
    pub line: usize,
    /// 0-indexed start column within the line
    pub start_col: usize,
    /// 0-indexed end column within the line
    pub end_col: usize,
    /// Byte offset of the span in the document
    pub byte_offset: usize,
    /// Byte offset at which the span ends
    pub byte_end: usize,
    /// Emphasis character (`*` or `_`)
    pub marker: char,
    /// How many markers are used (1 for italic, 2 for bold, 3+ for bold+italic)
    pub marker_count: usize,
    /// Text inside the emphasis
    pub content: String,
}
386
/// A table row located in the document.
#[derive(Clone, Debug)]
pub struct TableRow {
    /// 1-indexed line number
    pub line: usize,
    /// True for a separator row (only `|`, `-`, `:` and spaces)
    pub is_separator: bool,
    /// How many pipe-separated cells the row has
    pub column_count: usize,
    /// Alignment taken from the separator row; values are "left", "center", "right", "none"
    pub column_alignments: Vec<String>,
}
399
/// A bare URL (one that is not part of a link) located in the document.
#[derive(Clone, Debug)]
pub struct BareUrl {
    /// 1-indexed line number
    pub line: usize,
    /// 0-indexed start column within the line
    pub start_col: usize,
    /// 0-indexed end column within the line
    pub end_col: usize,
    /// Byte offset of the URL in the document
    pub byte_offset: usize,
    /// Byte offset at which the URL ends
    pub byte_end: usize,
    /// The URL text
    pub url: String,
    /// Kind of URL ("http", "https", "ftp", "email")
    pub url_type: String,
}
418
419pub struct LintContext<'a> {
420    pub content: &'a str,
421    pub line_offsets: Vec<usize>,
422    pub code_blocks: Vec<(usize, usize)>, // Cached code block ranges (not including inline code spans)
423    pub lines: Vec<LineInfo>,             // Pre-computed line information
424    pub links: Vec<ParsedLink<'a>>,       // Pre-parsed links
425    pub images: Vec<ParsedImage<'a>>,     // Pre-parsed images
426    pub broken_links: Vec<BrokenLinkInfo>, // Broken/undefined references
427    pub footnote_refs: Vec<FootnoteRef>,  // Pre-parsed footnote references
428    pub reference_defs: Vec<ReferenceDef>, // Reference definitions
429    code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>, // Lazy-loaded inline code spans
430    pub list_blocks: Vec<ListBlock>,      // Pre-parsed list blocks
431    pub char_frequency: CharFrequency,    // Character frequency analysis
432    html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>, // Lazy-loaded HTML tags
433    emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>, // Lazy-loaded emphasis spans
434    table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>, // Lazy-loaded table rows
435    bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>, // Lazy-loaded bare URLs
436    html_comment_ranges: Vec<crate::utils::skip_context::ByteRange>, // Pre-computed HTML comment ranges
437    pub table_blocks: Vec<crate::utils::table_utils::TableBlock>, // Pre-computed table blocks
438    pub line_index: crate::utils::range_utils::LineIndex<'a>, // Pre-computed line index for byte position calculations
439    jinja_ranges: Vec<(usize, usize)>,    // Pre-computed Jinja template ranges ({{ }}, {% %})
440    pub flavor: MarkdownFlavor,           // Markdown flavor being used
441}
442
/// The four consecutive pieces of a blockquote line, each borrowed from the input.
struct BlockquoteComponents<'a> {
    indent: &'a str,
    markers: &'a str,
    spaces_after: &'a str,
    content: &'a str,
}

/// Split a line into leading indent, a run of `>` markers, the whitespace that
/// follows them, and the remaining content.
///
/// Only spaces and tabs count as whitespace here. Returns `None` when the first
/// character after the indent is not `>`.
#[inline]
fn parse_blockquote_detailed(line: &str) -> Option<BlockquoteComponents<'_>> {
    const BLANKS: [char; 2] = [' ', '\t'];

    // Peel each section off the front; the length difference gives the split point.
    let after_indent = line.trim_start_matches(BLANKS);
    if !after_indent.starts_with('>') {
        return None;
    }
    let after_markers = after_indent.trim_start_matches('>');
    let content = after_markers.trim_start_matches(BLANKS);

    let indent_end = line.len() - after_indent.len();
    let markers_end = line.len() - after_markers.len();
    let spaces_end = line.len() - content.len();

    Some(BlockquoteComponents {
        indent: &line[..indent_end],
        markers: &line[indent_end..markers_end],
        spaces_after: &line[markers_end..spaces_end],
        content,
    })
}
487
488impl<'a> LintContext<'a> {
489    pub fn new(content: &'a str, flavor: MarkdownFlavor) -> Self {
490        #[cfg(not(target_arch = "wasm32"))]
491        let profile = std::env::var("RUMDL_PROFILE_QUADRATIC").is_ok();
492        #[cfg(target_arch = "wasm32")]
493        let profile = false;
494
495        let line_offsets = profile_section!("Line offsets", profile, {
496            let mut offsets = vec![0];
497            for (i, c) in content.char_indices() {
498                if c == '\n' {
499                    offsets.push(i + 1);
500                }
501            }
502            offsets
503        });
504
505        // Detect code blocks once and cache them
506        let code_blocks = profile_section!("Code blocks", profile, CodeBlockUtils::detect_code_blocks(content));
507
508        // Pre-compute HTML comment ranges ONCE for all operations
509        let html_comment_ranges = profile_section!(
510            "HTML comment ranges",
511            profile,
512            crate::utils::skip_context::compute_html_comment_ranges(content)
513        );
514
515        // Pre-compute autodoc block ranges for MkDocs flavor (avoids O(n²) scaling)
516        let autodoc_ranges = profile_section!("Autodoc block ranges", profile, {
517            if flavor == MarkdownFlavor::MkDocs {
518                crate::utils::mkdocstrings_refs::detect_autodoc_block_ranges(content)
519            } else {
520                Vec::new()
521            }
522        });
523
524        // Pre-compute line information (without headings/blockquotes yet)
525        let mut lines = profile_section!(
526            "Basic line info",
527            profile,
528            Self::compute_basic_line_info(
529                content,
530                &line_offsets,
531                &code_blocks,
532                flavor,
533                &html_comment_ranges,
534                &autodoc_ranges,
535            )
536        );
537
538        // Detect HTML blocks BEFORE heading detection
539        profile_section!("HTML blocks", profile, Self::detect_html_blocks(content, &mut lines));
540
541        // Detect ESM import/export blocks in MDX files BEFORE heading detection
542        profile_section!(
543            "ESM blocks",
544            profile,
545            Self::detect_esm_blocks(content, &mut lines, flavor)
546        );
547
548        // Now detect headings and blockquotes
549        profile_section!(
550            "Headings & blockquotes",
551            profile,
552            Self::detect_headings_and_blockquotes(content, &mut lines, flavor, &html_comment_ranges)
553        );
554
555        // Parse code spans early so we can exclude them from link/image parsing
556        let code_spans = profile_section!("Code spans", profile, Self::parse_code_spans(content, &lines));
557
558        // Parse links, images, references, and list blocks
559        let (links, broken_links, footnote_refs) = profile_section!(
560            "Links",
561            profile,
562            Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor, &html_comment_ranges)
563        );
564
565        let images = profile_section!(
566            "Images",
567            profile,
568            Self::parse_images(content, &lines, &code_blocks, &code_spans, &html_comment_ranges)
569        );
570
571        let reference_defs = profile_section!("Reference defs", profile, Self::parse_reference_defs(content, &lines));
572
573        let list_blocks = profile_section!("List blocks", profile, Self::parse_list_blocks(content, &lines));
574
575        // Compute character frequency for fast content analysis
576        let char_frequency = profile_section!("Char frequency", profile, Self::compute_char_frequency(content));
577
578        // Pre-compute table blocks for rules that need them (MD013, MD055, MD056, MD058, MD060)
579        let table_blocks = profile_section!(
580            "Table blocks",
581            profile,
582            crate::utils::table_utils::TableUtils::find_table_blocks_with_code_info(
583                content,
584                &code_blocks,
585                &code_spans,
586                &html_comment_ranges,
587            )
588        );
589
590        // Pre-compute LineIndex once for all rules (eliminates 46x content cloning)
591        let line_index = profile_section!(
592            "Line index",
593            profile,
594            crate::utils::range_utils::LineIndex::new(content)
595        );
596
597        // Pre-compute Jinja template ranges once for all rules (eliminates O(n×m) in MD011)
598        let jinja_ranges = profile_section!(
599            "Jinja ranges",
600            profile,
601            crate::utils::jinja_utils::find_jinja_ranges(content)
602        );
603
604        Self {
605            content,
606            line_offsets,
607            code_blocks,
608            lines,
609            links,
610            images,
611            broken_links,
612            footnote_refs,
613            reference_defs,
614            code_spans_cache: Mutex::new(Some(Arc::new(code_spans))),
615            list_blocks,
616            char_frequency,
617            html_tags_cache: Mutex::new(None),
618            emphasis_spans_cache: Mutex::new(None),
619            table_rows_cache: Mutex::new(None),
620            bare_urls_cache: Mutex::new(None),
621            html_comment_ranges,
622            table_blocks,
623            line_index,
624            jinja_ranges,
625            flavor,
626        }
627    }
628
629    /// Get code spans - computed lazily on first access
630    pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
631        let mut cache = self.code_spans_cache.lock().expect("Code spans cache mutex poisoned");
632
633        Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_code_spans(self.content, &self.lines))))
634    }
635
636    /// Get HTML comment ranges - pre-computed during LintContext construction
637    pub fn html_comment_ranges(&self) -> &[crate::utils::skip_context::ByteRange] {
638        &self.html_comment_ranges
639    }
640
641    /// Get HTML tags - computed lazily on first access
642    pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
643        let mut cache = self.html_tags_cache.lock().expect("HTML tags cache mutex poisoned");
644
645        Arc::clone(cache.get_or_insert_with(|| {
646            Arc::new(Self::parse_html_tags(
647                self.content,
648                &self.lines,
649                &self.code_blocks,
650                self.flavor,
651            ))
652        }))
653    }
654
655    /// Get emphasis spans - computed lazily on first access
656    pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
657        let mut cache = self
658            .emphasis_spans_cache
659            .lock()
660            .expect("Emphasis spans cache mutex poisoned");
661
662        Arc::clone(
663            cache.get_or_insert_with(|| {
664                Arc::new(Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks))
665            }),
666        )
667    }
668
669    /// Get table rows - computed lazily on first access
670    pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
671        let mut cache = self.table_rows_cache.lock().expect("Table rows cache mutex poisoned");
672
673        Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_table_rows(self.content, &self.lines))))
674    }
675
676    /// Get bare URLs - computed lazily on first access
677    pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
678        let mut cache = self.bare_urls_cache.lock().expect("Bare URLs cache mutex poisoned");
679
680        Arc::clone(
681            cache.get_or_insert_with(|| Arc::new(Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks))),
682        )
683    }
684
685    /// Map a byte offset to (line, column)
686    pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
687        match self.line_offsets.binary_search(&offset) {
688            Ok(line) => (line + 1, 1),
689            Err(line) => {
690                let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
691                (line, offset - line_start + 1)
692            }
693        }
694    }
695
696    /// Check if a position is within a code block or code span
697    pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
698        // Check code blocks first
699        if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
700            return true;
701        }
702
703        // Check inline code spans (lazy load if needed)
704        self.code_spans()
705            .iter()
706            .any(|span| pos >= span.byte_offset && pos < span.byte_end)
707    }
708
709    /// Get line information by line number (1-indexed)
710    pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
711        if line_num > 0 {
712            self.lines.get(line_num - 1)
713        } else {
714            None
715        }
716    }
717
718    /// Get byte offset for a line number (1-indexed)
719    pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
720        self.line_info(line_num).map(|info| info.byte_offset)
721    }
722
723    /// Get URL for a reference link/image by its ID
724    pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
725        let normalized_id = ref_id.to_lowercase();
726        self.reference_defs
727            .iter()
728            .find(|def| def.id == normalized_id)
729            .map(|def| def.url.as_str())
730    }
731
732    /// Check if a line is part of a list block
733    pub fn is_in_list_block(&self, line_num: usize) -> bool {
734        self.list_blocks
735            .iter()
736            .any(|block| line_num >= block.start_line && line_num <= block.end_line)
737    }
738
739    /// Get the list block containing a specific line
740    pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
741        self.list_blocks
742            .iter()
743            .find(|block| line_num >= block.start_line && line_num <= block.end_line)
744    }
745
746    // Compatibility methods for DocumentStructure migration
747
748    /// Check if a line is within a code block
749    pub fn is_in_code_block(&self, line_num: usize) -> bool {
750        if line_num == 0 || line_num > self.lines.len() {
751            return false;
752        }
753        self.lines[line_num - 1].in_code_block
754    }
755
756    /// Check if a line is within front matter
757    pub fn is_in_front_matter(&self, line_num: usize) -> bool {
758        if line_num == 0 || line_num > self.lines.len() {
759            return false;
760        }
761        self.lines[line_num - 1].in_front_matter
762    }
763
764    /// Check if a line is within an HTML block
765    pub fn is_in_html_block(&self, line_num: usize) -> bool {
766        if line_num == 0 || line_num > self.lines.len() {
767            return false;
768        }
769        self.lines[line_num - 1].in_html_block
770    }
771
772    /// Check if a line and column is within a code span
773    pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
774        if line_num == 0 || line_num > self.lines.len() {
775            return false;
776        }
777
778        // Use the code spans cache to check
779        // Note: col is 1-indexed from caller, but span.start_col and span.end_col are 0-indexed
780        // Convert col to 0-indexed for comparison
781        let col_0indexed = if col > 0 { col - 1 } else { 0 };
782        let code_spans = self.code_spans();
783        code_spans
784            .iter()
785            .any(|span| span.line == line_num && col_0indexed >= span.start_col && col_0indexed < span.end_col)
786    }
787
788    /// Check if a byte position is within a reference definition
789    /// This is much faster than scanning the content with regex for each check (O(1) vs O(n))
790    #[inline]
791    pub fn is_in_reference_def(&self, byte_pos: usize) -> bool {
792        self.reference_defs
793            .iter()
794            .any(|ref_def| byte_pos >= ref_def.byte_offset && byte_pos < ref_def.byte_end)
795    }
796
797    /// Check if a byte position is within an HTML comment
798    /// This is much faster than scanning the content with regex for each check (O(k) vs O(n))
799    /// where k is the number of HTML comments (typically very small)
800    #[inline]
801    pub fn is_in_html_comment(&self, byte_pos: usize) -> bool {
802        self.html_comment_ranges
803            .iter()
804            .any(|range| byte_pos >= range.start && byte_pos < range.end)
805    }
806
807    /// Check if a byte position is within a Jinja template ({{ }} or {% %})
808    pub fn is_in_jinja_range(&self, byte_pos: usize) -> bool {
809        self.jinja_ranges
810            .iter()
811            .any(|(start, end)| byte_pos >= *start && byte_pos < *end)
812    }
813
814    /// Check if content has any instances of a specific character (fast)
815    pub fn has_char(&self, ch: char) -> bool {
816        match ch {
817            '#' => self.char_frequency.hash_count > 0,
818            '*' => self.char_frequency.asterisk_count > 0,
819            '_' => self.char_frequency.underscore_count > 0,
820            '-' => self.char_frequency.hyphen_count > 0,
821            '+' => self.char_frequency.plus_count > 0,
822            '>' => self.char_frequency.gt_count > 0,
823            '|' => self.char_frequency.pipe_count > 0,
824            '[' => self.char_frequency.bracket_count > 0,
825            '`' => self.char_frequency.backtick_count > 0,
826            '<' => self.char_frequency.lt_count > 0,
827            '!' => self.char_frequency.exclamation_count > 0,
828            '\n' => self.char_frequency.newline_count > 0,
829            _ => self.content.contains(ch), // Fallback for other characters
830        }
831    }
832
833    /// Get count of a specific character (fast)
834    pub fn char_count(&self, ch: char) -> usize {
835        match ch {
836            '#' => self.char_frequency.hash_count,
837            '*' => self.char_frequency.asterisk_count,
838            '_' => self.char_frequency.underscore_count,
839            '-' => self.char_frequency.hyphen_count,
840            '+' => self.char_frequency.plus_count,
841            '>' => self.char_frequency.gt_count,
842            '|' => self.char_frequency.pipe_count,
843            '[' => self.char_frequency.bracket_count,
844            '`' => self.char_frequency.backtick_count,
845            '<' => self.char_frequency.lt_count,
846            '!' => self.char_frequency.exclamation_count,
847            '\n' => self.char_frequency.newline_count,
848            _ => self.content.matches(ch).count(), // Fallback for other characters
849        }
850    }
851
852    /// Check if content likely contains headings (fast)
853    pub fn likely_has_headings(&self) -> bool {
854        self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 // Potential setext underlines
855    }
856
857    /// Check if content likely contains lists (fast)
858    pub fn likely_has_lists(&self) -> bool {
859        self.char_frequency.asterisk_count > 0
860            || self.char_frequency.hyphen_count > 0
861            || self.char_frequency.plus_count > 0
862    }
863
864    /// Check if content likely contains emphasis (fast)
865    pub fn likely_has_emphasis(&self) -> bool {
866        self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
867    }
868
869    /// Check if content likely contains tables (fast)
870    pub fn likely_has_tables(&self) -> bool {
871        self.char_frequency.pipe_count > 2
872    }
873
874    /// Check if content likely contains blockquotes (fast)
875    pub fn likely_has_blockquotes(&self) -> bool {
876        self.char_frequency.gt_count > 0
877    }
878
879    /// Check if content likely contains code (fast)
880    pub fn likely_has_code(&self) -> bool {
881        self.char_frequency.backtick_count > 0
882    }
883
884    /// Check if content likely contains links or images (fast)
885    pub fn likely_has_links_or_images(&self) -> bool {
886        self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
887    }
888
889    /// Check if content likely contains HTML (fast)
890    pub fn likely_has_html(&self) -> bool {
891        self.char_frequency.lt_count > 0
892    }
893
894    /// Get HTML tags on a specific line
895    pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
896        self.html_tags()
897            .iter()
898            .filter(|tag| tag.line == line_num)
899            .cloned()
900            .collect()
901    }
902
903    /// Get emphasis spans on a specific line
904    pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
905        self.emphasis_spans()
906            .iter()
907            .filter(|span| span.line == line_num)
908            .cloned()
909            .collect()
910    }
911
912    /// Get table rows on a specific line
913    pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
914        self.table_rows()
915            .iter()
916            .filter(|row| row.line == line_num)
917            .cloned()
918            .collect()
919    }
920
921    /// Get bare URLs on a specific line
922    pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
923        self.bare_urls()
924            .iter()
925            .filter(|url| url.line == line_num)
926            .cloned()
927            .collect()
928    }
929
930    /// Find the line index for a given byte offset using binary search.
931    /// Returns (line_index, line_number, column) where:
932    /// - line_index is the 0-based index in the lines array
933    /// - line_number is the 1-based line number
934    /// - column is the byte offset within that line
935    #[inline]
936    fn find_line_for_offset(lines: &[LineInfo], byte_offset: usize) -> (usize, usize, usize) {
937        // Binary search to find the line containing this byte offset
938        let idx = match lines.binary_search_by(|line| {
939            if byte_offset < line.byte_offset {
940                std::cmp::Ordering::Greater
941            } else if byte_offset > line.byte_offset + line.byte_len {
942                std::cmp::Ordering::Less
943            } else {
944                std::cmp::Ordering::Equal
945            }
946        }) {
947            Ok(idx) => idx,
948            Err(idx) => idx.saturating_sub(1),
949        };
950
951        let line = &lines[idx];
952        let line_num = idx + 1;
953        let col = byte_offset.saturating_sub(line.byte_offset);
954
955        (idx, line_num, col)
956    }
957
958    /// Check if a byte offset is within a code span using binary search
959    #[inline]
960    fn is_offset_in_code_span(code_spans: &[CodeSpan], offset: usize) -> bool {
961        // Since spans are sorted by byte_offset, use partition_point for binary search
962        let idx = code_spans.partition_point(|span| span.byte_offset <= offset);
963
964        // Check the span that starts at or before our offset
965        if idx > 0 {
966            let span = &code_spans[idx - 1];
967            if offset >= span.byte_offset && offset < span.byte_end {
968                return true;
969            }
970        }
971
972        false
973    }
974
975    /// Parse all links in the content
976    fn parse_links(
977        content: &'a str,
978        lines: &[LineInfo],
979        code_blocks: &[(usize, usize)],
980        code_spans: &[CodeSpan],
981        flavor: MarkdownFlavor,
982        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
983    ) -> (Vec<ParsedLink<'a>>, Vec<BrokenLinkInfo>, Vec<FootnoteRef>) {
984        use crate::utils::skip_context::{is_in_html_comment_ranges, is_mkdocs_snippet_line};
985        use std::collections::HashSet;
986
987        let mut links = Vec::with_capacity(content.len() / 500);
988        let mut broken_links = Vec::new();
989        let mut footnote_refs = Vec::new();
990
991        // Track byte positions of links found by pulldown-cmark
992        let mut found_positions = HashSet::new();
993
994        // Use pulldown-cmark's streaming parser with BrokenLink callback
995        // The callback captures undefined references: [text][undefined], [shortcut], [text][]
996        // This automatically handles:
997        // - Escaped links (won't generate events)
998        // - Links in code blocks/spans (won't generate Link events)
999        // - Images (generates Tag::Image instead)
1000        // - Reference resolution (dest_url is already resolved!)
1001        // - Broken references (callback is invoked)
1002        // - Wiki-links (enabled via ENABLE_WIKILINKS)
1003        let mut options = Options::empty();
1004        options.insert(Options::ENABLE_WIKILINKS);
1005        options.insert(Options::ENABLE_FOOTNOTES);
1006
1007        let parser = Parser::new_with_broken_link_callback(
1008            content,
1009            options,
1010            Some(|link: BrokenLink<'_>| {
1011                broken_links.push(BrokenLinkInfo {
1012                    reference: link.reference.to_string(),
1013                    span: link.span.clone(),
1014                });
1015                None
1016            }),
1017        )
1018        .into_offset_iter();
1019
1020        let mut link_stack: Vec<(
1021            usize,
1022            usize,
1023            pulldown_cmark::CowStr<'a>,
1024            LinkType,
1025            pulldown_cmark::CowStr<'a>,
1026        )> = Vec::new();
1027        let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1028
1029        for (event, range) in parser {
1030            match event {
1031                Event::Start(Tag::Link {
1032                    link_type,
1033                    dest_url,
1034                    id,
1035                    ..
1036                }) => {
1037                    // Link start - record position, URL, and reference ID
1038                    link_stack.push((range.start, range.end, dest_url, link_type, id));
1039                    text_chunks.clear();
1040                }
1041                Event::Text(text) if !link_stack.is_empty() => {
1042                    // Track text content with its byte range
1043                    text_chunks.push((text.to_string(), range.start, range.end));
1044                }
1045                Event::Code(code) if !link_stack.is_empty() => {
1046                    // Include inline code in link text (with backticks)
1047                    let code_text = format!("`{code}`");
1048                    text_chunks.push((code_text, range.start, range.end));
1049                }
1050                Event::End(TagEnd::Link) => {
1051                    if let Some((start_pos, _link_start_end, url, link_type, ref_id)) = link_stack.pop() {
1052                        // Skip if in HTML comment
1053                        if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1054                            text_chunks.clear();
1055                            continue;
1056                        }
1057
1058                        // Find line and column information
1059                        let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1060
1061                        // Skip if this link is on a MkDocs snippet line
1062                        if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1063                            text_chunks.clear();
1064                            continue;
1065                        }
1066
1067                        let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1068
1069                        let is_reference = matches!(
1070                            link_type,
1071                            LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1072                        );
1073
1074                        // Extract link text directly from source bytes to preserve escaping
1075                        // Text events from pulldown-cmark unescape \] → ], which breaks MD039
1076                        let link_text = if start_pos < content.len() {
1077                            let link_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1078
1079                            // Find MATCHING ] by tracking bracket depth for nested brackets
1080                            // An unescaped bracket is one NOT preceded by an odd number of backslashes
1081                            // Brackets inside code spans (between backticks) should be ignored
1082                            let mut close_pos = None;
1083                            let mut depth = 0;
1084                            let mut in_code_span = false;
1085
1086                            for (i, &byte) in link_bytes.iter().enumerate().skip(1) {
1087                                // Count preceding backslashes
1088                                let mut backslash_count = 0;
1089                                let mut j = i;
1090                                while j > 0 && link_bytes[j - 1] == b'\\' {
1091                                    backslash_count += 1;
1092                                    j -= 1;
1093                                }
1094                                let is_escaped = backslash_count % 2 != 0;
1095
1096                                // Track code spans - backticks toggle in/out of code
1097                                if byte == b'`' && !is_escaped {
1098                                    in_code_span = !in_code_span;
1099                                }
1100
1101                                // Only count brackets when NOT in a code span
1102                                if !is_escaped && !in_code_span {
1103                                    if byte == b'[' {
1104                                        depth += 1;
1105                                    } else if byte == b']' {
1106                                        if depth == 0 {
1107                                            // Found the matching closing bracket
1108                                            close_pos = Some(i);
1109                                            break;
1110                                        } else {
1111                                            depth -= 1;
1112                                        }
1113                                    }
1114                                }
1115                            }
1116
1117                            if let Some(pos) = close_pos {
1118                                Cow::Borrowed(std::str::from_utf8(&link_bytes[1..pos]).unwrap_or(""))
1119                            } else {
1120                                Cow::Borrowed("")
1121                            }
1122                        } else {
1123                            Cow::Borrowed("")
1124                        };
1125
1126                        // For reference links, use the actual reference ID from pulldown-cmark
1127                        let reference_id = if is_reference && !ref_id.is_empty() {
1128                            Some(Cow::Owned(ref_id.to_lowercase()))
1129                        } else if is_reference {
1130                            // For collapsed/shortcut references without explicit ID, use the link text
1131                            Some(Cow::Owned(link_text.to_lowercase()))
1132                        } else {
1133                            None
1134                        };
1135
1136                        // WORKAROUND: pulldown-cmark bug with escaped brackets
1137                        // Check for escaped image syntax: \![text](url)
1138                        // The byte_offset points to the '[', so we check 2 bytes back for '\!'
1139                        let has_escaped_bang = start_pos >= 2
1140                            && content.as_bytes().get(start_pos - 2) == Some(&b'\\')
1141                            && content.as_bytes().get(start_pos - 1) == Some(&b'!');
1142
1143                        // Check for escaped bracket: \[text](url)
1144                        // The byte_offset points to the '[', so we check 1 byte back for '\'
1145                        let has_escaped_bracket =
1146                            start_pos >= 1 && content.as_bytes().get(start_pos - 1) == Some(&b'\\');
1147
1148                        if has_escaped_bang || has_escaped_bracket {
1149                            text_chunks.clear();
1150                            continue; // Skip: this is escaped markdown, not a real link
1151                        }
1152
1153                        // Track this position as found
1154                        found_positions.insert(start_pos);
1155
1156                        links.push(ParsedLink {
1157                            line: line_num,
1158                            start_col: col_start,
1159                            end_col: col_end,
1160                            byte_offset: start_pos,
1161                            byte_end: range.end,
1162                            text: link_text,
1163                            url: Cow::Owned(url.to_string()),
1164                            is_reference,
1165                            reference_id,
1166                            link_type,
1167                        });
1168
1169                        text_chunks.clear();
1170                    }
1171                }
1172                Event::FootnoteReference(footnote_id) => {
1173                    // Capture footnote references like [^1], [^note]
1174                    // Skip if in HTML comment
1175                    if is_in_html_comment_ranges(html_comment_ranges, range.start) {
1176                        continue;
1177                    }
1178
1179                    let (_, line_num, _) = Self::find_line_for_offset(lines, range.start);
1180                    footnote_refs.push(FootnoteRef {
1181                        id: footnote_id.to_string(),
1182                        line: line_num,
1183                        byte_offset: range.start,
1184                        byte_end: range.end,
1185                    });
1186                }
1187                _ => {}
1188            }
1189        }
1190
1191        // Also find undefined references using regex
1192        // These are patterns like [text][ref] that pulldown-cmark didn't parse as links
1193        // because the reference is undefined
1194        for cap in LINK_PATTERN.captures_iter(content) {
1195            let full_match = cap.get(0).unwrap();
1196            let match_start = full_match.start();
1197            let match_end = full_match.end();
1198
1199            // Skip if this was already found by pulldown-cmark (it's a valid link)
1200            if found_positions.contains(&match_start) {
1201                continue;
1202            }
1203
1204            // Skip if escaped
1205            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1206                continue;
1207            }
1208
1209            // Skip if it's an image
1210            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
1211                continue;
1212            }
1213
1214            // Skip if in code block
1215            if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
1216                continue;
1217            }
1218
1219            // Skip if in code span
1220            if Self::is_offset_in_code_span(code_spans, match_start) {
1221                continue;
1222            }
1223
1224            // Skip if in HTML comment
1225            if is_in_html_comment_ranges(html_comment_ranges, match_start) {
1226                continue;
1227            }
1228
1229            // Find line and column information
1230            let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1231
1232            // Skip if this link is on a MkDocs snippet line
1233            if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1234                continue;
1235            }
1236
1237            let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1238
1239            let text = cap.get(1).map_or("", |m| m.as_str());
1240
1241            // Only process reference links (group 6)
1242            if let Some(ref_id) = cap.get(6) {
1243                let ref_id_str = ref_id.as_str();
1244                let normalized_ref = if ref_id_str.is_empty() {
1245                    Cow::Owned(text.to_lowercase()) // Implicit reference
1246                } else {
1247                    Cow::Owned(ref_id_str.to_lowercase())
1248                };
1249
1250                // This is an undefined reference (pulldown-cmark didn't parse it)
1251                links.push(ParsedLink {
1252                    line: line_num,
1253                    start_col: col_start,
1254                    end_col: col_end,
1255                    byte_offset: match_start,
1256                    byte_end: match_end,
1257                    text: Cow::Borrowed(text),
1258                    url: Cow::Borrowed(""), // Empty URL indicates undefined reference
1259                    is_reference: true,
1260                    reference_id: Some(normalized_ref),
1261                    link_type: LinkType::Reference, // Undefined references are reference-style
1262                });
1263            }
1264        }
1265
1266        (links, broken_links, footnote_refs)
1267    }
1268
1269    /// Parse all images in the content
1270    fn parse_images(
1271        content: &'a str,
1272        lines: &[LineInfo],
1273        code_blocks: &[(usize, usize)],
1274        code_spans: &[CodeSpan],
1275        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1276    ) -> Vec<ParsedImage<'a>> {
1277        use crate::utils::skip_context::is_in_html_comment_ranges;
1278        use std::collections::HashSet;
1279
1280        // Pre-size based on a heuristic: images are less common than links
1281        let mut images = Vec::with_capacity(content.len() / 1000);
1282        let mut found_positions = HashSet::new();
1283
1284        // Use pulldown-cmark for parsing - more accurate and faster
1285        let parser = Parser::new(content).into_offset_iter();
1286        let mut image_stack: Vec<(usize, pulldown_cmark::CowStr<'a>, LinkType, pulldown_cmark::CowStr<'a>)> =
1287            Vec::new();
1288        let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1289
1290        for (event, range) in parser {
1291            match event {
1292                Event::Start(Tag::Image {
1293                    link_type,
1294                    dest_url,
1295                    id,
1296                    ..
1297                }) => {
1298                    image_stack.push((range.start, dest_url, link_type, id));
1299                    text_chunks.clear();
1300                }
1301                Event::Text(text) if !image_stack.is_empty() => {
1302                    text_chunks.push((text.to_string(), range.start, range.end));
1303                }
1304                Event::Code(code) if !image_stack.is_empty() => {
1305                    let code_text = format!("`{code}`");
1306                    text_chunks.push((code_text, range.start, range.end));
1307                }
1308                Event::End(TagEnd::Image) => {
1309                    if let Some((start_pos, url, link_type, ref_id)) = image_stack.pop() {
1310                        // Skip if in code block
1311                        if CodeBlockUtils::is_in_code_block(code_blocks, start_pos) {
1312                            continue;
1313                        }
1314
1315                        // Skip if in code span
1316                        if Self::is_offset_in_code_span(code_spans, start_pos) {
1317                            continue;
1318                        }
1319
1320                        // Skip if in HTML comment
1321                        if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1322                            continue;
1323                        }
1324
1325                        // Find line and column using binary search
1326                        let (_, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1327                        let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1328
1329                        let is_reference = matches!(
1330                            link_type,
1331                            LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1332                        );
1333
1334                        // Extract alt text directly from source bytes to preserve escaping
1335                        // Text events from pulldown-cmark unescape \] → ], which breaks rules that need escaping
1336                        let alt_text = if start_pos < content.len() {
1337                            let image_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1338
1339                            // Find MATCHING ] by tracking bracket depth for nested brackets
1340                            // An unescaped bracket is one NOT preceded by an odd number of backslashes
1341                            let mut close_pos = None;
1342                            let mut depth = 0;
1343
1344                            if image_bytes.len() > 2 {
1345                                for (i, &byte) in image_bytes.iter().enumerate().skip(2) {
1346                                    // Count preceding backslashes
1347                                    let mut backslash_count = 0;
1348                                    let mut j = i;
1349                                    while j > 0 && image_bytes[j - 1] == b'\\' {
1350                                        backslash_count += 1;
1351                                        j -= 1;
1352                                    }
1353                                    let is_escaped = backslash_count % 2 != 0;
1354
1355                                    if !is_escaped {
1356                                        if byte == b'[' {
1357                                            depth += 1;
1358                                        } else if byte == b']' {
1359                                            if depth == 0 {
1360                                                // Found the matching closing bracket
1361                                                close_pos = Some(i);
1362                                                break;
1363                                            } else {
1364                                                depth -= 1;
1365                                            }
1366                                        }
1367                                    }
1368                                }
1369                            }
1370
1371                            if let Some(pos) = close_pos {
1372                                Cow::Borrowed(std::str::from_utf8(&image_bytes[2..pos]).unwrap_or(""))
1373                            } else {
1374                                Cow::Borrowed("")
1375                            }
1376                        } else {
1377                            Cow::Borrowed("")
1378                        };
1379
1380                        let reference_id = if is_reference && !ref_id.is_empty() {
1381                            Some(Cow::Owned(ref_id.to_lowercase()))
1382                        } else if is_reference {
1383                            Some(Cow::Owned(alt_text.to_lowercase())) // Collapsed/shortcut references
1384                        } else {
1385                            None
1386                        };
1387
1388                        found_positions.insert(start_pos);
1389                        images.push(ParsedImage {
1390                            line: line_num,
1391                            start_col: col_start,
1392                            end_col: col_end,
1393                            byte_offset: start_pos,
1394                            byte_end: range.end,
1395                            alt_text,
1396                            url: Cow::Owned(url.to_string()),
1397                            is_reference,
1398                            reference_id,
1399                            link_type,
1400                        });
1401                    }
1402                }
1403                _ => {}
1404            }
1405        }
1406
1407        // Regex fallback for undefined references that pulldown-cmark treats as plain text
1408        for cap in IMAGE_PATTERN.captures_iter(content) {
1409            let full_match = cap.get(0).unwrap();
1410            let match_start = full_match.start();
1411            let match_end = full_match.end();
1412
1413            // Skip if already found by pulldown-cmark
1414            if found_positions.contains(&match_start) {
1415                continue;
1416            }
1417
1418            // Skip if the ! is escaped
1419            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1420                continue;
1421            }
1422
1423            // Skip if in code block, code span, or HTML comment
1424            if CodeBlockUtils::is_in_code_block(code_blocks, match_start)
1425                || Self::is_offset_in_code_span(code_spans, match_start)
1426                || is_in_html_comment_ranges(html_comment_ranges, match_start)
1427            {
1428                continue;
1429            }
1430
1431            // Only process reference images (undefined references not found by pulldown-cmark)
1432            if let Some(ref_id) = cap.get(6) {
1433                let (_, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1434                let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1435                let alt_text = cap.get(1).map_or("", |m| m.as_str());
1436                let ref_id_str = ref_id.as_str();
1437                let normalized_ref = if ref_id_str.is_empty() {
1438                    Cow::Owned(alt_text.to_lowercase())
1439                } else {
1440                    Cow::Owned(ref_id_str.to_lowercase())
1441                };
1442
1443                images.push(ParsedImage {
1444                    line: line_num,
1445                    start_col: col_start,
1446                    end_col: col_end,
1447                    byte_offset: match_start,
1448                    byte_end: match_end,
1449                    alt_text: Cow::Borrowed(alt_text),
1450                    url: Cow::Borrowed(""),
1451                    is_reference: true,
1452                    reference_id: Some(normalized_ref),
1453                    link_type: LinkType::Reference, // Undefined references are reference-style
1454                });
1455            }
1456        }
1457
1458        images
1459    }
1460
1461    /// Parse reference definitions
1462    fn parse_reference_defs(content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
1463        // Pre-size based on lines count as reference definitions are line-based
1464        let mut refs = Vec::with_capacity(lines.len() / 20); // ~1 ref per 20 lines
1465
1466        for (line_idx, line_info) in lines.iter().enumerate() {
1467            // Skip lines in code blocks
1468            if line_info.in_code_block {
1469                continue;
1470            }
1471
1472            let line = line_info.content(content);
1473            let line_num = line_idx + 1;
1474
1475            if let Some(cap) = REF_DEF_PATTERN.captures(line) {
1476                let id = cap.get(1).unwrap().as_str().to_lowercase();
1477                let url = cap.get(2).unwrap().as_str().to_string();
1478                let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
1479
1480                // Calculate byte positions
1481                // The match starts at the beginning of the line (0) and extends to the end
1482                let match_obj = cap.get(0).unwrap();
1483                let byte_offset = line_info.byte_offset + match_obj.start();
1484                let byte_end = line_info.byte_offset + match_obj.end();
1485
1486                refs.push(ReferenceDef {
1487                    line: line_num,
1488                    id,
1489                    url,
1490                    title,
1491                    byte_offset,
1492                    byte_end,
1493                });
1494            }
1495        }
1496
1497        refs
1498    }
1499
1500    /// Fast blockquote prefix parser - replaces regex for 5-10x speedup
1501    /// Matches: ^(\s*>\s*)(.*)
1502    /// Returns: Some((prefix_with_ws, content_after_prefix)) or None
1503    #[inline]
1504    fn parse_blockquote_prefix(line: &str) -> Option<(&str, &str)> {
1505        let trimmed_start = line.trim_start();
1506        if !trimmed_start.starts_with('>') {
1507            return None;
1508        }
1509
1510        let leading_ws_len = line.len() - trimmed_start.len();
1511        let after_gt = &trimmed_start[1..];
1512        let content = after_gt.trim_start();
1513        let ws_after_gt_len = after_gt.len() - content.len();
1514        let prefix_len = leading_ws_len + 1 + ws_after_gt_len;
1515
1516        Some((&line[..prefix_len], content))
1517    }
1518
1519    /// Fast unordered list parser - replaces regex for 5-10x speedup
1520    /// Matches: ^(\s*)([-*+])([ \t]*)(.*)
1521    /// Returns: Some((leading_ws, marker, spacing, content)) or None
1522    #[inline]
1523    fn parse_unordered_list(line: &str) -> Option<(&str, char, &str, &str)> {
1524        let bytes = line.as_bytes();
1525        let mut i = 0;
1526
1527        // Skip leading whitespace
1528        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1529            i += 1;
1530        }
1531
1532        // Check for marker
1533        if i >= bytes.len() {
1534            return None;
1535        }
1536        let marker = bytes[i] as char;
1537        if marker != '-' && marker != '*' && marker != '+' {
1538            return None;
1539        }
1540        let marker_pos = i;
1541        i += 1;
1542
1543        // Collect spacing after marker (space or tab only)
1544        let spacing_start = i;
1545        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1546            i += 1;
1547        }
1548
1549        Some((&line[..marker_pos], marker, &line[spacing_start..i], &line[i..]))
1550    }
1551
1552    /// Fast ordered list parser - replaces regex for 5-10x speedup
1553    /// Matches: ^(\s*)(\d+)([.)])([ \t]*)(.*)
1554    /// Returns: Some((leading_ws, number_str, delimiter, spacing, content)) or None
1555    #[inline]
1556    fn parse_ordered_list(line: &str) -> Option<(&str, &str, char, &str, &str)> {
1557        let bytes = line.as_bytes();
1558        let mut i = 0;
1559
1560        // Skip leading whitespace
1561        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1562            i += 1;
1563        }
1564
1565        // Collect digits
1566        let number_start = i;
1567        while i < bytes.len() && bytes[i].is_ascii_digit() {
1568            i += 1;
1569        }
1570        if i == number_start {
1571            return None; // No digits found
1572        }
1573
1574        // Check for delimiter
1575        if i >= bytes.len() {
1576            return None;
1577        }
1578        let delimiter = bytes[i] as char;
1579        if delimiter != '.' && delimiter != ')' {
1580            return None;
1581        }
1582        let delimiter_pos = i;
1583        i += 1;
1584
1585        // Collect spacing after delimiter (space or tab only)
1586        let spacing_start = i;
1587        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1588            i += 1;
1589        }
1590
1591        Some((
1592            &line[..number_start],
1593            &line[number_start..delimiter_pos],
1594            delimiter,
1595            &line[spacing_start..i],
1596            &line[i..],
1597        ))
1598    }
1599
1600    /// Pre-compute which lines are in code blocks - O(m*n) where m=code_blocks, n=lines
1601    /// Returns a Vec<bool> where index i indicates if line i is in a code block
1602    fn compute_code_block_line_map(content: &str, line_offsets: &[usize], code_blocks: &[(usize, usize)]) -> Vec<bool> {
1603        let num_lines = line_offsets.len();
1604        let mut in_code_block = vec![false; num_lines];
1605
1606        // For each code block, mark all lines within it
1607        for &(start, end) in code_blocks {
1608            // Ensure we're at valid UTF-8 boundaries
1609            let safe_start = if start > 0 && !content.is_char_boundary(start) {
1610                let mut boundary = start;
1611                while boundary > 0 && !content.is_char_boundary(boundary) {
1612                    boundary -= 1;
1613                }
1614                boundary
1615            } else {
1616                start
1617            };
1618
1619            let safe_end = if end < content.len() && !content.is_char_boundary(end) {
1620                let mut boundary = end;
1621                while boundary < content.len() && !content.is_char_boundary(boundary) {
1622                    boundary += 1;
1623                }
1624                boundary
1625            } else {
1626                end.min(content.len())
1627            };
1628
1629            // Trust the code blocks detected by CodeBlockUtils::detect_code_blocks()
1630            // That function now has proper list context awareness (see code_block_utils.rs)
1631            // and correctly distinguishes between:
1632            // - Fenced code blocks (``` or ~~~)
1633            // - Indented code blocks at document level (4 spaces + blank line before)
1634            // - List continuation paragraphs (NOT code blocks, even with 4 spaces)
1635            //
1636            // We no longer need to re-validate here. The original validation logic
1637            // was causing false positives by marking list continuation paragraphs as
1638            // code blocks when they have 4 spaces of indentation.
1639
1640            // Use binary search to find the first and last line indices
1641            // line_offsets is sorted, so we can use partition_point for O(log n) lookup
1642            // Use safe_start/safe_end (UTF-8 boundaries) for consistent line mapping
1643            //
1644            // Find the line that CONTAINS safe_start: the line with the largest
1645            // start offset that is <= safe_start. partition_point gives us the
1646            // first line that starts AFTER safe_start, so we subtract 1.
1647            let first_line_after = line_offsets.partition_point(|&offset| offset <= safe_start);
1648            let first_line = first_line_after.saturating_sub(1);
1649            let last_line = line_offsets.partition_point(|&offset| offset < safe_end);
1650
1651            // Mark all lines in the range at once
1652            for flag in in_code_block.iter_mut().take(last_line).skip(first_line) {
1653                *flag = true;
1654            }
1655        }
1656
1657        in_code_block
1658    }
1659
1660    /// Pre-compute basic line information (without headings/blockquotes)
1661    fn compute_basic_line_info(
1662        content: &str,
1663        line_offsets: &[usize],
1664        code_blocks: &[(usize, usize)],
1665        flavor: MarkdownFlavor,
1666        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1667        autodoc_ranges: &[crate::utils::skip_context::ByteRange],
1668    ) -> Vec<LineInfo> {
1669        let content_lines: Vec<&str> = content.lines().collect();
1670        let mut lines = Vec::with_capacity(content_lines.len());
1671
1672        // Pre-compute which lines are in code blocks
1673        let code_block_map = Self::compute_code_block_line_map(content, line_offsets, code_blocks);
1674
1675        // Detect front matter boundaries FIRST, before any other parsing
1676        // Use FrontMatterUtils to detect all types of front matter (YAML, TOML, JSON, malformed)
1677        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1678
1679        for (i, line) in content_lines.iter().enumerate() {
1680            let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
1681            let indent = line.len() - line.trim_start().len();
1682
1683            // Parse blockquote prefix once and reuse it (avoid redundant parsing)
1684            let blockquote_parse = Self::parse_blockquote_prefix(line);
1685
1686            // For blank detection, consider blockquote context
1687            let is_blank = if let Some((_, content)) = blockquote_parse {
1688                // In blockquote context, check if content after prefix is blank
1689                content.trim().is_empty()
1690            } else {
1691                line.trim().is_empty()
1692            };
1693
1694            // Use pre-computed map for O(1) lookup instead of O(m) iteration
1695            let in_code_block = code_block_map.get(i).copied().unwrap_or(false);
1696
1697            // Detect list items (skip if in frontmatter, in mkdocstrings block, or in HTML comment)
1698            let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
1699                && crate::utils::mkdocstrings_refs::is_within_autodoc_block_ranges(autodoc_ranges, byte_offset);
1700            // Use pre-computed ranges for efficiency (O(log n) vs O(file_size))
1701            let in_html_comment =
1702                crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, byte_offset);
1703            let list_item = if !(in_code_block
1704                || is_blank
1705                || in_mkdocstrings
1706                || in_html_comment
1707                || (front_matter_end > 0 && i < front_matter_end))
1708            {
1709                // Strip blockquote prefix if present for list detection (reuse cached result)
1710                let (line_for_list_check, blockquote_prefix_len) = if let Some((prefix, content)) = blockquote_parse {
1711                    (content, prefix.len())
1712                } else {
1713                    (&**line, 0)
1714                };
1715
1716                if let Some((leading_spaces, marker, spacing, _content)) =
1717                    Self::parse_unordered_list(line_for_list_check)
1718                {
1719                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1720                    let content_column = marker_column + 1 + spacing.len();
1721
1722                    // According to CommonMark spec, unordered list items MUST have at least one space
1723                    // after the marker (-, *, or +). Without a space, it's not a list item.
1724                    // This also naturally handles cases like:
1725                    // - *emphasis* (not a list)
1726                    // - **bold** (not a list)
1727                    // - --- (horizontal rule, not a list)
1728                    if spacing.is_empty() {
1729                        None
1730                    } else {
1731                        Some(ListItemInfo {
1732                            marker: marker.to_string(),
1733                            is_ordered: false,
1734                            number: None,
1735                            marker_column,
1736                            content_column,
1737                        })
1738                    }
1739                } else if let Some((leading_spaces, number_str, delimiter, spacing, _content)) =
1740                    Self::parse_ordered_list(line_for_list_check)
1741                {
1742                    let marker = format!("{number_str}{delimiter}");
1743                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1744                    let content_column = marker_column + marker.len() + spacing.len();
1745
1746                    // According to CommonMark spec, ordered list items MUST have at least one space
1747                    // after the marker (period or parenthesis). Without a space, it's not a list item.
1748                    if spacing.is_empty() {
1749                        None
1750                    } else {
1751                        Some(ListItemInfo {
1752                            marker,
1753                            is_ordered: true,
1754                            number: number_str.parse().ok(),
1755                            marker_column,
1756                            content_column,
1757                        })
1758                    }
1759                } else {
1760                    None
1761                }
1762            } else {
1763                None
1764            };
1765
1766            lines.push(LineInfo {
1767                byte_offset,
1768                byte_len: line.len(),
1769                indent,
1770                is_blank,
1771                in_code_block,
1772                in_front_matter: front_matter_end > 0 && i < front_matter_end,
1773                in_html_block: false, // Will be populated after line creation
1774                in_html_comment,
1775                list_item,
1776                heading: None,    // Will be populated in second pass for Setext headings
1777                blockquote: None, // Will be populated after line creation
1778                in_mkdocstrings,
1779                in_esm_block: false, // Will be populated after line creation for MDX files
1780            });
1781        }
1782
1783        lines
1784    }
1785
1786    /// Detect headings and blockquotes (called after HTML block detection)
1787    fn detect_headings_and_blockquotes(
1788        content: &str,
1789        lines: &mut [LineInfo],
1790        flavor: MarkdownFlavor,
1791        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1792    ) {
1793        // Regex for heading detection
1794        static ATX_HEADING_REGEX: LazyLock<regex::Regex> =
1795            LazyLock::new(|| regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap());
1796        static SETEXT_UNDERLINE_REGEX: LazyLock<regex::Regex> =
1797            LazyLock::new(|| regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap());
1798
1799        let content_lines: Vec<&str> = content.lines().collect();
1800
1801        // Detect front matter boundaries to skip those lines
1802        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1803
1804        // Detect headings (including Setext which needs look-ahead) and blockquotes
1805        for i in 0..lines.len() {
1806            if lines[i].in_code_block {
1807                continue;
1808            }
1809
1810            // Skip lines in front matter
1811            if front_matter_end > 0 && i < front_matter_end {
1812                continue;
1813            }
1814
1815            // Skip lines in HTML blocks - HTML content should not be parsed as markdown
1816            if lines[i].in_html_block {
1817                continue;
1818            }
1819
1820            let line = content_lines[i];
1821
1822            // Check for blockquotes (even on blank lines within blockquotes)
1823            if let Some(bq) = parse_blockquote_detailed(line) {
1824                let nesting_level = bq.markers.len(); // Each '>' is one level
1825                let marker_column = bq.indent.len();
1826
1827                // Build the prefix (indentation + markers + space)
1828                let prefix = format!("{}{}{}", bq.indent, bq.markers, bq.spaces_after);
1829
1830                // Check for various blockquote issues
1831                let has_no_space = bq.spaces_after.is_empty() && !bq.content.is_empty();
1832                // Only flag multiple literal spaces, not tabs
1833                // Tabs are handled by MD010 (no-hard-tabs), matching markdownlint behavior
1834                let has_multiple_spaces = bq.spaces_after.chars().filter(|&c| c == ' ').count() > 1;
1835
1836                // Check if needs MD028 fix (empty blockquote line without proper spacing)
1837                // MD028 flags empty blockquote lines that don't have a single space after the marker
1838                // Lines like "> " or ">> " are already correct and don't need fixing
1839                let needs_md028_fix = bq.content.is_empty() && bq.spaces_after.is_empty();
1840
1841                lines[i].blockquote = Some(BlockquoteInfo {
1842                    nesting_level,
1843                    indent: bq.indent.to_string(),
1844                    marker_column,
1845                    prefix,
1846                    content: bq.content.to_string(),
1847                    has_no_space_after_marker: has_no_space,
1848                    has_multiple_spaces_after_marker: has_multiple_spaces,
1849                    needs_md028_fix,
1850                });
1851            }
1852
1853            // Skip heading detection for blank lines
1854            if lines[i].is_blank {
1855                continue;
1856            }
1857
1858            // Check for ATX headings (but skip MkDocs snippet lines)
1859            // In MkDocs flavor, lines like "# -8<- [start:name]" are snippet markers, not headings
1860            let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
1861                crate::utils::mkdocs_snippets::is_snippet_section_start(line)
1862                    || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
1863            } else {
1864                false
1865            };
1866
1867            if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1868                // Skip headings inside HTML comments (using pre-computed ranges for efficiency)
1869                if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset) {
1870                    continue;
1871                }
1872                let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1873                let hashes = caps.get(2).map_or("", |m| m.as_str());
1874                let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1875                let rest = caps.get(4).map_or("", |m| m.as_str());
1876
1877                let level = hashes.len() as u8;
1878                let marker_column = leading_spaces.len();
1879
1880                // Check for closing sequence, but handle custom IDs that might come after
1881                let (text, has_closing, closing_seq) = {
1882                    // First check if there's a custom ID at the end
1883                    let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1884                        // Check if this looks like a valid custom ID (ends with })
1885                        if rest[id_start..].trim_end().ends_with('}') {
1886                            // Split off the custom ID
1887                            (&rest[..id_start], &rest[id_start..])
1888                        } else {
1889                            (rest, "")
1890                        }
1891                    } else {
1892                        (rest, "")
1893                    };
1894
1895                    // Now look for closing hashes in the part before the custom ID
1896                    let trimmed_rest = rest_without_id.trim_end();
1897                    if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1898                        // Look for the start of the hash sequence
1899                        let mut start_of_hashes = last_hash_pos;
1900                        while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1901                            start_of_hashes -= 1;
1902                        }
1903
1904                        // Check if there's at least one space before the closing hashes
1905                        let has_space_before = start_of_hashes == 0
1906                            || trimmed_rest
1907                                .chars()
1908                                .nth(start_of_hashes - 1)
1909                                .is_some_and(|c| c.is_whitespace());
1910
1911                        // Check if this is a valid closing sequence (all hashes to end of trimmed part)
1912                        let potential_closing = &trimmed_rest[start_of_hashes..];
1913                        let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1914
1915                        if is_all_hashes && has_space_before {
1916                            // This is a closing sequence
1917                            let closing_hashes = potential_closing.to_string();
1918                            // The text is everything before the closing hashes
1919                            // Don't include the custom ID here - it will be extracted later
1920                            let text_part = if !custom_id_part.is_empty() {
1921                                // If we have a custom ID, append it back to get the full rest
1922                                // This allows the extract_header_id function to handle it properly
1923                                format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1924                            } else {
1925                                rest_without_id[..start_of_hashes].trim_end().to_string()
1926                            };
1927                            (text_part, true, closing_hashes)
1928                        } else {
1929                            // Not a valid closing sequence, return the full content
1930                            (rest.to_string(), false, String::new())
1931                        }
1932                    } else {
1933                        // No hashes found, return the full content
1934                        (rest.to_string(), false, String::new())
1935                    }
1936                };
1937
1938                let content_column = marker_column + hashes.len() + spaces_after.len();
1939
1940                // Extract custom header ID if present
1941                let raw_text = text.trim().to_string();
1942                let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1943
1944                // If no custom ID was found on the header line, check the next line for standalone attr-list
1945                if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1946                    let next_line = content_lines[i + 1];
1947                    if !lines[i + 1].in_code_block
1948                        && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1949                        && let Some(next_line_id) =
1950                            crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1951                    {
1952                        custom_id = Some(next_line_id);
1953                    }
1954                }
1955
1956                lines[i].heading = Some(HeadingInfo {
1957                    level,
1958                    style: HeadingStyle::ATX,
1959                    marker: hashes.to_string(),
1960                    marker_column,
1961                    content_column,
1962                    text: clean_text,
1963                    custom_id,
1964                    raw_text,
1965                    has_closing_sequence: has_closing,
1966                    closing_sequence: closing_seq,
1967                });
1968            }
1969            // Check for Setext headings (need to look at next line)
1970            else if i + 1 < content_lines.len() && i + 1 < lines.len() {
1971                let next_line = content_lines[i + 1];
1972                if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
1973                    // Skip if next line is front matter delimiter
1974                    if front_matter_end > 0 && i < front_matter_end {
1975                        continue;
1976                    }
1977
1978                    // Skip Setext headings inside HTML comments (using pre-computed ranges for efficiency)
1979                    if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset)
1980                    {
1981                        continue;
1982                    }
1983
1984                    let underline = next_line.trim();
1985
1986                    // Skip if the underline looks like YAML delimiter (exactly 3 or more dashes)
1987                    // YAML uses exactly `---` while Setext headings typically use longer underlines
1988                    if underline == "---" {
1989                        continue;
1990                    }
1991
1992                    // Skip if the current line looks like YAML key-value syntax
1993                    let current_line_trimmed = line.trim();
1994                    if current_line_trimmed.contains(':')
1995                        && !current_line_trimmed.starts_with('#')
1996                        && !current_line_trimmed.contains('[')
1997                        && !current_line_trimmed.contains("](")
1998                    {
1999                        // This looks like "key: value" which suggests YAML, not a heading
2000                        continue;
2001                    }
2002
2003                    let level = if underline.starts_with('=') { 1 } else { 2 };
2004                    let style = if level == 1 {
2005                        HeadingStyle::Setext1
2006                    } else {
2007                        HeadingStyle::Setext2
2008                    };
2009
2010                    // Extract custom header ID if present
2011                    let raw_text = line.trim().to_string();
2012                    let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
2013
2014                    // If no custom ID was found on the header line, check the line after underline for standalone attr-list
2015                    if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
2016                        let attr_line = content_lines[i + 2];
2017                        if !lines[i + 2].in_code_block
2018                            && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
2019                            && let Some(attr_line_id) =
2020                                crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
2021                        {
2022                            custom_id = Some(attr_line_id);
2023                        }
2024                    }
2025
2026                    lines[i].heading = Some(HeadingInfo {
2027                        level,
2028                        style,
2029                        marker: underline.to_string(),
2030                        marker_column: next_line.len() - next_line.trim_start().len(),
2031                        content_column: lines[i].indent,
2032                        text: clean_text,
2033                        custom_id,
2034                        raw_text,
2035                        has_closing_sequence: false,
2036                        closing_sequence: String::new(),
2037                    });
2038                }
2039            }
2040        }
2041    }
2042
2043    /// Detect HTML blocks in the content
2044    fn detect_html_blocks(content: &str, lines: &mut [LineInfo]) {
2045        // HTML block elements that trigger block context
2046        const BLOCK_ELEMENTS: &[&str] = &[
2047            "address",
2048            "article",
2049            "aside",
2050            "blockquote",
2051            "details",
2052            "dialog",
2053            "dd",
2054            "div",
2055            "dl",
2056            "dt",
2057            "fieldset",
2058            "figcaption",
2059            "figure",
2060            "footer",
2061            "form",
2062            "h1",
2063            "h2",
2064            "h3",
2065            "h4",
2066            "h5",
2067            "h6",
2068            "header",
2069            "hr",
2070            "li",
2071            "main",
2072            "nav",
2073            "ol",
2074            "p",
2075            "picture",
2076            "pre",
2077            "script",
2078            "section",
2079            "style",
2080            "table",
2081            "tbody",
2082            "td",
2083            "textarea",
2084            "tfoot",
2085            "th",
2086            "thead",
2087            "tr",
2088            "ul",
2089        ];
2090
2091        let mut i = 0;
2092        while i < lines.len() {
2093            // Skip if already in code block or front matter
2094            if lines[i].in_code_block || lines[i].in_front_matter {
2095                i += 1;
2096                continue;
2097            }
2098
2099            let trimmed = lines[i].content(content).trim_start();
2100
2101            // Check if line starts with an HTML tag
2102            if trimmed.starts_with('<') && trimmed.len() > 1 {
2103                // Extract tag name safely
2104                let after_bracket = &trimmed[1..];
2105                let is_closing = after_bracket.starts_with('/');
2106                let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
2107
2108                // Extract tag name (stop at space, >, /, or end of string)
2109                let tag_name = tag_start
2110                    .chars()
2111                    .take_while(|c| c.is_ascii_alphabetic() || *c == '-' || c.is_ascii_digit())
2112                    .collect::<String>()
2113                    .to_lowercase();
2114
2115                // Check if it's a block element
2116                if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
2117                    // Mark this line as in HTML block
2118                    lines[i].in_html_block = true;
2119
2120                    // For simplicity, just mark lines until we find a closing tag or reach a blank line
2121                    // This avoids complex nesting logic that might cause infinite loops
2122                    if !is_closing {
2123                        let closing_tag = format!("</{tag_name}>");
2124                        // style and script tags can contain blank lines (CSS/JS formatting)
2125                        let allow_blank_lines = tag_name == "style" || tag_name == "script";
2126                        let mut j = i + 1;
2127                        while j < lines.len() && j < i + 100 {
2128                            // Limit search to 100 lines
2129                            // Stop at blank lines (except for style/script tags)
2130                            if !allow_blank_lines && lines[j].is_blank {
2131                                break;
2132                            }
2133
2134                            lines[j].in_html_block = true;
2135
2136                            // Check if this line contains the closing tag
2137                            if lines[j].content(content).contains(&closing_tag) {
2138                                break;
2139                            }
2140                            j += 1;
2141                        }
2142                    }
2143                }
2144            }
2145
2146            i += 1;
2147        }
2148    }
2149
2150    /// Detect ESM import/export blocks in MDX files
2151    /// ESM blocks consist of contiguous import/export statements at the top of the file
2152    fn detect_esm_blocks(content: &str, lines: &mut [LineInfo], flavor: MarkdownFlavor) {
2153        // Only process MDX files
2154        if !flavor.supports_esm_blocks() {
2155            return;
2156        }
2157
2158        for line in lines.iter_mut() {
2159            // Skip blank lines and comments at the start
2160            if line.is_blank || line.in_html_comment {
2161                continue;
2162            }
2163
2164            // Check if line starts with import or export
2165            let trimmed = line.content(content).trim_start();
2166            if trimmed.starts_with("import ") || trimmed.starts_with("export ") {
2167                line.in_esm_block = true;
2168            } else {
2169                // Once we hit a non-ESM line, we're done with the ESM block
2170                break;
2171            }
2172        }
2173    }
2174
2175    /// Parse all inline code spans in the content using pulldown-cmark streaming parser
2176    fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
2177        let mut code_spans = Vec::new();
2178
2179        // Quick check - if no backticks, no code spans
2180        if !content.contains('`') {
2181            return code_spans;
2182        }
2183
2184        // Use pulldown-cmark's streaming parser with byte offsets
2185        let parser = Parser::new(content).into_offset_iter();
2186
2187        for (event, range) in parser {
2188            if let Event::Code(_) = event {
2189                let start_pos = range.start;
2190                let end_pos = range.end;
2191
2192                // The range includes the backticks, extract the actual content
2193                let full_span = &content[start_pos..end_pos];
2194                let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
2195
2196                // Extract content between backticks, preserving spaces
2197                let content_start = start_pos + backtick_count;
2198                let content_end = end_pos - backtick_count;
2199                let span_content = if content_start < content_end {
2200                    content[content_start..content_end].to_string()
2201                } else {
2202                    String::new()
2203                };
2204
2205                // Use binary search to find line number - O(log n) instead of O(n)
2206                // Find the rightmost line whose byte_offset <= start_pos
2207                let line_idx = lines
2208                    .partition_point(|line| line.byte_offset <= start_pos)
2209                    .saturating_sub(1);
2210                let line_num = line_idx + 1;
2211                let byte_col_start = start_pos - lines[line_idx].byte_offset;
2212
2213                // Find end column using binary search
2214                let end_line_idx = lines
2215                    .partition_point(|line| line.byte_offset <= end_pos)
2216                    .saturating_sub(1);
2217                let byte_col_end = end_pos - lines[end_line_idx].byte_offset;
2218
2219                // Convert byte offsets to character positions for correct Unicode handling
2220                // This ensures consistency with warning.column which uses character positions
2221                let line_content = lines[line_idx].content(content);
2222                let col_start = if byte_col_start <= line_content.len() {
2223                    line_content[..byte_col_start].chars().count()
2224                } else {
2225                    line_content.chars().count()
2226                };
2227
2228                let end_line_content = lines[end_line_idx].content(content);
2229                let col_end = if byte_col_end <= end_line_content.len() {
2230                    end_line_content[..byte_col_end].chars().count()
2231                } else {
2232                    end_line_content.chars().count()
2233                };
2234
2235                code_spans.push(CodeSpan {
2236                    line: line_num,
2237                    start_col: col_start,
2238                    end_col: col_end,
2239                    byte_offset: start_pos,
2240                    byte_end: end_pos,
2241                    backtick_count,
2242                    content: span_content,
2243                });
2244            }
2245        }
2246
2247        // Sort by position to ensure consistent ordering
2248        code_spans.sort_by_key(|span| span.byte_offset);
2249
2250        code_spans
2251    }
2252
2253    /// Parse all list blocks in the content (legacy line-by-line approach)
2254    ///
2255    /// Uses a forward-scanning O(n) algorithm that tracks two variables during iteration:
2256    /// - `has_list_breaking_content_since_last_item`: Set when encountering content that
2257    ///   terminates a list (headings, horizontal rules, tables, insufficiently indented content)
2258    /// - `min_continuation_for_tracking`: Minimum indentation required for content to be
2259    ///   treated as list continuation (based on the list marker width)
2260    ///
2261    /// When a new list item is encountered, we check if list-breaking content was seen
2262    /// since the last item. If so, we start a new list block.
2263    fn parse_list_blocks(content: &str, lines: &[LineInfo]) -> Vec<ListBlock> {
2264        // Minimum indentation for unordered list continuation per CommonMark spec
2265        const UNORDERED_LIST_MIN_CONTINUATION_INDENT: usize = 2;
2266
2267        /// Initialize or reset the forward-scanning tracking state.
2268        /// This helper eliminates code duplication across three initialization sites.
2269        #[inline]
2270        fn reset_tracking_state(
2271            list_item: &ListItemInfo,
2272            has_list_breaking_content: &mut bool,
2273            min_continuation: &mut usize,
2274        ) {
2275            *has_list_breaking_content = false;
2276            let marker_width = if list_item.is_ordered {
2277                list_item.marker.len() + 1 // Ordered markers need space after period/paren
2278            } else {
2279                list_item.marker.len()
2280            };
2281            *min_continuation = if list_item.is_ordered {
2282                marker_width
2283            } else {
2284                UNORDERED_LIST_MIN_CONTINUATION_INDENT
2285            };
2286        }
2287
2288        // Pre-size based on lines that could be list items
2289        let mut list_blocks = Vec::with_capacity(lines.len() / 10); // Estimate ~10% of lines might start list blocks
2290        let mut current_block: Option<ListBlock> = None;
2291        let mut last_list_item_line = 0;
2292        let mut current_indent_level = 0;
2293        let mut last_marker_width = 0;
2294
2295        // Track list-breaking content since last item (fixes O(n²) bottleneck from issue #148)
2296        let mut has_list_breaking_content_since_last_item = false;
2297        let mut min_continuation_for_tracking = 0;
2298
2299        for (line_idx, line_info) in lines.iter().enumerate() {
2300            let line_num = line_idx + 1;
2301
2302            // Enhanced code block handling using Design #3's context analysis
2303            if line_info.in_code_block {
2304                if let Some(ref mut block) = current_block {
2305                    // Calculate minimum indentation for list continuation
2306                    let min_continuation_indent =
2307                        CodeBlockUtils::calculate_min_continuation_indent(content, lines, line_idx);
2308
2309                    // Analyze code block context using the three-tier classification
2310                    let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
2311
2312                    match context {
2313                        CodeBlockContext::Indented => {
2314                            // Code block is properly indented - continues the list
2315                            block.end_line = line_num;
2316                            continue;
2317                        }
2318                        CodeBlockContext::Standalone => {
2319                            // Code block separates lists - end current block
2320                            let completed_block = current_block.take().unwrap();
2321                            list_blocks.push(completed_block);
2322                            continue;
2323                        }
2324                        CodeBlockContext::Adjacent => {
2325                            // Edge case - use conservative behavior (continue list)
2326                            block.end_line = line_num;
2327                            continue;
2328                        }
2329                    }
2330                } else {
2331                    // No current list block - skip code block lines
2332                    continue;
2333                }
2334            }
2335
2336            // Extract blockquote prefix if any
2337            let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(line_info.content(content)) {
2338                caps.get(0).unwrap().as_str().to_string()
2339            } else {
2340                String::new()
2341            };
2342
2343            // Track list-breaking content for non-list, non-blank lines (O(n) replacement for nested loop)
2344            if current_block.is_some() && line_info.list_item.is_none() && !line_info.is_blank {
2345                let line_content = line_info.content(content).trim();
2346
2347                // Check for structural separators that break lists
2348                let breaks_list = line_info.heading.is_some()
2349                    || line_content.starts_with("---")
2350                    || line_content.starts_with("***")
2351                    || line_content.starts_with("___")
2352                    || (line_content.contains('|')
2353                        && !line_content.contains("](")
2354                        && !line_content.contains("http")
2355                        && (line_content.matches('|').count() > 1
2356                            || line_content.starts_with('|')
2357                            || line_content.ends_with('|')))
2358                    || line_content.starts_with(">")
2359                    || (line_info.indent < min_continuation_for_tracking);
2360
2361                if breaks_list {
2362                    has_list_breaking_content_since_last_item = true;
2363                }
2364            }
2365
2366            // Check if this line is a list item
2367            if let Some(list_item) = &line_info.list_item {
2368                // Calculate nesting level based on indentation
2369                let item_indent = list_item.marker_column;
2370                let nesting = item_indent / 2; // Assume 2-space indentation for nesting
2371
2372                if let Some(ref mut block) = current_block {
2373                    // Check if this continues the current block
2374                    // For nested lists, we need to check if this is a nested item (higher nesting level)
2375                    // or a continuation at the same or lower level
2376                    let is_nested = nesting > block.nesting_level;
2377                    let same_type =
2378                        (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
2379                    let same_context = block.blockquote_prefix == blockquote_prefix;
2380                    let reasonable_distance = line_num <= last_list_item_line + 2; // Allow one blank line
2381
2382                    // For unordered lists, also check marker consistency
2383                    let marker_compatible =
2384                        block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
2385
2386                    // O(1) check: Use the tracked variable instead of O(n) nested loop
2387                    // This eliminates the quadratic bottleneck from issue #148
2388                    let has_non_list_content = has_list_breaking_content_since_last_item;
2389
2390                    // A list continues if:
2391                    // 1. It's a nested item (indented more than the parent), OR
2392                    // 2. It's the same type at the same level with reasonable distance
2393                    let mut continues_list = if is_nested {
2394                        // Nested items always continue the list if they're in the same context
2395                        same_context && reasonable_distance && !has_non_list_content
2396                    } else {
2397                        // Same-level items need to match type and markers
2398                        same_type && same_context && reasonable_distance && marker_compatible && !has_non_list_content
2399                    };
2400
2401                    // WORKAROUND: If items are truly consecutive (no blank lines), they MUST be in the same list
2402                    // This handles edge cases where content patterns might otherwise split lists incorrectly
2403                    if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
2404                        // Check if the previous line was a list item
2405                        if block.item_lines.contains(&(line_num - 1)) {
2406                            // They're consecutive list items - force them to be in the same list
2407                            continues_list = true;
2408                        }
2409                    }
2410
2411                    if continues_list {
2412                        // Extend current block
2413                        block.end_line = line_num;
2414                        block.item_lines.push(line_num);
2415
2416                        // Update max marker width
2417                        block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
2418                            list_item.marker.len() + 1
2419                        } else {
2420                            list_item.marker.len()
2421                        });
2422
2423                        // Update marker consistency for unordered lists
2424                        if !block.is_ordered
2425                            && block.marker.is_some()
2426                            && block.marker.as_ref() != Some(&list_item.marker)
2427                        {
2428                            // Mixed markers, clear the marker field
2429                            block.marker = None;
2430                        }
2431
2432                        // Reset tracked state for issue #148 optimization
2433                        reset_tracking_state(
2434                            list_item,
2435                            &mut has_list_breaking_content_since_last_item,
2436                            &mut min_continuation_for_tracking,
2437                        );
2438                    } else {
2439                        // End current block and start a new one
2440
2441                        list_blocks.push(block.clone());
2442
2443                        *block = ListBlock {
2444                            start_line: line_num,
2445                            end_line: line_num,
2446                            is_ordered: list_item.is_ordered,
2447                            marker: if list_item.is_ordered {
2448                                None
2449                            } else {
2450                                Some(list_item.marker.clone())
2451                            },
2452                            blockquote_prefix: blockquote_prefix.clone(),
2453                            item_lines: vec![line_num],
2454                            nesting_level: nesting,
2455                            max_marker_width: if list_item.is_ordered {
2456                                list_item.marker.len() + 1
2457                            } else {
2458                                list_item.marker.len()
2459                            },
2460                        };
2461
2462                        // Initialize tracked state for new block (issue #148 optimization)
2463                        reset_tracking_state(
2464                            list_item,
2465                            &mut has_list_breaking_content_since_last_item,
2466                            &mut min_continuation_for_tracking,
2467                        );
2468                    }
2469                } else {
2470                    // Start a new block
2471                    current_block = Some(ListBlock {
2472                        start_line: line_num,
2473                        end_line: line_num,
2474                        is_ordered: list_item.is_ordered,
2475                        marker: if list_item.is_ordered {
2476                            None
2477                        } else {
2478                            Some(list_item.marker.clone())
2479                        },
2480                        blockquote_prefix,
2481                        item_lines: vec![line_num],
2482                        nesting_level: nesting,
2483                        max_marker_width: list_item.marker.len(),
2484                    });
2485
2486                    // Initialize tracked state for new block (issue #148 optimization)
2487                    reset_tracking_state(
2488                        list_item,
2489                        &mut has_list_breaking_content_since_last_item,
2490                        &mut min_continuation_for_tracking,
2491                    );
2492                }
2493
2494                last_list_item_line = line_num;
2495                current_indent_level = item_indent;
2496                last_marker_width = if list_item.is_ordered {
2497                    list_item.marker.len() + 1 // Add 1 for the space after ordered list markers
2498                } else {
2499                    list_item.marker.len()
2500                };
2501            } else if let Some(ref mut block) = current_block {
2502                // Not a list item - check if it continues the current block
2503
2504                // For MD032 compatibility, we use a simple approach:
2505                // - Indented lines continue the list
2506                // - Blank lines followed by indented content continue the list
2507                // - Everything else ends the list
2508
2509                // Check if the last line in the list block ended with a backslash (hard line break)
2510                // This handles cases where list items use backslash for hard line breaks
2511                let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
2512                    lines[block.end_line - 1].content(content).trim_end().ends_with('\\')
2513                } else {
2514                    false
2515                };
2516
2517                // Calculate minimum indentation for list continuation
2518                // For ordered lists, use the last marker width (e.g., 3 for "1. ", 4 for "10. ")
2519                // For unordered lists like "- ", content starts at column 2, so continuations need at least 2 spaces
2520                let min_continuation_indent = if block.is_ordered {
2521                    current_indent_level + last_marker_width
2522                } else {
2523                    current_indent_level + 2 // Unordered lists need at least 2 spaces (e.g., "- " = 2 chars)
2524                };
2525
2526                if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
2527                    // Indented line or backslash continuation continues the list
2528                    block.end_line = line_num;
2529                } else if line_info.is_blank {
2530                    // Blank line - check if it's internal to the list or ending it
2531                    // We only include blank lines that are followed by more list content
2532                    let mut check_idx = line_idx + 1;
2533                    let mut found_continuation = false;
2534
2535                    // Skip additional blank lines
2536                    while check_idx < lines.len() && lines[check_idx].is_blank {
2537                        check_idx += 1;
2538                    }
2539
2540                    if check_idx < lines.len() {
2541                        let next_line = &lines[check_idx];
2542                        // Check if followed by indented content (list continuation)
2543                        if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
2544                            found_continuation = true;
2545                        }
2546                        // Check if followed by another list item at the same level
2547                        else if !next_line.in_code_block
2548                            && next_line.list_item.is_some()
2549                            && let Some(item) = &next_line.list_item
2550                        {
2551                            let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
2552                                .find(next_line.content(content))
2553                                .map_or(String::new(), |m| m.as_str().to_string());
2554                            if item.marker_column == current_indent_level
2555                                && item.is_ordered == block.is_ordered
2556                                && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
2557                            {
2558                                // Check if there was meaningful content between the list items (unused now)
2559                                // This variable is kept for potential future use but is currently replaced by has_structural_separators
2560                                let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
2561                                    if let Some(between_line) = lines.get(idx) {
2562                                        let between_content = between_line.content(content);
2563                                        let trimmed = between_content.trim();
2564                                        // Skip empty lines
2565                                        if trimmed.is_empty() {
2566                                            return false;
2567                                        }
2568                                        // Check for meaningful content
2569                                        let line_indent = between_content.len() - between_content.trim_start().len();
2570
2571                                        // Structural separators (code fences, headings, etc.) are meaningful and should BREAK lists
2572                                        if trimmed.starts_with("```")
2573                                            || trimmed.starts_with("~~~")
2574                                            || trimmed.starts_with("---")
2575                                            || trimmed.starts_with("***")
2576                                            || trimmed.starts_with("___")
2577                                            || trimmed.starts_with(">")
2578                                            || trimmed.contains('|') // Tables
2579                                            || between_line.heading.is_some()
2580                                        {
2581                                            return true; // These are structural separators - meaningful content that breaks lists
2582                                        }
2583
2584                                        // Only properly indented content continues the list
2585                                        line_indent >= min_continuation_indent
2586                                    } else {
2587                                        false
2588                                    }
2589                                });
2590
2591                                if block.is_ordered {
2592                                    // For ordered lists: don't continue if there are structural separators
2593                                    // Check if there are structural separators between the list items
2594                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2595                                        if let Some(between_line) = lines.get(idx) {
2596                                            let trimmed = between_line.content(content).trim();
2597                                            if trimmed.is_empty() {
2598                                                return false;
2599                                            }
2600                                            // Check for structural separators that break lists
2601                                            trimmed.starts_with("```")
2602                                                || trimmed.starts_with("~~~")
2603                                                || trimmed.starts_with("---")
2604                                                || trimmed.starts_with("***")
2605                                                || trimmed.starts_with("___")
2606                                                || trimmed.starts_with(">")
2607                                                || trimmed.contains('|') // Tables
2608                                                || between_line.heading.is_some()
2609                                        } else {
2610                                            false
2611                                        }
2612                                    });
2613                                    found_continuation = !has_structural_separators;
2614                                } else {
2615                                    // For unordered lists: also check for structural separators
2616                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2617                                        if let Some(between_line) = lines.get(idx) {
2618                                            let trimmed = between_line.content(content).trim();
2619                                            if trimmed.is_empty() {
2620                                                return false;
2621                                            }
2622                                            // Check for structural separators that break lists
2623                                            trimmed.starts_with("```")
2624                                                || trimmed.starts_with("~~~")
2625                                                || trimmed.starts_with("---")
2626                                                || trimmed.starts_with("***")
2627                                                || trimmed.starts_with("___")
2628                                                || trimmed.starts_with(">")
2629                                                || trimmed.contains('|') // Tables
2630                                                || between_line.heading.is_some()
2631                                        } else {
2632                                            false
2633                                        }
2634                                    });
2635                                    found_continuation = !has_structural_separators;
2636                                }
2637                            }
2638                        }
2639                    }
2640
2641                    if found_continuation {
2642                        // Include the blank line in the block
2643                        block.end_line = line_num;
2644                    } else {
2645                        // Blank line ends the list - don't include it
2646                        list_blocks.push(block.clone());
2647                        current_block = None;
2648                    }
2649                } else {
2650                    // Check for lazy continuation - non-indented line immediately after a list item
2651                    // But only if the line has sufficient indentation for the list type
2652                    let min_required_indent = if block.is_ordered {
2653                        current_indent_level + last_marker_width
2654                    } else {
2655                        current_indent_level + 2
2656                    };
2657
2658                    // For lazy continuation to apply, the line must either:
2659                    // 1. Have no indentation (true lazy continuation)
2660                    // 2. Have sufficient indentation for the list type
2661                    // BUT structural separators (headings, code blocks, etc.) should never be lazy continuations
2662                    let line_content = line_info.content(content).trim();
2663                    let is_structural_separator = line_info.heading.is_some()
2664                        || line_content.starts_with("```")
2665                        || line_content.starts_with("~~~")
2666                        || line_content.starts_with("---")
2667                        || line_content.starts_with("***")
2668                        || line_content.starts_with("___")
2669                        || line_content.starts_with(">")
2670                        || (line_content.contains('|')
2671                            && !line_content.contains("](")
2672                            && !line_content.contains("http")
2673                            && (line_content.matches('|').count() > 1
2674                                || line_content.starts_with('|')
2675                                || line_content.ends_with('|'))); // Tables
2676
2677                    // Allow lazy continuation if we're still within the same list block
2678                    // (not just immediately after a list item)
2679                    let is_lazy_continuation = !is_structural_separator
2680                        && !line_info.is_blank
2681                        && (line_info.indent == 0 || line_info.indent >= min_required_indent);
2682
2683                    if is_lazy_continuation {
2684                        // Additional check: if the line starts with uppercase and looks like a new sentence,
2685                        // it's probably not a continuation
2686                        let content_to_check = if !blockquote_prefix.is_empty() {
2687                            // Strip blockquote prefix to check the actual content
2688                            line_info
2689                                .content(content)
2690                                .strip_prefix(&blockquote_prefix)
2691                                .unwrap_or(line_info.content(content))
2692                                .trim()
2693                        } else {
2694                            line_info.content(content).trim()
2695                        };
2696
2697                        let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
2698
2699                        // If it starts with uppercase and the previous line ended with punctuation,
2700                        // it's likely a new paragraph, not a continuation
2701                        if starts_with_uppercase && last_list_item_line > 0 {
2702                            // This looks like a new paragraph
2703                            list_blocks.push(block.clone());
2704                            current_block = None;
2705                        } else {
2706                            // This is a lazy continuation line
2707                            block.end_line = line_num;
2708                        }
2709                    } else {
2710                        // Non-indented, non-blank line that's not a lazy continuation - end the block
2711                        list_blocks.push(block.clone());
2712                        current_block = None;
2713                    }
2714                }
2715            }
2716        }
2717
2718        // Don't forget the last block
2719        if let Some(block) = current_block {
2720            list_blocks.push(block);
2721        }
2722
2723        // Merge adjacent blocks that should be one
2724        merge_adjacent_list_blocks(content, &mut list_blocks, lines);
2725
2726        list_blocks
2727    }
2728
2729    /// Compute character frequency for fast content analysis
2730    fn compute_char_frequency(content: &str) -> CharFrequency {
2731        let mut frequency = CharFrequency::default();
2732
2733        for ch in content.chars() {
2734            match ch {
2735                '#' => frequency.hash_count += 1,
2736                '*' => frequency.asterisk_count += 1,
2737                '_' => frequency.underscore_count += 1,
2738                '-' => frequency.hyphen_count += 1,
2739                '+' => frequency.plus_count += 1,
2740                '>' => frequency.gt_count += 1,
2741                '|' => frequency.pipe_count += 1,
2742                '[' => frequency.bracket_count += 1,
2743                '`' => frequency.backtick_count += 1,
2744                '<' => frequency.lt_count += 1,
2745                '!' => frequency.exclamation_count += 1,
2746                '\n' => frequency.newline_count += 1,
2747                _ => {}
2748            }
2749        }
2750
2751        frequency
2752    }
2753
2754    /// Parse HTML tags in the content
2755    fn parse_html_tags(
2756        content: &str,
2757        lines: &[LineInfo],
2758        code_blocks: &[(usize, usize)],
2759        flavor: MarkdownFlavor,
2760    ) -> Vec<HtmlTag> {
2761        static HTML_TAG_REGEX: LazyLock<regex::Regex> =
2762            LazyLock::new(|| regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap());
2763
2764        let mut html_tags = Vec::with_capacity(content.matches('<').count());
2765
2766        for cap in HTML_TAG_REGEX.captures_iter(content) {
2767            let full_match = cap.get(0).unwrap();
2768            let match_start = full_match.start();
2769            let match_end = full_match.end();
2770
2771            // Skip if in code block
2772            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2773                continue;
2774            }
2775
2776            let is_closing = !cap.get(1).unwrap().as_str().is_empty();
2777            let tag_name_original = cap.get(2).unwrap().as_str();
2778            let tag_name = tag_name_original.to_lowercase();
2779            let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
2780
2781            // Skip JSX components in MDX files (tags starting with uppercase letter)
2782            // JSX components like <Chart />, <MyComponent> should not be treated as HTML
2783            if flavor.supports_jsx() && tag_name_original.chars().next().is_some_and(|c| c.is_uppercase()) {
2784                continue;
2785            }
2786
2787            // Find which line this tag is on
2788            let mut line_num = 1;
2789            let mut col_start = match_start;
2790            let mut col_end = match_end;
2791            for (idx, line_info) in lines.iter().enumerate() {
2792                if match_start >= line_info.byte_offset {
2793                    line_num = idx + 1;
2794                    col_start = match_start - line_info.byte_offset;
2795                    col_end = match_end - line_info.byte_offset;
2796                } else {
2797                    break;
2798                }
2799            }
2800
2801            html_tags.push(HtmlTag {
2802                line: line_num,
2803                start_col: col_start,
2804                end_col: col_end,
2805                byte_offset: match_start,
2806                byte_end: match_end,
2807                tag_name,
2808                is_closing,
2809                is_self_closing,
2810                raw_content: full_match.as_str().to_string(),
2811            });
2812        }
2813
2814        html_tags
2815    }
2816
2817    /// Parse emphasis spans in the content
2818    fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
2819        static EMPHASIS_REGEX: LazyLock<regex::Regex> =
2820            LazyLock::new(|| regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap());
2821
2822        let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
2823
2824        for cap in EMPHASIS_REGEX.captures_iter(content) {
2825            let full_match = cap.get(0).unwrap();
2826            let match_start = full_match.start();
2827            let match_end = full_match.end();
2828
2829            // Skip if in code block
2830            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2831                continue;
2832            }
2833
2834            let opening_markers = cap.get(1).unwrap().as_str();
2835            let content_part = cap.get(2).unwrap().as_str();
2836            let closing_markers = cap.get(3).unwrap().as_str();
2837
2838            // Validate matching markers
2839            if opening_markers.chars().next() != closing_markers.chars().next()
2840                || opening_markers.len() != closing_markers.len()
2841            {
2842                continue;
2843            }
2844
2845            let marker = opening_markers.chars().next().unwrap();
2846            let marker_count = opening_markers.len();
2847
2848            // Find which line this emphasis is on
2849            let mut line_num = 1;
2850            let mut col_start = match_start;
2851            let mut col_end = match_end;
2852            for (idx, line_info) in lines.iter().enumerate() {
2853                if match_start >= line_info.byte_offset {
2854                    line_num = idx + 1;
2855                    col_start = match_start - line_info.byte_offset;
2856                    col_end = match_end - line_info.byte_offset;
2857                } else {
2858                    break;
2859                }
2860            }
2861
2862            emphasis_spans.push(EmphasisSpan {
2863                line: line_num,
2864                start_col: col_start,
2865                end_col: col_end,
2866                byte_offset: match_start,
2867                byte_end: match_end,
2868                marker,
2869                marker_count,
2870                content: content_part.to_string(),
2871            });
2872        }
2873
2874        emphasis_spans
2875    }
2876
2877    /// Parse table rows in the content
2878    fn parse_table_rows(content: &str, lines: &[LineInfo]) -> Vec<TableRow> {
2879        let mut table_rows = Vec::with_capacity(lines.len() / 20);
2880
2881        for (line_idx, line_info) in lines.iter().enumerate() {
2882            // Skip lines in code blocks or blank lines
2883            if line_info.in_code_block || line_info.is_blank {
2884                continue;
2885            }
2886
2887            let line = line_info.content(content);
2888            let line_num = line_idx + 1;
2889
2890            // Check if this line contains pipes (potential table row)
2891            if !line.contains('|') {
2892                continue;
2893            }
2894
2895            // Count columns by splitting on pipes
2896            let parts: Vec<&str> = line.split('|').collect();
2897            let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
2898
2899            // Check if this is a separator row
2900            let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
2901            let mut column_alignments = Vec::new();
2902
2903            if is_separator {
2904                for part in &parts[1..parts.len() - 1] {
2905                    // Skip first and last empty parts
2906                    let trimmed = part.trim();
2907                    let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
2908                        "center".to_string()
2909                    } else if trimmed.ends_with(':') {
2910                        "right".to_string()
2911                    } else if trimmed.starts_with(':') {
2912                        "left".to_string()
2913                    } else {
2914                        "none".to_string()
2915                    };
2916                    column_alignments.push(alignment);
2917                }
2918            }
2919
2920            table_rows.push(TableRow {
2921                line: line_num,
2922                is_separator,
2923                column_count,
2924                column_alignments,
2925            });
2926        }
2927
2928        table_rows
2929    }
2930
2931    /// Parse bare URLs and emails in the content
2932    fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
2933        let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
2934
2935        // Check for bare URLs (not in angle brackets or markdown links)
2936        for cap in BARE_URL_PATTERN.captures_iter(content) {
2937            let full_match = cap.get(0).unwrap();
2938            let match_start = full_match.start();
2939            let match_end = full_match.end();
2940
2941            // Skip if in code block
2942            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2943                continue;
2944            }
2945
2946            // Skip if already in angle brackets or markdown links
2947            let preceding_char = if match_start > 0 {
2948                content.chars().nth(match_start - 1)
2949            } else {
2950                None
2951            };
2952            let following_char = content.chars().nth(match_end);
2953
2954            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2955                continue;
2956            }
2957            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2958                continue;
2959            }
2960
2961            let url = full_match.as_str();
2962            let url_type = if url.starts_with("https://") {
2963                "https"
2964            } else if url.starts_with("http://") {
2965                "http"
2966            } else if url.starts_with("ftp://") {
2967                "ftp"
2968            } else {
2969                "other"
2970            };
2971
2972            // Find which line this URL is on
2973            let mut line_num = 1;
2974            let mut col_start = match_start;
2975            let mut col_end = match_end;
2976            for (idx, line_info) in lines.iter().enumerate() {
2977                if match_start >= line_info.byte_offset {
2978                    line_num = idx + 1;
2979                    col_start = match_start - line_info.byte_offset;
2980                    col_end = match_end - line_info.byte_offset;
2981                } else {
2982                    break;
2983                }
2984            }
2985
2986            bare_urls.push(BareUrl {
2987                line: line_num,
2988                start_col: col_start,
2989                end_col: col_end,
2990                byte_offset: match_start,
2991                byte_end: match_end,
2992                url: url.to_string(),
2993                url_type: url_type.to_string(),
2994            });
2995        }
2996
2997        // Check for bare email addresses
2998        for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
2999            let full_match = cap.get(0).unwrap();
3000            let match_start = full_match.start();
3001            let match_end = full_match.end();
3002
3003            // Skip if in code block
3004            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3005                continue;
3006            }
3007
3008            // Skip if already in angle brackets or markdown links
3009            let preceding_char = if match_start > 0 {
3010                content.chars().nth(match_start - 1)
3011            } else {
3012                None
3013            };
3014            let following_char = content.chars().nth(match_end);
3015
3016            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
3017                continue;
3018            }
3019            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
3020                continue;
3021            }
3022
3023            let email = full_match.as_str();
3024
3025            // Find which line this email is on
3026            let mut line_num = 1;
3027            let mut col_start = match_start;
3028            let mut col_end = match_end;
3029            for (idx, line_info) in lines.iter().enumerate() {
3030                if match_start >= line_info.byte_offset {
3031                    line_num = idx + 1;
3032                    col_start = match_start - line_info.byte_offset;
3033                    col_end = match_end - line_info.byte_offset;
3034                } else {
3035                    break;
3036                }
3037            }
3038
3039            bare_urls.push(BareUrl {
3040                line: line_num,
3041                start_col: col_start,
3042                end_col: col_end,
3043                byte_offset: match_start,
3044                byte_end: match_end,
3045                url: email.to_string(),
3046                url_type: "email".to_string(),
3047            });
3048        }
3049
3050        bare_urls
3051    }
3052}
3053
3054/// Merge adjacent list blocks that should be treated as one
3055fn merge_adjacent_list_blocks(content: &str, list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
3056    if list_blocks.len() < 2 {
3057        return;
3058    }
3059
3060    let mut merger = ListBlockMerger::new(content, lines);
3061    *list_blocks = merger.merge(list_blocks);
3062}
3063
/// Helper struct to manage the complex logic of merging list blocks
///
/// Borrows the document text and its per-line metadata for the duration of a
/// merge pass; it owns no data of its own.
struct ListBlockMerger<'a> {
    // Full document text; line contents are resolved against it via `LineInfo::content`.
    content: &'a str,
    // Per-line metadata, indexed by 0-based line index (line number minus one).
    lines: &'a [LineInfo],
}
3069
3070impl<'a> ListBlockMerger<'a> {
3071    fn new(content: &'a str, lines: &'a [LineInfo]) -> Self {
3072        Self { content, lines }
3073    }
3074
3075    fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
3076        let mut merged = Vec::with_capacity(list_blocks.len());
3077        let mut current = list_blocks[0].clone();
3078
3079        for next in list_blocks.iter().skip(1) {
3080            if self.should_merge_blocks(&current, next) {
3081                current = self.merge_two_blocks(current, next);
3082            } else {
3083                merged.push(current);
3084                current = next.clone();
3085            }
3086        }
3087
3088        merged.push(current);
3089        merged
3090    }
3091
3092    /// Determine if two adjacent list blocks should be merged
3093    fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
3094        // Basic compatibility checks
3095        if !self.blocks_are_compatible(current, next) {
3096            return false;
3097        }
3098
3099        // Check spacing and content between blocks
3100        let spacing = self.analyze_spacing_between(current, next);
3101        match spacing {
3102            BlockSpacing::Consecutive => true,
3103            BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
3104            BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
3105                self.can_merge_with_content_between(current, next)
3106            }
3107        }
3108    }
3109
3110    /// Check if blocks have compatible structure for merging
3111    fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
3112        current.is_ordered == next.is_ordered
3113            && current.blockquote_prefix == next.blockquote_prefix
3114            && current.nesting_level == next.nesting_level
3115    }
3116
3117    /// Analyze the spacing between two list blocks
3118    fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
3119        let gap = next.start_line - current.end_line;
3120
3121        match gap {
3122            1 => BlockSpacing::Consecutive,
3123            2 => BlockSpacing::SingleBlank,
3124            _ if gap > 2 => {
3125                if self.has_only_blank_lines_between(current, next) {
3126                    BlockSpacing::MultipleBlanks
3127                } else {
3128                    BlockSpacing::ContentBetween
3129                }
3130            }
3131            _ => BlockSpacing::Consecutive, // gap == 0, overlapping (shouldn't happen)
3132        }
3133    }
3134
3135    /// Check if unordered lists can be merged with a single blank line between
3136    fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3137        // Check if there are structural separators between the blocks
3138        // If has_meaningful_content_between returns true, it means there are structural separators
3139        if has_meaningful_content_between(self.content, current, next, self.lines) {
3140            return false; // Structural separators prevent merging
3141        }
3142
3143        // Only merge unordered lists with same marker across single blank
3144        !current.is_ordered && current.marker == next.marker
3145    }
3146
3147    /// Check if ordered lists can be merged when there's content between them
3148    fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3149        // Do not merge lists if there are structural separators between them
3150        if has_meaningful_content_between(self.content, current, next, self.lines) {
3151            return false; // Structural separators prevent merging
3152        }
3153
3154        // Only consider merging ordered lists if there's no structural content between
3155        current.is_ordered && next.is_ordered
3156    }
3157
3158    /// Check if there are only blank lines between blocks
3159    fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3160        for line_num in (current.end_line + 1)..next.start_line {
3161            if let Some(line_info) = self.lines.get(line_num - 1)
3162                && !line_info.content(self.content).trim().is_empty()
3163            {
3164                return false;
3165            }
3166        }
3167        true
3168    }
3169
3170    /// Merge two compatible list blocks into one
3171    fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
3172        current.end_line = next.end_line;
3173        current.item_lines.extend_from_slice(&next.item_lines);
3174
3175        // Update max marker width
3176        current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
3177
3178        // Handle marker consistency for unordered lists
3179        if !current.is_ordered && self.markers_differ(&current, next) {
3180            current.marker = None; // Mixed markers
3181        }
3182
3183        current
3184    }
3185
3186    /// Check if two blocks have different markers
3187    fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
3188        current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
3189    }
3190}
3191
/// Types of spacing between list blocks
///
/// Classifies what lies between the last line of one list block and the first
/// line of the next; `ListBlockMerger::should_merge_blocks` picks its merge rule
/// from this value.
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    Consecutive,    // No gap between blocks (next block starts on the following line)
    SingleBlank,    // One blank line between blocks (exactly one line in between)
    MultipleBlanks, // Multiple blank lines but no content
    ContentBetween, // Content exists between blocks (at least one non-blank line)
}
3200
3201/// Check if there's meaningful content (not just blank lines) between two list blocks
3202fn has_meaningful_content_between(content: &str, current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
3203    // Check lines between current.end_line and next.start_line
3204    for line_num in (current.end_line + 1)..next.start_line {
3205        if let Some(line_info) = lines.get(line_num - 1) {
3206            // Convert to 0-indexed
3207            let trimmed = line_info.content(content).trim();
3208
3209            // Skip empty lines
3210            if trimmed.is_empty() {
3211                continue;
3212            }
3213
3214            // Check for structural separators that should separate lists (CommonMark compliant)
3215
3216            // Headings separate lists
3217            if line_info.heading.is_some() {
3218                return true; // Has meaningful content - headings separate lists
3219            }
3220
3221            // Horizontal rules separate lists (---, ***, ___)
3222            if is_horizontal_rule(trimmed) {
3223                return true; // Has meaningful content - horizontal rules separate lists
3224            }
3225
3226            // Tables separate lists (lines containing | but not in URLs or code)
3227            // Simple heuristic: tables typically have | at start/end or multiple |
3228            if trimmed.contains('|') && trimmed.len() > 1 {
3229                // Don't treat URLs with | as tables
3230                if !trimmed.contains("](") && !trimmed.contains("http") {
3231                    // More robust check: tables usually have multiple | or | at edges
3232                    let pipe_count = trimmed.matches('|').count();
3233                    if pipe_count > 1 || trimmed.starts_with('|') || trimmed.ends_with('|') {
3234                        return true; // Has meaningful content - tables separate lists
3235                    }
3236                }
3237            }
3238
3239            // Blockquotes separate lists
3240            if trimmed.starts_with('>') {
3241                return true; // Has meaningful content - blockquotes separate lists
3242            }
3243
3244            // Code block fences separate lists (unless properly indented as list content)
3245            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
3246                let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
3247
3248                // Check if this code block is properly indented as list continuation
3249                let min_continuation_indent = if current.is_ordered {
3250                    current.nesting_level + current.max_marker_width + 1 // +1 for space after marker
3251                } else {
3252                    current.nesting_level + 2
3253                };
3254
3255                if line_indent < min_continuation_indent {
3256                    // This is a standalone code block that separates lists
3257                    return true; // Has meaningful content - standalone code blocks separate lists
3258                }
3259            }
3260
3261            // Check if this line has proper indentation for list continuation
3262            let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
3263
3264            // Calculate minimum indentation needed to be list continuation
3265            let min_indent = if current.is_ordered {
3266                current.nesting_level + current.max_marker_width
3267            } else {
3268                current.nesting_level + 2
3269            };
3270
3271            // If the line is not indented enough to be list continuation, it's meaningful content
3272            if line_indent < min_indent {
3273                return true; // Has meaningful content - content not indented as list continuation
3274            }
3275
3276            // If we reach here, the line is properly indented as list continuation
3277            // Continue checking other lines
3278        }
3279    }
3280
3281    // Only blank lines or properly indented list continuation content between blocks
3282    false
3283}
3284
/// Check if a line is a horizontal rule (---, ***, ___)
///
/// `trimmed` is expected to have no surrounding whitespace. The line qualifies
/// when it starts with `-`, `*` or `_`, contains at least three of that same
/// character, and holds nothing else except spaces and tabs (so `- - -` counts,
/// `-*-` and `--- text` do not).
fn is_horizontal_rule(trimmed: &str) -> bool {
    // Three single-byte markers are the minimum, so shorter input can be rejected early.
    if trimmed.len() < 3 {
        return false;
    }

    let marker = match trimmed.chars().next() {
        Some(first @ ('-' | '*' | '_')) => first,
        _ => return false,
    };

    // Single pass over the characters; no intermediate buffer is needed.
    let mut marker_count = 0usize;
    for ch in trimmed.chars() {
        if ch == marker {
            marker_count += 1;
        } else if ch != ' ' && ch != '\t' {
            return false; // Non-matching, non-whitespace character
        }
    }

    marker_count >= 3
}
3308
/// Unit tests for `LintContext`: line offsets, offset-to-line/column mapping,
/// per-line metadata, list item detection and MDX ESM block detection.
#[cfg(test)]
mod tests {
    use super::*;

    // Empty input: a single line offset (0) is recorded, but no line entries exist.
    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.lines.len(), 0);
    }

    // Single line without a trailing newline: offsets map to 1-indexed (line, column).
    #[test]
    fn test_single_line() {
        let ctx = LintContext::new("# Hello", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "# Hello");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(3), (1, 4));
    }

    // Several lines including a blank one: each line offset is the byte index just
    // after the preceding newline.
    #[test]
    fn test_multi_line() {
        let content = "# Title\n\nSecond line\nThird line";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);
        // Test offset to line/col
        assert_eq!(ctx.offset_to_line_col(0), (1, 1)); // start
        assert_eq!(ctx.offset_to_line_col(8), (2, 1)); // start of blank line
        assert_eq!(ctx.offset_to_line_col(9), (3, 1)); // start of 'Second line'
        assert_eq!(ctx.offset_to_line_col(15), (3, 7)); // middle of 'Second line'
        assert_eq!(ctx.offset_to_line_col(21), (4, 1)); // start of 'Third line'
    }

    // Per-line metadata (content, byte offset, indent, blank flag) plus the
    // 1-indexed lookup helpers `line_to_byte_offset` and `line_info`.
    #[test]
    fn test_line_info() {
        let content = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        // Test line info
        assert_eq!(ctx.lines.len(), 7);

        // Line 1: "# Title"
        let line1 = &ctx.lines[0];
        assert_eq!(line1.content(ctx.content), "# Title");
        assert_eq!(line1.byte_offset, 0);
        assert_eq!(line1.indent, 0);
        assert!(!line1.is_blank);
        assert!(!line1.in_code_block);
        assert!(line1.list_item.is_none());

        // Line 2: "    indented"
        let line2 = &ctx.lines[1];
        assert_eq!(line2.content(ctx.content), "    indented");
        assert_eq!(line2.byte_offset, 8);
        assert_eq!(line2.indent, 4);
        assert!(!line2.is_blank);

        // Line 3: "" (blank)
        let line3 = &ctx.lines[2];
        assert_eq!(line3.content(ctx.content), "");
        assert!(line3.is_blank);

        // Test helper methods
        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
        assert_eq!(ctx.line_info(1).map(|l| l.indent), Some(0));
        assert_eq!(ctx.line_info(2).map(|l| l.indent), Some(4));
    }

    // List item detection for unordered, nested and ordered markers, and for a
    // plain paragraph line that must not be reported as a list item.
    // NOTE(review): line 4 ("   2) Nested ordered") is part of the fixture but has no assertion.
    #[test]
    fn test_list_item_detection() {
        let content = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        // Line 1: "- Unordered item"
        let line1 = &ctx.lines[0];
        assert!(line1.list_item.is_some());
        let list1 = line1.list_item.as_ref().unwrap();
        assert_eq!(list1.marker, "-");
        assert!(!list1.is_ordered);
        assert_eq!(list1.marker_column, 0);
        assert_eq!(list1.content_column, 2);

        // Line 2: "  * Nested item"
        let line2 = &ctx.lines[1];
        assert!(line2.list_item.is_some());
        let list2 = line2.list_item.as_ref().unwrap();
        assert_eq!(list2.marker, "*");
        assert_eq!(list2.marker_column, 2);

        // Line 3: "1. Ordered item"
        let line3 = &ctx.lines[2];
        assert!(line3.list_item.is_some());
        let list3 = line3.list_item.as_ref().unwrap();
        assert_eq!(list3.marker, "1.");
        assert!(list3.is_ordered);
        assert_eq!(list3.number, Some(1));

        // Line 6: "Not a list"
        let line6 = &ctx.lines[5];
        assert!(line6.list_item.is_none());
    }

    // Offsets sitting on a newline map to the column just past the line's last
    // character; the offset one past the end of the content maps onto the last line.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let content = "a\nb\nc";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        // line_offsets: [0, 2, 4]
        assert_eq!(ctx.offset_to_line_col(0), (1, 1)); // 'a'
        assert_eq!(ctx.offset_to_line_col(1), (1, 2)); // after 'a'
        assert_eq!(ctx.offset_to_line_col(2), (2, 1)); // 'b'
        assert_eq!(ctx.offset_to_line_col(3), (2, 2)); // after 'b'
        assert_eq!(ctx.offset_to_line_col(4), (3, 1)); // 'c'
        assert_eq!(ctx.offset_to_line_col(5), (3, 2)); // after 'c'
    }

    // MDX flavor: leading import/export lines are flagged as ESM, the lines that
    // follow (blank, heading, text) are not.
    #[test]
    fn test_mdx_esm_blocks() {
        let content = r##"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall

In {year}, the snowfall was above average.
It was followed by a warm spring which caused
flood conditions in many of the nearby rivers.

<Chart color="#fcb32c" year={year} />
"##;

        let ctx = LintContext::new(content, MarkdownFlavor::MDX);

        // Check that lines 1 and 2 are marked as ESM blocks
        assert_eq!(ctx.lines.len(), 10);
        assert!(ctx.lines[0].in_esm_block, "Line 1 (import) should be in_esm_block");
        assert!(ctx.lines[1].in_esm_block, "Line 2 (export) should be in_esm_block");
        assert!(!ctx.lines[2].in_esm_block, "Line 3 (blank) should NOT be in_esm_block");
        assert!(
            !ctx.lines[3].in_esm_block,
            "Line 4 (heading) should NOT be in_esm_block"
        );
        assert!(!ctx.lines[4].in_esm_block, "Line 5 (blank) should NOT be in_esm_block");
        assert!(!ctx.lines[5].in_esm_block, "Line 6 (text) should NOT be in_esm_block");
    }

    // Standard flavor: the same import/export lines are ordinary text, never ESM.
    #[test]
    fn test_mdx_esm_blocks_not_detected_in_standard_flavor() {
        let content = r#"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall
"#;

        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        // ESM blocks should NOT be detected in Standard flavor
        assert!(
            !ctx.lines[0].in_esm_block,
            "Line 1 should NOT be in_esm_block in Standard flavor"
        );
        assert!(
            !ctx.lines[1].in_esm_block,
            "Line 2 should NOT be in_esm_block in Standard flavor"
        );
    }
}
rumdl_lib/lint_context.rs

rumdl_lib/
lint_context.rs