rumdl_lib/
lint_context.rs

1use crate::config::MarkdownFlavor;
2use crate::rules::front_matter_utils::FrontMatterUtils;
3use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
4use pulldown_cmark::{BrokenLink, Event, LinkType, Options, Parser, Tag, TagEnd};
5use regex::Regex;
6use std::borrow::Cow;
7use std::sync::LazyLock;
8
// Matches inline links `[text](url "title")` and reference-style links `[text][ref]`.
// Flags: `s` lets `.` (used by the `\\.` escape branch) match newlines; `x` is verbose
// mode, so the whitespace and `#` comments inside the pattern are ignored by the engine.
// Capture groups: 1 = link text, 2 = URL in angle brackets, 3 = bare URL,
// 4 / 5 = title in double / single quotes, 6 = reference ID.
static LINK_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
        \[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]          # Link text in group 1 (handles nested brackets)
        (?:
            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
            |
            \[([^\]]*)\]      # Reference ID in group 6
        )"#
    ).unwrap()
});
22
// Matches images: same shape as LINK_PATTERN but requires a leading `!`.
// Flags: `s` lets `.` match newlines; `x` is verbose mode (pattern whitespace and
// `#` comments are ignored by the engine).
// Capture groups: 1 = alt text, 2 = URL in angle brackets, 3 = bare URL,
// 4 / 5 = title in double / single quotes, 6 = reference ID.
static IMAGE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
        !\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]         # Alt text in group 1 (handles nested brackets)
        (?:
            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
            |
            \[([^\]]*)\]      # Reference ID in group 6
        )"#
    ).unwrap()
});
36
// Reference definition pattern: `[id]: url "title"` on a single line.
// `(?m)` makes `^` / `$` match at line boundaries. Up to three leading spaces are allowed.
// Capture groups: 1 = reference ID, 2 = URL (a run of non-whitespace),
// 3 / 4 = optional title in double / single quotes.
static REF_DEF_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#).unwrap());
40
// Pattern for bare URLs with an `http`, `https`, or `ftp` scheme (scheme is capture group 1).
// The URL body stops at whitespace and at any of: < > [ ] ( ) \ ' " `
// Optional port, path, query, and fragment parts follow the host part.
static BARE_URL_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
    ).unwrap()
});
47
// Pattern for email addresses: local part, `@`, domain, and a final dot followed by
// at least two ASCII letters. The pattern is unanchored, so it can match inside longer text.
static BARE_EMAIL_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap());
51
// Pattern for blockquote prefix in parse_list_blocks.
// Anchored at the start of the input; group 1 captures optional leading whitespace,
// one or more `>` characters, and any whitespace that follows them.
static BLOCKQUOTE_PREFIX_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*>+\s*)").unwrap());
54
/// Pre-computed information about a line
///
/// One value per source line is stored in `LintContext::lines`; use
/// `LintContext::line_info` for 1-indexed lookup and `LineInfo::content` to get the text.
#[derive(Debug, Clone)]
pub struct LineInfo {
    /// Byte offset where this line starts in the document
    pub byte_offset: usize,
    /// Length of the line in bytes (without newline)
    pub byte_len: usize,
    /// Number of leading spaces/tabs
    pub indent: usize,
    /// Whether the line is blank (empty or only whitespace)
    pub is_blank: bool,
    /// Whether this line is inside a code block
    pub in_code_block: bool,
    /// Whether this line is inside front matter
    pub in_front_matter: bool,
    /// Whether this line is inside an HTML block
    pub in_html_block: bool,
    /// Whether this line is inside an HTML comment
    pub in_html_comment: bool,
    /// List item information if this line starts a list item
    pub list_item: Option<ListItemInfo>,
    /// Heading information if this line is a heading
    pub heading: Option<HeadingInfo>,
    /// Blockquote information if this line is a blockquote
    pub blockquote: Option<BlockquoteInfo>,
    /// Whether this line is inside a mkdocstrings autodoc block
    /// (autodoc ranges are only detected when the flavor is MkDocs)
    pub in_mkdocstrings: bool,
    /// Whether this line is part of an ESM import/export block (MDX only)
    pub in_esm_block: bool,
}
85
86impl LineInfo {
87    /// Get the line content as a string slice from the source document
88    pub fn content<'a>(&self, source: &'a str) -> &'a str {
89        &source[self.byte_offset..self.byte_offset + self.byte_len]
90    }
91}
92
/// Information about a list item
///
/// Attached to the line that starts the item via `LineInfo::list_item`.
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// The marker used (*, -, +, or number with . or ))
    pub marker: String,
    /// Whether it's ordered (true) or unordered (false)
    pub is_ordered: bool,
    /// The number for ordered lists
    pub number: Option<usize>,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where content after marker starts
    pub content_column: usize,
}
107
/// Heading style type
///
/// Derives `Eq` in addition to `PartialEq`: the enum has no payload, so equality is
/// total and the type can be used wherever an `Eq` bound is required.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum HeadingStyle {
    /// ATX style heading (# Heading)
    ATX,
    /// Setext style heading with = underline
    Setext1,
    /// Setext style heading with - underline
    Setext2,
}
118
/// Parsed link information
///
/// Stored in `LintContext::links`. The `Cow` fields may either borrow from the
/// document (lifetime `'a`) or own their data.
#[derive(Debug, Clone)]
pub struct ParsedLink<'a> {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Link text
    pub text: Cow<'a, str>,
    /// Link URL or reference
    pub url: Cow<'a, str>,
    /// Whether this is a reference link [text][ref] vs inline [text](url)
    pub is_reference: bool,
    /// Reference ID for reference links
    pub reference_id: Option<Cow<'a, str>>,
    /// Link type from pulldown-cmark
    pub link_type: LinkType,
}
143
/// Information about a broken link reported by pulldown-cmark
///
/// Collected into `LintContext::broken_links` alongside the parsed links.
#[derive(Debug, Clone)]
pub struct BrokenLinkInfo {
    /// The reference text that couldn't be resolved
    pub reference: String,
    /// Byte span in the source document
    pub span: std::ops::Range<usize>,
}
152
/// Parsed image information
///
/// Stored in `LintContext::images`. The `Cow` fields may either borrow from the
/// document (lifetime `'a`) or own their data.
#[derive(Debug, Clone)]
pub struct ParsedImage<'a> {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Alt text
    pub alt_text: Cow<'a, str>,
    /// Image URL or reference
    pub url: Cow<'a, str>,
    /// Whether this is a reference image ![alt][ref] vs inline ![alt](url)
    pub is_reference: bool,
    /// Reference ID for reference images
    pub reference_id: Option<Cow<'a, str>>,
    /// Link type from pulldown-cmark
    pub link_type: LinkType,
}
177
/// Reference definition [ref]: url "title"
///
/// Stored in `LintContext::reference_defs`. `LintContext::get_reference_url` lowercases
/// its argument before comparing it with `id`, and `LintContext::is_in_reference_def`
/// treats `byte_offset..byte_end` as a half-open range.
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// Line number (1-indexed)
    pub line: usize,
    /// Reference ID (normalized to lowercase)
    pub id: String,
    /// URL
    pub url: String,
    /// Optional title
    pub title: Option<String>,
    /// Byte offset where the reference definition starts
    pub byte_offset: usize,
    /// Byte offset where the reference definition ends
    pub byte_end: usize,
}
194
/// Parsed code span information
///
/// Returned by `LintContext::code_spans`. Lookups in `LintContext` treat both
/// `start_col..end_col` and `byte_offset..byte_end` as half-open ranges (end excluded).
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Number of backticks used (1, 2, 3, etc.)
    pub backtick_count: usize,
    /// Content inside the code span (without backticks)
    pub content: String,
}
213
/// Information about a heading
///
/// Attached to the heading's line via `LineInfo::heading`.
#[derive(Debug, Clone)]
pub struct HeadingInfo {
    /// Heading level (1-6 for ATX, 1-2 for Setext)
    pub level: u8,
    /// Style of heading
    pub style: HeadingStyle,
    /// The heading marker (# characters or underline)
    pub marker: String,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where heading text starts
    pub content_column: usize,
    /// The heading text (without markers and without custom ID syntax)
    pub text: String,
    /// Custom header ID if present (e.g., from {#custom-id} syntax)
    pub custom_id: Option<String>,
    /// Original heading text including custom ID syntax
    pub raw_text: String,
    /// Whether it has a closing sequence (for ATX)
    pub has_closing_sequence: bool,
    /// The closing sequence if present
    pub closing_sequence: String,
}
238
/// Information about a blockquote line
///
/// Attached to the line via `LineInfo::blockquote`.
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Nesting level (1 for >, 2 for >>, etc.)
    pub nesting_level: usize,
    /// The indentation before the blockquote marker
    pub indent: String,
    /// Column where the first > starts (0-based)
    pub marker_column: usize,
    /// The blockquote prefix (e.g., "> ", ">> ", etc.)
    pub prefix: String,
    /// Content after the blockquote marker(s)
    pub content: String,
    /// Whether the line has no space after the marker
    pub has_no_space_after_marker: bool,
    /// Whether the line has multiple spaces after the marker
    pub has_multiple_spaces_after_marker: bool,
    /// Whether this is an empty blockquote line needing MD028 fix
    pub needs_md028_fix: bool,
}
259
/// Information about a list block
///
/// Stored in `LintContext::list_blocks`. `LintContext::is_in_list_block` and
/// `LintContext::list_block_for_line` treat `start_line..=end_line` as inclusive.
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// Line number where the list starts (1-indexed)
    pub start_line: usize,
    /// Line number where the list ends (1-indexed)
    pub end_line: usize,
    /// Whether it's ordered or unordered
    pub is_ordered: bool,
    /// The consistent marker for unordered lists (if any)
    pub marker: Option<String>,
    /// Blockquote prefix for this list (empty if not in blockquote)
    pub blockquote_prefix: String,
    /// Lines that are list items within this block
    pub item_lines: Vec<usize>,
    /// Nesting level (0 for top-level lists)
    pub nesting_level: usize,
    /// Maximum marker width seen in this block (e.g., 3 for "1. ", 4 for "10. ")
    pub max_marker_width: usize,
}
280
281use std::sync::{Arc, Mutex};
282
/// Character frequency data for fast content analysis
///
/// Stored in `LintContext::char_frequency` and read by `LintContext::has_char`,
/// `LintContext::char_count`, and the `likely_has_*` helpers. Characters without a
/// counter here are handled by scanning the content instead.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Count of # characters (headings)
    pub hash_count: usize,
    /// Count of * characters (emphasis, lists, horizontal rules)
    pub asterisk_count: usize,
    /// Count of _ characters (emphasis, horizontal rules)
    pub underscore_count: usize,
    /// Count of - characters (lists, horizontal rules, setext headings)
    pub hyphen_count: usize,
    /// Count of + characters (lists)
    pub plus_count: usize,
    /// Count of > characters (blockquotes)
    pub gt_count: usize,
    /// Count of | characters (tables)
    pub pipe_count: usize,
    /// Count of [ characters (links, images)
    pub bracket_count: usize,
    /// Count of ` characters (code spans, code blocks)
    pub backtick_count: usize,
    /// Count of < characters (HTML tags, autolinks)
    pub lt_count: usize,
    /// Count of ! characters (images)
    pub exclamation_count: usize,
    /// Count of newline characters
    pub newline_count: usize,
}
311
/// Pre-parsed HTML tag information
///
/// Returned (behind an `Arc`) by `LintContext::html_tags`.
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Tag name (e.g., "div", "img", "br")
    pub tag_name: String,
    /// Whether it's a closing tag (`</tag>`)
    pub is_closing: bool,
    /// Whether it's self-closing (`<tag />`)
    pub is_self_closing: bool,
    /// Raw tag content
    pub raw_content: String,
}
334
/// Pre-parsed emphasis span information
///
/// Returned (behind an `Arc`) by `LintContext::emphasis_spans`.
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Type of emphasis ('*' or '_')
    pub marker: char,
    /// Number of markers (1 for italic, 2 for bold, 3+ for bold+italic)
    pub marker_count: usize,
    /// Content inside the emphasis
    pub content: String,
}
355
/// Pre-parsed table row information
///
/// Returned (behind an `Arc`) by `LintContext::table_rows`.
#[derive(Debug, Clone)]
pub struct TableRow {
    /// Line number (1-indexed)
    pub line: usize,
    /// Whether this is a separator row (contains only |, -, :, and spaces)
    pub is_separator: bool,
    /// Number of columns (pipe-separated cells)
    pub column_count: usize,
    /// Alignment info from separator row, one string per column
    pub column_alignments: Vec<String>, // "left", "center", "right", "none"
}
368
/// Pre-parsed bare URL information (not in links)
///
/// Returned (behind an `Arc`) by `LintContext::bare_urls`.
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// The URL string
    pub url: String,
    /// Type of URL ("http", "https", "ftp", "email")
    pub url_type: String,
}
387
/// Pre-computed view of a single Markdown document.
///
/// `LintContext::new` fills the public fields eagerly. The `*_cache` fields sit behind a
/// `Mutex` and are filled by their accessor methods on first use; the code span cache is
/// the exception, as `new` already stores the spans it parsed.
pub struct LintContext<'a> {
    // The document text passed to `new`
    pub content: &'a str,
    // Byte offset at which each line starts (the first entry is always 0)
    pub line_offsets: Vec<usize>,
    pub code_blocks: Vec<(usize, usize)>, // Cached code block ranges (not including inline code spans)
    pub lines: Vec<LineInfo>,             // Pre-computed line information
    pub links: Vec<ParsedLink<'a>>,       // Pre-parsed links
    pub images: Vec<ParsedImage<'a>>,     // Pre-parsed images
    pub broken_links: Vec<BrokenLinkInfo>, // Broken/undefined references
    pub reference_defs: Vec<ReferenceDef>, // Reference definitions
    code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>, // Inline code spans (seeded by `new`, see `code_spans()`)
    pub list_blocks: Vec<ListBlock>,      // Pre-parsed list blocks
    pub char_frequency: CharFrequency,    // Character frequency analysis
    html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>, // Lazy-loaded HTML tags
    emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>, // Lazy-loaded emphasis spans
    table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>, // Lazy-loaded table rows
    bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>, // Lazy-loaded bare URLs
    html_comment_ranges: Vec<crate::utils::skip_context::ByteRange>, // Pre-computed HTML comment ranges
    pub table_blocks: Vec<crate::utils::table_utils::TableBlock>, // Pre-computed table blocks
    pub line_index: crate::utils::range_utils::LineIndex<'a>, // Pre-computed line index for byte position calculations
    jinja_ranges: Vec<(usize, usize)>,    // Pre-computed Jinja template ranges ({{ }}, {% %})
    pub flavor: MarkdownFlavor,           // Markdown flavor being used
}
410
/// The four consecutive pieces of a blockquote line, each borrowed from the input.
struct BlockquoteComponents<'a> {
    /// Spaces/tabs before the first `>`
    indent: &'a str,
    /// The unbroken run of `>` characters
    markers: &'a str,
    /// Spaces/tabs directly after the markers
    spaces_after: &'a str,
    /// Everything that remains on the line
    content: &'a str,
}

/// Split a line into blockquote indent, markers, trailing spaces, and content.
///
/// Returns `None` unless the first character after any leading spaces/tabs is `>`.
/// Only a contiguous run of `>` counts as markers: in `"> > x"` the second `>` is
/// part of `content`.
#[inline]
fn parse_blockquote_detailed(line: &str) -> Option<BlockquoteComponents<'_>> {
    fn is_blank(c: char) -> bool {
        matches!(c, ' ' | '\t')
    }

    // Peel the leading whitespace off and recover it by length difference.
    let after_indent = line.trim_start_matches(is_blank);
    let indent = &line[..line.len() - after_indent.len()];

    // Without at least one '>' this is not a blockquote line.
    if !after_indent.starts_with('>') {
        return None;
    }

    let after_markers = after_indent.trim_start_matches('>');
    let markers = &after_indent[..after_indent.len() - after_markers.len()];

    let content = after_markers.trim_start_matches(is_blank);
    let spaces_after = &after_markers[..after_markers.len() - content.len()];

    Some(BlockquoteComponents {
        indent,
        markers,
        spaces_after,
        content,
    })
}
455
456impl<'a> LintContext<'a> {
    /// Build a context for `content` by running every eager analysis pass once.
    ///
    /// The passes run in a fixed order because later ones consume earlier results:
    /// line offsets, code blocks, and HTML comment ranges feed the basic line info; HTML
    /// and ESM block detection run before heading/blockquote detection; code spans are
    /// parsed before links, images, and table blocks, which take them as input.
    ///
    /// When the `RUMDL_PROFILE_QUADRATIC` environment variable can be read, the elapsed
    /// time of each pass is printed to stderr with a `[PROFILE]` prefix.
    pub fn new(content: &'a str, flavor: MarkdownFlavor) -> Self {
        use std::time::Instant;
        // Profiling is opt-in: only whether the variable is readable matters, not its value.
        let profile = std::env::var("RUMDL_PROFILE_QUADRATIC").is_ok();

        // Record the byte offset at which every line starts (line 1 starts at 0).
        let start = Instant::now();
        let mut line_offsets = vec![0];
        for (i, c) in content.char_indices() {
            if c == '\n' {
                line_offsets.push(i + 1);
            }
        }
        if profile {
            eprintln!("[PROFILE] Line offsets: {:?}", start.elapsed());
        }

        // Detect code blocks once and cache them
        let start = Instant::now();
        let code_blocks = CodeBlockUtils::detect_code_blocks(content);
        if profile {
            eprintln!("[PROFILE] Code blocks: {:?}", start.elapsed());
        }

        // Pre-compute HTML comment ranges ONCE for all operations
        let start = Instant::now();
        let html_comment_ranges = crate::utils::skip_context::compute_html_comment_ranges(content);
        if profile {
            eprintln!("[PROFILE] HTML comment ranges: {:?}", start.elapsed());
        }

        // Pre-compute autodoc block ranges for MkDocs flavor (avoids O(n²) scaling)
        let start = Instant::now();
        let autodoc_ranges = if flavor == MarkdownFlavor::MkDocs {
            crate::utils::mkdocstrings_refs::detect_autodoc_block_ranges(content)
        } else {
            Vec::new()
        };
        if profile {
            eprintln!("[PROFILE] Autodoc block ranges: {:?}", start.elapsed());
        }

        // Pre-compute line information (without headings/blockquotes yet)
        let start = Instant::now();
        let mut lines = Self::compute_basic_line_info(
            content,
            &line_offsets,
            &code_blocks,
            flavor,
            &html_comment_ranges,
            &autodoc_ranges,
        );
        if profile {
            eprintln!("[PROFILE] Basic line info: {:?}", start.elapsed());
        }

        // Detect HTML blocks BEFORE heading detection
        let start = Instant::now();
        Self::detect_html_blocks(content, &mut lines);
        if profile {
            eprintln!("[PROFILE] HTML blocks: {:?}", start.elapsed());
        }

        // Detect ESM import/export blocks in MDX files BEFORE heading detection
        let start = Instant::now();
        Self::detect_esm_blocks(content, &mut lines, flavor);
        if profile {
            eprintln!("[PROFILE] ESM blocks: {:?}", start.elapsed());
        }

        // Now detect headings and blockquotes
        let start = Instant::now();
        Self::detect_headings_and_blockquotes(content, &mut lines, flavor, &html_comment_ranges);
        if profile {
            eprintln!("[PROFILE] Headings & blockquotes: {:?}", start.elapsed());
        }

        // Parse code spans early so we can exclude them from link/image parsing
        let start = Instant::now();
        let code_spans = Self::parse_code_spans(content, &lines);
        if profile {
            eprintln!("[PROFILE] Code spans: {:?}", start.elapsed());
        }

        // Parse links, images, references, and list blocks
        let start = Instant::now();
        let (links, broken_links) =
            Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor, &html_comment_ranges);
        if profile {
            eprintln!("[PROFILE] Links: {:?}", start.elapsed());
        }

        let start = Instant::now();
        let images = Self::parse_images(content, &lines, &code_blocks, &code_spans, &html_comment_ranges);
        if profile {
            eprintln!("[PROFILE] Images: {:?}", start.elapsed());
        }

        let start = Instant::now();
        let reference_defs = Self::parse_reference_defs(content, &lines);
        if profile {
            eprintln!("[PROFILE] Reference defs: {:?}", start.elapsed());
        }

        let start = Instant::now();
        let list_blocks = Self::parse_list_blocks(content, &lines);
        if profile {
            eprintln!("[PROFILE] List blocks: {:?}", start.elapsed());
        }

        // Compute character frequency for fast content analysis
        let start = Instant::now();
        let char_frequency = Self::compute_char_frequency(content);
        if profile {
            eprintln!("[PROFILE] Char frequency: {:?}", start.elapsed());
        }

        // Pre-compute table blocks for rules that need them (MD013, MD055, MD056, MD058, MD060)
        let start = Instant::now();
        let table_blocks = crate::utils::table_utils::TableUtils::find_table_blocks_with_code_info(
            content,
            &code_blocks,
            &code_spans,
            &html_comment_ranges,
        );
        if profile {
            eprintln!("[PROFILE] Table blocks: {:?}", start.elapsed());
        }

        // Pre-compute LineIndex once for all rules (eliminates 46x content cloning)
        let start = Instant::now();
        let line_index = crate::utils::range_utils::LineIndex::new(content);
        if profile {
            eprintln!("[PROFILE] Line index: {:?}", start.elapsed());
        }

        // Pre-compute Jinja template ranges once for all rules (eliminates O(n×m) in MD011)
        let start = Instant::now();
        let jinja_ranges = crate::utils::jinja_utils::find_jinja_ranges(content);
        if profile {
            eprintln!("[PROFILE] Jinja ranges: {:?}", start.elapsed());
        }

        Self {
            content,
            line_offsets,
            code_blocks,
            lines,
            links,
            images,
            broken_links,
            reference_defs,
            // Code spans were already parsed above, so the cache starts out populated.
            code_spans_cache: Mutex::new(Some(Arc::new(code_spans))),
            list_blocks,
            char_frequency,
            // The remaining caches start empty and are filled by their accessors.
            html_tags_cache: Mutex::new(None),
            emphasis_spans_cache: Mutex::new(None),
            table_rows_cache: Mutex::new(None),
            bare_urls_cache: Mutex::new(None),
            html_comment_ranges,
            table_blocks,
            line_index,
            jinja_ranges,
            flavor,
        }
    }
621
622    /// Get code spans - computed lazily on first access
623    pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
624        let mut cache = self.code_spans_cache.lock().expect("Code spans cache mutex poisoned");
625
626        Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_code_spans(self.content, &self.lines))))
627    }
628
629    /// Get HTML comment ranges - pre-computed during LintContext construction
630    pub fn html_comment_ranges(&self) -> &[crate::utils::skip_context::ByteRange] {
631        &self.html_comment_ranges
632    }
633
634    /// Get HTML tags - computed lazily on first access
635    pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
636        let mut cache = self.html_tags_cache.lock().expect("HTML tags cache mutex poisoned");
637
638        Arc::clone(cache.get_or_insert_with(|| {
639            Arc::new(Self::parse_html_tags(
640                self.content,
641                &self.lines,
642                &self.code_blocks,
643                self.flavor,
644            ))
645        }))
646    }
647
648    /// Get emphasis spans - computed lazily on first access
649    pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
650        let mut cache = self
651            .emphasis_spans_cache
652            .lock()
653            .expect("Emphasis spans cache mutex poisoned");
654
655        Arc::clone(
656            cache.get_or_insert_with(|| {
657                Arc::new(Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks))
658            }),
659        )
660    }
661
662    /// Get table rows - computed lazily on first access
663    pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
664        let mut cache = self.table_rows_cache.lock().expect("Table rows cache mutex poisoned");
665
666        Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_table_rows(self.content, &self.lines))))
667    }
668
669    /// Get bare URLs - computed lazily on first access
670    pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
671        let mut cache = self.bare_urls_cache.lock().expect("Bare URLs cache mutex poisoned");
672
673        Arc::clone(
674            cache.get_or_insert_with(|| Arc::new(Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks))),
675        )
676    }
677
678    /// Map a byte offset to (line, column)
679    pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
680        match self.line_offsets.binary_search(&offset) {
681            Ok(line) => (line + 1, 1),
682            Err(line) => {
683                let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
684                (line, offset - line_start + 1)
685            }
686        }
687    }
688
689    /// Check if a position is within a code block or code span
690    pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
691        // Check code blocks first
692        if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
693            return true;
694        }
695
696        // Check inline code spans (lazy load if needed)
697        self.code_spans()
698            .iter()
699            .any(|span| pos >= span.byte_offset && pos < span.byte_end)
700    }
701
702    /// Get line information by line number (1-indexed)
703    pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
704        if line_num > 0 {
705            self.lines.get(line_num - 1)
706        } else {
707            None
708        }
709    }
710
711    /// Get byte offset for a line number (1-indexed)
712    pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
713        self.line_info(line_num).map(|info| info.byte_offset)
714    }
715
716    /// Get URL for a reference link/image by its ID
717    pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
718        let normalized_id = ref_id.to_lowercase();
719        self.reference_defs
720            .iter()
721            .find(|def| def.id == normalized_id)
722            .map(|def| def.url.as_str())
723    }
724
725    /// Get links on a specific line
726    pub fn links_on_line(&self, line_num: usize) -> Vec<&ParsedLink<'_>> {
727        self.links.iter().filter(|link| link.line == line_num).collect()
728    }
729
730    /// Get images on a specific line
731    pub fn images_on_line(&self, line_num: usize) -> Vec<&ParsedImage<'_>> {
732        self.images.iter().filter(|img| img.line == line_num).collect()
733    }
734
735    /// Check if a line is part of a list block
736    pub fn is_in_list_block(&self, line_num: usize) -> bool {
737        self.list_blocks
738            .iter()
739            .any(|block| line_num >= block.start_line && line_num <= block.end_line)
740    }
741
742    /// Get the list block containing a specific line
743    pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
744        self.list_blocks
745            .iter()
746            .find(|block| line_num >= block.start_line && line_num <= block.end_line)
747    }
748
749    // Compatibility methods for DocumentStructure migration
750
751    /// Check if a line is within a code block
752    pub fn is_in_code_block(&self, line_num: usize) -> bool {
753        if line_num == 0 || line_num > self.lines.len() {
754            return false;
755        }
756        self.lines[line_num - 1].in_code_block
757    }
758
759    /// Check if a line is within front matter
760    pub fn is_in_front_matter(&self, line_num: usize) -> bool {
761        if line_num == 0 || line_num > self.lines.len() {
762            return false;
763        }
764        self.lines[line_num - 1].in_front_matter
765    }
766
767    /// Check if a line is within an HTML block
768    pub fn is_in_html_block(&self, line_num: usize) -> bool {
769        if line_num == 0 || line_num > self.lines.len() {
770            return false;
771        }
772        self.lines[line_num - 1].in_html_block
773    }
774
775    /// Check if a line and column is within a code span
776    pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
777        if line_num == 0 || line_num > self.lines.len() {
778            return false;
779        }
780
781        // Use the code spans cache to check
782        // Note: col is 1-indexed from caller, but span.start_col and span.end_col are 0-indexed
783        // Convert col to 0-indexed for comparison
784        let col_0indexed = if col > 0 { col - 1 } else { 0 };
785        let code_spans = self.code_spans();
786        code_spans
787            .iter()
788            .any(|span| span.line == line_num && col_0indexed >= span.start_col && col_0indexed < span.end_col)
789    }
790
791    /// Check if a byte position is within a reference definition
792    /// This is much faster than scanning the content with regex for each check (O(1) vs O(n))
793    #[inline]
794    pub fn is_in_reference_def(&self, byte_pos: usize) -> bool {
795        self.reference_defs
796            .iter()
797            .any(|ref_def| byte_pos >= ref_def.byte_offset && byte_pos < ref_def.byte_end)
798    }
799
800    /// Check if a byte position is within an HTML comment
801    /// This is much faster than scanning the content with regex for each check (O(k) vs O(n))
802    /// where k is the number of HTML comments (typically very small)
803    #[inline]
804    pub fn is_in_html_comment(&self, byte_pos: usize) -> bool {
805        self.html_comment_ranges
806            .iter()
807            .any(|range| byte_pos >= range.start && byte_pos < range.end)
808    }
809
810    /// Check if a byte position is within a Jinja template ({{ }} or {% %})
811    pub fn is_in_jinja_range(&self, byte_pos: usize) -> bool {
812        self.jinja_ranges
813            .iter()
814            .any(|(start, end)| byte_pos >= *start && byte_pos < *end)
815    }
816
817    /// Check if content has any instances of a specific character (fast)
818    pub fn has_char(&self, ch: char) -> bool {
819        match ch {
820            '#' => self.char_frequency.hash_count > 0,
821            '*' => self.char_frequency.asterisk_count > 0,
822            '_' => self.char_frequency.underscore_count > 0,
823            '-' => self.char_frequency.hyphen_count > 0,
824            '+' => self.char_frequency.plus_count > 0,
825            '>' => self.char_frequency.gt_count > 0,
826            '|' => self.char_frequency.pipe_count > 0,
827            '[' => self.char_frequency.bracket_count > 0,
828            '`' => self.char_frequency.backtick_count > 0,
829            '<' => self.char_frequency.lt_count > 0,
830            '!' => self.char_frequency.exclamation_count > 0,
831            '\n' => self.char_frequency.newline_count > 0,
832            _ => self.content.contains(ch), // Fallback for other characters
833        }
834    }
835
836    /// Get count of a specific character (fast)
837    pub fn char_count(&self, ch: char) -> usize {
838        match ch {
839            '#' => self.char_frequency.hash_count,
840            '*' => self.char_frequency.asterisk_count,
841            '_' => self.char_frequency.underscore_count,
842            '-' => self.char_frequency.hyphen_count,
843            '+' => self.char_frequency.plus_count,
844            '>' => self.char_frequency.gt_count,
845            '|' => self.char_frequency.pipe_count,
846            '[' => self.char_frequency.bracket_count,
847            '`' => self.char_frequency.backtick_count,
848            '<' => self.char_frequency.lt_count,
849            '!' => self.char_frequency.exclamation_count,
850            '\n' => self.char_frequency.newline_count,
851            _ => self.content.matches(ch).count(), // Fallback for other characters
852        }
853    }
854
855    /// Check if content likely contains headings (fast)
856    pub fn likely_has_headings(&self) -> bool {
857        self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 // Potential setext underlines
858    }
859
860    /// Check if content likely contains lists (fast)
861    pub fn likely_has_lists(&self) -> bool {
862        self.char_frequency.asterisk_count > 0
863            || self.char_frequency.hyphen_count > 0
864            || self.char_frequency.plus_count > 0
865    }
866
867    /// Check if content likely contains emphasis (fast)
868    pub fn likely_has_emphasis(&self) -> bool {
869        self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
870    }
871
872    /// Check if content likely contains tables (fast)
873    pub fn likely_has_tables(&self) -> bool {
874        self.char_frequency.pipe_count > 2
875    }
876
877    /// Check if content likely contains blockquotes (fast)
878    pub fn likely_has_blockquotes(&self) -> bool {
879        self.char_frequency.gt_count > 0
880    }
881
882    /// Check if content likely contains code (fast)
883    pub fn likely_has_code(&self) -> bool {
884        self.char_frequency.backtick_count > 0
885    }
886
887    /// Check if content likely contains links or images (fast)
888    pub fn likely_has_links_or_images(&self) -> bool {
889        self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
890    }
891
892    /// Check if content likely contains HTML (fast)
893    pub fn likely_has_html(&self) -> bool {
894        self.char_frequency.lt_count > 0
895    }
896
897    /// Get HTML tags on a specific line
898    pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
899        self.html_tags()
900            .iter()
901            .filter(|tag| tag.line == line_num)
902            .cloned()
903            .collect()
904    }
905
906    /// Get emphasis spans on a specific line
907    pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
908        self.emphasis_spans()
909            .iter()
910            .filter(|span| span.line == line_num)
911            .cloned()
912            .collect()
913    }
914
915    /// Get table rows on a specific line
916    pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
917        self.table_rows()
918            .iter()
919            .filter(|row| row.line == line_num)
920            .cloned()
921            .collect()
922    }
923
924    /// Get bare URLs on a specific line
925    pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
926        self.bare_urls()
927            .iter()
928            .filter(|url| url.line == line_num)
929            .cloned()
930            .collect()
931    }
932
933    /// Find the line index for a given byte offset using binary search.
934    /// Returns (line_index, line_number, column) where:
935    /// - line_index is the 0-based index in the lines array
936    /// - line_number is the 1-based line number
937    /// - column is the byte offset within that line
938    #[inline]
939    fn find_line_for_offset(lines: &[LineInfo], byte_offset: usize) -> (usize, usize, usize) {
940        // Binary search to find the line containing this byte offset
941        let idx = match lines.binary_search_by(|line| {
942            if byte_offset < line.byte_offset {
943                std::cmp::Ordering::Greater
944            } else if byte_offset > line.byte_offset + line.byte_len {
945                std::cmp::Ordering::Less
946            } else {
947                std::cmp::Ordering::Equal
948            }
949        }) {
950            Ok(idx) => idx,
951            Err(idx) => idx.saturating_sub(1),
952        };
953
954        let line = &lines[idx];
955        let line_num = idx + 1;
956        let col = byte_offset.saturating_sub(line.byte_offset);
957
958        (idx, line_num, col)
959    }
960
961    /// Check if a byte offset is within a code span using binary search
962    #[inline]
963    fn is_offset_in_code_span(code_spans: &[CodeSpan], offset: usize) -> bool {
964        // Since spans are sorted by byte_offset, use partition_point for binary search
965        let idx = code_spans.partition_point(|span| span.byte_offset <= offset);
966
967        // Check the span that starts at or before our offset
968        if idx > 0 {
969            let span = &code_spans[idx - 1];
970            if offset >= span.byte_offset && offset < span.byte_end {
971                return true;
972            }
973        }
974
975        false
976    }
977
978    /// Parse all links in the content
979    fn parse_links(
980        content: &'a str,
981        lines: &[LineInfo],
982        code_blocks: &[(usize, usize)],
983        code_spans: &[CodeSpan],
984        flavor: MarkdownFlavor,
985        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
986    ) -> (Vec<ParsedLink<'a>>, Vec<BrokenLinkInfo>) {
987        use crate::utils::skip_context::{is_in_html_comment_ranges, is_mkdocs_snippet_line};
988        use std::collections::HashSet;
989
990        let mut links = Vec::with_capacity(content.len() / 500);
991        let mut broken_links = Vec::new();
992
993        // Track byte positions of links found by pulldown-cmark
994        let mut found_positions = HashSet::new();
995
996        // Use pulldown-cmark's streaming parser with BrokenLink callback
997        // The callback captures undefined references: [text][undefined], [shortcut], [text][]
998        // This automatically handles:
999        // - Escaped links (won't generate events)
1000        // - Links in code blocks/spans (won't generate Link events)
1001        // - Images (generates Tag::Image instead)
1002        // - Reference resolution (dest_url is already resolved!)
1003        // - Broken references (callback is invoked)
1004        // - Wiki-links (enabled via ENABLE_WIKILINKS)
1005        let mut options = Options::empty();
1006        options.insert(Options::ENABLE_WIKILINKS);
1007
1008        let parser = Parser::new_with_broken_link_callback(
1009            content,
1010            options,
1011            Some(|link: BrokenLink<'_>| {
1012                broken_links.push(BrokenLinkInfo {
1013                    reference: link.reference.to_string(),
1014                    span: link.span.clone(),
1015                });
1016                None
1017            }),
1018        )
1019        .into_offset_iter();
1020
1021        let mut link_stack: Vec<(
1022            usize,
1023            usize,
1024            pulldown_cmark::CowStr<'a>,
1025            LinkType,
1026            pulldown_cmark::CowStr<'a>,
1027        )> = Vec::new();
1028        let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1029
1030        for (event, range) in parser {
1031            match event {
1032                Event::Start(Tag::Link {
1033                    link_type,
1034                    dest_url,
1035                    id,
1036                    ..
1037                }) => {
1038                    // Link start - record position, URL, and reference ID
1039                    link_stack.push((range.start, range.end, dest_url, link_type, id));
1040                    text_chunks.clear();
1041                }
1042                Event::Text(text) if !link_stack.is_empty() => {
1043                    // Track text content with its byte range
1044                    text_chunks.push((text.to_string(), range.start, range.end));
1045                }
1046                Event::Code(code) if !link_stack.is_empty() => {
1047                    // Include inline code in link text (with backticks)
1048                    let code_text = format!("`{code}`");
1049                    text_chunks.push((code_text, range.start, range.end));
1050                }
1051                Event::End(TagEnd::Link) => {
1052                    if let Some((start_pos, _link_start_end, url, link_type, ref_id)) = link_stack.pop() {
1053                        // Skip if in HTML comment
1054                        if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1055                            text_chunks.clear();
1056                            continue;
1057                        }
1058
1059                        // Find line and column information
1060                        let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1061
1062                        // Skip if this link is on a MkDocs snippet line
1063                        if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1064                            text_chunks.clear();
1065                            continue;
1066                        }
1067
1068                        let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1069
1070                        let is_reference = matches!(
1071                            link_type,
1072                            LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1073                        );
1074
1075                        // Extract link text directly from source bytes to preserve escaping
1076                        // Text events from pulldown-cmark unescape \] → ], which breaks MD039
1077                        let link_text = if start_pos < content.len() {
1078                            let link_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1079
1080                            // Find MATCHING ] by tracking bracket depth for nested brackets
1081                            // An unescaped bracket is one NOT preceded by an odd number of backslashes
1082                            // Brackets inside code spans (between backticks) should be ignored
1083                            let mut close_pos = None;
1084                            let mut depth = 0;
1085                            let mut in_code_span = false;
1086
1087                            for (i, &byte) in link_bytes.iter().enumerate().skip(1) {
1088                                // Count preceding backslashes
1089                                let mut backslash_count = 0;
1090                                let mut j = i;
1091                                while j > 0 && link_bytes[j - 1] == b'\\' {
1092                                    backslash_count += 1;
1093                                    j -= 1;
1094                                }
1095                                let is_escaped = backslash_count % 2 != 0;
1096
1097                                // Track code spans - backticks toggle in/out of code
1098                                if byte == b'`' && !is_escaped {
1099                                    in_code_span = !in_code_span;
1100                                }
1101
1102                                // Only count brackets when NOT in a code span
1103                                if !is_escaped && !in_code_span {
1104                                    if byte == b'[' {
1105                                        depth += 1;
1106                                    } else if byte == b']' {
1107                                        if depth == 0 {
1108                                            // Found the matching closing bracket
1109                                            close_pos = Some(i);
1110                                            break;
1111                                        } else {
1112                                            depth -= 1;
1113                                        }
1114                                    }
1115                                }
1116                            }
1117
1118                            if let Some(pos) = close_pos {
1119                                Cow::Borrowed(std::str::from_utf8(&link_bytes[1..pos]).unwrap_or(""))
1120                            } else {
1121                                Cow::Borrowed("")
1122                            }
1123                        } else {
1124                            Cow::Borrowed("")
1125                        };
1126
1127                        // For reference links, use the actual reference ID from pulldown-cmark
1128                        let reference_id = if is_reference && !ref_id.is_empty() {
1129                            Some(Cow::Owned(ref_id.to_lowercase()))
1130                        } else if is_reference {
1131                            // For collapsed/shortcut references without explicit ID, use the link text
1132                            Some(Cow::Owned(link_text.to_lowercase()))
1133                        } else {
1134                            None
1135                        };
1136
1137                        // WORKAROUND: pulldown-cmark bug with escaped brackets
1138                        // Check for escaped image syntax: \![text](url)
1139                        // The byte_offset points to the '[', so we check 2 bytes back for '\!'
1140                        let has_escaped_bang = start_pos >= 2
1141                            && content.as_bytes().get(start_pos - 2) == Some(&b'\\')
1142                            && content.as_bytes().get(start_pos - 1) == Some(&b'!');
1143
1144                        // Check for escaped bracket: \[text](url)
1145                        // The byte_offset points to the '[', so we check 1 byte back for '\'
1146                        let has_escaped_bracket =
1147                            start_pos >= 1 && content.as_bytes().get(start_pos - 1) == Some(&b'\\');
1148
1149                        if has_escaped_bang || has_escaped_bracket {
1150                            text_chunks.clear();
1151                            continue; // Skip: this is escaped markdown, not a real link
1152                        }
1153
1154                        // Track this position as found
1155                        found_positions.insert(start_pos);
1156
1157                        links.push(ParsedLink {
1158                            line: line_num,
1159                            start_col: col_start,
1160                            end_col: col_end,
1161                            byte_offset: start_pos,
1162                            byte_end: range.end,
1163                            text: link_text,
1164                            url: Cow::Owned(url.to_string()),
1165                            is_reference,
1166                            reference_id,
1167                            link_type,
1168                        });
1169
1170                        text_chunks.clear();
1171                    }
1172                }
1173                _ => {}
1174            }
1175        }
1176
1177        // Also find undefined references using regex
1178        // These are patterns like [text][ref] that pulldown-cmark didn't parse as links
1179        // because the reference is undefined
1180        for cap in LINK_PATTERN.captures_iter(content) {
1181            let full_match = cap.get(0).unwrap();
1182            let match_start = full_match.start();
1183            let match_end = full_match.end();
1184
1185            // Skip if this was already found by pulldown-cmark (it's a valid link)
1186            if found_positions.contains(&match_start) {
1187                continue;
1188            }
1189
1190            // Skip if escaped
1191            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1192                continue;
1193            }
1194
1195            // Skip if it's an image
1196            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
1197                continue;
1198            }
1199
1200            // Skip if in code block
1201            if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
1202                continue;
1203            }
1204
1205            // Skip if in code span
1206            if Self::is_offset_in_code_span(code_spans, match_start) {
1207                continue;
1208            }
1209
1210            // Skip if in HTML comment
1211            if is_in_html_comment_ranges(html_comment_ranges, match_start) {
1212                continue;
1213            }
1214
1215            // Find line and column information
1216            let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1217
1218            // Skip if this link is on a MkDocs snippet line
1219            if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1220                continue;
1221            }
1222
1223            let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1224
1225            let text = cap.get(1).map_or("", |m| m.as_str());
1226
1227            // Only process reference links (group 6)
1228            if let Some(ref_id) = cap.get(6) {
1229                let ref_id_str = ref_id.as_str();
1230                let normalized_ref = if ref_id_str.is_empty() {
1231                    Cow::Owned(text.to_lowercase()) // Implicit reference
1232                } else {
1233                    Cow::Owned(ref_id_str.to_lowercase())
1234                };
1235
1236                // This is an undefined reference (pulldown-cmark didn't parse it)
1237                links.push(ParsedLink {
1238                    line: line_num,
1239                    start_col: col_start,
1240                    end_col: col_end,
1241                    byte_offset: match_start,
1242                    byte_end: match_end,
1243                    text: Cow::Borrowed(text),
1244                    url: Cow::Borrowed(""), // Empty URL indicates undefined reference
1245                    is_reference: true,
1246                    reference_id: Some(normalized_ref),
1247                    link_type: LinkType::Reference, // Undefined references are reference-style
1248                });
1249            }
1250        }
1251
1252        (links, broken_links)
1253    }
1254
1255    /// Parse all images in the content
1256    fn parse_images(
1257        content: &'a str,
1258        lines: &[LineInfo],
1259        code_blocks: &[(usize, usize)],
1260        code_spans: &[CodeSpan],
1261        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1262    ) -> Vec<ParsedImage<'a>> {
1263        use crate::utils::skip_context::is_in_html_comment_ranges;
1264        use std::collections::HashSet;
1265
1266        // Pre-size based on a heuristic: images are less common than links
1267        let mut images = Vec::with_capacity(content.len() / 1000);
1268        let mut found_positions = HashSet::new();
1269
1270        // Use pulldown-cmark for parsing - more accurate and faster
1271        let parser = Parser::new(content).into_offset_iter();
1272        let mut image_stack: Vec<(usize, pulldown_cmark::CowStr<'a>, LinkType, pulldown_cmark::CowStr<'a>)> =
1273            Vec::new();
1274        let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1275
1276        for (event, range) in parser {
1277            match event {
1278                Event::Start(Tag::Image {
1279                    link_type,
1280                    dest_url,
1281                    id,
1282                    ..
1283                }) => {
1284                    image_stack.push((range.start, dest_url, link_type, id));
1285                    text_chunks.clear();
1286                }
1287                Event::Text(text) if !image_stack.is_empty() => {
1288                    text_chunks.push((text.to_string(), range.start, range.end));
1289                }
1290                Event::Code(code) if !image_stack.is_empty() => {
1291                    let code_text = format!("`{code}`");
1292                    text_chunks.push((code_text, range.start, range.end));
1293                }
1294                Event::End(TagEnd::Image) => {
1295                    if let Some((start_pos, url, link_type, ref_id)) = image_stack.pop() {
1296                        // Skip if in code block
1297                        if CodeBlockUtils::is_in_code_block(code_blocks, start_pos) {
1298                            continue;
1299                        }
1300
1301                        // Skip if in code span
1302                        if Self::is_offset_in_code_span(code_spans, start_pos) {
1303                            continue;
1304                        }
1305
1306                        // Skip if in HTML comment
1307                        if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1308                            continue;
1309                        }
1310
1311                        // Find line and column using binary search
1312                        let (_, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1313                        let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1314
1315                        let is_reference = matches!(
1316                            link_type,
1317                            LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1318                        );
1319
1320                        // Extract alt text directly from source bytes to preserve escaping
1321                        // Text events from pulldown-cmark unescape \] → ], which breaks rules that need escaping
1322                        let alt_text = if start_pos < content.len() {
1323                            let image_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1324
1325                            // Find MATCHING ] by tracking bracket depth for nested brackets
1326                            // An unescaped bracket is one NOT preceded by an odd number of backslashes
1327                            let mut close_pos = None;
1328                            let mut depth = 0;
1329
1330                            if image_bytes.len() > 2 {
1331                                for (i, &byte) in image_bytes.iter().enumerate().skip(2) {
1332                                    // Count preceding backslashes
1333                                    let mut backslash_count = 0;
1334                                    let mut j = i;
1335                                    while j > 0 && image_bytes[j - 1] == b'\\' {
1336                                        backslash_count += 1;
1337                                        j -= 1;
1338                                    }
1339                                    let is_escaped = backslash_count % 2 != 0;
1340
1341                                    if !is_escaped {
1342                                        if byte == b'[' {
1343                                            depth += 1;
1344                                        } else if byte == b']' {
1345                                            if depth == 0 {
1346                                                // Found the matching closing bracket
1347                                                close_pos = Some(i);
1348                                                break;
1349                                            } else {
1350                                                depth -= 1;
1351                                            }
1352                                        }
1353                                    }
1354                                }
1355                            }
1356
1357                            if let Some(pos) = close_pos {
1358                                Cow::Borrowed(std::str::from_utf8(&image_bytes[2..pos]).unwrap_or(""))
1359                            } else {
1360                                Cow::Borrowed("")
1361                            }
1362                        } else {
1363                            Cow::Borrowed("")
1364                        };
1365
1366                        let reference_id = if is_reference && !ref_id.is_empty() {
1367                            Some(Cow::Owned(ref_id.to_lowercase()))
1368                        } else if is_reference {
1369                            Some(Cow::Owned(alt_text.to_lowercase())) // Collapsed/shortcut references
1370                        } else {
1371                            None
1372                        };
1373
1374                        found_positions.insert(start_pos);
1375                        images.push(ParsedImage {
1376                            line: line_num,
1377                            start_col: col_start,
1378                            end_col: col_end,
1379                            byte_offset: start_pos,
1380                            byte_end: range.end,
1381                            alt_text,
1382                            url: Cow::Owned(url.to_string()),
1383                            is_reference,
1384                            reference_id,
1385                            link_type,
1386                        });
1387                    }
1388                }
1389                _ => {}
1390            }
1391        }
1392
1393        // Regex fallback for undefined references that pulldown-cmark treats as plain text
1394        for cap in IMAGE_PATTERN.captures_iter(content) {
1395            let full_match = cap.get(0).unwrap();
1396            let match_start = full_match.start();
1397            let match_end = full_match.end();
1398
1399            // Skip if already found by pulldown-cmark
1400            if found_positions.contains(&match_start) {
1401                continue;
1402            }
1403
1404            // Skip if the ! is escaped
1405            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1406                continue;
1407            }
1408
1409            // Skip if in code block, code span, or HTML comment
1410            if CodeBlockUtils::is_in_code_block(code_blocks, match_start)
1411                || Self::is_offset_in_code_span(code_spans, match_start)
1412                || is_in_html_comment_ranges(html_comment_ranges, match_start)
1413            {
1414                continue;
1415            }
1416
1417            // Only process reference images (undefined references not found by pulldown-cmark)
1418            if let Some(ref_id) = cap.get(6) {
1419                let (_, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1420                let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1421                let alt_text = cap.get(1).map_or("", |m| m.as_str());
1422                let ref_id_str = ref_id.as_str();
1423                let normalized_ref = if ref_id_str.is_empty() {
1424                    Cow::Owned(alt_text.to_lowercase())
1425                } else {
1426                    Cow::Owned(ref_id_str.to_lowercase())
1427                };
1428
1429                images.push(ParsedImage {
1430                    line: line_num,
1431                    start_col: col_start,
1432                    end_col: col_end,
1433                    byte_offset: match_start,
1434                    byte_end: match_end,
1435                    alt_text: Cow::Borrowed(alt_text),
1436                    url: Cow::Borrowed(""),
1437                    is_reference: true,
1438                    reference_id: Some(normalized_ref),
1439                    link_type: LinkType::Reference, // Undefined references are reference-style
1440                });
1441            }
1442        }
1443
1444        images
1445    }
1446
1447    /// Parse reference definitions
1448    fn parse_reference_defs(content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
1449        // Pre-size based on lines count as reference definitions are line-based
1450        let mut refs = Vec::with_capacity(lines.len() / 20); // ~1 ref per 20 lines
1451
1452        for (line_idx, line_info) in lines.iter().enumerate() {
1453            // Skip lines in code blocks
1454            if line_info.in_code_block {
1455                continue;
1456            }
1457
1458            let line = line_info.content(content);
1459            let line_num = line_idx + 1;
1460
1461            if let Some(cap) = REF_DEF_PATTERN.captures(line) {
1462                let id = cap.get(1).unwrap().as_str().to_lowercase();
1463                let url = cap.get(2).unwrap().as_str().to_string();
1464                let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
1465
1466                // Calculate byte positions
1467                // The match starts at the beginning of the line (0) and extends to the end
1468                let match_obj = cap.get(0).unwrap();
1469                let byte_offset = line_info.byte_offset + match_obj.start();
1470                let byte_end = line_info.byte_offset + match_obj.end();
1471
1472                refs.push(ReferenceDef {
1473                    line: line_num,
1474                    id,
1475                    url,
1476                    title,
1477                    byte_offset,
1478                    byte_end,
1479                });
1480            }
1481        }
1482
1483        refs
1484    }
1485
/// Hand-rolled blockquote prefix splitter, used in place of a regex for speed.
///
/// Equivalent to matching `^(\s*>\s*)(.*)`.
///
/// Returns `Some((prefix, rest))`, where `prefix` is the leading whitespace,
/// the first `>` and any whitespace after it, and `rest` is everything that
/// follows. Returns `None` when the first non-whitespace character is not `>`.
#[inline]
fn parse_blockquote_prefix(line: &str) -> Option<(&str, &str)> {
    let after_marker = line.trim_start().strip_prefix('>')?;
    let rest = after_marker.trim_start();

    // `rest` is a suffix of `line`, so the prefix is whatever comes before it.
    let prefix = &line[..line.len() - rest.len()];
    Some((prefix, rest))
}
1504
/// Hand-rolled unordered list item splitter, used in place of a regex for speed.
///
/// Equivalent to matching `^(\s*)([-*+])([ \t]*)(.*)`, with leading whitespace
/// limited to spaces and tabs.
///
/// Returns `Some((leading_ws, marker, spacing, content))`, or `None` when the
/// first character after the leading spaces/tabs is not `-`, `*` or `+`.
#[inline]
fn parse_unordered_list(line: &str) -> Option<(&str, char, &str, &str)> {
    const BLANKS: [char; 2] = [' ', '\t'];

    let after_indent = line.trim_start_matches(BLANKS);
    let (leading_ws, _) = line.split_at(line.len() - after_indent.len());

    let marker = after_indent
        .chars()
        .next()
        .filter(|c| matches!(c, '-' | '*' | '+'))?;

    // All three markers are ASCII, so the marker occupies exactly one byte.
    let after_marker = &after_indent[1..];
    let item_content = after_marker.trim_start_matches(BLANKS);
    let (spacing, _) = after_marker.split_at(after_marker.len() - item_content.len());

    Some((leading_ws, marker, spacing, item_content))
}
1537
1538    /// Fast ordered list parser - replaces regex for 5-10x speedup
1539    /// Matches: ^(\s*)(\d+)([.)])([ \t]*)(.*)
1540    /// Returns: Some((leading_ws, number_str, delimiter, spacing, content)) or None
1541    #[inline]
1542    fn parse_ordered_list(line: &str) -> Option<(&str, &str, char, &str, &str)> {
1543        let bytes = line.as_bytes();
1544        let mut i = 0;
1545
1546        // Skip leading whitespace
1547        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1548            i += 1;
1549        }
1550
1551        // Collect digits
1552        let number_start = i;
1553        while i < bytes.len() && bytes[i].is_ascii_digit() {
1554            i += 1;
1555        }
1556        if i == number_start {
1557            return None; // No digits found
1558        }
1559
1560        // Check for delimiter
1561        if i >= bytes.len() {
1562            return None;
1563        }
1564        let delimiter = bytes[i] as char;
1565        if delimiter != '.' && delimiter != ')' {
1566            return None;
1567        }
1568        let delimiter_pos = i;
1569        i += 1;
1570
1571        // Collect spacing after delimiter (space or tab only)
1572        let spacing_start = i;
1573        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1574            i += 1;
1575        }
1576
1577        Some((
1578            &line[..number_start],
1579            &line[number_start..delimiter_pos],
1580            delimiter,
1581            &line[spacing_start..i],
1582            &line[i..],
1583        ))
1584    }
1585
1586    /// Pre-compute which lines are in code blocks - O(m*n) where m=code_blocks, n=lines
1587    /// Returns a Vec<bool> where index i indicates if line i is in a code block
1588    fn compute_code_block_line_map(content: &str, line_offsets: &[usize], code_blocks: &[(usize, usize)]) -> Vec<bool> {
1589        let num_lines = line_offsets.len();
1590        let mut in_code_block = vec![false; num_lines];
1591
1592        // For each code block, mark all lines within it
1593        for &(start, end) in code_blocks {
1594            // Ensure we're at valid UTF-8 boundaries
1595            let safe_start = if start > 0 && !content.is_char_boundary(start) {
1596                let mut boundary = start;
1597                while boundary > 0 && !content.is_char_boundary(boundary) {
1598                    boundary -= 1;
1599                }
1600                boundary
1601            } else {
1602                start
1603            };
1604
1605            let safe_end = if end < content.len() && !content.is_char_boundary(end) {
1606                let mut boundary = end;
1607                while boundary < content.len() && !content.is_char_boundary(boundary) {
1608                    boundary += 1;
1609                }
1610                boundary
1611            } else {
1612                end.min(content.len())
1613            };
1614
1615            // Trust the code blocks detected by CodeBlockUtils::detect_code_blocks()
1616            // That function now has proper list context awareness (see code_block_utils.rs)
1617            // and correctly distinguishes between:
1618            // - Fenced code blocks (``` or ~~~)
1619            // - Indented code blocks at document level (4 spaces + blank line before)
1620            // - List continuation paragraphs (NOT code blocks, even with 4 spaces)
1621            //
1622            // We no longer need to re-validate here. The original validation logic
1623            // was causing false positives by marking list continuation paragraphs as
1624            // code blocks when they have 4 spaces of indentation.
1625
1626            // Use binary search to find the first and last line indices
1627            // line_offsets is sorted, so we can use partition_point for O(log n) lookup
1628            // Use safe_start/safe_end (UTF-8 boundaries) for consistent line mapping
1629            let first_line = line_offsets.partition_point(|&offset| offset < safe_start);
1630            let last_line = line_offsets.partition_point(|&offset| offset < safe_end);
1631
1632            // Mark all lines in the range at once
1633            for flag in in_code_block.iter_mut().take(last_line).skip(first_line) {
1634                *flag = true;
1635            }
1636        }
1637
1638        in_code_block
1639    }
1640
1641    /// Pre-compute basic line information (without headings/blockquotes)
1642    fn compute_basic_line_info(
1643        content: &str,
1644        line_offsets: &[usize],
1645        code_blocks: &[(usize, usize)],
1646        flavor: MarkdownFlavor,
1647        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1648        autodoc_ranges: &[crate::utils::skip_context::ByteRange],
1649    ) -> Vec<LineInfo> {
1650        let content_lines: Vec<&str> = content.lines().collect();
1651        let mut lines = Vec::with_capacity(content_lines.len());
1652
1653        // Pre-compute which lines are in code blocks
1654        let code_block_map = Self::compute_code_block_line_map(content, line_offsets, code_blocks);
1655
1656        // Detect front matter boundaries FIRST, before any other parsing
1657        // Use FrontMatterUtils to detect all types of front matter (YAML, TOML, JSON, malformed)
1658        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1659
1660        for (i, line) in content_lines.iter().enumerate() {
1661            let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
1662            let indent = line.len() - line.trim_start().len();
1663
1664            // Parse blockquote prefix once and reuse it (avoid redundant parsing)
1665            let blockquote_parse = Self::parse_blockquote_prefix(line);
1666
1667            // For blank detection, consider blockquote context
1668            let is_blank = if let Some((_, content)) = blockquote_parse {
1669                // In blockquote context, check if content after prefix is blank
1670                content.trim().is_empty()
1671            } else {
1672                line.trim().is_empty()
1673            };
1674
1675            // Use pre-computed map for O(1) lookup instead of O(m) iteration
1676            let in_code_block = code_block_map.get(i).copied().unwrap_or(false);
1677
1678            // Detect list items (skip if in frontmatter, in mkdocstrings block, or in HTML comment)
1679            let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
1680                && crate::utils::mkdocstrings_refs::is_within_autodoc_block_ranges(autodoc_ranges, byte_offset);
1681            // Use pre-computed ranges for efficiency (O(log n) vs O(file_size))
1682            let in_html_comment =
1683                crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, byte_offset);
1684            let list_item = if !(in_code_block
1685                || is_blank
1686                || in_mkdocstrings
1687                || in_html_comment
1688                || (front_matter_end > 0 && i < front_matter_end))
1689            {
1690                // Strip blockquote prefix if present for list detection (reuse cached result)
1691                let (line_for_list_check, blockquote_prefix_len) = if let Some((prefix, content)) = blockquote_parse {
1692                    (content, prefix.len())
1693                } else {
1694                    (&**line, 0)
1695                };
1696
1697                if let Some((leading_spaces, marker, spacing, _content)) =
1698                    Self::parse_unordered_list(line_for_list_check)
1699                {
1700                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1701                    let content_column = marker_column + 1 + spacing.len();
1702
1703                    // According to CommonMark spec, unordered list items MUST have at least one space
1704                    // after the marker (-, *, or +). Without a space, it's not a list item.
1705                    // This also naturally handles cases like:
1706                    // - *emphasis* (not a list)
1707                    // - **bold** (not a list)
1708                    // - --- (horizontal rule, not a list)
1709                    if spacing.is_empty() {
1710                        None
1711                    } else {
1712                        Some(ListItemInfo {
1713                            marker: marker.to_string(),
1714                            is_ordered: false,
1715                            number: None,
1716                            marker_column,
1717                            content_column,
1718                        })
1719                    }
1720                } else if let Some((leading_spaces, number_str, delimiter, spacing, _content)) =
1721                    Self::parse_ordered_list(line_for_list_check)
1722                {
1723                    let marker = format!("{number_str}{delimiter}");
1724                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1725                    let content_column = marker_column + marker.len() + spacing.len();
1726
1727                    // According to CommonMark spec, ordered list items MUST have at least one space
1728                    // after the marker (period or parenthesis). Without a space, it's not a list item.
1729                    if spacing.is_empty() {
1730                        None
1731                    } else {
1732                        Some(ListItemInfo {
1733                            marker,
1734                            is_ordered: true,
1735                            number: number_str.parse().ok(),
1736                            marker_column,
1737                            content_column,
1738                        })
1739                    }
1740                } else {
1741                    None
1742                }
1743            } else {
1744                None
1745            };
1746
1747            lines.push(LineInfo {
1748                byte_offset,
1749                byte_len: line.len(),
1750                indent,
1751                is_blank,
1752                in_code_block,
1753                in_front_matter: front_matter_end > 0 && i < front_matter_end,
1754                in_html_block: false, // Will be populated after line creation
1755                in_html_comment,
1756                list_item,
1757                heading: None,    // Will be populated in second pass for Setext headings
1758                blockquote: None, // Will be populated after line creation
1759                in_mkdocstrings,
1760                in_esm_block: false, // Will be populated after line creation for MDX files
1761            });
1762        }
1763
1764        lines
1765    }
1766
1767    /// Detect headings and blockquotes (called after HTML block detection)
1768    fn detect_headings_and_blockquotes(
1769        content: &str,
1770        lines: &mut [LineInfo],
1771        flavor: MarkdownFlavor,
1772        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1773    ) {
1774        // Regex for heading detection
1775        static ATX_HEADING_REGEX: LazyLock<regex::Regex> =
1776            LazyLock::new(|| regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap());
1777        static SETEXT_UNDERLINE_REGEX: LazyLock<regex::Regex> =
1778            LazyLock::new(|| regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap());
1779
1780        let content_lines: Vec<&str> = content.lines().collect();
1781
1782        // Detect front matter boundaries to skip those lines
1783        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1784
1785        // Detect headings (including Setext which needs look-ahead) and blockquotes
1786        for i in 0..lines.len() {
1787            if lines[i].in_code_block {
1788                continue;
1789            }
1790
1791            // Skip lines in front matter
1792            if front_matter_end > 0 && i < front_matter_end {
1793                continue;
1794            }
1795
1796            // Skip lines in HTML blocks - HTML content should not be parsed as markdown
1797            if lines[i].in_html_block {
1798                continue;
1799            }
1800
1801            let line = content_lines[i];
1802
1803            // Check for blockquotes (even on blank lines within blockquotes)
1804            if let Some(bq) = parse_blockquote_detailed(line) {
1805                let nesting_level = bq.markers.len(); // Each '>' is one level
1806                let marker_column = bq.indent.len();
1807
1808                // Build the prefix (indentation + markers + space)
1809                let prefix = format!("{}{}{}", bq.indent, bq.markers, bq.spaces_after);
1810
1811                // Check for various blockquote issues
1812                let has_no_space = bq.spaces_after.is_empty() && !bq.content.is_empty();
1813                // Consider tabs as multiple spaces, or actual multiple spaces
1814                let has_multiple_spaces = bq.spaces_after.len() > 1 || bq.spaces_after.contains('\t');
1815
1816                // Check if needs MD028 fix (empty blockquote line without proper spacing)
1817                // MD028 flags empty blockquote lines that don't have a single space after the marker
1818                // Lines like "> " or ">> " are already correct and don't need fixing
1819                let needs_md028_fix = bq.content.is_empty() && bq.spaces_after.is_empty();
1820
1821                lines[i].blockquote = Some(BlockquoteInfo {
1822                    nesting_level,
1823                    indent: bq.indent.to_string(),
1824                    marker_column,
1825                    prefix,
1826                    content: bq.content.to_string(),
1827                    has_no_space_after_marker: has_no_space,
1828                    has_multiple_spaces_after_marker: has_multiple_spaces,
1829                    needs_md028_fix,
1830                });
1831            }
1832
1833            // Skip heading detection for blank lines
1834            if lines[i].is_blank {
1835                continue;
1836            }
1837
1838            // Check for ATX headings (but skip MkDocs snippet lines)
1839            // In MkDocs flavor, lines like "# -8<- [start:name]" are snippet markers, not headings
1840            let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
1841                crate::utils::mkdocs_snippets::is_snippet_section_start(line)
1842                    || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
1843            } else {
1844                false
1845            };
1846
1847            if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1848                // Skip headings inside HTML comments (using pre-computed ranges for efficiency)
1849                if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset) {
1850                    continue;
1851                }
1852                let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1853                let hashes = caps.get(2).map_or("", |m| m.as_str());
1854                let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1855                let rest = caps.get(4).map_or("", |m| m.as_str());
1856
1857                let level = hashes.len() as u8;
1858                let marker_column = leading_spaces.len();
1859
1860                // Check for closing sequence, but handle custom IDs that might come after
1861                let (text, has_closing, closing_seq) = {
1862                    // First check if there's a custom ID at the end
1863                    let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1864                        // Check if this looks like a valid custom ID (ends with })
1865                        if rest[id_start..].trim_end().ends_with('}') {
1866                            // Split off the custom ID
1867                            (&rest[..id_start], &rest[id_start..])
1868                        } else {
1869                            (rest, "")
1870                        }
1871                    } else {
1872                        (rest, "")
1873                    };
1874
1875                    // Now look for closing hashes in the part before the custom ID
1876                    let trimmed_rest = rest_without_id.trim_end();
1877                    if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1878                        // Look for the start of the hash sequence
1879                        let mut start_of_hashes = last_hash_pos;
1880                        while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1881                            start_of_hashes -= 1;
1882                        }
1883
1884                        // Check if there's at least one space before the closing hashes
1885                        let has_space_before = start_of_hashes == 0
1886                            || trimmed_rest
1887                                .chars()
1888                                .nth(start_of_hashes - 1)
1889                                .is_some_and(|c| c.is_whitespace());
1890
1891                        // Check if this is a valid closing sequence (all hashes to end of trimmed part)
1892                        let potential_closing = &trimmed_rest[start_of_hashes..];
1893                        let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1894
1895                        if is_all_hashes && has_space_before {
1896                            // This is a closing sequence
1897                            let closing_hashes = potential_closing.to_string();
1898                            // The text is everything before the closing hashes
1899                            // Don't include the custom ID here - it will be extracted later
1900                            let text_part = if !custom_id_part.is_empty() {
1901                                // If we have a custom ID, append it back to get the full rest
1902                                // This allows the extract_header_id function to handle it properly
1903                                format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1904                            } else {
1905                                rest_without_id[..start_of_hashes].trim_end().to_string()
1906                            };
1907                            (text_part, true, closing_hashes)
1908                        } else {
1909                            // Not a valid closing sequence, return the full content
1910                            (rest.to_string(), false, String::new())
1911                        }
1912                    } else {
1913                        // No hashes found, return the full content
1914                        (rest.to_string(), false, String::new())
1915                    }
1916                };
1917
1918                let content_column = marker_column + hashes.len() + spaces_after.len();
1919
1920                // Extract custom header ID if present
1921                let raw_text = text.trim().to_string();
1922                let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1923
1924                // If no custom ID was found on the header line, check the next line for standalone attr-list
1925                if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1926                    let next_line = content_lines[i + 1];
1927                    if !lines[i + 1].in_code_block
1928                        && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1929                        && let Some(next_line_id) =
1930                            crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1931                    {
1932                        custom_id = Some(next_line_id);
1933                    }
1934                }
1935
1936                lines[i].heading = Some(HeadingInfo {
1937                    level,
1938                    style: HeadingStyle::ATX,
1939                    marker: hashes.to_string(),
1940                    marker_column,
1941                    content_column,
1942                    text: clean_text,
1943                    custom_id,
1944                    raw_text,
1945                    has_closing_sequence: has_closing,
1946                    closing_sequence: closing_seq,
1947                });
1948            }
1949            // Check for Setext headings (need to look at next line)
1950            else if i + 1 < content_lines.len() && i + 1 < lines.len() {
1951                let next_line = content_lines[i + 1];
1952                if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
1953                    // Skip if next line is front matter delimiter
1954                    if front_matter_end > 0 && i < front_matter_end {
1955                        continue;
1956                    }
1957
1958                    // Skip Setext headings inside HTML comments (using pre-computed ranges for efficiency)
1959                    if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset)
1960                    {
1961                        continue;
1962                    }
1963
1964                    let underline = next_line.trim();
1965
1966                    // Skip if the underline looks like YAML delimiter (exactly 3 or more dashes)
1967                    // YAML uses exactly `---` while Setext headings typically use longer underlines
1968                    if underline == "---" {
1969                        continue;
1970                    }
1971
1972                    // Skip if the current line looks like YAML key-value syntax
1973                    let current_line_trimmed = line.trim();
1974                    if current_line_trimmed.contains(':')
1975                        && !current_line_trimmed.starts_with('#')
1976                        && !current_line_trimmed.contains('[')
1977                        && !current_line_trimmed.contains("](")
1978                    {
1979                        // This looks like "key: value" which suggests YAML, not a heading
1980                        continue;
1981                    }
1982
1983                    let level = if underline.starts_with('=') { 1 } else { 2 };
1984                    let style = if level == 1 {
1985                        HeadingStyle::Setext1
1986                    } else {
1987                        HeadingStyle::Setext2
1988                    };
1989
1990                    // Extract custom header ID if present
1991                    let raw_text = line.trim().to_string();
1992                    let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1993
1994                    // If no custom ID was found on the header line, check the line after underline for standalone attr-list
1995                    if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
1996                        let attr_line = content_lines[i + 2];
1997                        if !lines[i + 2].in_code_block
1998                            && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
1999                            && let Some(attr_line_id) =
2000                                crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
2001                        {
2002                            custom_id = Some(attr_line_id);
2003                        }
2004                    }
2005
2006                    lines[i].heading = Some(HeadingInfo {
2007                        level,
2008                        style,
2009                        marker: underline.to_string(),
2010                        marker_column: next_line.len() - next_line.trim_start().len(),
2011                        content_column: lines[i].indent,
2012                        text: clean_text,
2013                        custom_id,
2014                        raw_text,
2015                        has_closing_sequence: false,
2016                        closing_sequence: String::new(),
2017                    });
2018                }
2019            }
2020        }
2021    }
2022
2023    /// Detect HTML blocks in the content
2024    fn detect_html_blocks(content: &str, lines: &mut [LineInfo]) {
2025        // HTML block elements that trigger block context
2026        const BLOCK_ELEMENTS: &[&str] = &[
2027            "address",
2028            "article",
2029            "aside",
2030            "blockquote",
2031            "details",
2032            "dialog",
2033            "dd",
2034            "div",
2035            "dl",
2036            "dt",
2037            "fieldset",
2038            "figcaption",
2039            "figure",
2040            "footer",
2041            "form",
2042            "h1",
2043            "h2",
2044            "h3",
2045            "h4",
2046            "h5",
2047            "h6",
2048            "header",
2049            "hr",
2050            "li",
2051            "main",
2052            "nav",
2053            "ol",
2054            "p",
2055            "pre",
2056            "script",
2057            "section",
2058            "style",
2059            "table",
2060            "tbody",
2061            "td",
2062            "tfoot",
2063            "th",
2064            "thead",
2065            "tr",
2066            "ul",
2067        ];
2068
2069        let mut i = 0;
2070        while i < lines.len() {
2071            // Skip if already in code block or front matter
2072            if lines[i].in_code_block || lines[i].in_front_matter {
2073                i += 1;
2074                continue;
2075            }
2076
2077            let trimmed = lines[i].content(content).trim_start();
2078
2079            // Check if line starts with an HTML tag
2080            if trimmed.starts_with('<') && trimmed.len() > 1 {
2081                // Extract tag name safely
2082                let after_bracket = &trimmed[1..];
2083                let is_closing = after_bracket.starts_with('/');
2084                let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
2085
2086                // Extract tag name (stop at space, >, /, or end of string)
2087                let tag_name = tag_start
2088                    .chars()
2089                    .take_while(|c| c.is_ascii_alphabetic() || *c == '-')
2090                    .collect::<String>()
2091                    .to_lowercase();
2092
2093                // Check if it's a block element
2094                if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
2095                    // Mark this line as in HTML block
2096                    lines[i].in_html_block = true;
2097
2098                    // For simplicity, just mark lines until we find a closing tag or reach a blank line
2099                    // This avoids complex nesting logic that might cause infinite loops
2100                    if !is_closing {
2101                        let closing_tag = format!("</{tag_name}>");
2102                        // style and script tags can contain blank lines (CSS/JS formatting)
2103                        let allow_blank_lines = tag_name == "style" || tag_name == "script";
2104                        let mut j = i + 1;
2105                        while j < lines.len() && j < i + 100 {
2106                            // Limit search to 100 lines
2107                            // Stop at blank lines (except for style/script tags)
2108                            if !allow_blank_lines && lines[j].is_blank {
2109                                break;
2110                            }
2111
2112                            lines[j].in_html_block = true;
2113
2114                            // Check if this line contains the closing tag
2115                            if lines[j].content(content).contains(&closing_tag) {
2116                                break;
2117                            }
2118                            j += 1;
2119                        }
2120                    }
2121                }
2122            }
2123
2124            i += 1;
2125        }
2126    }
2127
2128    /// Detect ESM import/export blocks in MDX files
2129    /// ESM blocks consist of contiguous import/export statements at the top of the file
2130    fn detect_esm_blocks(content: &str, lines: &mut [LineInfo], flavor: MarkdownFlavor) {
2131        // Only process MDX files
2132        if !flavor.supports_esm_blocks() {
2133            return;
2134        }
2135
2136        for line in lines.iter_mut() {
2137            // Skip blank lines and comments at the start
2138            if line.is_blank || line.in_html_comment {
2139                continue;
2140            }
2141
2142            // Check if line starts with import or export
2143            let trimmed = line.content(content).trim_start();
2144            if trimmed.starts_with("import ") || trimmed.starts_with("export ") {
2145                line.in_esm_block = true;
2146            } else {
2147                // Once we hit a non-ESM line, we're done with the ESM block
2148                break;
2149            }
2150        }
2151    }
2152
2153    /// Parse all inline code spans in the content using pulldown-cmark streaming parser
2154    fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
2155        let mut code_spans = Vec::new();
2156
2157        // Quick check - if no backticks, no code spans
2158        if !content.contains('`') {
2159            return code_spans;
2160        }
2161
2162        // Use pulldown-cmark's streaming parser with byte offsets
2163        let parser = Parser::new(content).into_offset_iter();
2164
2165        for (event, range) in parser {
2166            if let Event::Code(_) = event {
2167                let start_pos = range.start;
2168                let end_pos = range.end;
2169
2170                // The range includes the backticks, extract the actual content
2171                let full_span = &content[start_pos..end_pos];
2172                let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
2173
2174                // Extract content between backticks, preserving spaces
2175                let content_start = start_pos + backtick_count;
2176                let content_end = end_pos - backtick_count;
2177                let span_content = if content_start < content_end {
2178                    content[content_start..content_end].to_string()
2179                } else {
2180                    String::new()
2181                };
2182
2183                // Use binary search to find line number - O(log n) instead of O(n)
2184                // Find the rightmost line whose byte_offset <= start_pos
2185                let line_idx = lines
2186                    .partition_point(|line| line.byte_offset <= start_pos)
2187                    .saturating_sub(1);
2188                let line_num = line_idx + 1;
2189                let col_start = start_pos - lines[line_idx].byte_offset;
2190
2191                // Find end column using binary search
2192                let end_line_idx = lines
2193                    .partition_point(|line| line.byte_offset <= end_pos)
2194                    .saturating_sub(1);
2195                let col_end = end_pos - lines[end_line_idx].byte_offset;
2196
2197                code_spans.push(CodeSpan {
2198                    line: line_num,
2199                    start_col: col_start,
2200                    end_col: col_end,
2201                    byte_offset: start_pos,
2202                    byte_end: end_pos,
2203                    backtick_count,
2204                    content: span_content,
2205                });
2206            }
2207        }
2208
2209        // Sort by position to ensure consistent ordering
2210        code_spans.sort_by_key(|span| span.byte_offset);
2211
2212        code_spans
2213    }
2214
2215    /// Parse all list blocks in the content (legacy line-by-line approach)
2216    fn parse_list_blocks(content: &str, lines: &[LineInfo]) -> Vec<ListBlock> {
2217        // Pre-size based on lines that could be list items
2218        let mut list_blocks = Vec::with_capacity(lines.len() / 10); // Estimate ~10% of lines might start list blocks
2219        let mut current_block: Option<ListBlock> = None;
2220        let mut last_list_item_line = 0;
2221        let mut current_indent_level = 0;
2222        let mut last_marker_width = 0;
2223
2224        for (line_idx, line_info) in lines.iter().enumerate() {
2225            let line_num = line_idx + 1;
2226
2227            // Enhanced code block handling using Design #3's context analysis
2228            if line_info.in_code_block {
2229                if let Some(ref mut block) = current_block {
2230                    // Calculate minimum indentation for list continuation
2231                    let min_continuation_indent =
2232                        CodeBlockUtils::calculate_min_continuation_indent(content, lines, line_idx);
2233
2234                    // Analyze code block context using the three-tier classification
2235                    let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
2236
2237                    match context {
2238                        CodeBlockContext::Indented => {
2239                            // Code block is properly indented - continues the list
2240                            block.end_line = line_num;
2241                            continue;
2242                        }
2243                        CodeBlockContext::Standalone => {
2244                            // Code block separates lists - end current block
2245                            let completed_block = current_block.take().unwrap();
2246                            list_blocks.push(completed_block);
2247                            continue;
2248                        }
2249                        CodeBlockContext::Adjacent => {
2250                            // Edge case - use conservative behavior (continue list)
2251                            block.end_line = line_num;
2252                            continue;
2253                        }
2254                    }
2255                } else {
2256                    // No current list block - skip code block lines
2257                    continue;
2258                }
2259            }
2260
2261            // Extract blockquote prefix if any
2262            let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(line_info.content(content)) {
2263                caps.get(0).unwrap().as_str().to_string()
2264            } else {
2265                String::new()
2266            };
2267
2268            // Check if this line is a list item
2269            if let Some(list_item) = &line_info.list_item {
2270                // Calculate nesting level based on indentation
2271                let item_indent = list_item.marker_column;
2272                let nesting = item_indent / 2; // Assume 2-space indentation for nesting
2273
2274                if let Some(ref mut block) = current_block {
2275                    // Check if this continues the current block
2276                    // For nested lists, we need to check if this is a nested item (higher nesting level)
2277                    // or a continuation at the same or lower level
2278                    let is_nested = nesting > block.nesting_level;
2279                    let same_type =
2280                        (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
2281                    let same_context = block.blockquote_prefix == blockquote_prefix;
2282                    let reasonable_distance = line_num <= last_list_item_line + 2; // Allow one blank line
2283
2284                    // For unordered lists, also check marker consistency
2285                    let marker_compatible =
2286                        block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
2287
2288                    // Check if there's non-list content between the last item and this one
2289                    let has_non_list_content = {
2290                        let mut found_non_list = false;
2291                        // Use the last item from the current block, not the global last_list_item_line
2292                        let block_last_item_line = block.item_lines.last().copied().unwrap_or(block.end_line);
2293
2294                        // Debug: Special check for problematic line
2295                        if block_last_item_line > 0 && block_last_item_line <= lines.len() {
2296                            let last_line = &lines[block_last_item_line - 1];
2297                            let last_line_content = last_line.content(content);
2298                            if last_line_content.contains(r"`sqlalchemy`") && last_line_content.contains(r"\`") {
2299                                log::debug!(
2300                                    "After problematic line {}: checking lines {} to {} for non-list content",
2301                                    block_last_item_line,
2302                                    block_last_item_line + 1,
2303                                    line_num
2304                                );
2305                                // If they're consecutive list items, there's no content between
2306                                if line_num == block_last_item_line + 1 {
2307                                    log::debug!("Lines are consecutive, no content between");
2308                                }
2309                            }
2310                        }
2311
2312                        for check_line in (block_last_item_line + 1)..line_num {
2313                            let check_idx = check_line - 1;
2314                            if check_idx < lines.len() {
2315                                let check_info = &lines[check_idx];
2316                                // Check for content that breaks the list
2317                                let is_list_breaking_content = if check_info.in_code_block {
2318                                    // Use enhanced code block classification for list separation
2319                                    let last_item_marker_width =
2320                                        if block_last_item_line > 0 && block_last_item_line <= lines.len() {
2321                                            lines[block_last_item_line - 1]
2322                                                .list_item
2323                                                .as_ref()
2324                                                .map(|li| {
2325                                                    if li.is_ordered {
2326                                                        li.marker.len() + 1 // Add 1 for the space after ordered list markers
2327                                                    } else {
2328                                                        li.marker.len()
2329                                                    }
2330                                                })
2331                                                .unwrap_or(3) // fallback to 3 if no list item found
2332                                        } else {
2333                                            3 // fallback
2334                                        };
2335
2336                                    let min_continuation = if block.is_ordered { last_item_marker_width } else { 2 };
2337
2338                                    // Analyze code block context using our enhanced classification
2339                                    let context = CodeBlockUtils::analyze_code_block_context(
2340                                        lines,
2341                                        check_line - 1,
2342                                        min_continuation,
2343                                    );
2344
2345                                    // Standalone code blocks break lists, indented ones continue them
2346                                    matches!(context, CodeBlockContext::Standalone)
2347                                } else if !check_info.is_blank && check_info.list_item.is_none() {
2348                                    // Check for structural separators that should break lists (from issue #42)
2349                                    let line_content = check_info.content(content).trim();
2350
2351                                    // Any of these structural separators break lists
2352                                    if check_info.heading.is_some()
2353                                        || line_content.starts_with("---")
2354                                        || line_content.starts_with("***")
2355                                        || line_content.starts_with("___")
2356                                        || (line_content.contains('|')
2357                                            && !line_content.contains("](")
2358                                            && !line_content.contains("http")
2359                                            && (line_content.matches('|').count() > 1
2360                                                || line_content.starts_with('|')
2361                                                || line_content.ends_with('|')))
2362                                        || line_content.starts_with(">")
2363                                    {
2364                                        true
2365                                    }
2366                                    // Other non-list content - check if properly indented
2367                                    else {
2368                                        let last_item_marker_width =
2369                                            if block_last_item_line > 0 && block_last_item_line <= lines.len() {
2370                                                lines[block_last_item_line - 1]
2371                                                    .list_item
2372                                                    .as_ref()
2373                                                    .map(|li| {
2374                                                        if li.is_ordered {
2375                                                            li.marker.len() + 1 // Add 1 for the space after ordered list markers
2376                                                        } else {
2377                                                            li.marker.len()
2378                                                        }
2379                                                    })
2380                                                    .unwrap_or(3) // fallback to 3 if no list item found
2381                                            } else {
2382                                                3 // fallback
2383                                            };
2384
2385                                        let min_continuation =
2386                                            if block.is_ordered { last_item_marker_width } else { 2 };
2387                                        check_info.indent < min_continuation
2388                                    }
2389                                } else {
2390                                    false
2391                                };
2392
2393                                if is_list_breaking_content {
2394                                    // Not indented enough, so it breaks the list
2395                                    found_non_list = true;
2396                                    break;
2397                                }
2398                            }
2399                        }
2400                        found_non_list
2401                    };
2402
2403                    // A list continues if:
2404                    // 1. It's a nested item (indented more than the parent), OR
2405                    // 2. It's the same type at the same level with reasonable distance
2406                    let mut continues_list = if is_nested {
2407                        // Nested items always continue the list if they're in the same context
2408                        same_context && reasonable_distance && !has_non_list_content
2409                    } else {
2410                        // Same-level items need to match type and markers
2411                        let result = same_type
2412                            && same_context
2413                            && reasonable_distance
2414                            && marker_compatible
2415                            && !has_non_list_content;
2416
2417                        // Debug logging for lines after problematic content
2418                        if block.item_lines.last().is_some_and(|&last_line| {
2419                            last_line > 0
2420                                && last_line <= lines.len()
2421                                && lines[last_line - 1].content(content).contains(r"`sqlalchemy`")
2422                                && lines[last_line - 1].content(content).contains(r"\`")
2423                        }) {
2424                            log::debug!(
2425                                "List continuation check after problematic line at line {line_num}: same_type={same_type}, same_context={same_context}, reasonable_distance={reasonable_distance}, marker_compatible={marker_compatible}, has_non_list_content={has_non_list_content}, continues={result}"
2426                            );
2427                            if line_num > 0 && line_num <= lines.len() {
2428                                log::debug!("Current line content: {:?}", lines[line_num - 1].content(content));
2429                            }
2430                        }
2431
2432                        result
2433                    };
2434
2435                    // WORKAROUND: If items are truly consecutive (no blank lines), they MUST be in the same list
2436                    // This handles edge cases where content patterns might otherwise split lists incorrectly
2437                    if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
2438                        // Check if the previous line was a list item
2439                        if block.item_lines.contains(&(line_num - 1)) {
2440                            // They're consecutive list items - force them to be in the same list
2441                            continues_list = true;
2442                        }
2443                    }
2444
2445                    if continues_list {
2446                        // Extend current block
2447                        block.end_line = line_num;
2448                        block.item_lines.push(line_num);
2449
2450                        // Update max marker width
2451                        block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
2452                            list_item.marker.len() + 1
2453                        } else {
2454                            list_item.marker.len()
2455                        });
2456
2457                        // Update marker consistency for unordered lists
2458                        if !block.is_ordered
2459                            && block.marker.is_some()
2460                            && block.marker.as_ref() != Some(&list_item.marker)
2461                        {
2462                            // Mixed markers, clear the marker field
2463                            block.marker = None;
2464                        }
2465                    } else {
2466                        // End current block and start a new one
2467
2468                        list_blocks.push(block.clone());
2469
2470                        *block = ListBlock {
2471                            start_line: line_num,
2472                            end_line: line_num,
2473                            is_ordered: list_item.is_ordered,
2474                            marker: if list_item.is_ordered {
2475                                None
2476                            } else {
2477                                Some(list_item.marker.clone())
2478                            },
2479                            blockquote_prefix: blockquote_prefix.clone(),
2480                            item_lines: vec![line_num],
2481                            nesting_level: nesting,
2482                            max_marker_width: if list_item.is_ordered {
2483                                list_item.marker.len() + 1
2484                            } else {
2485                                list_item.marker.len()
2486                            },
2487                        };
2488                    }
2489                } else {
2490                    // Start a new block
2491                    current_block = Some(ListBlock {
2492                        start_line: line_num,
2493                        end_line: line_num,
2494                        is_ordered: list_item.is_ordered,
2495                        marker: if list_item.is_ordered {
2496                            None
2497                        } else {
2498                            Some(list_item.marker.clone())
2499                        },
2500                        blockquote_prefix,
2501                        item_lines: vec![line_num],
2502                        nesting_level: nesting,
2503                        max_marker_width: list_item.marker.len(),
2504                    });
2505                }
2506
2507                last_list_item_line = line_num;
2508                current_indent_level = item_indent;
2509                last_marker_width = if list_item.is_ordered {
2510                    list_item.marker.len() + 1 // Add 1 for the space after ordered list markers
2511                } else {
2512                    list_item.marker.len()
2513                };
2514            } else if let Some(ref mut block) = current_block {
2515                // Not a list item - check if it continues the current block
2516
2517                // For MD032 compatibility, we use a simple approach:
2518                // - Indented lines continue the list
2519                // - Blank lines followed by indented content continue the list
2520                // - Everything else ends the list
2521
2522                // Check if the last line in the list block ended with a backslash (hard line break)
2523                // This handles cases where list items use backslash for hard line breaks
2524                let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
2525                    lines[block.end_line - 1].content(content).trim_end().ends_with('\\')
2526                } else {
2527                    false
2528                };
2529
2530                // Calculate minimum indentation for list continuation
2531                // For ordered lists, use the last marker width (e.g., 3 for "1. ", 4 for "10. ")
2532                // For unordered lists like "- ", content starts at column 2, so continuations need at least 2 spaces
2533                let min_continuation_indent = if block.is_ordered {
2534                    current_indent_level + last_marker_width
2535                } else {
2536                    current_indent_level + 2 // Unordered lists need at least 2 spaces (e.g., "- " = 2 chars)
2537                };
2538
2539                if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
2540                    // Indented line or backslash continuation continues the list
2541                    block.end_line = line_num;
2542                } else if line_info.is_blank {
2543                    // Blank line - check if it's internal to the list or ending it
2544                    // We only include blank lines that are followed by more list content
2545                    let mut check_idx = line_idx + 1;
2546                    let mut found_continuation = false;
2547
2548                    // Skip additional blank lines
2549                    while check_idx < lines.len() && lines[check_idx].is_blank {
2550                        check_idx += 1;
2551                    }
2552
2553                    if check_idx < lines.len() {
2554                        let next_line = &lines[check_idx];
2555                        // Check if followed by indented content (list continuation)
2556                        if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
2557                            found_continuation = true;
2558                        }
2559                        // Check if followed by another list item at the same level
2560                        else if !next_line.in_code_block
2561                            && next_line.list_item.is_some()
2562                            && let Some(item) = &next_line.list_item
2563                        {
2564                            let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
2565                                .find(next_line.content(content))
2566                                .map_or(String::new(), |m| m.as_str().to_string());
2567                            if item.marker_column == current_indent_level
2568                                && item.is_ordered == block.is_ordered
2569                                && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
2570                            {
2571                                // Check if there was meaningful content between the list items (unused now)
2572                                // This variable is kept for potential future use but is currently replaced by has_structural_separators
2573                                let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
2574                                    if let Some(between_line) = lines.get(idx) {
2575                                        let between_content = between_line.content(content);
2576                                        let trimmed = between_content.trim();
2577                                        // Skip empty lines
2578                                        if trimmed.is_empty() {
2579                                            return false;
2580                                        }
2581                                        // Check for meaningful content
2582                                        let line_indent = between_content.len() - between_content.trim_start().len();
2583
2584                                        // Structural separators (code fences, headings, etc.) are meaningful and should BREAK lists
2585                                        if trimmed.starts_with("```")
2586                                            || trimmed.starts_with("~~~")
2587                                            || trimmed.starts_with("---")
2588                                            || trimmed.starts_with("***")
2589                                            || trimmed.starts_with("___")
2590                                            || trimmed.starts_with(">")
2591                                            || trimmed.contains('|') // Tables
2592                                            || between_line.heading.is_some()
2593                                        {
2594                                            return true; // These are structural separators - meaningful content that breaks lists
2595                                        }
2596
2597                                        // Only properly indented content continues the list
2598                                        line_indent >= min_continuation_indent
2599                                    } else {
2600                                        false
2601                                    }
2602                                });
2603
2604                                if block.is_ordered {
2605                                    // For ordered lists: don't continue if there are structural separators
2606                                    // Check if there are structural separators between the list items
2607                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2608                                        if let Some(between_line) = lines.get(idx) {
2609                                            let trimmed = between_line.content(content).trim();
2610                                            if trimmed.is_empty() {
2611                                                return false;
2612                                            }
2613                                            // Check for structural separators that break lists
2614                                            trimmed.starts_with("```")
2615                                                || trimmed.starts_with("~~~")
2616                                                || trimmed.starts_with("---")
2617                                                || trimmed.starts_with("***")
2618                                                || trimmed.starts_with("___")
2619                                                || trimmed.starts_with(">")
2620                                                || trimmed.contains('|') // Tables
2621                                                || between_line.heading.is_some()
2622                                        } else {
2623                                            false
2624                                        }
2625                                    });
2626                                    found_continuation = !has_structural_separators;
2627                                } else {
2628                                    // For unordered lists: also check for structural separators
2629                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2630                                        if let Some(between_line) = lines.get(idx) {
2631                                            let trimmed = between_line.content(content).trim();
2632                                            if trimmed.is_empty() {
2633                                                return false;
2634                                            }
2635                                            // Check for structural separators that break lists
2636                                            trimmed.starts_with("```")
2637                                                || trimmed.starts_with("~~~")
2638                                                || trimmed.starts_with("---")
2639                                                || trimmed.starts_with("***")
2640                                                || trimmed.starts_with("___")
2641                                                || trimmed.starts_with(">")
2642                                                || trimmed.contains('|') // Tables
2643                                                || between_line.heading.is_some()
2644                                        } else {
2645                                            false
2646                                        }
2647                                    });
2648                                    found_continuation = !has_structural_separators;
2649                                }
2650                            }
2651                        }
2652                    }
2653
2654                    if found_continuation {
2655                        // Include the blank line in the block
2656                        block.end_line = line_num;
2657                    } else {
2658                        // Blank line ends the list - don't include it
2659                        list_blocks.push(block.clone());
2660                        current_block = None;
2661                    }
2662                } else {
2663                    // Check for lazy continuation - non-indented line immediately after a list item
2664                    // But only if the line has sufficient indentation for the list type
2665                    let min_required_indent = if block.is_ordered {
2666                        current_indent_level + last_marker_width
2667                    } else {
2668                        current_indent_level + 2
2669                    };
2670
2671                    // For lazy continuation to apply, the line must either:
2672                    // 1. Have no indentation (true lazy continuation)
2673                    // 2. Have sufficient indentation for the list type
2674                    // BUT structural separators (headings, code blocks, etc.) should never be lazy continuations
2675                    let line_content = line_info.content(content).trim();
2676                    let is_structural_separator = line_info.heading.is_some()
2677                        || line_content.starts_with("```")
2678                        || line_content.starts_with("~~~")
2679                        || line_content.starts_with("---")
2680                        || line_content.starts_with("***")
2681                        || line_content.starts_with("___")
2682                        || line_content.starts_with(">")
2683                        || (line_content.contains('|')
2684                            && !line_content.contains("](")
2685                            && !line_content.contains("http")
2686                            && (line_content.matches('|').count() > 1
2687                                || line_content.starts_with('|')
2688                                || line_content.ends_with('|'))); // Tables
2689
2690                    // Allow lazy continuation if we're still within the same list block
2691                    // (not just immediately after a list item)
2692                    let is_lazy_continuation = !is_structural_separator
2693                        && !line_info.is_blank
2694                        && (line_info.indent == 0 || line_info.indent >= min_required_indent);
2695
2696                    if is_lazy_continuation {
2697                        // Additional check: if the line starts with uppercase and looks like a new sentence,
2698                        // it's probably not a continuation
2699                        let content_to_check = if !blockquote_prefix.is_empty() {
2700                            // Strip blockquote prefix to check the actual content
2701                            line_info
2702                                .content(content)
2703                                .strip_prefix(&blockquote_prefix)
2704                                .unwrap_or(line_info.content(content))
2705                                .trim()
2706                        } else {
2707                            line_info.content(content).trim()
2708                        };
2709
2710                        let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
2711
2712                        // If it starts with uppercase and the previous line ended with punctuation,
2713                        // it's likely a new paragraph, not a continuation
2714                        if starts_with_uppercase && last_list_item_line > 0 {
2715                            // This looks like a new paragraph
2716                            list_blocks.push(block.clone());
2717                            current_block = None;
2718                        } else {
2719                            // This is a lazy continuation line
2720                            block.end_line = line_num;
2721                        }
2722                    } else {
2723                        // Non-indented, non-blank line that's not a lazy continuation - end the block
2724                        list_blocks.push(block.clone());
2725                        current_block = None;
2726                    }
2727                }
2728            }
2729        }
2730
2731        // Don't forget the last block
2732        if let Some(block) = current_block {
2733            list_blocks.push(block);
2734        }
2735
2736        // Merge adjacent blocks that should be one
2737        merge_adjacent_list_blocks(content, &mut list_blocks, lines);
2738
2739        list_blocks
2740    }
2741
2742    /// Compute character frequency for fast content analysis
2743    fn compute_char_frequency(content: &str) -> CharFrequency {
2744        let mut frequency = CharFrequency::default();
2745
2746        for ch in content.chars() {
2747            match ch {
2748                '#' => frequency.hash_count += 1,
2749                '*' => frequency.asterisk_count += 1,
2750                '_' => frequency.underscore_count += 1,
2751                '-' => frequency.hyphen_count += 1,
2752                '+' => frequency.plus_count += 1,
2753                '>' => frequency.gt_count += 1,
2754                '|' => frequency.pipe_count += 1,
2755                '[' => frequency.bracket_count += 1,
2756                '`' => frequency.backtick_count += 1,
2757                '<' => frequency.lt_count += 1,
2758                '!' => frequency.exclamation_count += 1,
2759                '\n' => frequency.newline_count += 1,
2760                _ => {}
2761            }
2762        }
2763
2764        frequency
2765    }
2766
2767    /// Parse HTML tags in the content
2768    fn parse_html_tags(
2769        content: &str,
2770        lines: &[LineInfo],
2771        code_blocks: &[(usize, usize)],
2772        flavor: MarkdownFlavor,
2773    ) -> Vec<HtmlTag> {
2774        static HTML_TAG_REGEX: LazyLock<regex::Regex> =
2775            LazyLock::new(|| regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap());
2776
2777        let mut html_tags = Vec::with_capacity(content.matches('<').count());
2778
2779        for cap in HTML_TAG_REGEX.captures_iter(content) {
2780            let full_match = cap.get(0).unwrap();
2781            let match_start = full_match.start();
2782            let match_end = full_match.end();
2783
2784            // Skip if in code block
2785            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2786                continue;
2787            }
2788
2789            let is_closing = !cap.get(1).unwrap().as_str().is_empty();
2790            let tag_name_original = cap.get(2).unwrap().as_str();
2791            let tag_name = tag_name_original.to_lowercase();
2792            let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
2793
2794            // Skip JSX components in MDX files (tags starting with uppercase letter)
2795            // JSX components like <Chart />, <MyComponent> should not be treated as HTML
2796            if flavor.supports_jsx() && tag_name_original.chars().next().is_some_and(|c| c.is_uppercase()) {
2797                continue;
2798            }
2799
2800            // Find which line this tag is on
2801            let mut line_num = 1;
2802            let mut col_start = match_start;
2803            let mut col_end = match_end;
2804            for (idx, line_info) in lines.iter().enumerate() {
2805                if match_start >= line_info.byte_offset {
2806                    line_num = idx + 1;
2807                    col_start = match_start - line_info.byte_offset;
2808                    col_end = match_end - line_info.byte_offset;
2809                } else {
2810                    break;
2811                }
2812            }
2813
2814            html_tags.push(HtmlTag {
2815                line: line_num,
2816                start_col: col_start,
2817                end_col: col_end,
2818                byte_offset: match_start,
2819                byte_end: match_end,
2820                tag_name,
2821                is_closing,
2822                is_self_closing,
2823                raw_content: full_match.as_str().to_string(),
2824            });
2825        }
2826
2827        html_tags
2828    }
2829
2830    /// Parse emphasis spans in the content
2831    fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
2832        static EMPHASIS_REGEX: LazyLock<regex::Regex> =
2833            LazyLock::new(|| regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap());
2834
2835        let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
2836
2837        for cap in EMPHASIS_REGEX.captures_iter(content) {
2838            let full_match = cap.get(0).unwrap();
2839            let match_start = full_match.start();
2840            let match_end = full_match.end();
2841
2842            // Skip if in code block
2843            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2844                continue;
2845            }
2846
2847            let opening_markers = cap.get(1).unwrap().as_str();
2848            let content_part = cap.get(2).unwrap().as_str();
2849            let closing_markers = cap.get(3).unwrap().as_str();
2850
2851            // Validate matching markers
2852            if opening_markers.chars().next() != closing_markers.chars().next()
2853                || opening_markers.len() != closing_markers.len()
2854            {
2855                continue;
2856            }
2857
2858            let marker = opening_markers.chars().next().unwrap();
2859            let marker_count = opening_markers.len();
2860
2861            // Find which line this emphasis is on
2862            let mut line_num = 1;
2863            let mut col_start = match_start;
2864            let mut col_end = match_end;
2865            for (idx, line_info) in lines.iter().enumerate() {
2866                if match_start >= line_info.byte_offset {
2867                    line_num = idx + 1;
2868                    col_start = match_start - line_info.byte_offset;
2869                    col_end = match_end - line_info.byte_offset;
2870                } else {
2871                    break;
2872                }
2873            }
2874
2875            emphasis_spans.push(EmphasisSpan {
2876                line: line_num,
2877                start_col: col_start,
2878                end_col: col_end,
2879                byte_offset: match_start,
2880                byte_end: match_end,
2881                marker,
2882                marker_count,
2883                content: content_part.to_string(),
2884            });
2885        }
2886
2887        emphasis_spans
2888    }
2889
2890    /// Parse table rows in the content
2891    fn parse_table_rows(content: &str, lines: &[LineInfo]) -> Vec<TableRow> {
2892        let mut table_rows = Vec::with_capacity(lines.len() / 20);
2893
2894        for (line_idx, line_info) in lines.iter().enumerate() {
2895            // Skip lines in code blocks or blank lines
2896            if line_info.in_code_block || line_info.is_blank {
2897                continue;
2898            }
2899
2900            let line = line_info.content(content);
2901            let line_num = line_idx + 1;
2902
2903            // Check if this line contains pipes (potential table row)
2904            if !line.contains('|') {
2905                continue;
2906            }
2907
2908            // Count columns by splitting on pipes
2909            let parts: Vec<&str> = line.split('|').collect();
2910            let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
2911
2912            // Check if this is a separator row
2913            let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
2914            let mut column_alignments = Vec::new();
2915
2916            if is_separator {
2917                for part in &parts[1..parts.len() - 1] {
2918                    // Skip first and last empty parts
2919                    let trimmed = part.trim();
2920                    let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
2921                        "center".to_string()
2922                    } else if trimmed.ends_with(':') {
2923                        "right".to_string()
2924                    } else if trimmed.starts_with(':') {
2925                        "left".to_string()
2926                    } else {
2927                        "none".to_string()
2928                    };
2929                    column_alignments.push(alignment);
2930                }
2931            }
2932
2933            table_rows.push(TableRow {
2934                line: line_num,
2935                is_separator,
2936                column_count,
2937                column_alignments,
2938            });
2939        }
2940
2941        table_rows
2942    }
2943
2944    /// Parse bare URLs and emails in the content
2945    fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
2946        let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
2947
2948        // Check for bare URLs (not in angle brackets or markdown links)
2949        for cap in BARE_URL_PATTERN.captures_iter(content) {
2950            let full_match = cap.get(0).unwrap();
2951            let match_start = full_match.start();
2952            let match_end = full_match.end();
2953
2954            // Skip if in code block
2955            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2956                continue;
2957            }
2958
2959            // Skip if already in angle brackets or markdown links
2960            let preceding_char = if match_start > 0 {
2961                content.chars().nth(match_start - 1)
2962            } else {
2963                None
2964            };
2965            let following_char = content.chars().nth(match_end);
2966
2967            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2968                continue;
2969            }
2970            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2971                continue;
2972            }
2973
2974            let url = full_match.as_str();
2975            let url_type = if url.starts_with("https://") {
2976                "https"
2977            } else if url.starts_with("http://") {
2978                "http"
2979            } else if url.starts_with("ftp://") {
2980                "ftp"
2981            } else {
2982                "other"
2983            };
2984
2985            // Find which line this URL is on
2986            let mut line_num = 1;
2987            let mut col_start = match_start;
2988            let mut col_end = match_end;
2989            for (idx, line_info) in lines.iter().enumerate() {
2990                if match_start >= line_info.byte_offset {
2991                    line_num = idx + 1;
2992                    col_start = match_start - line_info.byte_offset;
2993                    col_end = match_end - line_info.byte_offset;
2994                } else {
2995                    break;
2996                }
2997            }
2998
2999            bare_urls.push(BareUrl {
3000                line: line_num,
3001                start_col: col_start,
3002                end_col: col_end,
3003                byte_offset: match_start,
3004                byte_end: match_end,
3005                url: url.to_string(),
3006                url_type: url_type.to_string(),
3007            });
3008        }
3009
3010        // Check for bare email addresses
3011        for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
3012            let full_match = cap.get(0).unwrap();
3013            let match_start = full_match.start();
3014            let match_end = full_match.end();
3015
3016            // Skip if in code block
3017            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3018                continue;
3019            }
3020
3021            // Skip if already in angle brackets or markdown links
3022            let preceding_char = if match_start > 0 {
3023                content.chars().nth(match_start - 1)
3024            } else {
3025                None
3026            };
3027            let following_char = content.chars().nth(match_end);
3028
3029            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
3030                continue;
3031            }
3032            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
3033                continue;
3034            }
3035
3036            let email = full_match.as_str();
3037
3038            // Find which line this email is on
3039            let mut line_num = 1;
3040            let mut col_start = match_start;
3041            let mut col_end = match_end;
3042            for (idx, line_info) in lines.iter().enumerate() {
3043                if match_start >= line_info.byte_offset {
3044                    line_num = idx + 1;
3045                    col_start = match_start - line_info.byte_offset;
3046                    col_end = match_end - line_info.byte_offset;
3047                } else {
3048                    break;
3049                }
3050            }
3051
3052            bare_urls.push(BareUrl {
3053                line: line_num,
3054                start_col: col_start,
3055                end_col: col_end,
3056                byte_offset: match_start,
3057                byte_end: match_end,
3058                url: email.to_string(),
3059                url_type: "email".to_string(),
3060            });
3061        }
3062
3063        bare_urls
3064    }
3065}
3066
3067/// Merge adjacent list blocks that should be treated as one
3068fn merge_adjacent_list_blocks(content: &str, list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
3069    if list_blocks.len() < 2 {
3070        return;
3071    }
3072
3073    let mut merger = ListBlockMerger::new(content, lines);
3074    *list_blocks = merger.merge(list_blocks);
3075}
3076
3077/// Helper struct to manage the complex logic of merging list blocks
3078struct ListBlockMerger<'a> {
3079    content: &'a str,
3080    lines: &'a [LineInfo],
3081}
3082
3083impl<'a> ListBlockMerger<'a> {
3084    fn new(content: &'a str, lines: &'a [LineInfo]) -> Self {
3085        Self { content, lines }
3086    }
3087
3088    fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
3089        let mut merged = Vec::with_capacity(list_blocks.len());
3090        let mut current = list_blocks[0].clone();
3091
3092        for next in list_blocks.iter().skip(1) {
3093            if self.should_merge_blocks(&current, next) {
3094                current = self.merge_two_blocks(current, next);
3095            } else {
3096                merged.push(current);
3097                current = next.clone();
3098            }
3099        }
3100
3101        merged.push(current);
3102        merged
3103    }
3104
3105    /// Determine if two adjacent list blocks should be merged
3106    fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
3107        // Basic compatibility checks
3108        if !self.blocks_are_compatible(current, next) {
3109            return false;
3110        }
3111
3112        // Check spacing and content between blocks
3113        let spacing = self.analyze_spacing_between(current, next);
3114        match spacing {
3115            BlockSpacing::Consecutive => true,
3116            BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
3117            BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
3118                self.can_merge_with_content_between(current, next)
3119            }
3120        }
3121    }
3122
3123    /// Check if blocks have compatible structure for merging
3124    fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
3125        current.is_ordered == next.is_ordered
3126            && current.blockquote_prefix == next.blockquote_prefix
3127            && current.nesting_level == next.nesting_level
3128    }
3129
3130    /// Analyze the spacing between two list blocks
3131    fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
3132        let gap = next.start_line - current.end_line;
3133
3134        match gap {
3135            1 => BlockSpacing::Consecutive,
3136            2 => BlockSpacing::SingleBlank,
3137            _ if gap > 2 => {
3138                if self.has_only_blank_lines_between(current, next) {
3139                    BlockSpacing::MultipleBlanks
3140                } else {
3141                    BlockSpacing::ContentBetween
3142                }
3143            }
3144            _ => BlockSpacing::Consecutive, // gap == 0, overlapping (shouldn't happen)
3145        }
3146    }
3147
3148    /// Check if unordered lists can be merged with a single blank line between
3149    fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3150        // Check if there are structural separators between the blocks
3151        // If has_meaningful_content_between returns true, it means there are structural separators
3152        if has_meaningful_content_between(self.content, current, next, self.lines) {
3153            return false; // Structural separators prevent merging
3154        }
3155
3156        // Only merge unordered lists with same marker across single blank
3157        !current.is_ordered && current.marker == next.marker
3158    }
3159
3160    /// Check if ordered lists can be merged when there's content between them
3161    fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3162        // Do not merge lists if there are structural separators between them
3163        if has_meaningful_content_between(self.content, current, next, self.lines) {
3164            return false; // Structural separators prevent merging
3165        }
3166
3167        // Only consider merging ordered lists if there's no structural content between
3168        current.is_ordered && next.is_ordered
3169    }
3170
3171    /// Check if there are only blank lines between blocks
3172    fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3173        for line_num in (current.end_line + 1)..next.start_line {
3174            if let Some(line_info) = self.lines.get(line_num - 1)
3175                && !line_info.content(self.content).trim().is_empty()
3176            {
3177                return false;
3178            }
3179        }
3180        true
3181    }
3182
3183    /// Merge two compatible list blocks into one
3184    fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
3185        current.end_line = next.end_line;
3186        current.item_lines.extend_from_slice(&next.item_lines);
3187
3188        // Update max marker width
3189        current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
3190
3191        // Handle marker consistency for unordered lists
3192        if !current.is_ordered && self.markers_differ(&current, next) {
3193            current.marker = None; // Mixed markers
3194        }
3195
3196        current
3197    }
3198
3199    /// Check if two blocks have different markers
3200    fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
3201        current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
3202    }
3203}
3204
/// Types of spacing between list blocks, as classified by
/// `ListBlockMerger::analyze_spacing_between` from the distance between the end line
/// of one block and the start line of the next.
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    /// No gap between blocks: the next block starts on the line right after the
    /// previous one ends (a distance of 0 is also reported as this variant).
    Consecutive,
    /// Exactly one line lies between the blocks. The classification does not itself
    /// verify that the line is blank.
    SingleBlank,
    /// More than one line lies between the blocks and all of them are blank.
    MultipleBlanks,
    /// More than one line lies between the blocks and at least one is not blank.
    ContentBetween,
}
3213
3214/// Check if there's meaningful content (not just blank lines) between two list blocks
3215fn has_meaningful_content_between(content: &str, current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
3216    // Check lines between current.end_line and next.start_line
3217    for line_num in (current.end_line + 1)..next.start_line {
3218        if let Some(line_info) = lines.get(line_num - 1) {
3219            // Convert to 0-indexed
3220            let trimmed = line_info.content(content).trim();
3221
3222            // Skip empty lines
3223            if trimmed.is_empty() {
3224                continue;
3225            }
3226
3227            // Check for structural separators that should separate lists (CommonMark compliant)
3228
3229            // Headings separate lists
3230            if line_info.heading.is_some() {
3231                return true; // Has meaningful content - headings separate lists
3232            }
3233
3234            // Horizontal rules separate lists (---, ***, ___)
3235            if is_horizontal_rule(trimmed) {
3236                return true; // Has meaningful content - horizontal rules separate lists
3237            }
3238
3239            // Tables separate lists (lines containing | but not in URLs or code)
3240            // Simple heuristic: tables typically have | at start/end or multiple |
3241            if trimmed.contains('|') && trimmed.len() > 1 {
3242                // Don't treat URLs with | as tables
3243                if !trimmed.contains("](") && !trimmed.contains("http") {
3244                    // More robust check: tables usually have multiple | or | at edges
3245                    let pipe_count = trimmed.matches('|').count();
3246                    if pipe_count > 1 || trimmed.starts_with('|') || trimmed.ends_with('|') {
3247                        return true; // Has meaningful content - tables separate lists
3248                    }
3249                }
3250            }
3251
3252            // Blockquotes separate lists
3253            if trimmed.starts_with('>') {
3254                return true; // Has meaningful content - blockquotes separate lists
3255            }
3256
3257            // Code block fences separate lists (unless properly indented as list content)
3258            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
3259                let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
3260
3261                // Check if this code block is properly indented as list continuation
3262                let min_continuation_indent = if current.is_ordered {
3263                    current.nesting_level + current.max_marker_width + 1 // +1 for space after marker
3264                } else {
3265                    current.nesting_level + 2
3266                };
3267
3268                if line_indent < min_continuation_indent {
3269                    // This is a standalone code block that separates lists
3270                    return true; // Has meaningful content - standalone code blocks separate lists
3271                }
3272            }
3273
3274            // Check if this line has proper indentation for list continuation
3275            let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
3276
3277            // Calculate minimum indentation needed to be list continuation
3278            let min_indent = if current.is_ordered {
3279                current.nesting_level + current.max_marker_width
3280            } else {
3281                current.nesting_level + 2
3282            };
3283
3284            // If the line is not indented enough to be list continuation, it's meaningful content
3285            if line_indent < min_indent {
3286                return true; // Has meaningful content - content not indented as list continuation
3287            }
3288
3289            // If we reach here, the line is properly indented as list continuation
3290            // Continue checking other lines
3291        }
3292    }
3293
3294    // Only blank lines or properly indented list continuation content between blocks
3295    false
3296}
3297
/// Decide whether an already-trimmed line is a horizontal rule.
///
/// The line qualifies when it starts with `-`, `*` or `_`, contains at least three
/// of that same character, and holds nothing else except spaces and tabs
/// (so `---`, `* * *` and `_____` match, while `--`, `-*-` and `--- x` do not).
fn is_horizontal_rule(trimmed: &str) -> bool {
    let Some(rule_char) = trimmed.chars().next().filter(|c| matches!(c, '-' | '*' | '_')) else {
        return false;
    };

    // Single pass: count the rule character, tolerate interior whitespace, and bail
    // out on anything else. Fewer than three bytes can never reach a count of three.
    let mut rule_chars = 0usize;
    for ch in trimmed.chars() {
        match ch {
            c if c == rule_char => rule_chars += 1,
            ' ' | '\t' => {}
            _ => return false,
        }
    }

    rule_chars >= 3
}
3321
/// Unit tests for `LintContext`: offset/line mapping, per-line metadata, list item
/// detection and MDX ESM block detection.
#[cfg(test)]
mod tests {
    use super::*;

    // Empty input: a single line offset at 0, no line metadata, and offset 0 maps to (1, 1).
    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.lines.len(), 0);
    }

    // Input without a newline: everything maps onto line 1 with 1-based columns.
    #[test]
    fn test_single_line() {
        let ctx = LintContext::new("# Hello", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "# Hello");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(3), (1, 4));
    }

    // `line_offsets` holds the byte offset at which each line starts, including the blank line.
    #[test]
    fn test_multi_line() {
        let content = "# Title\n\nSecond line\nThird line";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);
        // Test offset to line/col
        assert_eq!(ctx.offset_to_line_col(0), (1, 1)); // start
        assert_eq!(ctx.offset_to_line_col(8), (2, 1)); // start of blank line
        assert_eq!(ctx.offset_to_line_col(9), (3, 1)); // start of 'Second line'
        assert_eq!(ctx.offset_to_line_col(15), (3, 7)); // middle of 'Second line'
        assert_eq!(ctx.offset_to_line_col(21), (4, 1)); // start of 'Third line'
    }

    // Per-line metadata (content slice, byte offset, indent, blank flag) and the
    // 1-based lookup helpers `line_to_byte_offset` / `line_info`.
    #[test]
    fn test_line_info() {
        let content = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        // Test line info
        assert_eq!(ctx.lines.len(), 7);

        // Line 1: "# Title"
        let line1 = &ctx.lines[0];
        assert_eq!(line1.content(ctx.content), "# Title");
        assert_eq!(line1.byte_offset, 0);
        assert_eq!(line1.indent, 0);
        assert!(!line1.is_blank);
        assert!(!line1.in_code_block);
        assert!(line1.list_item.is_none());

        // Line 2: "    indented"
        let line2 = &ctx.lines[1];
        assert_eq!(line2.content(ctx.content), "    indented");
        assert_eq!(line2.byte_offset, 8);
        assert_eq!(line2.indent, 4);
        assert!(!line2.is_blank);

        // Line 3: "" (blank)
        let line3 = &ctx.lines[2];
        assert_eq!(line3.content(ctx.content), "");
        assert!(line3.is_blank);

        // Test helper methods
        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
        assert_eq!(ctx.line_info(1).map(|l| l.indent), Some(0));
        assert_eq!(ctx.line_info(2).map(|l| l.indent), Some(4));
    }

    // List item metadata for unordered, nested and ordered markers, and its absence
    // on a plain paragraph line.
    #[test]
    fn test_list_item_detection() {
        let content = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        // Line 1: "- Unordered item"
        let line1 = &ctx.lines[0];
        assert!(line1.list_item.is_some());
        let list1 = line1.list_item.as_ref().unwrap();
        assert_eq!(list1.marker, "-");
        assert!(!list1.is_ordered);
        assert_eq!(list1.marker_column, 0);
        assert_eq!(list1.content_column, 2);

        // Line 2: "  * Nested item"
        let line2 = &ctx.lines[1];
        assert!(line2.list_item.is_some());
        let list2 = line2.list_item.as_ref().unwrap();
        assert_eq!(list2.marker, "*");
        assert_eq!(list2.marker_column, 2);

        // Line 3: "1. Ordered item"
        let line3 = &ctx.lines[2];
        assert!(line3.list_item.is_some());
        let list3 = line3.list_item.as_ref().unwrap();
        assert_eq!(list3.marker, "1.");
        assert!(list3.is_ordered);
        assert_eq!(list3.number, Some(1));

        // Line 6: "Not a list"
        let line6 = &ctx.lines[5];
        assert!(line6.list_item.is_none());
    }

    // Offsets just past the last character of a line still map to that line, one
    // column beyond its end; this includes the offset at the very end of the content.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let content = "a\nb\nc";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        // line_offsets: [0, 2, 4]
        assert_eq!(ctx.offset_to_line_col(0), (1, 1)); // 'a'
        assert_eq!(ctx.offset_to_line_col(1), (1, 2)); // after 'a'
        assert_eq!(ctx.offset_to_line_col(2), (2, 1)); // 'b'
        assert_eq!(ctx.offset_to_line_col(3), (2, 2)); // after 'b'
        assert_eq!(ctx.offset_to_line_col(4), (3, 1)); // 'c'
        assert_eq!(ctx.offset_to_line_col(5), (3, 2)); // after 'c'
    }

    // In the MDX flavor, the leading `import` / `export` lines are flagged as ESM
    // blocks, while the blank, heading and text lines that follow are not.
    #[test]
    fn test_mdx_esm_blocks() {
        let content = r##"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall

In {year}, the snowfall was above average.
It was followed by a warm spring which caused
flood conditions in many of the nearby rivers.

<Chart color="#fcb32c" year={year} />
"##;

        let ctx = LintContext::new(content, MarkdownFlavor::MDX);

        // Check that lines 1 and 2 are marked as ESM blocks
        assert_eq!(ctx.lines.len(), 10);
        assert!(ctx.lines[0].in_esm_block, "Line 1 (import) should be in_esm_block");
        assert!(ctx.lines[1].in_esm_block, "Line 2 (export) should be in_esm_block");
        assert!(!ctx.lines[2].in_esm_block, "Line 3 (blank) should NOT be in_esm_block");
        assert!(
            !ctx.lines[3].in_esm_block,
            "Line 4 (heading) should NOT be in_esm_block"
        );
        assert!(!ctx.lines[4].in_esm_block, "Line 5 (blank) should NOT be in_esm_block");
        assert!(!ctx.lines[5].in_esm_block, "Line 6 (text) should NOT be in_esm_block");
    }

    // The same `import` / `export` lines must not be flagged when the flavor is Standard.
    #[test]
    fn test_mdx_esm_blocks_not_detected_in_standard_flavor() {
        let content = r#"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall
"#;

        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        // ESM blocks should NOT be detected in Standard flavor
        assert!(
            !ctx.lines[0].in_esm_block,
            "Line 1 should NOT be in_esm_block in Standard flavor"
        );
        assert!(
            !ctx.lines[1].in_esm_block,
            "Line 2 should NOT be in_esm_block in Standard flavor"
        );
    }
}
rumdl_lib/lint_context.rs

rumdl_lib/
lint_context.rs