rumdl_lib/
lint_context.rs

1use crate::config::MarkdownFlavor;
2use crate::rules::front_matter_utils::FrontMatterUtils;
3use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
4use pulldown_cmark::{BrokenLink, Event, LinkType, Options, Parser, Tag, TagEnd};
5use regex::Regex;
6use std::sync::LazyLock;
7
// Comprehensive link pattern that captures both inline links `[text](url "title")`
// and reference links `[text][ref]`.
// Flags: (?s) makes `.` match newlines; (?x) enables verbose mode, so whitespace and
// `# ...` comments inside the pattern are ignored by the regex engine.
// The link-text group allows a single level of nested `[...]` and backslash escapes.
static LINK_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
        \[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]          # Link text in group 1 (handles nested brackets)
        (?:
            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
            |
            \[([^\]]*)\]      # Reference ID in group 6
        )"#
    ).unwrap()
});
21
// Image pattern: same shape and capture groups as the link pattern, but the match
// must start with a `!` prefix (`![alt](url "title")` or `![alt][ref]`).
// Flags: (?s) makes `.` match newlines; (?x) enables verbose mode, so whitespace and
// `# ...` comments inside the pattern are ignored by the regex engine.
static IMAGE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
        !\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]         # Alt text in group 1 (handles nested brackets)
        (?:
            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
            |
            \[([^\]]*)\]      # Reference ID in group 6
        )"#
    ).unwrap()
});
35
// Reference definition pattern: `[id]: url "title"` on a line of its own.
// (?m) makes `^`/`$` match at line boundaries; up to 3 leading spaces are allowed.
// Group 1 = reference ID, group 2 = URL (no whitespace), group 3/4 = optional title
// in double/single quotes.
static REF_DEF_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#).unwrap());
39
// Pattern for bare URLs with an http, https, or ftp scheme (scheme is group 1).
// The URL body stops at whitespace, angle/square/round brackets, backslashes,
// quotes, and backticks; optional port, path, query, and fragment parts follow.
static BARE_URL_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
    ).unwrap()
});
46
// Pattern for email addresses: `local@domain.tld`, where the top-level domain is
// at least two ASCII letters. No capture groups; use the whole match.
static BARE_EMAIL_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap());
50
// Pattern for blockquote prefix in parse_list_blocks.
// Group 1 captures the whole prefix: leading whitespace, one or more `>` markers,
// and any whitespace that follows them.
static BLOCKQUOTE_PREFIX_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*>+\s*)").unwrap());
53
/// Pre-computed information about a single line of the document.
///
/// One entry exists per line; rules read these flags instead of re-deriving
/// the surrounding block context themselves.
#[derive(Debug, Clone)]
pub struct LineInfo {
    /// The actual line content (without the trailing newline)
    pub content: String,
    /// Byte offset where this line starts in the document
    pub byte_offset: usize,
    /// Number of leading spaces/tabs
    pub indent: usize,
    /// Whether the line is blank (empty or only whitespace)
    pub is_blank: bool,
    /// Whether this line is inside a code block
    pub in_code_block: bool,
    /// Whether this line is inside front matter
    pub in_front_matter: bool,
    /// Whether this line is inside an HTML block
    pub in_html_block: bool,
    /// Whether this line is inside an HTML comment
    pub in_html_comment: bool,
    /// List item information if this line starts a list item, `None` otherwise
    pub list_item: Option<ListItemInfo>,
    /// Heading information if this line is a heading, `None` otherwise
    pub heading: Option<HeadingInfo>,
    /// Blockquote information if this line is a blockquote, `None` otherwise
    pub blockquote: Option<BlockquoteInfo>,
    /// Whether this line is inside a mkdocstrings autodoc block
    pub in_mkdocstrings: bool,
    /// Whether this line is part of an ESM import/export block (MDX only)
    pub in_esm_block: bool,
}
84
/// Information about a list item, attached to the line that starts it.
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// The marker used (*, -, +, or number with . or ))
    pub marker: String,
    /// Whether it's ordered (true) or unordered (false)
    pub is_ordered: bool,
    /// The number for ordered lists (`None` when no number applies)
    pub number: Option<usize>,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where content after marker starts
    pub content_column: usize,
}
99
/// Heading style type
///
/// A plain, fieldless enum, so it implements `Eq` and `Hash` alongside `PartialEq`
/// and can be used as a key in hashed collections.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum HeadingStyle {
    /// ATX style heading (# Heading)
    ATX,
    /// Setext style heading with = underline
    Setext1,
    /// Setext style heading with - underline
    Setext2,
}
110
/// Parsed link information for one link found in the document.
#[derive(Debug, Clone)]
pub struct ParsedLink {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document where the link starts
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Link text
    pub text: String,
    /// Link URL or reference
    pub url: String,
    /// Whether this is a reference link [text][ref] vs inline [text](url)
    pub is_reference: bool,
    /// Reference ID for reference links (`None` for inline links)
    pub reference_id: Option<String>,
    /// Link type from pulldown-cmark
    pub link_type: LinkType,
}
135
/// Information about a broken link reported by pulldown-cmark,
/// i.e. a reference whose definition could not be resolved.
#[derive(Debug, Clone)]
pub struct BrokenLinkInfo {
    /// The reference text that couldn't be resolved
    pub reference: String,
    /// Byte span in the source document
    pub span: std::ops::Range<usize>,
}
144
/// Parsed image information for one image found in the document.
#[derive(Debug, Clone)]
pub struct ParsedImage {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document where the image starts
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Alt text
    pub alt_text: String,
    /// Image URL or reference
    pub url: String,
    /// Whether this is a reference image ![alt][ref] vs inline ![alt](url)
    pub is_reference: bool,
    /// Reference ID for reference images (`None` for inline images)
    pub reference_id: Option<String>,
    /// Link type from pulldown-cmark
    pub link_type: LinkType,
}
169
/// Reference definition [ref]: url "title"
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// Line number (1-indexed)
    pub line: usize,
    /// Reference ID (normalized to lowercase, so lookups must lowercase their key too)
    pub id: String,
    /// URL the reference points to
    pub url: String,
    /// Optional title
    pub title: Option<String>,
    /// Byte offset where the reference definition starts
    pub byte_offset: usize,
    /// Byte offset where the reference definition ends
    pub byte_end: usize,
}
186
/// Parsed code span information for one inline code span.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document where the span starts
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Number of backticks used (1, 2, 3, etc.)
    pub backtick_count: usize,
    /// Content inside the code span (without backticks)
    pub content: String,
}
205
/// Information about a heading, attached to the line that holds it.
#[derive(Debug, Clone)]
pub struct HeadingInfo {
    /// Heading level (1-6 for ATX, 1-2 for Setext)
    pub level: u8,
    /// Style of heading
    pub style: HeadingStyle,
    /// The heading marker (# characters or underline)
    pub marker: String,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where heading text starts
    pub content_column: usize,
    /// The heading text (without markers and without custom ID syntax)
    pub text: String,
    /// Custom header ID if present (e.g., from {#custom-id} syntax)
    pub custom_id: Option<String>,
    /// Original heading text including custom ID syntax
    pub raw_text: String,
    /// Whether it has a closing sequence (for ATX)
    pub has_closing_sequence: bool,
    /// The closing sequence if present
    pub closing_sequence: String,
}
230
/// Information about a blockquote line, attached to the line that carries the marker.
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Nesting level (1 for >, 2 for >>, etc.)
    pub nesting_level: usize,
    /// The indentation before the blockquote marker
    pub indent: String,
    /// Column where the first > starts (0-based)
    pub marker_column: usize,
    /// The blockquote prefix (e.g., "> ", ">> ", etc.)
    pub prefix: String,
    /// Content after the blockquote marker(s)
    pub content: String,
    /// Whether the line has no space after the marker
    pub has_no_space_after_marker: bool,
    /// Whether the line has multiple spaces after the marker
    pub has_multiple_spaces_after_marker: bool,
    /// Whether this is an empty blockquote line needing MD028 fix
    pub needs_md028_fix: bool,
}
251
/// Information about a list block (a run of lines forming one list).
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// Line number where the list starts (1-indexed)
    pub start_line: usize,
    /// Line number where the list ends (1-indexed, inclusive)
    pub end_line: usize,
    /// Whether it's ordered or unordered
    pub is_ordered: bool,
    /// The consistent marker for unordered lists (if any)
    pub marker: Option<String>,
    /// Blockquote prefix for this list (empty if not in blockquote)
    pub blockquote_prefix: String,
    /// Lines that are list items within this block
    pub item_lines: Vec<usize>,
    /// Nesting level (0 for top-level lists)
    pub nesting_level: usize,
    /// Maximum marker width seen in this block (e.g., 3 for "1. ", 4 for "10. ")
    pub max_marker_width: usize,
}
272
273use std::sync::{Arc, Mutex};
274
/// Character frequency data for fast content analysis.
///
/// Each field counts occurrences of one Markdown-significant character, with the
/// constructs that character can introduce listed in parentheses. `Default`
/// yields all-zero counts.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Count of # characters (headings)
    pub hash_count: usize,
    /// Count of * characters (emphasis, lists, horizontal rules)
    pub asterisk_count: usize,
    /// Count of _ characters (emphasis, horizontal rules)
    pub underscore_count: usize,
    /// Count of - characters (lists, horizontal rules, setext headings)
    pub hyphen_count: usize,
    /// Count of + characters (lists)
    pub plus_count: usize,
    /// Count of > characters (blockquotes)
    pub gt_count: usize,
    /// Count of | characters (tables)
    pub pipe_count: usize,
    /// Count of [ characters (links, images)
    pub bracket_count: usize,
    /// Count of ` characters (code spans, code blocks)
    pub backtick_count: usize,
    /// Count of < characters (HTML tags, autolinks)
    pub lt_count: usize,
    /// Count of ! characters (images)
    pub exclamation_count: usize,
    /// Count of newline characters
    pub newline_count: usize,
}
303
/// Pre-parsed HTML tag information for one tag found in the document.
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document where the tag starts
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Tag name (e.g., "div", "img", "br")
    pub tag_name: String,
    /// Whether it's a closing tag (`</tag>`)
    pub is_closing: bool,
    /// Whether it's self-closing (`<tag />`)
    pub is_self_closing: bool,
    /// Raw tag content
    pub raw_content: String,
}
326
/// Pre-parsed emphasis span information for one emphasized run of text.
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document where the span starts
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Type of emphasis ('*' or '_')
    pub marker: char,
    /// Number of markers (1 for italic, 2 for bold, 3+ for bold+italic)
    pub marker_count: usize,
    /// Content inside the emphasis
    pub content: String,
}
347
/// Pre-parsed table row information for one table line.
#[derive(Debug, Clone)]
pub struct TableRow {
    /// Line number (1-indexed)
    pub line: usize,
    /// Whether this is a separator row (contains only |, -, :, and spaces)
    pub is_separator: bool,
    /// Number of columns (pipe-separated cells)
    pub column_count: usize,
    /// Alignment info from separator row; each entry is one of
    /// "left", "center", "right", or "none"
    pub column_alignments: Vec<String>, // "left", "center", "right", "none"
}
360
/// Pre-parsed bare URL information (URLs and emails that are not inside links).
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document where the URL starts
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// The URL string
    pub url: String,
    /// Type of URL ("http", "https", "ftp", "email")
    pub url_type: String,
}
379
/// Shared, pre-computed view of one Markdown document.
///
/// `LintContext::new` parses the document once; the results are stored here so
/// that individual rules can read them instead of re-scanning the content.
/// Some analyses are stored behind a `Mutex<Option<Arc<..>>>` and are only
/// computed when their accessor is first called.
pub struct LintContext<'a> {
    /// The full document text this context was built from
    pub content: &'a str,
    /// Byte offset at which each line starts (the first entry is 0)
    pub line_offsets: Vec<usize>,
    pub code_blocks: Vec<(usize, usize)>, // Cached code block ranges (not including inline code spans)
    pub lines: Vec<LineInfo>,             // Pre-computed line information
    pub links: Vec<ParsedLink>,           // Pre-parsed links
    pub images: Vec<ParsedImage>,         // Pre-parsed images
    pub broken_links: Vec<BrokenLinkInfo>, // Broken/undefined references
    pub reference_defs: Vec<ReferenceDef>, // Reference definitions
    code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>, // Inline code spans (filled eagerly by `new`, re-parsed on access only if empty)
    pub list_blocks: Vec<ListBlock>,      // Pre-parsed list blocks
    pub char_frequency: CharFrequency,    // Character frequency analysis
    html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>, // Lazy-loaded HTML tags
    emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>, // Lazy-loaded emphasis spans
    table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>, // Lazy-loaded table rows
    bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>, // Lazy-loaded bare URLs
    html_comment_ranges: Vec<crate::utils::skip_context::ByteRange>, // Pre-computed HTML comment ranges
    pub table_blocks: Vec<crate::utils::table_utils::TableBlock>, // Pre-computed table blocks
    pub line_index: crate::utils::range_utils::LineIndex, // Pre-computed line index for byte position calculations
    jinja_ranges: Vec<(usize, usize)>,    // Pre-computed Jinja template ranges ({{ }}, {% %})
    pub flavor: MarkdownFlavor,           // Markdown flavor being used
}
402
/// The four consecutive pieces of a blockquote line, borrowed from the input.
///
/// Concatenating `indent`, `markers`, `spaces_after`, and `content` in that
/// order reproduces the original line.
struct BlockquoteComponents<'a> {
    indent: &'a str,
    markers: &'a str,
    spaces_after: &'a str,
    content: &'a str,
}

/// Splits a line into blockquote components without using a regex.
///
/// The line is read as: optional spaces/tabs, one or more contiguous `>`
/// characters, optional spaces/tabs, then the remaining content. Returns
/// `None` when no `>` follows the leading indentation.
#[inline]
fn parse_blockquote_detailed(line: &str) -> Option<BlockquoteComponents<'_>> {
    // Only plain spaces and tabs count as indentation / padding here.
    const BLANKS: [char; 2] = [' ', '\t'];

    // Peel the leading indentation off the front.
    let after_indent = line.trim_start_matches(BLANKS);
    let (indent, _) = line.split_at(line.len() - after_indent.len());

    // A blockquote needs at least one '>' right after the indentation.
    let after_markers = after_indent.trim_start_matches('>');
    let marker_len = after_indent.len() - after_markers.len();
    if marker_len == 0 {
        return None;
    }
    let markers = &after_indent[..marker_len];

    // Whatever spaces/tabs follow the markers, then the rest is content.
    let content = after_markers.trim_start_matches(BLANKS);
    let spaces_after = &after_markers[..after_markers.len() - content.len()];

    Some(BlockquoteComponents {
        indent,
        markers,
        spaces_after,
        content,
    })
}
447
448impl<'a> LintContext<'a> {
449    pub fn new(content: &'a str, flavor: MarkdownFlavor) -> Self {
450        use std::time::Instant;
451        let profile = std::env::var("RUMDL_PROFILE_QUADRATIC").is_ok();
452
453        let start = Instant::now();
454        let mut line_offsets = vec![0];
455        for (i, c) in content.char_indices() {
456            if c == '\n' {
457                line_offsets.push(i + 1);
458            }
459        }
460        if profile {
461            eprintln!("[PROFILE] Line offsets: {:?}", start.elapsed());
462        }
463
464        // Detect code blocks once and cache them
465        let start = Instant::now();
466        let code_blocks = CodeBlockUtils::detect_code_blocks(content);
467        if profile {
468            eprintln!("[PROFILE] Code blocks: {:?}", start.elapsed());
469        }
470
471        // Pre-compute HTML comment ranges ONCE for all operations
472        let start = Instant::now();
473        let html_comment_ranges = crate::utils::skip_context::compute_html_comment_ranges(content);
474        if profile {
475            eprintln!("[PROFILE] HTML comment ranges: {:?}", start.elapsed());
476        }
477
478        // Pre-compute autodoc block ranges for MkDocs flavor (avoids O(n²) scaling)
479        let start = Instant::now();
480        let autodoc_ranges = if flavor == MarkdownFlavor::MkDocs {
481            crate::utils::mkdocstrings_refs::detect_autodoc_block_ranges(content)
482        } else {
483            Vec::new()
484        };
485        if profile {
486            eprintln!("[PROFILE] Autodoc block ranges: {:?}", start.elapsed());
487        }
488
489        // Pre-compute line information (without headings/blockquotes yet)
490        let start = Instant::now();
491        let mut lines = Self::compute_basic_line_info(
492            content,
493            &line_offsets,
494            &code_blocks,
495            flavor,
496            &html_comment_ranges,
497            &autodoc_ranges,
498        );
499        if profile {
500            eprintln!("[PROFILE] Basic line info: {:?}", start.elapsed());
501        }
502
503        // Detect HTML blocks BEFORE heading detection
504        let start = Instant::now();
505        Self::detect_html_blocks(&mut lines);
506        if profile {
507            eprintln!("[PROFILE] HTML blocks: {:?}", start.elapsed());
508        }
509
510        // Detect ESM import/export blocks in MDX files BEFORE heading detection
511        let start = Instant::now();
512        Self::detect_esm_blocks(&mut lines, flavor);
513        if profile {
514            eprintln!("[PROFILE] ESM blocks: {:?}", start.elapsed());
515        }
516
517        // Now detect headings and blockquotes
518        let start = Instant::now();
519        Self::detect_headings_and_blockquotes(content, &mut lines, flavor, &html_comment_ranges);
520        if profile {
521            eprintln!("[PROFILE] Headings & blockquotes: {:?}", start.elapsed());
522        }
523
524        // Parse code spans early so we can exclude them from link/image parsing
525        let start = Instant::now();
526        let code_spans = Self::parse_code_spans(content, &lines);
527        if profile {
528            eprintln!("[PROFILE] Code spans: {:?}", start.elapsed());
529        }
530
531        // Parse links, images, references, and list blocks
532        let start = Instant::now();
533        let (links, broken_links) =
534            Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor, &html_comment_ranges);
535        if profile {
536            eprintln!("[PROFILE] Links: {:?}", start.elapsed());
537        }
538
539        let start = Instant::now();
540        let images = Self::parse_images(content, &lines, &code_blocks, &code_spans, &html_comment_ranges);
541        if profile {
542            eprintln!("[PROFILE] Images: {:?}", start.elapsed());
543        }
544
545        let start = Instant::now();
546        let reference_defs = Self::parse_reference_defs(content, &lines);
547        if profile {
548            eprintln!("[PROFILE] Reference defs: {:?}", start.elapsed());
549        }
550
551        let start = Instant::now();
552        let list_blocks = Self::parse_list_blocks(&lines);
553        if profile {
554            eprintln!("[PROFILE] List blocks: {:?}", start.elapsed());
555        }
556
557        // Compute character frequency for fast content analysis
558        let start = Instant::now();
559        let char_frequency = Self::compute_char_frequency(content);
560        if profile {
561            eprintln!("[PROFILE] Char frequency: {:?}", start.elapsed());
562        }
563
564        // Pre-compute table blocks for rules that need them (MD013, MD055, MD056, MD058, MD060)
565        let start = Instant::now();
566        let table_blocks = crate::utils::table_utils::TableUtils::find_table_blocks_with_code_info(
567            content,
568            &code_blocks,
569            &code_spans,
570            &html_comment_ranges,
571        );
572        if profile {
573            eprintln!("[PROFILE] Table blocks: {:?}", start.elapsed());
574        }
575
576        // Pre-compute LineIndex once for all rules (eliminates 46x content cloning)
577        let start = Instant::now();
578        let line_index = crate::utils::range_utils::LineIndex::new(content.to_string());
579        if profile {
580            eprintln!("[PROFILE] Line index: {:?}", start.elapsed());
581        }
582
583        // Pre-compute Jinja template ranges once for all rules (eliminates O(n×m) in MD011)
584        let start = Instant::now();
585        let jinja_ranges = crate::utils::jinja_utils::find_jinja_ranges(content);
586        if profile {
587            eprintln!("[PROFILE] Jinja ranges: {:?}", start.elapsed());
588        }
589
590        Self {
591            content,
592            line_offsets,
593            code_blocks,
594            lines,
595            links,
596            images,
597            broken_links,
598            reference_defs,
599            code_spans_cache: Mutex::new(Some(Arc::new(code_spans))),
600            list_blocks,
601            char_frequency,
602            html_tags_cache: Mutex::new(None),
603            emphasis_spans_cache: Mutex::new(None),
604            table_rows_cache: Mutex::new(None),
605            bare_urls_cache: Mutex::new(None),
606            html_comment_ranges,
607            table_blocks,
608            line_index,
609            jinja_ranges,
610            flavor,
611        }
612    }
613
614    /// Get code spans - computed lazily on first access
615    pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
616        let mut cache = self.code_spans_cache.lock().expect("Code spans cache mutex poisoned");
617
618        Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_code_spans(self.content, &self.lines))))
619    }
620
621    /// Get HTML comment ranges - pre-computed during LintContext construction
622    pub fn html_comment_ranges(&self) -> &[crate::utils::skip_context::ByteRange] {
623        &self.html_comment_ranges
624    }
625
626    /// Get HTML tags - computed lazily on first access
627    pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
628        let mut cache = self.html_tags_cache.lock().expect("HTML tags cache mutex poisoned");
629
630        Arc::clone(cache.get_or_insert_with(|| {
631            Arc::new(Self::parse_html_tags(
632                self.content,
633                &self.lines,
634                &self.code_blocks,
635                self.flavor,
636            ))
637        }))
638    }
639
640    /// Get emphasis spans - computed lazily on first access
641    pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
642        let mut cache = self
643            .emphasis_spans_cache
644            .lock()
645            .expect("Emphasis spans cache mutex poisoned");
646
647        Arc::clone(
648            cache.get_or_insert_with(|| {
649                Arc::new(Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks))
650            }),
651        )
652    }
653
654    /// Get table rows - computed lazily on first access
655    pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
656        let mut cache = self.table_rows_cache.lock().expect("Table rows cache mutex poisoned");
657
658        Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_table_rows(&self.lines))))
659    }
660
661    /// Get bare URLs - computed lazily on first access
662    pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
663        let mut cache = self.bare_urls_cache.lock().expect("Bare URLs cache mutex poisoned");
664
665        Arc::clone(
666            cache.get_or_insert_with(|| Arc::new(Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks))),
667        )
668    }
669
670    /// Map a byte offset to (line, column)
671    pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
672        match self.line_offsets.binary_search(&offset) {
673            Ok(line) => (line + 1, 1),
674            Err(line) => {
675                let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
676                (line, offset - line_start + 1)
677            }
678        }
679    }
680
681    /// Check if a position is within a code block or code span
682    pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
683        // Check code blocks first
684        if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
685            return true;
686        }
687
688        // Check inline code spans (lazy load if needed)
689        self.code_spans()
690            .iter()
691            .any(|span| pos >= span.byte_offset && pos < span.byte_end)
692    }
693
694    /// Get line information by line number (1-indexed)
695    pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
696        if line_num > 0 {
697            self.lines.get(line_num - 1)
698        } else {
699            None
700        }
701    }
702
703    /// Get byte offset for a line number (1-indexed)
704    pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
705        self.line_info(line_num).map(|info| info.byte_offset)
706    }
707
708    /// Get URL for a reference link/image by its ID
709    pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
710        let normalized_id = ref_id.to_lowercase();
711        self.reference_defs
712            .iter()
713            .find(|def| def.id == normalized_id)
714            .map(|def| def.url.as_str())
715    }
716
717    /// Get links on a specific line
718    pub fn links_on_line(&self, line_num: usize) -> Vec<&ParsedLink> {
719        self.links.iter().filter(|link| link.line == line_num).collect()
720    }
721
722    /// Get images on a specific line
723    pub fn images_on_line(&self, line_num: usize) -> Vec<&ParsedImage> {
724        self.images.iter().filter(|img| img.line == line_num).collect()
725    }
726
727    /// Check if a line is part of a list block
728    pub fn is_in_list_block(&self, line_num: usize) -> bool {
729        self.list_blocks
730            .iter()
731            .any(|block| line_num >= block.start_line && line_num <= block.end_line)
732    }
733
734    /// Get the list block containing a specific line
735    pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
736        self.list_blocks
737            .iter()
738            .find(|block| line_num >= block.start_line && line_num <= block.end_line)
739    }
740
741    // Compatibility methods for DocumentStructure migration
742
743    /// Check if a line is within a code block
744    pub fn is_in_code_block(&self, line_num: usize) -> bool {
745        if line_num == 0 || line_num > self.lines.len() {
746            return false;
747        }
748        self.lines[line_num - 1].in_code_block
749    }
750
751    /// Check if a line is within front matter
752    pub fn is_in_front_matter(&self, line_num: usize) -> bool {
753        if line_num == 0 || line_num > self.lines.len() {
754            return false;
755        }
756        self.lines[line_num - 1].in_front_matter
757    }
758
759    /// Check if a line is within an HTML block
760    pub fn is_in_html_block(&self, line_num: usize) -> bool {
761        if line_num == 0 || line_num > self.lines.len() {
762            return false;
763        }
764        self.lines[line_num - 1].in_html_block
765    }
766
767    /// Check if a line and column is within a code span
768    pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
769        if line_num == 0 || line_num > self.lines.len() {
770            return false;
771        }
772
773        // Use the code spans cache to check
774        // Note: col is 1-indexed from caller, but span.start_col and span.end_col are 0-indexed
775        // Convert col to 0-indexed for comparison
776        let col_0indexed = if col > 0 { col - 1 } else { 0 };
777        let code_spans = self.code_spans();
778        code_spans
779            .iter()
780            .any(|span| span.line == line_num && col_0indexed >= span.start_col && col_0indexed < span.end_col)
781    }
782
783    /// Check if a byte position is within a reference definition
784    /// This is much faster than scanning the content with regex for each check (O(1) vs O(n))
785    #[inline]
786    pub fn is_in_reference_def(&self, byte_pos: usize) -> bool {
787        self.reference_defs
788            .iter()
789            .any(|ref_def| byte_pos >= ref_def.byte_offset && byte_pos < ref_def.byte_end)
790    }
791
792    /// Check if a byte position is within an HTML comment
793    /// This is much faster than scanning the content with regex for each check (O(k) vs O(n))
794    /// where k is the number of HTML comments (typically very small)
795    #[inline]
796    pub fn is_in_html_comment(&self, byte_pos: usize) -> bool {
797        self.html_comment_ranges
798            .iter()
799            .any(|range| byte_pos >= range.start && byte_pos < range.end)
800    }
801
802    /// Check if a byte position is within a Jinja template ({{ }} or {% %})
803    pub fn is_in_jinja_range(&self, byte_pos: usize) -> bool {
804        self.jinja_ranges
805            .iter()
806            .any(|(start, end)| byte_pos >= *start && byte_pos < *end)
807    }
808
809    /// Check if content has any instances of a specific character (fast)
810    pub fn has_char(&self, ch: char) -> bool {
811        match ch {
812            '#' => self.char_frequency.hash_count > 0,
813            '*' => self.char_frequency.asterisk_count > 0,
814            '_' => self.char_frequency.underscore_count > 0,
815            '-' => self.char_frequency.hyphen_count > 0,
816            '+' => self.char_frequency.plus_count > 0,
817            '>' => self.char_frequency.gt_count > 0,
818            '|' => self.char_frequency.pipe_count > 0,
819            '[' => self.char_frequency.bracket_count > 0,
820            '`' => self.char_frequency.backtick_count > 0,
821            '<' => self.char_frequency.lt_count > 0,
822            '!' => self.char_frequency.exclamation_count > 0,
823            '\n' => self.char_frequency.newline_count > 0,
824            _ => self.content.contains(ch), // Fallback for other characters
825        }
826    }
827
828    /// Get count of a specific character (fast)
829    pub fn char_count(&self, ch: char) -> usize {
830        match ch {
831            '#' => self.char_frequency.hash_count,
832            '*' => self.char_frequency.asterisk_count,
833            '_' => self.char_frequency.underscore_count,
834            '-' => self.char_frequency.hyphen_count,
835            '+' => self.char_frequency.plus_count,
836            '>' => self.char_frequency.gt_count,
837            '|' => self.char_frequency.pipe_count,
838            '[' => self.char_frequency.bracket_count,
839            '`' => self.char_frequency.backtick_count,
840            '<' => self.char_frequency.lt_count,
841            '!' => self.char_frequency.exclamation_count,
842            '\n' => self.char_frequency.newline_count,
843            _ => self.content.matches(ch).count(), // Fallback for other characters
844        }
845    }
846
847    /// Check if content likely contains headings (fast)
848    pub fn likely_has_headings(&self) -> bool {
849        self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 // Potential setext underlines
850    }
851
852    /// Check if content likely contains lists (fast)
853    pub fn likely_has_lists(&self) -> bool {
854        self.char_frequency.asterisk_count > 0
855            || self.char_frequency.hyphen_count > 0
856            || self.char_frequency.plus_count > 0
857    }
858
859    /// Check if content likely contains emphasis (fast)
860    pub fn likely_has_emphasis(&self) -> bool {
861        self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
862    }
863
864    /// Check if content likely contains tables (fast)
865    pub fn likely_has_tables(&self) -> bool {
866        self.char_frequency.pipe_count > 2
867    }
868
869    /// Check if content likely contains blockquotes (fast)
870    pub fn likely_has_blockquotes(&self) -> bool {
871        self.char_frequency.gt_count > 0
872    }
873
874    /// Check if content likely contains code (fast)
875    pub fn likely_has_code(&self) -> bool {
876        self.char_frequency.backtick_count > 0
877    }
878
879    /// Check if content likely contains links or images (fast)
880    pub fn likely_has_links_or_images(&self) -> bool {
881        self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
882    }
883
884    /// Check if content likely contains HTML (fast)
885    pub fn likely_has_html(&self) -> bool {
886        self.char_frequency.lt_count > 0
887    }
888
889    /// Get HTML tags on a specific line
890    pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
891        self.html_tags()
892            .iter()
893            .filter(|tag| tag.line == line_num)
894            .cloned()
895            .collect()
896    }
897
898    /// Get emphasis spans on a specific line
899    pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
900        self.emphasis_spans()
901            .iter()
902            .filter(|span| span.line == line_num)
903            .cloned()
904            .collect()
905    }
906
907    /// Get table rows on a specific line
908    pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
909        self.table_rows()
910            .iter()
911            .filter(|row| row.line == line_num)
912            .cloned()
913            .collect()
914    }
915
916    /// Get bare URLs on a specific line
917    pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
918        self.bare_urls()
919            .iter()
920            .filter(|url| url.line == line_num)
921            .cloned()
922            .collect()
923    }
924
925    /// Find the line index for a given byte offset using binary search.
926    /// Returns (line_index, line_number, column) where:
927    /// - line_index is the 0-based index in the lines array
928    /// - line_number is the 1-based line number
929    /// - column is the byte offset within that line
930    #[inline]
931    fn find_line_for_offset(lines: &[LineInfo], byte_offset: usize) -> (usize, usize, usize) {
932        // Binary search to find the line containing this byte offset
933        let idx = match lines.binary_search_by(|line| {
934            if byte_offset < line.byte_offset {
935                std::cmp::Ordering::Greater
936            } else if byte_offset > line.byte_offset + line.content.len() {
937                std::cmp::Ordering::Less
938            } else {
939                std::cmp::Ordering::Equal
940            }
941        }) {
942            Ok(idx) => idx,
943            Err(idx) => idx.saturating_sub(1),
944        };
945
946        let line = &lines[idx];
947        let line_num = idx + 1;
948        let col = byte_offset.saturating_sub(line.byte_offset);
949
950        (idx, line_num, col)
951    }
952
953    /// Check if a byte offset is within a code span using binary search
954    #[inline]
955    fn is_offset_in_code_span(code_spans: &[CodeSpan], offset: usize) -> bool {
956        // Since spans are sorted by byte_offset, use partition_point for binary search
957        let idx = code_spans.partition_point(|span| span.byte_offset <= offset);
958
959        // Check the span that starts at or before our offset
960        if idx > 0 {
961            let span = &code_spans[idx - 1];
962            if offset >= span.byte_offset && offset < span.byte_end {
963                return true;
964            }
965        }
966
967        false
968    }
969
970    /// Parse all links in the content
971    fn parse_links(
972        content: &str,
973        lines: &[LineInfo],
974        code_blocks: &[(usize, usize)],
975        code_spans: &[CodeSpan],
976        flavor: MarkdownFlavor,
977        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
978    ) -> (Vec<ParsedLink>, Vec<BrokenLinkInfo>) {
979        use crate::utils::skip_context::{is_in_html_comment_ranges, is_mkdocs_snippet_line};
980        use std::collections::HashSet;
981
982        let mut links = Vec::with_capacity(content.len() / 500);
983        let mut broken_links = Vec::new();
984
985        // Track byte positions of links found by pulldown-cmark
986        let mut found_positions = HashSet::new();
987
988        // Use pulldown-cmark's streaming parser with BrokenLink callback
989        // The callback captures undefined references: [text][undefined], [shortcut], [text][]
990        // This automatically handles:
991        // - Escaped links (won't generate events)
992        // - Links in code blocks/spans (won't generate Link events)
993        // - Images (generates Tag::Image instead)
994        // - Reference resolution (dest_url is already resolved!)
995        // - Broken references (callback is invoked)
996        // - Wiki-links (enabled via ENABLE_WIKILINKS)
997        let mut options = Options::empty();
998        options.insert(Options::ENABLE_WIKILINKS);
999
1000        let parser = Parser::new_with_broken_link_callback(
1001            content,
1002            options,
1003            Some(|link: BrokenLink<'_>| {
1004                broken_links.push(BrokenLinkInfo {
1005                    reference: link.reference.to_string(),
1006                    span: link.span.clone(),
1007                });
1008                None
1009            }),
1010        )
1011        .into_offset_iter();
1012
1013        let mut link_stack: Vec<(usize, usize, String, LinkType, String)> = Vec::new();
1014        let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1015
1016        for (event, range) in parser {
1017            match event {
1018                Event::Start(Tag::Link {
1019                    link_type,
1020                    dest_url,
1021                    id,
1022                    ..
1023                }) => {
1024                    // Link start - record position, URL, and reference ID
1025                    link_stack.push((range.start, range.end, dest_url.to_string(), link_type, id.to_string()));
1026                    text_chunks.clear();
1027                }
1028                Event::Text(text) if !link_stack.is_empty() => {
1029                    // Track text content with its byte range
1030                    text_chunks.push((text.to_string(), range.start, range.end));
1031                }
1032                Event::Code(code) if !link_stack.is_empty() => {
1033                    // Include inline code in link text (with backticks)
1034                    let code_text = format!("`{code}`");
1035                    text_chunks.push((code_text, range.start, range.end));
1036                }
1037                Event::End(TagEnd::Link) => {
1038                    if let Some((start_pos, _link_start_end, url, link_type, ref_id)) = link_stack.pop() {
1039                        // Skip if in HTML comment
1040                        if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1041                            text_chunks.clear();
1042                            continue;
1043                        }
1044
1045                        // Find line and column information
1046                        let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1047
1048                        // Skip if this link is on a MkDocs snippet line
1049                        if is_mkdocs_snippet_line(&lines[line_idx].content, flavor) {
1050                            text_chunks.clear();
1051                            continue;
1052                        }
1053
1054                        let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1055
1056                        let is_reference = matches!(
1057                            link_type,
1058                            LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1059                        );
1060
1061                        // Extract link text directly from source bytes to preserve escaping
1062                        // Text events from pulldown-cmark unescape \] → ], which breaks MD039
1063                        let link_text = if start_pos < content.len() {
1064                            let link_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1065
1066                            // Find MATCHING ] by tracking bracket depth for nested brackets
1067                            // An unescaped bracket is one NOT preceded by an odd number of backslashes
1068                            // Brackets inside code spans (between backticks) should be ignored
1069                            let mut close_pos = None;
1070                            let mut depth = 0;
1071                            let mut in_code_span = false;
1072
1073                            for (i, &byte) in link_bytes.iter().enumerate().skip(1) {
1074                                // Count preceding backslashes
1075                                let mut backslash_count = 0;
1076                                let mut j = i;
1077                                while j > 0 && link_bytes[j - 1] == b'\\' {
1078                                    backslash_count += 1;
1079                                    j -= 1;
1080                                }
1081                                let is_escaped = backslash_count % 2 != 0;
1082
1083                                // Track code spans - backticks toggle in/out of code
1084                                if byte == b'`' && !is_escaped {
1085                                    in_code_span = !in_code_span;
1086                                }
1087
1088                                // Only count brackets when NOT in a code span
1089                                if !is_escaped && !in_code_span {
1090                                    if byte == b'[' {
1091                                        depth += 1;
1092                                    } else if byte == b']' {
1093                                        if depth == 0 {
1094                                            // Found the matching closing bracket
1095                                            close_pos = Some(i);
1096                                            break;
1097                                        } else {
1098                                            depth -= 1;
1099                                        }
1100                                    }
1101                                }
1102                            }
1103
1104                            if let Some(pos) = close_pos {
1105                                std::str::from_utf8(&link_bytes[1..pos]).unwrap_or("").to_string()
1106                            } else {
1107                                String::new()
1108                            }
1109                        } else {
1110                            String::new()
1111                        };
1112
1113                        // For reference links, use the actual reference ID from pulldown-cmark
1114                        let reference_id = if is_reference && !ref_id.is_empty() {
1115                            Some(ref_id.to_lowercase())
1116                        } else if is_reference {
1117                            // For collapsed/shortcut references without explicit ID, use the link text
1118                            Some(link_text.to_lowercase())
1119                        } else {
1120                            None
1121                        };
1122
1123                        // WORKAROUND: pulldown-cmark bug with escaped brackets
1124                        // Check for escaped image syntax: \![text](url)
1125                        // The byte_offset points to the '[', so we check 2 bytes back for '\!'
1126                        let has_escaped_bang = start_pos >= 2
1127                            && content.as_bytes().get(start_pos - 2) == Some(&b'\\')
1128                            && content.as_bytes().get(start_pos - 1) == Some(&b'!');
1129
1130                        // Check for escaped bracket: \[text](url)
1131                        // The byte_offset points to the '[', so we check 1 byte back for '\'
1132                        let has_escaped_bracket =
1133                            start_pos >= 1 && content.as_bytes().get(start_pos - 1) == Some(&b'\\');
1134
1135                        if has_escaped_bang || has_escaped_bracket {
1136                            text_chunks.clear();
1137                            continue; // Skip: this is escaped markdown, not a real link
1138                        }
1139
1140                        // Track this position as found
1141                        found_positions.insert(start_pos);
1142
1143                        links.push(ParsedLink {
1144                            line: line_num,
1145                            start_col: col_start,
1146                            end_col: col_end,
1147                            byte_offset: start_pos,
1148                            byte_end: range.end,
1149                            text: link_text,
1150                            url,
1151                            is_reference,
1152                            reference_id,
1153                            link_type,
1154                        });
1155
1156                        text_chunks.clear();
1157                    }
1158                }
1159                _ => {}
1160            }
1161        }
1162
1163        // Also find undefined references using regex
1164        // These are patterns like [text][ref] that pulldown-cmark didn't parse as links
1165        // because the reference is undefined
1166        for cap in LINK_PATTERN.captures_iter(content) {
1167            let full_match = cap.get(0).unwrap();
1168            let match_start = full_match.start();
1169            let match_end = full_match.end();
1170
1171            // Skip if this was already found by pulldown-cmark (it's a valid link)
1172            if found_positions.contains(&match_start) {
1173                continue;
1174            }
1175
1176            // Skip if escaped
1177            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1178                continue;
1179            }
1180
1181            // Skip if it's an image
1182            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
1183                continue;
1184            }
1185
1186            // Skip if in code block
1187            if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
1188                continue;
1189            }
1190
1191            // Skip if in code span
1192            if Self::is_offset_in_code_span(code_spans, match_start) {
1193                continue;
1194            }
1195
1196            // Skip if in HTML comment
1197            if is_in_html_comment_ranges(html_comment_ranges, match_start) {
1198                continue;
1199            }
1200
1201            // Find line and column information
1202            let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1203
1204            // Skip if this link is on a MkDocs snippet line
1205            if is_mkdocs_snippet_line(&lines[line_idx].content, flavor) {
1206                continue;
1207            }
1208
1209            let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1210
1211            let text = cap.get(1).map_or("", |m| m.as_str()).to_string();
1212
1213            // Only process reference links (group 6)
1214            if let Some(ref_id) = cap.get(6) {
1215                let ref_id_str = ref_id.as_str();
1216                let normalized_ref = if ref_id_str.is_empty() {
1217                    text.to_lowercase() // Implicit reference
1218                } else {
1219                    ref_id_str.to_lowercase()
1220                };
1221
1222                // This is an undefined reference (pulldown-cmark didn't parse it)
1223                links.push(ParsedLink {
1224                    line: line_num,
1225                    start_col: col_start,
1226                    end_col: col_end,
1227                    byte_offset: match_start,
1228                    byte_end: match_end,
1229                    text,
1230                    url: String::new(), // Empty URL indicates undefined reference
1231                    is_reference: true,
1232                    reference_id: Some(normalized_ref),
1233                    link_type: LinkType::Reference, // Undefined references are reference-style
1234                });
1235            }
1236        }
1237
1238        (links, broken_links)
1239    }
1240
1241    /// Parse all images in the content
1242    fn parse_images(
1243        content: &str,
1244        lines: &[LineInfo],
1245        code_blocks: &[(usize, usize)],
1246        code_spans: &[CodeSpan],
1247        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1248    ) -> Vec<ParsedImage> {
1249        use crate::utils::skip_context::is_in_html_comment_ranges;
1250        use std::collections::HashSet;
1251
1252        // Pre-size based on a heuristic: images are less common than links
1253        let mut images = Vec::with_capacity(content.len() / 1000);
1254        let mut found_positions = HashSet::new();
1255
1256        // Use pulldown-cmark for parsing - more accurate and faster
1257        let parser = Parser::new(content).into_offset_iter();
1258        let mut image_stack: Vec<(usize, String, LinkType, String)> = Vec::new();
1259        let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1260
1261        for (event, range) in parser {
1262            match event {
1263                Event::Start(Tag::Image {
1264                    link_type,
1265                    dest_url,
1266                    id,
1267                    ..
1268                }) => {
1269                    image_stack.push((range.start, dest_url.to_string(), link_type, id.to_string()));
1270                    text_chunks.clear();
1271                }
1272                Event::Text(text) if !image_stack.is_empty() => {
1273                    text_chunks.push((text.to_string(), range.start, range.end));
1274                }
1275                Event::Code(code) if !image_stack.is_empty() => {
1276                    let code_text = format!("`{code}`");
1277                    text_chunks.push((code_text, range.start, range.end));
1278                }
1279                Event::End(TagEnd::Image) => {
1280                    if let Some((start_pos, url, link_type, ref_id)) = image_stack.pop() {
1281                        // Skip if in code block
1282                        if CodeBlockUtils::is_in_code_block(code_blocks, start_pos) {
1283                            continue;
1284                        }
1285
1286                        // Skip if in code span
1287                        if Self::is_offset_in_code_span(code_spans, start_pos) {
1288                            continue;
1289                        }
1290
1291                        // Skip if in HTML comment
1292                        if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1293                            continue;
1294                        }
1295
1296                        // Find line and column using binary search
1297                        let (_, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1298                        let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1299
1300                        let is_reference = matches!(
1301                            link_type,
1302                            LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1303                        );
1304
1305                        // Extract alt text directly from source bytes to preserve escaping
1306                        // Text events from pulldown-cmark unescape \] → ], which breaks rules that need escaping
1307                        let alt_text = if start_pos < content.len() {
1308                            let image_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1309
1310                            // Find MATCHING ] by tracking bracket depth for nested brackets
1311                            // An unescaped bracket is one NOT preceded by an odd number of backslashes
1312                            let mut close_pos = None;
1313                            let mut depth = 0;
1314
1315                            if image_bytes.len() > 2 {
1316                                for (i, &byte) in image_bytes.iter().enumerate().skip(2) {
1317                                    // Count preceding backslashes
1318                                    let mut backslash_count = 0;
1319                                    let mut j = i;
1320                                    while j > 0 && image_bytes[j - 1] == b'\\' {
1321                                        backslash_count += 1;
1322                                        j -= 1;
1323                                    }
1324                                    let is_escaped = backslash_count % 2 != 0;
1325
1326                                    if !is_escaped {
1327                                        if byte == b'[' {
1328                                            depth += 1;
1329                                        } else if byte == b']' {
1330                                            if depth == 0 {
1331                                                // Found the matching closing bracket
1332                                                close_pos = Some(i);
1333                                                break;
1334                                            } else {
1335                                                depth -= 1;
1336                                            }
1337                                        }
1338                                    }
1339                                }
1340                            }
1341
1342                            if let Some(pos) = close_pos {
1343                                std::str::from_utf8(&image_bytes[2..pos]).unwrap_or("").to_string()
1344                            } else {
1345                                String::new()
1346                            }
1347                        } else {
1348                            String::new()
1349                        };
1350
1351                        let reference_id = if is_reference && !ref_id.is_empty() {
1352                            Some(ref_id.to_lowercase())
1353                        } else if is_reference {
1354                            Some(alt_text.to_lowercase()) // Collapsed/shortcut references
1355                        } else {
1356                            None
1357                        };
1358
1359                        found_positions.insert(start_pos);
1360                        images.push(ParsedImage {
1361                            line: line_num,
1362                            start_col: col_start,
1363                            end_col: col_end,
1364                            byte_offset: start_pos,
1365                            byte_end: range.end,
1366                            alt_text,
1367                            url,
1368                            is_reference,
1369                            reference_id,
1370                            link_type,
1371                        });
1372                    }
1373                }
1374                _ => {}
1375            }
1376        }
1377
1378        // Regex fallback for undefined references that pulldown-cmark treats as plain text
1379        for cap in IMAGE_PATTERN.captures_iter(content) {
1380            let full_match = cap.get(0).unwrap();
1381            let match_start = full_match.start();
1382            let match_end = full_match.end();
1383
1384            // Skip if already found by pulldown-cmark
1385            if found_positions.contains(&match_start) {
1386                continue;
1387            }
1388
1389            // Skip if the ! is escaped
1390            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1391                continue;
1392            }
1393
1394            // Skip if in code block, code span, or HTML comment
1395            if CodeBlockUtils::is_in_code_block(code_blocks, match_start)
1396                || Self::is_offset_in_code_span(code_spans, match_start)
1397                || is_in_html_comment_ranges(html_comment_ranges, match_start)
1398            {
1399                continue;
1400            }
1401
1402            // Only process reference images (undefined references not found by pulldown-cmark)
1403            if let Some(ref_id) = cap.get(6) {
1404                let (_, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1405                let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1406                let alt_text = cap.get(1).map_or("", |m| m.as_str()).to_string();
1407                let ref_id_str = ref_id.as_str();
1408                let normalized_ref = if ref_id_str.is_empty() {
1409                    alt_text.to_lowercase()
1410                } else {
1411                    ref_id_str.to_lowercase()
1412                };
1413
1414                images.push(ParsedImage {
1415                    line: line_num,
1416                    start_col: col_start,
1417                    end_col: col_end,
1418                    byte_offset: match_start,
1419                    byte_end: match_end,
1420                    alt_text,
1421                    url: String::new(),
1422                    is_reference: true,
1423                    reference_id: Some(normalized_ref),
1424                    link_type: LinkType::Reference, // Undefined references are reference-style
1425                });
1426            }
1427        }
1428
1429        images
1430    }
1431
1432    /// Parse reference definitions
1433    fn parse_reference_defs(_content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
1434        // Pre-size based on lines count as reference definitions are line-based
1435        let mut refs = Vec::with_capacity(lines.len() / 20); // ~1 ref per 20 lines
1436
1437        for (line_idx, line_info) in lines.iter().enumerate() {
1438            // Skip lines in code blocks
1439            if line_info.in_code_block {
1440                continue;
1441            }
1442
1443            let line = &line_info.content;
1444            let line_num = line_idx + 1;
1445
1446            if let Some(cap) = REF_DEF_PATTERN.captures(line) {
1447                let id = cap.get(1).unwrap().as_str().to_lowercase();
1448                let url = cap.get(2).unwrap().as_str().to_string();
1449                let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
1450
1451                // Calculate byte positions
1452                // The match starts at the beginning of the line (0) and extends to the end
1453                let match_obj = cap.get(0).unwrap();
1454                let byte_offset = line_info.byte_offset + match_obj.start();
1455                let byte_end = line_info.byte_offset + match_obj.end();
1456
1457                refs.push(ReferenceDef {
1458                    line: line_num,
1459                    id,
1460                    url,
1461                    title,
1462                    byte_offset,
1463                    byte_end,
1464                });
1465            }
1466        }
1467
1468        refs
1469    }
1470
1471    /// Fast blockquote prefix parser - replaces regex for 5-10x speedup
1472    /// Matches: ^(\s*>\s*)(.*)
1473    /// Returns: Some((prefix_with_ws, content_after_prefix)) or None
1474    #[inline]
1475    fn parse_blockquote_prefix(line: &str) -> Option<(&str, &str)> {
1476        let trimmed_start = line.trim_start();
1477        if !trimmed_start.starts_with('>') {
1478            return None;
1479        }
1480
1481        let leading_ws_len = line.len() - trimmed_start.len();
1482        let after_gt = &trimmed_start[1..];
1483        let content = after_gt.trim_start();
1484        let ws_after_gt_len = after_gt.len() - content.len();
1485        let prefix_len = leading_ws_len + 1 + ws_after_gt_len;
1486
1487        Some((&line[..prefix_len], content))
1488    }
1489
1490    /// Fast unordered list parser - replaces regex for 5-10x speedup
1491    /// Matches: ^(\s*)([-*+])([ \t]*)(.*)
1492    /// Returns: Some((leading_ws, marker, spacing, content)) or None
1493    #[inline]
1494    fn parse_unordered_list(line: &str) -> Option<(&str, char, &str, &str)> {
1495        let bytes = line.as_bytes();
1496        let mut i = 0;
1497
1498        // Skip leading whitespace
1499        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1500            i += 1;
1501        }
1502
1503        // Check for marker
1504        if i >= bytes.len() {
1505            return None;
1506        }
1507        let marker = bytes[i] as char;
1508        if marker != '-' && marker != '*' && marker != '+' {
1509            return None;
1510        }
1511        let marker_pos = i;
1512        i += 1;
1513
1514        // Collect spacing after marker (space or tab only)
1515        let spacing_start = i;
1516        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1517            i += 1;
1518        }
1519
1520        Some((&line[..marker_pos], marker, &line[spacing_start..i], &line[i..]))
1521    }
1522
1523    /// Fast ordered list parser - replaces regex for 5-10x speedup
1524    /// Matches: ^(\s*)(\d+)([.)])([ \t]*)(.*)
1525    /// Returns: Some((leading_ws, number_str, delimiter, spacing, content)) or None
1526    #[inline]
1527    fn parse_ordered_list(line: &str) -> Option<(&str, &str, char, &str, &str)> {
1528        let bytes = line.as_bytes();
1529        let mut i = 0;
1530
1531        // Skip leading whitespace
1532        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1533            i += 1;
1534        }
1535
1536        // Collect digits
1537        let number_start = i;
1538        while i < bytes.len() && bytes[i].is_ascii_digit() {
1539            i += 1;
1540        }
1541        if i == number_start {
1542            return None; // No digits found
1543        }
1544
1545        // Check for delimiter
1546        if i >= bytes.len() {
1547            return None;
1548        }
1549        let delimiter = bytes[i] as char;
1550        if delimiter != '.' && delimiter != ')' {
1551            return None;
1552        }
1553        let delimiter_pos = i;
1554        i += 1;
1555
1556        // Collect spacing after delimiter (space or tab only)
1557        let spacing_start = i;
1558        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1559            i += 1;
1560        }
1561
1562        Some((
1563            &line[..number_start],
1564            &line[number_start..delimiter_pos],
1565            delimiter,
1566            &line[spacing_start..i],
1567            &line[i..],
1568        ))
1569    }
1570
1571    /// Pre-compute which lines are in code blocks - O(m*n) where m=code_blocks, n=lines
1572    /// Returns a Vec<bool> where index i indicates if line i is in a code block
1573    fn compute_code_block_line_map(content: &str, line_offsets: &[usize], code_blocks: &[(usize, usize)]) -> Vec<bool> {
1574        let num_lines = line_offsets.len();
1575        let mut in_code_block = vec![false; num_lines];
1576
1577        // For each code block, mark all lines within it
1578        for &(start, end) in code_blocks {
1579            // Ensure we're at valid UTF-8 boundaries
1580            let safe_start = if start > 0 && !content.is_char_boundary(start) {
1581                let mut boundary = start;
1582                while boundary > 0 && !content.is_char_boundary(boundary) {
1583                    boundary -= 1;
1584                }
1585                boundary
1586            } else {
1587                start
1588            };
1589
1590            let safe_end = if end < content.len() && !content.is_char_boundary(end) {
1591                let mut boundary = end;
1592                while boundary < content.len() && !content.is_char_boundary(boundary) {
1593                    boundary += 1;
1594                }
1595                boundary
1596            } else {
1597                end.min(content.len())
1598            };
1599
1600            // Trust the code blocks detected by CodeBlockUtils::detect_code_blocks()
1601            // That function now has proper list context awareness (see code_block_utils.rs)
1602            // and correctly distinguishes between:
1603            // - Fenced code blocks (``` or ~~~)
1604            // - Indented code blocks at document level (4 spaces + blank line before)
1605            // - List continuation paragraphs (NOT code blocks, even with 4 spaces)
1606            //
1607            // We no longer need to re-validate here. The original validation logic
1608            // was causing false positives by marking list continuation paragraphs as
1609            // code blocks when they have 4 spaces of indentation.
1610
1611            // Use binary search to find the first and last line indices
1612            // line_offsets is sorted, so we can use partition_point for O(log n) lookup
1613            // Use safe_start/safe_end (UTF-8 boundaries) for consistent line mapping
1614            let first_line = line_offsets.partition_point(|&offset| offset < safe_start);
1615            let last_line = line_offsets.partition_point(|&offset| offset < safe_end);
1616
1617            // Mark all lines in the range at once
1618            for flag in in_code_block.iter_mut().take(last_line).skip(first_line) {
1619                *flag = true;
1620            }
1621        }
1622
1623        in_code_block
1624    }
1625
1626    /// Pre-compute basic line information (without headings/blockquotes)
1627    fn compute_basic_line_info(
1628        content: &str,
1629        line_offsets: &[usize],
1630        code_blocks: &[(usize, usize)],
1631        flavor: MarkdownFlavor,
1632        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1633        autodoc_ranges: &[crate::utils::skip_context::ByteRange],
1634    ) -> Vec<LineInfo> {
1635        let content_lines: Vec<&str> = content.lines().collect();
1636        let mut lines = Vec::with_capacity(content_lines.len());
1637
1638        // Pre-compute which lines are in code blocks
1639        let code_block_map = Self::compute_code_block_line_map(content, line_offsets, code_blocks);
1640
1641        // Detect front matter boundaries FIRST, before any other parsing
1642        // Use FrontMatterUtils to detect all types of front matter (YAML, TOML, JSON, malformed)
1643        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1644
1645        for (i, line) in content_lines.iter().enumerate() {
1646            let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
1647            let indent = line.len() - line.trim_start().len();
1648
1649            // Parse blockquote prefix once and reuse it (avoid redundant parsing)
1650            let blockquote_parse = Self::parse_blockquote_prefix(line);
1651
1652            // For blank detection, consider blockquote context
1653            let is_blank = if let Some((_, content)) = blockquote_parse {
1654                // In blockquote context, check if content after prefix is blank
1655                content.trim().is_empty()
1656            } else {
1657                line.trim().is_empty()
1658            };
1659
1660            // Use pre-computed map for O(1) lookup instead of O(m) iteration
1661            let in_code_block = code_block_map.get(i).copied().unwrap_or(false);
1662
1663            // Detect list items (skip if in frontmatter, in mkdocstrings block, or in HTML comment)
1664            let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
1665                && crate::utils::mkdocstrings_refs::is_within_autodoc_block_ranges(autodoc_ranges, byte_offset);
1666            // Use pre-computed ranges for efficiency (O(log n) vs O(file_size))
1667            let in_html_comment =
1668                crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, byte_offset);
1669            let list_item = if !(in_code_block
1670                || is_blank
1671                || in_mkdocstrings
1672                || in_html_comment
1673                || (front_matter_end > 0 && i < front_matter_end))
1674            {
1675                // Strip blockquote prefix if present for list detection (reuse cached result)
1676                let (line_for_list_check, blockquote_prefix_len) = if let Some((prefix, content)) = blockquote_parse {
1677                    (content, prefix.len())
1678                } else {
1679                    (&**line, 0)
1680                };
1681
1682                if let Some((leading_spaces, marker, spacing, _content)) =
1683                    Self::parse_unordered_list(line_for_list_check)
1684                {
1685                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1686                    let content_column = marker_column + 1 + spacing.len();
1687
1688                    // According to CommonMark spec, unordered list items MUST have at least one space
1689                    // after the marker (-, *, or +). Without a space, it's not a list item.
1690                    // This also naturally handles cases like:
1691                    // - *emphasis* (not a list)
1692                    // - **bold** (not a list)
1693                    // - --- (horizontal rule, not a list)
1694                    if spacing.is_empty() {
1695                        None
1696                    } else {
1697                        Some(ListItemInfo {
1698                            marker: marker.to_string(),
1699                            is_ordered: false,
1700                            number: None,
1701                            marker_column,
1702                            content_column,
1703                        })
1704                    }
1705                } else if let Some((leading_spaces, number_str, delimiter, spacing, _content)) =
1706                    Self::parse_ordered_list(line_for_list_check)
1707                {
1708                    let marker = format!("{number_str}{delimiter}");
1709                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1710                    let content_column = marker_column + marker.len() + spacing.len();
1711
1712                    // According to CommonMark spec, ordered list items MUST have at least one space
1713                    // after the marker (period or parenthesis). Without a space, it's not a list item.
1714                    if spacing.is_empty() {
1715                        None
1716                    } else {
1717                        Some(ListItemInfo {
1718                            marker,
1719                            is_ordered: true,
1720                            number: number_str.parse().ok(),
1721                            marker_column,
1722                            content_column,
1723                        })
1724                    }
1725                } else {
1726                    None
1727                }
1728            } else {
1729                None
1730            };
1731
1732            lines.push(LineInfo {
1733                content: line.to_string(),
1734                byte_offset,
1735                indent,
1736                is_blank,
1737                in_code_block,
1738                in_front_matter: front_matter_end > 0 && i < front_matter_end,
1739                in_html_block: false, // Will be populated after line creation
1740                in_html_comment,
1741                list_item,
1742                heading: None,    // Will be populated in second pass for Setext headings
1743                blockquote: None, // Will be populated after line creation
1744                in_mkdocstrings,
1745                in_esm_block: false, // Will be populated after line creation for MDX files
1746            });
1747        }
1748
1749        lines
1750    }
1751
1752    /// Detect headings and blockquotes (called after HTML block detection)
1753    fn detect_headings_and_blockquotes(
1754        content: &str,
1755        lines: &mut [LineInfo],
1756        flavor: MarkdownFlavor,
1757        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1758    ) {
1759        // Regex for heading detection
1760        static ATX_HEADING_REGEX: LazyLock<regex::Regex> =
1761            LazyLock::new(|| regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap());
1762        static SETEXT_UNDERLINE_REGEX: LazyLock<regex::Regex> =
1763            LazyLock::new(|| regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap());
1764
1765        let content_lines: Vec<&str> = content.lines().collect();
1766
1767        // Detect front matter boundaries to skip those lines
1768        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1769
1770        // Detect headings (including Setext which needs look-ahead) and blockquotes
1771        for i in 0..lines.len() {
1772            if lines[i].in_code_block {
1773                continue;
1774            }
1775
1776            // Skip lines in front matter
1777            if front_matter_end > 0 && i < front_matter_end {
1778                continue;
1779            }
1780
1781            // Skip lines in HTML blocks - HTML content should not be parsed as markdown
1782            if lines[i].in_html_block {
1783                continue;
1784            }
1785
1786            let line = content_lines[i];
1787
1788            // Check for blockquotes (even on blank lines within blockquotes)
1789            if let Some(bq) = parse_blockquote_detailed(line) {
1790                let nesting_level = bq.markers.len(); // Each '>' is one level
1791                let marker_column = bq.indent.len();
1792
1793                // Build the prefix (indentation + markers + space)
1794                let prefix = format!("{}{}{}", bq.indent, bq.markers, bq.spaces_after);
1795
1796                // Check for various blockquote issues
1797                let has_no_space = bq.spaces_after.is_empty() && !bq.content.is_empty();
1798                // Consider tabs as multiple spaces, or actual multiple spaces
1799                let has_multiple_spaces = bq.spaces_after.len() > 1 || bq.spaces_after.contains('\t');
1800
1801                // Check if needs MD028 fix (empty blockquote line without proper spacing)
1802                // MD028 flags empty blockquote lines that don't have a single space after the marker
1803                // Lines like "> " or ">> " are already correct and don't need fixing
1804                let needs_md028_fix = bq.content.is_empty() && bq.spaces_after.is_empty();
1805
1806                lines[i].blockquote = Some(BlockquoteInfo {
1807                    nesting_level,
1808                    indent: bq.indent.to_string(),
1809                    marker_column,
1810                    prefix,
1811                    content: bq.content.to_string(),
1812                    has_no_space_after_marker: has_no_space,
1813                    has_multiple_spaces_after_marker: has_multiple_spaces,
1814                    needs_md028_fix,
1815                });
1816            }
1817
1818            // Skip heading detection for blank lines
1819            if lines[i].is_blank {
1820                continue;
1821            }
1822
1823            // Check for ATX headings (but skip MkDocs snippet lines)
1824            // In MkDocs flavor, lines like "# -8<- [start:name]" are snippet markers, not headings
1825            let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
1826                crate::utils::mkdocs_snippets::is_snippet_section_start(line)
1827                    || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
1828            } else {
1829                false
1830            };
1831
1832            if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1833                // Skip headings inside HTML comments (using pre-computed ranges for efficiency)
1834                if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset) {
1835                    continue;
1836                }
1837                let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1838                let hashes = caps.get(2).map_or("", |m| m.as_str());
1839                let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1840                let rest = caps.get(4).map_or("", |m| m.as_str());
1841
1842                let level = hashes.len() as u8;
1843                let marker_column = leading_spaces.len();
1844
1845                // Check for closing sequence, but handle custom IDs that might come after
1846                let (text, has_closing, closing_seq) = {
1847                    // First check if there's a custom ID at the end
1848                    let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1849                        // Check if this looks like a valid custom ID (ends with })
1850                        if rest[id_start..].trim_end().ends_with('}') {
1851                            // Split off the custom ID
1852                            (&rest[..id_start], &rest[id_start..])
1853                        } else {
1854                            (rest, "")
1855                        }
1856                    } else {
1857                        (rest, "")
1858                    };
1859
1860                    // Now look for closing hashes in the part before the custom ID
1861                    let trimmed_rest = rest_without_id.trim_end();
1862                    if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1863                        // Look for the start of the hash sequence
1864                        let mut start_of_hashes = last_hash_pos;
1865                        while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1866                            start_of_hashes -= 1;
1867                        }
1868
1869                        // Check if there's at least one space before the closing hashes
1870                        let has_space_before = start_of_hashes == 0
1871                            || trimmed_rest
1872                                .chars()
1873                                .nth(start_of_hashes - 1)
1874                                .is_some_and(|c| c.is_whitespace());
1875
1876                        // Check if this is a valid closing sequence (all hashes to end of trimmed part)
1877                        let potential_closing = &trimmed_rest[start_of_hashes..];
1878                        let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1879
1880                        if is_all_hashes && has_space_before {
1881                            // This is a closing sequence
1882                            let closing_hashes = potential_closing.to_string();
1883                            // The text is everything before the closing hashes
1884                            // Don't include the custom ID here - it will be extracted later
1885                            let text_part = if !custom_id_part.is_empty() {
1886                                // If we have a custom ID, append it back to get the full rest
1887                                // This allows the extract_header_id function to handle it properly
1888                                format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1889                            } else {
1890                                rest_without_id[..start_of_hashes].trim_end().to_string()
1891                            };
1892                            (text_part, true, closing_hashes)
1893                        } else {
1894                            // Not a valid closing sequence, return the full content
1895                            (rest.to_string(), false, String::new())
1896                        }
1897                    } else {
1898                        // No hashes found, return the full content
1899                        (rest.to_string(), false, String::new())
1900                    }
1901                };
1902
1903                let content_column = marker_column + hashes.len() + spaces_after.len();
1904
1905                // Extract custom header ID if present
1906                let raw_text = text.trim().to_string();
1907                let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1908
1909                // If no custom ID was found on the header line, check the next line for standalone attr-list
1910                if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1911                    let next_line = content_lines[i + 1];
1912                    if !lines[i + 1].in_code_block
1913                        && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1914                        && let Some(next_line_id) =
1915                            crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1916                    {
1917                        custom_id = Some(next_line_id);
1918                    }
1919                }
1920
1921                lines[i].heading = Some(HeadingInfo {
1922                    level,
1923                    style: HeadingStyle::ATX,
1924                    marker: hashes.to_string(),
1925                    marker_column,
1926                    content_column,
1927                    text: clean_text,
1928                    custom_id,
1929                    raw_text,
1930                    has_closing_sequence: has_closing,
1931                    closing_sequence: closing_seq,
1932                });
1933            }
1934            // Check for Setext headings (need to look at next line)
1935            else if i + 1 < content_lines.len() && i + 1 < lines.len() {
1936                let next_line = content_lines[i + 1];
1937                if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
1938                    // Skip if next line is front matter delimiter
1939                    if front_matter_end > 0 && i < front_matter_end {
1940                        continue;
1941                    }
1942
1943                    // Skip Setext headings inside HTML comments (using pre-computed ranges for efficiency)
1944                    if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset)
1945                    {
1946                        continue;
1947                    }
1948
1949                    let underline = next_line.trim();
1950
1951                    // Skip if the underline looks like YAML delimiter (exactly 3 or more dashes)
1952                    // YAML uses exactly `---` while Setext headings typically use longer underlines
1953                    if underline == "---" {
1954                        continue;
1955                    }
1956
1957                    // Skip if the current line looks like YAML key-value syntax
1958                    let current_line_trimmed = line.trim();
1959                    if current_line_trimmed.contains(':')
1960                        && !current_line_trimmed.starts_with('#')
1961                        && !current_line_trimmed.contains('[')
1962                        && !current_line_trimmed.contains("](")
1963                    {
1964                        // This looks like "key: value" which suggests YAML, not a heading
1965                        continue;
1966                    }
1967
1968                    let level = if underline.starts_with('=') { 1 } else { 2 };
1969                    let style = if level == 1 {
1970                        HeadingStyle::Setext1
1971                    } else {
1972                        HeadingStyle::Setext2
1973                    };
1974
1975                    // Extract custom header ID if present
1976                    let raw_text = line.trim().to_string();
1977                    let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1978
1979                    // If no custom ID was found on the header line, check the line after underline for standalone attr-list
1980                    if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
1981                        let attr_line = content_lines[i + 2];
1982                        if !lines[i + 2].in_code_block
1983                            && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
1984                            && let Some(attr_line_id) =
1985                                crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
1986                        {
1987                            custom_id = Some(attr_line_id);
1988                        }
1989                    }
1990
1991                    lines[i].heading = Some(HeadingInfo {
1992                        level,
1993                        style,
1994                        marker: underline.to_string(),
1995                        marker_column: next_line.len() - next_line.trim_start().len(),
1996                        content_column: lines[i].indent,
1997                        text: clean_text,
1998                        custom_id,
1999                        raw_text,
2000                        has_closing_sequence: false,
2001                        closing_sequence: String::new(),
2002                    });
2003                }
2004            }
2005        }
2006    }
2007
2008    /// Detect HTML blocks in the content
2009    fn detect_html_blocks(lines: &mut [LineInfo]) {
2010        // HTML block elements that trigger block context
2011        const BLOCK_ELEMENTS: &[&str] = &[
2012            "address",
2013            "article",
2014            "aside",
2015            "blockquote",
2016            "details",
2017            "dialog",
2018            "dd",
2019            "div",
2020            "dl",
2021            "dt",
2022            "fieldset",
2023            "figcaption",
2024            "figure",
2025            "footer",
2026            "form",
2027            "h1",
2028            "h2",
2029            "h3",
2030            "h4",
2031            "h5",
2032            "h6",
2033            "header",
2034            "hr",
2035            "li",
2036            "main",
2037            "nav",
2038            "ol",
2039            "p",
2040            "pre",
2041            "script",
2042            "section",
2043            "style",
2044            "table",
2045            "tbody",
2046            "td",
2047            "tfoot",
2048            "th",
2049            "thead",
2050            "tr",
2051            "ul",
2052        ];
2053
2054        let mut i = 0;
2055        while i < lines.len() {
2056            // Skip if already in code block or front matter
2057            if lines[i].in_code_block || lines[i].in_front_matter {
2058                i += 1;
2059                continue;
2060            }
2061
2062            let trimmed = lines[i].content.trim_start();
2063
2064            // Check if line starts with an HTML tag
2065            if trimmed.starts_with('<') && trimmed.len() > 1 {
2066                // Extract tag name safely
2067                let after_bracket = &trimmed[1..];
2068                let is_closing = after_bracket.starts_with('/');
2069                let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
2070
2071                // Extract tag name (stop at space, >, /, or end of string)
2072                let tag_name = tag_start
2073                    .chars()
2074                    .take_while(|c| c.is_ascii_alphabetic() || *c == '-')
2075                    .collect::<String>()
2076                    .to_lowercase();
2077
2078                // Check if it's a block element
2079                if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
2080                    // Mark this line as in HTML block
2081                    lines[i].in_html_block = true;
2082
2083                    // For simplicity, just mark lines until we find a closing tag or reach a blank line
2084                    // This avoids complex nesting logic that might cause infinite loops
2085                    if !is_closing {
2086                        let closing_tag = format!("</{tag_name}>");
2087                        // style and script tags can contain blank lines (CSS/JS formatting)
2088                        let allow_blank_lines = tag_name == "style" || tag_name == "script";
2089                        let mut j = i + 1;
2090                        while j < lines.len() && j < i + 100 {
2091                            // Limit search to 100 lines
2092                            // Stop at blank lines (except for style/script tags)
2093                            if !allow_blank_lines && lines[j].is_blank {
2094                                break;
2095                            }
2096
2097                            lines[j].in_html_block = true;
2098
2099                            // Check if this line contains the closing tag
2100                            if lines[j].content.contains(&closing_tag) {
2101                                break;
2102                            }
2103                            j += 1;
2104                        }
2105                    }
2106                }
2107            }
2108
2109            i += 1;
2110        }
2111    }
2112
2113    /// Detect ESM import/export blocks in MDX files
2114    /// ESM blocks consist of contiguous import/export statements at the top of the file
2115    fn detect_esm_blocks(lines: &mut [LineInfo], flavor: MarkdownFlavor) {
2116        // Only process MDX files
2117        if !flavor.supports_esm_blocks() {
2118            return;
2119        }
2120
2121        for line in lines.iter_mut() {
2122            // Skip blank lines and comments at the start
2123            if line.is_blank || line.in_html_comment {
2124                continue;
2125            }
2126
2127            // Check if line starts with import or export
2128            let trimmed = line.content.trim_start();
2129            if trimmed.starts_with("import ") || trimmed.starts_with("export ") {
2130                line.in_esm_block = true;
2131            } else {
2132                // Once we hit a non-ESM line, we're done with the ESM block
2133                break;
2134            }
2135        }
2136    }
2137
2138    /// Parse all inline code spans in the content using pulldown-cmark streaming parser
2139    fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
2140        let mut code_spans = Vec::new();
2141
2142        // Quick check - if no backticks, no code spans
2143        if !content.contains('`') {
2144            return code_spans;
2145        }
2146
2147        // Use pulldown-cmark's streaming parser with byte offsets
2148        let parser = Parser::new(content).into_offset_iter();
2149
2150        for (event, range) in parser {
2151            if let Event::Code(_) = event {
2152                let start_pos = range.start;
2153                let end_pos = range.end;
2154
2155                // The range includes the backticks, extract the actual content
2156                let full_span = &content[start_pos..end_pos];
2157                let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
2158
2159                // Extract content between backticks, preserving spaces
2160                let content_start = start_pos + backtick_count;
2161                let content_end = end_pos - backtick_count;
2162                let span_content = if content_start < content_end {
2163                    content[content_start..content_end].to_string()
2164                } else {
2165                    String::new()
2166                };
2167
2168                // Use binary search to find line number - O(log n) instead of O(n)
2169                // Find the rightmost line whose byte_offset <= start_pos
2170                let line_idx = lines
2171                    .partition_point(|line| line.byte_offset <= start_pos)
2172                    .saturating_sub(1);
2173                let line_num = line_idx + 1;
2174                let col_start = start_pos - lines[line_idx].byte_offset;
2175
2176                // Find end column using binary search
2177                let end_line_idx = lines
2178                    .partition_point(|line| line.byte_offset <= end_pos)
2179                    .saturating_sub(1);
2180                let col_end = end_pos - lines[end_line_idx].byte_offset;
2181
2182                code_spans.push(CodeSpan {
2183                    line: line_num,
2184                    start_col: col_start,
2185                    end_col: col_end,
2186                    byte_offset: start_pos,
2187                    byte_end: end_pos,
2188                    backtick_count,
2189                    content: span_content,
2190                });
2191            }
2192        }
2193
2194        // Sort by position to ensure consistent ordering
2195        code_spans.sort_by_key(|span| span.byte_offset);
2196
2197        code_spans
2198    }
2199
2200    /// Parse all list blocks in the content (legacy line-by-line approach)
2201    fn parse_list_blocks(lines: &[LineInfo]) -> Vec<ListBlock> {
2202        // Pre-size based on lines that could be list items
2203        let mut list_blocks = Vec::with_capacity(lines.len() / 10); // Estimate ~10% of lines might start list blocks
2204        let mut current_block: Option<ListBlock> = None;
2205        let mut last_list_item_line = 0;
2206        let mut current_indent_level = 0;
2207        let mut last_marker_width = 0;
2208
2209        for (line_idx, line_info) in lines.iter().enumerate() {
2210            let line_num = line_idx + 1;
2211
2212            // Enhanced code block handling using Design #3's context analysis
2213            if line_info.in_code_block {
2214                if let Some(ref mut block) = current_block {
2215                    // Calculate minimum indentation for list continuation
2216                    let min_continuation_indent = CodeBlockUtils::calculate_min_continuation_indent(lines, line_idx);
2217
2218                    // Analyze code block context using the three-tier classification
2219                    let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
2220
2221                    match context {
2222                        CodeBlockContext::Indented => {
2223                            // Code block is properly indented - continues the list
2224                            block.end_line = line_num;
2225                            continue;
2226                        }
2227                        CodeBlockContext::Standalone => {
2228                            // Code block separates lists - end current block
2229                            let completed_block = current_block.take().unwrap();
2230                            list_blocks.push(completed_block);
2231                            continue;
2232                        }
2233                        CodeBlockContext::Adjacent => {
2234                            // Edge case - use conservative behavior (continue list)
2235                            block.end_line = line_num;
2236                            continue;
2237                        }
2238                    }
2239                } else {
2240                    // No current list block - skip code block lines
2241                    continue;
2242                }
2243            }
2244
2245            // Extract blockquote prefix if any
2246            let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(&line_info.content) {
2247                caps.get(0).unwrap().as_str().to_string()
2248            } else {
2249                String::new()
2250            };
2251
2252            // Check if this line is a list item
2253            if let Some(list_item) = &line_info.list_item {
2254                // Calculate nesting level based on indentation
2255                let item_indent = list_item.marker_column;
2256                let nesting = item_indent / 2; // Assume 2-space indentation for nesting
2257
2258                if let Some(ref mut block) = current_block {
2259                    // Check if this continues the current block
2260                    // For nested lists, we need to check if this is a nested item (higher nesting level)
2261                    // or a continuation at the same or lower level
2262                    let is_nested = nesting > block.nesting_level;
2263                    let same_type =
2264                        (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
2265                    let same_context = block.blockquote_prefix == blockquote_prefix;
2266                    let reasonable_distance = line_num <= last_list_item_line + 2; // Allow one blank line
2267
2268                    // For unordered lists, also check marker consistency
2269                    let marker_compatible =
2270                        block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
2271
2272                    // Check if there's non-list content between the last item and this one
2273                    let has_non_list_content = {
2274                        let mut found_non_list = false;
2275                        // Use the last item from the current block, not the global last_list_item_line
2276                        let block_last_item_line = block.item_lines.last().copied().unwrap_or(block.end_line);
2277
2278                        // Debug: Special check for problematic line
2279                        if block_last_item_line > 0 && block_last_item_line <= lines.len() {
2280                            let last_line = &lines[block_last_item_line - 1];
2281                            if last_line.content.contains(r"`sqlalchemy`") && last_line.content.contains(r"\`") {
2282                                log::debug!(
2283                                    "After problematic line {}: checking lines {} to {} for non-list content",
2284                                    block_last_item_line,
2285                                    block_last_item_line + 1,
2286                                    line_num
2287                                );
2288                                // If they're consecutive list items, there's no content between
2289                                if line_num == block_last_item_line + 1 {
2290                                    log::debug!("Lines are consecutive, no content between");
2291                                }
2292                            }
2293                        }
2294
2295                        for check_line in (block_last_item_line + 1)..line_num {
2296                            let check_idx = check_line - 1;
2297                            if check_idx < lines.len() {
2298                                let check_info = &lines[check_idx];
2299                                // Check for content that breaks the list
2300                                let is_list_breaking_content = if check_info.in_code_block {
2301                                    // Use enhanced code block classification for list separation
2302                                    let last_item_marker_width =
2303                                        if block_last_item_line > 0 && block_last_item_line <= lines.len() {
2304                                            lines[block_last_item_line - 1]
2305                                                .list_item
2306                                                .as_ref()
2307                                                .map(|li| {
2308                                                    if li.is_ordered {
2309                                                        li.marker.len() + 1 // Add 1 for the space after ordered list markers
2310                                                    } else {
2311                                                        li.marker.len()
2312                                                    }
2313                                                })
2314                                                .unwrap_or(3) // fallback to 3 if no list item found
2315                                        } else {
2316                                            3 // fallback
2317                                        };
2318
2319                                    let min_continuation = if block.is_ordered { last_item_marker_width } else { 2 };
2320
2321                                    // Analyze code block context using our enhanced classification
2322                                    let context = CodeBlockUtils::analyze_code_block_context(
2323                                        lines,
2324                                        check_line - 1,
2325                                        min_continuation,
2326                                    );
2327
2328                                    // Standalone code blocks break lists, indented ones continue them
2329                                    matches!(context, CodeBlockContext::Standalone)
2330                                } else if !check_info.is_blank && check_info.list_item.is_none() {
2331                                    // Check for structural separators that should break lists (from issue #42)
2332                                    let line_content = check_info.content.trim();
2333
2334                                    // Any of these structural separators break lists
2335                                    if check_info.heading.is_some()
2336                                        || line_content.starts_with("---")
2337                                        || line_content.starts_with("***")
2338                                        || line_content.starts_with("___")
2339                                        || (line_content.contains('|')
2340                                            && !line_content.contains("](")
2341                                            && !line_content.contains("http")
2342                                            && (line_content.matches('|').count() > 1
2343                                                || line_content.starts_with('|')
2344                                                || line_content.ends_with('|')))
2345                                        || line_content.starts_with(">")
2346                                    {
2347                                        true
2348                                    }
2349                                    // Other non-list content - check if properly indented
2350                                    else {
2351                                        let last_item_marker_width =
2352                                            if block_last_item_line > 0 && block_last_item_line <= lines.len() {
2353                                                lines[block_last_item_line - 1]
2354                                                    .list_item
2355                                                    .as_ref()
2356                                                    .map(|li| {
2357                                                        if li.is_ordered {
2358                                                            li.marker.len() + 1 // Add 1 for the space after ordered list markers
2359                                                        } else {
2360                                                            li.marker.len()
2361                                                        }
2362                                                    })
2363                                                    .unwrap_or(3) // fallback to 3 if no list item found
2364                                            } else {
2365                                                3 // fallback
2366                                            };
2367
2368                                        let min_continuation =
2369                                            if block.is_ordered { last_item_marker_width } else { 2 };
2370                                        check_info.indent < min_continuation
2371                                    }
2372                                } else {
2373                                    false
2374                                };
2375
2376                                if is_list_breaking_content {
2377                                    // Not indented enough, so it breaks the list
2378                                    found_non_list = true;
2379                                    break;
2380                                }
2381                            }
2382                        }
2383                        found_non_list
2384                    };
2385
2386                    // A list continues if:
2387                    // 1. It's a nested item (indented more than the parent), OR
2388                    // 2. It's the same type at the same level with reasonable distance
2389                    let mut continues_list = if is_nested {
2390                        // Nested items always continue the list if they're in the same context
2391                        same_context && reasonable_distance && !has_non_list_content
2392                    } else {
2393                        // Same-level items need to match type and markers
2394                        let result = same_type
2395                            && same_context
2396                            && reasonable_distance
2397                            && marker_compatible
2398                            && !has_non_list_content;
2399
2400                        // Debug logging for lines after problematic content
2401                        if block.item_lines.last().is_some_and(|&last_line| {
2402                            last_line > 0
2403                                && last_line <= lines.len()
2404                                && lines[last_line - 1].content.contains(r"`sqlalchemy`")
2405                                && lines[last_line - 1].content.contains(r"\`")
2406                        }) {
2407                            log::debug!(
2408                                "List continuation check after problematic line at line {line_num}: same_type={same_type}, same_context={same_context}, reasonable_distance={reasonable_distance}, marker_compatible={marker_compatible}, has_non_list_content={has_non_list_content}, continues={result}"
2409                            );
2410                            if line_num > 0 && line_num <= lines.len() {
2411                                log::debug!("Current line content: {:?}", lines[line_num - 1].content);
2412                            }
2413                        }
2414
2415                        result
2416                    };
2417
2418                    // WORKAROUND: If items are truly consecutive (no blank lines), they MUST be in the same list
2419                    // This handles edge cases where content patterns might otherwise split lists incorrectly
2420                    if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
2421                        // Check if the previous line was a list item
2422                        if block.item_lines.contains(&(line_num - 1)) {
2423                            // They're consecutive list items - force them to be in the same list
2424                            continues_list = true;
2425                        }
2426                    }
2427
2428                    if continues_list {
2429                        // Extend current block
2430                        block.end_line = line_num;
2431                        block.item_lines.push(line_num);
2432
2433                        // Update max marker width
2434                        block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
2435                            list_item.marker.len() + 1
2436                        } else {
2437                            list_item.marker.len()
2438                        });
2439
2440                        // Update marker consistency for unordered lists
2441                        if !block.is_ordered
2442                            && block.marker.is_some()
2443                            && block.marker.as_ref() != Some(&list_item.marker)
2444                        {
2445                            // Mixed markers, clear the marker field
2446                            block.marker = None;
2447                        }
2448                    } else {
2449                        // End current block and start a new one
2450
2451                        list_blocks.push(block.clone());
2452
2453                        *block = ListBlock {
2454                            start_line: line_num,
2455                            end_line: line_num,
2456                            is_ordered: list_item.is_ordered,
2457                            marker: if list_item.is_ordered {
2458                                None
2459                            } else {
2460                                Some(list_item.marker.clone())
2461                            },
2462                            blockquote_prefix: blockquote_prefix.clone(),
2463                            item_lines: vec![line_num],
2464                            nesting_level: nesting,
2465                            max_marker_width: if list_item.is_ordered {
2466                                list_item.marker.len() + 1
2467                            } else {
2468                                list_item.marker.len()
2469                            },
2470                        };
2471                    }
2472                } else {
2473                    // Start a new block
2474                    current_block = Some(ListBlock {
2475                        start_line: line_num,
2476                        end_line: line_num,
2477                        is_ordered: list_item.is_ordered,
2478                        marker: if list_item.is_ordered {
2479                            None
2480                        } else {
2481                            Some(list_item.marker.clone())
2482                        },
2483                        blockquote_prefix,
2484                        item_lines: vec![line_num],
2485                        nesting_level: nesting,
2486                        max_marker_width: list_item.marker.len(),
2487                    });
2488                }
2489
2490                last_list_item_line = line_num;
2491                current_indent_level = item_indent;
2492                last_marker_width = if list_item.is_ordered {
2493                    list_item.marker.len() + 1 // Add 1 for the space after ordered list markers
2494                } else {
2495                    list_item.marker.len()
2496                };
2497            } else if let Some(ref mut block) = current_block {
2498                // Not a list item - check if it continues the current block
2499
2500                // For MD032 compatibility, we use a simple approach:
2501                // - Indented lines continue the list
2502                // - Blank lines followed by indented content continue the list
2503                // - Everything else ends the list
2504
2505                // Check if the last line in the list block ended with a backslash (hard line break)
2506                // This handles cases where list items use backslash for hard line breaks
2507                let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
2508                    lines[block.end_line - 1].content.trim_end().ends_with('\\')
2509                } else {
2510                    false
2511                };
2512
2513                // Calculate minimum indentation for list continuation
2514                // For ordered lists, use the last marker width (e.g., 3 for "1. ", 4 for "10. ")
2515                // For unordered lists like "- ", content starts at column 2, so continuations need at least 2 spaces
2516                let min_continuation_indent = if block.is_ordered {
2517                    current_indent_level + last_marker_width
2518                } else {
2519                    current_indent_level + 2 // Unordered lists need at least 2 spaces (e.g., "- " = 2 chars)
2520                };
2521
2522                if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
2523                    // Indented line or backslash continuation continues the list
2524                    block.end_line = line_num;
2525                } else if line_info.is_blank {
2526                    // Blank line - check if it's internal to the list or ending it
2527                    // We only include blank lines that are followed by more list content
2528                    let mut check_idx = line_idx + 1;
2529                    let mut found_continuation = false;
2530
2531                    // Skip additional blank lines
2532                    while check_idx < lines.len() && lines[check_idx].is_blank {
2533                        check_idx += 1;
2534                    }
2535
2536                    if check_idx < lines.len() {
2537                        let next_line = &lines[check_idx];
2538                        // Check if followed by indented content (list continuation)
2539                        if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
2540                            found_continuation = true;
2541                        }
2542                        // Check if followed by another list item at the same level
2543                        else if !next_line.in_code_block
2544                            && next_line.list_item.is_some()
2545                            && let Some(item) = &next_line.list_item
2546                        {
2547                            let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
2548                                .find(&next_line.content)
2549                                .map_or(String::new(), |m| m.as_str().to_string());
2550                            if item.marker_column == current_indent_level
2551                                && item.is_ordered == block.is_ordered
2552                                && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
2553                            {
2554                                // Check if there was meaningful content between the list items (unused now)
2555                                // This variable is kept for potential future use but is currently replaced by has_structural_separators
2556                                let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
2557                                    if let Some(between_line) = lines.get(idx) {
2558                                        let trimmed = between_line.content.trim();
2559                                        // Skip empty lines
2560                                        if trimmed.is_empty() {
2561                                            return false;
2562                                        }
2563                                        // Check for meaningful content
2564                                        let line_indent =
2565                                            between_line.content.len() - between_line.content.trim_start().len();
2566
2567                                        // Structural separators (code fences, headings, etc.) are meaningful and should BREAK lists
2568                                        if trimmed.starts_with("```")
2569                                            || trimmed.starts_with("~~~")
2570                                            || trimmed.starts_with("---")
2571                                            || trimmed.starts_with("***")
2572                                            || trimmed.starts_with("___")
2573                                            || trimmed.starts_with(">")
2574                                            || trimmed.contains('|') // Tables
2575                                            || between_line.heading.is_some()
2576                                        {
2577                                            return true; // These are structural separators - meaningful content that breaks lists
2578                                        }
2579
2580                                        // Only properly indented content continues the list
2581                                        line_indent >= min_continuation_indent
2582                                    } else {
2583                                        false
2584                                    }
2585                                });
2586
2587                                if block.is_ordered {
2588                                    // For ordered lists: don't continue if there are structural separators
2589                                    // Check if there are structural separators between the list items
2590                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2591                                        if let Some(between_line) = lines.get(idx) {
2592                                            let trimmed = between_line.content.trim();
2593                                            if trimmed.is_empty() {
2594                                                return false;
2595                                            }
2596                                            // Check for structural separators that break lists
2597                                            trimmed.starts_with("```")
2598                                                || trimmed.starts_with("~~~")
2599                                                || trimmed.starts_with("---")
2600                                                || trimmed.starts_with("***")
2601                                                || trimmed.starts_with("___")
2602                                                || trimmed.starts_with(">")
2603                                                || trimmed.contains('|') // Tables
2604                                                || between_line.heading.is_some()
2605                                        } else {
2606                                            false
2607                                        }
2608                                    });
2609                                    found_continuation = !has_structural_separators;
2610                                } else {
2611                                    // For unordered lists: also check for structural separators
2612                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2613                                        if let Some(between_line) = lines.get(idx) {
2614                                            let trimmed = between_line.content.trim();
2615                                            if trimmed.is_empty() {
2616                                                return false;
2617                                            }
2618                                            // Check for structural separators that break lists
2619                                            trimmed.starts_with("```")
2620                                                || trimmed.starts_with("~~~")
2621                                                || trimmed.starts_with("---")
2622                                                || trimmed.starts_with("***")
2623                                                || trimmed.starts_with("___")
2624                                                || trimmed.starts_with(">")
2625                                                || trimmed.contains('|') // Tables
2626                                                || between_line.heading.is_some()
2627                                        } else {
2628                                            false
2629                                        }
2630                                    });
2631                                    found_continuation = !has_structural_separators;
2632                                }
2633                            }
2634                        }
2635                    }
2636
2637                    if found_continuation {
2638                        // Include the blank line in the block
2639                        block.end_line = line_num;
2640                    } else {
2641                        // Blank line ends the list - don't include it
2642                        list_blocks.push(block.clone());
2643                        current_block = None;
2644                    }
2645                } else {
2646                    // Check for lazy continuation - non-indented line immediately after a list item
2647                    // But only if the line has sufficient indentation for the list type
2648                    let min_required_indent = if block.is_ordered {
2649                        current_indent_level + last_marker_width
2650                    } else {
2651                        current_indent_level + 2
2652                    };
2653
2654                    // For lazy continuation to apply, the line must either:
2655                    // 1. Have no indentation (true lazy continuation)
2656                    // 2. Have sufficient indentation for the list type
2657                    // BUT structural separators (headings, code blocks, etc.) should never be lazy continuations
2658                    let line_content = line_info.content.trim();
2659                    let is_structural_separator = line_info.heading.is_some()
2660                        || line_content.starts_with("```")
2661                        || line_content.starts_with("~~~")
2662                        || line_content.starts_with("---")
2663                        || line_content.starts_with("***")
2664                        || line_content.starts_with("___")
2665                        || line_content.starts_with(">")
2666                        || (line_content.contains('|')
2667                            && !line_content.contains("](")
2668                            && !line_content.contains("http")
2669                            && (line_content.matches('|').count() > 1
2670                                || line_content.starts_with('|')
2671                                || line_content.ends_with('|'))); // Tables
2672
2673                    // Allow lazy continuation if we're still within the same list block
2674                    // (not just immediately after a list item)
2675                    let is_lazy_continuation = !is_structural_separator
2676                        && !line_info.is_blank
2677                        && (line_info.indent == 0 || line_info.indent >= min_required_indent);
2678
2679                    if is_lazy_continuation {
2680                        // Additional check: if the line starts with uppercase and looks like a new sentence,
2681                        // it's probably not a continuation
2682                        let content_to_check = if !blockquote_prefix.is_empty() {
2683                            // Strip blockquote prefix to check the actual content
2684                            line_info
2685                                .content
2686                                .strip_prefix(&blockquote_prefix)
2687                                .unwrap_or(&line_info.content)
2688                                .trim()
2689                        } else {
2690                            line_info.content.trim()
2691                        };
2692
2693                        let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
2694
2695                        // If it starts with uppercase and the previous line ended with punctuation,
2696                        // it's likely a new paragraph, not a continuation
2697                        if starts_with_uppercase && last_list_item_line > 0 {
2698                            // This looks like a new paragraph
2699                            list_blocks.push(block.clone());
2700                            current_block = None;
2701                        } else {
2702                            // This is a lazy continuation line
2703                            block.end_line = line_num;
2704                        }
2705                    } else {
2706                        // Non-indented, non-blank line that's not a lazy continuation - end the block
2707                        list_blocks.push(block.clone());
2708                        current_block = None;
2709                    }
2710                }
2711            }
2712        }
2713
2714        // Don't forget the last block
2715        if let Some(block) = current_block {
2716            list_blocks.push(block);
2717        }
2718
2719        // Merge adjacent blocks that should be one
2720        merge_adjacent_list_blocks(&mut list_blocks, lines);
2721
2722        list_blocks
2723    }
2724
2725    /// Compute character frequency for fast content analysis
2726    fn compute_char_frequency(content: &str) -> CharFrequency {
2727        let mut frequency = CharFrequency::default();
2728
2729        for ch in content.chars() {
2730            match ch {
2731                '#' => frequency.hash_count += 1,
2732                '*' => frequency.asterisk_count += 1,
2733                '_' => frequency.underscore_count += 1,
2734                '-' => frequency.hyphen_count += 1,
2735                '+' => frequency.plus_count += 1,
2736                '>' => frequency.gt_count += 1,
2737                '|' => frequency.pipe_count += 1,
2738                '[' => frequency.bracket_count += 1,
2739                '`' => frequency.backtick_count += 1,
2740                '<' => frequency.lt_count += 1,
2741                '!' => frequency.exclamation_count += 1,
2742                '\n' => frequency.newline_count += 1,
2743                _ => {}
2744            }
2745        }
2746
2747        frequency
2748    }
2749
2750    /// Parse HTML tags in the content
2751    fn parse_html_tags(
2752        content: &str,
2753        lines: &[LineInfo],
2754        code_blocks: &[(usize, usize)],
2755        flavor: MarkdownFlavor,
2756    ) -> Vec<HtmlTag> {
2757        static HTML_TAG_REGEX: LazyLock<regex::Regex> =
2758            LazyLock::new(|| regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap());
2759
2760        let mut html_tags = Vec::with_capacity(content.matches('<').count());
2761
2762        for cap in HTML_TAG_REGEX.captures_iter(content) {
2763            let full_match = cap.get(0).unwrap();
2764            let match_start = full_match.start();
2765            let match_end = full_match.end();
2766
2767            // Skip if in code block
2768            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2769                continue;
2770            }
2771
2772            let is_closing = !cap.get(1).unwrap().as_str().is_empty();
2773            let tag_name_original = cap.get(2).unwrap().as_str();
2774            let tag_name = tag_name_original.to_lowercase();
2775            let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
2776
2777            // Skip JSX components in MDX files (tags starting with uppercase letter)
2778            // JSX components like <Chart />, <MyComponent> should not be treated as HTML
2779            if flavor.supports_jsx() && tag_name_original.chars().next().is_some_and(|c| c.is_uppercase()) {
2780                continue;
2781            }
2782
2783            // Find which line this tag is on
2784            let mut line_num = 1;
2785            let mut col_start = match_start;
2786            let mut col_end = match_end;
2787            for (idx, line_info) in lines.iter().enumerate() {
2788                if match_start >= line_info.byte_offset {
2789                    line_num = idx + 1;
2790                    col_start = match_start - line_info.byte_offset;
2791                    col_end = match_end - line_info.byte_offset;
2792                } else {
2793                    break;
2794                }
2795            }
2796
2797            html_tags.push(HtmlTag {
2798                line: line_num,
2799                start_col: col_start,
2800                end_col: col_end,
2801                byte_offset: match_start,
2802                byte_end: match_end,
2803                tag_name,
2804                is_closing,
2805                is_self_closing,
2806                raw_content: full_match.as_str().to_string(),
2807            });
2808        }
2809
2810        html_tags
2811    }
2812
2813    /// Parse emphasis spans in the content
2814    fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
2815        static EMPHASIS_REGEX: LazyLock<regex::Regex> =
2816            LazyLock::new(|| regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap());
2817
2818        let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
2819
2820        for cap in EMPHASIS_REGEX.captures_iter(content) {
2821            let full_match = cap.get(0).unwrap();
2822            let match_start = full_match.start();
2823            let match_end = full_match.end();
2824
2825            // Skip if in code block
2826            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2827                continue;
2828            }
2829
2830            let opening_markers = cap.get(1).unwrap().as_str();
2831            let content_part = cap.get(2).unwrap().as_str();
2832            let closing_markers = cap.get(3).unwrap().as_str();
2833
2834            // Validate matching markers
2835            if opening_markers.chars().next() != closing_markers.chars().next()
2836                || opening_markers.len() != closing_markers.len()
2837            {
2838                continue;
2839            }
2840
2841            let marker = opening_markers.chars().next().unwrap();
2842            let marker_count = opening_markers.len();
2843
2844            // Find which line this emphasis is on
2845            let mut line_num = 1;
2846            let mut col_start = match_start;
2847            let mut col_end = match_end;
2848            for (idx, line_info) in lines.iter().enumerate() {
2849                if match_start >= line_info.byte_offset {
2850                    line_num = idx + 1;
2851                    col_start = match_start - line_info.byte_offset;
2852                    col_end = match_end - line_info.byte_offset;
2853                } else {
2854                    break;
2855                }
2856            }
2857
2858            emphasis_spans.push(EmphasisSpan {
2859                line: line_num,
2860                start_col: col_start,
2861                end_col: col_end,
2862                byte_offset: match_start,
2863                byte_end: match_end,
2864                marker,
2865                marker_count,
2866                content: content_part.to_string(),
2867            });
2868        }
2869
2870        emphasis_spans
2871    }
2872
2873    /// Parse table rows in the content
2874    fn parse_table_rows(lines: &[LineInfo]) -> Vec<TableRow> {
2875        let mut table_rows = Vec::with_capacity(lines.len() / 20);
2876
2877        for (line_idx, line_info) in lines.iter().enumerate() {
2878            // Skip lines in code blocks or blank lines
2879            if line_info.in_code_block || line_info.is_blank {
2880                continue;
2881            }
2882
2883            let line = &line_info.content;
2884            let line_num = line_idx + 1;
2885
2886            // Check if this line contains pipes (potential table row)
2887            if !line.contains('|') {
2888                continue;
2889            }
2890
2891            // Count columns by splitting on pipes
2892            let parts: Vec<&str> = line.split('|').collect();
2893            let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
2894
2895            // Check if this is a separator row
2896            let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
2897            let mut column_alignments = Vec::new();
2898
2899            if is_separator {
2900                for part in &parts[1..parts.len() - 1] {
2901                    // Skip first and last empty parts
2902                    let trimmed = part.trim();
2903                    let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
2904                        "center".to_string()
2905                    } else if trimmed.ends_with(':') {
2906                        "right".to_string()
2907                    } else if trimmed.starts_with(':') {
2908                        "left".to_string()
2909                    } else {
2910                        "none".to_string()
2911                    };
2912                    column_alignments.push(alignment);
2913                }
2914            }
2915
2916            table_rows.push(TableRow {
2917                line: line_num,
2918                is_separator,
2919                column_count,
2920                column_alignments,
2921            });
2922        }
2923
2924        table_rows
2925    }
2926
2927    /// Parse bare URLs and emails in the content
2928    fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
2929        let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
2930
2931        // Check for bare URLs (not in angle brackets or markdown links)
2932        for cap in BARE_URL_PATTERN.captures_iter(content) {
2933            let full_match = cap.get(0).unwrap();
2934            let match_start = full_match.start();
2935            let match_end = full_match.end();
2936
2937            // Skip if in code block
2938            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2939                continue;
2940            }
2941
2942            // Skip if already in angle brackets or markdown links
2943            let preceding_char = if match_start > 0 {
2944                content.chars().nth(match_start - 1)
2945            } else {
2946                None
2947            };
2948            let following_char = content.chars().nth(match_end);
2949
2950            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2951                continue;
2952            }
2953            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2954                continue;
2955            }
2956
2957            let url = full_match.as_str();
2958            let url_type = if url.starts_with("https://") {
2959                "https"
2960            } else if url.starts_with("http://") {
2961                "http"
2962            } else if url.starts_with("ftp://") {
2963                "ftp"
2964            } else {
2965                "other"
2966            };
2967
2968            // Find which line this URL is on
2969            let mut line_num = 1;
2970            let mut col_start = match_start;
2971            let mut col_end = match_end;
2972            for (idx, line_info) in lines.iter().enumerate() {
2973                if match_start >= line_info.byte_offset {
2974                    line_num = idx + 1;
2975                    col_start = match_start - line_info.byte_offset;
2976                    col_end = match_end - line_info.byte_offset;
2977                } else {
2978                    break;
2979                }
2980            }
2981
2982            bare_urls.push(BareUrl {
2983                line: line_num,
2984                start_col: col_start,
2985                end_col: col_end,
2986                byte_offset: match_start,
2987                byte_end: match_end,
2988                url: url.to_string(),
2989                url_type: url_type.to_string(),
2990            });
2991        }
2992
2993        // Check for bare email addresses
2994        for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
2995            let full_match = cap.get(0).unwrap();
2996            let match_start = full_match.start();
2997            let match_end = full_match.end();
2998
2999            // Skip if in code block
3000            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3001                continue;
3002            }
3003
3004            // Skip if already in angle brackets or markdown links
3005            let preceding_char = if match_start > 0 {
3006                content.chars().nth(match_start - 1)
3007            } else {
3008                None
3009            };
3010            let following_char = content.chars().nth(match_end);
3011
3012            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
3013                continue;
3014            }
3015            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
3016                continue;
3017            }
3018
3019            let email = full_match.as_str();
3020
3021            // Find which line this email is on
3022            let mut line_num = 1;
3023            let mut col_start = match_start;
3024            let mut col_end = match_end;
3025            for (idx, line_info) in lines.iter().enumerate() {
3026                if match_start >= line_info.byte_offset {
3027                    line_num = idx + 1;
3028                    col_start = match_start - line_info.byte_offset;
3029                    col_end = match_end - line_info.byte_offset;
3030                } else {
3031                    break;
3032                }
3033            }
3034
3035            bare_urls.push(BareUrl {
3036                line: line_num,
3037                start_col: col_start,
3038                end_col: col_end,
3039                byte_offset: match_start,
3040                byte_end: match_end,
3041                url: email.to_string(),
3042                url_type: "email".to_string(),
3043            });
3044        }
3045
3046        bare_urls
3047    }
3048}
3049
3050/// Merge adjacent list blocks that should be treated as one
3051fn merge_adjacent_list_blocks(list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
3052    if list_blocks.len() < 2 {
3053        return;
3054    }
3055
3056    let mut merger = ListBlockMerger::new(lines);
3057    *list_blocks = merger.merge(list_blocks);
3058}
3059
3060/// Helper struct to manage the complex logic of merging list blocks
3061struct ListBlockMerger<'a> {
3062    lines: &'a [LineInfo],
3063}
3064
3065impl<'a> ListBlockMerger<'a> {
3066    fn new(lines: &'a [LineInfo]) -> Self {
3067        Self { lines }
3068    }
3069
3070    fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
3071        let mut merged = Vec::with_capacity(list_blocks.len());
3072        let mut current = list_blocks[0].clone();
3073
3074        for next in list_blocks.iter().skip(1) {
3075            if self.should_merge_blocks(&current, next) {
3076                current = self.merge_two_blocks(current, next);
3077            } else {
3078                merged.push(current);
3079                current = next.clone();
3080            }
3081        }
3082
3083        merged.push(current);
3084        merged
3085    }
3086
3087    /// Determine if two adjacent list blocks should be merged
3088    fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
3089        // Basic compatibility checks
3090        if !self.blocks_are_compatible(current, next) {
3091            return false;
3092        }
3093
3094        // Check spacing and content between blocks
3095        let spacing = self.analyze_spacing_between(current, next);
3096        match spacing {
3097            BlockSpacing::Consecutive => true,
3098            BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
3099            BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
3100                self.can_merge_with_content_between(current, next)
3101            }
3102        }
3103    }
3104
3105    /// Check if blocks have compatible structure for merging
3106    fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
3107        current.is_ordered == next.is_ordered
3108            && current.blockquote_prefix == next.blockquote_prefix
3109            && current.nesting_level == next.nesting_level
3110    }
3111
3112    /// Analyze the spacing between two list blocks
3113    fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
3114        let gap = next.start_line - current.end_line;
3115
3116        match gap {
3117            1 => BlockSpacing::Consecutive,
3118            2 => BlockSpacing::SingleBlank,
3119            _ if gap > 2 => {
3120                if self.has_only_blank_lines_between(current, next) {
3121                    BlockSpacing::MultipleBlanks
3122                } else {
3123                    BlockSpacing::ContentBetween
3124                }
3125            }
3126            _ => BlockSpacing::Consecutive, // gap == 0, overlapping (shouldn't happen)
3127        }
3128    }
3129
3130    /// Check if unordered lists can be merged with a single blank line between
3131    fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3132        // Check if there are structural separators between the blocks
3133        // If has_meaningful_content_between returns true, it means there are structural separators
3134        if has_meaningful_content_between(current, next, self.lines) {
3135            return false; // Structural separators prevent merging
3136        }
3137
3138        // Only merge unordered lists with same marker across single blank
3139        !current.is_ordered && current.marker == next.marker
3140    }
3141
3142    /// Check if ordered lists can be merged when there's content between them
3143    fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3144        // Do not merge lists if there are structural separators between them
3145        if has_meaningful_content_between(current, next, self.lines) {
3146            return false; // Structural separators prevent merging
3147        }
3148
3149        // Only consider merging ordered lists if there's no structural content between
3150        current.is_ordered && next.is_ordered
3151    }
3152
3153    /// Check if there are only blank lines between blocks
3154    fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3155        for line_num in (current.end_line + 1)..next.start_line {
3156            if let Some(line_info) = self.lines.get(line_num - 1)
3157                && !line_info.content.trim().is_empty()
3158            {
3159                return false;
3160            }
3161        }
3162        true
3163    }
3164
3165    /// Merge two compatible list blocks into one
3166    fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
3167        current.end_line = next.end_line;
3168        current.item_lines.extend_from_slice(&next.item_lines);
3169
3170        // Update max marker width
3171        current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
3172
3173        // Handle marker consistency for unordered lists
3174        if !current.is_ordered && self.markers_differ(&current, next) {
3175            current.marker = None; // Mixed markers
3176        }
3177
3178        current
3179    }
3180
3181    /// Check if two blocks have different markers
3182    fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
3183        current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
3184    }
3185}
3186
/// Types of spacing between list blocks
///
/// Produced by `ListBlockMerger::analyze_spacing_between` from the distance
/// between one block's `end_line` and the next block's `start_line`.
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    /// No gap between blocks: the next block starts on the line right after
    /// the current one ends.
    Consecutive,
    /// One blank line between blocks (exactly one line separates them).
    SingleBlank,
    /// Multiple blank lines but no content
    MultipleBlanks,
    /// Content exists between blocks
    ContentBetween,
}
3195
3196/// Check if there's meaningful content (not just blank lines) between two list blocks
3197fn has_meaningful_content_between(current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
3198    // Check lines between current.end_line and next.start_line
3199    for line_num in (current.end_line + 1)..next.start_line {
3200        if let Some(line_info) = lines.get(line_num - 1) {
3201            // Convert to 0-indexed
3202            let trimmed = line_info.content.trim();
3203
3204            // Skip empty lines
3205            if trimmed.is_empty() {
3206                continue;
3207            }
3208
3209            // Check for structural separators that should separate lists (CommonMark compliant)
3210
3211            // Headings separate lists
3212            if line_info.heading.is_some() {
3213                return true; // Has meaningful content - headings separate lists
3214            }
3215
3216            // Horizontal rules separate lists (---, ***, ___)
3217            if is_horizontal_rule(trimmed) {
3218                return true; // Has meaningful content - horizontal rules separate lists
3219            }
3220
3221            // Tables separate lists (lines containing | but not in URLs or code)
3222            // Simple heuristic: tables typically have | at start/end or multiple |
3223            if trimmed.contains('|') && trimmed.len() > 1 {
3224                // Don't treat URLs with | as tables
3225                if !trimmed.contains("](") && !trimmed.contains("http") {
3226                    // More robust check: tables usually have multiple | or | at edges
3227                    let pipe_count = trimmed.matches('|').count();
3228                    if pipe_count > 1 || trimmed.starts_with('|') || trimmed.ends_with('|') {
3229                        return true; // Has meaningful content - tables separate lists
3230                    }
3231                }
3232            }
3233
3234            // Blockquotes separate lists
3235            if trimmed.starts_with('>') {
3236                return true; // Has meaningful content - blockquotes separate lists
3237            }
3238
3239            // Code block fences separate lists (unless properly indented as list content)
3240            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
3241                let line_indent = line_info.content.len() - line_info.content.trim_start().len();
3242
3243                // Check if this code block is properly indented as list continuation
3244                let min_continuation_indent = if current.is_ordered {
3245                    current.nesting_level + current.max_marker_width + 1 // +1 for space after marker
3246                } else {
3247                    current.nesting_level + 2
3248                };
3249
3250                if line_indent < min_continuation_indent {
3251                    // This is a standalone code block that separates lists
3252                    return true; // Has meaningful content - standalone code blocks separate lists
3253                }
3254            }
3255
3256            // Check if this line has proper indentation for list continuation
3257            let line_indent = line_info.content.len() - line_info.content.trim_start().len();
3258
3259            // Calculate minimum indentation needed to be list continuation
3260            let min_indent = if current.is_ordered {
3261                current.nesting_level + current.max_marker_width
3262            } else {
3263                current.nesting_level + 2
3264            };
3265
3266            // If the line is not indented enough to be list continuation, it's meaningful content
3267            if line_indent < min_indent {
3268                return true; // Has meaningful content - content not indented as list continuation
3269            }
3270
3271            // If we reach here, the line is properly indented as list continuation
3272            // Continue checking other lines
3273        }
3274    }
3275
3276    // Only blank lines or properly indented list continuation content between blocks
3277    false
3278}
3279
/// Check if a line is a horizontal rule (---, ***, ___)
///
/// `trimmed` is expected to have no surrounding whitespace. It is a rule when
/// it starts with `-`, `*` or `_`, contains that character at least three
/// times, and holds nothing else except spaces and tabs.
fn is_horizontal_rule(trimmed: &str) -> bool {
    // Fewer than three bytes cannot hold three marker characters.
    if trimmed.len() < 3 {
        return false;
    }

    // The first character decides which marker the whole line must use.
    let Some(rule_char) = trimmed.chars().next().filter(|&ch| matches!(ch, '-' | '*' | '_')) else {
        return false;
    };

    let only_markers_and_blanks = trimmed
        .chars()
        .all(|ch| ch == rule_char || matches!(ch, ' ' | '\t'));

    only_markers_and_blanks && trimmed.chars().filter(|&ch| ch == rule_char).count() >= 3
}
3303
/// Unit tests for `LintContext`: line offsets, offset-to-line/column mapping,
/// per-line metadata, list item detection and MDX ESM block detection.
#[cfg(test)]
mod tests {
    use super::*;

    // Empty input: a single line offset (0) and no `LineInfo` entries.
    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.lines.len(), 0);
    }

    // Single line without trailing newline; line and column are 1-based.
    #[test]
    fn test_single_line() {
        let ctx = LintContext::new("# Hello", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "# Hello");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(3), (1, 4));
    }

    // `line_offsets` holds the byte offset at which each line starts.
    #[test]
    fn test_multi_line() {
        let content = "# Title\n\nSecond line\nThird line";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);
        // Test offset to line/col
        assert_eq!(ctx.offset_to_line_col(0), (1, 1)); // start
        assert_eq!(ctx.offset_to_line_col(8), (2, 1)); // start of blank line
        assert_eq!(ctx.offset_to_line_col(9), (3, 1)); // start of 'Second line'
        assert_eq!(ctx.offset_to_line_col(15), (3, 7)); // middle of 'Second line'
        assert_eq!(ctx.offset_to_line_col(21), (4, 1)); // start of 'Third line'
    }

    // Per-line metadata: content, byte offset, indent, blank flag, code block
    // flag and list item, plus the 1-based lookup helpers.
    #[test]
    fn test_line_info() {
        let content = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        // Test line info
        assert_eq!(ctx.lines.len(), 7);

        // Line 1: "# Title"
        let line1 = &ctx.lines[0];
        assert_eq!(line1.content, "# Title");
        assert_eq!(line1.byte_offset, 0);
        assert_eq!(line1.indent, 0);
        assert!(!line1.is_blank);
        assert!(!line1.in_code_block);
        assert!(line1.list_item.is_none());

        // Line 2: "    indented"
        let line2 = &ctx.lines[1];
        assert_eq!(line2.content, "    indented");
        assert_eq!(line2.byte_offset, 8);
        assert_eq!(line2.indent, 4);
        assert!(!line2.is_blank);

        // Line 3: "" (blank)
        let line3 = &ctx.lines[2];
        assert_eq!(line3.content, "");
        assert!(line3.is_blank);

        // Test helper methods
        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
        assert_eq!(ctx.line_info(1).map(|l| l.indent), Some(0));
        assert_eq!(ctx.line_info(2).map(|l| l.indent), Some(4));
    }

    // Unordered and ordered markers are detected along with their columns;
    // plain text after a blank line is not a list item.
    #[test]
    fn test_list_item_detection() {
        let content = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        // Line 1: "- Unordered item"
        let line1 = &ctx.lines[0];
        assert!(line1.list_item.is_some());
        let list1 = line1.list_item.as_ref().unwrap();
        assert_eq!(list1.marker, "-");
        assert!(!list1.is_ordered);
        assert_eq!(list1.marker_column, 0);
        assert_eq!(list1.content_column, 2);

        // Line 2: "  * Nested item"
        let line2 = &ctx.lines[1];
        assert!(line2.list_item.is_some());
        let list2 = line2.list_item.as_ref().unwrap();
        assert_eq!(list2.marker, "*");
        assert_eq!(list2.marker_column, 2);

        // Line 3: "1. Ordered item"
        let line3 = &ctx.lines[2];
        assert!(line3.list_item.is_some());
        let list3 = line3.list_item.as_ref().unwrap();
        assert_eq!(list3.marker, "1.");
        assert!(list3.is_ordered);
        assert_eq!(list3.number, Some(1));

        // Line 6: "Not a list"
        let line6 = &ctx.lines[5];
        assert!(line6.list_item.is_none());
    }

    // Offsets on and just past each single-character line, including the
    // offset one past the end of the content.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let content = "a\nb\nc";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        // line_offsets: [0, 2, 4]
        assert_eq!(ctx.offset_to_line_col(0), (1, 1)); // 'a'
        assert_eq!(ctx.offset_to_line_col(1), (1, 2)); // after 'a'
        assert_eq!(ctx.offset_to_line_col(2), (2, 1)); // 'b'
        assert_eq!(ctx.offset_to_line_col(3), (2, 2)); // after 'b'
        assert_eq!(ctx.offset_to_line_col(4), (3, 1)); // 'c'
        assert_eq!(ctx.offset_to_line_col(5), (3, 2)); // after 'c'
    }

    // In the MDX flavor, leading `import`/`export` lines are flagged as ESM
    // blocks while the Markdown that follows is not.
    #[test]
    fn test_mdx_esm_blocks() {
        let content = r##"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall

In {year}, the snowfall was above average.
It was followed by a warm spring which caused
flood conditions in many of the nearby rivers.

<Chart color="#fcb32c" year={year} />
"##;

        let ctx = LintContext::new(content, MarkdownFlavor::MDX);

        // Check that lines 1 and 2 are marked as ESM blocks
        assert_eq!(ctx.lines.len(), 10);
        assert!(ctx.lines[0].in_esm_block, "Line 1 (import) should be in_esm_block");
        assert!(ctx.lines[1].in_esm_block, "Line 2 (export) should be in_esm_block");
        assert!(!ctx.lines[2].in_esm_block, "Line 3 (blank) should NOT be in_esm_block");
        assert!(
            !ctx.lines[3].in_esm_block,
            "Line 4 (heading) should NOT be in_esm_block"
        );
        assert!(!ctx.lines[4].in_esm_block, "Line 5 (blank) should NOT be in_esm_block");
        assert!(!ctx.lines[5].in_esm_block, "Line 6 (text) should NOT be in_esm_block");
    }

    // The same `import`/`export` lines are ordinary text in the Standard flavor.
    #[test]
    fn test_mdx_esm_blocks_not_detected_in_standard_flavor() {
        let content = r#"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall
"#;

        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        // ESM blocks should NOT be detected in Standard flavor
        assert!(
            !ctx.lines[0].in_esm_block,
            "Line 1 should NOT be in_esm_block in Standard flavor"
        );
        assert!(
            !ctx.lines[1].in_esm_block,
            "Line 2 should NOT be in_esm_block in Standard flavor"
        );
    }
}
rumdl_lib/lint_context.rs

rumdl_lib/
lint_context.rs