rumdl_lib/
lint_context.rs

1use crate::config::MarkdownFlavor;
2use crate::rules::front_matter_utils::FrontMatterUtils;
3use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
4use pulldown_cmark::{BrokenLink, Event, LinkType, Parser, Tag, TagEnd};
5use regex::Regex;
6use std::sync::LazyLock;
7
8// Comprehensive link pattern that captures both inline and reference links
9// Use (?s) flag to make . match newlines
10static LINK_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
11    Regex::new(
12        r#"(?sx)
13        \[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]          # Link text in group 1 (handles nested brackets)
14        (?:
15            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
16            |
17            \[([^\]]*)\]      # Reference ID in group 6
18        )"#
19    ).unwrap()
20});
21
22// Image pattern (similar to links but with ! prefix)
23// Use (?s) flag to make . match newlines
24static IMAGE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
25    Regex::new(
26        r#"(?sx)
27        !\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]         # Alt text in group 1 (handles nested brackets)
28        (?:
29            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
30            |
31            \[([^\]]*)\]      # Reference ID in group 6
32        )"#
33    ).unwrap()
34});
35
36// Reference definition pattern
37static REF_DEF_PATTERN: LazyLock<Regex> =
38    LazyLock::new(|| Regex::new(r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#).unwrap());
39
40// Pattern for bare URLs
41static BARE_URL_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
42    Regex::new(
43        r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
44    ).unwrap()
45});
46
47// Pattern for email addresses
48static BARE_EMAIL_PATTERN: LazyLock<Regex> =
49    LazyLock::new(|| Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap());
50
51// Pattern for blockquote prefix in parse_list_blocks
52static BLOCKQUOTE_PREFIX_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*>+\s*)").unwrap());
53
/// Pre-computed information about a single line of the document.
///
/// Instances are stored in `LintContext::lines`; `LintContext::line_info` looks one up
/// by its 1-indexed line number.
#[derive(Debug, Clone)]
pub struct LineInfo {
    /// The actual line content (without newline)
    pub content: String,
    /// Byte offset where this line starts in the document
    pub byte_offset: usize,
    /// Number of leading spaces/tabs
    pub indent: usize,
    /// Whether the line is blank (empty or only whitespace)
    pub is_blank: bool,
    /// Whether this line is inside a code block
    pub in_code_block: bool,
    /// Whether this line is inside front matter
    pub in_front_matter: bool,
    /// Whether this line is inside an HTML block
    pub in_html_block: bool,
    /// Whether this line is inside an HTML comment
    pub in_html_comment: bool,
    /// List item information if this line starts a list item (`None` otherwise)
    pub list_item: Option<ListItemInfo>,
    /// Heading information if this line is a heading (`None` otherwise)
    pub heading: Option<HeadingInfo>,
    /// Blockquote information if this line is a blockquote (`None` otherwise)
    pub blockquote: Option<BlockquoteInfo>,
    /// Whether this line is inside a mkdocstrings autodoc block
    pub in_mkdocstrings: bool,
    /// Whether this line is part of an ESM import/export block (MDX only)
    pub in_esm_block: bool,
}
84
/// Information about a list item.
///
/// Attached to a line through `LineInfo::list_item`.
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// The marker used (*, -, +, or number with . or ))
    pub marker: String,
    /// Whether it's ordered (true) or unordered (false)
    pub is_ordered: bool,
    /// The number for ordered lists
    pub number: Option<usize>,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where content after marker starts
    pub content_column: usize,
}
99
/// Heading style type.
///
/// The enum has no payload, so equality is total; it derives `Eq` alongside
/// `PartialEq` so values can be used wherever an `Eq` bound is required.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum HeadingStyle {
    /// ATX style heading (# Heading)
    ATX,
    /// Setext style heading with = underline
    Setext1,
    /// Setext style heading with - underline
    Setext2,
}
110
/// Parsed link information.
///
/// Collected in `LintContext::links`; `LintContext::links_on_line` filters them by `line`.
#[derive(Debug, Clone)]
pub struct ParsedLink {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Link text
    pub text: String,
    /// Link URL or reference
    pub url: String,
    /// Whether this is a reference link [text][ref] vs inline [text](url)
    pub is_reference: bool,
    /// Reference ID for reference links
    pub reference_id: Option<String>,
}
133
/// Information about a broken link reported by pulldown-cmark.
///
/// Collected in `LintContext::broken_links`.
#[derive(Debug, Clone)]
pub struct BrokenLinkInfo {
    /// The reference text that couldn't be resolved
    pub reference: String,
    /// Byte span in the source document
    pub span: std::ops::Range<usize>,
}
142
/// Parsed image information.
///
/// Collected in `LintContext::images`; `LintContext::images_on_line` filters them by `line`.
#[derive(Debug, Clone)]
pub struct ParsedImage {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Alt text
    pub alt_text: String,
    /// Image URL or reference
    pub url: String,
    /// Whether this is a reference image ![alt][ref] vs inline ![alt](url)
    pub is_reference: bool,
    /// Reference ID for reference images
    pub reference_id: Option<String>,
}
165
/// Reference definition [ref]: url "title"
///
/// Collected in `LintContext::reference_defs`.
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// Line number (1-indexed)
    pub line: usize,
    /// Reference ID (normalized to lowercase); `LintContext::get_reference_url`
    /// lowercases its argument before comparing against this field
    pub id: String,
    /// URL
    pub url: String,
    /// Optional title
    pub title: Option<String>,
    /// Byte offset where the reference definition starts
    pub byte_offset: usize,
    /// Byte offset where the reference definition ends
    /// (`LintContext::is_in_reference_def` treats it as exclusive)
    pub byte_end: usize,
}
182
/// Parsed code span information.
///
/// Returned (behind an `Arc`) by `LintContext::code_spans`.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    /// (`LintContext::is_in_code_span` treats it as exclusive)
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    /// (`LintContext::is_in_code_block_or_span` treats it as exclusive)
    pub byte_end: usize,
    /// Number of backticks used (1, 2, 3, etc.)
    pub backtick_count: usize,
    /// Content inside the code span (without backticks)
    pub content: String,
}
201
/// Information about a heading.
///
/// Attached to a line through `LineInfo::heading`.
#[derive(Debug, Clone)]
pub struct HeadingInfo {
    /// Heading level (1-6 for ATX, 1-2 for Setext)
    pub level: u8,
    /// Style of heading
    pub style: HeadingStyle,
    /// The heading marker (# characters or underline)
    pub marker: String,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where heading text starts
    pub content_column: usize,
    /// The heading text (without markers and without custom ID syntax)
    pub text: String,
    /// Custom header ID if present (e.g., from {#custom-id} syntax)
    pub custom_id: Option<String>,
    /// Original heading text including custom ID syntax
    pub raw_text: String,
    /// Whether it has a closing sequence (for ATX)
    pub has_closing_sequence: bool,
    /// The closing sequence if present
    pub closing_sequence: String,
}
226
/// Information about a blockquote line.
///
/// Attached to a line through `LineInfo::blockquote`.
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Nesting level (1 for >, 2 for >>, etc.)
    pub nesting_level: usize,
    /// The indentation before the blockquote marker
    pub indent: String,
    /// Column where the first > starts (0-based)
    pub marker_column: usize,
    /// The blockquote prefix (e.g., "> ", ">> ", etc.)
    pub prefix: String,
    /// Content after the blockquote marker(s)
    pub content: String,
    /// Whether the line has no space after the marker
    pub has_no_space_after_marker: bool,
    /// Whether the line has multiple spaces after the marker
    pub has_multiple_spaces_after_marker: bool,
    /// Whether this is an empty blockquote line needing MD028 fix
    pub needs_md028_fix: bool,
}
247
/// Information about a list block.
///
/// Collected in `LintContext::list_blocks`. `start_line` and `end_line` are both
/// inclusive: `LintContext::is_in_list_block` and `list_block_for_line` test
/// `start_line <= line <= end_line`.
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// Line number where the list starts (1-indexed)
    pub start_line: usize,
    /// Line number where the list ends (1-indexed)
    pub end_line: usize,
    /// Whether it's ordered or unordered
    pub is_ordered: bool,
    /// The consistent marker for unordered lists (if any)
    pub marker: Option<String>,
    /// Blockquote prefix for this list (empty if not in blockquote)
    pub blockquote_prefix: String,
    /// Lines that are list items within this block
    pub item_lines: Vec<usize>,
    /// Nesting level (0 for top-level lists)
    pub nesting_level: usize,
    /// Maximum marker width seen in this block (e.g., 3 for "1. ", 4 for "10. ")
    pub max_marker_width: usize,
}
268
269use std::sync::{Arc, Mutex};
270
/// Character frequency data for fast content analysis.
///
/// Stored in `LintContext::char_frequency`. These counters back
/// `LintContext::has_char`, `LintContext::char_count` and the `likely_has_*`
/// helpers; characters without a counter here are looked up by scanning the content.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Count of # characters (headings)
    pub hash_count: usize,
    /// Count of * characters (emphasis, lists, horizontal rules)
    pub asterisk_count: usize,
    /// Count of _ characters (emphasis, horizontal rules)
    pub underscore_count: usize,
    /// Count of - characters (lists, horizontal rules, setext headings)
    pub hyphen_count: usize,
    /// Count of + characters (lists)
    pub plus_count: usize,
    /// Count of > characters (blockquotes)
    pub gt_count: usize,
    /// Count of | characters (tables)
    pub pipe_count: usize,
    /// Count of [ characters (links, images)
    pub bracket_count: usize,
    /// Count of ` characters (code spans, code blocks)
    pub backtick_count: usize,
    /// Count of < characters (HTML tags, autolinks)
    pub lt_count: usize,
    /// Count of ! characters (images)
    pub exclamation_count: usize,
    /// Count of newline characters
    pub newline_count: usize,
}
299
/// Pre-parsed HTML tag information.
///
/// Returned (behind an `Arc`) by `LintContext::html_tags`.
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Tag name (e.g., "div", "img", "br")
    pub tag_name: String,
    /// Whether it's a closing tag (`</tag>`)
    pub is_closing: bool,
    /// Whether it's self-closing (`<tag />`)
    pub is_self_closing: bool,
    /// Raw tag content
    pub raw_content: String,
}
322
/// Pre-parsed emphasis span information.
///
/// Returned (behind an `Arc`) by `LintContext::emphasis_spans`.
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Type of emphasis ('*' or '_')
    pub marker: char,
    /// Number of markers (1 for italic, 2 for bold, 3+ for bold+italic)
    pub marker_count: usize,
    /// Content inside the emphasis
    pub content: String,
}
343
/// Pre-parsed table row information.
///
/// Returned (behind an `Arc`) by `LintContext::table_rows`.
#[derive(Debug, Clone)]
pub struct TableRow {
    /// Line number (1-indexed)
    pub line: usize,
    /// Whether this is a separator row (contains only |, -, :, and spaces)
    pub is_separator: bool,
    /// Number of columns (pipe-separated cells)
    pub column_count: usize,
    /// Alignment info from separator row, one string per column
    pub column_alignments: Vec<String>, // "left", "center", "right", "none"
}
356
/// Pre-parsed bare URL information (not in links).
///
/// Returned (behind an `Arc`) by `LintContext::bare_urls`.
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// The URL string
    pub url: String,
    /// Type of URL ("http", "https", "ftp", "email")
    pub url_type: String,
}
375
/// Pre-computed view of one Markdown document, built by `LintContext::new`.
///
/// Public fields and `html_comment_ranges` / `jinja_ranges` are filled eagerly by `new`.
/// The `*_cache` fields hold `Arc`-shared results behind a `Mutex` and are read through
/// the accessor methods of the same name (`html_tags()`, `emphasis_spans()`, ...), which
/// fill an empty cache on first use.
pub struct LintContext<'a> {
    pub content: &'a str,
    pub line_offsets: Vec<usize>,
    pub code_blocks: Vec<(usize, usize)>, // Cached code block ranges (not including inline code spans)
    pub lines: Vec<LineInfo>,             // Pre-computed line information
    pub links: Vec<ParsedLink>,           // Pre-parsed links
    pub images: Vec<ParsedImage>,         // Pre-parsed images
    pub broken_links: Vec<BrokenLinkInfo>, // Broken/undefined references
    pub reference_defs: Vec<ReferenceDef>, // Reference definitions
    code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>, // Inline code spans; seeded by `new`, read via `code_spans()`
    pub list_blocks: Vec<ListBlock>,      // Pre-parsed list blocks
    pub char_frequency: CharFrequency,    // Character frequency analysis
    html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>, // Lazy-loaded HTML tags
    emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>, // Lazy-loaded emphasis spans
    table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>, // Lazy-loaded table rows
    bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>, // Lazy-loaded bare URLs
    html_comment_ranges: Vec<crate::utils::skip_context::ByteRange>, // Pre-computed HTML comment ranges
    pub table_blocks: Vec<crate::utils::table_utils::TableBlock>, // Pre-computed table blocks
    pub line_index: crate::utils::range_utils::LineIndex, // Pre-computed line index for byte position calculations
    jinja_ranges: Vec<(usize, usize)>,    // Pre-computed Jinja template ranges ({{ }}, {% %})
    pub flavor: MarkdownFlavor,           // Markdown flavor being used
}
398
/// The pieces of a blockquote line, each borrowed from the original line.
struct BlockquoteComponents<'a> {
    /// Spaces/tabs before the first `>`.
    indent: &'a str,
    /// The run of consecutive `>` characters.
    markers: &'a str,
    /// Spaces/tabs directly after the markers.
    spaces_after: &'a str,
    /// Everything that follows `spaces_after`.
    content: &'a str,
}

/// Splits a line into indent, `>` markers, trailing spacing and content.
///
/// Returns `None` when the first character after the leading spaces/tabs is not `>`.
/// Only a single contiguous run of `>` is treated as the marker; anything after the
/// spacing that follows it (including further `>` characters) ends up in `content`.
#[inline]
fn parse_blockquote_detailed(line: &str) -> Option<BlockquoteComponents<'_>> {
    let is_space_or_tab = |c: char| c == ' ' || c == '\t';

    // Peel the leading indentation off the front of the line.
    let after_indent = line.trim_start_matches(is_space_or_tab);
    let indent = &line[..line.len() - after_indent.len()];

    // At least one '>' must follow the indentation.
    let after_markers = after_indent.trim_start_matches('>');
    let marker_len = after_indent.len() - after_markers.len();
    if marker_len == 0 {
        return None;
    }
    let markers = &after_indent[..marker_len];

    // Whatever spacing follows the markers separates them from the content.
    let content = after_markers.trim_start_matches(is_space_or_tab);
    let spaces_after = &after_markers[..after_markers.len() - content.len()];

    Some(BlockquoteComponents {
        indent,
        markers,
        spaces_after,
        content,
    })
}
443
444impl<'a> LintContext<'a> {
445    pub fn new(content: &'a str, flavor: MarkdownFlavor) -> Self {
446        use std::time::Instant;
447        let profile = std::env::var("RUMDL_PROFILE_QUADRATIC").is_ok();
448
449        let start = Instant::now();
450        let mut line_offsets = vec![0];
451        for (i, c) in content.char_indices() {
452            if c == '\n' {
453                line_offsets.push(i + 1);
454            }
455        }
456        if profile {
457            eprintln!("[PROFILE] Line offsets: {:?}", start.elapsed());
458        }
459
460        // Detect code blocks once and cache them
461        let start = Instant::now();
462        let code_blocks = CodeBlockUtils::detect_code_blocks(content);
463        if profile {
464            eprintln!("[PROFILE] Code blocks: {:?}", start.elapsed());
465        }
466
467        // Pre-compute HTML comment ranges ONCE for all operations
468        let start = Instant::now();
469        let html_comment_ranges = crate::utils::skip_context::compute_html_comment_ranges(content);
470        if profile {
471            eprintln!("[PROFILE] HTML comment ranges: {:?}", start.elapsed());
472        }
473
474        // Pre-compute autodoc block ranges for MkDocs flavor (avoids O(n²) scaling)
475        let start = Instant::now();
476        let autodoc_ranges = if flavor == MarkdownFlavor::MkDocs {
477            crate::utils::mkdocstrings_refs::detect_autodoc_block_ranges(content)
478        } else {
479            Vec::new()
480        };
481        if profile {
482            eprintln!("[PROFILE] Autodoc block ranges: {:?}", start.elapsed());
483        }
484
485        // Pre-compute line information (without headings/blockquotes yet)
486        let start = Instant::now();
487        let mut lines = Self::compute_basic_line_info(
488            content,
489            &line_offsets,
490            &code_blocks,
491            flavor,
492            &html_comment_ranges,
493            &autodoc_ranges,
494        );
495        if profile {
496            eprintln!("[PROFILE] Basic line info: {:?}", start.elapsed());
497        }
498
499        // Detect HTML blocks BEFORE heading detection
500        let start = Instant::now();
501        Self::detect_html_blocks(&mut lines);
502        if profile {
503            eprintln!("[PROFILE] HTML blocks: {:?}", start.elapsed());
504        }
505
506        // Detect ESM import/export blocks in MDX files BEFORE heading detection
507        let start = Instant::now();
508        Self::detect_esm_blocks(&mut lines, flavor);
509        if profile {
510            eprintln!("[PROFILE] ESM blocks: {:?}", start.elapsed());
511        }
512
513        // Now detect headings and blockquotes
514        let start = Instant::now();
515        Self::detect_headings_and_blockquotes(content, &mut lines, flavor, &html_comment_ranges);
516        if profile {
517            eprintln!("[PROFILE] Headings & blockquotes: {:?}", start.elapsed());
518        }
519
520        // Parse code spans early so we can exclude them from link/image parsing
521        let start = Instant::now();
522        let code_spans = Self::parse_code_spans(content, &lines);
523        if profile {
524            eprintln!("[PROFILE] Code spans: {:?}", start.elapsed());
525        }
526
527        // Parse links, images, references, and list blocks
528        let start = Instant::now();
529        let (links, broken_links) =
530            Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor, &html_comment_ranges);
531        if profile {
532            eprintln!("[PROFILE] Links: {:?}", start.elapsed());
533        }
534
535        let start = Instant::now();
536        let images = Self::parse_images(content, &lines, &code_blocks, &code_spans, &html_comment_ranges);
537        if profile {
538            eprintln!("[PROFILE] Images: {:?}", start.elapsed());
539        }
540
541        let start = Instant::now();
542        let reference_defs = Self::parse_reference_defs(content, &lines);
543        if profile {
544            eprintln!("[PROFILE] Reference defs: {:?}", start.elapsed());
545        }
546
547        let start = Instant::now();
548        let list_blocks = Self::parse_list_blocks(&lines);
549        if profile {
550            eprintln!("[PROFILE] List blocks: {:?}", start.elapsed());
551        }
552
553        // Compute character frequency for fast content analysis
554        let start = Instant::now();
555        let char_frequency = Self::compute_char_frequency(content);
556        if profile {
557            eprintln!("[PROFILE] Char frequency: {:?}", start.elapsed());
558        }
559
560        // Pre-compute table blocks for rules that need them (MD013, MD055, MD056, MD058, MD060)
561        let start = Instant::now();
562        let table_blocks = crate::utils::table_utils::TableUtils::find_table_blocks_with_code_info(
563            content,
564            &code_blocks,
565            &code_spans,
566            &html_comment_ranges,
567        );
568        if profile {
569            eprintln!("[PROFILE] Table blocks: {:?}", start.elapsed());
570        }
571
572        // Pre-compute LineIndex once for all rules (eliminates 46x content cloning)
573        let start = Instant::now();
574        let line_index = crate::utils::range_utils::LineIndex::new(content.to_string());
575        if profile {
576            eprintln!("[PROFILE] Line index: {:?}", start.elapsed());
577        }
578
579        // Pre-compute Jinja template ranges once for all rules (eliminates O(n×m) in MD011)
580        let start = Instant::now();
581        let jinja_ranges = crate::utils::jinja_utils::find_jinja_ranges(content);
582        if profile {
583            eprintln!("[PROFILE] Jinja ranges: {:?}", start.elapsed());
584        }
585
586        Self {
587            content,
588            line_offsets,
589            code_blocks,
590            lines,
591            links,
592            images,
593            broken_links,
594            reference_defs,
595            code_spans_cache: Mutex::new(Some(Arc::new(code_spans))),
596            list_blocks,
597            char_frequency,
598            html_tags_cache: Mutex::new(None),
599            emphasis_spans_cache: Mutex::new(None),
600            table_rows_cache: Mutex::new(None),
601            bare_urls_cache: Mutex::new(None),
602            html_comment_ranges,
603            table_blocks,
604            line_index,
605            jinja_ranges,
606            flavor,
607        }
608    }
609
610    /// Get code spans - computed lazily on first access
611    pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
612        let mut cache = self.code_spans_cache.lock().expect("Code spans cache mutex poisoned");
613
614        Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_code_spans(self.content, &self.lines))))
615    }
616
617    /// Get HTML comment ranges - pre-computed during LintContext construction
618    pub fn html_comment_ranges(&self) -> &[crate::utils::skip_context::ByteRange] {
619        &self.html_comment_ranges
620    }
621
622    /// Get HTML tags - computed lazily on first access
623    pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
624        let mut cache = self.html_tags_cache.lock().expect("HTML tags cache mutex poisoned");
625
626        Arc::clone(cache.get_or_insert_with(|| {
627            Arc::new(Self::parse_html_tags(
628                self.content,
629                &self.lines,
630                &self.code_blocks,
631                self.flavor,
632            ))
633        }))
634    }
635
636    /// Get emphasis spans - computed lazily on first access
637    pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
638        let mut cache = self
639            .emphasis_spans_cache
640            .lock()
641            .expect("Emphasis spans cache mutex poisoned");
642
643        Arc::clone(
644            cache.get_or_insert_with(|| {
645                Arc::new(Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks))
646            }),
647        )
648    }
649
650    /// Get table rows - computed lazily on first access
651    pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
652        let mut cache = self.table_rows_cache.lock().expect("Table rows cache mutex poisoned");
653
654        Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_table_rows(&self.lines))))
655    }
656
657    /// Get bare URLs - computed lazily on first access
658    pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
659        let mut cache = self.bare_urls_cache.lock().expect("Bare URLs cache mutex poisoned");
660
661        Arc::clone(
662            cache.get_or_insert_with(|| Arc::new(Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks))),
663        )
664    }
665
666    /// Map a byte offset to (line, column)
667    pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
668        match self.line_offsets.binary_search(&offset) {
669            Ok(line) => (line + 1, 1),
670            Err(line) => {
671                let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
672                (line, offset - line_start + 1)
673            }
674        }
675    }
676
677    /// Check if a position is within a code block or code span
678    pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
679        // Check code blocks first
680        if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
681            return true;
682        }
683
684        // Check inline code spans (lazy load if needed)
685        self.code_spans()
686            .iter()
687            .any(|span| pos >= span.byte_offset && pos < span.byte_end)
688    }
689
690    /// Get line information by line number (1-indexed)
691    pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
692        if line_num > 0 {
693            self.lines.get(line_num - 1)
694        } else {
695            None
696        }
697    }
698
699    /// Get byte offset for a line number (1-indexed)
700    pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
701        self.line_info(line_num).map(|info| info.byte_offset)
702    }
703
704    /// Get URL for a reference link/image by its ID
705    pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
706        let normalized_id = ref_id.to_lowercase();
707        self.reference_defs
708            .iter()
709            .find(|def| def.id == normalized_id)
710            .map(|def| def.url.as_str())
711    }
712
713    /// Get links on a specific line
714    pub fn links_on_line(&self, line_num: usize) -> Vec<&ParsedLink> {
715        self.links.iter().filter(|link| link.line == line_num).collect()
716    }
717
718    /// Get images on a specific line
719    pub fn images_on_line(&self, line_num: usize) -> Vec<&ParsedImage> {
720        self.images.iter().filter(|img| img.line == line_num).collect()
721    }
722
723    /// Check if a line is part of a list block
724    pub fn is_in_list_block(&self, line_num: usize) -> bool {
725        self.list_blocks
726            .iter()
727            .any(|block| line_num >= block.start_line && line_num <= block.end_line)
728    }
729
730    /// Get the list block containing a specific line
731    pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
732        self.list_blocks
733            .iter()
734            .find(|block| line_num >= block.start_line && line_num <= block.end_line)
735    }
736
737    // Compatibility methods for DocumentStructure migration
738
739    /// Check if a line is within a code block
740    pub fn is_in_code_block(&self, line_num: usize) -> bool {
741        if line_num == 0 || line_num > self.lines.len() {
742            return false;
743        }
744        self.lines[line_num - 1].in_code_block
745    }
746
747    /// Check if a line is within front matter
748    pub fn is_in_front_matter(&self, line_num: usize) -> bool {
749        if line_num == 0 || line_num > self.lines.len() {
750            return false;
751        }
752        self.lines[line_num - 1].in_front_matter
753    }
754
755    /// Check if a line is within an HTML block
756    pub fn is_in_html_block(&self, line_num: usize) -> bool {
757        if line_num == 0 || line_num > self.lines.len() {
758            return false;
759        }
760        self.lines[line_num - 1].in_html_block
761    }
762
763    /// Check if a line and column is within a code span
764    pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
765        if line_num == 0 || line_num > self.lines.len() {
766            return false;
767        }
768
769        // Use the code spans cache to check
770        // Note: col is 1-indexed from caller, but span.start_col and span.end_col are 0-indexed
771        // Convert col to 0-indexed for comparison
772        let col_0indexed = if col > 0 { col - 1 } else { 0 };
773        let code_spans = self.code_spans();
774        code_spans
775            .iter()
776            .any(|span| span.line == line_num && col_0indexed >= span.start_col && col_0indexed < span.end_col)
777    }
778
779    /// Check if a byte position is within a reference definition
780    /// This is much faster than scanning the content with regex for each check (O(1) vs O(n))
781    #[inline]
782    pub fn is_in_reference_def(&self, byte_pos: usize) -> bool {
783        self.reference_defs
784            .iter()
785            .any(|ref_def| byte_pos >= ref_def.byte_offset && byte_pos < ref_def.byte_end)
786    }
787
788    /// Check if a byte position is within an HTML comment
789    /// This is much faster than scanning the content with regex for each check (O(k) vs O(n))
790    /// where k is the number of HTML comments (typically very small)
791    #[inline]
792    pub fn is_in_html_comment(&self, byte_pos: usize) -> bool {
793        self.html_comment_ranges
794            .iter()
795            .any(|range| byte_pos >= range.start && byte_pos < range.end)
796    }
797
798    /// Check if a byte position is within a Jinja template ({{ }} or {% %})
799    pub fn is_in_jinja_range(&self, byte_pos: usize) -> bool {
800        self.jinja_ranges
801            .iter()
802            .any(|(start, end)| byte_pos >= *start && byte_pos < *end)
803    }
804
805    /// Check if content has any instances of a specific character (fast)
806    pub fn has_char(&self, ch: char) -> bool {
807        match ch {
808            '#' => self.char_frequency.hash_count > 0,
809            '*' => self.char_frequency.asterisk_count > 0,
810            '_' => self.char_frequency.underscore_count > 0,
811            '-' => self.char_frequency.hyphen_count > 0,
812            '+' => self.char_frequency.plus_count > 0,
813            '>' => self.char_frequency.gt_count > 0,
814            '|' => self.char_frequency.pipe_count > 0,
815            '[' => self.char_frequency.bracket_count > 0,
816            '`' => self.char_frequency.backtick_count > 0,
817            '<' => self.char_frequency.lt_count > 0,
818            '!' => self.char_frequency.exclamation_count > 0,
819            '\n' => self.char_frequency.newline_count > 0,
820            _ => self.content.contains(ch), // Fallback for other characters
821        }
822    }
823
824    /// Get count of a specific character (fast)
825    pub fn char_count(&self, ch: char) -> usize {
826        match ch {
827            '#' => self.char_frequency.hash_count,
828            '*' => self.char_frequency.asterisk_count,
829            '_' => self.char_frequency.underscore_count,
830            '-' => self.char_frequency.hyphen_count,
831            '+' => self.char_frequency.plus_count,
832            '>' => self.char_frequency.gt_count,
833            '|' => self.char_frequency.pipe_count,
834            '[' => self.char_frequency.bracket_count,
835            '`' => self.char_frequency.backtick_count,
836            '<' => self.char_frequency.lt_count,
837            '!' => self.char_frequency.exclamation_count,
838            '\n' => self.char_frequency.newline_count,
839            _ => self.content.matches(ch).count(), // Fallback for other characters
840        }
841    }
842
843    /// Check if content likely contains headings (fast)
844    pub fn likely_has_headings(&self) -> bool {
845        self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 // Potential setext underlines
846    }
847
848    /// Check if content likely contains lists (fast)
849    pub fn likely_has_lists(&self) -> bool {
850        self.char_frequency.asterisk_count > 0
851            || self.char_frequency.hyphen_count > 0
852            || self.char_frequency.plus_count > 0
853    }
854
855    /// Check if content likely contains emphasis (fast)
856    pub fn likely_has_emphasis(&self) -> bool {
857        self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
858    }
859
860    /// Check if content likely contains tables (fast)
861    pub fn likely_has_tables(&self) -> bool {
862        self.char_frequency.pipe_count > 2
863    }
864
865    /// Check if content likely contains blockquotes (fast)
866    pub fn likely_has_blockquotes(&self) -> bool {
867        self.char_frequency.gt_count > 0
868    }
869
870    /// Check if content likely contains code (fast)
871    pub fn likely_has_code(&self) -> bool {
872        self.char_frequency.backtick_count > 0
873    }
874
875    /// Check if content likely contains links or images (fast)
876    pub fn likely_has_links_or_images(&self) -> bool {
877        self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
878    }
879
880    /// Check if content likely contains HTML (fast)
881    pub fn likely_has_html(&self) -> bool {
882        self.char_frequency.lt_count > 0
883    }
884
885    /// Get HTML tags on a specific line
886    pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
887        self.html_tags()
888            .iter()
889            .filter(|tag| tag.line == line_num)
890            .cloned()
891            .collect()
892    }
893
894    /// Get emphasis spans on a specific line
895    pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
896        self.emphasis_spans()
897            .iter()
898            .filter(|span| span.line == line_num)
899            .cloned()
900            .collect()
901    }
902
903    /// Get table rows on a specific line
904    pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
905        self.table_rows()
906            .iter()
907            .filter(|row| row.line == line_num)
908            .cloned()
909            .collect()
910    }
911
912    /// Get bare URLs on a specific line
913    pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
914        self.bare_urls()
915            .iter()
916            .filter(|url| url.line == line_num)
917            .cloned()
918            .collect()
919    }
920
921    /// Find the line index for a given byte offset using binary search.
922    /// Returns (line_index, line_number, column) where:
923    /// - line_index is the 0-based index in the lines array
924    /// - line_number is the 1-based line number
925    /// - column is the byte offset within that line
926    #[inline]
927    fn find_line_for_offset(lines: &[LineInfo], byte_offset: usize) -> (usize, usize, usize) {
928        // Binary search to find the line containing this byte offset
929        let idx = match lines.binary_search_by(|line| {
930            if byte_offset < line.byte_offset {
931                std::cmp::Ordering::Greater
932            } else if byte_offset > line.byte_offset + line.content.len() {
933                std::cmp::Ordering::Less
934            } else {
935                std::cmp::Ordering::Equal
936            }
937        }) {
938            Ok(idx) => idx,
939            Err(idx) => idx.saturating_sub(1),
940        };
941
942        let line = &lines[idx];
943        let line_num = idx + 1;
944        let col = byte_offset.saturating_sub(line.byte_offset);
945
946        (idx, line_num, col)
947    }
948
949    /// Check if a byte offset is within a code span using binary search
950    #[inline]
951    fn is_offset_in_code_span(code_spans: &[CodeSpan], offset: usize) -> bool {
952        // Since spans are sorted by byte_offset, use partition_point for binary search
953        let idx = code_spans.partition_point(|span| span.byte_offset <= offset);
954
955        // Check the span that starts at or before our offset
956        if idx > 0 {
957            let span = &code_spans[idx - 1];
958            if offset >= span.byte_offset && offset < span.byte_end {
959                return true;
960            }
961        }
962
963        false
964    }
965
966    /// Parse all links in the content
967    fn parse_links(
968        content: &str,
969        lines: &[LineInfo],
970        code_blocks: &[(usize, usize)],
971        code_spans: &[CodeSpan],
972        flavor: MarkdownFlavor,
973        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
974    ) -> (Vec<ParsedLink>, Vec<BrokenLinkInfo>) {
975        use crate::utils::skip_context::{is_in_html_comment_ranges, is_mkdocs_snippet_line};
976        use std::collections::HashSet;
977
978        let mut links = Vec::with_capacity(content.len() / 500);
979        let mut broken_links = Vec::new();
980
981        // Track byte positions of links found by pulldown-cmark
982        let mut found_positions = HashSet::new();
983
984        // Use pulldown-cmark's streaming parser with BrokenLink callback
985        // The callback captures undefined references: [text][undefined], [shortcut], [text][]
986        // This automatically handles:
987        // - Escaped links (won't generate events)
988        // - Links in code blocks/spans (won't generate Link events)
989        // - Images (generates Tag::Image instead)
990        // - Reference resolution (dest_url is already resolved!)
991        // - Broken references (callback is invoked)
992        let parser = Parser::new_with_broken_link_callback(
993            content,
994            pulldown_cmark::Options::empty(),
995            Some(|link: BrokenLink<'_>| {
996                broken_links.push(BrokenLinkInfo {
997                    reference: link.reference.to_string(),
998                    span: link.span.clone(),
999                });
1000                None
1001            }),
1002        )
1003        .into_offset_iter();
1004
1005        let mut link_stack: Vec<(usize, usize, String, LinkType, String)> = Vec::new();
1006        let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1007
1008        for (event, range) in parser {
1009            match event {
1010                Event::Start(Tag::Link {
1011                    link_type,
1012                    dest_url,
1013                    id,
1014                    ..
1015                }) => {
1016                    // Link start - record position, URL, and reference ID
1017                    link_stack.push((range.start, range.end, dest_url.to_string(), link_type, id.to_string()));
1018                    text_chunks.clear();
1019                }
1020                Event::Text(text) if !link_stack.is_empty() => {
1021                    // Track text content with its byte range
1022                    text_chunks.push((text.to_string(), range.start, range.end));
1023                }
1024                Event::Code(code) if !link_stack.is_empty() => {
1025                    // Include inline code in link text (with backticks)
1026                    let code_text = format!("`{code}`");
1027                    text_chunks.push((code_text, range.start, range.end));
1028                }
1029                Event::End(TagEnd::Link) => {
1030                    if let Some((start_pos, _link_start_end, url, link_type, ref_id)) = link_stack.pop() {
1031                        // Skip if in HTML comment
1032                        if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1033                            text_chunks.clear();
1034                            continue;
1035                        }
1036
1037                        // Find line and column information
1038                        let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1039
1040                        // Skip if this link is on a MkDocs snippet line
1041                        if is_mkdocs_snippet_line(&lines[line_idx].content, flavor) {
1042                            text_chunks.clear();
1043                            continue;
1044                        }
1045
1046                        let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1047
1048                        let is_reference = matches!(
1049                            link_type,
1050                            LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1051                        );
1052
1053                        // Extract link text directly from source bytes to preserve escaping
1054                        // Text events from pulldown-cmark unescape \] → ], which breaks MD039
1055                        let link_text = if start_pos < content.len() {
1056                            let link_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1057
1058                            // Find MATCHING ] by tracking bracket depth for nested brackets
1059                            // An unescaped bracket is one NOT preceded by an odd number of backslashes
1060                            let mut close_pos = None;
1061                            let mut depth = 0;
1062
1063                            for (i, &byte) in link_bytes.iter().enumerate().skip(1) {
1064                                // Count preceding backslashes
1065                                let mut backslash_count = 0;
1066                                let mut j = i;
1067                                while j > 0 && link_bytes[j - 1] == b'\\' {
1068                                    backslash_count += 1;
1069                                    j -= 1;
1070                                }
1071                                let is_escaped = backslash_count % 2 != 0;
1072
1073                                if !is_escaped {
1074                                    if byte == b'[' {
1075                                        depth += 1;
1076                                    } else if byte == b']' {
1077                                        if depth == 0 {
1078                                            // Found the matching closing bracket
1079                                            close_pos = Some(i);
1080                                            break;
1081                                        } else {
1082                                            depth -= 1;
1083                                        }
1084                                    }
1085                                }
1086                            }
1087
1088                            if let Some(pos) = close_pos {
1089                                std::str::from_utf8(&link_bytes[1..pos]).unwrap_or("").to_string()
1090                            } else {
1091                                String::new()
1092                            }
1093                        } else {
1094                            String::new()
1095                        };
1096
1097                        // For reference links, use the actual reference ID from pulldown-cmark
1098                        let reference_id = if is_reference && !ref_id.is_empty() {
1099                            Some(ref_id.to_lowercase())
1100                        } else if is_reference {
1101                            // For collapsed/shortcut references without explicit ID, use the link text
1102                            Some(link_text.to_lowercase())
1103                        } else {
1104                            None
1105                        };
1106
1107                        // WORKAROUND: pulldown-cmark bug with escaped brackets
1108                        // Check for escaped image syntax: \![text](url)
1109                        // The byte_offset points to the '[', so we check 2 bytes back for '\!'
1110                        let has_escaped_bang = start_pos >= 2
1111                            && content.as_bytes().get(start_pos - 2) == Some(&b'\\')
1112                            && content.as_bytes().get(start_pos - 1) == Some(&b'!');
1113
1114                        // Check for escaped bracket: \[text](url)
1115                        // The byte_offset points to the '[', so we check 1 byte back for '\'
1116                        let has_escaped_bracket =
1117                            start_pos >= 1 && content.as_bytes().get(start_pos - 1) == Some(&b'\\');
1118
1119                        if has_escaped_bang || has_escaped_bracket {
1120                            text_chunks.clear();
1121                            continue; // Skip: this is escaped markdown, not a real link
1122                        }
1123
1124                        // Track this position as found
1125                        found_positions.insert(start_pos);
1126
1127                        links.push(ParsedLink {
1128                            line: line_num,
1129                            start_col: col_start,
1130                            end_col: col_end,
1131                            byte_offset: start_pos,
1132                            byte_end: range.end,
1133                            text: link_text,
1134                            url,
1135                            is_reference,
1136                            reference_id,
1137                        });
1138
1139                        text_chunks.clear();
1140                    }
1141                }
1142                _ => {}
1143            }
1144        }
1145
1146        // Also find undefined references using regex
1147        // These are patterns like [text][ref] that pulldown-cmark didn't parse as links
1148        // because the reference is undefined
1149        for cap in LINK_PATTERN.captures_iter(content) {
1150            let full_match = cap.get(0).unwrap();
1151            let match_start = full_match.start();
1152            let match_end = full_match.end();
1153
1154            // Skip if this was already found by pulldown-cmark (it's a valid link)
1155            if found_positions.contains(&match_start) {
1156                continue;
1157            }
1158
1159            // Skip if escaped
1160            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1161                continue;
1162            }
1163
1164            // Skip if it's an image
1165            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
1166                continue;
1167            }
1168
1169            // Skip if in code block
1170            if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
1171                continue;
1172            }
1173
1174            // Skip if in code span
1175            if Self::is_offset_in_code_span(code_spans, match_start) {
1176                continue;
1177            }
1178
1179            // Skip if in HTML comment
1180            if is_in_html_comment_ranges(html_comment_ranges, match_start) {
1181                continue;
1182            }
1183
1184            // Find line and column information
1185            let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1186
1187            // Skip if this link is on a MkDocs snippet line
1188            if is_mkdocs_snippet_line(&lines[line_idx].content, flavor) {
1189                continue;
1190            }
1191
1192            let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1193
1194            let text = cap.get(1).map_or("", |m| m.as_str()).to_string();
1195
1196            // Only process reference links (group 6)
1197            if let Some(ref_id) = cap.get(6) {
1198                let ref_id_str = ref_id.as_str();
1199                let normalized_ref = if ref_id_str.is_empty() {
1200                    text.to_lowercase() // Implicit reference
1201                } else {
1202                    ref_id_str.to_lowercase()
1203                };
1204
1205                // This is an undefined reference (pulldown-cmark didn't parse it)
1206                links.push(ParsedLink {
1207                    line: line_num,
1208                    start_col: col_start,
1209                    end_col: col_end,
1210                    byte_offset: match_start,
1211                    byte_end: match_end,
1212                    text,
1213                    url: String::new(), // Empty URL indicates undefined reference
1214                    is_reference: true,
1215                    reference_id: Some(normalized_ref),
1216                });
1217            }
1218        }
1219
1220        (links, broken_links)
1221    }
1222
1223    /// Parse all images in the content
1224    fn parse_images(
1225        content: &str,
1226        lines: &[LineInfo],
1227        code_blocks: &[(usize, usize)],
1228        code_spans: &[CodeSpan],
1229        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1230    ) -> Vec<ParsedImage> {
1231        use crate::utils::skip_context::is_in_html_comment_ranges;
1232        use std::collections::HashSet;
1233
1234        // Pre-size based on a heuristic: images are less common than links
1235        let mut images = Vec::with_capacity(content.len() / 1000);
1236        let mut found_positions = HashSet::new();
1237
1238        // Use pulldown-cmark for parsing - more accurate and faster
1239        let parser = Parser::new(content).into_offset_iter();
1240        let mut image_stack: Vec<(usize, String, LinkType, String)> = Vec::new();
1241        let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1242
1243        for (event, range) in parser {
1244            match event {
1245                Event::Start(Tag::Image {
1246                    link_type,
1247                    dest_url,
1248                    id,
1249                    ..
1250                }) => {
1251                    image_stack.push((range.start, dest_url.to_string(), link_type, id.to_string()));
1252                    text_chunks.clear();
1253                }
1254                Event::Text(text) if !image_stack.is_empty() => {
1255                    text_chunks.push((text.to_string(), range.start, range.end));
1256                }
1257                Event::Code(code) if !image_stack.is_empty() => {
1258                    let code_text = format!("`{code}`");
1259                    text_chunks.push((code_text, range.start, range.end));
1260                }
1261                Event::End(TagEnd::Image) => {
1262                    if let Some((start_pos, url, link_type, ref_id)) = image_stack.pop() {
1263                        // Skip if in code block
1264                        if CodeBlockUtils::is_in_code_block(code_blocks, start_pos) {
1265                            continue;
1266                        }
1267
1268                        // Skip if in code span
1269                        if Self::is_offset_in_code_span(code_spans, start_pos) {
1270                            continue;
1271                        }
1272
1273                        // Skip if in HTML comment
1274                        if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1275                            continue;
1276                        }
1277
1278                        // Find line and column using binary search
1279                        let (_, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1280                        let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1281
1282                        let is_reference = matches!(
1283                            link_type,
1284                            LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1285                        );
1286
1287                        // Extract alt text directly from source bytes to preserve escaping
1288                        // Text events from pulldown-cmark unescape \] → ], which breaks rules that need escaping
1289                        let alt_text = if start_pos < content.len() {
1290                            let image_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1291
1292                            // Find MATCHING ] by tracking bracket depth for nested brackets
1293                            // An unescaped bracket is one NOT preceded by an odd number of backslashes
1294                            let mut close_pos = None;
1295                            let mut depth = 0;
1296
1297                            if image_bytes.len() > 2 {
1298                                for (i, &byte) in image_bytes.iter().enumerate().skip(2) {
1299                                    // Count preceding backslashes
1300                                    let mut backslash_count = 0;
1301                                    let mut j = i;
1302                                    while j > 0 && image_bytes[j - 1] == b'\\' {
1303                                        backslash_count += 1;
1304                                        j -= 1;
1305                                    }
1306                                    let is_escaped = backslash_count % 2 != 0;
1307
1308                                    if !is_escaped {
1309                                        if byte == b'[' {
1310                                            depth += 1;
1311                                        } else if byte == b']' {
1312                                            if depth == 0 {
1313                                                // Found the matching closing bracket
1314                                                close_pos = Some(i);
1315                                                break;
1316                                            } else {
1317                                                depth -= 1;
1318                                            }
1319                                        }
1320                                    }
1321                                }
1322                            }
1323
1324                            if let Some(pos) = close_pos {
1325                                std::str::from_utf8(&image_bytes[2..pos]).unwrap_or("").to_string()
1326                            } else {
1327                                String::new()
1328                            }
1329                        } else {
1330                            String::new()
1331                        };
1332
1333                        let reference_id = if is_reference && !ref_id.is_empty() {
1334                            Some(ref_id.to_lowercase())
1335                        } else if is_reference {
1336                            Some(alt_text.to_lowercase()) // Collapsed/shortcut references
1337                        } else {
1338                            None
1339                        };
1340
1341                        found_positions.insert(start_pos);
1342                        images.push(ParsedImage {
1343                            line: line_num,
1344                            start_col: col_start,
1345                            end_col: col_end,
1346                            byte_offset: start_pos,
1347                            byte_end: range.end,
1348                            alt_text,
1349                            url,
1350                            is_reference,
1351                            reference_id,
1352                        });
1353                    }
1354                }
1355                _ => {}
1356            }
1357        }
1358
1359        // Regex fallback for undefined references that pulldown-cmark treats as plain text
1360        for cap in IMAGE_PATTERN.captures_iter(content) {
1361            let full_match = cap.get(0).unwrap();
1362            let match_start = full_match.start();
1363            let match_end = full_match.end();
1364
1365            // Skip if already found by pulldown-cmark
1366            if found_positions.contains(&match_start) {
1367                continue;
1368            }
1369
1370            // Skip if the ! is escaped
1371            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1372                continue;
1373            }
1374
1375            // Skip if in code block, code span, or HTML comment
1376            if CodeBlockUtils::is_in_code_block(code_blocks, match_start)
1377                || Self::is_offset_in_code_span(code_spans, match_start)
1378                || is_in_html_comment_ranges(html_comment_ranges, match_start)
1379            {
1380                continue;
1381            }
1382
1383            // Only process reference images (undefined references not found by pulldown-cmark)
1384            if let Some(ref_id) = cap.get(6) {
1385                let (_, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1386                let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1387                let alt_text = cap.get(1).map_or("", |m| m.as_str()).to_string();
1388                let ref_id_str = ref_id.as_str();
1389                let normalized_ref = if ref_id_str.is_empty() {
1390                    alt_text.to_lowercase()
1391                } else {
1392                    ref_id_str.to_lowercase()
1393                };
1394
1395                images.push(ParsedImage {
1396                    line: line_num,
1397                    start_col: col_start,
1398                    end_col: col_end,
1399                    byte_offset: match_start,
1400                    byte_end: match_end,
1401                    alt_text,
1402                    url: String::new(),
1403                    is_reference: true,
1404                    reference_id: Some(normalized_ref),
1405                });
1406            }
1407        }
1408
1409        images
1410    }
1411
1412    /// Parse reference definitions
1413    fn parse_reference_defs(_content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
1414        // Pre-size based on lines count as reference definitions are line-based
1415        let mut refs = Vec::with_capacity(lines.len() / 20); // ~1 ref per 20 lines
1416
1417        for (line_idx, line_info) in lines.iter().enumerate() {
1418            // Skip lines in code blocks
1419            if line_info.in_code_block {
1420                continue;
1421            }
1422
1423            let line = &line_info.content;
1424            let line_num = line_idx + 1;
1425
1426            if let Some(cap) = REF_DEF_PATTERN.captures(line) {
1427                let id = cap.get(1).unwrap().as_str().to_lowercase();
1428                let url = cap.get(2).unwrap().as_str().to_string();
1429                let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
1430
1431                // Calculate byte positions
1432                // The match starts at the beginning of the line (0) and extends to the end
1433                let match_obj = cap.get(0).unwrap();
1434                let byte_offset = line_info.byte_offset + match_obj.start();
1435                let byte_end = line_info.byte_offset + match_obj.end();
1436
1437                refs.push(ReferenceDef {
1438                    line: line_num,
1439                    id,
1440                    url,
1441                    title,
1442                    byte_offset,
1443                    byte_end,
1444                });
1445            }
1446        }
1447
1448        refs
1449    }
1450
1451    /// Fast blockquote prefix parser - replaces regex for 5-10x speedup
1452    /// Matches: ^(\s*>\s*)(.*)
1453    /// Returns: Some((prefix_with_ws, content_after_prefix)) or None
1454    #[inline]
1455    fn parse_blockquote_prefix(line: &str) -> Option<(&str, &str)> {
1456        let trimmed_start = line.trim_start();
1457        if !trimmed_start.starts_with('>') {
1458            return None;
1459        }
1460
1461        let leading_ws_len = line.len() - trimmed_start.len();
1462        let after_gt = &trimmed_start[1..];
1463        let content = after_gt.trim_start();
1464        let ws_after_gt_len = after_gt.len() - content.len();
1465        let prefix_len = leading_ws_len + 1 + ws_after_gt_len;
1466
1467        Some((&line[..prefix_len], content))
1468    }
1469
1470    /// Fast unordered list parser - replaces regex for 5-10x speedup
1471    /// Matches: ^(\s*)([-*+])([ \t]*)(.*)
1472    /// Returns: Some((leading_ws, marker, spacing, content)) or None
1473    #[inline]
1474    fn parse_unordered_list(line: &str) -> Option<(&str, char, &str, &str)> {
1475        let bytes = line.as_bytes();
1476        let mut i = 0;
1477
1478        // Skip leading whitespace
1479        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1480            i += 1;
1481        }
1482
1483        // Check for marker
1484        if i >= bytes.len() {
1485            return None;
1486        }
1487        let marker = bytes[i] as char;
1488        if marker != '-' && marker != '*' && marker != '+' {
1489            return None;
1490        }
1491        let marker_pos = i;
1492        i += 1;
1493
1494        // Collect spacing after marker (space or tab only)
1495        let spacing_start = i;
1496        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1497            i += 1;
1498        }
1499
1500        Some((&line[..marker_pos], marker, &line[spacing_start..i], &line[i..]))
1501    }
1502
1503    /// Fast ordered list parser - replaces regex for 5-10x speedup
1504    /// Matches: ^(\s*)(\d+)([.)])([ \t]*)(.*)
1505    /// Returns: Some((leading_ws, number_str, delimiter, spacing, content)) or None
1506    #[inline]
1507    fn parse_ordered_list(line: &str) -> Option<(&str, &str, char, &str, &str)> {
1508        let bytes = line.as_bytes();
1509        let mut i = 0;
1510
1511        // Skip leading whitespace
1512        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1513            i += 1;
1514        }
1515
1516        // Collect digits
1517        let number_start = i;
1518        while i < bytes.len() && bytes[i].is_ascii_digit() {
1519            i += 1;
1520        }
1521        if i == number_start {
1522            return None; // No digits found
1523        }
1524
1525        // Check for delimiter
1526        if i >= bytes.len() {
1527            return None;
1528        }
1529        let delimiter = bytes[i] as char;
1530        if delimiter != '.' && delimiter != ')' {
1531            return None;
1532        }
1533        let delimiter_pos = i;
1534        i += 1;
1535
1536        // Collect spacing after delimiter (space or tab only)
1537        let spacing_start = i;
1538        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1539            i += 1;
1540        }
1541
1542        Some((
1543            &line[..number_start],
1544            &line[number_start..delimiter_pos],
1545            delimiter,
1546            &line[spacing_start..i],
1547            &line[i..],
1548        ))
1549    }
1550
1551    /// Pre-compute which lines are in code blocks - O(m*n) where m=code_blocks, n=lines
1552    /// Returns a Vec<bool> where index i indicates if line i is in a code block
1553    fn compute_code_block_line_map(content: &str, line_offsets: &[usize], code_blocks: &[(usize, usize)]) -> Vec<bool> {
1554        let num_lines = line_offsets.len();
1555        let mut in_code_block = vec![false; num_lines];
1556
1557        // For each code block, mark all lines within it
1558        for &(start, end) in code_blocks {
1559            // Ensure we're at valid UTF-8 boundaries
1560            let safe_start = if start > 0 && !content.is_char_boundary(start) {
1561                let mut boundary = start;
1562                while boundary > 0 && !content.is_char_boundary(boundary) {
1563                    boundary -= 1;
1564                }
1565                boundary
1566            } else {
1567                start
1568            };
1569
1570            let safe_end = if end < content.len() && !content.is_char_boundary(end) {
1571                let mut boundary = end;
1572                while boundary < content.len() && !content.is_char_boundary(boundary) {
1573                    boundary += 1;
1574                }
1575                boundary
1576            } else {
1577                end.min(content.len())
1578            };
1579
1580            // Trust the code blocks detected by CodeBlockUtils::detect_code_blocks()
1581            // That function now has proper list context awareness (see code_block_utils.rs)
1582            // and correctly distinguishes between:
1583            // - Fenced code blocks (``` or ~~~)
1584            // - Indented code blocks at document level (4 spaces + blank line before)
1585            // - List continuation paragraphs (NOT code blocks, even with 4 spaces)
1586            //
1587            // We no longer need to re-validate here. The original validation logic
1588            // was causing false positives by marking list continuation paragraphs as
1589            // code blocks when they have 4 spaces of indentation.
1590
1591            // Use binary search to find the first and last line indices
1592            // line_offsets is sorted, so we can use partition_point for O(log n) lookup
1593            // Use safe_start/safe_end (UTF-8 boundaries) for consistent line mapping
1594            let first_line = line_offsets.partition_point(|&offset| offset < safe_start);
1595            let last_line = line_offsets.partition_point(|&offset| offset < safe_end);
1596
1597            // Mark all lines in the range at once
1598            for flag in in_code_block.iter_mut().take(last_line).skip(first_line) {
1599                *flag = true;
1600            }
1601        }
1602
1603        in_code_block
1604    }
1605
1606    /// Pre-compute basic line information (without headings/blockquotes)
1607    fn compute_basic_line_info(
1608        content: &str,
1609        line_offsets: &[usize],
1610        code_blocks: &[(usize, usize)],
1611        flavor: MarkdownFlavor,
1612        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1613        autodoc_ranges: &[crate::utils::skip_context::ByteRange],
1614    ) -> Vec<LineInfo> {
1615        let content_lines: Vec<&str> = content.lines().collect();
1616        let mut lines = Vec::with_capacity(content_lines.len());
1617
1618        // Pre-compute which lines are in code blocks
1619        let code_block_map = Self::compute_code_block_line_map(content, line_offsets, code_blocks);
1620
1621        // Detect front matter boundaries FIRST, before any other parsing
1622        // Use FrontMatterUtils to detect all types of front matter (YAML, TOML, JSON, malformed)
1623        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1624
1625        for (i, line) in content_lines.iter().enumerate() {
1626            let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
1627            let indent = line.len() - line.trim_start().len();
1628
1629            // Parse blockquote prefix once and reuse it (avoid redundant parsing)
1630            let blockquote_parse = Self::parse_blockquote_prefix(line);
1631
1632            // For blank detection, consider blockquote context
1633            let is_blank = if let Some((_, content)) = blockquote_parse {
1634                // In blockquote context, check if content after prefix is blank
1635                content.trim().is_empty()
1636            } else {
1637                line.trim().is_empty()
1638            };
1639
1640            // Use pre-computed map for O(1) lookup instead of O(m) iteration
1641            let in_code_block = code_block_map.get(i).copied().unwrap_or(false);
1642
1643            // Detect list items (skip if in frontmatter, in mkdocstrings block, or in HTML comment)
1644            let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
1645                && crate::utils::mkdocstrings_refs::is_within_autodoc_block_ranges(autodoc_ranges, byte_offset);
1646            // Use pre-computed ranges for efficiency (O(log n) vs O(file_size))
1647            let in_html_comment =
1648                crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, byte_offset);
1649            let list_item = if !(in_code_block
1650                || is_blank
1651                || in_mkdocstrings
1652                || in_html_comment
1653                || (front_matter_end > 0 && i < front_matter_end))
1654            {
1655                // Strip blockquote prefix if present for list detection (reuse cached result)
1656                let (line_for_list_check, blockquote_prefix_len) = if let Some((prefix, content)) = blockquote_parse {
1657                    (content, prefix.len())
1658                } else {
1659                    (&**line, 0)
1660                };
1661
1662                if let Some((leading_spaces, marker, spacing, _content)) =
1663                    Self::parse_unordered_list(line_for_list_check)
1664                {
1665                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1666                    let content_column = marker_column + 1 + spacing.len();
1667
1668                    // According to CommonMark spec, unordered list items MUST have at least one space
1669                    // after the marker (-, *, or +). Without a space, it's not a list item.
1670                    // This also naturally handles cases like:
1671                    // - *emphasis* (not a list)
1672                    // - **bold** (not a list)
1673                    // - --- (horizontal rule, not a list)
1674                    if spacing.is_empty() {
1675                        None
1676                    } else {
1677                        Some(ListItemInfo {
1678                            marker: marker.to_string(),
1679                            is_ordered: false,
1680                            number: None,
1681                            marker_column,
1682                            content_column,
1683                        })
1684                    }
1685                } else if let Some((leading_spaces, number_str, delimiter, spacing, _content)) =
1686                    Self::parse_ordered_list(line_for_list_check)
1687                {
1688                    let marker = format!("{number_str}{delimiter}");
1689                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1690                    let content_column = marker_column + marker.len() + spacing.len();
1691
1692                    // According to CommonMark spec, ordered list items MUST have at least one space
1693                    // after the marker (period or parenthesis). Without a space, it's not a list item.
1694                    if spacing.is_empty() {
1695                        None
1696                    } else {
1697                        Some(ListItemInfo {
1698                            marker,
1699                            is_ordered: true,
1700                            number: number_str.parse().ok(),
1701                            marker_column,
1702                            content_column,
1703                        })
1704                    }
1705                } else {
1706                    None
1707                }
1708            } else {
1709                None
1710            };
1711
1712            lines.push(LineInfo {
1713                content: line.to_string(),
1714                byte_offset,
1715                indent,
1716                is_blank,
1717                in_code_block,
1718                in_front_matter: front_matter_end > 0 && i < front_matter_end,
1719                in_html_block: false, // Will be populated after line creation
1720                in_html_comment,
1721                list_item,
1722                heading: None,    // Will be populated in second pass for Setext headings
1723                blockquote: None, // Will be populated after line creation
1724                in_mkdocstrings,
1725                in_esm_block: false, // Will be populated after line creation for MDX files
1726            });
1727        }
1728
1729        lines
1730    }
1731
1732    /// Detect headings and blockquotes (called after HTML block detection)
1733    fn detect_headings_and_blockquotes(
1734        content: &str,
1735        lines: &mut [LineInfo],
1736        flavor: MarkdownFlavor,
1737        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1738    ) {
1739        // Regex for heading detection
1740        static ATX_HEADING_REGEX: LazyLock<regex::Regex> =
1741            LazyLock::new(|| regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap());
1742        static SETEXT_UNDERLINE_REGEX: LazyLock<regex::Regex> =
1743            LazyLock::new(|| regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap());
1744
1745        let content_lines: Vec<&str> = content.lines().collect();
1746
1747        // Detect front matter boundaries to skip those lines
1748        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1749
1750        // Detect headings (including Setext which needs look-ahead) and blockquotes
1751        for i in 0..lines.len() {
1752            if lines[i].in_code_block {
1753                continue;
1754            }
1755
1756            // Skip lines in front matter
1757            if front_matter_end > 0 && i < front_matter_end {
1758                continue;
1759            }
1760
1761            // Skip lines in HTML blocks - HTML content should not be parsed as markdown
1762            if lines[i].in_html_block {
1763                continue;
1764            }
1765
1766            let line = content_lines[i];
1767
1768            // Check for blockquotes (even on blank lines within blockquotes)
1769            if let Some(bq) = parse_blockquote_detailed(line) {
1770                let nesting_level = bq.markers.len(); // Each '>' is one level
1771                let marker_column = bq.indent.len();
1772
1773                // Build the prefix (indentation + markers + space)
1774                let prefix = format!("{}{}{}", bq.indent, bq.markers, bq.spaces_after);
1775
1776                // Check for various blockquote issues
1777                let has_no_space = bq.spaces_after.is_empty() && !bq.content.is_empty();
1778                // Consider tabs as multiple spaces, or actual multiple spaces
1779                let has_multiple_spaces = bq.spaces_after.len() > 1 || bq.spaces_after.contains('\t');
1780
1781                // Check if needs MD028 fix (empty blockquote line without proper spacing)
1782                // MD028 flags empty blockquote lines that don't have a single space after the marker
1783                // Lines like "> " or ">> " are already correct and don't need fixing
1784                let needs_md028_fix = bq.content.is_empty() && bq.spaces_after.is_empty();
1785
1786                lines[i].blockquote = Some(BlockquoteInfo {
1787                    nesting_level,
1788                    indent: bq.indent.to_string(),
1789                    marker_column,
1790                    prefix,
1791                    content: bq.content.to_string(),
1792                    has_no_space_after_marker: has_no_space,
1793                    has_multiple_spaces_after_marker: has_multiple_spaces,
1794                    needs_md028_fix,
1795                });
1796            }
1797
1798            // Skip heading detection for blank lines
1799            if lines[i].is_blank {
1800                continue;
1801            }
1802
1803            // Check for ATX headings (but skip MkDocs snippet lines)
1804            // In MkDocs flavor, lines like "# -8<- [start:name]" are snippet markers, not headings
1805            let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
1806                crate::utils::mkdocs_snippets::is_snippet_section_start(line)
1807                    || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
1808            } else {
1809                false
1810            };
1811
1812            if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1813                // Skip headings inside HTML comments (using pre-computed ranges for efficiency)
1814                if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset) {
1815                    continue;
1816                }
1817                let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1818                let hashes = caps.get(2).map_or("", |m| m.as_str());
1819                let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1820                let rest = caps.get(4).map_or("", |m| m.as_str());
1821
1822                let level = hashes.len() as u8;
1823                let marker_column = leading_spaces.len();
1824
1825                // Check for closing sequence, but handle custom IDs that might come after
1826                let (text, has_closing, closing_seq) = {
1827                    // First check if there's a custom ID at the end
1828                    let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1829                        // Check if this looks like a valid custom ID (ends with })
1830                        if rest[id_start..].trim_end().ends_with('}') {
1831                            // Split off the custom ID
1832                            (&rest[..id_start], &rest[id_start..])
1833                        } else {
1834                            (rest, "")
1835                        }
1836                    } else {
1837                        (rest, "")
1838                    };
1839
1840                    // Now look for closing hashes in the part before the custom ID
1841                    let trimmed_rest = rest_without_id.trim_end();
1842                    if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1843                        // Look for the start of the hash sequence
1844                        let mut start_of_hashes = last_hash_pos;
1845                        while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1846                            start_of_hashes -= 1;
1847                        }
1848
1849                        // Check if there's at least one space before the closing hashes
1850                        let has_space_before = start_of_hashes == 0
1851                            || trimmed_rest
1852                                .chars()
1853                                .nth(start_of_hashes - 1)
1854                                .is_some_and(|c| c.is_whitespace());
1855
1856                        // Check if this is a valid closing sequence (all hashes to end of trimmed part)
1857                        let potential_closing = &trimmed_rest[start_of_hashes..];
1858                        let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1859
1860                        if is_all_hashes && has_space_before {
1861                            // This is a closing sequence
1862                            let closing_hashes = potential_closing.to_string();
1863                            // The text is everything before the closing hashes
1864                            // Don't include the custom ID here - it will be extracted later
1865                            let text_part = if !custom_id_part.is_empty() {
1866                                // If we have a custom ID, append it back to get the full rest
1867                                // This allows the extract_header_id function to handle it properly
1868                                format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1869                            } else {
1870                                rest_without_id[..start_of_hashes].trim_end().to_string()
1871                            };
1872                            (text_part, true, closing_hashes)
1873                        } else {
1874                            // Not a valid closing sequence, return the full content
1875                            (rest.to_string(), false, String::new())
1876                        }
1877                    } else {
1878                        // No hashes found, return the full content
1879                        (rest.to_string(), false, String::new())
1880                    }
1881                };
1882
1883                let content_column = marker_column + hashes.len() + spaces_after.len();
1884
1885                // Extract custom header ID if present
1886                let raw_text = text.trim().to_string();
1887                let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1888
1889                // If no custom ID was found on the header line, check the next line for standalone attr-list
1890                if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1891                    let next_line = content_lines[i + 1];
1892                    if !lines[i + 1].in_code_block
1893                        && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1894                        && let Some(next_line_id) =
1895                            crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1896                    {
1897                        custom_id = Some(next_line_id);
1898                    }
1899                }
1900
1901                lines[i].heading = Some(HeadingInfo {
1902                    level,
1903                    style: HeadingStyle::ATX,
1904                    marker: hashes.to_string(),
1905                    marker_column,
1906                    content_column,
1907                    text: clean_text,
1908                    custom_id,
1909                    raw_text,
1910                    has_closing_sequence: has_closing,
1911                    closing_sequence: closing_seq,
1912                });
1913            }
1914            // Check for Setext headings (need to look at next line)
1915            else if i + 1 < content_lines.len() && i + 1 < lines.len() {
1916                let next_line = content_lines[i + 1];
1917                if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
1918                    // Skip if next line is front matter delimiter
1919                    if front_matter_end > 0 && i < front_matter_end {
1920                        continue;
1921                    }
1922
1923                    // Skip Setext headings inside HTML comments (using pre-computed ranges for efficiency)
1924                    if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset)
1925                    {
1926                        continue;
1927                    }
1928
1929                    let underline = next_line.trim();
1930
1931                    // Skip if the underline looks like YAML delimiter (exactly 3 or more dashes)
1932                    // YAML uses exactly `---` while Setext headings typically use longer underlines
1933                    if underline == "---" {
1934                        continue;
1935                    }
1936
1937                    // Skip if the current line looks like YAML key-value syntax
1938                    let current_line_trimmed = line.trim();
1939                    if current_line_trimmed.contains(':')
1940                        && !current_line_trimmed.starts_with('#')
1941                        && !current_line_trimmed.contains('[')
1942                        && !current_line_trimmed.contains("](")
1943                    {
1944                        // This looks like "key: value" which suggests YAML, not a heading
1945                        continue;
1946                    }
1947
1948                    let level = if underline.starts_with('=') { 1 } else { 2 };
1949                    let style = if level == 1 {
1950                        HeadingStyle::Setext1
1951                    } else {
1952                        HeadingStyle::Setext2
1953                    };
1954
1955                    // Extract custom header ID if present
1956                    let raw_text = line.trim().to_string();
1957                    let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1958
1959                    // If no custom ID was found on the header line, check the line after underline for standalone attr-list
1960                    if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
1961                        let attr_line = content_lines[i + 2];
1962                        if !lines[i + 2].in_code_block
1963                            && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
1964                            && let Some(attr_line_id) =
1965                                crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
1966                        {
1967                            custom_id = Some(attr_line_id);
1968                        }
1969                    }
1970
1971                    lines[i].heading = Some(HeadingInfo {
1972                        level,
1973                        style,
1974                        marker: underline.to_string(),
1975                        marker_column: next_line.len() - next_line.trim_start().len(),
1976                        content_column: lines[i].indent,
1977                        text: clean_text,
1978                        custom_id,
1979                        raw_text,
1980                        has_closing_sequence: false,
1981                        closing_sequence: String::new(),
1982                    });
1983                }
1984            }
1985        }
1986    }
1987
1988    /// Detect HTML blocks in the content
1989    fn detect_html_blocks(lines: &mut [LineInfo]) {
1990        // HTML block elements that trigger block context
1991        const BLOCK_ELEMENTS: &[&str] = &[
1992            "address",
1993            "article",
1994            "aside",
1995            "blockquote",
1996            "details",
1997            "dialog",
1998            "dd",
1999            "div",
2000            "dl",
2001            "dt",
2002            "fieldset",
2003            "figcaption",
2004            "figure",
2005            "footer",
2006            "form",
2007            "h1",
2008            "h2",
2009            "h3",
2010            "h4",
2011            "h5",
2012            "h6",
2013            "header",
2014            "hr",
2015            "li",
2016            "main",
2017            "nav",
2018            "ol",
2019            "p",
2020            "pre",
2021            "script",
2022            "section",
2023            "style",
2024            "table",
2025            "tbody",
2026            "td",
2027            "tfoot",
2028            "th",
2029            "thead",
2030            "tr",
2031            "ul",
2032        ];
2033
2034        let mut i = 0;
2035        while i < lines.len() {
2036            // Skip if already in code block or front matter
2037            if lines[i].in_code_block || lines[i].in_front_matter {
2038                i += 1;
2039                continue;
2040            }
2041
2042            let trimmed = lines[i].content.trim_start();
2043
2044            // Check if line starts with an HTML tag
2045            if trimmed.starts_with('<') && trimmed.len() > 1 {
2046                // Extract tag name safely
2047                let after_bracket = &trimmed[1..];
2048                let is_closing = after_bracket.starts_with('/');
2049                let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
2050
2051                // Extract tag name (stop at space, >, /, or end of string)
2052                let tag_name = tag_start
2053                    .chars()
2054                    .take_while(|c| c.is_ascii_alphabetic() || *c == '-')
2055                    .collect::<String>()
2056                    .to_lowercase();
2057
2058                // Check if it's a block element
2059                if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
2060                    // Mark this line as in HTML block
2061                    lines[i].in_html_block = true;
2062
2063                    // For simplicity, just mark lines until we find a closing tag or reach a blank line
2064                    // This avoids complex nesting logic that might cause infinite loops
2065                    if !is_closing {
2066                        let closing_tag = format!("</{tag_name}>");
2067                        // style and script tags can contain blank lines (CSS/JS formatting)
2068                        let allow_blank_lines = tag_name == "style" || tag_name == "script";
2069                        let mut j = i + 1;
2070                        while j < lines.len() && j < i + 100 {
2071                            // Limit search to 100 lines
2072                            // Stop at blank lines (except for style/script tags)
2073                            if !allow_blank_lines && lines[j].is_blank {
2074                                break;
2075                            }
2076
2077                            lines[j].in_html_block = true;
2078
2079                            // Check if this line contains the closing tag
2080                            if lines[j].content.contains(&closing_tag) {
2081                                break;
2082                            }
2083                            j += 1;
2084                        }
2085                    }
2086                }
2087            }
2088
2089            i += 1;
2090        }
2091    }
2092
2093    /// Detect ESM import/export blocks in MDX files
2094    /// ESM blocks consist of contiguous import/export statements at the top of the file
2095    fn detect_esm_blocks(lines: &mut [LineInfo], flavor: MarkdownFlavor) {
2096        // Only process MDX files
2097        if !flavor.supports_esm_blocks() {
2098            return;
2099        }
2100
2101        for line in lines.iter_mut() {
2102            // Skip blank lines and comments at the start
2103            if line.is_blank || line.in_html_comment {
2104                continue;
2105            }
2106
2107            // Check if line starts with import or export
2108            let trimmed = line.content.trim_start();
2109            if trimmed.starts_with("import ") || trimmed.starts_with("export ") {
2110                line.in_esm_block = true;
2111            } else {
2112                // Once we hit a non-ESM line, we're done with the ESM block
2113                break;
2114            }
2115        }
2116    }
2117
2118    /// Parse all inline code spans in the content using pulldown-cmark streaming parser
2119    fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
2120        let mut code_spans = Vec::new();
2121
2122        // Quick check - if no backticks, no code spans
2123        if !content.contains('`') {
2124            return code_spans;
2125        }
2126
2127        // Use pulldown-cmark's streaming parser with byte offsets
2128        let parser = Parser::new(content).into_offset_iter();
2129
2130        for (event, range) in parser {
2131            if let Event::Code(_) = event {
2132                let start_pos = range.start;
2133                let end_pos = range.end;
2134
2135                // The range includes the backticks, extract the actual content
2136                let full_span = &content[start_pos..end_pos];
2137                let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
2138
2139                // Extract content between backticks, preserving spaces
2140                let content_start = start_pos + backtick_count;
2141                let content_end = end_pos - backtick_count;
2142                let span_content = if content_start < content_end {
2143                    content[content_start..content_end].to_string()
2144                } else {
2145                    String::new()
2146                };
2147
2148                // Use binary search to find line number - O(log n) instead of O(n)
2149                // Find the rightmost line whose byte_offset <= start_pos
2150                let line_idx = lines
2151                    .partition_point(|line| line.byte_offset <= start_pos)
2152                    .saturating_sub(1);
2153                let line_num = line_idx + 1;
2154                let col_start = start_pos - lines[line_idx].byte_offset;
2155
2156                // Find end column using binary search
2157                let end_line_idx = lines
2158                    .partition_point(|line| line.byte_offset <= end_pos)
2159                    .saturating_sub(1);
2160                let col_end = end_pos - lines[end_line_idx].byte_offset;
2161
2162                code_spans.push(CodeSpan {
2163                    line: line_num,
2164                    start_col: col_start,
2165                    end_col: col_end,
2166                    byte_offset: start_pos,
2167                    byte_end: end_pos,
2168                    backtick_count,
2169                    content: span_content,
2170                });
2171            }
2172        }
2173
2174        // Sort by position to ensure consistent ordering
2175        code_spans.sort_by_key(|span| span.byte_offset);
2176
2177        code_spans
2178    }
2179
2180    /// Parse all list blocks in the content (legacy line-by-line approach)
2181    fn parse_list_blocks(lines: &[LineInfo]) -> Vec<ListBlock> {
2182        // Pre-size based on lines that could be list items
2183        let mut list_blocks = Vec::with_capacity(lines.len() / 10); // Estimate ~10% of lines might start list blocks
2184        let mut current_block: Option<ListBlock> = None;
2185        let mut last_list_item_line = 0;
2186        let mut current_indent_level = 0;
2187        let mut last_marker_width = 0;
2188
2189        for (line_idx, line_info) in lines.iter().enumerate() {
2190            let line_num = line_idx + 1;
2191
2192            // Enhanced code block handling using Design #3's context analysis
2193            if line_info.in_code_block {
2194                if let Some(ref mut block) = current_block {
2195                    // Calculate minimum indentation for list continuation
2196                    let min_continuation_indent = CodeBlockUtils::calculate_min_continuation_indent(lines, line_idx);
2197
2198                    // Analyze code block context using the three-tier classification
2199                    let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
2200
2201                    match context {
2202                        CodeBlockContext::Indented => {
2203                            // Code block is properly indented - continues the list
2204                            block.end_line = line_num;
2205                            continue;
2206                        }
2207                        CodeBlockContext::Standalone => {
2208                            // Code block separates lists - end current block
2209                            let completed_block = current_block.take().unwrap();
2210                            list_blocks.push(completed_block);
2211                            continue;
2212                        }
2213                        CodeBlockContext::Adjacent => {
2214                            // Edge case - use conservative behavior (continue list)
2215                            block.end_line = line_num;
2216                            continue;
2217                        }
2218                    }
2219                } else {
2220                    // No current list block - skip code block lines
2221                    continue;
2222                }
2223            }
2224
2225            // Extract blockquote prefix if any
2226            let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(&line_info.content) {
2227                caps.get(0).unwrap().as_str().to_string()
2228            } else {
2229                String::new()
2230            };
2231
2232            // Check if this line is a list item
2233            if let Some(list_item) = &line_info.list_item {
2234                // Calculate nesting level based on indentation
2235                let item_indent = list_item.marker_column;
2236                let nesting = item_indent / 2; // Assume 2-space indentation for nesting
2237
2238                if let Some(ref mut block) = current_block {
2239                    // Check if this continues the current block
2240                    // For nested lists, we need to check if this is a nested item (higher nesting level)
2241                    // or a continuation at the same or lower level
2242                    let is_nested = nesting > block.nesting_level;
2243                    let same_type =
2244                        (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
2245                    let same_context = block.blockquote_prefix == blockquote_prefix;
2246                    let reasonable_distance = line_num <= last_list_item_line + 2; // Allow one blank line
2247
2248                    // For unordered lists, also check marker consistency
2249                    let marker_compatible =
2250                        block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
2251
2252                    // Check if there's non-list content between the last item and this one
2253                    let has_non_list_content = {
2254                        let mut found_non_list = false;
2255                        // Use the last item from the current block, not the global last_list_item_line
2256                        let block_last_item_line = block.item_lines.last().copied().unwrap_or(block.end_line);
2257
2258                        // Debug: Special check for problematic line
2259                        if block_last_item_line > 0 && block_last_item_line <= lines.len() {
2260                            let last_line = &lines[block_last_item_line - 1];
2261                            if last_line.content.contains(r"`sqlalchemy`") && last_line.content.contains(r"\`") {
2262                                log::debug!(
2263                                    "After problematic line {}: checking lines {} to {} for non-list content",
2264                                    block_last_item_line,
2265                                    block_last_item_line + 1,
2266                                    line_num
2267                                );
2268                                // If they're consecutive list items, there's no content between
2269                                if line_num == block_last_item_line + 1 {
2270                                    log::debug!("Lines are consecutive, no content between");
2271                                }
2272                            }
2273                        }
2274
2275                        for check_line in (block_last_item_line + 1)..line_num {
2276                            let check_idx = check_line - 1;
2277                            if check_idx < lines.len() {
2278                                let check_info = &lines[check_idx];
2279                                // Check for content that breaks the list
2280                                let is_list_breaking_content = if check_info.in_code_block {
2281                                    // Use enhanced code block classification for list separation
2282                                    let last_item_marker_width =
2283                                        if block_last_item_line > 0 && block_last_item_line <= lines.len() {
2284                                            lines[block_last_item_line - 1]
2285                                                .list_item
2286                                                .as_ref()
2287                                                .map(|li| {
2288                                                    if li.is_ordered {
2289                                                        li.marker.len() + 1 // Add 1 for the space after ordered list markers
2290                                                    } else {
2291                                                        li.marker.len()
2292                                                    }
2293                                                })
2294                                                .unwrap_or(3) // fallback to 3 if no list item found
2295                                        } else {
2296                                            3 // fallback
2297                                        };
2298
2299                                    let min_continuation = if block.is_ordered { last_item_marker_width } else { 2 };
2300
2301                                    // Analyze code block context using our enhanced classification
2302                                    let context = CodeBlockUtils::analyze_code_block_context(
2303                                        lines,
2304                                        check_line - 1,
2305                                        min_continuation,
2306                                    );
2307
2308                                    // Standalone code blocks break lists, indented ones continue them
2309                                    matches!(context, CodeBlockContext::Standalone)
2310                                } else if !check_info.is_blank && check_info.list_item.is_none() {
2311                                    // Check for structural separators that should break lists (from issue #42)
2312                                    let line_content = check_info.content.trim();
2313
2314                                    // Any of these structural separators break lists
2315                                    if check_info.heading.is_some()
2316                                        || line_content.starts_with("---")
2317                                        || line_content.starts_with("***")
2318                                        || line_content.starts_with("___")
2319                                        || (line_content.contains('|')
2320                                            && !line_content.contains("](")
2321                                            && !line_content.contains("http")
2322                                            && (line_content.matches('|').count() > 1
2323                                                || line_content.starts_with('|')
2324                                                || line_content.ends_with('|')))
2325                                        || line_content.starts_with(">")
2326                                    {
2327                                        true
2328                                    }
2329                                    // Other non-list content - check if properly indented
2330                                    else {
2331                                        let last_item_marker_width =
2332                                            if block_last_item_line > 0 && block_last_item_line <= lines.len() {
2333                                                lines[block_last_item_line - 1]
2334                                                    .list_item
2335                                                    .as_ref()
2336                                                    .map(|li| {
2337                                                        if li.is_ordered {
2338                                                            li.marker.len() + 1 // Add 1 for the space after ordered list markers
2339                                                        } else {
2340                                                            li.marker.len()
2341                                                        }
2342                                                    })
2343                                                    .unwrap_or(3) // fallback to 3 if no list item found
2344                                            } else {
2345                                                3 // fallback
2346                                            };
2347
2348                                        let min_continuation =
2349                                            if block.is_ordered { last_item_marker_width } else { 2 };
2350                                        check_info.indent < min_continuation
2351                                    }
2352                                } else {
2353                                    false
2354                                };
2355
2356                                if is_list_breaking_content {
2357                                    // Not indented enough, so it breaks the list
2358                                    found_non_list = true;
2359                                    break;
2360                                }
2361                            }
2362                        }
2363                        found_non_list
2364                    };
2365
2366                    // A list continues if:
2367                    // 1. It's a nested item (indented more than the parent), OR
2368                    // 2. It's the same type at the same level with reasonable distance
2369                    let mut continues_list = if is_nested {
2370                        // Nested items always continue the list if they're in the same context
2371                        same_context && reasonable_distance && !has_non_list_content
2372                    } else {
2373                        // Same-level items need to match type and markers
2374                        let result = same_type
2375                            && same_context
2376                            && reasonable_distance
2377                            && marker_compatible
2378                            && !has_non_list_content;
2379
2380                        // Debug logging for lines after problematic content
2381                        if block.item_lines.last().is_some_and(|&last_line| {
2382                            last_line > 0
2383                                && last_line <= lines.len()
2384                                && lines[last_line - 1].content.contains(r"`sqlalchemy`")
2385                                && lines[last_line - 1].content.contains(r"\`")
2386                        }) {
2387                            log::debug!(
2388                                "List continuation check after problematic line at line {line_num}: same_type={same_type}, same_context={same_context}, reasonable_distance={reasonable_distance}, marker_compatible={marker_compatible}, has_non_list_content={has_non_list_content}, continues={result}"
2389                            );
2390                            if line_num > 0 && line_num <= lines.len() {
2391                                log::debug!("Current line content: {:?}", lines[line_num - 1].content);
2392                            }
2393                        }
2394
2395                        result
2396                    };
2397
2398                    // WORKAROUND: If items are truly consecutive (no blank lines), they MUST be in the same list
2399                    // This handles edge cases where content patterns might otherwise split lists incorrectly
2400                    if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
2401                        // Check if the previous line was a list item
2402                        if block.item_lines.contains(&(line_num - 1)) {
2403                            // They're consecutive list items - force them to be in the same list
2404                            continues_list = true;
2405                        }
2406                    }
2407
2408                    if continues_list {
2409                        // Extend current block
2410                        block.end_line = line_num;
2411                        block.item_lines.push(line_num);
2412
2413                        // Update max marker width
2414                        block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
2415                            list_item.marker.len() + 1
2416                        } else {
2417                            list_item.marker.len()
2418                        });
2419
2420                        // Update marker consistency for unordered lists
2421                        if !block.is_ordered
2422                            && block.marker.is_some()
2423                            && block.marker.as_ref() != Some(&list_item.marker)
2424                        {
2425                            // Mixed markers, clear the marker field
2426                            block.marker = None;
2427                        }
2428                    } else {
2429                        // End current block and start a new one
2430
2431                        list_blocks.push(block.clone());
2432
2433                        *block = ListBlock {
2434                            start_line: line_num,
2435                            end_line: line_num,
2436                            is_ordered: list_item.is_ordered,
2437                            marker: if list_item.is_ordered {
2438                                None
2439                            } else {
2440                                Some(list_item.marker.clone())
2441                            },
2442                            blockquote_prefix: blockquote_prefix.clone(),
2443                            item_lines: vec![line_num],
2444                            nesting_level: nesting,
2445                            max_marker_width: if list_item.is_ordered {
2446                                list_item.marker.len() + 1
2447                            } else {
2448                                list_item.marker.len()
2449                            },
2450                        };
2451                    }
2452                } else {
2453                    // Start a new block
2454                    current_block = Some(ListBlock {
2455                        start_line: line_num,
2456                        end_line: line_num,
2457                        is_ordered: list_item.is_ordered,
2458                        marker: if list_item.is_ordered {
2459                            None
2460                        } else {
2461                            Some(list_item.marker.clone())
2462                        },
2463                        blockquote_prefix,
2464                        item_lines: vec![line_num],
2465                        nesting_level: nesting,
2466                        max_marker_width: list_item.marker.len(),
2467                    });
2468                }
2469
2470                last_list_item_line = line_num;
2471                current_indent_level = item_indent;
2472                last_marker_width = if list_item.is_ordered {
2473                    list_item.marker.len() + 1 // Add 1 for the space after ordered list markers
2474                } else {
2475                    list_item.marker.len()
2476                };
2477            } else if let Some(ref mut block) = current_block {
2478                // Not a list item - check if it continues the current block
2479
2480                // For MD032 compatibility, we use a simple approach:
2481                // - Indented lines continue the list
2482                // - Blank lines followed by indented content continue the list
2483                // - Everything else ends the list
2484
2485                // Check if the last line in the list block ended with a backslash (hard line break)
2486                // This handles cases where list items use backslash for hard line breaks
2487                let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
2488                    lines[block.end_line - 1].content.trim_end().ends_with('\\')
2489                } else {
2490                    false
2491                };
2492
2493                // Calculate minimum indentation for list continuation
2494                // For ordered lists, use the last marker width (e.g., 3 for "1. ", 4 for "10. ")
2495                // For unordered lists like "- ", content starts at column 2, so continuations need at least 2 spaces
2496                let min_continuation_indent = if block.is_ordered {
2497                    current_indent_level + last_marker_width
2498                } else {
2499                    current_indent_level + 2 // Unordered lists need at least 2 spaces (e.g., "- " = 2 chars)
2500                };
2501
2502                if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
2503                    // Indented line or backslash continuation continues the list
2504                    block.end_line = line_num;
2505                } else if line_info.is_blank {
2506                    // Blank line - check if it's internal to the list or ending it
2507                    // We only include blank lines that are followed by more list content
2508                    let mut check_idx = line_idx + 1;
2509                    let mut found_continuation = false;
2510
2511                    // Skip additional blank lines
2512                    while check_idx < lines.len() && lines[check_idx].is_blank {
2513                        check_idx += 1;
2514                    }
2515
2516                    if check_idx < lines.len() {
2517                        let next_line = &lines[check_idx];
2518                        // Check if followed by indented content (list continuation)
2519                        if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
2520                            found_continuation = true;
2521                        }
2522                        // Check if followed by another list item at the same level
2523                        else if !next_line.in_code_block
2524                            && next_line.list_item.is_some()
2525                            && let Some(item) = &next_line.list_item
2526                        {
2527                            let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
2528                                .find(&next_line.content)
2529                                .map_or(String::new(), |m| m.as_str().to_string());
2530                            if item.marker_column == current_indent_level
2531                                && item.is_ordered == block.is_ordered
2532                                && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
2533                            {
2534                                // Check if there was meaningful content between the list items (unused now)
2535                                // This variable is kept for potential future use but is currently replaced by has_structural_separators
2536                                let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
2537                                    if let Some(between_line) = lines.get(idx) {
2538                                        let trimmed = between_line.content.trim();
2539                                        // Skip empty lines
2540                                        if trimmed.is_empty() {
2541                                            return false;
2542                                        }
2543                                        // Check for meaningful content
2544                                        let line_indent =
2545                                            between_line.content.len() - between_line.content.trim_start().len();
2546
2547                                        // Structural separators (code fences, headings, etc.) are meaningful and should BREAK lists
2548                                        if trimmed.starts_with("```")
2549                                            || trimmed.starts_with("~~~")
2550                                            || trimmed.starts_with("---")
2551                                            || trimmed.starts_with("***")
2552                                            || trimmed.starts_with("___")
2553                                            || trimmed.starts_with(">")
2554                                            || trimmed.contains('|') // Tables
2555                                            || between_line.heading.is_some()
2556                                        {
2557                                            return true; // These are structural separators - meaningful content that breaks lists
2558                                        }
2559
2560                                        // Only properly indented content continues the list
2561                                        line_indent >= min_continuation_indent
2562                                    } else {
2563                                        false
2564                                    }
2565                                });
2566
2567                                if block.is_ordered {
2568                                    // For ordered lists: don't continue if there are structural separators
2569                                    // Check if there are structural separators between the list items
2570                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2571                                        if let Some(between_line) = lines.get(idx) {
2572                                            let trimmed = between_line.content.trim();
2573                                            if trimmed.is_empty() {
2574                                                return false;
2575                                            }
2576                                            // Check for structural separators that break lists
2577                                            trimmed.starts_with("```")
2578                                                || trimmed.starts_with("~~~")
2579                                                || trimmed.starts_with("---")
2580                                                || trimmed.starts_with("***")
2581                                                || trimmed.starts_with("___")
2582                                                || trimmed.starts_with(">")
2583                                                || trimmed.contains('|') // Tables
2584                                                || between_line.heading.is_some()
2585                                        } else {
2586                                            false
2587                                        }
2588                                    });
2589                                    found_continuation = !has_structural_separators;
2590                                } else {
2591                                    // For unordered lists: also check for structural separators
2592                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2593                                        if let Some(between_line) = lines.get(idx) {
2594                                            let trimmed = between_line.content.trim();
2595                                            if trimmed.is_empty() {
2596                                                return false;
2597                                            }
2598                                            // Check for structural separators that break lists
2599                                            trimmed.starts_with("```")
2600                                                || trimmed.starts_with("~~~")
2601                                                || trimmed.starts_with("---")
2602                                                || trimmed.starts_with("***")
2603                                                || trimmed.starts_with("___")
2604                                                || trimmed.starts_with(">")
2605                                                || trimmed.contains('|') // Tables
2606                                                || between_line.heading.is_some()
2607                                        } else {
2608                                            false
2609                                        }
2610                                    });
2611                                    found_continuation = !has_structural_separators;
2612                                }
2613                            }
2614                        }
2615                    }
2616
2617                    if found_continuation {
2618                        // Include the blank line in the block
2619                        block.end_line = line_num;
2620                    } else {
2621                        // Blank line ends the list - don't include it
2622                        list_blocks.push(block.clone());
2623                        current_block = None;
2624                    }
2625                } else {
2626                    // Check for lazy continuation - non-indented line immediately after a list item
2627                    // But only if the line has sufficient indentation for the list type
2628                    let min_required_indent = if block.is_ordered {
2629                        current_indent_level + last_marker_width
2630                    } else {
2631                        current_indent_level + 2
2632                    };
2633
2634                    // For lazy continuation to apply, the line must either:
2635                    // 1. Have no indentation (true lazy continuation)
2636                    // 2. Have sufficient indentation for the list type
2637                    // BUT structural separators (headings, code blocks, etc.) should never be lazy continuations
2638                    let line_content = line_info.content.trim();
2639                    let is_structural_separator = line_info.heading.is_some()
2640                        || line_content.starts_with("```")
2641                        || line_content.starts_with("~~~")
2642                        || line_content.starts_with("---")
2643                        || line_content.starts_with("***")
2644                        || line_content.starts_with("___")
2645                        || line_content.starts_with(">")
2646                        || (line_content.contains('|')
2647                            && !line_content.contains("](")
2648                            && !line_content.contains("http")
2649                            && (line_content.matches('|').count() > 1
2650                                || line_content.starts_with('|')
2651                                || line_content.ends_with('|'))); // Tables
2652
2653                    // Allow lazy continuation if we're still within the same list block
2654                    // (not just immediately after a list item)
2655                    let is_lazy_continuation = !is_structural_separator
2656                        && !line_info.is_blank
2657                        && (line_info.indent == 0 || line_info.indent >= min_required_indent);
2658
2659                    if is_lazy_continuation {
2660                        // Additional check: if the line starts with uppercase and looks like a new sentence,
2661                        // it's probably not a continuation
2662                        let content_to_check = if !blockquote_prefix.is_empty() {
2663                            // Strip blockquote prefix to check the actual content
2664                            line_info
2665                                .content
2666                                .strip_prefix(&blockquote_prefix)
2667                                .unwrap_or(&line_info.content)
2668                                .trim()
2669                        } else {
2670                            line_info.content.trim()
2671                        };
2672
2673                        let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
2674
2675                        // If it starts with uppercase and the previous line ended with punctuation,
2676                        // it's likely a new paragraph, not a continuation
2677                        if starts_with_uppercase && last_list_item_line > 0 {
2678                            // This looks like a new paragraph
2679                            list_blocks.push(block.clone());
2680                            current_block = None;
2681                        } else {
2682                            // This is a lazy continuation line
2683                            block.end_line = line_num;
2684                        }
2685                    } else {
2686                        // Non-indented, non-blank line that's not a lazy continuation - end the block
2687                        list_blocks.push(block.clone());
2688                        current_block = None;
2689                    }
2690                }
2691            }
2692        }
2693
2694        // Don't forget the last block
2695        if let Some(block) = current_block {
2696            list_blocks.push(block);
2697        }
2698
2699        // Merge adjacent blocks that should be one
2700        merge_adjacent_list_blocks(&mut list_blocks, lines);
2701
2702        list_blocks
2703    }
2704
2705    /// Compute character frequency for fast content analysis
2706    fn compute_char_frequency(content: &str) -> CharFrequency {
2707        let mut frequency = CharFrequency::default();
2708
2709        for ch in content.chars() {
2710            match ch {
2711                '#' => frequency.hash_count += 1,
2712                '*' => frequency.asterisk_count += 1,
2713                '_' => frequency.underscore_count += 1,
2714                '-' => frequency.hyphen_count += 1,
2715                '+' => frequency.plus_count += 1,
2716                '>' => frequency.gt_count += 1,
2717                '|' => frequency.pipe_count += 1,
2718                '[' => frequency.bracket_count += 1,
2719                '`' => frequency.backtick_count += 1,
2720                '<' => frequency.lt_count += 1,
2721                '!' => frequency.exclamation_count += 1,
2722                '\n' => frequency.newline_count += 1,
2723                _ => {}
2724            }
2725        }
2726
2727        frequency
2728    }
2729
2730    /// Parse HTML tags in the content
2731    fn parse_html_tags(
2732        content: &str,
2733        lines: &[LineInfo],
2734        code_blocks: &[(usize, usize)],
2735        flavor: MarkdownFlavor,
2736    ) -> Vec<HtmlTag> {
2737        static HTML_TAG_REGEX: LazyLock<regex::Regex> =
2738            LazyLock::new(|| regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap());
2739
2740        let mut html_tags = Vec::with_capacity(content.matches('<').count());
2741
2742        for cap in HTML_TAG_REGEX.captures_iter(content) {
2743            let full_match = cap.get(0).unwrap();
2744            let match_start = full_match.start();
2745            let match_end = full_match.end();
2746
2747            // Skip if in code block
2748            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2749                continue;
2750            }
2751
2752            let is_closing = !cap.get(1).unwrap().as_str().is_empty();
2753            let tag_name_original = cap.get(2).unwrap().as_str();
2754            let tag_name = tag_name_original.to_lowercase();
2755            let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
2756
2757            // Skip JSX components in MDX files (tags starting with uppercase letter)
2758            // JSX components like <Chart />, <MyComponent> should not be treated as HTML
2759            if flavor.supports_jsx() && tag_name_original.chars().next().is_some_and(|c| c.is_uppercase()) {
2760                continue;
2761            }
2762
2763            // Find which line this tag is on
2764            let mut line_num = 1;
2765            let mut col_start = match_start;
2766            let mut col_end = match_end;
2767            for (idx, line_info) in lines.iter().enumerate() {
2768                if match_start >= line_info.byte_offset {
2769                    line_num = idx + 1;
2770                    col_start = match_start - line_info.byte_offset;
2771                    col_end = match_end - line_info.byte_offset;
2772                } else {
2773                    break;
2774                }
2775            }
2776
2777            html_tags.push(HtmlTag {
2778                line: line_num,
2779                start_col: col_start,
2780                end_col: col_end,
2781                byte_offset: match_start,
2782                byte_end: match_end,
2783                tag_name,
2784                is_closing,
2785                is_self_closing,
2786                raw_content: full_match.as_str().to_string(),
2787            });
2788        }
2789
2790        html_tags
2791    }
2792
2793    /// Parse emphasis spans in the content
2794    fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
2795        static EMPHASIS_REGEX: LazyLock<regex::Regex> =
2796            LazyLock::new(|| regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap());
2797
2798        let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
2799
2800        for cap in EMPHASIS_REGEX.captures_iter(content) {
2801            let full_match = cap.get(0).unwrap();
2802            let match_start = full_match.start();
2803            let match_end = full_match.end();
2804
2805            // Skip if in code block
2806            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2807                continue;
2808            }
2809
2810            let opening_markers = cap.get(1).unwrap().as_str();
2811            let content_part = cap.get(2).unwrap().as_str();
2812            let closing_markers = cap.get(3).unwrap().as_str();
2813
2814            // Validate matching markers
2815            if opening_markers.chars().next() != closing_markers.chars().next()
2816                || opening_markers.len() != closing_markers.len()
2817            {
2818                continue;
2819            }
2820
2821            let marker = opening_markers.chars().next().unwrap();
2822            let marker_count = opening_markers.len();
2823
2824            // Find which line this emphasis is on
2825            let mut line_num = 1;
2826            let mut col_start = match_start;
2827            let mut col_end = match_end;
2828            for (idx, line_info) in lines.iter().enumerate() {
2829                if match_start >= line_info.byte_offset {
2830                    line_num = idx + 1;
2831                    col_start = match_start - line_info.byte_offset;
2832                    col_end = match_end - line_info.byte_offset;
2833                } else {
2834                    break;
2835                }
2836            }
2837
2838            emphasis_spans.push(EmphasisSpan {
2839                line: line_num,
2840                start_col: col_start,
2841                end_col: col_end,
2842                byte_offset: match_start,
2843                byte_end: match_end,
2844                marker,
2845                marker_count,
2846                content: content_part.to_string(),
2847            });
2848        }
2849
2850        emphasis_spans
2851    }
2852
2853    /// Parse table rows in the content
2854    fn parse_table_rows(lines: &[LineInfo]) -> Vec<TableRow> {
2855        let mut table_rows = Vec::with_capacity(lines.len() / 20);
2856
2857        for (line_idx, line_info) in lines.iter().enumerate() {
2858            // Skip lines in code blocks or blank lines
2859            if line_info.in_code_block || line_info.is_blank {
2860                continue;
2861            }
2862
2863            let line = &line_info.content;
2864            let line_num = line_idx + 1;
2865
2866            // Check if this line contains pipes (potential table row)
2867            if !line.contains('|') {
2868                continue;
2869            }
2870
2871            // Count columns by splitting on pipes
2872            let parts: Vec<&str> = line.split('|').collect();
2873            let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
2874
2875            // Check if this is a separator row
2876            let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
2877            let mut column_alignments = Vec::new();
2878
2879            if is_separator {
2880                for part in &parts[1..parts.len() - 1] {
2881                    // Skip first and last empty parts
2882                    let trimmed = part.trim();
2883                    let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
2884                        "center".to_string()
2885                    } else if trimmed.ends_with(':') {
2886                        "right".to_string()
2887                    } else if trimmed.starts_with(':') {
2888                        "left".to_string()
2889                    } else {
2890                        "none".to_string()
2891                    };
2892                    column_alignments.push(alignment);
2893                }
2894            }
2895
2896            table_rows.push(TableRow {
2897                line: line_num,
2898                is_separator,
2899                column_count,
2900                column_alignments,
2901            });
2902        }
2903
2904        table_rows
2905    }
2906
2907    /// Parse bare URLs and emails in the content
2908    fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
2909        let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
2910
2911        // Check for bare URLs (not in angle brackets or markdown links)
2912        for cap in BARE_URL_PATTERN.captures_iter(content) {
2913            let full_match = cap.get(0).unwrap();
2914            let match_start = full_match.start();
2915            let match_end = full_match.end();
2916
2917            // Skip if in code block
2918            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2919                continue;
2920            }
2921
2922            // Skip if already in angle brackets or markdown links
2923            let preceding_char = if match_start > 0 {
2924                content.chars().nth(match_start - 1)
2925            } else {
2926                None
2927            };
2928            let following_char = content.chars().nth(match_end);
2929
2930            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2931                continue;
2932            }
2933            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2934                continue;
2935            }
2936
2937            let url = full_match.as_str();
2938            let url_type = if url.starts_with("https://") {
2939                "https"
2940            } else if url.starts_with("http://") {
2941                "http"
2942            } else if url.starts_with("ftp://") {
2943                "ftp"
2944            } else {
2945                "other"
2946            };
2947
2948            // Find which line this URL is on
2949            let mut line_num = 1;
2950            let mut col_start = match_start;
2951            let mut col_end = match_end;
2952            for (idx, line_info) in lines.iter().enumerate() {
2953                if match_start >= line_info.byte_offset {
2954                    line_num = idx + 1;
2955                    col_start = match_start - line_info.byte_offset;
2956                    col_end = match_end - line_info.byte_offset;
2957                } else {
2958                    break;
2959                }
2960            }
2961
2962            bare_urls.push(BareUrl {
2963                line: line_num,
2964                start_col: col_start,
2965                end_col: col_end,
2966                byte_offset: match_start,
2967                byte_end: match_end,
2968                url: url.to_string(),
2969                url_type: url_type.to_string(),
2970            });
2971        }
2972
2973        // Check for bare email addresses
2974        for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
2975            let full_match = cap.get(0).unwrap();
2976            let match_start = full_match.start();
2977            let match_end = full_match.end();
2978
2979            // Skip if in code block
2980            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2981                continue;
2982            }
2983
2984            // Skip if already in angle brackets or markdown links
2985            let preceding_char = if match_start > 0 {
2986                content.chars().nth(match_start - 1)
2987            } else {
2988                None
2989            };
2990            let following_char = content.chars().nth(match_end);
2991
2992            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2993                continue;
2994            }
2995            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2996                continue;
2997            }
2998
2999            let email = full_match.as_str();
3000
3001            // Find which line this email is on
3002            let mut line_num = 1;
3003            let mut col_start = match_start;
3004            let mut col_end = match_end;
3005            for (idx, line_info) in lines.iter().enumerate() {
3006                if match_start >= line_info.byte_offset {
3007                    line_num = idx + 1;
3008                    col_start = match_start - line_info.byte_offset;
3009                    col_end = match_end - line_info.byte_offset;
3010                } else {
3011                    break;
3012                }
3013            }
3014
3015            bare_urls.push(BareUrl {
3016                line: line_num,
3017                start_col: col_start,
3018                end_col: col_end,
3019                byte_offset: match_start,
3020                byte_end: match_end,
3021                url: email.to_string(),
3022                url_type: "email".to_string(),
3023            });
3024        }
3025
3026        bare_urls
3027    }
3028}
3029
3030/// Merge adjacent list blocks that should be treated as one
3031fn merge_adjacent_list_blocks(list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
3032    if list_blocks.len() < 2 {
3033        return;
3034    }
3035
3036    let mut merger = ListBlockMerger::new(lines);
3037    *list_blocks = merger.merge(list_blocks);
3038}
3039
3040/// Helper struct to manage the complex logic of merging list blocks
3041struct ListBlockMerger<'a> {
3042    lines: &'a [LineInfo],
3043}
3044
3045impl<'a> ListBlockMerger<'a> {
3046    fn new(lines: &'a [LineInfo]) -> Self {
3047        Self { lines }
3048    }
3049
3050    fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
3051        let mut merged = Vec::with_capacity(list_blocks.len());
3052        let mut current = list_blocks[0].clone();
3053
3054        for next in list_blocks.iter().skip(1) {
3055            if self.should_merge_blocks(&current, next) {
3056                current = self.merge_two_blocks(current, next);
3057            } else {
3058                merged.push(current);
3059                current = next.clone();
3060            }
3061        }
3062
3063        merged.push(current);
3064        merged
3065    }
3066
3067    /// Determine if two adjacent list blocks should be merged
3068    fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
3069        // Basic compatibility checks
3070        if !self.blocks_are_compatible(current, next) {
3071            return false;
3072        }
3073
3074        // Check spacing and content between blocks
3075        let spacing = self.analyze_spacing_between(current, next);
3076        match spacing {
3077            BlockSpacing::Consecutive => true,
3078            BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
3079            BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
3080                self.can_merge_with_content_between(current, next)
3081            }
3082        }
3083    }
3084
3085    /// Check if blocks have compatible structure for merging
3086    fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
3087        current.is_ordered == next.is_ordered
3088            && current.blockquote_prefix == next.blockquote_prefix
3089            && current.nesting_level == next.nesting_level
3090    }
3091
3092    /// Analyze the spacing between two list blocks
3093    fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
3094        let gap = next.start_line - current.end_line;
3095
3096        match gap {
3097            1 => BlockSpacing::Consecutive,
3098            2 => BlockSpacing::SingleBlank,
3099            _ if gap > 2 => {
3100                if self.has_only_blank_lines_between(current, next) {
3101                    BlockSpacing::MultipleBlanks
3102                } else {
3103                    BlockSpacing::ContentBetween
3104                }
3105            }
3106            _ => BlockSpacing::Consecutive, // gap == 0, overlapping (shouldn't happen)
3107        }
3108    }
3109
3110    /// Check if unordered lists can be merged with a single blank line between
3111    fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3112        // Check if there are structural separators between the blocks
3113        // If has_meaningful_content_between returns true, it means there are structural separators
3114        if has_meaningful_content_between(current, next, self.lines) {
3115            return false; // Structural separators prevent merging
3116        }
3117
3118        // Only merge unordered lists with same marker across single blank
3119        !current.is_ordered && current.marker == next.marker
3120    }
3121
3122    /// Check if ordered lists can be merged when there's content between them
3123    fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3124        // Do not merge lists if there are structural separators between them
3125        if has_meaningful_content_between(current, next, self.lines) {
3126            return false; // Structural separators prevent merging
3127        }
3128
3129        // Only consider merging ordered lists if there's no structural content between
3130        current.is_ordered && next.is_ordered
3131    }
3132
3133    /// Check if there are only blank lines between blocks
3134    fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3135        for line_num in (current.end_line + 1)..next.start_line {
3136            if let Some(line_info) = self.lines.get(line_num - 1)
3137                && !line_info.content.trim().is_empty()
3138            {
3139                return false;
3140            }
3141        }
3142        true
3143    }
3144
3145    /// Merge two compatible list blocks into one
3146    fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
3147        current.end_line = next.end_line;
3148        current.item_lines.extend_from_slice(&next.item_lines);
3149
3150        // Update max marker width
3151        current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
3152
3153        // Handle marker consistency for unordered lists
3154        if !current.is_ordered && self.markers_differ(&current, next) {
3155            current.marker = None; // Mixed markers
3156        }
3157
3158        current
3159    }
3160
3161    /// Check if two blocks have different markers
3162    fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
3163        current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
3164    }
3165}
3166
/// Types of spacing between list blocks
///
/// Produced by `ListBlockMerger::analyze_spacing_between` from the distance between
/// one block's last line and the next block's first line.
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    Consecutive,    // No gap between blocks (also the fallback for overlapping blocks)
    SingleBlank,    // Exactly one line between blocks (gap of 2)
    MultipleBlanks, // Multiple blank lines but no content
    ContentBetween, // Content exists between blocks
}
3175
3176/// Check if there's meaningful content (not just blank lines) between two list blocks
3177fn has_meaningful_content_between(current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
3178    // Check lines between current.end_line and next.start_line
3179    for line_num in (current.end_line + 1)..next.start_line {
3180        if let Some(line_info) = lines.get(line_num - 1) {
3181            // Convert to 0-indexed
3182            let trimmed = line_info.content.trim();
3183
3184            // Skip empty lines
3185            if trimmed.is_empty() {
3186                continue;
3187            }
3188
3189            // Check for structural separators that should separate lists (CommonMark compliant)
3190
3191            // Headings separate lists
3192            if line_info.heading.is_some() {
3193                return true; // Has meaningful content - headings separate lists
3194            }
3195
3196            // Horizontal rules separate lists (---, ***, ___)
3197            if is_horizontal_rule(trimmed) {
3198                return true; // Has meaningful content - horizontal rules separate lists
3199            }
3200
3201            // Tables separate lists (lines containing | but not in URLs or code)
3202            // Simple heuristic: tables typically have | at start/end or multiple |
3203            if trimmed.contains('|') && trimmed.len() > 1 {
3204                // Don't treat URLs with | as tables
3205                if !trimmed.contains("](") && !trimmed.contains("http") {
3206                    // More robust check: tables usually have multiple | or | at edges
3207                    let pipe_count = trimmed.matches('|').count();
3208                    if pipe_count > 1 || trimmed.starts_with('|') || trimmed.ends_with('|') {
3209                        return true; // Has meaningful content - tables separate lists
3210                    }
3211                }
3212            }
3213
3214            // Blockquotes separate lists
3215            if trimmed.starts_with('>') {
3216                return true; // Has meaningful content - blockquotes separate lists
3217            }
3218
3219            // Code block fences separate lists (unless properly indented as list content)
3220            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
3221                let line_indent = line_info.content.len() - line_info.content.trim_start().len();
3222
3223                // Check if this code block is properly indented as list continuation
3224                let min_continuation_indent = if current.is_ordered {
3225                    current.nesting_level + current.max_marker_width + 1 // +1 for space after marker
3226                } else {
3227                    current.nesting_level + 2
3228                };
3229
3230                if line_indent < min_continuation_indent {
3231                    // This is a standalone code block that separates lists
3232                    return true; // Has meaningful content - standalone code blocks separate lists
3233                }
3234            }
3235
3236            // Check if this line has proper indentation for list continuation
3237            let line_indent = line_info.content.len() - line_info.content.trim_start().len();
3238
3239            // Calculate minimum indentation needed to be list continuation
3240            let min_indent = if current.is_ordered {
3241                current.nesting_level + current.max_marker_width
3242            } else {
3243                current.nesting_level + 2
3244            };
3245
3246            // If the line is not indented enough to be list continuation, it's meaningful content
3247            if line_indent < min_indent {
3248                return true; // Has meaningful content - content not indented as list continuation
3249            }
3250
3251            // If we reach here, the line is properly indented as list continuation
3252            // Continue checking other lines
3253        }
3254    }
3255
3256    // Only blank lines or properly indented list continuation content between blocks
3257    false
3258}
3259
/// Check if a line is a horizontal rule (---, ***, ___)
///
/// `trimmed` is expected to have its surrounding whitespace removed already.
/// The line qualifies when it starts with `-`, `*` or `_`, contains at least
/// three of that same character, and otherwise holds only spaces or tabs
/// (so `- - -` counts, while `--` and `-*-` do not).
fn is_horizontal_rule(trimmed: &str) -> bool {
    // Fewer than three bytes cannot hold three marker characters
    if trimmed.len() < 3 {
        return false;
    }

    // The first character selects the marker; anything else is not a rule
    let marker = match trimmed.chars().next() {
        Some(first @ ('-' | '*' | '_')) => first,
        _ => return false,
    };

    // Walk the characters directly instead of collecting them into a Vec first
    let mut marker_count = 0usize;
    for ch in trimmed.chars() {
        if ch == marker {
            marker_count += 1;
        } else if ch != ' ' && ch != '\t' {
            return false; // Non-matching, non-whitespace character
        }
    }

    marker_count >= 3
}
3283
/// Unit tests for `LintContext`: line offsets, offset-to-position mapping,
/// per-line info, list item detection and MDX ESM block detection
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input has a single line offset and no line entries
    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert!(ctx.lines.is_empty());
    }

    /// A single line without trailing newline maps offsets onto line 1
    #[test]
    fn test_single_line() {
        let ctx = LintContext::new("# Hello", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "# Hello");
        assert_eq!(ctx.line_offsets, vec![0]);
        for (offset, expected) in [(0, (1, 1)), (3, (1, 4))] {
            assert_eq!(ctx.offset_to_line_col(offset), expected);
        }
    }

    /// Offsets across several lines, including a blank one
    #[test]
    fn test_multi_line() {
        let content = "# Title\n\nSecond line\nThird line";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);

        // (byte offset, expected (line, column))
        let cases = [
            (0, (1, 1)),  // start
            (8, (2, 1)),  // start of blank line
            (9, (3, 1)),  // start of 'Second line'
            (15, (3, 7)), // middle of 'Second line'
            (21, (4, 1)), // start of 'Third line'
        ];
        for (offset, expected) in cases {
            assert_eq!(ctx.offset_to_line_col(offset), expected);
        }
    }

    /// Per-line content, byte offset, indent and flags
    #[test]
    fn test_line_info() {
        let content = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        assert_eq!(ctx.lines.len(), 7);

        // Line 1: "# Title"
        let heading_line = &ctx.lines[0];
        assert_eq!(heading_line.content, "# Title");
        assert_eq!(heading_line.byte_offset, 0);
        assert_eq!(heading_line.indent, 0);
        assert!(!heading_line.is_blank);
        assert!(!heading_line.in_code_block);
        assert!(heading_line.list_item.is_none());

        // Line 2: "    indented"
        let indented_line = &ctx.lines[1];
        assert_eq!(indented_line.content, "    indented");
        assert_eq!(indented_line.byte_offset, 8);
        assert_eq!(indented_line.indent, 4);
        assert!(!indented_line.is_blank);

        // Line 3: "" (blank)
        let blank_line = &ctx.lines[2];
        assert_eq!(blank_line.content, "");
        assert!(blank_line.is_blank);

        // Helper methods take 1-based line numbers
        for (line, offset) in [(1, 0), (2, 8)] {
            assert_eq!(ctx.line_to_byte_offset(line), Some(offset));
        }
        for (line, indent) in [(1, 0), (2, 4)] {
            assert_eq!(ctx.line_info(line).map(|l| l.indent), Some(indent));
        }
    }

    /// Unordered and ordered markers are recognised; plain text is not a list item
    #[test]
    fn test_list_item_detection() {
        let content = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        // Line 1: "- Unordered item"
        assert!(ctx.lines[0].list_item.is_some());
        let dash_item = ctx.lines[0].list_item.as_ref().unwrap();
        assert_eq!(dash_item.marker, "-");
        assert!(!dash_item.is_ordered);
        assert_eq!(dash_item.marker_column, 0);
        assert_eq!(dash_item.content_column, 2);

        // Line 2: "  * Nested item"
        assert!(ctx.lines[1].list_item.is_some());
        let star_item = ctx.lines[1].list_item.as_ref().unwrap();
        assert_eq!(star_item.marker, "*");
        assert_eq!(star_item.marker_column, 2);

        // Line 3: "1. Ordered item"
        assert!(ctx.lines[2].list_item.is_some());
        let ordered_item = ctx.lines[2].list_item.as_ref().unwrap();
        assert_eq!(ordered_item.marker, "1.");
        assert!(ordered_item.is_ordered);
        assert_eq!(ordered_item.number, Some(1));

        // Line 6: "Not a list"
        assert!(ctx.lines[5].list_item.is_none());
    }

    /// Offsets at line starts, after the last character of a line, and at end of input
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let content = "a\nb\nc";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        // line_offsets: [0, 2, 4]
        let cases = [
            (0, (1, 1)), // 'a'
            (1, (1, 2)), // after 'a'
            (2, (2, 1)), // 'b'
            (3, (2, 2)), // after 'b'
            (4, (3, 1)), // 'c'
            (5, (3, 2)), // after 'c'
        ];
        for (offset, expected) in cases {
            assert_eq!(ctx.offset_to_line_col(offset), expected);
        }
    }

    /// In MDX, leading import/export lines are flagged as ESM blocks; later lines are not
    #[test]
    fn test_mdx_esm_blocks() {
        let content = r##"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall

In {year}, the snowfall was above average.
It was followed by a warm spring which caused
flood conditions in many of the nearby rivers.

<Chart color="#fcb32c" year={year} />
"##;

        let ctx = LintContext::new(content, MarkdownFlavor::MDX);

        assert_eq!(ctx.lines.len(), 10);

        // Lines 1 and 2 are the ESM block
        let inside_esm = [
            (0, "Line 1 (import) should be in_esm_block"),
            (1, "Line 2 (export) should be in_esm_block"),
        ];
        for (idx, message) in inside_esm {
            assert!(ctx.lines[idx].in_esm_block, "{message}");
        }

        // Everything checked after that lies outside the ESM block
        let outside_esm = [
            (2, "Line 3 (blank) should NOT be in_esm_block"),
            (3, "Line 4 (heading) should NOT be in_esm_block"),
            (4, "Line 5 (blank) should NOT be in_esm_block"),
            (5, "Line 6 (text) should NOT be in_esm_block"),
        ];
        for (idx, message) in outside_esm {
            assert!(!ctx.lines[idx].in_esm_block, "{message}");
        }
    }

    /// The same import/export lines are ordinary text in the Standard flavor
    #[test]
    fn test_mdx_esm_blocks_not_detected_in_standard_flavor() {
        let content = r#"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall
"#;

        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        // ESM blocks should NOT be detected in Standard flavor
        let outside_esm = [
            (0, "Line 1 should NOT be in_esm_block in Standard flavor"),
            (1, "Line 2 should NOT be in_esm_block in Standard flavor"),
        ];
        for (idx, message) in outside_esm {
            assert!(!ctx.lines[idx].in_esm_block, "{message}");
        }
    }
}
rumdl_lib/lint_context.rs

rumdl_lib/
lint_context.rs