rumdl_lib/
lint_context.rs

1use crate::config::MarkdownFlavor;
2use crate::rules::front_matter_utils::FrontMatterUtils;
3use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
4use pulldown_cmark::{BrokenLink, Event, LinkType, Options, Parser, Tag, TagEnd};
5use regex::Regex;
6use std::borrow::Cow;
7use std::sync::LazyLock;
8
/// Run `$code` and evaluate to its result, optionally reporting the elapsed time.
///
/// Native (non-WASM) variant: the block is always timed, and when `$profile`
/// is true a `[PROFILE] <name>: <elapsed>` line is written to stderr.
#[cfg(not(target_arch = "wasm32"))]
macro_rules! profile_section {
    ($name:expr, $profile:expr, $code:expr) => {{
        let start = std::time::Instant::now();
        let result = $code;
        if $profile {
            eprintln!("[PROFILE] {}: {:?}", $name, start.elapsed());
        }
        result
    }};
}

/// WASM variant: no timing and no output; simply evaluates `$code`.
///
/// The label and flag are still referenced so that call sites which compute a
/// `profile` flag do not end up with an unused variable on wasm32 builds.
#[cfg(target_arch = "wasm32")]
macro_rules! profile_section {
    ($name:expr, $profile:expr, $code:expr) => {{
        let _ = (&$name, &$profile);
        $code
    }};
}
26
// Comprehensive link pattern that captures both inline links `[text](url "title")`
// and reference links `[text][ref]`.
// Flags: (?s) makes `.` match newlines; (?x) is verbose mode, which is why the
// pattern can carry whitespace and `#` comments inline.
// Capture groups: 1 = link text, 2 = angle-bracket URL, 3 = bare URL,
// 4 = double-quoted title, 5 = single-quoted title, 6 = reference ID.
static LINK_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
        \[((?:[^\[\]\\]|\\.)*)\]          # Link text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
        (?:
            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
            |
            \[([^\]]*)\]      # Reference ID in group 6
        )"#
    ).unwrap()
});
40
// Image pattern: same shape as the link pattern but anchored on a leading `!`,
// covering inline images `![alt](url "title")` and reference images `![alt][ref]`.
// Flags: (?s) makes `.` match newlines; (?x) is verbose mode (inline whitespace
// and `#` comments inside the pattern are ignored).
// Capture groups: 1 = alt text, 2 = angle-bracket URL, 3 = bare URL,
// 4 = double-quoted title, 5 = single-quoted title, 6 = reference ID.
static IMAGE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
        !\[((?:[^\[\]\\]|\\.)*)\]         # Alt text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
        (?:
            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
            |
            \[([^\]]*)\]      # Reference ID in group 6
        )"#
    ).unwrap()
});
54
// Reference definition pattern: `[id]: url "title"` at the start of a line,
// allowing up to three leading spaces. (?m) makes `^`/`$` match at line boundaries.
// Capture groups: 1 = reference ID, 2 = URL (a run of non-whitespace),
// 3 = double-quoted title, 4 = single-quoted title.
static REF_DEF_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#).unwrap());
58
// Pattern for bare URLs with an http, https or ftp scheme (captured in group 1).
// The URL body excludes whitespace and the characters < > [ ] ( ) \ ' " `,
// followed by an optional port, path, query string and fragment.
static BARE_URL_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
    ).unwrap()
});
65
// Pattern for email addresses: an ASCII local part, `@`, a domain, and a
// top-level domain of at least two ASCII letters. No capture groups.
static BARE_EMAIL_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap());
69
// Pattern for blockquote prefix in parse_list_blocks.
// Group 1 captures the whole prefix at the start of the text: optional leading
// whitespace, one or more `>` markers, and any whitespace that follows them.
static BLOCKQUOTE_PREFIX_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*>+\s*)").unwrap());
72
73/// Pre-computed information about a line
74#[derive(Debug, Clone)]
75pub struct LineInfo {
76    /// Byte offset where this line starts in the document
77    pub byte_offset: usize,
78    /// Length of the line in bytes (without newline)
79    pub byte_len: usize,
80    /// Number of leading spaces/tabs
81    pub indent: usize,
82    /// Whether the line is blank (empty or only whitespace)
83    pub is_blank: bool,
84    /// Whether this line is inside a code block
85    pub in_code_block: bool,
86    /// Whether this line is inside front matter
87    pub in_front_matter: bool,
88    /// Whether this line is inside an HTML block
89    pub in_html_block: bool,
90    /// Whether this line is inside an HTML comment
91    pub in_html_comment: bool,
92    /// List item information if this line starts a list item
93    pub list_item: Option<ListItemInfo>,
94    /// Heading information if this line is a heading
95    pub heading: Option<HeadingInfo>,
96    /// Blockquote information if this line is a blockquote
97    pub blockquote: Option<BlockquoteInfo>,
98    /// Whether this line is inside a mkdocstrings autodoc block
99    pub in_mkdocstrings: bool,
100    /// Whether this line is part of an ESM import/export block (MDX only)
101    pub in_esm_block: bool,
102}
103
104impl LineInfo {
105    /// Get the line content as a string slice from the source document
106    pub fn content<'a>(&self, source: &'a str) -> &'a str {
107        &source[self.byte_offset..self.byte_offset + self.byte_len]
108    }
109}
110
/// Details of a single list item marker found on a line.
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// Marker text: `*`, `-`, `+`, or a number followed by `.` or `)`.
    pub marker: String,
    /// `true` for ordered items, `false` for unordered ones.
    pub is_ordered: bool,
    /// Item number, for ordered lists.
    pub number: Option<usize>,
    /// 0-based column at which the marker begins.
    pub marker_column: usize,
    /// Column at which the content following the marker begins.
    pub content_column: usize,
}
125
/// Syntactic style a heading was written in.
///
/// Equality is total for this fieldless enum, so `Eq` is derived alongside
/// `PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum HeadingStyle {
    /// ATX style heading (`# Heading`)
    ATX,
    /// Setext style heading with `=` underline
    Setext1,
    /// Setext style heading with `-` underline
    Setext2,
}
136
137/// Parsed link information
138#[derive(Debug, Clone)]
139pub struct ParsedLink<'a> {
140    /// Line number (1-indexed)
141    pub line: usize,
142    /// Start column (0-indexed) in the line
143    pub start_col: usize,
144    /// End column (0-indexed) in the line
145    pub end_col: usize,
146    /// Byte offset in document
147    pub byte_offset: usize,
148    /// End byte offset in document
149    pub byte_end: usize,
150    /// Link text
151    pub text: Cow<'a, str>,
152    /// Link URL or reference
153    pub url: Cow<'a, str>,
154    /// Whether this is a reference link [text][ref] vs inline [text](url)
155    pub is_reference: bool,
156    /// Reference ID for reference links
157    pub reference_id: Option<Cow<'a, str>>,
158    /// Link type from pulldown-cmark
159    pub link_type: LinkType,
160}
161
/// A reference that pulldown-cmark reported as broken.
#[derive(Debug, Clone)]
pub struct BrokenLinkInfo {
    /// Reference text that could not be resolved.
    pub reference: String,
    /// Byte range of the broken link within the source document.
    pub span: std::ops::Range<usize>,
}
170
171/// Parsed image information
172#[derive(Debug, Clone)]
173pub struct ParsedImage<'a> {
174    /// Line number (1-indexed)
175    pub line: usize,
176    /// Start column (0-indexed) in the line
177    pub start_col: usize,
178    /// End column (0-indexed) in the line
179    pub end_col: usize,
180    /// Byte offset in document
181    pub byte_offset: usize,
182    /// End byte offset in document
183    pub byte_end: usize,
184    /// Alt text
185    pub alt_text: Cow<'a, str>,
186    /// Image URL or reference
187    pub url: Cow<'a, str>,
188    /// Whether this is a reference image ![alt][ref] vs inline ![alt](url)
189    pub is_reference: bool,
190    /// Reference ID for reference images
191    pub reference_id: Option<Cow<'a, str>>,
192    /// Link type from pulldown-cmark
193    pub link_type: LinkType,
194}
195
/// A reference definition of the form `[ref]: url "title"`.
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// 1-indexed line number.
    pub line: usize,
    /// Reference ID, normalized to lowercase.
    pub id: String,
    /// Target URL.
    pub url: String,
    /// Title, when one was given.
    pub title: Option<String>,
    /// Byte offset at which the definition starts.
    pub byte_offset: usize,
    /// Byte offset at which the definition ends.
    pub byte_end: usize,
}
212
/// An inline code span located in the document.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// 1-indexed line number.
    pub line: usize,
    /// 0-indexed start column within the line.
    pub start_col: usize,
    /// 0-indexed end column within the line.
    pub end_col: usize,
    /// Byte offset in the document.
    pub byte_offset: usize,
    /// End byte offset in the document.
    pub byte_end: usize,
    /// How many backticks delimit the span (1, 2, 3, ...).
    pub backtick_count: usize,
    /// Text between the backticks, delimiters excluded.
    pub content: String,
}
231
232/// Information about a heading
233#[derive(Debug, Clone)]
234pub struct HeadingInfo {
235    /// Heading level (1-6 for ATX, 1-2 for Setext)
236    pub level: u8,
237    /// Style of heading
238    pub style: HeadingStyle,
239    /// The heading marker (# characters or underline)
240    pub marker: String,
241    /// Column where the marker starts (0-based)
242    pub marker_column: usize,
243    /// Column where heading text starts
244    pub content_column: usize,
245    /// The heading text (without markers and without custom ID syntax)
246    pub text: String,
247    /// Custom header ID if present (e.g., from {#custom-id} syntax)
248    pub custom_id: Option<String>,
249    /// Original heading text including custom ID syntax
250    pub raw_text: String,
251    /// Whether it has a closing sequence (for ATX)
252    pub has_closing_sequence: bool,
253    /// The closing sequence if present
254    pub closing_sequence: String,
255}
256
/// Details of a blockquote line.
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Nesting depth: 1 for `>`, 2 for `>>`, and so on.
    pub nesting_level: usize,
    /// Indentation that precedes the blockquote marker.
    pub indent: String,
    /// 0-based column of the first `>`.
    pub marker_column: usize,
    /// The blockquote prefix, e.g. `"> "` or `">> "`.
    pub prefix: String,
    /// Text that follows the blockquote marker(s).
    pub content: String,
    /// True when no space follows the marker.
    pub has_no_space_after_marker: bool,
    /// True when more than one space follows the marker.
    pub has_multiple_spaces_after_marker: bool,
    /// True when this is an empty blockquote line needing the MD028 fix.
    pub needs_md028_fix: bool,
}
277
/// A contiguous list in the document and the item lines it contains.
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// 1-indexed line on which the list starts.
    pub start_line: usize,
    /// 1-indexed line on which the list ends.
    pub end_line: usize,
    /// `true` for ordered lists, `false` for unordered ones.
    pub is_ordered: bool,
    /// The consistent marker for unordered lists, if there is one.
    pub marker: Option<String>,
    /// Blockquote prefix of the list; empty when the list is not in a blockquote.
    pub blockquote_prefix: String,
    /// Lines within this block that are list items.
    pub item_lines: Vec<usize>,
    /// Nesting level; 0 means a top-level list.
    pub nesting_level: usize,
    /// Widest marker seen in this block (e.g. 3 for `"1. "`, 4 for `"10. "`).
    pub max_marker_width: usize,
}
298
299use std::sync::{Arc, Mutex};
300
/// Counts of Markdown-significant characters, used for cheap content checks.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Number of `#` characters (headings).
    pub hash_count: usize,
    /// Number of `*` characters (emphasis, lists, horizontal rules).
    pub asterisk_count: usize,
    /// Number of `_` characters (emphasis, horizontal rules).
    pub underscore_count: usize,
    /// Number of `-` characters (lists, horizontal rules, setext headings).
    pub hyphen_count: usize,
    /// Number of `+` characters (lists).
    pub plus_count: usize,
    /// Number of `>` characters (blockquotes).
    pub gt_count: usize,
    /// Number of `|` characters (tables).
    pub pipe_count: usize,
    /// Number of `[` characters (links, images).
    pub bracket_count: usize,
    /// Number of backtick characters (code spans, code blocks).
    pub backtick_count: usize,
    /// Number of `<` characters (HTML tags, autolinks).
    pub lt_count: usize,
    /// Number of `!` characters (images).
    pub exclamation_count: usize,
    /// Number of newline characters.
    pub newline_count: usize,
}
329
/// An HTML tag located in the document.
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// 1-indexed line number.
    pub line: usize,
    /// 0-indexed start column within the line.
    pub start_col: usize,
    /// 0-indexed end column within the line.
    pub end_col: usize,
    /// Byte offset in the document.
    pub byte_offset: usize,
    /// End byte offset in the document.
    pub byte_end: usize,
    /// Tag name, e.g. `"div"`, `"img"`, `"br"`.
    pub tag_name: String,
    /// True for a closing tag (`</tag>`).
    pub is_closing: bool,
    /// True for a self-closing tag (`<tag />`).
    pub is_self_closing: bool,
    /// The raw tag content.
    pub raw_content: String,
}
352
/// An emphasis span located in the document.
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// 1-indexed line number.
    pub line: usize,
    /// 0-indexed start column within the line.
    pub start_col: usize,
    /// 0-indexed end column within the line.
    pub end_col: usize,
    /// Byte offset in the document.
    pub byte_offset: usize,
    /// End byte offset in the document.
    pub byte_end: usize,
    /// Emphasis character: `'*'` or `'_'`.
    pub marker: char,
    /// Marker repetitions: 1 for italic, 2 for bold, 3+ for bold+italic.
    pub marker_count: usize,
    /// Text enclosed by the emphasis markers.
    pub content: String,
}
373
/// A table row located in the document.
#[derive(Debug, Clone)]
pub struct TableRow {
    /// 1-indexed line number.
    pub line: usize,
    /// True for a separator row (only `|`, `-`, `:` and spaces).
    pub is_separator: bool,
    /// Number of pipe-separated cells.
    pub column_count: usize,
    /// Alignment taken from the separator row: "left", "center", "right" or "none".
    pub column_alignments: Vec<String>,
}
386
/// A URL that appears in the text outside of link syntax.
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// 1-indexed line number.
    pub line: usize,
    /// 0-indexed start column within the line.
    pub start_col: usize,
    /// 0-indexed end column within the line.
    pub end_col: usize,
    /// Byte offset in the document.
    pub byte_offset: usize,
    /// End byte offset in the document.
    pub byte_end: usize,
    /// The URL text.
    pub url: String,
    /// Kind of URL: "http", "https", "ftp" or "email".
    pub url_type: String,
}
405
406pub struct LintContext<'a> {
407    pub content: &'a str,
408    pub line_offsets: Vec<usize>,
409    pub code_blocks: Vec<(usize, usize)>, // Cached code block ranges (not including inline code spans)
410    pub lines: Vec<LineInfo>,             // Pre-computed line information
411    pub links: Vec<ParsedLink<'a>>,       // Pre-parsed links
412    pub images: Vec<ParsedImage<'a>>,     // Pre-parsed images
413    pub broken_links: Vec<BrokenLinkInfo>, // Broken/undefined references
414    pub reference_defs: Vec<ReferenceDef>, // Reference definitions
415    code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>, // Lazy-loaded inline code spans
416    pub list_blocks: Vec<ListBlock>,      // Pre-parsed list blocks
417    pub char_frequency: CharFrequency,    // Character frequency analysis
418    html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>, // Lazy-loaded HTML tags
419    emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>, // Lazy-loaded emphasis spans
420    table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>, // Lazy-loaded table rows
421    bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>, // Lazy-loaded bare URLs
422    html_comment_ranges: Vec<crate::utils::skip_context::ByteRange>, // Pre-computed HTML comment ranges
423    pub table_blocks: Vec<crate::utils::table_utils::TableBlock>, // Pre-computed table blocks
424    pub line_index: crate::utils::range_utils::LineIndex<'a>, // Pre-computed line index for byte position calculations
425    jinja_ranges: Vec<(usize, usize)>,    // Pre-computed Jinja template ranges ({{ }}, {% %})
426    pub flavor: MarkdownFlavor,           // Markdown flavor being used
427}
428
/// The four consecutive pieces of a blockquote line, borrowed from the input.
struct BlockquoteComponents<'a> {
    /// Spaces/tabs before the first `>`.
    indent: &'a str,
    /// The unbroken run of `>` characters.
    markers: &'a str,
    /// Spaces/tabs directly after the markers.
    spaces_after: &'a str,
    /// Everything that remains on the line.
    content: &'a str,
}

/// Split a line into blockquote components without using a regex.
///
/// Returns `None` unless the first character after any leading spaces/tabs is
/// `>`. Only one contiguous run of `>` is consumed as markers, so for `> > x`
/// the second `>` ends up in `content`.
#[inline]
fn parse_blockquote_detailed(line: &str) -> Option<BlockquoteComponents<'_>> {
    const BLANKS: [char; 2] = [' ', '\t'];

    // Peel off the leading indentation.
    let after_indent = line.trim_start_matches(BLANKS);
    let (indent, _) = line.split_at(line.len() - after_indent.len());

    // A blockquote needs at least one '>' right after the indentation.
    if !after_indent.starts_with('>') {
        return None;
    }

    // Peel off the run of '>' markers.
    let after_markers = after_indent.trim_start_matches('>');
    let (markers, _) = after_indent.split_at(after_indent.len() - after_markers.len());

    // Peel off the whitespace between the markers and the content.
    let content = after_markers.trim_start_matches(BLANKS);
    let (spaces_after, _) = after_markers.split_at(after_markers.len() - content.len());

    Some(BlockquoteComponents {
        indent,
        markers,
        spaces_after,
        content,
    })
}
473
474impl<'a> LintContext<'a> {
    /// Build a context for `content`, running every up-front analysis pass once.
    ///
    /// The passes run in a fixed order because later ones consume earlier results:
    /// line offsets, code blocks and HTML comment ranges feed the basic line info;
    /// HTML and ESM block detection then update `lines` before headings and
    /// blockquotes are detected; code spans are parsed before links and images so
    /// they can be passed to those parsers.
    ///
    /// On non-WASM targets, setting the `RUMDL_PROFILE_QUADRATIC` environment
    /// variable (to any value) makes each pass report its duration on stderr.
    pub fn new(content: &'a str, flavor: MarkdownFlavor) -> Self {
        // Profiling is opt-in via the environment and always off on WASM.
        #[cfg(not(target_arch = "wasm32"))]
        let profile = std::env::var("RUMDL_PROFILE_QUADRATIC").is_ok();
        #[cfg(target_arch = "wasm32")]
        let profile = false;

        // Offsets of line starts: 0, plus the byte right after every '\n'.
        let line_offsets = profile_section!("Line offsets", profile, {
            let mut offsets = vec![0];
            for (i, c) in content.char_indices() {
                if c == '\n' {
                    offsets.push(i + 1);
                }
            }
            offsets
        });

        // Detect code blocks once and cache them
        let code_blocks = profile_section!("Code blocks", profile, CodeBlockUtils::detect_code_blocks(content));

        // Pre-compute HTML comment ranges ONCE for all operations
        let html_comment_ranges = profile_section!(
            "HTML comment ranges",
            profile,
            crate::utils::skip_context::compute_html_comment_ranges(content)
        );

        // Pre-compute autodoc block ranges for MkDocs flavor (avoids O(n²) scaling);
        // every other flavor gets an empty list.
        let autodoc_ranges = profile_section!("Autodoc block ranges", profile, {
            if flavor == MarkdownFlavor::MkDocs {
                crate::utils::mkdocstrings_refs::detect_autodoc_block_ranges(content)
            } else {
                Vec::new()
            }
        });

        // Pre-compute line information (without headings/blockquotes yet)
        let mut lines = profile_section!(
            "Basic line info",
            profile,
            Self::compute_basic_line_info(
                content,
                &line_offsets,
                &code_blocks,
                flavor,
                &html_comment_ranges,
                &autodoc_ranges,
            )
        );

        // Detect HTML blocks BEFORE heading detection
        profile_section!("HTML blocks", profile, Self::detect_html_blocks(content, &mut lines));

        // Detect ESM import/export blocks in MDX files BEFORE heading detection
        profile_section!(
            "ESM blocks",
            profile,
            Self::detect_esm_blocks(content, &mut lines, flavor)
        );

        // Now detect headings and blockquotes
        profile_section!(
            "Headings & blockquotes",
            profile,
            Self::detect_headings_and_blockquotes(content, &mut lines, flavor, &html_comment_ranges)
        );

        // Parse code spans early so we can exclude them from link/image parsing
        let code_spans = profile_section!("Code spans", profile, Self::parse_code_spans(content, &lines));

        // Parse links, images, references, and list blocks
        let (links, broken_links) = profile_section!(
            "Links",
            profile,
            Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor, &html_comment_ranges)
        );

        let images = profile_section!(
            "Images",
            profile,
            Self::parse_images(content, &lines, &code_blocks, &code_spans, &html_comment_ranges)
        );

        let reference_defs = profile_section!("Reference defs", profile, Self::parse_reference_defs(content, &lines));

        let list_blocks = profile_section!("List blocks", profile, Self::parse_list_blocks(content, &lines));

        // Compute character frequency for fast content analysis
        let char_frequency = profile_section!("Char frequency", profile, Self::compute_char_frequency(content));

        // Pre-compute table blocks for rules that need them (MD013, MD055, MD056, MD058, MD060)
        let table_blocks = profile_section!(
            "Table blocks",
            profile,
            crate::utils::table_utils::TableUtils::find_table_blocks_with_code_info(
                content,
                &code_blocks,
                &code_spans,
                &html_comment_ranges,
            )
        );

        // Pre-compute LineIndex once for all rules (eliminates 46x content cloning)
        let line_index = profile_section!(
            "Line index",
            profile,
            crate::utils::range_utils::LineIndex::new(content)
        );

        // Pre-compute Jinja template ranges once for all rules (eliminates O(n×m) in MD011)
        let jinja_ranges = profile_section!(
            "Jinja ranges",
            profile,
            crate::utils::jinja_utils::find_jinja_ranges(content)
        );

        Self {
            content,
            line_offsets,
            code_blocks,
            lines,
            links,
            images,
            broken_links,
            reference_defs,
            // Code spans were already parsed above, so their cache starts out filled.
            code_spans_cache: Mutex::new(Some(Arc::new(code_spans))),
            list_blocks,
            char_frequency,
            // The remaining caches start empty and are filled on first access.
            html_tags_cache: Mutex::new(None),
            emphasis_spans_cache: Mutex::new(None),
            table_rows_cache: Mutex::new(None),
            bare_urls_cache: Mutex::new(None),
            html_comment_ranges,
            table_blocks,
            line_index,
            jinja_ranges,
            flavor,
        }
    }
613
614    /// Get code spans - computed lazily on first access
615    pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
616        let mut cache = self.code_spans_cache.lock().expect("Code spans cache mutex poisoned");
617
618        Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_code_spans(self.content, &self.lines))))
619    }
620
621    /// Get HTML comment ranges - pre-computed during LintContext construction
622    pub fn html_comment_ranges(&self) -> &[crate::utils::skip_context::ByteRange] {
623        &self.html_comment_ranges
624    }
625
626    /// Get HTML tags - computed lazily on first access
627    pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
628        let mut cache = self.html_tags_cache.lock().expect("HTML tags cache mutex poisoned");
629
630        Arc::clone(cache.get_or_insert_with(|| {
631            Arc::new(Self::parse_html_tags(
632                self.content,
633                &self.lines,
634                &self.code_blocks,
635                self.flavor,
636            ))
637        }))
638    }
639
640    /// Get emphasis spans - computed lazily on first access
641    pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
642        let mut cache = self
643            .emphasis_spans_cache
644            .lock()
645            .expect("Emphasis spans cache mutex poisoned");
646
647        Arc::clone(
648            cache.get_or_insert_with(|| {
649                Arc::new(Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks))
650            }),
651        )
652    }
653
654    /// Get table rows - computed lazily on first access
655    pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
656        let mut cache = self.table_rows_cache.lock().expect("Table rows cache mutex poisoned");
657
658        Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_table_rows(self.content, &self.lines))))
659    }
660
661    /// Get bare URLs - computed lazily on first access
662    pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
663        let mut cache = self.bare_urls_cache.lock().expect("Bare URLs cache mutex poisoned");
664
665        Arc::clone(
666            cache.get_or_insert_with(|| Arc::new(Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks))),
667        )
668    }
669
670    /// Map a byte offset to (line, column)
671    pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
672        match self.line_offsets.binary_search(&offset) {
673            Ok(line) => (line + 1, 1),
674            Err(line) => {
675                let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
676                (line, offset - line_start + 1)
677            }
678        }
679    }
680
681    /// Check if a position is within a code block or code span
682    pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
683        // Check code blocks first
684        if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
685            return true;
686        }
687
688        // Check inline code spans (lazy load if needed)
689        self.code_spans()
690            .iter()
691            .any(|span| pos >= span.byte_offset && pos < span.byte_end)
692    }
693
694    /// Get line information by line number (1-indexed)
695    pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
696        if line_num > 0 {
697            self.lines.get(line_num - 1)
698        } else {
699            None
700        }
701    }
702
703    /// Get byte offset for a line number (1-indexed)
704    pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
705        self.line_info(line_num).map(|info| info.byte_offset)
706    }
707
708    /// Get URL for a reference link/image by its ID
709    pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
710        let normalized_id = ref_id.to_lowercase();
711        self.reference_defs
712            .iter()
713            .find(|def| def.id == normalized_id)
714            .map(|def| def.url.as_str())
715    }
716
717    /// Check if a line is part of a list block
718    pub fn is_in_list_block(&self, line_num: usize) -> bool {
719        self.list_blocks
720            .iter()
721            .any(|block| line_num >= block.start_line && line_num <= block.end_line)
722    }
723
724    /// Get the list block containing a specific line
725    pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
726        self.list_blocks
727            .iter()
728            .find(|block| line_num >= block.start_line && line_num <= block.end_line)
729    }
730
731    // Compatibility methods for DocumentStructure migration
732
733    /// Check if a line is within a code block
734    pub fn is_in_code_block(&self, line_num: usize) -> bool {
735        if line_num == 0 || line_num > self.lines.len() {
736            return false;
737        }
738        self.lines[line_num - 1].in_code_block
739    }
740
741    /// Check if a line is within front matter
742    pub fn is_in_front_matter(&self, line_num: usize) -> bool {
743        if line_num == 0 || line_num > self.lines.len() {
744            return false;
745        }
746        self.lines[line_num - 1].in_front_matter
747    }
748
749    /// Check if a line is within an HTML block
750    pub fn is_in_html_block(&self, line_num: usize) -> bool {
751        if line_num == 0 || line_num > self.lines.len() {
752            return false;
753        }
754        self.lines[line_num - 1].in_html_block
755    }
756
757    /// Check if a line and column is within a code span
758    pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
759        if line_num == 0 || line_num > self.lines.len() {
760            return false;
761        }
762
763        // Use the code spans cache to check
764        // Note: col is 1-indexed from caller, but span.start_col and span.end_col are 0-indexed
765        // Convert col to 0-indexed for comparison
766        let col_0indexed = if col > 0 { col - 1 } else { 0 };
767        let code_spans = self.code_spans();
768        code_spans
769            .iter()
770            .any(|span| span.line == line_num && col_0indexed >= span.start_col && col_0indexed < span.end_col)
771    }
772
773    /// Check if a byte position is within a reference definition
774    /// This is much faster than scanning the content with regex for each check (O(1) vs O(n))
775    #[inline]
776    pub fn is_in_reference_def(&self, byte_pos: usize) -> bool {
777        self.reference_defs
778            .iter()
779            .any(|ref_def| byte_pos >= ref_def.byte_offset && byte_pos < ref_def.byte_end)
780    }
781
782    /// Check if a byte position is within an HTML comment
783    /// This is much faster than scanning the content with regex for each check (O(k) vs O(n))
784    /// where k is the number of HTML comments (typically very small)
785    #[inline]
786    pub fn is_in_html_comment(&self, byte_pos: usize) -> bool {
787        self.html_comment_ranges
788            .iter()
789            .any(|range| byte_pos >= range.start && byte_pos < range.end)
790    }
791
792    /// Check if a byte position is within a Jinja template ({{ }} or {% %})
793    pub fn is_in_jinja_range(&self, byte_pos: usize) -> bool {
794        self.jinja_ranges
795            .iter()
796            .any(|(start, end)| byte_pos >= *start && byte_pos < *end)
797    }
798
799    /// Check if content has any instances of a specific character (fast)
800    pub fn has_char(&self, ch: char) -> bool {
801        match ch {
802            '#' => self.char_frequency.hash_count > 0,
803            '*' => self.char_frequency.asterisk_count > 0,
804            '_' => self.char_frequency.underscore_count > 0,
805            '-' => self.char_frequency.hyphen_count > 0,
806            '+' => self.char_frequency.plus_count > 0,
807            '>' => self.char_frequency.gt_count > 0,
808            '|' => self.char_frequency.pipe_count > 0,
809            '[' => self.char_frequency.bracket_count > 0,
810            '`' => self.char_frequency.backtick_count > 0,
811            '<' => self.char_frequency.lt_count > 0,
812            '!' => self.char_frequency.exclamation_count > 0,
813            '\n' => self.char_frequency.newline_count > 0,
814            _ => self.content.contains(ch), // Fallback for other characters
815        }
816    }
817
818    /// Get count of a specific character (fast)
819    pub fn char_count(&self, ch: char) -> usize {
820        match ch {
821            '#' => self.char_frequency.hash_count,
822            '*' => self.char_frequency.asterisk_count,
823            '_' => self.char_frequency.underscore_count,
824            '-' => self.char_frequency.hyphen_count,
825            '+' => self.char_frequency.plus_count,
826            '>' => self.char_frequency.gt_count,
827            '|' => self.char_frequency.pipe_count,
828            '[' => self.char_frequency.bracket_count,
829            '`' => self.char_frequency.backtick_count,
830            '<' => self.char_frequency.lt_count,
831            '!' => self.char_frequency.exclamation_count,
832            '\n' => self.char_frequency.newline_count,
833            _ => self.content.matches(ch).count(), // Fallback for other characters
834        }
835    }
836
837    /// Check if content likely contains headings (fast)
838    pub fn likely_has_headings(&self) -> bool {
839        self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 // Potential setext underlines
840    }
841
842    /// Check if content likely contains lists (fast)
843    pub fn likely_has_lists(&self) -> bool {
844        self.char_frequency.asterisk_count > 0
845            || self.char_frequency.hyphen_count > 0
846            || self.char_frequency.plus_count > 0
847    }
848
849    /// Check if content likely contains emphasis (fast)
850    pub fn likely_has_emphasis(&self) -> bool {
851        self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
852    }
853
854    /// Check if content likely contains tables (fast)
855    pub fn likely_has_tables(&self) -> bool {
856        self.char_frequency.pipe_count > 2
857    }
858
859    /// Check if content likely contains blockquotes (fast)
860    pub fn likely_has_blockquotes(&self) -> bool {
861        self.char_frequency.gt_count > 0
862    }
863
864    /// Check if content likely contains code (fast)
865    pub fn likely_has_code(&self) -> bool {
866        self.char_frequency.backtick_count > 0
867    }
868
869    /// Check if content likely contains links or images (fast)
870    pub fn likely_has_links_or_images(&self) -> bool {
871        self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
872    }
873
874    /// Check if content likely contains HTML (fast)
875    pub fn likely_has_html(&self) -> bool {
876        self.char_frequency.lt_count > 0
877    }
878
879    /// Get HTML tags on a specific line
880    pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
881        self.html_tags()
882            .iter()
883            .filter(|tag| tag.line == line_num)
884            .cloned()
885            .collect()
886    }
887
888    /// Get emphasis spans on a specific line
889    pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
890        self.emphasis_spans()
891            .iter()
892            .filter(|span| span.line == line_num)
893            .cloned()
894            .collect()
895    }
896
897    /// Get table rows on a specific line
898    pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
899        self.table_rows()
900            .iter()
901            .filter(|row| row.line == line_num)
902            .cloned()
903            .collect()
904    }
905
906    /// Get bare URLs on a specific line
907    pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
908        self.bare_urls()
909            .iter()
910            .filter(|url| url.line == line_num)
911            .cloned()
912            .collect()
913    }
914
915    /// Find the line index for a given byte offset using binary search.
916    /// Returns (line_index, line_number, column) where:
917    /// - line_index is the 0-based index in the lines array
918    /// - line_number is the 1-based line number
919    /// - column is the byte offset within that line
920    #[inline]
921    fn find_line_for_offset(lines: &[LineInfo], byte_offset: usize) -> (usize, usize, usize) {
922        // Binary search to find the line containing this byte offset
923        let idx = match lines.binary_search_by(|line| {
924            if byte_offset < line.byte_offset {
925                std::cmp::Ordering::Greater
926            } else if byte_offset > line.byte_offset + line.byte_len {
927                std::cmp::Ordering::Less
928            } else {
929                std::cmp::Ordering::Equal
930            }
931        }) {
932            Ok(idx) => idx,
933            Err(idx) => idx.saturating_sub(1),
934        };
935
936        let line = &lines[idx];
937        let line_num = idx + 1;
938        let col = byte_offset.saturating_sub(line.byte_offset);
939
940        (idx, line_num, col)
941    }
942
943    /// Check if a byte offset is within a code span using binary search
944    #[inline]
945    fn is_offset_in_code_span(code_spans: &[CodeSpan], offset: usize) -> bool {
946        // Since spans are sorted by byte_offset, use partition_point for binary search
947        let idx = code_spans.partition_point(|span| span.byte_offset <= offset);
948
949        // Check the span that starts at or before our offset
950        if idx > 0 {
951            let span = &code_spans[idx - 1];
952            if offset >= span.byte_offset && offset < span.byte_end {
953                return true;
954            }
955        }
956
957        false
958    }
959
960    /// Parse all links in the content
961    fn parse_links(
962        content: &'a str,
963        lines: &[LineInfo],
964        code_blocks: &[(usize, usize)],
965        code_spans: &[CodeSpan],
966        flavor: MarkdownFlavor,
967        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
968    ) -> (Vec<ParsedLink<'a>>, Vec<BrokenLinkInfo>) {
969        use crate::utils::skip_context::{is_in_html_comment_ranges, is_mkdocs_snippet_line};
970        use std::collections::HashSet;
971
972        let mut links = Vec::with_capacity(content.len() / 500);
973        let mut broken_links = Vec::new();
974
975        // Track byte positions of links found by pulldown-cmark
976        let mut found_positions = HashSet::new();
977
978        // Use pulldown-cmark's streaming parser with BrokenLink callback
979        // The callback captures undefined references: [text][undefined], [shortcut], [text][]
980        // This automatically handles:
981        // - Escaped links (won't generate events)
982        // - Links in code blocks/spans (won't generate Link events)
983        // - Images (generates Tag::Image instead)
984        // - Reference resolution (dest_url is already resolved!)
985        // - Broken references (callback is invoked)
986        // - Wiki-links (enabled via ENABLE_WIKILINKS)
987        let mut options = Options::empty();
988        options.insert(Options::ENABLE_WIKILINKS);
989
990        let parser = Parser::new_with_broken_link_callback(
991            content,
992            options,
993            Some(|link: BrokenLink<'_>| {
994                broken_links.push(BrokenLinkInfo {
995                    reference: link.reference.to_string(),
996                    span: link.span.clone(),
997                });
998                None
999            }),
1000        )
1001        .into_offset_iter();
1002
1003        let mut link_stack: Vec<(
1004            usize,
1005            usize,
1006            pulldown_cmark::CowStr<'a>,
1007            LinkType,
1008            pulldown_cmark::CowStr<'a>,
1009        )> = Vec::new();
1010        let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1011
1012        for (event, range) in parser {
1013            match event {
1014                Event::Start(Tag::Link {
1015                    link_type,
1016                    dest_url,
1017                    id,
1018                    ..
1019                }) => {
1020                    // Link start - record position, URL, and reference ID
1021                    link_stack.push((range.start, range.end, dest_url, link_type, id));
1022                    text_chunks.clear();
1023                }
1024                Event::Text(text) if !link_stack.is_empty() => {
1025                    // Track text content with its byte range
1026                    text_chunks.push((text.to_string(), range.start, range.end));
1027                }
1028                Event::Code(code) if !link_stack.is_empty() => {
1029                    // Include inline code in link text (with backticks)
1030                    let code_text = format!("`{code}`");
1031                    text_chunks.push((code_text, range.start, range.end));
1032                }
1033                Event::End(TagEnd::Link) => {
1034                    if let Some((start_pos, _link_start_end, url, link_type, ref_id)) = link_stack.pop() {
1035                        // Skip if in HTML comment
1036                        if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1037                            text_chunks.clear();
1038                            continue;
1039                        }
1040
1041                        // Find line and column information
1042                        let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1043
1044                        // Skip if this link is on a MkDocs snippet line
1045                        if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1046                            text_chunks.clear();
1047                            continue;
1048                        }
1049
1050                        let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1051
1052                        let is_reference = matches!(
1053                            link_type,
1054                            LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1055                        );
1056
1057                        // Extract link text directly from source bytes to preserve escaping
1058                        // Text events from pulldown-cmark unescape \] → ], which breaks MD039
1059                        let link_text = if start_pos < content.len() {
1060                            let link_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1061
1062                            // Find MATCHING ] by tracking bracket depth for nested brackets
1063                            // An unescaped bracket is one NOT preceded by an odd number of backslashes
1064                            // Brackets inside code spans (between backticks) should be ignored
1065                            let mut close_pos = None;
1066                            let mut depth = 0;
1067                            let mut in_code_span = false;
1068
1069                            for (i, &byte) in link_bytes.iter().enumerate().skip(1) {
1070                                // Count preceding backslashes
1071                                let mut backslash_count = 0;
1072                                let mut j = i;
1073                                while j > 0 && link_bytes[j - 1] == b'\\' {
1074                                    backslash_count += 1;
1075                                    j -= 1;
1076                                }
1077                                let is_escaped = backslash_count % 2 != 0;
1078
1079                                // Track code spans - backticks toggle in/out of code
1080                                if byte == b'`' && !is_escaped {
1081                                    in_code_span = !in_code_span;
1082                                }
1083
1084                                // Only count brackets when NOT in a code span
1085                                if !is_escaped && !in_code_span {
1086                                    if byte == b'[' {
1087                                        depth += 1;
1088                                    } else if byte == b']' {
1089                                        if depth == 0 {
1090                                            // Found the matching closing bracket
1091                                            close_pos = Some(i);
1092                                            break;
1093                                        } else {
1094                                            depth -= 1;
1095                                        }
1096                                    }
1097                                }
1098                            }
1099
1100                            if let Some(pos) = close_pos {
1101                                Cow::Borrowed(std::str::from_utf8(&link_bytes[1..pos]).unwrap_or(""))
1102                            } else {
1103                                Cow::Borrowed("")
1104                            }
1105                        } else {
1106                            Cow::Borrowed("")
1107                        };
1108
1109                        // For reference links, use the actual reference ID from pulldown-cmark
1110                        let reference_id = if is_reference && !ref_id.is_empty() {
1111                            Some(Cow::Owned(ref_id.to_lowercase()))
1112                        } else if is_reference {
1113                            // For collapsed/shortcut references without explicit ID, use the link text
1114                            Some(Cow::Owned(link_text.to_lowercase()))
1115                        } else {
1116                            None
1117                        };
1118
1119                        // WORKAROUND: pulldown-cmark bug with escaped brackets
1120                        // Check for escaped image syntax: \![text](url)
1121                        // The byte_offset points to the '[', so we check 2 bytes back for '\!'
1122                        let has_escaped_bang = start_pos >= 2
1123                            && content.as_bytes().get(start_pos - 2) == Some(&b'\\')
1124                            && content.as_bytes().get(start_pos - 1) == Some(&b'!');
1125
1126                        // Check for escaped bracket: \[text](url)
1127                        // The byte_offset points to the '[', so we check 1 byte back for '\'
1128                        let has_escaped_bracket =
1129                            start_pos >= 1 && content.as_bytes().get(start_pos - 1) == Some(&b'\\');
1130
1131                        if has_escaped_bang || has_escaped_bracket {
1132                            text_chunks.clear();
1133                            continue; // Skip: this is escaped markdown, not a real link
1134                        }
1135
1136                        // Track this position as found
1137                        found_positions.insert(start_pos);
1138
1139                        links.push(ParsedLink {
1140                            line: line_num,
1141                            start_col: col_start,
1142                            end_col: col_end,
1143                            byte_offset: start_pos,
1144                            byte_end: range.end,
1145                            text: link_text,
1146                            url: Cow::Owned(url.to_string()),
1147                            is_reference,
1148                            reference_id,
1149                            link_type,
1150                        });
1151
1152                        text_chunks.clear();
1153                    }
1154                }
1155                _ => {}
1156            }
1157        }
1158
1159        // Also find undefined references using regex
1160        // These are patterns like [text][ref] that pulldown-cmark didn't parse as links
1161        // because the reference is undefined
1162        for cap in LINK_PATTERN.captures_iter(content) {
1163            let full_match = cap.get(0).unwrap();
1164            let match_start = full_match.start();
1165            let match_end = full_match.end();
1166
1167            // Skip if this was already found by pulldown-cmark (it's a valid link)
1168            if found_positions.contains(&match_start) {
1169                continue;
1170            }
1171
1172            // Skip if escaped
1173            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1174                continue;
1175            }
1176
1177            // Skip if it's an image
1178            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
1179                continue;
1180            }
1181
1182            // Skip if in code block
1183            if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
1184                continue;
1185            }
1186
1187            // Skip if in code span
1188            if Self::is_offset_in_code_span(code_spans, match_start) {
1189                continue;
1190            }
1191
1192            // Skip if in HTML comment
1193            if is_in_html_comment_ranges(html_comment_ranges, match_start) {
1194                continue;
1195            }
1196
1197            // Find line and column information
1198            let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1199
1200            // Skip if this link is on a MkDocs snippet line
1201            if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1202                continue;
1203            }
1204
1205            let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1206
1207            let text = cap.get(1).map_or("", |m| m.as_str());
1208
1209            // Only process reference links (group 6)
1210            if let Some(ref_id) = cap.get(6) {
1211                let ref_id_str = ref_id.as_str();
1212                let normalized_ref = if ref_id_str.is_empty() {
1213                    Cow::Owned(text.to_lowercase()) // Implicit reference
1214                } else {
1215                    Cow::Owned(ref_id_str.to_lowercase())
1216                };
1217
1218                // This is an undefined reference (pulldown-cmark didn't parse it)
1219                links.push(ParsedLink {
1220                    line: line_num,
1221                    start_col: col_start,
1222                    end_col: col_end,
1223                    byte_offset: match_start,
1224                    byte_end: match_end,
1225                    text: Cow::Borrowed(text),
1226                    url: Cow::Borrowed(""), // Empty URL indicates undefined reference
1227                    is_reference: true,
1228                    reference_id: Some(normalized_ref),
1229                    link_type: LinkType::Reference, // Undefined references are reference-style
1230                });
1231            }
1232        }
1233
1234        (links, broken_links)
1235    }
1236
1237    /// Parse all images in the content
1238    fn parse_images(
1239        content: &'a str,
1240        lines: &[LineInfo],
1241        code_blocks: &[(usize, usize)],
1242        code_spans: &[CodeSpan],
1243        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1244    ) -> Vec<ParsedImage<'a>> {
1245        use crate::utils::skip_context::is_in_html_comment_ranges;
1246        use std::collections::HashSet;
1247
1248        // Pre-size based on a heuristic: images are less common than links
1249        let mut images = Vec::with_capacity(content.len() / 1000);
1250        let mut found_positions = HashSet::new();
1251
1252        // Use pulldown-cmark for parsing - more accurate and faster
1253        let parser = Parser::new(content).into_offset_iter();
1254        let mut image_stack: Vec<(usize, pulldown_cmark::CowStr<'a>, LinkType, pulldown_cmark::CowStr<'a>)> =
1255            Vec::new();
1256        let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1257
1258        for (event, range) in parser {
1259            match event {
1260                Event::Start(Tag::Image {
1261                    link_type,
1262                    dest_url,
1263                    id,
1264                    ..
1265                }) => {
1266                    image_stack.push((range.start, dest_url, link_type, id));
1267                    text_chunks.clear();
1268                }
1269                Event::Text(text) if !image_stack.is_empty() => {
1270                    text_chunks.push((text.to_string(), range.start, range.end));
1271                }
1272                Event::Code(code) if !image_stack.is_empty() => {
1273                    let code_text = format!("`{code}`");
1274                    text_chunks.push((code_text, range.start, range.end));
1275                }
1276                Event::End(TagEnd::Image) => {
1277                    if let Some((start_pos, url, link_type, ref_id)) = image_stack.pop() {
1278                        // Skip if in code block
1279                        if CodeBlockUtils::is_in_code_block(code_blocks, start_pos) {
1280                            continue;
1281                        }
1282
1283                        // Skip if in code span
1284                        if Self::is_offset_in_code_span(code_spans, start_pos) {
1285                            continue;
1286                        }
1287
1288                        // Skip if in HTML comment
1289                        if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1290                            continue;
1291                        }
1292
1293                        // Find line and column using binary search
1294                        let (_, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1295                        let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1296
1297                        let is_reference = matches!(
1298                            link_type,
1299                            LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1300                        );
1301
1302                        // Extract alt text directly from source bytes to preserve escaping
1303                        // Text events from pulldown-cmark unescape \] → ], which breaks rules that need escaping
1304                        let alt_text = if start_pos < content.len() {
1305                            let image_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1306
1307                            // Find MATCHING ] by tracking bracket depth for nested brackets
1308                            // An unescaped bracket is one NOT preceded by an odd number of backslashes
1309                            let mut close_pos = None;
1310                            let mut depth = 0;
1311
1312                            if image_bytes.len() > 2 {
1313                                for (i, &byte) in image_bytes.iter().enumerate().skip(2) {
1314                                    // Count preceding backslashes
1315                                    let mut backslash_count = 0;
1316                                    let mut j = i;
1317                                    while j > 0 && image_bytes[j - 1] == b'\\' {
1318                                        backslash_count += 1;
1319                                        j -= 1;
1320                                    }
1321                                    let is_escaped = backslash_count % 2 != 0;
1322
1323                                    if !is_escaped {
1324                                        if byte == b'[' {
1325                                            depth += 1;
1326                                        } else if byte == b']' {
1327                                            if depth == 0 {
1328                                                // Found the matching closing bracket
1329                                                close_pos = Some(i);
1330                                                break;
1331                                            } else {
1332                                                depth -= 1;
1333                                            }
1334                                        }
1335                                    }
1336                                }
1337                            }
1338
1339                            if let Some(pos) = close_pos {
1340                                Cow::Borrowed(std::str::from_utf8(&image_bytes[2..pos]).unwrap_or(""))
1341                            } else {
1342                                Cow::Borrowed("")
1343                            }
1344                        } else {
1345                            Cow::Borrowed("")
1346                        };
1347
1348                        let reference_id = if is_reference && !ref_id.is_empty() {
1349                            Some(Cow::Owned(ref_id.to_lowercase()))
1350                        } else if is_reference {
1351                            Some(Cow::Owned(alt_text.to_lowercase())) // Collapsed/shortcut references
1352                        } else {
1353                            None
1354                        };
1355
1356                        found_positions.insert(start_pos);
1357                        images.push(ParsedImage {
1358                            line: line_num,
1359                            start_col: col_start,
1360                            end_col: col_end,
1361                            byte_offset: start_pos,
1362                            byte_end: range.end,
1363                            alt_text,
1364                            url: Cow::Owned(url.to_string()),
1365                            is_reference,
1366                            reference_id,
1367                            link_type,
1368                        });
1369                    }
1370                }
1371                _ => {}
1372            }
1373        }
1374
1375        // Regex fallback for undefined references that pulldown-cmark treats as plain text
1376        for cap in IMAGE_PATTERN.captures_iter(content) {
1377            let full_match = cap.get(0).unwrap();
1378            let match_start = full_match.start();
1379            let match_end = full_match.end();
1380
1381            // Skip if already found by pulldown-cmark
1382            if found_positions.contains(&match_start) {
1383                continue;
1384            }
1385
1386            // Skip if the ! is escaped
1387            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1388                continue;
1389            }
1390
1391            // Skip if in code block, code span, or HTML comment
1392            if CodeBlockUtils::is_in_code_block(code_blocks, match_start)
1393                || Self::is_offset_in_code_span(code_spans, match_start)
1394                || is_in_html_comment_ranges(html_comment_ranges, match_start)
1395            {
1396                continue;
1397            }
1398
1399            // Only process reference images (undefined references not found by pulldown-cmark)
1400            if let Some(ref_id) = cap.get(6) {
1401                let (_, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1402                let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1403                let alt_text = cap.get(1).map_or("", |m| m.as_str());
1404                let ref_id_str = ref_id.as_str();
1405                let normalized_ref = if ref_id_str.is_empty() {
1406                    Cow::Owned(alt_text.to_lowercase())
1407                } else {
1408                    Cow::Owned(ref_id_str.to_lowercase())
1409                };
1410
1411                images.push(ParsedImage {
1412                    line: line_num,
1413                    start_col: col_start,
1414                    end_col: col_end,
1415                    byte_offset: match_start,
1416                    byte_end: match_end,
1417                    alt_text: Cow::Borrowed(alt_text),
1418                    url: Cow::Borrowed(""),
1419                    is_reference: true,
1420                    reference_id: Some(normalized_ref),
1421                    link_type: LinkType::Reference, // Undefined references are reference-style
1422                });
1423            }
1424        }
1425
1426        images
1427    }
1428
1429    /// Parse reference definitions
1430    fn parse_reference_defs(content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
1431        // Pre-size based on lines count as reference definitions are line-based
1432        let mut refs = Vec::with_capacity(lines.len() / 20); // ~1 ref per 20 lines
1433
1434        for (line_idx, line_info) in lines.iter().enumerate() {
1435            // Skip lines in code blocks
1436            if line_info.in_code_block {
1437                continue;
1438            }
1439
1440            let line = line_info.content(content);
1441            let line_num = line_idx + 1;
1442
1443            if let Some(cap) = REF_DEF_PATTERN.captures(line) {
1444                let id = cap.get(1).unwrap().as_str().to_lowercase();
1445                let url = cap.get(2).unwrap().as_str().to_string();
1446                let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
1447
1448                // Calculate byte positions
1449                // The match starts at the beginning of the line (0) and extends to the end
1450                let match_obj = cap.get(0).unwrap();
1451                let byte_offset = line_info.byte_offset + match_obj.start();
1452                let byte_end = line_info.byte_offset + match_obj.end();
1453
1454                refs.push(ReferenceDef {
1455                    line: line_num,
1456                    id,
1457                    url,
1458                    title,
1459                    byte_offset,
1460                    byte_end,
1461                });
1462            }
1463        }
1464
1465        refs
1466    }
1467
1468    /// Fast blockquote prefix parser - replaces regex for 5-10x speedup
1469    /// Matches: ^(\s*>\s*)(.*)
1470    /// Returns: Some((prefix_with_ws, content_after_prefix)) or None
1471    #[inline]
1472    fn parse_blockquote_prefix(line: &str) -> Option<(&str, &str)> {
1473        let trimmed_start = line.trim_start();
1474        if !trimmed_start.starts_with('>') {
1475            return None;
1476        }
1477
1478        let leading_ws_len = line.len() - trimmed_start.len();
1479        let after_gt = &trimmed_start[1..];
1480        let content = after_gt.trim_start();
1481        let ws_after_gt_len = after_gt.len() - content.len();
1482        let prefix_len = leading_ws_len + 1 + ws_after_gt_len;
1483
1484        Some((&line[..prefix_len], content))
1485    }
1486
1487    /// Fast unordered list parser - replaces regex for 5-10x speedup
1488    /// Matches: ^(\s*)([-*+])([ \t]*)(.*)
1489    /// Returns: Some((leading_ws, marker, spacing, content)) or None
1490    #[inline]
1491    fn parse_unordered_list(line: &str) -> Option<(&str, char, &str, &str)> {
1492        let bytes = line.as_bytes();
1493        let mut i = 0;
1494
1495        // Skip leading whitespace
1496        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1497            i += 1;
1498        }
1499
1500        // Check for marker
1501        if i >= bytes.len() {
1502            return None;
1503        }
1504        let marker = bytes[i] as char;
1505        if marker != '-' && marker != '*' && marker != '+' {
1506            return None;
1507        }
1508        let marker_pos = i;
1509        i += 1;
1510
1511        // Collect spacing after marker (space or tab only)
1512        let spacing_start = i;
1513        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1514            i += 1;
1515        }
1516
1517        Some((&line[..marker_pos], marker, &line[spacing_start..i], &line[i..]))
1518    }
1519
/// Hand-rolled ordered list marker parser (used instead of a regex).
///
/// Recognises lines of the shape `<indent><digits><delimiter><spacing><content>`
/// where `indent` and `spacing` consist of spaces/tabs only (both may be empty),
/// `digits` is one or more ASCII digits and `delimiter` is `.` or `)`.
///
/// Returns `Some((indent, digits, delimiter, spacing, content))`, or `None` when
/// there are no digits or the digits are not followed by a delimiter. An empty
/// `spacing` is still reported; the caller decides whether that is a list item.
#[inline]
fn parse_ordered_list(line: &str) -> Option<(&str, &str, char, &str, &str)> {
    let is_blank = |c: char| c == ' ' || c == '\t';

    // Indentation first, then the run of digits that forms the item number.
    let after_indent = line.trim_start_matches(is_blank);
    let after_digits = after_indent.trim_start_matches(|c: char| c.is_ascii_digit());
    let digits_len = after_indent.len() - after_digits.len();
    if digits_len == 0 {
        return None;
    }

    // The number must be terminated by `.` or `)`.
    let delimiter = after_digits
        .chars()
        .next()
        .filter(|&c| matches!(c, '.' | ')'))?;

    // The delimiter is a single ASCII byte, so slicing one byte in is always valid.
    let after_delimiter = &after_digits[1..];
    let content = after_delimiter.trim_start_matches(is_blank);

    Some((
        &line[..line.len() - after_indent.len()],
        &after_indent[..digits_len],
        delimiter,
        &after_delimiter[..after_delimiter.len() - content.len()],
        content,
    ))
}
1567
/// Pre-compute, for every line, whether it lies inside one of the given code blocks.
///
/// `line_offsets` holds the starting byte offset of each line (sorted ascending) and
/// `code_blocks` holds `(start, end)` byte ranges. The returned vector has one entry
/// per element of `line_offsets`; entry `i` is `true` when line `i` starts at or
/// after a block's start and before its end.
///
/// The ranges are trusted as given: nothing here re-validates whether a range really
/// is a code block.
fn compute_code_block_line_map(content: &str, line_offsets: &[usize], code_blocks: &[(usize, usize)]) -> Vec<bool> {
    let content_len = content.len();
    let mut line_flags = vec![false; line_offsets.len()];

    for &(block_start, block_end) in code_blocks {
        // Snap the start backwards to the nearest UTF-8 character boundary.
        let mut range_start = block_start;
        while range_start > 0 && !content.is_char_boundary(range_start) {
            range_start -= 1;
        }

        // Clamp the end to the content length and snap it forwards to a boundary.
        let mut range_end = block_end.min(content_len);
        while range_end < content_len && !content.is_char_boundary(range_end) {
            range_end += 1;
        }

        // `line_offsets` is sorted, so the covered lines can be located by binary search:
        // every line whose offset falls in `range_start..range_end` belongs to the block.
        let first_line = line_offsets.partition_point(|&offset| offset < range_start);
        let end_line = line_offsets.partition_point(|&offset| offset < range_end);

        // An empty or inverted range marks nothing.
        if first_line < end_line {
            line_flags[first_line..end_line].fill(true);
        }
    }

    line_flags
}
1622
1623    /// Pre-compute basic line information (without headings/blockquotes)
1624    fn compute_basic_line_info(
1625        content: &str,
1626        line_offsets: &[usize],
1627        code_blocks: &[(usize, usize)],
1628        flavor: MarkdownFlavor,
1629        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1630        autodoc_ranges: &[crate::utils::skip_context::ByteRange],
1631    ) -> Vec<LineInfo> {
1632        let content_lines: Vec<&str> = content.lines().collect();
1633        let mut lines = Vec::with_capacity(content_lines.len());
1634
1635        // Pre-compute which lines are in code blocks
1636        let code_block_map = Self::compute_code_block_line_map(content, line_offsets, code_blocks);
1637
1638        // Detect front matter boundaries FIRST, before any other parsing
1639        // Use FrontMatterUtils to detect all types of front matter (YAML, TOML, JSON, malformed)
1640        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1641
1642        for (i, line) in content_lines.iter().enumerate() {
1643            let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
1644            let indent = line.len() - line.trim_start().len();
1645
1646            // Parse blockquote prefix once and reuse it (avoid redundant parsing)
1647            let blockquote_parse = Self::parse_blockquote_prefix(line);
1648
1649            // For blank detection, consider blockquote context
1650            let is_blank = if let Some((_, content)) = blockquote_parse {
1651                // In blockquote context, check if content after prefix is blank
1652                content.trim().is_empty()
1653            } else {
1654                line.trim().is_empty()
1655            };
1656
1657            // Use pre-computed map for O(1) lookup instead of O(m) iteration
1658            let in_code_block = code_block_map.get(i).copied().unwrap_or(false);
1659
1660            // Detect list items (skip if in frontmatter, in mkdocstrings block, or in HTML comment)
1661            let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
1662                && crate::utils::mkdocstrings_refs::is_within_autodoc_block_ranges(autodoc_ranges, byte_offset);
1663            // Use pre-computed ranges for efficiency (O(log n) vs O(file_size))
1664            let in_html_comment =
1665                crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, byte_offset);
1666            let list_item = if !(in_code_block
1667                || is_blank
1668                || in_mkdocstrings
1669                || in_html_comment
1670                || (front_matter_end > 0 && i < front_matter_end))
1671            {
1672                // Strip blockquote prefix if present for list detection (reuse cached result)
1673                let (line_for_list_check, blockquote_prefix_len) = if let Some((prefix, content)) = blockquote_parse {
1674                    (content, prefix.len())
1675                } else {
1676                    (&**line, 0)
1677                };
1678
1679                if let Some((leading_spaces, marker, spacing, _content)) =
1680                    Self::parse_unordered_list(line_for_list_check)
1681                {
1682                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1683                    let content_column = marker_column + 1 + spacing.len();
1684
1685                    // According to CommonMark spec, unordered list items MUST have at least one space
1686                    // after the marker (-, *, or +). Without a space, it's not a list item.
1687                    // This also naturally handles cases like:
1688                    // - *emphasis* (not a list)
1689                    // - **bold** (not a list)
1690                    // - --- (horizontal rule, not a list)
1691                    if spacing.is_empty() {
1692                        None
1693                    } else {
1694                        Some(ListItemInfo {
1695                            marker: marker.to_string(),
1696                            is_ordered: false,
1697                            number: None,
1698                            marker_column,
1699                            content_column,
1700                        })
1701                    }
1702                } else if let Some((leading_spaces, number_str, delimiter, spacing, _content)) =
1703                    Self::parse_ordered_list(line_for_list_check)
1704                {
1705                    let marker = format!("{number_str}{delimiter}");
1706                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1707                    let content_column = marker_column + marker.len() + spacing.len();
1708
1709                    // According to CommonMark spec, ordered list items MUST have at least one space
1710                    // after the marker (period or parenthesis). Without a space, it's not a list item.
1711                    if spacing.is_empty() {
1712                        None
1713                    } else {
1714                        Some(ListItemInfo {
1715                            marker,
1716                            is_ordered: true,
1717                            number: number_str.parse().ok(),
1718                            marker_column,
1719                            content_column,
1720                        })
1721                    }
1722                } else {
1723                    None
1724                }
1725            } else {
1726                None
1727            };
1728
1729            lines.push(LineInfo {
1730                byte_offset,
1731                byte_len: line.len(),
1732                indent,
1733                is_blank,
1734                in_code_block,
1735                in_front_matter: front_matter_end > 0 && i < front_matter_end,
1736                in_html_block: false, // Will be populated after line creation
1737                in_html_comment,
1738                list_item,
1739                heading: None,    // Will be populated in second pass for Setext headings
1740                blockquote: None, // Will be populated after line creation
1741                in_mkdocstrings,
1742                in_esm_block: false, // Will be populated after line creation for MDX files
1743            });
1744        }
1745
1746        lines
1747    }
1748
1749    /// Detect headings and blockquotes (called after HTML block detection)
1750    fn detect_headings_and_blockquotes(
1751        content: &str,
1752        lines: &mut [LineInfo],
1753        flavor: MarkdownFlavor,
1754        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1755    ) {
1756        // Regex for heading detection
1757        static ATX_HEADING_REGEX: LazyLock<regex::Regex> =
1758            LazyLock::new(|| regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap());
1759        static SETEXT_UNDERLINE_REGEX: LazyLock<regex::Regex> =
1760            LazyLock::new(|| regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap());
1761
1762        let content_lines: Vec<&str> = content.lines().collect();
1763
1764        // Detect front matter boundaries to skip those lines
1765        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1766
1767        // Detect headings (including Setext which needs look-ahead) and blockquotes
1768        for i in 0..lines.len() {
1769            if lines[i].in_code_block {
1770                continue;
1771            }
1772
1773            // Skip lines in front matter
1774            if front_matter_end > 0 && i < front_matter_end {
1775                continue;
1776            }
1777
1778            // Skip lines in HTML blocks - HTML content should not be parsed as markdown
1779            if lines[i].in_html_block {
1780                continue;
1781            }
1782
1783            let line = content_lines[i];
1784
1785            // Check for blockquotes (even on blank lines within blockquotes)
1786            if let Some(bq) = parse_blockquote_detailed(line) {
1787                let nesting_level = bq.markers.len(); // Each '>' is one level
1788                let marker_column = bq.indent.len();
1789
1790                // Build the prefix (indentation + markers + space)
1791                let prefix = format!("{}{}{}", bq.indent, bq.markers, bq.spaces_after);
1792
1793                // Check for various blockquote issues
1794                let has_no_space = bq.spaces_after.is_empty() && !bq.content.is_empty();
1795                // Consider tabs as multiple spaces, or actual multiple spaces
1796                let has_multiple_spaces = bq.spaces_after.len() > 1 || bq.spaces_after.contains('\t');
1797
1798                // Check if needs MD028 fix (empty blockquote line without proper spacing)
1799                // MD028 flags empty blockquote lines that don't have a single space after the marker
1800                // Lines like "> " or ">> " are already correct and don't need fixing
1801                let needs_md028_fix = bq.content.is_empty() && bq.spaces_after.is_empty();
1802
1803                lines[i].blockquote = Some(BlockquoteInfo {
1804                    nesting_level,
1805                    indent: bq.indent.to_string(),
1806                    marker_column,
1807                    prefix,
1808                    content: bq.content.to_string(),
1809                    has_no_space_after_marker: has_no_space,
1810                    has_multiple_spaces_after_marker: has_multiple_spaces,
1811                    needs_md028_fix,
1812                });
1813            }
1814
1815            // Skip heading detection for blank lines
1816            if lines[i].is_blank {
1817                continue;
1818            }
1819
1820            // Check for ATX headings (but skip MkDocs snippet lines)
1821            // In MkDocs flavor, lines like "# -8<- [start:name]" are snippet markers, not headings
1822            let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
1823                crate::utils::mkdocs_snippets::is_snippet_section_start(line)
1824                    || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
1825            } else {
1826                false
1827            };
1828
1829            if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1830                // Skip headings inside HTML comments (using pre-computed ranges for efficiency)
1831                if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset) {
1832                    continue;
1833                }
1834                let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1835                let hashes = caps.get(2).map_or("", |m| m.as_str());
1836                let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1837                let rest = caps.get(4).map_or("", |m| m.as_str());
1838
1839                let level = hashes.len() as u8;
1840                let marker_column = leading_spaces.len();
1841
1842                // Check for closing sequence, but handle custom IDs that might come after
1843                let (text, has_closing, closing_seq) = {
1844                    // First check if there's a custom ID at the end
1845                    let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1846                        // Check if this looks like a valid custom ID (ends with })
1847                        if rest[id_start..].trim_end().ends_with('}') {
1848                            // Split off the custom ID
1849                            (&rest[..id_start], &rest[id_start..])
1850                        } else {
1851                            (rest, "")
1852                        }
1853                    } else {
1854                        (rest, "")
1855                    };
1856
1857                    // Now look for closing hashes in the part before the custom ID
1858                    let trimmed_rest = rest_without_id.trim_end();
1859                    if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1860                        // Look for the start of the hash sequence
1861                        let mut start_of_hashes = last_hash_pos;
1862                        while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1863                            start_of_hashes -= 1;
1864                        }
1865
1866                        // Check if there's at least one space before the closing hashes
1867                        let has_space_before = start_of_hashes == 0
1868                            || trimmed_rest
1869                                .chars()
1870                                .nth(start_of_hashes - 1)
1871                                .is_some_and(|c| c.is_whitespace());
1872
1873                        // Check if this is a valid closing sequence (all hashes to end of trimmed part)
1874                        let potential_closing = &trimmed_rest[start_of_hashes..];
1875                        let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1876
1877                        if is_all_hashes && has_space_before {
1878                            // This is a closing sequence
1879                            let closing_hashes = potential_closing.to_string();
1880                            // The text is everything before the closing hashes
1881                            // Don't include the custom ID here - it will be extracted later
1882                            let text_part = if !custom_id_part.is_empty() {
1883                                // If we have a custom ID, append it back to get the full rest
1884                                // This allows the extract_header_id function to handle it properly
1885                                format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1886                            } else {
1887                                rest_without_id[..start_of_hashes].trim_end().to_string()
1888                            };
1889                            (text_part, true, closing_hashes)
1890                        } else {
1891                            // Not a valid closing sequence, return the full content
1892                            (rest.to_string(), false, String::new())
1893                        }
1894                    } else {
1895                        // No hashes found, return the full content
1896                        (rest.to_string(), false, String::new())
1897                    }
1898                };
1899
1900                let content_column = marker_column + hashes.len() + spaces_after.len();
1901
1902                // Extract custom header ID if present
1903                let raw_text = text.trim().to_string();
1904                let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1905
1906                // If no custom ID was found on the header line, check the next line for standalone attr-list
1907                if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1908                    let next_line = content_lines[i + 1];
1909                    if !lines[i + 1].in_code_block
1910                        && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1911                        && let Some(next_line_id) =
1912                            crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1913                    {
1914                        custom_id = Some(next_line_id);
1915                    }
1916                }
1917
1918                lines[i].heading = Some(HeadingInfo {
1919                    level,
1920                    style: HeadingStyle::ATX,
1921                    marker: hashes.to_string(),
1922                    marker_column,
1923                    content_column,
1924                    text: clean_text,
1925                    custom_id,
1926                    raw_text,
1927                    has_closing_sequence: has_closing,
1928                    closing_sequence: closing_seq,
1929                });
1930            }
1931            // Check for Setext headings (need to look at next line)
1932            else if i + 1 < content_lines.len() && i + 1 < lines.len() {
1933                let next_line = content_lines[i + 1];
1934                if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
1935                    // Skip if next line is front matter delimiter
1936                    if front_matter_end > 0 && i < front_matter_end {
1937                        continue;
1938                    }
1939
1940                    // Skip Setext headings inside HTML comments (using pre-computed ranges for efficiency)
1941                    if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset)
1942                    {
1943                        continue;
1944                    }
1945
1946                    let underline = next_line.trim();
1947
1948                    // Skip if the underline looks like YAML delimiter (exactly 3 or more dashes)
1949                    // YAML uses exactly `---` while Setext headings typically use longer underlines
1950                    if underline == "---" {
1951                        continue;
1952                    }
1953
1954                    // Skip if the current line looks like YAML key-value syntax
1955                    let current_line_trimmed = line.trim();
1956                    if current_line_trimmed.contains(':')
1957                        && !current_line_trimmed.starts_with('#')
1958                        && !current_line_trimmed.contains('[')
1959                        && !current_line_trimmed.contains("](")
1960                    {
1961                        // This looks like "key: value" which suggests YAML, not a heading
1962                        continue;
1963                    }
1964
1965                    let level = if underline.starts_with('=') { 1 } else { 2 };
1966                    let style = if level == 1 {
1967                        HeadingStyle::Setext1
1968                    } else {
1969                        HeadingStyle::Setext2
1970                    };
1971
1972                    // Extract custom header ID if present
1973                    let raw_text = line.trim().to_string();
1974                    let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1975
1976                    // If no custom ID was found on the header line, check the line after underline for standalone attr-list
1977                    if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
1978                        let attr_line = content_lines[i + 2];
1979                        if !lines[i + 2].in_code_block
1980                            && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
1981                            && let Some(attr_line_id) =
1982                                crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
1983                        {
1984                            custom_id = Some(attr_line_id);
1985                        }
1986                    }
1987
1988                    lines[i].heading = Some(HeadingInfo {
1989                        level,
1990                        style,
1991                        marker: underline.to_string(),
1992                        marker_column: next_line.len() - next_line.trim_start().len(),
1993                        content_column: lines[i].indent,
1994                        text: clean_text,
1995                        custom_id,
1996                        raw_text,
1997                        has_closing_sequence: false,
1998                        closing_sequence: String::new(),
1999                    });
2000                }
2001            }
2002        }
2003    }
2004
2005    /// Detect HTML blocks in the content
2006    fn detect_html_blocks(content: &str, lines: &mut [LineInfo]) {
2007        // HTML block elements that trigger block context
2008        const BLOCK_ELEMENTS: &[&str] = &[
2009            "address",
2010            "article",
2011            "aside",
2012            "blockquote",
2013            "details",
2014            "dialog",
2015            "dd",
2016            "div",
2017            "dl",
2018            "dt",
2019            "fieldset",
2020            "figcaption",
2021            "figure",
2022            "footer",
2023            "form",
2024            "h1",
2025            "h2",
2026            "h3",
2027            "h4",
2028            "h5",
2029            "h6",
2030            "header",
2031            "hr",
2032            "li",
2033            "main",
2034            "nav",
2035            "ol",
2036            "p",
2037            "picture",
2038            "pre",
2039            "script",
2040            "section",
2041            "style",
2042            "table",
2043            "tbody",
2044            "td",
2045            "textarea",
2046            "tfoot",
2047            "th",
2048            "thead",
2049            "tr",
2050            "ul",
2051        ];
2052
2053        let mut i = 0;
2054        while i < lines.len() {
2055            // Skip if already in code block or front matter
2056            if lines[i].in_code_block || lines[i].in_front_matter {
2057                i += 1;
2058                continue;
2059            }
2060
2061            let trimmed = lines[i].content(content).trim_start();
2062
2063            // Check if line starts with an HTML tag
2064            if trimmed.starts_with('<') && trimmed.len() > 1 {
2065                // Extract tag name safely
2066                let after_bracket = &trimmed[1..];
2067                let is_closing = after_bracket.starts_with('/');
2068                let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
2069
2070                // Extract tag name (stop at space, >, /, or end of string)
2071                let tag_name = tag_start
2072                    .chars()
2073                    .take_while(|c| c.is_ascii_alphabetic() || *c == '-' || c.is_ascii_digit())
2074                    .collect::<String>()
2075                    .to_lowercase();
2076
2077                // Check if it's a block element
2078                if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
2079                    // Mark this line as in HTML block
2080                    lines[i].in_html_block = true;
2081
2082                    // For simplicity, just mark lines until we find a closing tag or reach a blank line
2083                    // This avoids complex nesting logic that might cause infinite loops
2084                    if !is_closing {
2085                        let closing_tag = format!("</{tag_name}>");
2086                        // style and script tags can contain blank lines (CSS/JS formatting)
2087                        let allow_blank_lines = tag_name == "style" || tag_name == "script";
2088                        let mut j = i + 1;
2089                        while j < lines.len() && j < i + 100 {
2090                            // Limit search to 100 lines
2091                            // Stop at blank lines (except for style/script tags)
2092                            if !allow_blank_lines && lines[j].is_blank {
2093                                break;
2094                            }
2095
2096                            lines[j].in_html_block = true;
2097
2098                            // Check if this line contains the closing tag
2099                            if lines[j].content(content).contains(&closing_tag) {
2100                                break;
2101                            }
2102                            j += 1;
2103                        }
2104                    }
2105                }
2106            }
2107
2108            i += 1;
2109        }
2110    }
2111
2112    /// Detect ESM import/export blocks in MDX files
2113    /// ESM blocks consist of contiguous import/export statements at the top of the file
2114    fn detect_esm_blocks(content: &str, lines: &mut [LineInfo], flavor: MarkdownFlavor) {
2115        // Only process MDX files
2116        if !flavor.supports_esm_blocks() {
2117            return;
2118        }
2119
2120        for line in lines.iter_mut() {
2121            // Skip blank lines and comments at the start
2122            if line.is_blank || line.in_html_comment {
2123                continue;
2124            }
2125
2126            // Check if line starts with import or export
2127            let trimmed = line.content(content).trim_start();
2128            if trimmed.starts_with("import ") || trimmed.starts_with("export ") {
2129                line.in_esm_block = true;
2130            } else {
2131                // Once we hit a non-ESM line, we're done with the ESM block
2132                break;
2133            }
2134        }
2135    }
2136
2137    /// Parse all inline code spans in the content using pulldown-cmark streaming parser
2138    fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
2139        let mut code_spans = Vec::new();
2140
2141        // Quick check - if no backticks, no code spans
2142        if !content.contains('`') {
2143            return code_spans;
2144        }
2145
2146        // Use pulldown-cmark's streaming parser with byte offsets
2147        let parser = Parser::new(content).into_offset_iter();
2148
2149        for (event, range) in parser {
2150            if let Event::Code(_) = event {
2151                let start_pos = range.start;
2152                let end_pos = range.end;
2153
2154                // The range includes the backticks, extract the actual content
2155                let full_span = &content[start_pos..end_pos];
2156                let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
2157
2158                // Extract content between backticks, preserving spaces
2159                let content_start = start_pos + backtick_count;
2160                let content_end = end_pos - backtick_count;
2161                let span_content = if content_start < content_end {
2162                    content[content_start..content_end].to_string()
2163                } else {
2164                    String::new()
2165                };
2166
2167                // Use binary search to find line number - O(log n) instead of O(n)
2168                // Find the rightmost line whose byte_offset <= start_pos
2169                let line_idx = lines
2170                    .partition_point(|line| line.byte_offset <= start_pos)
2171                    .saturating_sub(1);
2172                let line_num = line_idx + 1;
2173                let col_start = start_pos - lines[line_idx].byte_offset;
2174
2175                // Find end column using binary search
2176                let end_line_idx = lines
2177                    .partition_point(|line| line.byte_offset <= end_pos)
2178                    .saturating_sub(1);
2179                let col_end = end_pos - lines[end_line_idx].byte_offset;
2180
2181                code_spans.push(CodeSpan {
2182                    line: line_num,
2183                    start_col: col_start,
2184                    end_col: col_end,
2185                    byte_offset: start_pos,
2186                    byte_end: end_pos,
2187                    backtick_count,
2188                    content: span_content,
2189                });
2190            }
2191        }
2192
2193        // Sort by position to ensure consistent ordering
2194        code_spans.sort_by_key(|span| span.byte_offset);
2195
2196        code_spans
2197    }
2198
2199    /// Parse all list blocks in the content (legacy line-by-line approach)
2200    ///
2201    /// Uses a forward-scanning O(n) algorithm that tracks two variables during iteration:
2202    /// - `has_list_breaking_content_since_last_item`: Set when encountering content that
2203    ///   terminates a list (headings, horizontal rules, tables, insufficiently indented content)
2204    /// - `min_continuation_for_tracking`: Minimum indentation required for content to be
2205    ///   treated as list continuation (based on the list marker width)
2206    ///
2207    /// When a new list item is encountered, we check if list-breaking content was seen
2208    /// since the last item. If so, we start a new list block.
2209    fn parse_list_blocks(content: &str, lines: &[LineInfo]) -> Vec<ListBlock> {
2210        // Minimum indentation for unordered list continuation per CommonMark spec
2211        const UNORDERED_LIST_MIN_CONTINUATION_INDENT: usize = 2;
2212
2213        /// Initialize or reset the forward-scanning tracking state.
2214        /// This helper eliminates code duplication across three initialization sites.
2215        #[inline]
2216        fn reset_tracking_state(
2217            list_item: &ListItemInfo,
2218            has_list_breaking_content: &mut bool,
2219            min_continuation: &mut usize,
2220        ) {
2221            *has_list_breaking_content = false;
2222            let marker_width = if list_item.is_ordered {
2223                list_item.marker.len() + 1 // Ordered markers need space after period/paren
2224            } else {
2225                list_item.marker.len()
2226            };
2227            *min_continuation = if list_item.is_ordered {
2228                marker_width
2229            } else {
2230                UNORDERED_LIST_MIN_CONTINUATION_INDENT
2231            };
2232        }
2233
2234        // Pre-size based on lines that could be list items
2235        let mut list_blocks = Vec::with_capacity(lines.len() / 10); // Estimate ~10% of lines might start list blocks
2236        let mut current_block: Option<ListBlock> = None;
2237        let mut last_list_item_line = 0;
2238        let mut current_indent_level = 0;
2239        let mut last_marker_width = 0;
2240
2241        // Track list-breaking content since last item (fixes O(n²) bottleneck from issue #148)
2242        let mut has_list_breaking_content_since_last_item = false;
2243        let mut min_continuation_for_tracking = 0;
2244
2245        for (line_idx, line_info) in lines.iter().enumerate() {
2246            let line_num = line_idx + 1;
2247
2248            // Enhanced code block handling using Design #3's context analysis
2249            if line_info.in_code_block {
2250                if let Some(ref mut block) = current_block {
2251                    // Calculate minimum indentation for list continuation
2252                    let min_continuation_indent =
2253                        CodeBlockUtils::calculate_min_continuation_indent(content, lines, line_idx);
2254
2255                    // Analyze code block context using the three-tier classification
2256                    let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
2257
2258                    match context {
2259                        CodeBlockContext::Indented => {
2260                            // Code block is properly indented - continues the list
2261                            block.end_line = line_num;
2262                            continue;
2263                        }
2264                        CodeBlockContext::Standalone => {
2265                            // Code block separates lists - end current block
2266                            let completed_block = current_block.take().unwrap();
2267                            list_blocks.push(completed_block);
2268                            continue;
2269                        }
2270                        CodeBlockContext::Adjacent => {
2271                            // Edge case - use conservative behavior (continue list)
2272                            block.end_line = line_num;
2273                            continue;
2274                        }
2275                    }
2276                } else {
2277                    // No current list block - skip code block lines
2278                    continue;
2279                }
2280            }
2281
2282            // Extract blockquote prefix if any
2283            let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(line_info.content(content)) {
2284                caps.get(0).unwrap().as_str().to_string()
2285            } else {
2286                String::new()
2287            };
2288
2289            // Track list-breaking content for non-list, non-blank lines (O(n) replacement for nested loop)
2290            if current_block.is_some() && line_info.list_item.is_none() && !line_info.is_blank {
2291                let line_content = line_info.content(content).trim();
2292
2293                // Check for structural separators that break lists
2294                let breaks_list = line_info.heading.is_some()
2295                    || line_content.starts_with("---")
2296                    || line_content.starts_with("***")
2297                    || line_content.starts_with("___")
2298                    || (line_content.contains('|')
2299                        && !line_content.contains("](")
2300                        && !line_content.contains("http")
2301                        && (line_content.matches('|').count() > 1
2302                            || line_content.starts_with('|')
2303                            || line_content.ends_with('|')))
2304                    || line_content.starts_with(">")
2305                    || (line_info.indent < min_continuation_for_tracking);
2306
2307                if breaks_list {
2308                    has_list_breaking_content_since_last_item = true;
2309                }
2310            }
2311
2312            // Check if this line is a list item
2313            if let Some(list_item) = &line_info.list_item {
2314                // Calculate nesting level based on indentation
2315                let item_indent = list_item.marker_column;
2316                let nesting = item_indent / 2; // Assume 2-space indentation for nesting
2317
2318                if let Some(ref mut block) = current_block {
2319                    // Check if this continues the current block
2320                    // For nested lists, we need to check if this is a nested item (higher nesting level)
2321                    // or a continuation at the same or lower level
2322                    let is_nested = nesting > block.nesting_level;
2323                    let same_type =
2324                        (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
2325                    let same_context = block.blockquote_prefix == blockquote_prefix;
2326                    let reasonable_distance = line_num <= last_list_item_line + 2; // Allow one blank line
2327
2328                    // For unordered lists, also check marker consistency
2329                    let marker_compatible =
2330                        block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
2331
2332                    // O(1) check: Use the tracked variable instead of O(n) nested loop
2333                    // This eliminates the quadratic bottleneck from issue #148
2334                    let has_non_list_content = has_list_breaking_content_since_last_item;
2335
2336                    // A list continues if:
2337                    // 1. It's a nested item (indented more than the parent), OR
2338                    // 2. It's the same type at the same level with reasonable distance
2339                    let mut continues_list = if is_nested {
2340                        // Nested items always continue the list if they're in the same context
2341                        same_context && reasonable_distance && !has_non_list_content
2342                    } else {
2343                        // Same-level items need to match type and markers
2344                        same_type && same_context && reasonable_distance && marker_compatible && !has_non_list_content
2345                    };
2346
2347                    // WORKAROUND: If items are truly consecutive (no blank lines), they MUST be in the same list
2348                    // This handles edge cases where content patterns might otherwise split lists incorrectly
2349                    if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
2350                        // Check if the previous line was a list item
2351                        if block.item_lines.contains(&(line_num - 1)) {
2352                            // They're consecutive list items - force them to be in the same list
2353                            continues_list = true;
2354                        }
2355                    }
2356
2357                    if continues_list {
2358                        // Extend current block
2359                        block.end_line = line_num;
2360                        block.item_lines.push(line_num);
2361
2362                        // Update max marker width
2363                        block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
2364                            list_item.marker.len() + 1
2365                        } else {
2366                            list_item.marker.len()
2367                        });
2368
2369                        // Update marker consistency for unordered lists
2370                        if !block.is_ordered
2371                            && block.marker.is_some()
2372                            && block.marker.as_ref() != Some(&list_item.marker)
2373                        {
2374                            // Mixed markers, clear the marker field
2375                            block.marker = None;
2376                        }
2377
2378                        // Reset tracked state for issue #148 optimization
2379                        reset_tracking_state(
2380                            list_item,
2381                            &mut has_list_breaking_content_since_last_item,
2382                            &mut min_continuation_for_tracking,
2383                        );
2384                    } else {
2385                        // End current block and start a new one
2386
2387                        list_blocks.push(block.clone());
2388
2389                        *block = ListBlock {
2390                            start_line: line_num,
2391                            end_line: line_num,
2392                            is_ordered: list_item.is_ordered,
2393                            marker: if list_item.is_ordered {
2394                                None
2395                            } else {
2396                                Some(list_item.marker.clone())
2397                            },
2398                            blockquote_prefix: blockquote_prefix.clone(),
2399                            item_lines: vec![line_num],
2400                            nesting_level: nesting,
2401                            max_marker_width: if list_item.is_ordered {
2402                                list_item.marker.len() + 1
2403                            } else {
2404                                list_item.marker.len()
2405                            },
2406                        };
2407
2408                        // Initialize tracked state for new block (issue #148 optimization)
2409                        reset_tracking_state(
2410                            list_item,
2411                            &mut has_list_breaking_content_since_last_item,
2412                            &mut min_continuation_for_tracking,
2413                        );
2414                    }
2415                } else {
2416                    // Start a new block
2417                    current_block = Some(ListBlock {
2418                        start_line: line_num,
2419                        end_line: line_num,
2420                        is_ordered: list_item.is_ordered,
2421                        marker: if list_item.is_ordered {
2422                            None
2423                        } else {
2424                            Some(list_item.marker.clone())
2425                        },
2426                        blockquote_prefix,
2427                        item_lines: vec![line_num],
2428                        nesting_level: nesting,
2429                        max_marker_width: list_item.marker.len(),
2430                    });
2431
2432                    // Initialize tracked state for new block (issue #148 optimization)
2433                    reset_tracking_state(
2434                        list_item,
2435                        &mut has_list_breaking_content_since_last_item,
2436                        &mut min_continuation_for_tracking,
2437                    );
2438                }
2439
2440                last_list_item_line = line_num;
2441                current_indent_level = item_indent;
2442                last_marker_width = if list_item.is_ordered {
2443                    list_item.marker.len() + 1 // Add 1 for the space after ordered list markers
2444                } else {
2445                    list_item.marker.len()
2446                };
2447            } else if let Some(ref mut block) = current_block {
2448                // Not a list item - check if it continues the current block
2449
2450                // For MD032 compatibility, we use a simple approach:
2451                // - Indented lines continue the list
2452                // - Blank lines followed by indented content continue the list
2453                // - Everything else ends the list
2454
2455                // Check if the last line in the list block ended with a backslash (hard line break)
2456                // This handles cases where list items use backslash for hard line breaks
2457                let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
2458                    lines[block.end_line - 1].content(content).trim_end().ends_with('\\')
2459                } else {
2460                    false
2461                };
2462
2463                // Calculate minimum indentation for list continuation
2464                // For ordered lists, use the last marker width (e.g., 3 for "1. ", 4 for "10. ")
2465                // For unordered lists like "- ", content starts at column 2, so continuations need at least 2 spaces
2466                let min_continuation_indent = if block.is_ordered {
2467                    current_indent_level + last_marker_width
2468                } else {
2469                    current_indent_level + 2 // Unordered lists need at least 2 spaces (e.g., "- " = 2 chars)
2470                };
2471
2472                if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
2473                    // Indented line or backslash continuation continues the list
2474                    block.end_line = line_num;
2475                } else if line_info.is_blank {
2476                    // Blank line - check if it's internal to the list or ending it
2477                    // We only include blank lines that are followed by more list content
2478                    let mut check_idx = line_idx + 1;
2479                    let mut found_continuation = false;
2480
2481                    // Skip additional blank lines
2482                    while check_idx < lines.len() && lines[check_idx].is_blank {
2483                        check_idx += 1;
2484                    }
2485
2486                    if check_idx < lines.len() {
2487                        let next_line = &lines[check_idx];
2488                        // Check if followed by indented content (list continuation)
2489                        if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
2490                            found_continuation = true;
2491                        }
2492                        // Check if followed by another list item at the same level
2493                        else if !next_line.in_code_block
2494                            && next_line.list_item.is_some()
2495                            && let Some(item) = &next_line.list_item
2496                        {
2497                            let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
2498                                .find(next_line.content(content))
2499                                .map_or(String::new(), |m| m.as_str().to_string());
2500                            if item.marker_column == current_indent_level
2501                                && item.is_ordered == block.is_ordered
2502                                && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
2503                            {
2504                                // Check if there was meaningful content between the list items (unused now)
2505                                // This variable is kept for potential future use but is currently replaced by has_structural_separators
2506                                let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
2507                                    if let Some(between_line) = lines.get(idx) {
2508                                        let between_content = between_line.content(content);
2509                                        let trimmed = between_content.trim();
2510                                        // Skip empty lines
2511                                        if trimmed.is_empty() {
2512                                            return false;
2513                                        }
2514                                        // Check for meaningful content
2515                                        let line_indent = between_content.len() - between_content.trim_start().len();
2516
2517                                        // Structural separators (code fences, headings, etc.) are meaningful and should BREAK lists
2518                                        if trimmed.starts_with("```")
2519                                            || trimmed.starts_with("~~~")
2520                                            || trimmed.starts_with("---")
2521                                            || trimmed.starts_with("***")
2522                                            || trimmed.starts_with("___")
2523                                            || trimmed.starts_with(">")
2524                                            || trimmed.contains('|') // Tables
2525                                            || between_line.heading.is_some()
2526                                        {
2527                                            return true; // These are structural separators - meaningful content that breaks lists
2528                                        }
2529
2530                                        // Only properly indented content continues the list
2531                                        line_indent >= min_continuation_indent
2532                                    } else {
2533                                        false
2534                                    }
2535                                });
2536
2537                                if block.is_ordered {
2538                                    // For ordered lists: don't continue if there are structural separators
2539                                    // Check if there are structural separators between the list items
2540                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2541                                        if let Some(between_line) = lines.get(idx) {
2542                                            let trimmed = between_line.content(content).trim();
2543                                            if trimmed.is_empty() {
2544                                                return false;
2545                                            }
2546                                            // Check for structural separators that break lists
2547                                            trimmed.starts_with("```")
2548                                                || trimmed.starts_with("~~~")
2549                                                || trimmed.starts_with("---")
2550                                                || trimmed.starts_with("***")
2551                                                || trimmed.starts_with("___")
2552                                                || trimmed.starts_with(">")
2553                                                || trimmed.contains('|') // Tables
2554                                                || between_line.heading.is_some()
2555                                        } else {
2556                                            false
2557                                        }
2558                                    });
2559                                    found_continuation = !has_structural_separators;
2560                                } else {
2561                                    // For unordered lists: also check for structural separators
2562                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2563                                        if let Some(between_line) = lines.get(idx) {
2564                                            let trimmed = between_line.content(content).trim();
2565                                            if trimmed.is_empty() {
2566                                                return false;
2567                                            }
2568                                            // Check for structural separators that break lists
2569                                            trimmed.starts_with("```")
2570                                                || trimmed.starts_with("~~~")
2571                                                || trimmed.starts_with("---")
2572                                                || trimmed.starts_with("***")
2573                                                || trimmed.starts_with("___")
2574                                                || trimmed.starts_with(">")
2575                                                || trimmed.contains('|') // Tables
2576                                                || between_line.heading.is_some()
2577                                        } else {
2578                                            false
2579                                        }
2580                                    });
2581                                    found_continuation = !has_structural_separators;
2582                                }
2583                            }
2584                        }
2585                    }
2586
2587                    if found_continuation {
2588                        // Include the blank line in the block
2589                        block.end_line = line_num;
2590                    } else {
2591                        // Blank line ends the list - don't include it
2592                        list_blocks.push(block.clone());
2593                        current_block = None;
2594                    }
2595                } else {
2596                    // Check for lazy continuation - non-indented line immediately after a list item
2597                    // But only if the line has sufficient indentation for the list type
2598                    let min_required_indent = if block.is_ordered {
2599                        current_indent_level + last_marker_width
2600                    } else {
2601                        current_indent_level + 2
2602                    };
2603
2604                    // For lazy continuation to apply, the line must either:
2605                    // 1. Have no indentation (true lazy continuation)
2606                    // 2. Have sufficient indentation for the list type
2607                    // BUT structural separators (headings, code blocks, etc.) should never be lazy continuations
2608                    let line_content = line_info.content(content).trim();
2609                    let is_structural_separator = line_info.heading.is_some()
2610                        || line_content.starts_with("```")
2611                        || line_content.starts_with("~~~")
2612                        || line_content.starts_with("---")
2613                        || line_content.starts_with("***")
2614                        || line_content.starts_with("___")
2615                        || line_content.starts_with(">")
2616                        || (line_content.contains('|')
2617                            && !line_content.contains("](")
2618                            && !line_content.contains("http")
2619                            && (line_content.matches('|').count() > 1
2620                                || line_content.starts_with('|')
2621                                || line_content.ends_with('|'))); // Tables
2622
2623                    // Allow lazy continuation if we're still within the same list block
2624                    // (not just immediately after a list item)
2625                    let is_lazy_continuation = !is_structural_separator
2626                        && !line_info.is_blank
2627                        && (line_info.indent == 0 || line_info.indent >= min_required_indent);
2628
2629                    if is_lazy_continuation {
2630                        // Additional check: if the line starts with uppercase and looks like a new sentence,
2631                        // it's probably not a continuation
2632                        let content_to_check = if !blockquote_prefix.is_empty() {
2633                            // Strip blockquote prefix to check the actual content
2634                            line_info
2635                                .content(content)
2636                                .strip_prefix(&blockquote_prefix)
2637                                .unwrap_or(line_info.content(content))
2638                                .trim()
2639                        } else {
2640                            line_info.content(content).trim()
2641                        };
2642
2643                        let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
2644
2645                        // If it starts with uppercase and the previous line ended with punctuation,
2646                        // it's likely a new paragraph, not a continuation
2647                        if starts_with_uppercase && last_list_item_line > 0 {
2648                            // This looks like a new paragraph
2649                            list_blocks.push(block.clone());
2650                            current_block = None;
2651                        } else {
2652                            // This is a lazy continuation line
2653                            block.end_line = line_num;
2654                        }
2655                    } else {
2656                        // Non-indented, non-blank line that's not a lazy continuation - end the block
2657                        list_blocks.push(block.clone());
2658                        current_block = None;
2659                    }
2660                }
2661            }
2662        }
2663
2664        // Don't forget the last block
2665        if let Some(block) = current_block {
2666            list_blocks.push(block);
2667        }
2668
2669        // Merge adjacent blocks that should be one
2670        merge_adjacent_list_blocks(content, &mut list_blocks, lines);
2671
2672        list_blocks
2673    }
2674
2675    /// Compute character frequency for fast content analysis
2676    fn compute_char_frequency(content: &str) -> CharFrequency {
2677        let mut frequency = CharFrequency::default();
2678
2679        for ch in content.chars() {
2680            match ch {
2681                '#' => frequency.hash_count += 1,
2682                '*' => frequency.asterisk_count += 1,
2683                '_' => frequency.underscore_count += 1,
2684                '-' => frequency.hyphen_count += 1,
2685                '+' => frequency.plus_count += 1,
2686                '>' => frequency.gt_count += 1,
2687                '|' => frequency.pipe_count += 1,
2688                '[' => frequency.bracket_count += 1,
2689                '`' => frequency.backtick_count += 1,
2690                '<' => frequency.lt_count += 1,
2691                '!' => frequency.exclamation_count += 1,
2692                '\n' => frequency.newline_count += 1,
2693                _ => {}
2694            }
2695        }
2696
2697        frequency
2698    }
2699
2700    /// Parse HTML tags in the content
2701    fn parse_html_tags(
2702        content: &str,
2703        lines: &[LineInfo],
2704        code_blocks: &[(usize, usize)],
2705        flavor: MarkdownFlavor,
2706    ) -> Vec<HtmlTag> {
2707        static HTML_TAG_REGEX: LazyLock<regex::Regex> =
2708            LazyLock::new(|| regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap());
2709
2710        let mut html_tags = Vec::with_capacity(content.matches('<').count());
2711
2712        for cap in HTML_TAG_REGEX.captures_iter(content) {
2713            let full_match = cap.get(0).unwrap();
2714            let match_start = full_match.start();
2715            let match_end = full_match.end();
2716
2717            // Skip if in code block
2718            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2719                continue;
2720            }
2721
2722            let is_closing = !cap.get(1).unwrap().as_str().is_empty();
2723            let tag_name_original = cap.get(2).unwrap().as_str();
2724            let tag_name = tag_name_original.to_lowercase();
2725            let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
2726
2727            // Skip JSX components in MDX files (tags starting with uppercase letter)
2728            // JSX components like <Chart />, <MyComponent> should not be treated as HTML
2729            if flavor.supports_jsx() && tag_name_original.chars().next().is_some_and(|c| c.is_uppercase()) {
2730                continue;
2731            }
2732
2733            // Find which line this tag is on
2734            let mut line_num = 1;
2735            let mut col_start = match_start;
2736            let mut col_end = match_end;
2737            for (idx, line_info) in lines.iter().enumerate() {
2738                if match_start >= line_info.byte_offset {
2739                    line_num = idx + 1;
2740                    col_start = match_start - line_info.byte_offset;
2741                    col_end = match_end - line_info.byte_offset;
2742                } else {
2743                    break;
2744                }
2745            }
2746
2747            html_tags.push(HtmlTag {
2748                line: line_num,
2749                start_col: col_start,
2750                end_col: col_end,
2751                byte_offset: match_start,
2752                byte_end: match_end,
2753                tag_name,
2754                is_closing,
2755                is_self_closing,
2756                raw_content: full_match.as_str().to_string(),
2757            });
2758        }
2759
2760        html_tags
2761    }
2762
2763    /// Parse emphasis spans in the content
2764    fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
2765        static EMPHASIS_REGEX: LazyLock<regex::Regex> =
2766            LazyLock::new(|| regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap());
2767
2768        let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
2769
2770        for cap in EMPHASIS_REGEX.captures_iter(content) {
2771            let full_match = cap.get(0).unwrap();
2772            let match_start = full_match.start();
2773            let match_end = full_match.end();
2774
2775            // Skip if in code block
2776            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2777                continue;
2778            }
2779
2780            let opening_markers = cap.get(1).unwrap().as_str();
2781            let content_part = cap.get(2).unwrap().as_str();
2782            let closing_markers = cap.get(3).unwrap().as_str();
2783
2784            // Validate matching markers
2785            if opening_markers.chars().next() != closing_markers.chars().next()
2786                || opening_markers.len() != closing_markers.len()
2787            {
2788                continue;
2789            }
2790
2791            let marker = opening_markers.chars().next().unwrap();
2792            let marker_count = opening_markers.len();
2793
2794            // Find which line this emphasis is on
2795            let mut line_num = 1;
2796            let mut col_start = match_start;
2797            let mut col_end = match_end;
2798            for (idx, line_info) in lines.iter().enumerate() {
2799                if match_start >= line_info.byte_offset {
2800                    line_num = idx + 1;
2801                    col_start = match_start - line_info.byte_offset;
2802                    col_end = match_end - line_info.byte_offset;
2803                } else {
2804                    break;
2805                }
2806            }
2807
2808            emphasis_spans.push(EmphasisSpan {
2809                line: line_num,
2810                start_col: col_start,
2811                end_col: col_end,
2812                byte_offset: match_start,
2813                byte_end: match_end,
2814                marker,
2815                marker_count,
2816                content: content_part.to_string(),
2817            });
2818        }
2819
2820        emphasis_spans
2821    }
2822
2823    /// Parse table rows in the content
2824    fn parse_table_rows(content: &str, lines: &[LineInfo]) -> Vec<TableRow> {
2825        let mut table_rows = Vec::with_capacity(lines.len() / 20);
2826
2827        for (line_idx, line_info) in lines.iter().enumerate() {
2828            // Skip lines in code blocks or blank lines
2829            if line_info.in_code_block || line_info.is_blank {
2830                continue;
2831            }
2832
2833            let line = line_info.content(content);
2834            let line_num = line_idx + 1;
2835
2836            // Check if this line contains pipes (potential table row)
2837            if !line.contains('|') {
2838                continue;
2839            }
2840
2841            // Count columns by splitting on pipes
2842            let parts: Vec<&str> = line.split('|').collect();
2843            let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
2844
2845            // Check if this is a separator row
2846            let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
2847            let mut column_alignments = Vec::new();
2848
2849            if is_separator {
2850                for part in &parts[1..parts.len() - 1] {
2851                    // Skip first and last empty parts
2852                    let trimmed = part.trim();
2853                    let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
2854                        "center".to_string()
2855                    } else if trimmed.ends_with(':') {
2856                        "right".to_string()
2857                    } else if trimmed.starts_with(':') {
2858                        "left".to_string()
2859                    } else {
2860                        "none".to_string()
2861                    };
2862                    column_alignments.push(alignment);
2863                }
2864            }
2865
2866            table_rows.push(TableRow {
2867                line: line_num,
2868                is_separator,
2869                column_count,
2870                column_alignments,
2871            });
2872        }
2873
2874        table_rows
2875    }
2876
2877    /// Parse bare URLs and emails in the content
2878    fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
2879        let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
2880
2881        // Check for bare URLs (not in angle brackets or markdown links)
2882        for cap in BARE_URL_PATTERN.captures_iter(content) {
2883            let full_match = cap.get(0).unwrap();
2884            let match_start = full_match.start();
2885            let match_end = full_match.end();
2886
2887            // Skip if in code block
2888            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2889                continue;
2890            }
2891
2892            // Skip if already in angle brackets or markdown links
2893            let preceding_char = if match_start > 0 {
2894                content.chars().nth(match_start - 1)
2895            } else {
2896                None
2897            };
2898            let following_char = content.chars().nth(match_end);
2899
2900            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2901                continue;
2902            }
2903            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2904                continue;
2905            }
2906
2907            let url = full_match.as_str();
2908            let url_type = if url.starts_with("https://") {
2909                "https"
2910            } else if url.starts_with("http://") {
2911                "http"
2912            } else if url.starts_with("ftp://") {
2913                "ftp"
2914            } else {
2915                "other"
2916            };
2917
2918            // Find which line this URL is on
2919            let mut line_num = 1;
2920            let mut col_start = match_start;
2921            let mut col_end = match_end;
2922            for (idx, line_info) in lines.iter().enumerate() {
2923                if match_start >= line_info.byte_offset {
2924                    line_num = idx + 1;
2925                    col_start = match_start - line_info.byte_offset;
2926                    col_end = match_end - line_info.byte_offset;
2927                } else {
2928                    break;
2929                }
2930            }
2931
2932            bare_urls.push(BareUrl {
2933                line: line_num,
2934                start_col: col_start,
2935                end_col: col_end,
2936                byte_offset: match_start,
2937                byte_end: match_end,
2938                url: url.to_string(),
2939                url_type: url_type.to_string(),
2940            });
2941        }
2942
2943        // Check for bare email addresses
2944        for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
2945            let full_match = cap.get(0).unwrap();
2946            let match_start = full_match.start();
2947            let match_end = full_match.end();
2948
2949            // Skip if in code block
2950            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2951                continue;
2952            }
2953
2954            // Skip if already in angle brackets or markdown links
2955            let preceding_char = if match_start > 0 {
2956                content.chars().nth(match_start - 1)
2957            } else {
2958                None
2959            };
2960            let following_char = content.chars().nth(match_end);
2961
2962            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2963                continue;
2964            }
2965            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2966                continue;
2967            }
2968
2969            let email = full_match.as_str();
2970
2971            // Find which line this email is on
2972            let mut line_num = 1;
2973            let mut col_start = match_start;
2974            let mut col_end = match_end;
2975            for (idx, line_info) in lines.iter().enumerate() {
2976                if match_start >= line_info.byte_offset {
2977                    line_num = idx + 1;
2978                    col_start = match_start - line_info.byte_offset;
2979                    col_end = match_end - line_info.byte_offset;
2980                } else {
2981                    break;
2982                }
2983            }
2984
2985            bare_urls.push(BareUrl {
2986                line: line_num,
2987                start_col: col_start,
2988                end_col: col_end,
2989                byte_offset: match_start,
2990                byte_end: match_end,
2991                url: email.to_string(),
2992                url_type: "email".to_string(),
2993            });
2994        }
2995
2996        bare_urls
2997    }
2998}
2999
3000/// Merge adjacent list blocks that should be treated as one
3001fn merge_adjacent_list_blocks(content: &str, list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
3002    if list_blocks.len() < 2 {
3003        return;
3004    }
3005
3006    let mut merger = ListBlockMerger::new(content, lines);
3007    *list_blocks = merger.merge(list_blocks);
3008}
3009
/// Helper struct to manage the complex logic of merging list blocks.
///
/// Holds borrowed views of the document so the merge predicates can inspect
/// the lines that lie between two candidate blocks.
struct ListBlockMerger<'a> {
    /// Full document text; passed to `LineInfo::content` to obtain a line's text.
    content: &'a str,
    /// Per-line metadata, looked up with `line_number - 1` (line numbers are 1-based).
    lines: &'a [LineInfo],
}
3015
3016impl<'a> ListBlockMerger<'a> {
3017    fn new(content: &'a str, lines: &'a [LineInfo]) -> Self {
3018        Self { content, lines }
3019    }
3020
3021    fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
3022        let mut merged = Vec::with_capacity(list_blocks.len());
3023        let mut current = list_blocks[0].clone();
3024
3025        for next in list_blocks.iter().skip(1) {
3026            if self.should_merge_blocks(&current, next) {
3027                current = self.merge_two_blocks(current, next);
3028            } else {
3029                merged.push(current);
3030                current = next.clone();
3031            }
3032        }
3033
3034        merged.push(current);
3035        merged
3036    }
3037
3038    /// Determine if two adjacent list blocks should be merged
3039    fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
3040        // Basic compatibility checks
3041        if !self.blocks_are_compatible(current, next) {
3042            return false;
3043        }
3044
3045        // Check spacing and content between blocks
3046        let spacing = self.analyze_spacing_between(current, next);
3047        match spacing {
3048            BlockSpacing::Consecutive => true,
3049            BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
3050            BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
3051                self.can_merge_with_content_between(current, next)
3052            }
3053        }
3054    }
3055
3056    /// Check if blocks have compatible structure for merging
3057    fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
3058        current.is_ordered == next.is_ordered
3059            && current.blockquote_prefix == next.blockquote_prefix
3060            && current.nesting_level == next.nesting_level
3061    }
3062
3063    /// Analyze the spacing between two list blocks
3064    fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
3065        let gap = next.start_line - current.end_line;
3066
3067        match gap {
3068            1 => BlockSpacing::Consecutive,
3069            2 => BlockSpacing::SingleBlank,
3070            _ if gap > 2 => {
3071                if self.has_only_blank_lines_between(current, next) {
3072                    BlockSpacing::MultipleBlanks
3073                } else {
3074                    BlockSpacing::ContentBetween
3075                }
3076            }
3077            _ => BlockSpacing::Consecutive, // gap == 0, overlapping (shouldn't happen)
3078        }
3079    }
3080
3081    /// Check if unordered lists can be merged with a single blank line between
3082    fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3083        // Check if there are structural separators between the blocks
3084        // If has_meaningful_content_between returns true, it means there are structural separators
3085        if has_meaningful_content_between(self.content, current, next, self.lines) {
3086            return false; // Structural separators prevent merging
3087        }
3088
3089        // Only merge unordered lists with same marker across single blank
3090        !current.is_ordered && current.marker == next.marker
3091    }
3092
3093    /// Check if ordered lists can be merged when there's content between them
3094    fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3095        // Do not merge lists if there are structural separators between them
3096        if has_meaningful_content_between(self.content, current, next, self.lines) {
3097            return false; // Structural separators prevent merging
3098        }
3099
3100        // Only consider merging ordered lists if there's no structural content between
3101        current.is_ordered && next.is_ordered
3102    }
3103
3104    /// Check if there are only blank lines between blocks
3105    fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3106        for line_num in (current.end_line + 1)..next.start_line {
3107            if let Some(line_info) = self.lines.get(line_num - 1)
3108                && !line_info.content(self.content).trim().is_empty()
3109            {
3110                return false;
3111            }
3112        }
3113        true
3114    }
3115
3116    /// Merge two compatible list blocks into one
3117    fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
3118        current.end_line = next.end_line;
3119        current.item_lines.extend_from_slice(&next.item_lines);
3120
3121        // Update max marker width
3122        current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
3123
3124        // Handle marker consistency for unordered lists
3125        if !current.is_ordered && self.markers_differ(&current, next) {
3126            current.marker = None; // Mixed markers
3127        }
3128
3129        current
3130    }
3131
3132    /// Check if two blocks have different markers
3133    fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
3134        current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
3135    }
3136}
3137
/// Types of spacing between list blocks, as classified from the difference
/// between the next block's start line and the current block's end line.
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    /// No gap between blocks
    Consecutive,
    /// One blank line between blocks
    SingleBlank,
    /// Multiple blank lines but no content
    MultipleBlanks,
    /// Content exists between blocks
    ContentBetween,
}
3146
3147/// Check if there's meaningful content (not just blank lines) between two list blocks
3148fn has_meaningful_content_between(content: &str, current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
3149    // Check lines between current.end_line and next.start_line
3150    for line_num in (current.end_line + 1)..next.start_line {
3151        if let Some(line_info) = lines.get(line_num - 1) {
3152            // Convert to 0-indexed
3153            let trimmed = line_info.content(content).trim();
3154
3155            // Skip empty lines
3156            if trimmed.is_empty() {
3157                continue;
3158            }
3159
3160            // Check for structural separators that should separate lists (CommonMark compliant)
3161
3162            // Headings separate lists
3163            if line_info.heading.is_some() {
3164                return true; // Has meaningful content - headings separate lists
3165            }
3166
3167            // Horizontal rules separate lists (---, ***, ___)
3168            if is_horizontal_rule(trimmed) {
3169                return true; // Has meaningful content - horizontal rules separate lists
3170            }
3171
3172            // Tables separate lists (lines containing | but not in URLs or code)
3173            // Simple heuristic: tables typically have | at start/end or multiple |
3174            if trimmed.contains('|') && trimmed.len() > 1 {
3175                // Don't treat URLs with | as tables
3176                if !trimmed.contains("](") && !trimmed.contains("http") {
3177                    // More robust check: tables usually have multiple | or | at edges
3178                    let pipe_count = trimmed.matches('|').count();
3179                    if pipe_count > 1 || trimmed.starts_with('|') || trimmed.ends_with('|') {
3180                        return true; // Has meaningful content - tables separate lists
3181                    }
3182                }
3183            }
3184
3185            // Blockquotes separate lists
3186            if trimmed.starts_with('>') {
3187                return true; // Has meaningful content - blockquotes separate lists
3188            }
3189
3190            // Code block fences separate lists (unless properly indented as list content)
3191            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
3192                let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
3193
3194                // Check if this code block is properly indented as list continuation
3195                let min_continuation_indent = if current.is_ordered {
3196                    current.nesting_level + current.max_marker_width + 1 // +1 for space after marker
3197                } else {
3198                    current.nesting_level + 2
3199                };
3200
3201                if line_indent < min_continuation_indent {
3202                    // This is a standalone code block that separates lists
3203                    return true; // Has meaningful content - standalone code blocks separate lists
3204                }
3205            }
3206
3207            // Check if this line has proper indentation for list continuation
3208            let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
3209
3210            // Calculate minimum indentation needed to be list continuation
3211            let min_indent = if current.is_ordered {
3212                current.nesting_level + current.max_marker_width
3213            } else {
3214                current.nesting_level + 2
3215            };
3216
3217            // If the line is not indented enough to be list continuation, it's meaningful content
3218            if line_indent < min_indent {
3219                return true; // Has meaningful content - content not indented as list continuation
3220            }
3221
3222            // If we reach here, the line is properly indented as list continuation
3223            // Continue checking other lines
3224        }
3225    }
3226
3227    // Only blank lines or properly indented list continuation content between blocks
3228    false
3229}
3230
/// Check if a line is a horizontal rule (---, ***, ___).
///
/// `trimmed` is expected to have no surrounding whitespace. The line qualifies
/// when it starts with `-`, `*` or `_`, contains that same character at least
/// three times, and holds nothing else except spaces and tabs.
fn is_horizontal_rule(trimmed: &str) -> bool {
    // The first character picks the rule marker; anything shorter than three
    // bytes cannot hold three markers.
    let marker = match trimmed.chars().next() {
        Some(c @ ('-' | '*' | '_')) if trimmed.len() >= 3 => c,
        _ => return false,
    };

    // Only the marker itself and inline whitespace may appear on the line.
    let only_rule_chars = trimmed.chars().all(|c| c == marker || c == ' ' || c == '\t');

    only_rule_chars && trimmed.chars().filter(|&c| c == marker).count() >= 3
}
3254
/// Unit tests for `LintContext`: line offsets, offset-to-line/column mapping,
/// per-line info, list item detection, and MDX ESM block detection.
#[cfg(test)]
mod tests {
    use super::*;

    // Empty input: a single line offset (0) but no line entries.
    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.lines.len(), 0);
    }

    // Single line without trailing newline: line and column are both 1-based.
    #[test]
    fn test_single_line() {
        let ctx = LintContext::new("# Hello", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "# Hello");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(3), (1, 4));
    }

    // Multiple lines including a blank one: each offset is the byte index of a line start.
    #[test]
    fn test_multi_line() {
        let content = "# Title\n\nSecond line\nThird line";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);
        // Test offset to line/col
        assert_eq!(ctx.offset_to_line_col(0), (1, 1)); // start
        assert_eq!(ctx.offset_to_line_col(8), (2, 1)); // start of blank line
        assert_eq!(ctx.offset_to_line_col(9), (3, 1)); // start of 'Second line'
        assert_eq!(ctx.offset_to_line_col(15), (3, 7)); // middle of 'Second line'
        assert_eq!(ctx.offset_to_line_col(21), (4, 1)); // start of 'Third line'
    }

    // Per-line metadata: content, byte offset, indent, blank flag, code-block flag.
    #[test]
    fn test_line_info() {
        let content = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        // Test line info
        assert_eq!(ctx.lines.len(), 7);

        // Line 1: "# Title"
        let line1 = &ctx.lines[0];
        assert_eq!(line1.content(ctx.content), "# Title");
        assert_eq!(line1.byte_offset, 0);
        assert_eq!(line1.indent, 0);
        assert!(!line1.is_blank);
        assert!(!line1.in_code_block);
        assert!(line1.list_item.is_none());

        // Line 2: "    indented"
        let line2 = &ctx.lines[1];
        assert_eq!(line2.content(ctx.content), "    indented");
        assert_eq!(line2.byte_offset, 8);
        assert_eq!(line2.indent, 4);
        assert!(!line2.is_blank);

        // Line 3: "" (blank)
        let line3 = &ctx.lines[2];
        assert_eq!(line3.content(ctx.content), "");
        assert!(line3.is_blank);

        // Test helper methods
        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
        assert_eq!(ctx.line_info(1).map(|l| l.indent), Some(0));
        assert_eq!(ctx.line_info(2).map(|l| l.indent), Some(4));
    }

    // List items: marker text, ordered flag, marker/content columns, and parsed number.
    #[test]
    fn test_list_item_detection() {
        let content = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        // Line 1: "- Unordered item"
        let line1 = &ctx.lines[0];
        assert!(line1.list_item.is_some());
        let list1 = line1.list_item.as_ref().unwrap();
        assert_eq!(list1.marker, "-");
        assert!(!list1.is_ordered);
        assert_eq!(list1.marker_column, 0);
        assert_eq!(list1.content_column, 2);

        // Line 2: "  * Nested item"
        let line2 = &ctx.lines[1];
        assert!(line2.list_item.is_some());
        let list2 = line2.list_item.as_ref().unwrap();
        assert_eq!(list2.marker, "*");
        assert_eq!(list2.marker_column, 2);

        // Line 3: "1. Ordered item"
        let line3 = &ctx.lines[2];
        assert!(line3.list_item.is_some());
        let list3 = line3.list_item.as_ref().unwrap();
        assert_eq!(list3.marker, "1.");
        assert!(list3.is_ordered);
        assert_eq!(list3.number, Some(1));

        // Line 6: "Not a list"
        let line6 = &ctx.lines[5];
        assert!(line6.list_item.is_none());
    }

    // Offsets at and just past each character, including the end of the content.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let content = "a\nb\nc";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        // line_offsets: [0, 2, 4]
        assert_eq!(ctx.offset_to_line_col(0), (1, 1)); // 'a'
        assert_eq!(ctx.offset_to_line_col(1), (1, 2)); // after 'a'
        assert_eq!(ctx.offset_to_line_col(2), (2, 1)); // 'b'
        assert_eq!(ctx.offset_to_line_col(3), (2, 2)); // after 'b'
        assert_eq!(ctx.offset_to_line_col(4), (3, 1)); // 'c'
        assert_eq!(ctx.offset_to_line_col(5), (3, 2)); // after 'c'
    }

    // MDX flavor: leading import/export lines are flagged as ESM blocks, later lines are not.
    #[test]
    fn test_mdx_esm_blocks() {
        let content = r##"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall

In {year}, the snowfall was above average.
It was followed by a warm spring which caused
flood conditions in many of the nearby rivers.

<Chart color="#fcb32c" year={year} />
"##;

        let ctx = LintContext::new(content, MarkdownFlavor::MDX);

        // Check that lines 1 and 2 are marked as ESM blocks
        assert_eq!(ctx.lines.len(), 10);
        assert!(ctx.lines[0].in_esm_block, "Line 1 (import) should be in_esm_block");
        assert!(ctx.lines[1].in_esm_block, "Line 2 (export) should be in_esm_block");
        assert!(!ctx.lines[2].in_esm_block, "Line 3 (blank) should NOT be in_esm_block");
        assert!(
            !ctx.lines[3].in_esm_block,
            "Line 4 (heading) should NOT be in_esm_block"
        );
        assert!(!ctx.lines[4].in_esm_block, "Line 5 (blank) should NOT be in_esm_block");
        assert!(!ctx.lines[5].in_esm_block, "Line 6 (text) should NOT be in_esm_block");
    }

    // Standard flavor: the same import/export lines must not be flagged as ESM blocks.
    #[test]
    fn test_mdx_esm_blocks_not_detected_in_standard_flavor() {
        let content = r#"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall
"#;

        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        // ESM blocks should NOT be detected in Standard flavor
        assert!(
            !ctx.lines[0].in_esm_block,
            "Line 1 should NOT be in_esm_block in Standard flavor"
        );
        assert!(
            !ctx.lines[1].in_esm_block,
            "Line 2 should NOT be in_esm_block in Standard flavor"
        );
    }
}
rumdl_lib/lint_context.rs

rumdl_lib/
lint_context.rs