rumdl_lib/
lint_context.rs

1use crate::config::MarkdownFlavor;
2use crate::rules::front_matter_utils::FrontMatterUtils;
3use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
4use pulldown_cmark::{BrokenLink, Event, LinkType, Options, Parser, Tag, TagEnd};
5use regex::Regex;
6use std::borrow::Cow;
7use std::sync::LazyLock;
8
// Comprehensive link pattern that captures both inline links `[text](url "title")`
// and reference links `[text][ref]`.
// Flags: `s` makes `.` match newlines (so a backslash escape may span a line break),
// `x` enables the whitespace-insensitive layout with inline `#` comments used below.
// Note: the pattern does not look at the character before `[`, so it also matches
// the bracketed part of an image (`![alt](url)`).
static LINK_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
        \[((?:[^\[\]\\]|\\.)*)\]          # Link text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
        (?:
            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
            |
            \[([^\]]*)\]      # Reference ID in group 6
        )"#
    ).unwrap()
});

// Image pattern: identical to LINK_PATTERN except for the mandatory leading `!`.
// Capture groups are numbered the same way (alt text 1, URL 2/3, title 4/5, reference ID 6).
// Flags: `s` makes `.` match newlines, `x` enables the commented layout.
static IMAGE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
        !\[((?:[^\[\]\\]|\\.)*)\]         # Alt text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
        (?:
            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
            |
            \[([^\]]*)\]      # Reference ID in group 6
        )"#
    ).unwrap()
});

// Reference definition pattern: `[label]: url "title"` on a single line (multi-line mode),
// indented by at most three spaces.
// Group 1 = label, group 2 = URL (run of non-whitespace), group 3/4 = optional title
// in double/single quotes.
static REF_DEF_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#).unwrap());

// Pattern for bare URLs with an `http`, `https` or `ftp` scheme (scheme in group 1).
// Whitespace, angle/square brackets, parentheses, backslashes, quotes and backticks
// terminate the match; an optional port, path, query and fragment may follow the host.
static BARE_URL_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
    ).unwrap()
});

// Pattern for email addresses: local part, `@`, domain, and an alphabetic TLD of
// at least two characters. ASCII only.
static BARE_EMAIL_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap());

// Pattern for blockquote prefix in parse_list_blocks.
// Group 1 captures the whole prefix: optional leading whitespace, one or more
// consecutive `>` characters, and any whitespace that follows them.
static BLOCKQUOTE_PREFIX_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*>+\s*)").unwrap());
54
55/// Pre-computed information about a line
56#[derive(Debug, Clone)]
57pub struct LineInfo {
58    /// Byte offset where this line starts in the document
59    pub byte_offset: usize,
60    /// Length of the line in bytes (without newline)
61    pub byte_len: usize,
62    /// Number of leading spaces/tabs
63    pub indent: usize,
64    /// Whether the line is blank (empty or only whitespace)
65    pub is_blank: bool,
66    /// Whether this line is inside a code block
67    pub in_code_block: bool,
68    /// Whether this line is inside front matter
69    pub in_front_matter: bool,
70    /// Whether this line is inside an HTML block
71    pub in_html_block: bool,
72    /// Whether this line is inside an HTML comment
73    pub in_html_comment: bool,
74    /// List item information if this line starts a list item
75    pub list_item: Option<ListItemInfo>,
76    /// Heading information if this line is a heading
77    pub heading: Option<HeadingInfo>,
78    /// Blockquote information if this line is a blockquote
79    pub blockquote: Option<BlockquoteInfo>,
80    /// Whether this line is inside a mkdocstrings autodoc block
81    pub in_mkdocstrings: bool,
82    /// Whether this line is part of an ESM import/export block (MDX only)
83    pub in_esm_block: bool,
84}
85
86impl LineInfo {
87    /// Get the line content as a string slice from the source document
88    pub fn content<'a>(&self, source: &'a str) -> &'a str {
89        &source[self.byte_offset..self.byte_offset + self.byte_len]
90    }
91}
92
/// Details of a list item marker found on a line
#[derive(Clone, Debug)]
pub struct ListItemInfo {
    /// Marker text: `*`, `-`, `+`, or a number followed by `.` or `)`
    pub marker: String,
    /// `true` for an ordered item, `false` for an unordered one
    pub is_ordered: bool,
    /// Item number (ordered lists)
    pub number: Option<usize>,
    /// 0-based column at which the marker begins
    pub marker_column: usize,
    /// Column at which the content following the marker begins
    pub content_column: usize,
}
107
/// Heading style type
///
/// Equality on this enum is total, so `Eq` is derived alongside `PartialEq`;
/// `Hash` is derived as well so the style can be used as a set or map key.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum HeadingStyle {
    /// ATX style heading (# Heading)
    ATX,
    /// Setext style heading with = underline
    Setext1,
    /// Setext style heading with - underline
    Setext2,
}
118
119/// Parsed link information
120#[derive(Debug, Clone)]
121pub struct ParsedLink<'a> {
122    /// Line number (1-indexed)
123    pub line: usize,
124    /// Start column (0-indexed) in the line
125    pub start_col: usize,
126    /// End column (0-indexed) in the line
127    pub end_col: usize,
128    /// Byte offset in document
129    pub byte_offset: usize,
130    /// End byte offset in document
131    pub byte_end: usize,
132    /// Link text
133    pub text: Cow<'a, str>,
134    /// Link URL or reference
135    pub url: Cow<'a, str>,
136    /// Whether this is a reference link [text][ref] vs inline [text](url)
137    pub is_reference: bool,
138    /// Reference ID for reference links
139    pub reference_id: Option<Cow<'a, str>>,
140    /// Link type from pulldown-cmark
141    pub link_type: LinkType,
142}
143
/// A broken link as reported by pulldown-cmark
#[derive(Clone, Debug)]
pub struct BrokenLinkInfo {
    /// Reference text that could not be resolved
    pub reference: String,
    /// Byte range of the link in the source document
    pub span: std::ops::Range<usize>,
}
152
153/// Parsed image information
154#[derive(Debug, Clone)]
155pub struct ParsedImage<'a> {
156    /// Line number (1-indexed)
157    pub line: usize,
158    /// Start column (0-indexed) in the line
159    pub start_col: usize,
160    /// End column (0-indexed) in the line
161    pub end_col: usize,
162    /// Byte offset in document
163    pub byte_offset: usize,
164    /// End byte offset in document
165    pub byte_end: usize,
166    /// Alt text
167    pub alt_text: Cow<'a, str>,
168    /// Image URL or reference
169    pub url: Cow<'a, str>,
170    /// Whether this is a reference image ![alt][ref] vs inline ![alt](url)
171    pub is_reference: bool,
172    /// Reference ID for reference images
173    pub reference_id: Option<Cow<'a, str>>,
174    /// Link type from pulldown-cmark
175    pub link_type: LinkType,
176}
177
/// A reference definition of the form `[ref]: url "title"`
#[derive(Clone, Debug)]
pub struct ReferenceDef {
    /// 1-indexed line number
    pub line: usize,
    /// Reference ID, normalized to lowercase
    pub id: String,
    /// Target URL
    pub url: String,
    /// Title, when one is given
    pub title: Option<String>,
    /// Byte offset at which the definition begins
    pub byte_offset: usize,
    /// Byte offset at which the definition ends
    pub byte_end: usize,
}
194
/// An inline code span found in the document
#[derive(Clone, Debug)]
pub struct CodeSpan {
    /// 1-indexed line number
    pub line: usize,
    /// 0-indexed column within the line where the span starts
    pub start_col: usize,
    /// 0-indexed column within the line where the span ends
    pub end_col: usize,
    /// Starting byte offset within the document
    pub byte_offset: usize,
    /// Ending byte offset within the document
    pub byte_end: usize,
    /// How many backticks delimit the span (1, 2, 3, ...)
    pub backtick_count: usize,
    /// Text between the backtick delimiters
    pub content: String,
}
213
214/// Information about a heading
215#[derive(Debug, Clone)]
216pub struct HeadingInfo {
217    /// Heading level (1-6 for ATX, 1-2 for Setext)
218    pub level: u8,
219    /// Style of heading
220    pub style: HeadingStyle,
221    /// The heading marker (# characters or underline)
222    pub marker: String,
223    /// Column where the marker starts (0-based)
224    pub marker_column: usize,
225    /// Column where heading text starts
226    pub content_column: usize,
227    /// The heading text (without markers and without custom ID syntax)
228    pub text: String,
229    /// Custom header ID if present (e.g., from {#custom-id} syntax)
230    pub custom_id: Option<String>,
231    /// Original heading text including custom ID syntax
232    pub raw_text: String,
233    /// Whether it has a closing sequence (for ATX)
234    pub has_closing_sequence: bool,
235    /// The closing sequence if present
236    pub closing_sequence: String,
237}
238
/// Details of a blockquote line
#[derive(Clone, Debug)]
pub struct BlockquoteInfo {
    /// Depth of nesting (1 for `>`, 2 for `>>`, and so on)
    pub nesting_level: usize,
    /// Indentation that precedes the blockquote marker
    pub indent: String,
    /// 0-based column at which the first `>` appears
    pub marker_column: usize,
    /// Blockquote prefix (e.g. "> ", ">> ")
    pub prefix: String,
    /// Text following the blockquote marker(s)
    pub content: String,
    /// True when no space follows the marker
    pub has_no_space_after_marker: bool,
    /// True when more than one space follows the marker
    pub has_multiple_spaces_after_marker: bool,
    /// True when this is an empty blockquote line that needs the MD028 fix
    pub needs_md028_fix: bool,
}
259
/// Details of a contiguous list block
#[derive(Clone, Debug)]
pub struct ListBlock {
    /// 1-indexed line on which the list begins
    pub start_line: usize,
    /// 1-indexed line on which the list ends
    pub end_line: usize,
    /// `true` for an ordered list, `false` for an unordered one
    pub is_ordered: bool,
    /// Marker used consistently by an unordered list, if there is one
    pub marker: Option<String>,
    /// Blockquote prefix of the list (empty outside a blockquote)
    pub blockquote_prefix: String,
    /// Lines within the block that are list items
    pub item_lines: Vec<usize>,
    /// Depth of nesting (0 for top-level lists)
    pub nesting_level: usize,
    /// Widest marker seen in the block (e.g. 3 for "1. ", 4 for "10. ")
    pub max_marker_width: usize,
}
280
281use std::sync::{Arc, Mutex};
282
/// Per-character occurrence counts used for quick content checks
#[derive(Clone, Debug, Default)]
pub struct CharFrequency {
    /// Number of `#` characters (headings)
    pub hash_count: usize,
    /// Number of `*` characters (emphasis, lists, horizontal rules)
    pub asterisk_count: usize,
    /// Number of `_` characters (emphasis, horizontal rules)
    pub underscore_count: usize,
    /// Number of `-` characters (lists, horizontal rules, setext headings)
    pub hyphen_count: usize,
    /// Number of `+` characters (lists)
    pub plus_count: usize,
    /// Number of `>` characters (blockquotes)
    pub gt_count: usize,
    /// Number of `|` characters (tables)
    pub pipe_count: usize,
    /// Number of `[` characters (links, images)
    pub bracket_count: usize,
    /// Number of backtick characters (code spans, code blocks)
    pub backtick_count: usize,
    /// Number of `<` characters (HTML tags, autolinks)
    pub lt_count: usize,
    /// Number of `!` characters (images)
    pub exclamation_count: usize,
    /// Number of newline characters
    pub newline_count: usize,
}
311
/// An HTML tag found in the document
#[derive(Clone, Debug)]
pub struct HtmlTag {
    /// 1-indexed line number
    pub line: usize,
    /// 0-indexed column within the line where the tag starts
    pub start_col: usize,
    /// 0-indexed column within the line where the tag ends
    pub end_col: usize,
    /// Starting byte offset within the document
    pub byte_offset: usize,
    /// Ending byte offset within the document
    pub byte_end: usize,
    /// Name of the tag (e.g. "div", "img", "br")
    pub tag_name: String,
    /// True for a closing tag (`</tag>`)
    pub is_closing: bool,
    /// True for a self-closing tag (`<tag />`)
    pub is_self_closing: bool,
    /// The tag's raw content
    pub raw_content: String,
}
334
/// An emphasis span found in the document
#[derive(Clone, Debug)]
pub struct EmphasisSpan {
    /// 1-indexed line number
    pub line: usize,
    /// 0-indexed column within the line where the span starts
    pub start_col: usize,
    /// 0-indexed column within the line where the span ends
    pub end_col: usize,
    /// Starting byte offset within the document
    pub byte_offset: usize,
    /// Ending byte offset within the document
    pub byte_end: usize,
    /// Emphasis marker character (`*` or `_`)
    pub marker: char,
    /// How many markers are used (1 = italic, 2 = bold, 3+ = bold+italic)
    pub marker_count: usize,
    /// Text enclosed by the emphasis markers
    pub content: String,
}
355
/// A table row found in the document
#[derive(Clone, Debug)]
pub struct TableRow {
    /// 1-indexed line number
    pub line: usize,
    /// True for a separator row (only `|`, `-`, `:` and spaces)
    pub is_separator: bool,
    /// How many pipe-separated cells the row has
    pub column_count: usize,
    /// Per-column alignment taken from the separator row:
    /// "left", "center", "right" or "none"
    pub column_alignments: Vec<String>,
}
368
/// A bare URL (one that is not part of a link) found in the document
#[derive(Clone, Debug)]
pub struct BareUrl {
    /// 1-indexed line number
    pub line: usize,
    /// 0-indexed column within the line where the URL starts
    pub start_col: usize,
    /// 0-indexed column within the line where the URL ends
    pub end_col: usize,
    /// Starting byte offset within the document
    pub byte_offset: usize,
    /// Ending byte offset within the document
    pub byte_end: usize,
    /// The URL text
    pub url: String,
    /// Kind of URL: "http", "https", "ftp" or "email"
    pub url_type: String,
}
387
388pub struct LintContext<'a> {
389    pub content: &'a str,
390    pub line_offsets: Vec<usize>,
391    pub code_blocks: Vec<(usize, usize)>, // Cached code block ranges (not including inline code spans)
392    pub lines: Vec<LineInfo>,             // Pre-computed line information
393    pub links: Vec<ParsedLink<'a>>,       // Pre-parsed links
394    pub images: Vec<ParsedImage<'a>>,     // Pre-parsed images
395    pub broken_links: Vec<BrokenLinkInfo>, // Broken/undefined references
396    pub reference_defs: Vec<ReferenceDef>, // Reference definitions
397    code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>, // Lazy-loaded inline code spans
398    pub list_blocks: Vec<ListBlock>,      // Pre-parsed list blocks
399    pub char_frequency: CharFrequency,    // Character frequency analysis
400    html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>, // Lazy-loaded HTML tags
401    emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>, // Lazy-loaded emphasis spans
402    table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>, // Lazy-loaded table rows
403    bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>, // Lazy-loaded bare URLs
404    html_comment_ranges: Vec<crate::utils::skip_context::ByteRange>, // Pre-computed HTML comment ranges
405    pub table_blocks: Vec<crate::utils::table_utils::TableBlock>, // Pre-computed table blocks
406    pub line_index: crate::utils::range_utils::LineIndex<'a>, // Pre-computed line index for byte position calculations
407    jinja_ranges: Vec<(usize, usize)>,    // Pre-computed Jinja template ranges ({{ }}, {% %})
408    pub flavor: MarkdownFlavor,           // Markdown flavor being used
409}
410
/// Borrowed pieces of a blockquote line, in the order they appear
struct BlockquoteComponents<'a> {
    /// Spaces/tabs that precede the first `>`
    indent: &'a str,
    /// The run of consecutive `>` characters
    markers: &'a str,
    /// Spaces/tabs that directly follow the markers
    spaces_after: &'a str,
    /// Remainder of the line
    content: &'a str,
}

/// Split `line` into its indent, `>` markers, the blanks after them, and the
/// remaining content, without using a regex.
///
/// Returns `None` when the first character after the leading spaces/tabs is
/// not `>` (this includes empty and all-blank lines). Only an unbroken run of
/// `>` counts as markers, so for `"> > x"` the content is `"> x"`.
#[inline]
fn parse_blockquote_detailed(line: &str) -> Option<BlockquoteComponents<'_>> {
    fn is_blank(c: char) -> bool {
        c == ' ' || c == '\t'
    }

    // Peel each component off the front and keep the remainder.
    let after_indent = line.trim_start_matches(is_blank);
    let after_markers = after_indent.trim_start_matches('>');
    if after_markers.len() == after_indent.len() {
        // Nothing was stripped: there is no `>` marker here.
        return None;
    }
    let content = after_markers.trim_start_matches(is_blank);

    // Each component is the prefix that the corresponding trim removed.
    let (indent, _) = line.split_at(line.len() - after_indent.len());
    let (markers, _) = after_indent.split_at(after_indent.len() - after_markers.len());
    let (spaces_after, _) = after_markers.split_at(after_markers.len() - content.len());

    Some(BlockquoteComponents {
        indent,
        markers,
        spaces_after,
        content,
    })
}
455
456impl<'a> LintContext<'a> {
457    pub fn new(content: &'a str, flavor: MarkdownFlavor) -> Self {
458        use std::time::Instant;
459        let profile = std::env::var("RUMDL_PROFILE_QUADRATIC").is_ok();
460
461        let start = Instant::now();
462        let mut line_offsets = vec![0];
463        for (i, c) in content.char_indices() {
464            if c == '\n' {
465                line_offsets.push(i + 1);
466            }
467        }
468        if profile {
469            eprintln!("[PROFILE] Line offsets: {:?}", start.elapsed());
470        }
471
472        // Detect code blocks once and cache them
473        let start = Instant::now();
474        let code_blocks = CodeBlockUtils::detect_code_blocks(content);
475        if profile {
476            eprintln!("[PROFILE] Code blocks: {:?}", start.elapsed());
477        }
478
479        // Pre-compute HTML comment ranges ONCE for all operations
480        let start = Instant::now();
481        let html_comment_ranges = crate::utils::skip_context::compute_html_comment_ranges(content);
482        if profile {
483            eprintln!("[PROFILE] HTML comment ranges: {:?}", start.elapsed());
484        }
485
486        // Pre-compute autodoc block ranges for MkDocs flavor (avoids O(n²) scaling)
487        let start = Instant::now();
488        let autodoc_ranges = if flavor == MarkdownFlavor::MkDocs {
489            crate::utils::mkdocstrings_refs::detect_autodoc_block_ranges(content)
490        } else {
491            Vec::new()
492        };
493        if profile {
494            eprintln!("[PROFILE] Autodoc block ranges: {:?}", start.elapsed());
495        }
496
497        // Pre-compute line information (without headings/blockquotes yet)
498        let start = Instant::now();
499        let mut lines = Self::compute_basic_line_info(
500            content,
501            &line_offsets,
502            &code_blocks,
503            flavor,
504            &html_comment_ranges,
505            &autodoc_ranges,
506        );
507        if profile {
508            eprintln!("[PROFILE] Basic line info: {:?}", start.elapsed());
509        }
510
511        // Detect HTML blocks BEFORE heading detection
512        let start = Instant::now();
513        Self::detect_html_blocks(content, &mut lines);
514        if profile {
515            eprintln!("[PROFILE] HTML blocks: {:?}", start.elapsed());
516        }
517
518        // Detect ESM import/export blocks in MDX files BEFORE heading detection
519        let start = Instant::now();
520        Self::detect_esm_blocks(content, &mut lines, flavor);
521        if profile {
522            eprintln!("[PROFILE] ESM blocks: {:?}", start.elapsed());
523        }
524
525        // Now detect headings and blockquotes
526        let start = Instant::now();
527        Self::detect_headings_and_blockquotes(content, &mut lines, flavor, &html_comment_ranges);
528        if profile {
529            eprintln!("[PROFILE] Headings & blockquotes: {:?}", start.elapsed());
530        }
531
532        // Parse code spans early so we can exclude them from link/image parsing
533        let start = Instant::now();
534        let code_spans = Self::parse_code_spans(content, &lines);
535        if profile {
536            eprintln!("[PROFILE] Code spans: {:?}", start.elapsed());
537        }
538
539        // Parse links, images, references, and list blocks
540        let start = Instant::now();
541        let (links, broken_links) =
542            Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor, &html_comment_ranges);
543        if profile {
544            eprintln!("[PROFILE] Links: {:?}", start.elapsed());
545        }
546
547        let start = Instant::now();
548        let images = Self::parse_images(content, &lines, &code_blocks, &code_spans, &html_comment_ranges);
549        if profile {
550            eprintln!("[PROFILE] Images: {:?}", start.elapsed());
551        }
552
553        let start = Instant::now();
554        let reference_defs = Self::parse_reference_defs(content, &lines);
555        if profile {
556            eprintln!("[PROFILE] Reference defs: {:?}", start.elapsed());
557        }
558
559        let start = Instant::now();
560        let list_blocks = Self::parse_list_blocks(content, &lines);
561        if profile {
562            eprintln!("[PROFILE] List blocks: {:?}", start.elapsed());
563        }
564
565        // Compute character frequency for fast content analysis
566        let start = Instant::now();
567        let char_frequency = Self::compute_char_frequency(content);
568        if profile {
569            eprintln!("[PROFILE] Char frequency: {:?}", start.elapsed());
570        }
571
572        // Pre-compute table blocks for rules that need them (MD013, MD055, MD056, MD058, MD060)
573        let start = Instant::now();
574        let table_blocks = crate::utils::table_utils::TableUtils::find_table_blocks_with_code_info(
575            content,
576            &code_blocks,
577            &code_spans,
578            &html_comment_ranges,
579        );
580        if profile {
581            eprintln!("[PROFILE] Table blocks: {:?}", start.elapsed());
582        }
583
584        // Pre-compute LineIndex once for all rules (eliminates 46x content cloning)
585        let start = Instant::now();
586        let line_index = crate::utils::range_utils::LineIndex::new(content);
587        if profile {
588            eprintln!("[PROFILE] Line index: {:?}", start.elapsed());
589        }
590
591        // Pre-compute Jinja template ranges once for all rules (eliminates O(n×m) in MD011)
592        let start = Instant::now();
593        let jinja_ranges = crate::utils::jinja_utils::find_jinja_ranges(content);
594        if profile {
595            eprintln!("[PROFILE] Jinja ranges: {:?}", start.elapsed());
596        }
597
598        Self {
599            content,
600            line_offsets,
601            code_blocks,
602            lines,
603            links,
604            images,
605            broken_links,
606            reference_defs,
607            code_spans_cache: Mutex::new(Some(Arc::new(code_spans))),
608            list_blocks,
609            char_frequency,
610            html_tags_cache: Mutex::new(None),
611            emphasis_spans_cache: Mutex::new(None),
612            table_rows_cache: Mutex::new(None),
613            bare_urls_cache: Mutex::new(None),
614            html_comment_ranges,
615            table_blocks,
616            line_index,
617            jinja_ranges,
618            flavor,
619        }
620    }
621
622    /// Get code spans - computed lazily on first access
623    pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
624        let mut cache = self.code_spans_cache.lock().expect("Code spans cache mutex poisoned");
625
626        Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_code_spans(self.content, &self.lines))))
627    }
628
629    /// Get HTML comment ranges - pre-computed during LintContext construction
630    pub fn html_comment_ranges(&self) -> &[crate::utils::skip_context::ByteRange] {
631        &self.html_comment_ranges
632    }
633
634    /// Get HTML tags - computed lazily on first access
635    pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
636        let mut cache = self.html_tags_cache.lock().expect("HTML tags cache mutex poisoned");
637
638        Arc::clone(cache.get_or_insert_with(|| {
639            Arc::new(Self::parse_html_tags(
640                self.content,
641                &self.lines,
642                &self.code_blocks,
643                self.flavor,
644            ))
645        }))
646    }
647
648    /// Get emphasis spans - computed lazily on first access
649    pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
650        let mut cache = self
651            .emphasis_spans_cache
652            .lock()
653            .expect("Emphasis spans cache mutex poisoned");
654
655        Arc::clone(
656            cache.get_or_insert_with(|| {
657                Arc::new(Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks))
658            }),
659        )
660    }
661
662    /// Get table rows - computed lazily on first access
663    pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
664        let mut cache = self.table_rows_cache.lock().expect("Table rows cache mutex poisoned");
665
666        Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_table_rows(self.content, &self.lines))))
667    }
668
669    /// Get bare URLs - computed lazily on first access
670    pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
671        let mut cache = self.bare_urls_cache.lock().expect("Bare URLs cache mutex poisoned");
672
673        Arc::clone(
674            cache.get_or_insert_with(|| Arc::new(Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks))),
675        )
676    }
677
678    /// Map a byte offset to (line, column)
679    pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
680        match self.line_offsets.binary_search(&offset) {
681            Ok(line) => (line + 1, 1),
682            Err(line) => {
683                let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
684                (line, offset - line_start + 1)
685            }
686        }
687    }
688
689    /// Check if a position is within a code block or code span
690    pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
691        // Check code blocks first
692        if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
693            return true;
694        }
695
696        // Check inline code spans (lazy load if needed)
697        self.code_spans()
698            .iter()
699            .any(|span| pos >= span.byte_offset && pos < span.byte_end)
700    }
701
702    /// Get line information by line number (1-indexed)
703    pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
704        if line_num > 0 {
705            self.lines.get(line_num - 1)
706        } else {
707            None
708        }
709    }
710
711    /// Get byte offset for a line number (1-indexed)
712    pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
713        self.line_info(line_num).map(|info| info.byte_offset)
714    }
715
716    /// Get URL for a reference link/image by its ID
717    pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
718        let normalized_id = ref_id.to_lowercase();
719        self.reference_defs
720            .iter()
721            .find(|def| def.id == normalized_id)
722            .map(|def| def.url.as_str())
723    }
724
725    /// Check if a line is part of a list block
726    pub fn is_in_list_block(&self, line_num: usize) -> bool {
727        self.list_blocks
728            .iter()
729            .any(|block| line_num >= block.start_line && line_num <= block.end_line)
730    }
731
732    /// Get the list block containing a specific line
733    pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
734        self.list_blocks
735            .iter()
736            .find(|block| line_num >= block.start_line && line_num <= block.end_line)
737    }
738
739    // Compatibility methods for DocumentStructure migration
740
741    /// Check if a line is within a code block
742    pub fn is_in_code_block(&self, line_num: usize) -> bool {
743        if line_num == 0 || line_num > self.lines.len() {
744            return false;
745        }
746        self.lines[line_num - 1].in_code_block
747    }
748
749    /// Check if a line is within front matter
750    pub fn is_in_front_matter(&self, line_num: usize) -> bool {
751        if line_num == 0 || line_num > self.lines.len() {
752            return false;
753        }
754        self.lines[line_num - 1].in_front_matter
755    }
756
757    /// Check if a line is within an HTML block
758    pub fn is_in_html_block(&self, line_num: usize) -> bool {
759        if line_num == 0 || line_num > self.lines.len() {
760            return false;
761        }
762        self.lines[line_num - 1].in_html_block
763    }
764
765    /// Check if a line and column is within a code span
766    pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
767        if line_num == 0 || line_num > self.lines.len() {
768            return false;
769        }
770
771        // Use the code spans cache to check
772        // Note: col is 1-indexed from caller, but span.start_col and span.end_col are 0-indexed
773        // Convert col to 0-indexed for comparison
774        let col_0indexed = if col > 0 { col - 1 } else { 0 };
775        let code_spans = self.code_spans();
776        code_spans
777            .iter()
778            .any(|span| span.line == line_num && col_0indexed >= span.start_col && col_0indexed < span.end_col)
779    }
780
781    /// Check if a byte position is within a reference definition
782    /// This is much faster than scanning the content with regex for each check (O(1) vs O(n))
783    #[inline]
784    pub fn is_in_reference_def(&self, byte_pos: usize) -> bool {
785        self.reference_defs
786            .iter()
787            .any(|ref_def| byte_pos >= ref_def.byte_offset && byte_pos < ref_def.byte_end)
788    }
789
790    /// Check if a byte position is within an HTML comment
791    /// This is much faster than scanning the content with regex for each check (O(k) vs O(n))
792    /// where k is the number of HTML comments (typically very small)
793    #[inline]
794    pub fn is_in_html_comment(&self, byte_pos: usize) -> bool {
795        self.html_comment_ranges
796            .iter()
797            .any(|range| byte_pos >= range.start && byte_pos < range.end)
798    }
799
800    /// Check if a byte position is within a Jinja template ({{ }} or {% %})
801    pub fn is_in_jinja_range(&self, byte_pos: usize) -> bool {
802        self.jinja_ranges
803            .iter()
804            .any(|(start, end)| byte_pos >= *start && byte_pos < *end)
805    }
806
807    /// Check if content has any instances of a specific character (fast)
808    pub fn has_char(&self, ch: char) -> bool {
809        match ch {
810            '#' => self.char_frequency.hash_count > 0,
811            '*' => self.char_frequency.asterisk_count > 0,
812            '_' => self.char_frequency.underscore_count > 0,
813            '-' => self.char_frequency.hyphen_count > 0,
814            '+' => self.char_frequency.plus_count > 0,
815            '>' => self.char_frequency.gt_count > 0,
816            '|' => self.char_frequency.pipe_count > 0,
817            '[' => self.char_frequency.bracket_count > 0,
818            '`' => self.char_frequency.backtick_count > 0,
819            '<' => self.char_frequency.lt_count > 0,
820            '!' => self.char_frequency.exclamation_count > 0,
821            '\n' => self.char_frequency.newline_count > 0,
822            _ => self.content.contains(ch), // Fallback for other characters
823        }
824    }
825
826    /// Get count of a specific character (fast)
827    pub fn char_count(&self, ch: char) -> usize {
828        match ch {
829            '#' => self.char_frequency.hash_count,
830            '*' => self.char_frequency.asterisk_count,
831            '_' => self.char_frequency.underscore_count,
832            '-' => self.char_frequency.hyphen_count,
833            '+' => self.char_frequency.plus_count,
834            '>' => self.char_frequency.gt_count,
835            '|' => self.char_frequency.pipe_count,
836            '[' => self.char_frequency.bracket_count,
837            '`' => self.char_frequency.backtick_count,
838            '<' => self.char_frequency.lt_count,
839            '!' => self.char_frequency.exclamation_count,
840            '\n' => self.char_frequency.newline_count,
841            _ => self.content.matches(ch).count(), // Fallback for other characters
842        }
843    }
844
845    /// Check if content likely contains headings (fast)
846    pub fn likely_has_headings(&self) -> bool {
847        self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 // Potential setext underlines
848    }
849
850    /// Check if content likely contains lists (fast)
851    pub fn likely_has_lists(&self) -> bool {
852        self.char_frequency.asterisk_count > 0
853            || self.char_frequency.hyphen_count > 0
854            || self.char_frequency.plus_count > 0
855    }
856
857    /// Check if content likely contains emphasis (fast)
858    pub fn likely_has_emphasis(&self) -> bool {
859        self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
860    }
861
862    /// Check if content likely contains tables (fast)
863    pub fn likely_has_tables(&self) -> bool {
864        self.char_frequency.pipe_count > 2
865    }
866
867    /// Check if content likely contains blockquotes (fast)
868    pub fn likely_has_blockquotes(&self) -> bool {
869        self.char_frequency.gt_count > 0
870    }
871
872    /// Check if content likely contains code (fast)
873    pub fn likely_has_code(&self) -> bool {
874        self.char_frequency.backtick_count > 0
875    }
876
877    /// Check if content likely contains links or images (fast)
878    pub fn likely_has_links_or_images(&self) -> bool {
879        self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
880    }
881
882    /// Check if content likely contains HTML (fast)
883    pub fn likely_has_html(&self) -> bool {
884        self.char_frequency.lt_count > 0
885    }
886
887    /// Get HTML tags on a specific line
888    pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
889        self.html_tags()
890            .iter()
891            .filter(|tag| tag.line == line_num)
892            .cloned()
893            .collect()
894    }
895
896    /// Get emphasis spans on a specific line
897    pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
898        self.emphasis_spans()
899            .iter()
900            .filter(|span| span.line == line_num)
901            .cloned()
902            .collect()
903    }
904
905    /// Get table rows on a specific line
906    pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
907        self.table_rows()
908            .iter()
909            .filter(|row| row.line == line_num)
910            .cloned()
911            .collect()
912    }
913
914    /// Get bare URLs on a specific line
915    pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
916        self.bare_urls()
917            .iter()
918            .filter(|url| url.line == line_num)
919            .cloned()
920            .collect()
921    }
922
923    /// Find the line index for a given byte offset using binary search.
924    /// Returns (line_index, line_number, column) where:
925    /// - line_index is the 0-based index in the lines array
926    /// - line_number is the 1-based line number
927    /// - column is the byte offset within that line
928    #[inline]
929    fn find_line_for_offset(lines: &[LineInfo], byte_offset: usize) -> (usize, usize, usize) {
930        // Binary search to find the line containing this byte offset
931        let idx = match lines.binary_search_by(|line| {
932            if byte_offset < line.byte_offset {
933                std::cmp::Ordering::Greater
934            } else if byte_offset > line.byte_offset + line.byte_len {
935                std::cmp::Ordering::Less
936            } else {
937                std::cmp::Ordering::Equal
938            }
939        }) {
940            Ok(idx) => idx,
941            Err(idx) => idx.saturating_sub(1),
942        };
943
944        let line = &lines[idx];
945        let line_num = idx + 1;
946        let col = byte_offset.saturating_sub(line.byte_offset);
947
948        (idx, line_num, col)
949    }
950
951    /// Check if a byte offset is within a code span using binary search
952    #[inline]
953    fn is_offset_in_code_span(code_spans: &[CodeSpan], offset: usize) -> bool {
954        // Since spans are sorted by byte_offset, use partition_point for binary search
955        let idx = code_spans.partition_point(|span| span.byte_offset <= offset);
956
957        // Check the span that starts at or before our offset
958        if idx > 0 {
959            let span = &code_spans[idx - 1];
960            if offset >= span.byte_offset && offset < span.byte_end {
961                return true;
962            }
963        }
964
965        false
966    }
967
968    /// Parse all links in the content
969    fn parse_links(
970        content: &'a str,
971        lines: &[LineInfo],
972        code_blocks: &[(usize, usize)],
973        code_spans: &[CodeSpan],
974        flavor: MarkdownFlavor,
975        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
976    ) -> (Vec<ParsedLink<'a>>, Vec<BrokenLinkInfo>) {
977        use crate::utils::skip_context::{is_in_html_comment_ranges, is_mkdocs_snippet_line};
978        use std::collections::HashSet;
979
980        let mut links = Vec::with_capacity(content.len() / 500);
981        let mut broken_links = Vec::new();
982
983        // Track byte positions of links found by pulldown-cmark
984        let mut found_positions = HashSet::new();
985
986        // Use pulldown-cmark's streaming parser with BrokenLink callback
987        // The callback captures undefined references: [text][undefined], [shortcut], [text][]
988        // This automatically handles:
989        // - Escaped links (won't generate events)
990        // - Links in code blocks/spans (won't generate Link events)
991        // - Images (generates Tag::Image instead)
992        // - Reference resolution (dest_url is already resolved!)
993        // - Broken references (callback is invoked)
994        // - Wiki-links (enabled via ENABLE_WIKILINKS)
995        let mut options = Options::empty();
996        options.insert(Options::ENABLE_WIKILINKS);
997
998        let parser = Parser::new_with_broken_link_callback(
999            content,
1000            options,
1001            Some(|link: BrokenLink<'_>| {
1002                broken_links.push(BrokenLinkInfo {
1003                    reference: link.reference.to_string(),
1004                    span: link.span.clone(),
1005                });
1006                None
1007            }),
1008        )
1009        .into_offset_iter();
1010
1011        let mut link_stack: Vec<(
1012            usize,
1013            usize,
1014            pulldown_cmark::CowStr<'a>,
1015            LinkType,
1016            pulldown_cmark::CowStr<'a>,
1017        )> = Vec::new();
1018        let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1019
1020        for (event, range) in parser {
1021            match event {
1022                Event::Start(Tag::Link {
1023                    link_type,
1024                    dest_url,
1025                    id,
1026                    ..
1027                }) => {
1028                    // Link start - record position, URL, and reference ID
1029                    link_stack.push((range.start, range.end, dest_url, link_type, id));
1030                    text_chunks.clear();
1031                }
1032                Event::Text(text) if !link_stack.is_empty() => {
1033                    // Track text content with its byte range
1034                    text_chunks.push((text.to_string(), range.start, range.end));
1035                }
1036                Event::Code(code) if !link_stack.is_empty() => {
1037                    // Include inline code in link text (with backticks)
1038                    let code_text = format!("`{code}`");
1039                    text_chunks.push((code_text, range.start, range.end));
1040                }
1041                Event::End(TagEnd::Link) => {
1042                    if let Some((start_pos, _link_start_end, url, link_type, ref_id)) = link_stack.pop() {
1043                        // Skip if in HTML comment
1044                        if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1045                            text_chunks.clear();
1046                            continue;
1047                        }
1048
1049                        // Find line and column information
1050                        let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1051
1052                        // Skip if this link is on a MkDocs snippet line
1053                        if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1054                            text_chunks.clear();
1055                            continue;
1056                        }
1057
1058                        let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1059
1060                        let is_reference = matches!(
1061                            link_type,
1062                            LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1063                        );
1064
1065                        // Extract link text directly from source bytes to preserve escaping
1066                        // Text events from pulldown-cmark unescape \] → ], which breaks MD039
1067                        let link_text = if start_pos < content.len() {
1068                            let link_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1069
1070                            // Find MATCHING ] by tracking bracket depth for nested brackets
1071                            // An unescaped bracket is one NOT preceded by an odd number of backslashes
1072                            // Brackets inside code spans (between backticks) should be ignored
1073                            let mut close_pos = None;
1074                            let mut depth = 0;
1075                            let mut in_code_span = false;
1076
1077                            for (i, &byte) in link_bytes.iter().enumerate().skip(1) {
1078                                // Count preceding backslashes
1079                                let mut backslash_count = 0;
1080                                let mut j = i;
1081                                while j > 0 && link_bytes[j - 1] == b'\\' {
1082                                    backslash_count += 1;
1083                                    j -= 1;
1084                                }
1085                                let is_escaped = backslash_count % 2 != 0;
1086
1087                                // Track code spans - backticks toggle in/out of code
1088                                if byte == b'`' && !is_escaped {
1089                                    in_code_span = !in_code_span;
1090                                }
1091
1092                                // Only count brackets when NOT in a code span
1093                                if !is_escaped && !in_code_span {
1094                                    if byte == b'[' {
1095                                        depth += 1;
1096                                    } else if byte == b']' {
1097                                        if depth == 0 {
1098                                            // Found the matching closing bracket
1099                                            close_pos = Some(i);
1100                                            break;
1101                                        } else {
1102                                            depth -= 1;
1103                                        }
1104                                    }
1105                                }
1106                            }
1107
1108                            if let Some(pos) = close_pos {
1109                                Cow::Borrowed(std::str::from_utf8(&link_bytes[1..pos]).unwrap_or(""))
1110                            } else {
1111                                Cow::Borrowed("")
1112                            }
1113                        } else {
1114                            Cow::Borrowed("")
1115                        };
1116
1117                        // For reference links, use the actual reference ID from pulldown-cmark
1118                        let reference_id = if is_reference && !ref_id.is_empty() {
1119                            Some(Cow::Owned(ref_id.to_lowercase()))
1120                        } else if is_reference {
1121                            // For collapsed/shortcut references without explicit ID, use the link text
1122                            Some(Cow::Owned(link_text.to_lowercase()))
1123                        } else {
1124                            None
1125                        };
1126
1127                        // WORKAROUND: pulldown-cmark bug with escaped brackets
1128                        // Check for escaped image syntax: \![text](url)
1129                        // The byte_offset points to the '[', so we check 2 bytes back for '\!'
1130                        let has_escaped_bang = start_pos >= 2
1131                            && content.as_bytes().get(start_pos - 2) == Some(&b'\\')
1132                            && content.as_bytes().get(start_pos - 1) == Some(&b'!');
1133
1134                        // Check for escaped bracket: \[text](url)
1135                        // The byte_offset points to the '[', so we check 1 byte back for '\'
1136                        let has_escaped_bracket =
1137                            start_pos >= 1 && content.as_bytes().get(start_pos - 1) == Some(&b'\\');
1138
1139                        if has_escaped_bang || has_escaped_bracket {
1140                            text_chunks.clear();
1141                            continue; // Skip: this is escaped markdown, not a real link
1142                        }
1143
1144                        // Track this position as found
1145                        found_positions.insert(start_pos);
1146
1147                        links.push(ParsedLink {
1148                            line: line_num,
1149                            start_col: col_start,
1150                            end_col: col_end,
1151                            byte_offset: start_pos,
1152                            byte_end: range.end,
1153                            text: link_text,
1154                            url: Cow::Owned(url.to_string()),
1155                            is_reference,
1156                            reference_id,
1157                            link_type,
1158                        });
1159
1160                        text_chunks.clear();
1161                    }
1162                }
1163                _ => {}
1164            }
1165        }
1166
1167        // Also find undefined references using regex
1168        // These are patterns like [text][ref] that pulldown-cmark didn't parse as links
1169        // because the reference is undefined
1170        for cap in LINK_PATTERN.captures_iter(content) {
1171            let full_match = cap.get(0).unwrap();
1172            let match_start = full_match.start();
1173            let match_end = full_match.end();
1174
1175            // Skip if this was already found by pulldown-cmark (it's a valid link)
1176            if found_positions.contains(&match_start) {
1177                continue;
1178            }
1179
1180            // Skip if escaped
1181            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1182                continue;
1183            }
1184
1185            // Skip if it's an image
1186            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
1187                continue;
1188            }
1189
1190            // Skip if in code block
1191            if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
1192                continue;
1193            }
1194
1195            // Skip if in code span
1196            if Self::is_offset_in_code_span(code_spans, match_start) {
1197                continue;
1198            }
1199
1200            // Skip if in HTML comment
1201            if is_in_html_comment_ranges(html_comment_ranges, match_start) {
1202                continue;
1203            }
1204
1205            // Find line and column information
1206            let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1207
1208            // Skip if this link is on a MkDocs snippet line
1209            if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1210                continue;
1211            }
1212
1213            let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1214
1215            let text = cap.get(1).map_or("", |m| m.as_str());
1216
1217            // Only process reference links (group 6)
1218            if let Some(ref_id) = cap.get(6) {
1219                let ref_id_str = ref_id.as_str();
1220                let normalized_ref = if ref_id_str.is_empty() {
1221                    Cow::Owned(text.to_lowercase()) // Implicit reference
1222                } else {
1223                    Cow::Owned(ref_id_str.to_lowercase())
1224                };
1225
1226                // This is an undefined reference (pulldown-cmark didn't parse it)
1227                links.push(ParsedLink {
1228                    line: line_num,
1229                    start_col: col_start,
1230                    end_col: col_end,
1231                    byte_offset: match_start,
1232                    byte_end: match_end,
1233                    text: Cow::Borrowed(text),
1234                    url: Cow::Borrowed(""), // Empty URL indicates undefined reference
1235                    is_reference: true,
1236                    reference_id: Some(normalized_ref),
1237                    link_type: LinkType::Reference, // Undefined references are reference-style
1238                });
1239            }
1240        }
1241
1242        (links, broken_links)
1243    }
1244
1245    /// Parse all images in the content
1246    fn parse_images(
1247        content: &'a str,
1248        lines: &[LineInfo],
1249        code_blocks: &[(usize, usize)],
1250        code_spans: &[CodeSpan],
1251        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1252    ) -> Vec<ParsedImage<'a>> {
1253        use crate::utils::skip_context::is_in_html_comment_ranges;
1254        use std::collections::HashSet;
1255
1256        // Pre-size based on a heuristic: images are less common than links
1257        let mut images = Vec::with_capacity(content.len() / 1000);
1258        let mut found_positions = HashSet::new();
1259
1260        // Use pulldown-cmark for parsing - more accurate and faster
1261        let parser = Parser::new(content).into_offset_iter();
1262        let mut image_stack: Vec<(usize, pulldown_cmark::CowStr<'a>, LinkType, pulldown_cmark::CowStr<'a>)> =
1263            Vec::new();
1264        let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1265
1266        for (event, range) in parser {
1267            match event {
1268                Event::Start(Tag::Image {
1269                    link_type,
1270                    dest_url,
1271                    id,
1272                    ..
1273                }) => {
1274                    image_stack.push((range.start, dest_url, link_type, id));
1275                    text_chunks.clear();
1276                }
1277                Event::Text(text) if !image_stack.is_empty() => {
1278                    text_chunks.push((text.to_string(), range.start, range.end));
1279                }
1280                Event::Code(code) if !image_stack.is_empty() => {
1281                    let code_text = format!("`{code}`");
1282                    text_chunks.push((code_text, range.start, range.end));
1283                }
1284                Event::End(TagEnd::Image) => {
1285                    if let Some((start_pos, url, link_type, ref_id)) = image_stack.pop() {
1286                        // Skip if in code block
1287                        if CodeBlockUtils::is_in_code_block(code_blocks, start_pos) {
1288                            continue;
1289                        }
1290
1291                        // Skip if in code span
1292                        if Self::is_offset_in_code_span(code_spans, start_pos) {
1293                            continue;
1294                        }
1295
1296                        // Skip if in HTML comment
1297                        if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1298                            continue;
1299                        }
1300
1301                        // Find line and column using binary search
1302                        let (_, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1303                        let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1304
1305                        let is_reference = matches!(
1306                            link_type,
1307                            LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1308                        );
1309
1310                        // Extract alt text directly from source bytes to preserve escaping
1311                        // Text events from pulldown-cmark unescape \] → ], which breaks rules that need escaping
1312                        let alt_text = if start_pos < content.len() {
1313                            let image_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1314
1315                            // Find MATCHING ] by tracking bracket depth for nested brackets
1316                            // An unescaped bracket is one NOT preceded by an odd number of backslashes
1317                            let mut close_pos = None;
1318                            let mut depth = 0;
1319
1320                            if image_bytes.len() > 2 {
1321                                for (i, &byte) in image_bytes.iter().enumerate().skip(2) {
1322                                    // Count preceding backslashes
1323                                    let mut backslash_count = 0;
1324                                    let mut j = i;
1325                                    while j > 0 && image_bytes[j - 1] == b'\\' {
1326                                        backslash_count += 1;
1327                                        j -= 1;
1328                                    }
1329                                    let is_escaped = backslash_count % 2 != 0;
1330
1331                                    if !is_escaped {
1332                                        if byte == b'[' {
1333                                            depth += 1;
1334                                        } else if byte == b']' {
1335                                            if depth == 0 {
1336                                                // Found the matching closing bracket
1337                                                close_pos = Some(i);
1338                                                break;
1339                                            } else {
1340                                                depth -= 1;
1341                                            }
1342                                        }
1343                                    }
1344                                }
1345                            }
1346
1347                            if let Some(pos) = close_pos {
1348                                Cow::Borrowed(std::str::from_utf8(&image_bytes[2..pos]).unwrap_or(""))
1349                            } else {
1350                                Cow::Borrowed("")
1351                            }
1352                        } else {
1353                            Cow::Borrowed("")
1354                        };
1355
1356                        let reference_id = if is_reference && !ref_id.is_empty() {
1357                            Some(Cow::Owned(ref_id.to_lowercase()))
1358                        } else if is_reference {
1359                            Some(Cow::Owned(alt_text.to_lowercase())) // Collapsed/shortcut references
1360                        } else {
1361                            None
1362                        };
1363
1364                        found_positions.insert(start_pos);
1365                        images.push(ParsedImage {
1366                            line: line_num,
1367                            start_col: col_start,
1368                            end_col: col_end,
1369                            byte_offset: start_pos,
1370                            byte_end: range.end,
1371                            alt_text,
1372                            url: Cow::Owned(url.to_string()),
1373                            is_reference,
1374                            reference_id,
1375                            link_type,
1376                        });
1377                    }
1378                }
1379                _ => {}
1380            }
1381        }
1382
1383        // Regex fallback for undefined references that pulldown-cmark treats as plain text
1384        for cap in IMAGE_PATTERN.captures_iter(content) {
1385            let full_match = cap.get(0).unwrap();
1386            let match_start = full_match.start();
1387            let match_end = full_match.end();
1388
1389            // Skip if already found by pulldown-cmark
1390            if found_positions.contains(&match_start) {
1391                continue;
1392            }
1393
1394            // Skip if the ! is escaped
1395            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1396                continue;
1397            }
1398
1399            // Skip if in code block, code span, or HTML comment
1400            if CodeBlockUtils::is_in_code_block(code_blocks, match_start)
1401                || Self::is_offset_in_code_span(code_spans, match_start)
1402                || is_in_html_comment_ranges(html_comment_ranges, match_start)
1403            {
1404                continue;
1405            }
1406
1407            // Only process reference images (undefined references not found by pulldown-cmark)
1408            if let Some(ref_id) = cap.get(6) {
1409                let (_, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1410                let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1411                let alt_text = cap.get(1).map_or("", |m| m.as_str());
1412                let ref_id_str = ref_id.as_str();
1413                let normalized_ref = if ref_id_str.is_empty() {
1414                    Cow::Owned(alt_text.to_lowercase())
1415                } else {
1416                    Cow::Owned(ref_id_str.to_lowercase())
1417                };
1418
1419                images.push(ParsedImage {
1420                    line: line_num,
1421                    start_col: col_start,
1422                    end_col: col_end,
1423                    byte_offset: match_start,
1424                    byte_end: match_end,
1425                    alt_text: Cow::Borrowed(alt_text),
1426                    url: Cow::Borrowed(""),
1427                    is_reference: true,
1428                    reference_id: Some(normalized_ref),
1429                    link_type: LinkType::Reference, // Undefined references are reference-style
1430                });
1431            }
1432        }
1433
1434        images
1435    }
1436
1437    /// Parse reference definitions
1438    fn parse_reference_defs(content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
1439        // Pre-size based on lines count as reference definitions are line-based
1440        let mut refs = Vec::with_capacity(lines.len() / 20); // ~1 ref per 20 lines
1441
1442        for (line_idx, line_info) in lines.iter().enumerate() {
1443            // Skip lines in code blocks
1444            if line_info.in_code_block {
1445                continue;
1446            }
1447
1448            let line = line_info.content(content);
1449            let line_num = line_idx + 1;
1450
1451            if let Some(cap) = REF_DEF_PATTERN.captures(line) {
1452                let id = cap.get(1).unwrap().as_str().to_lowercase();
1453                let url = cap.get(2).unwrap().as_str().to_string();
1454                let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
1455
1456                // Calculate byte positions
1457                // The match starts at the beginning of the line (0) and extends to the end
1458                let match_obj = cap.get(0).unwrap();
1459                let byte_offset = line_info.byte_offset + match_obj.start();
1460                let byte_end = line_info.byte_offset + match_obj.end();
1461
1462                refs.push(ReferenceDef {
1463                    line: line_num,
1464                    id,
1465                    url,
1466                    title,
1467                    byte_offset,
1468                    byte_end,
1469                });
1470            }
1471        }
1472
1473        refs
1474    }
1475
1476    /// Fast blockquote prefix parser - replaces regex for 5-10x speedup
1477    /// Matches: ^(\s*>\s*)(.*)
1478    /// Returns: Some((prefix_with_ws, content_after_prefix)) or None
1479    #[inline]
1480    fn parse_blockquote_prefix(line: &str) -> Option<(&str, &str)> {
1481        let trimmed_start = line.trim_start();
1482        if !trimmed_start.starts_with('>') {
1483            return None;
1484        }
1485
1486        let leading_ws_len = line.len() - trimmed_start.len();
1487        let after_gt = &trimmed_start[1..];
1488        let content = after_gt.trim_start();
1489        let ws_after_gt_len = after_gt.len() - content.len();
1490        let prefix_len = leading_ws_len + 1 + ws_after_gt_len;
1491
1492        Some((&line[..prefix_len], content))
1493    }
1494
1495    /// Fast unordered list parser - replaces regex for 5-10x speedup
1496    /// Matches: ^(\s*)([-*+])([ \t]*)(.*)
1497    /// Returns: Some((leading_ws, marker, spacing, content)) or None
1498    #[inline]
1499    fn parse_unordered_list(line: &str) -> Option<(&str, char, &str, &str)> {
1500        let bytes = line.as_bytes();
1501        let mut i = 0;
1502
1503        // Skip leading whitespace
1504        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1505            i += 1;
1506        }
1507
1508        // Check for marker
1509        if i >= bytes.len() {
1510            return None;
1511        }
1512        let marker = bytes[i] as char;
1513        if marker != '-' && marker != '*' && marker != '+' {
1514            return None;
1515        }
1516        let marker_pos = i;
1517        i += 1;
1518
1519        // Collect spacing after marker (space or tab only)
1520        let spacing_start = i;
1521        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1522            i += 1;
1523        }
1524
1525        Some((&line[..marker_pos], marker, &line[spacing_start..i], &line[i..]))
1526    }
1527
1528    /// Fast ordered list parser - replaces regex for 5-10x speedup
1529    /// Matches: ^(\s*)(\d+)([.)])([ \t]*)(.*)
1530    /// Returns: Some((leading_ws, number_str, delimiter, spacing, content)) or None
1531    #[inline]
1532    fn parse_ordered_list(line: &str) -> Option<(&str, &str, char, &str, &str)> {
1533        let bytes = line.as_bytes();
1534        let mut i = 0;
1535
1536        // Skip leading whitespace
1537        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1538            i += 1;
1539        }
1540
1541        // Collect digits
1542        let number_start = i;
1543        while i < bytes.len() && bytes[i].is_ascii_digit() {
1544            i += 1;
1545        }
1546        if i == number_start {
1547            return None; // No digits found
1548        }
1549
1550        // Check for delimiter
1551        if i >= bytes.len() {
1552            return None;
1553        }
1554        let delimiter = bytes[i] as char;
1555        if delimiter != '.' && delimiter != ')' {
1556            return None;
1557        }
1558        let delimiter_pos = i;
1559        i += 1;
1560
1561        // Collect spacing after delimiter (space or tab only)
1562        let spacing_start = i;
1563        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1564            i += 1;
1565        }
1566
1567        Some((
1568            &line[..number_start],
1569            &line[number_start..delimiter_pos],
1570            delimiter,
1571            &line[spacing_start..i],
1572            &line[i..],
1573        ))
1574    }
1575
1576    /// Pre-compute which lines are in code blocks - O(m*n) where m=code_blocks, n=lines
1577    /// Returns a Vec<bool> where index i indicates if line i is in a code block
1578    fn compute_code_block_line_map(content: &str, line_offsets: &[usize], code_blocks: &[(usize, usize)]) -> Vec<bool> {
1579        let num_lines = line_offsets.len();
1580        let mut in_code_block = vec![false; num_lines];
1581
1582        // For each code block, mark all lines within it
1583        for &(start, end) in code_blocks {
1584            // Ensure we're at valid UTF-8 boundaries
1585            let safe_start = if start > 0 && !content.is_char_boundary(start) {
1586                let mut boundary = start;
1587                while boundary > 0 && !content.is_char_boundary(boundary) {
1588                    boundary -= 1;
1589                }
1590                boundary
1591            } else {
1592                start
1593            };
1594
1595            let safe_end = if end < content.len() && !content.is_char_boundary(end) {
1596                let mut boundary = end;
1597                while boundary < content.len() && !content.is_char_boundary(boundary) {
1598                    boundary += 1;
1599                }
1600                boundary
1601            } else {
1602                end.min(content.len())
1603            };
1604
1605            // Trust the code blocks detected by CodeBlockUtils::detect_code_blocks()
1606            // That function now has proper list context awareness (see code_block_utils.rs)
1607            // and correctly distinguishes between:
1608            // - Fenced code blocks (``` or ~~~)
1609            // - Indented code blocks at document level (4 spaces + blank line before)
1610            // - List continuation paragraphs (NOT code blocks, even with 4 spaces)
1611            //
1612            // We no longer need to re-validate here. The original validation logic
1613            // was causing false positives by marking list continuation paragraphs as
1614            // code blocks when they have 4 spaces of indentation.
1615
1616            // Use binary search to find the first and last line indices
1617            // line_offsets is sorted, so we can use partition_point for O(log n) lookup
1618            // Use safe_start/safe_end (UTF-8 boundaries) for consistent line mapping
1619            let first_line = line_offsets.partition_point(|&offset| offset < safe_start);
1620            let last_line = line_offsets.partition_point(|&offset| offset < safe_end);
1621
1622            // Mark all lines in the range at once
1623            for flag in in_code_block.iter_mut().take(last_line).skip(first_line) {
1624                *flag = true;
1625            }
1626        }
1627
1628        in_code_block
1629    }
1630
1631    /// Pre-compute basic line information (without headings/blockquotes)
1632    fn compute_basic_line_info(
1633        content: &str,
1634        line_offsets: &[usize],
1635        code_blocks: &[(usize, usize)],
1636        flavor: MarkdownFlavor,
1637        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1638        autodoc_ranges: &[crate::utils::skip_context::ByteRange],
1639    ) -> Vec<LineInfo> {
1640        let content_lines: Vec<&str> = content.lines().collect();
1641        let mut lines = Vec::with_capacity(content_lines.len());
1642
1643        // Pre-compute which lines are in code blocks
1644        let code_block_map = Self::compute_code_block_line_map(content, line_offsets, code_blocks);
1645
1646        // Detect front matter boundaries FIRST, before any other parsing
1647        // Use FrontMatterUtils to detect all types of front matter (YAML, TOML, JSON, malformed)
1648        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1649
1650        for (i, line) in content_lines.iter().enumerate() {
1651            let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
1652            let indent = line.len() - line.trim_start().len();
1653
1654            // Parse blockquote prefix once and reuse it (avoid redundant parsing)
1655            let blockquote_parse = Self::parse_blockquote_prefix(line);
1656
1657            // For blank detection, consider blockquote context
1658            let is_blank = if let Some((_, content)) = blockquote_parse {
1659                // In blockquote context, check if content after prefix is blank
1660                content.trim().is_empty()
1661            } else {
1662                line.trim().is_empty()
1663            };
1664
1665            // Use pre-computed map for O(1) lookup instead of O(m) iteration
1666            let in_code_block = code_block_map.get(i).copied().unwrap_or(false);
1667
1668            // Detect list items (skip if in frontmatter, in mkdocstrings block, or in HTML comment)
1669            let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
1670                && crate::utils::mkdocstrings_refs::is_within_autodoc_block_ranges(autodoc_ranges, byte_offset);
1671            // Use pre-computed ranges for efficiency (O(log n) vs O(file_size))
1672            let in_html_comment =
1673                crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, byte_offset);
1674            let list_item = if !(in_code_block
1675                || is_blank
1676                || in_mkdocstrings
1677                || in_html_comment
1678                || (front_matter_end > 0 && i < front_matter_end))
1679            {
1680                // Strip blockquote prefix if present for list detection (reuse cached result)
1681                let (line_for_list_check, blockquote_prefix_len) = if let Some((prefix, content)) = blockquote_parse {
1682                    (content, prefix.len())
1683                } else {
1684                    (&**line, 0)
1685                };
1686
1687                if let Some((leading_spaces, marker, spacing, _content)) =
1688                    Self::parse_unordered_list(line_for_list_check)
1689                {
1690                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1691                    let content_column = marker_column + 1 + spacing.len();
1692
1693                    // According to CommonMark spec, unordered list items MUST have at least one space
1694                    // after the marker (-, *, or +). Without a space, it's not a list item.
1695                    // This also naturally handles cases like:
1696                    // - *emphasis* (not a list)
1697                    // - **bold** (not a list)
1698                    // - --- (horizontal rule, not a list)
1699                    if spacing.is_empty() {
1700                        None
1701                    } else {
1702                        Some(ListItemInfo {
1703                            marker: marker.to_string(),
1704                            is_ordered: false,
1705                            number: None,
1706                            marker_column,
1707                            content_column,
1708                        })
1709                    }
1710                } else if let Some((leading_spaces, number_str, delimiter, spacing, _content)) =
1711                    Self::parse_ordered_list(line_for_list_check)
1712                {
1713                    let marker = format!("{number_str}{delimiter}");
1714                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1715                    let content_column = marker_column + marker.len() + spacing.len();
1716
1717                    // According to CommonMark spec, ordered list items MUST have at least one space
1718                    // after the marker (period or parenthesis). Without a space, it's not a list item.
1719                    if spacing.is_empty() {
1720                        None
1721                    } else {
1722                        Some(ListItemInfo {
1723                            marker,
1724                            is_ordered: true,
1725                            number: number_str.parse().ok(),
1726                            marker_column,
1727                            content_column,
1728                        })
1729                    }
1730                } else {
1731                    None
1732                }
1733            } else {
1734                None
1735            };
1736
1737            lines.push(LineInfo {
1738                byte_offset,
1739                byte_len: line.len(),
1740                indent,
1741                is_blank,
1742                in_code_block,
1743                in_front_matter: front_matter_end > 0 && i < front_matter_end,
1744                in_html_block: false, // Will be populated after line creation
1745                in_html_comment,
1746                list_item,
1747                heading: None,    // Will be populated in second pass for Setext headings
1748                blockquote: None, // Will be populated after line creation
1749                in_mkdocstrings,
1750                in_esm_block: false, // Will be populated after line creation for MDX files
1751            });
1752        }
1753
1754        lines
1755    }
1756
1757    /// Detect headings and blockquotes (called after HTML block detection)
1758    fn detect_headings_and_blockquotes(
1759        content: &str,
1760        lines: &mut [LineInfo],
1761        flavor: MarkdownFlavor,
1762        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1763    ) {
1764        // Regex for heading detection
1765        static ATX_HEADING_REGEX: LazyLock<regex::Regex> =
1766            LazyLock::new(|| regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap());
1767        static SETEXT_UNDERLINE_REGEX: LazyLock<regex::Regex> =
1768            LazyLock::new(|| regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap());
1769
1770        let content_lines: Vec<&str> = content.lines().collect();
1771
1772        // Detect front matter boundaries to skip those lines
1773        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1774
1775        // Detect headings (including Setext which needs look-ahead) and blockquotes
1776        for i in 0..lines.len() {
1777            if lines[i].in_code_block {
1778                continue;
1779            }
1780
1781            // Skip lines in front matter
1782            if front_matter_end > 0 && i < front_matter_end {
1783                continue;
1784            }
1785
1786            // Skip lines in HTML blocks - HTML content should not be parsed as markdown
1787            if lines[i].in_html_block {
1788                continue;
1789            }
1790
1791            let line = content_lines[i];
1792
1793            // Check for blockquotes (even on blank lines within blockquotes)
1794            if let Some(bq) = parse_blockquote_detailed(line) {
1795                let nesting_level = bq.markers.len(); // Each '>' is one level
1796                let marker_column = bq.indent.len();
1797
1798                // Build the prefix (indentation + markers + space)
1799                let prefix = format!("{}{}{}", bq.indent, bq.markers, bq.spaces_after);
1800
1801                // Check for various blockquote issues
1802                let has_no_space = bq.spaces_after.is_empty() && !bq.content.is_empty();
1803                // Consider tabs as multiple spaces, or actual multiple spaces
1804                let has_multiple_spaces = bq.spaces_after.len() > 1 || bq.spaces_after.contains('\t');
1805
1806                // Check if needs MD028 fix (empty blockquote line without proper spacing)
1807                // MD028 flags empty blockquote lines that don't have a single space after the marker
1808                // Lines like "> " or ">> " are already correct and don't need fixing
1809                let needs_md028_fix = bq.content.is_empty() && bq.spaces_after.is_empty();
1810
1811                lines[i].blockquote = Some(BlockquoteInfo {
1812                    nesting_level,
1813                    indent: bq.indent.to_string(),
1814                    marker_column,
1815                    prefix,
1816                    content: bq.content.to_string(),
1817                    has_no_space_after_marker: has_no_space,
1818                    has_multiple_spaces_after_marker: has_multiple_spaces,
1819                    needs_md028_fix,
1820                });
1821            }
1822
1823            // Skip heading detection for blank lines
1824            if lines[i].is_blank {
1825                continue;
1826            }
1827
1828            // Check for ATX headings (but skip MkDocs snippet lines)
1829            // In MkDocs flavor, lines like "# -8<- [start:name]" are snippet markers, not headings
1830            let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
1831                crate::utils::mkdocs_snippets::is_snippet_section_start(line)
1832                    || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
1833            } else {
1834                false
1835            };
1836
1837            if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1838                // Skip headings inside HTML comments (using pre-computed ranges for efficiency)
1839                if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset) {
1840                    continue;
1841                }
1842                let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1843                let hashes = caps.get(2).map_or("", |m| m.as_str());
1844                let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1845                let rest = caps.get(4).map_or("", |m| m.as_str());
1846
1847                let level = hashes.len() as u8;
1848                let marker_column = leading_spaces.len();
1849
1850                // Check for closing sequence, but handle custom IDs that might come after
1851                let (text, has_closing, closing_seq) = {
1852                    // First check if there's a custom ID at the end
1853                    let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1854                        // Check if this looks like a valid custom ID (ends with })
1855                        if rest[id_start..].trim_end().ends_with('}') {
1856                            // Split off the custom ID
1857                            (&rest[..id_start], &rest[id_start..])
1858                        } else {
1859                            (rest, "")
1860                        }
1861                    } else {
1862                        (rest, "")
1863                    };
1864
1865                    // Now look for closing hashes in the part before the custom ID
1866                    let trimmed_rest = rest_without_id.trim_end();
1867                    if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1868                        // Look for the start of the hash sequence
1869                        let mut start_of_hashes = last_hash_pos;
1870                        while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1871                            start_of_hashes -= 1;
1872                        }
1873
1874                        // Check if there's at least one space before the closing hashes
1875                        let has_space_before = start_of_hashes == 0
1876                            || trimmed_rest
1877                                .chars()
1878                                .nth(start_of_hashes - 1)
1879                                .is_some_and(|c| c.is_whitespace());
1880
1881                        // Check if this is a valid closing sequence (all hashes to end of trimmed part)
1882                        let potential_closing = &trimmed_rest[start_of_hashes..];
1883                        let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1884
1885                        if is_all_hashes && has_space_before {
1886                            // This is a closing sequence
1887                            let closing_hashes = potential_closing.to_string();
1888                            // The text is everything before the closing hashes
1889                            // Don't include the custom ID here - it will be extracted later
1890                            let text_part = if !custom_id_part.is_empty() {
1891                                // If we have a custom ID, append it back to get the full rest
1892                                // This allows the extract_header_id function to handle it properly
1893                                format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1894                            } else {
1895                                rest_without_id[..start_of_hashes].trim_end().to_string()
1896                            };
1897                            (text_part, true, closing_hashes)
1898                        } else {
1899                            // Not a valid closing sequence, return the full content
1900                            (rest.to_string(), false, String::new())
1901                        }
1902                    } else {
1903                        // No hashes found, return the full content
1904                        (rest.to_string(), false, String::new())
1905                    }
1906                };
1907
1908                let content_column = marker_column + hashes.len() + spaces_after.len();
1909
1910                // Extract custom header ID if present
1911                let raw_text = text.trim().to_string();
1912                let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1913
1914                // If no custom ID was found on the header line, check the next line for standalone attr-list
1915                if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1916                    let next_line = content_lines[i + 1];
1917                    if !lines[i + 1].in_code_block
1918                        && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1919                        && let Some(next_line_id) =
1920                            crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1921                    {
1922                        custom_id = Some(next_line_id);
1923                    }
1924                }
1925
1926                lines[i].heading = Some(HeadingInfo {
1927                    level,
1928                    style: HeadingStyle::ATX,
1929                    marker: hashes.to_string(),
1930                    marker_column,
1931                    content_column,
1932                    text: clean_text,
1933                    custom_id,
1934                    raw_text,
1935                    has_closing_sequence: has_closing,
1936                    closing_sequence: closing_seq,
1937                });
1938            }
1939            // Check for Setext headings (need to look at next line)
1940            else if i + 1 < content_lines.len() && i + 1 < lines.len() {
1941                let next_line = content_lines[i + 1];
1942                if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
1943                    // Skip if next line is front matter delimiter
1944                    if front_matter_end > 0 && i < front_matter_end {
1945                        continue;
1946                    }
1947
1948                    // Skip Setext headings inside HTML comments (using pre-computed ranges for efficiency)
1949                    if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset)
1950                    {
1951                        continue;
1952                    }
1953
1954                    let underline = next_line.trim();
1955
1956                    // Skip if the underline looks like YAML delimiter (exactly 3 or more dashes)
1957                    // YAML uses exactly `---` while Setext headings typically use longer underlines
1958                    if underline == "---" {
1959                        continue;
1960                    }
1961
1962                    // Skip if the current line looks like YAML key-value syntax
1963                    let current_line_trimmed = line.trim();
1964                    if current_line_trimmed.contains(':')
1965                        && !current_line_trimmed.starts_with('#')
1966                        && !current_line_trimmed.contains('[')
1967                        && !current_line_trimmed.contains("](")
1968                    {
1969                        // This looks like "key: value" which suggests YAML, not a heading
1970                        continue;
1971                    }
1972
1973                    let level = if underline.starts_with('=') { 1 } else { 2 };
1974                    let style = if level == 1 {
1975                        HeadingStyle::Setext1
1976                    } else {
1977                        HeadingStyle::Setext2
1978                    };
1979
1980                    // Extract custom header ID if present
1981                    let raw_text = line.trim().to_string();
1982                    let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1983
1984                    // If no custom ID was found on the header line, check the line after underline for standalone attr-list
1985                    if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
1986                        let attr_line = content_lines[i + 2];
1987                        if !lines[i + 2].in_code_block
1988                            && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
1989                            && let Some(attr_line_id) =
1990                                crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
1991                        {
1992                            custom_id = Some(attr_line_id);
1993                        }
1994                    }
1995
1996                    lines[i].heading = Some(HeadingInfo {
1997                        level,
1998                        style,
1999                        marker: underline.to_string(),
2000                        marker_column: next_line.len() - next_line.trim_start().len(),
2001                        content_column: lines[i].indent,
2002                        text: clean_text,
2003                        custom_id,
2004                        raw_text,
2005                        has_closing_sequence: false,
2006                        closing_sequence: String::new(),
2007                    });
2008                }
2009            }
2010        }
2011    }
2012
2013    /// Detect HTML blocks in the content
2014    fn detect_html_blocks(content: &str, lines: &mut [LineInfo]) {
2015        // HTML block elements that trigger block context
2016        const BLOCK_ELEMENTS: &[&str] = &[
2017            "address",
2018            "article",
2019            "aside",
2020            "blockquote",
2021            "details",
2022            "dialog",
2023            "dd",
2024            "div",
2025            "dl",
2026            "dt",
2027            "fieldset",
2028            "figcaption",
2029            "figure",
2030            "footer",
2031            "form",
2032            "h1",
2033            "h2",
2034            "h3",
2035            "h4",
2036            "h5",
2037            "h6",
2038            "header",
2039            "hr",
2040            "li",
2041            "main",
2042            "nav",
2043            "ol",
2044            "p",
2045            "picture",
2046            "pre",
2047            "script",
2048            "section",
2049            "style",
2050            "table",
2051            "tbody",
2052            "td",
2053            "textarea",
2054            "tfoot",
2055            "th",
2056            "thead",
2057            "tr",
2058            "ul",
2059        ];
2060
2061        let mut i = 0;
2062        while i < lines.len() {
2063            // Skip if already in code block or front matter
2064            if lines[i].in_code_block || lines[i].in_front_matter {
2065                i += 1;
2066                continue;
2067            }
2068
2069            let trimmed = lines[i].content(content).trim_start();
2070
2071            // Check if line starts with an HTML tag
2072            if trimmed.starts_with('<') && trimmed.len() > 1 {
2073                // Extract tag name safely
2074                let after_bracket = &trimmed[1..];
2075                let is_closing = after_bracket.starts_with('/');
2076                let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
2077
2078                // Extract tag name (stop at space, >, /, or end of string)
2079                let tag_name = tag_start
2080                    .chars()
2081                    .take_while(|c| c.is_ascii_alphabetic() || *c == '-' || c.is_ascii_digit())
2082                    .collect::<String>()
2083                    .to_lowercase();
2084
2085                // Check if it's a block element
2086                if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
2087                    // Mark this line as in HTML block
2088                    lines[i].in_html_block = true;
2089
2090                    // For simplicity, just mark lines until we find a closing tag or reach a blank line
2091                    // This avoids complex nesting logic that might cause infinite loops
2092                    if !is_closing {
2093                        let closing_tag = format!("</{tag_name}>");
2094                        // style and script tags can contain blank lines (CSS/JS formatting)
2095                        let allow_blank_lines = tag_name == "style" || tag_name == "script";
2096                        let mut j = i + 1;
2097                        while j < lines.len() && j < i + 100 {
2098                            // Limit search to 100 lines
2099                            // Stop at blank lines (except for style/script tags)
2100                            if !allow_blank_lines && lines[j].is_blank {
2101                                break;
2102                            }
2103
2104                            lines[j].in_html_block = true;
2105
2106                            // Check if this line contains the closing tag
2107                            if lines[j].content(content).contains(&closing_tag) {
2108                                break;
2109                            }
2110                            j += 1;
2111                        }
2112                    }
2113                }
2114            }
2115
2116            i += 1;
2117        }
2118    }
2119
2120    /// Detect ESM import/export blocks in MDX files
2121    /// ESM blocks consist of contiguous import/export statements at the top of the file
2122    fn detect_esm_blocks(content: &str, lines: &mut [LineInfo], flavor: MarkdownFlavor) {
2123        // Only process MDX files
2124        if !flavor.supports_esm_blocks() {
2125            return;
2126        }
2127
2128        for line in lines.iter_mut() {
2129            // Skip blank lines and comments at the start
2130            if line.is_blank || line.in_html_comment {
2131                continue;
2132            }
2133
2134            // Check if line starts with import or export
2135            let trimmed = line.content(content).trim_start();
2136            if trimmed.starts_with("import ") || trimmed.starts_with("export ") {
2137                line.in_esm_block = true;
2138            } else {
2139                // Once we hit a non-ESM line, we're done with the ESM block
2140                break;
2141            }
2142        }
2143    }
2144
2145    /// Parse all inline code spans in the content using pulldown-cmark streaming parser
2146    fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
2147        let mut code_spans = Vec::new();
2148
2149        // Quick check - if no backticks, no code spans
2150        if !content.contains('`') {
2151            return code_spans;
2152        }
2153
2154        // Use pulldown-cmark's streaming parser with byte offsets
2155        let parser = Parser::new(content).into_offset_iter();
2156
2157        for (event, range) in parser {
2158            if let Event::Code(_) = event {
2159                let start_pos = range.start;
2160                let end_pos = range.end;
2161
2162                // The range includes the backticks, extract the actual content
2163                let full_span = &content[start_pos..end_pos];
2164                let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
2165
2166                // Extract content between backticks, preserving spaces
2167                let content_start = start_pos + backtick_count;
2168                let content_end = end_pos - backtick_count;
2169                let span_content = if content_start < content_end {
2170                    content[content_start..content_end].to_string()
2171                } else {
2172                    String::new()
2173                };
2174
2175                // Use binary search to find line number - O(log n) instead of O(n)
2176                // Find the rightmost line whose byte_offset <= start_pos
2177                let line_idx = lines
2178                    .partition_point(|line| line.byte_offset <= start_pos)
2179                    .saturating_sub(1);
2180                let line_num = line_idx + 1;
2181                let col_start = start_pos - lines[line_idx].byte_offset;
2182
2183                // Find end column using binary search
2184                let end_line_idx = lines
2185                    .partition_point(|line| line.byte_offset <= end_pos)
2186                    .saturating_sub(1);
2187                let col_end = end_pos - lines[end_line_idx].byte_offset;
2188
2189                code_spans.push(CodeSpan {
2190                    line: line_num,
2191                    start_col: col_start,
2192                    end_col: col_end,
2193                    byte_offset: start_pos,
2194                    byte_end: end_pos,
2195                    backtick_count,
2196                    content: span_content,
2197                });
2198            }
2199        }
2200
2201        // Sort by position to ensure consistent ordering
2202        code_spans.sort_by_key(|span| span.byte_offset);
2203
2204        code_spans
2205    }
2206
2207    /// Parse all list blocks in the content (legacy line-by-line approach)
2208    ///
2209    /// Uses a forward-scanning O(n) algorithm that tracks two variables during iteration:
2210    /// - `has_list_breaking_content_since_last_item`: Set when encountering content that
2211    ///   terminates a list (headings, horizontal rules, tables, insufficiently indented content)
2212    /// - `min_continuation_for_tracking`: Minimum indentation required for content to be
2213    ///   treated as list continuation (based on the list marker width)
2214    ///
2215    /// When a new list item is encountered, we check if list-breaking content was seen
2216    /// since the last item. If so, we start a new list block.
2217    fn parse_list_blocks(content: &str, lines: &[LineInfo]) -> Vec<ListBlock> {
2218        // Minimum indentation for unordered list continuation per CommonMark spec
2219        const UNORDERED_LIST_MIN_CONTINUATION_INDENT: usize = 2;
2220
2221        /// Initialize or reset the forward-scanning tracking state.
2222        /// This helper eliminates code duplication across three initialization sites.
2223        #[inline]
2224        fn reset_tracking_state(
2225            list_item: &ListItemInfo,
2226            has_list_breaking_content: &mut bool,
2227            min_continuation: &mut usize,
2228        ) {
2229            *has_list_breaking_content = false;
2230            let marker_width = if list_item.is_ordered {
2231                list_item.marker.len() + 1 // Ordered markers need space after period/paren
2232            } else {
2233                list_item.marker.len()
2234            };
2235            *min_continuation = if list_item.is_ordered {
2236                marker_width
2237            } else {
2238                UNORDERED_LIST_MIN_CONTINUATION_INDENT
2239            };
2240        }
2241
2242        // Pre-size based on lines that could be list items
2243        let mut list_blocks = Vec::with_capacity(lines.len() / 10); // Estimate ~10% of lines might start list blocks
2244        let mut current_block: Option<ListBlock> = None;
2245        let mut last_list_item_line = 0;
2246        let mut current_indent_level = 0;
2247        let mut last_marker_width = 0;
2248
2249        // Track list-breaking content since last item (fixes O(n²) bottleneck from issue #148)
2250        let mut has_list_breaking_content_since_last_item = false;
2251        let mut min_continuation_for_tracking = 0;
2252
2253        for (line_idx, line_info) in lines.iter().enumerate() {
2254            let line_num = line_idx + 1;
2255
2256            // Enhanced code block handling using Design #3's context analysis
2257            if line_info.in_code_block {
2258                if let Some(ref mut block) = current_block {
2259                    // Calculate minimum indentation for list continuation
2260                    let min_continuation_indent =
2261                        CodeBlockUtils::calculate_min_continuation_indent(content, lines, line_idx);
2262
2263                    // Analyze code block context using the three-tier classification
2264                    let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
2265
2266                    match context {
2267                        CodeBlockContext::Indented => {
2268                            // Code block is properly indented - continues the list
2269                            block.end_line = line_num;
2270                            continue;
2271                        }
2272                        CodeBlockContext::Standalone => {
2273                            // Code block separates lists - end current block
2274                            let completed_block = current_block.take().unwrap();
2275                            list_blocks.push(completed_block);
2276                            continue;
2277                        }
2278                        CodeBlockContext::Adjacent => {
2279                            // Edge case - use conservative behavior (continue list)
2280                            block.end_line = line_num;
2281                            continue;
2282                        }
2283                    }
2284                } else {
2285                    // No current list block - skip code block lines
2286                    continue;
2287                }
2288            }
2289
2290            // Extract blockquote prefix if any
2291            let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(line_info.content(content)) {
2292                caps.get(0).unwrap().as_str().to_string()
2293            } else {
2294                String::new()
2295            };
2296
2297            // Track list-breaking content for non-list, non-blank lines (O(n) replacement for nested loop)
2298            if current_block.is_some() && line_info.list_item.is_none() && !line_info.is_blank {
2299                let line_content = line_info.content(content).trim();
2300
2301                // Check for structural separators that break lists
2302                let breaks_list = line_info.heading.is_some()
2303                    || line_content.starts_with("---")
2304                    || line_content.starts_with("***")
2305                    || line_content.starts_with("___")
2306                    || (line_content.contains('|')
2307                        && !line_content.contains("](")
2308                        && !line_content.contains("http")
2309                        && (line_content.matches('|').count() > 1
2310                            || line_content.starts_with('|')
2311                            || line_content.ends_with('|')))
2312                    || line_content.starts_with(">")
2313                    || (line_info.indent < min_continuation_for_tracking);
2314
2315                if breaks_list {
2316                    has_list_breaking_content_since_last_item = true;
2317                }
2318            }
2319
2320            // Check if this line is a list item
2321            if let Some(list_item) = &line_info.list_item {
2322                // Calculate nesting level based on indentation
2323                let item_indent = list_item.marker_column;
2324                let nesting = item_indent / 2; // Assume 2-space indentation for nesting
2325
2326                if let Some(ref mut block) = current_block {
2327                    // Check if this continues the current block
2328                    // For nested lists, we need to check if this is a nested item (higher nesting level)
2329                    // or a continuation at the same or lower level
2330                    let is_nested = nesting > block.nesting_level;
2331                    let same_type =
2332                        (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
2333                    let same_context = block.blockquote_prefix == blockquote_prefix;
2334                    let reasonable_distance = line_num <= last_list_item_line + 2; // Allow one blank line
2335
2336                    // For unordered lists, also check marker consistency
2337                    let marker_compatible =
2338                        block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
2339
2340                    // O(1) check: Use the tracked variable instead of O(n) nested loop
2341                    // This eliminates the quadratic bottleneck from issue #148
2342                    let has_non_list_content = has_list_breaking_content_since_last_item;
2343
2344                    // A list continues if:
2345                    // 1. It's a nested item (indented more than the parent), OR
2346                    // 2. It's the same type at the same level with reasonable distance
2347                    let mut continues_list = if is_nested {
2348                        // Nested items always continue the list if they're in the same context
2349                        same_context && reasonable_distance && !has_non_list_content
2350                    } else {
2351                        // Same-level items need to match type and markers
2352                        same_type && same_context && reasonable_distance && marker_compatible && !has_non_list_content
2353                    };
2354
2355                    // WORKAROUND: If items are truly consecutive (no blank lines), they MUST be in the same list
2356                    // This handles edge cases where content patterns might otherwise split lists incorrectly
2357                    if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
2358                        // Check if the previous line was a list item
2359                        if block.item_lines.contains(&(line_num - 1)) {
2360                            // They're consecutive list items - force them to be in the same list
2361                            continues_list = true;
2362                        }
2363                    }
2364
2365                    if continues_list {
2366                        // Extend current block
2367                        block.end_line = line_num;
2368                        block.item_lines.push(line_num);
2369
2370                        // Update max marker width
2371                        block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
2372                            list_item.marker.len() + 1
2373                        } else {
2374                            list_item.marker.len()
2375                        });
2376
2377                        // Update marker consistency for unordered lists
2378                        if !block.is_ordered
2379                            && block.marker.is_some()
2380                            && block.marker.as_ref() != Some(&list_item.marker)
2381                        {
2382                            // Mixed markers, clear the marker field
2383                            block.marker = None;
2384                        }
2385
2386                        // Reset tracked state for issue #148 optimization
2387                        reset_tracking_state(
2388                            list_item,
2389                            &mut has_list_breaking_content_since_last_item,
2390                            &mut min_continuation_for_tracking,
2391                        );
2392                    } else {
2393                        // End current block and start a new one
2394
2395                        list_blocks.push(block.clone());
2396
2397                        *block = ListBlock {
2398                            start_line: line_num,
2399                            end_line: line_num,
2400                            is_ordered: list_item.is_ordered,
2401                            marker: if list_item.is_ordered {
2402                                None
2403                            } else {
2404                                Some(list_item.marker.clone())
2405                            },
2406                            blockquote_prefix: blockquote_prefix.clone(),
2407                            item_lines: vec![line_num],
2408                            nesting_level: nesting,
2409                            max_marker_width: if list_item.is_ordered {
2410                                list_item.marker.len() + 1
2411                            } else {
2412                                list_item.marker.len()
2413                            },
2414                        };
2415
2416                        // Initialize tracked state for new block (issue #148 optimization)
2417                        reset_tracking_state(
2418                            list_item,
2419                            &mut has_list_breaking_content_since_last_item,
2420                            &mut min_continuation_for_tracking,
2421                        );
2422                    }
2423                } else {
2424                    // Start a new block
2425                    current_block = Some(ListBlock {
2426                        start_line: line_num,
2427                        end_line: line_num,
2428                        is_ordered: list_item.is_ordered,
2429                        marker: if list_item.is_ordered {
2430                            None
2431                        } else {
2432                            Some(list_item.marker.clone())
2433                        },
2434                        blockquote_prefix,
2435                        item_lines: vec![line_num],
2436                        nesting_level: nesting,
2437                        max_marker_width: list_item.marker.len(),
2438                    });
2439
2440                    // Initialize tracked state for new block (issue #148 optimization)
2441                    reset_tracking_state(
2442                        list_item,
2443                        &mut has_list_breaking_content_since_last_item,
2444                        &mut min_continuation_for_tracking,
2445                    );
2446                }
2447
2448                last_list_item_line = line_num;
2449                current_indent_level = item_indent;
2450                last_marker_width = if list_item.is_ordered {
2451                    list_item.marker.len() + 1 // Add 1 for the space after ordered list markers
2452                } else {
2453                    list_item.marker.len()
2454                };
2455            } else if let Some(ref mut block) = current_block {
2456                // Not a list item - check if it continues the current block
2457
2458                // For MD032 compatibility, we use a simple approach:
2459                // - Indented lines continue the list
2460                // - Blank lines followed by indented content continue the list
2461                // - Everything else ends the list
2462
2463                // Check if the last line in the list block ended with a backslash (hard line break)
2464                // This handles cases where list items use backslash for hard line breaks
2465                let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
2466                    lines[block.end_line - 1].content(content).trim_end().ends_with('\\')
2467                } else {
2468                    false
2469                };
2470
2471                // Calculate minimum indentation for list continuation
2472                // For ordered lists, use the last marker width (e.g., 3 for "1. ", 4 for "10. ")
2473                // For unordered lists like "- ", content starts at column 2, so continuations need at least 2 spaces
2474                let min_continuation_indent = if block.is_ordered {
2475                    current_indent_level + last_marker_width
2476                } else {
2477                    current_indent_level + 2 // Unordered lists need at least 2 spaces (e.g., "- " = 2 chars)
2478                };
2479
2480                if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
2481                    // Indented line or backslash continuation continues the list
2482                    block.end_line = line_num;
2483                } else if line_info.is_blank {
2484                    // Blank line - check if it's internal to the list or ending it
2485                    // We only include blank lines that are followed by more list content
2486                    let mut check_idx = line_idx + 1;
2487                    let mut found_continuation = false;
2488
2489                    // Skip additional blank lines
2490                    while check_idx < lines.len() && lines[check_idx].is_blank {
2491                        check_idx += 1;
2492                    }
2493
2494                    if check_idx < lines.len() {
2495                        let next_line = &lines[check_idx];
2496                        // Check if followed by indented content (list continuation)
2497                        if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
2498                            found_continuation = true;
2499                        }
2500                        // Check if followed by another list item at the same level
2501                        else if !next_line.in_code_block
2502                            && next_line.list_item.is_some()
2503                            && let Some(item) = &next_line.list_item
2504                        {
2505                            let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
2506                                .find(next_line.content(content))
2507                                .map_or(String::new(), |m| m.as_str().to_string());
2508                            if item.marker_column == current_indent_level
2509                                && item.is_ordered == block.is_ordered
2510                                && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
2511                            {
2512                                // Check if there was meaningful content between the list items (unused now)
2513                                // This variable is kept for potential future use but is currently replaced by has_structural_separators
2514                                let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
2515                                    if let Some(between_line) = lines.get(idx) {
2516                                        let between_content = between_line.content(content);
2517                                        let trimmed = between_content.trim();
2518                                        // Skip empty lines
2519                                        if trimmed.is_empty() {
2520                                            return false;
2521                                        }
2522                                        // Check for meaningful content
2523                                        let line_indent = between_content.len() - between_content.trim_start().len();
2524
2525                                        // Structural separators (code fences, headings, etc.) are meaningful and should BREAK lists
2526                                        if trimmed.starts_with("```")
2527                                            || trimmed.starts_with("~~~")
2528                                            || trimmed.starts_with("---")
2529                                            || trimmed.starts_with("***")
2530                                            || trimmed.starts_with("___")
2531                                            || trimmed.starts_with(">")
2532                                            || trimmed.contains('|') // Tables
2533                                            || between_line.heading.is_some()
2534                                        {
2535                                            return true; // These are structural separators - meaningful content that breaks lists
2536                                        }
2537
2538                                        // Only properly indented content continues the list
2539                                        line_indent >= min_continuation_indent
2540                                    } else {
2541                                        false
2542                                    }
2543                                });
2544
2545                                if block.is_ordered {
2546                                    // For ordered lists: don't continue if there are structural separators
2547                                    // Check if there are structural separators between the list items
2548                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2549                                        if let Some(between_line) = lines.get(idx) {
2550                                            let trimmed = between_line.content(content).trim();
2551                                            if trimmed.is_empty() {
2552                                                return false;
2553                                            }
2554                                            // Check for structural separators that break lists
2555                                            trimmed.starts_with("```")
2556                                                || trimmed.starts_with("~~~")
2557                                                || trimmed.starts_with("---")
2558                                                || trimmed.starts_with("***")
2559                                                || trimmed.starts_with("___")
2560                                                || trimmed.starts_with(">")
2561                                                || trimmed.contains('|') // Tables
2562                                                || between_line.heading.is_some()
2563                                        } else {
2564                                            false
2565                                        }
2566                                    });
2567                                    found_continuation = !has_structural_separators;
2568                                } else {
2569                                    // For unordered lists: also check for structural separators
2570                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2571                                        if let Some(between_line) = lines.get(idx) {
2572                                            let trimmed = between_line.content(content).trim();
2573                                            if trimmed.is_empty() {
2574                                                return false;
2575                                            }
2576                                            // Check for structural separators that break lists
2577                                            trimmed.starts_with("```")
2578                                                || trimmed.starts_with("~~~")
2579                                                || trimmed.starts_with("---")
2580                                                || trimmed.starts_with("***")
2581                                                || trimmed.starts_with("___")
2582                                                || trimmed.starts_with(">")
2583                                                || trimmed.contains('|') // Tables
2584                                                || between_line.heading.is_some()
2585                                        } else {
2586                                            false
2587                                        }
2588                                    });
2589                                    found_continuation = !has_structural_separators;
2590                                }
2591                            }
2592                        }
2593                    }
2594
2595                    if found_continuation {
2596                        // Include the blank line in the block
2597                        block.end_line = line_num;
2598                    } else {
2599                        // Blank line ends the list - don't include it
2600                        list_blocks.push(block.clone());
2601                        current_block = None;
2602                    }
2603                } else {
2604                    // Check for lazy continuation - non-indented line immediately after a list item
2605                    // But only if the line has sufficient indentation for the list type
2606                    let min_required_indent = if block.is_ordered {
2607                        current_indent_level + last_marker_width
2608                    } else {
2609                        current_indent_level + 2
2610                    };
2611
2612                    // For lazy continuation to apply, the line must either:
2613                    // 1. Have no indentation (true lazy continuation)
2614                    // 2. Have sufficient indentation for the list type
2615                    // BUT structural separators (headings, code blocks, etc.) should never be lazy continuations
2616                    let line_content = line_info.content(content).trim();
2617                    let is_structural_separator = line_info.heading.is_some()
2618                        || line_content.starts_with("```")
2619                        || line_content.starts_with("~~~")
2620                        || line_content.starts_with("---")
2621                        || line_content.starts_with("***")
2622                        || line_content.starts_with("___")
2623                        || line_content.starts_with(">")
2624                        || (line_content.contains('|')
2625                            && !line_content.contains("](")
2626                            && !line_content.contains("http")
2627                            && (line_content.matches('|').count() > 1
2628                                || line_content.starts_with('|')
2629                                || line_content.ends_with('|'))); // Tables
2630
2631                    // Allow lazy continuation if we're still within the same list block
2632                    // (not just immediately after a list item)
2633                    let is_lazy_continuation = !is_structural_separator
2634                        && !line_info.is_blank
2635                        && (line_info.indent == 0 || line_info.indent >= min_required_indent);
2636
2637                    if is_lazy_continuation {
2638                        // Additional check: if the line starts with uppercase and looks like a new sentence,
2639                        // it's probably not a continuation
2640                        let content_to_check = if !blockquote_prefix.is_empty() {
2641                            // Strip blockquote prefix to check the actual content
2642                            line_info
2643                                .content(content)
2644                                .strip_prefix(&blockquote_prefix)
2645                                .unwrap_or(line_info.content(content))
2646                                .trim()
2647                        } else {
2648                            line_info.content(content).trim()
2649                        };
2650
2651                        let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
2652
2653                        // If it starts with uppercase and the previous line ended with punctuation,
2654                        // it's likely a new paragraph, not a continuation
2655                        if starts_with_uppercase && last_list_item_line > 0 {
2656                            // This looks like a new paragraph
2657                            list_blocks.push(block.clone());
2658                            current_block = None;
2659                        } else {
2660                            // This is a lazy continuation line
2661                            block.end_line = line_num;
2662                        }
2663                    } else {
2664                        // Non-indented, non-blank line that's not a lazy continuation - end the block
2665                        list_blocks.push(block.clone());
2666                        current_block = None;
2667                    }
2668                }
2669            }
2670        }
2671
2672        // Don't forget the last block
2673        if let Some(block) = current_block {
2674            list_blocks.push(block);
2675        }
2676
2677        // Merge adjacent blocks that should be one
2678        merge_adjacent_list_blocks(content, &mut list_blocks, lines);
2679
2680        list_blocks
2681    }
2682
2683    /// Compute character frequency for fast content analysis
2684    fn compute_char_frequency(content: &str) -> CharFrequency {
2685        let mut frequency = CharFrequency::default();
2686
2687        for ch in content.chars() {
2688            match ch {
2689                '#' => frequency.hash_count += 1,
2690                '*' => frequency.asterisk_count += 1,
2691                '_' => frequency.underscore_count += 1,
2692                '-' => frequency.hyphen_count += 1,
2693                '+' => frequency.plus_count += 1,
2694                '>' => frequency.gt_count += 1,
2695                '|' => frequency.pipe_count += 1,
2696                '[' => frequency.bracket_count += 1,
2697                '`' => frequency.backtick_count += 1,
2698                '<' => frequency.lt_count += 1,
2699                '!' => frequency.exclamation_count += 1,
2700                '\n' => frequency.newline_count += 1,
2701                _ => {}
2702            }
2703        }
2704
2705        frequency
2706    }
2707
2708    /// Parse HTML tags in the content
2709    fn parse_html_tags(
2710        content: &str,
2711        lines: &[LineInfo],
2712        code_blocks: &[(usize, usize)],
2713        flavor: MarkdownFlavor,
2714    ) -> Vec<HtmlTag> {
2715        static HTML_TAG_REGEX: LazyLock<regex::Regex> =
2716            LazyLock::new(|| regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap());
2717
2718        let mut html_tags = Vec::with_capacity(content.matches('<').count());
2719
2720        for cap in HTML_TAG_REGEX.captures_iter(content) {
2721            let full_match = cap.get(0).unwrap();
2722            let match_start = full_match.start();
2723            let match_end = full_match.end();
2724
2725            // Skip if in code block
2726            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2727                continue;
2728            }
2729
2730            let is_closing = !cap.get(1).unwrap().as_str().is_empty();
2731            let tag_name_original = cap.get(2).unwrap().as_str();
2732            let tag_name = tag_name_original.to_lowercase();
2733            let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
2734
2735            // Skip JSX components in MDX files (tags starting with uppercase letter)
2736            // JSX components like <Chart />, <MyComponent> should not be treated as HTML
2737            if flavor.supports_jsx() && tag_name_original.chars().next().is_some_and(|c| c.is_uppercase()) {
2738                continue;
2739            }
2740
2741            // Find which line this tag is on
2742            let mut line_num = 1;
2743            let mut col_start = match_start;
2744            let mut col_end = match_end;
2745            for (idx, line_info) in lines.iter().enumerate() {
2746                if match_start >= line_info.byte_offset {
2747                    line_num = idx + 1;
2748                    col_start = match_start - line_info.byte_offset;
2749                    col_end = match_end - line_info.byte_offset;
2750                } else {
2751                    break;
2752                }
2753            }
2754
2755            html_tags.push(HtmlTag {
2756                line: line_num,
2757                start_col: col_start,
2758                end_col: col_end,
2759                byte_offset: match_start,
2760                byte_end: match_end,
2761                tag_name,
2762                is_closing,
2763                is_self_closing,
2764                raw_content: full_match.as_str().to_string(),
2765            });
2766        }
2767
2768        html_tags
2769    }
2770
2771    /// Parse emphasis spans in the content
2772    fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
2773        static EMPHASIS_REGEX: LazyLock<regex::Regex> =
2774            LazyLock::new(|| regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap());
2775
2776        let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
2777
2778        for cap in EMPHASIS_REGEX.captures_iter(content) {
2779            let full_match = cap.get(0).unwrap();
2780            let match_start = full_match.start();
2781            let match_end = full_match.end();
2782
2783            // Skip if in code block
2784            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2785                continue;
2786            }
2787
2788            let opening_markers = cap.get(1).unwrap().as_str();
2789            let content_part = cap.get(2).unwrap().as_str();
2790            let closing_markers = cap.get(3).unwrap().as_str();
2791
2792            // Validate matching markers
2793            if opening_markers.chars().next() != closing_markers.chars().next()
2794                || opening_markers.len() != closing_markers.len()
2795            {
2796                continue;
2797            }
2798
2799            let marker = opening_markers.chars().next().unwrap();
2800            let marker_count = opening_markers.len();
2801
2802            // Find which line this emphasis is on
2803            let mut line_num = 1;
2804            let mut col_start = match_start;
2805            let mut col_end = match_end;
2806            for (idx, line_info) in lines.iter().enumerate() {
2807                if match_start >= line_info.byte_offset {
2808                    line_num = idx + 1;
2809                    col_start = match_start - line_info.byte_offset;
2810                    col_end = match_end - line_info.byte_offset;
2811                } else {
2812                    break;
2813                }
2814            }
2815
2816            emphasis_spans.push(EmphasisSpan {
2817                line: line_num,
2818                start_col: col_start,
2819                end_col: col_end,
2820                byte_offset: match_start,
2821                byte_end: match_end,
2822                marker,
2823                marker_count,
2824                content: content_part.to_string(),
2825            });
2826        }
2827
2828        emphasis_spans
2829    }
2830
2831    /// Parse table rows in the content
2832    fn parse_table_rows(content: &str, lines: &[LineInfo]) -> Vec<TableRow> {
2833        let mut table_rows = Vec::with_capacity(lines.len() / 20);
2834
2835        for (line_idx, line_info) in lines.iter().enumerate() {
2836            // Skip lines in code blocks or blank lines
2837            if line_info.in_code_block || line_info.is_blank {
2838                continue;
2839            }
2840
2841            let line = line_info.content(content);
2842            let line_num = line_idx + 1;
2843
2844            // Check if this line contains pipes (potential table row)
2845            if !line.contains('|') {
2846                continue;
2847            }
2848
2849            // Count columns by splitting on pipes
2850            let parts: Vec<&str> = line.split('|').collect();
2851            let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
2852
2853            // Check if this is a separator row
2854            let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
2855            let mut column_alignments = Vec::new();
2856
2857            if is_separator {
2858                for part in &parts[1..parts.len() - 1] {
2859                    // Skip first and last empty parts
2860                    let trimmed = part.trim();
2861                    let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
2862                        "center".to_string()
2863                    } else if trimmed.ends_with(':') {
2864                        "right".to_string()
2865                    } else if trimmed.starts_with(':') {
2866                        "left".to_string()
2867                    } else {
2868                        "none".to_string()
2869                    };
2870                    column_alignments.push(alignment);
2871                }
2872            }
2873
2874            table_rows.push(TableRow {
2875                line: line_num,
2876                is_separator,
2877                column_count,
2878                column_alignments,
2879            });
2880        }
2881
2882        table_rows
2883    }
2884
2885    /// Parse bare URLs and emails in the content
2886    fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
2887        let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
2888
2889        // Check for bare URLs (not in angle brackets or markdown links)
2890        for cap in BARE_URL_PATTERN.captures_iter(content) {
2891            let full_match = cap.get(0).unwrap();
2892            let match_start = full_match.start();
2893            let match_end = full_match.end();
2894
2895            // Skip if in code block
2896            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2897                continue;
2898            }
2899
2900            // Skip if already in angle brackets or markdown links
2901            let preceding_char = if match_start > 0 {
2902                content.chars().nth(match_start - 1)
2903            } else {
2904                None
2905            };
2906            let following_char = content.chars().nth(match_end);
2907
2908            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2909                continue;
2910            }
2911            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2912                continue;
2913            }
2914
2915            let url = full_match.as_str();
2916            let url_type = if url.starts_with("https://") {
2917                "https"
2918            } else if url.starts_with("http://") {
2919                "http"
2920            } else if url.starts_with("ftp://") {
2921                "ftp"
2922            } else {
2923                "other"
2924            };
2925
2926            // Find which line this URL is on
2927            let mut line_num = 1;
2928            let mut col_start = match_start;
2929            let mut col_end = match_end;
2930            for (idx, line_info) in lines.iter().enumerate() {
2931                if match_start >= line_info.byte_offset {
2932                    line_num = idx + 1;
2933                    col_start = match_start - line_info.byte_offset;
2934                    col_end = match_end - line_info.byte_offset;
2935                } else {
2936                    break;
2937                }
2938            }
2939
2940            bare_urls.push(BareUrl {
2941                line: line_num,
2942                start_col: col_start,
2943                end_col: col_end,
2944                byte_offset: match_start,
2945                byte_end: match_end,
2946                url: url.to_string(),
2947                url_type: url_type.to_string(),
2948            });
2949        }
2950
2951        // Check for bare email addresses
2952        for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
2953            let full_match = cap.get(0).unwrap();
2954            let match_start = full_match.start();
2955            let match_end = full_match.end();
2956
2957            // Skip if in code block
2958            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2959                continue;
2960            }
2961
2962            // Skip if already in angle brackets or markdown links
2963            let preceding_char = if match_start > 0 {
2964                content.chars().nth(match_start - 1)
2965            } else {
2966                None
2967            };
2968            let following_char = content.chars().nth(match_end);
2969
2970            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2971                continue;
2972            }
2973            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2974                continue;
2975            }
2976
2977            let email = full_match.as_str();
2978
2979            // Find which line this email is on
2980            let mut line_num = 1;
2981            let mut col_start = match_start;
2982            let mut col_end = match_end;
2983            for (idx, line_info) in lines.iter().enumerate() {
2984                if match_start >= line_info.byte_offset {
2985                    line_num = idx + 1;
2986                    col_start = match_start - line_info.byte_offset;
2987                    col_end = match_end - line_info.byte_offset;
2988                } else {
2989                    break;
2990                }
2991            }
2992
2993            bare_urls.push(BareUrl {
2994                line: line_num,
2995                start_col: col_start,
2996                end_col: col_end,
2997                byte_offset: match_start,
2998                byte_end: match_end,
2999                url: email.to_string(),
3000                url_type: "email".to_string(),
3001            });
3002        }
3003
3004        bare_urls
3005    }
3006}
3007
3008/// Merge adjacent list blocks that should be treated as one
3009fn merge_adjacent_list_blocks(content: &str, list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
3010    if list_blocks.len() < 2 {
3011        return;
3012    }
3013
3014    let mut merger = ListBlockMerger::new(content, lines);
3015    *list_blocks = merger.merge(list_blocks);
3016}
3017
3018/// Helper struct to manage the complex logic of merging list blocks
3019struct ListBlockMerger<'a> {
3020    content: &'a str,
3021    lines: &'a [LineInfo],
3022}
3023
3024impl<'a> ListBlockMerger<'a> {
3025    fn new(content: &'a str, lines: &'a [LineInfo]) -> Self {
3026        Self { content, lines }
3027    }
3028
3029    fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
3030        let mut merged = Vec::with_capacity(list_blocks.len());
3031        let mut current = list_blocks[0].clone();
3032
3033        for next in list_blocks.iter().skip(1) {
3034            if self.should_merge_blocks(&current, next) {
3035                current = self.merge_two_blocks(current, next);
3036            } else {
3037                merged.push(current);
3038                current = next.clone();
3039            }
3040        }
3041
3042        merged.push(current);
3043        merged
3044    }
3045
3046    /// Determine if two adjacent list blocks should be merged
3047    fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
3048        // Basic compatibility checks
3049        if !self.blocks_are_compatible(current, next) {
3050            return false;
3051        }
3052
3053        // Check spacing and content between blocks
3054        let spacing = self.analyze_spacing_between(current, next);
3055        match spacing {
3056            BlockSpacing::Consecutive => true,
3057            BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
3058            BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
3059                self.can_merge_with_content_between(current, next)
3060            }
3061        }
3062    }
3063
3064    /// Check if blocks have compatible structure for merging
3065    fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
3066        current.is_ordered == next.is_ordered
3067            && current.blockquote_prefix == next.blockquote_prefix
3068            && current.nesting_level == next.nesting_level
3069    }
3070
3071    /// Analyze the spacing between two list blocks
3072    fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
3073        let gap = next.start_line - current.end_line;
3074
3075        match gap {
3076            1 => BlockSpacing::Consecutive,
3077            2 => BlockSpacing::SingleBlank,
3078            _ if gap > 2 => {
3079                if self.has_only_blank_lines_between(current, next) {
3080                    BlockSpacing::MultipleBlanks
3081                } else {
3082                    BlockSpacing::ContentBetween
3083                }
3084            }
3085            _ => BlockSpacing::Consecutive, // gap == 0, overlapping (shouldn't happen)
3086        }
3087    }
3088
3089    /// Check if unordered lists can be merged with a single blank line between
3090    fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3091        // Check if there are structural separators between the blocks
3092        // If has_meaningful_content_between returns true, it means there are structural separators
3093        if has_meaningful_content_between(self.content, current, next, self.lines) {
3094            return false; // Structural separators prevent merging
3095        }
3096
3097        // Only merge unordered lists with same marker across single blank
3098        !current.is_ordered && current.marker == next.marker
3099    }
3100
3101    /// Check if ordered lists can be merged when there's content between them
3102    fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3103        // Do not merge lists if there are structural separators between them
3104        if has_meaningful_content_between(self.content, current, next, self.lines) {
3105            return false; // Structural separators prevent merging
3106        }
3107
3108        // Only consider merging ordered lists if there's no structural content between
3109        current.is_ordered && next.is_ordered
3110    }
3111
3112    /// Check if there are only blank lines between blocks
3113    fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3114        for line_num in (current.end_line + 1)..next.start_line {
3115            if let Some(line_info) = self.lines.get(line_num - 1)
3116                && !line_info.content(self.content).trim().is_empty()
3117            {
3118                return false;
3119            }
3120        }
3121        true
3122    }
3123
3124    /// Merge two compatible list blocks into one
3125    fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
3126        current.end_line = next.end_line;
3127        current.item_lines.extend_from_slice(&next.item_lines);
3128
3129        // Update max marker width
3130        current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
3131
3132        // Handle marker consistency for unordered lists
3133        if !current.is_ordered && self.markers_differ(&current, next) {
3134            current.marker = None; // Mixed markers
3135        }
3136
3137        current
3138    }
3139
3140    /// Check if two blocks have different markers
3141    fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
3142        current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
3143    }
3144}
3145
/// Types of spacing between list blocks
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum BlockSpacing {
    /// No gap between blocks
    Consecutive,
    /// One blank line between blocks
    SingleBlank,
    /// Multiple blank lines but no content
    MultipleBlanks,
    /// Content exists between blocks
    ContentBetween,
}
3154
3155/// Check if there's meaningful content (not just blank lines) between two list blocks
3156fn has_meaningful_content_between(content: &str, current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
3157    // Check lines between current.end_line and next.start_line
3158    for line_num in (current.end_line + 1)..next.start_line {
3159        if let Some(line_info) = lines.get(line_num - 1) {
3160            // Convert to 0-indexed
3161            let trimmed = line_info.content(content).trim();
3162
3163            // Skip empty lines
3164            if trimmed.is_empty() {
3165                continue;
3166            }
3167
3168            // Check for structural separators that should separate lists (CommonMark compliant)
3169
3170            // Headings separate lists
3171            if line_info.heading.is_some() {
3172                return true; // Has meaningful content - headings separate lists
3173            }
3174
3175            // Horizontal rules separate lists (---, ***, ___)
3176            if is_horizontal_rule(trimmed) {
3177                return true; // Has meaningful content - horizontal rules separate lists
3178            }
3179
3180            // Tables separate lists (lines containing | but not in URLs or code)
3181            // Simple heuristic: tables typically have | at start/end or multiple |
3182            if trimmed.contains('|') && trimmed.len() > 1 {
3183                // Don't treat URLs with | as tables
3184                if !trimmed.contains("](") && !trimmed.contains("http") {
3185                    // More robust check: tables usually have multiple | or | at edges
3186                    let pipe_count = trimmed.matches('|').count();
3187                    if pipe_count > 1 || trimmed.starts_with('|') || trimmed.ends_with('|') {
3188                        return true; // Has meaningful content - tables separate lists
3189                    }
3190                }
3191            }
3192
3193            // Blockquotes separate lists
3194            if trimmed.starts_with('>') {
3195                return true; // Has meaningful content - blockquotes separate lists
3196            }
3197
3198            // Code block fences separate lists (unless properly indented as list content)
3199            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
3200                let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
3201
3202                // Check if this code block is properly indented as list continuation
3203                let min_continuation_indent = if current.is_ordered {
3204                    current.nesting_level + current.max_marker_width + 1 // +1 for space after marker
3205                } else {
3206                    current.nesting_level + 2
3207                };
3208
3209                if line_indent < min_continuation_indent {
3210                    // This is a standalone code block that separates lists
3211                    return true; // Has meaningful content - standalone code blocks separate lists
3212                }
3213            }
3214
3215            // Check if this line has proper indentation for list continuation
3216            let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
3217
3218            // Calculate minimum indentation needed to be list continuation
3219            let min_indent = if current.is_ordered {
3220                current.nesting_level + current.max_marker_width
3221            } else {
3222                current.nesting_level + 2
3223            };
3224
3225            // If the line is not indented enough to be list continuation, it's meaningful content
3226            if line_indent < min_indent {
3227                return true; // Has meaningful content - content not indented as list continuation
3228            }
3229
3230            // If we reach here, the line is properly indented as list continuation
3231            // Continue checking other lines
3232        }
3233    }
3234
3235    // Only blank lines or properly indented list continuation content between blocks
3236    false
3237}
3238
/// Check if a line is a horizontal rule (---, ***, ___).
///
/// `trimmed` is expected to have no surrounding whitespace. The line qualifies
/// when its first character is `-`, `*` or `_`, that same character occurs at
/// least three times, and every other character is a space or a tab.
fn is_horizontal_rule(trimmed: &str) -> bool {
    let marker = match trimmed.chars().next() {
        Some(ch @ ('-' | '*' | '_')) => ch,
        _ => return false,
    };

    // Walk the characters directly; no intermediate buffer is needed.
    let mut count = 0usize;
    for ch in trimmed.chars() {
        if ch == marker {
            count += 1;
        } else if ch != ' ' && ch != '\t' {
            return false; // Non-matching, non-whitespace character
        }
    }

    count >= 3
}
3262
/// Unit tests for `LintContext`: line-offset bookkeeping, per-line metadata,
/// list item detection and MDX ESM block detection.
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty document has a single offset entry and no lines.
    #[test]
    fn test_empty_content() {
        let empty = LintContext::new("", MarkdownFlavor::Standard);
        assert_eq!(empty.content, "");
        assert_eq!(empty.line_offsets, vec![0]);
        assert_eq!(empty.offset_to_line_col(0), (1, 1));
        assert_eq!(empty.lines.len(), 0);
    }

    /// A document without newlines maps every offset onto line 1.
    #[test]
    fn test_single_line() {
        let single = LintContext::new("# Hello", MarkdownFlavor::Standard);
        assert_eq!(single.content, "# Hello");
        assert_eq!(single.line_offsets, vec![0]);
        for (offset, expected) in [(0, (1, 1)), (3, (1, 4))] {
            assert_eq!(single.offset_to_line_col(offset), expected);
        }
    }

    /// Offsets resolve to 1-based (line, column) pairs across several lines.
    #[test]
    fn test_multi_line() {
        let text = "# Title\n\nSecond line\nThird line";
        let doc = LintContext::new(text, MarkdownFlavor::Standard);
        assert_eq!(doc.line_offsets, vec![0, 8, 9, 21]);

        let cases = [
            (0, (1, 1)),  // start
            (8, (2, 1)),  // start of blank line
            (9, (3, 1)),  // start of 'Second line'
            (15, (3, 7)), // middle of 'Second line'
            (21, (4, 1)), // start of 'Third line'
        ];
        for (offset, expected) in cases {
            assert_eq!(doc.offset_to_line_col(offset), expected);
        }
    }

    /// Per-line metadata (content, offset, indent, blank flag) is populated.
    #[test]
    fn test_line_info() {
        let text = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let doc = LintContext::new(text, MarkdownFlavor::Standard);

        assert_eq!(doc.lines.len(), 7);

        // Line 1: "# Title"
        let heading = &doc.lines[0];
        assert_eq!(heading.content(doc.content), "# Title");
        assert_eq!(heading.byte_offset, 0);
        assert_eq!(heading.indent, 0);
        assert!(!heading.is_blank);
        assert!(!heading.in_code_block);
        assert!(heading.list_item.is_none());

        // Line 2: "    indented"
        let indented = &doc.lines[1];
        assert_eq!(indented.content(doc.content), "    indented");
        assert_eq!(indented.byte_offset, 8);
        assert_eq!(indented.indent, 4);
        assert!(!indented.is_blank);

        // Line 3: "" (blank)
        let blank = &doc.lines[2];
        assert_eq!(blank.content(doc.content), "");
        assert!(blank.is_blank);

        // Helper accessors take 1-based line numbers
        assert_eq!(doc.line_to_byte_offset(1), Some(0));
        assert_eq!(doc.line_to_byte_offset(2), Some(8));
        assert_eq!(doc.line_info(1).map(|l| l.indent), Some(0));
        assert_eq!(doc.line_info(2).map(|l| l.indent), Some(4));
    }

    /// Unordered and ordered markers are detected; plain text is not a list item.
    #[test]
    fn test_list_item_detection() {
        let text = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
        let doc = LintContext::new(text, MarkdownFlavor::Standard);

        // Line 1: "- Unordered item"
        let dash_line = &doc.lines[0];
        assert!(dash_line.list_item.is_some());
        let dash_item = dash_line.list_item.as_ref().unwrap();
        assert_eq!(dash_item.marker, "-");
        assert!(!dash_item.is_ordered);
        assert_eq!(dash_item.marker_column, 0);
        assert_eq!(dash_item.content_column, 2);

        // Line 2: "  * Nested item"
        let star_line = &doc.lines[1];
        assert!(star_line.list_item.is_some());
        let star_item = star_line.list_item.as_ref().unwrap();
        assert_eq!(star_item.marker, "*");
        assert_eq!(star_item.marker_column, 2);

        // Line 3: "1. Ordered item"
        let ordered_line = &doc.lines[2];
        assert!(ordered_line.list_item.is_some());
        let ordered_item = ordered_line.list_item.as_ref().unwrap();
        assert_eq!(ordered_item.marker, "1.");
        assert!(ordered_item.is_ordered);
        assert_eq!(ordered_item.number, Some(1));

        // Line 6: "Not a list"
        let plain_line = &doc.lines[5];
        assert!(plain_line.list_item.is_none());
    }

    /// Offsets on and just past each single-character line resolve correctly.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let text = "a\nb\nc";
        let doc = LintContext::new(text, MarkdownFlavor::Standard);
        // line_offsets: [0, 2, 4]
        let cases = [
            (0, (1, 1)), // 'a'
            (1, (1, 2)), // after 'a'
            (2, (2, 1)), // 'b'
            (3, (2, 2)), // after 'b'
            (4, (3, 1)), // 'c'
            (5, (3, 2)), // after 'c'
        ];
        for (offset, expected) in cases {
            assert_eq!(doc.offset_to_line_col(offset), expected);
        }
    }

    /// In MDX flavor, leading import/export lines are flagged as ESM blocks.
    #[test]
    fn test_mdx_esm_blocks() {
        let mdx_source = r##"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall

In {year}, the snowfall was above average.
It was followed by a warm spring which caused
flood conditions in many of the nearby rivers.

<Chart color="#fcb32c" year={year} />
"##;

        let doc = LintContext::new(mdx_source, MarkdownFlavor::MDX);

        // Only the first two lines (import / export) belong to the ESM block
        assert_eq!(doc.lines.len(), 10);
        assert!(doc.lines[0].in_esm_block, "Line 1 (import) should be in_esm_block");
        assert!(doc.lines[1].in_esm_block, "Line 2 (export) should be in_esm_block");
        assert!(!doc.lines[2].in_esm_block, "Line 3 (blank) should NOT be in_esm_block");
        assert!(
            !doc.lines[3].in_esm_block,
            "Line 4 (heading) should NOT be in_esm_block"
        );
        assert!(!doc.lines[4].in_esm_block, "Line 5 (blank) should NOT be in_esm_block");
        assert!(!doc.lines[5].in_esm_block, "Line 6 (text) should NOT be in_esm_block");
    }

    /// Outside MDX flavor, import/export lines are ordinary text.
    #[test]
    fn test_mdx_esm_blocks_not_detected_in_standard_flavor() {
        let source = r#"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall
"#;

        let doc = LintContext::new(source, MarkdownFlavor::Standard);

        // ESM blocks should NOT be detected in Standard flavor
        assert!(
            !doc.lines[0].in_esm_block,
            "Line 1 should NOT be in_esm_block in Standard flavor"
        );
        assert!(
            !doc.lines[1].in_esm_block,
            "Line 2 should NOT be in_esm_block in Standard flavor"
        );
    }
}
rumdl_lib/lint_context.rs

rumdl_lib/
lint_context.rs