rumdl_lib/
lint_context.rs

1use crate::config::MarkdownFlavor;
2use crate::rules::front_matter_utils::FrontMatterUtils;
3use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
4use pulldown_cmark::{BrokenLink, Event, LinkType, Parser, Tag, TagEnd};
5use regex::Regex;
6use std::sync::LazyLock;
7
// Comprehensive link pattern that captures both inline links (`[text](url "title")`)
// and full reference links (`[text][ref]`).
//
// Flags: `(?s)` makes `.` match newlines (the only `.` is in the `\\.` escape alternative,
// so an escaped line break is accepted inside the link text); `(?x)` is verbose mode, which
// is why the pattern below may contain layout whitespace and `#` comments.
//
// Capture groups: 1 = link text, 2 = URL in angle brackets, 3 = bare URL,
// 4 = double-quoted title, 5 = single-quoted title, 6 = reference ID.
//
// NOTE(review): the pattern itself does not reject a preceding `!`, so it also matches the
// `[alt](url)` part of an image — presumably callers filter those out; confirm.
static LINK_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
        \[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]          # Link text in group 1 (handles nested brackets)
        (?:
            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
            |
            \[([^\]]*)\]      # Reference ID in group 6
        )"#
    ).unwrap()
});
21
// Image pattern: same shape as `LINK_PATTERN` but requires a leading `!`.
// Matches inline images (`![alt](url "title")`) and reference images (`![alt][ref]`).
//
// Flags: `(?s)` makes `.` match newlines; `(?x)` is verbose mode (layout whitespace and
// `#` comments inside the pattern are ignored by the regex engine).
//
// Capture groups: 1 = alt text, 2 = URL in angle brackets, 3 = bare URL,
// 4 = double-quoted title, 5 = single-quoted title, 6 = reference ID.
static IMAGE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?sx)
        !\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]         # Alt text in group 1 (handles nested brackets)
        (?:
            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
            |
            \[([^\]]*)\]      # Reference ID in group 6
        )"#
    ).unwrap()
});
35
// Reference definition pattern, e.g. `[label]: https://example.com "Title"`.
//
// `(?m)` puts `^`/`$` in multi-line mode so each line of the document is tried on its own.
// At most three leading spaces are allowed before the label.
// Capture groups: 1 = label, 2 = URL (a run of non-whitespace),
// 3 = double-quoted title, 4 = single-quoted title.
static REF_DEF_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#).unwrap());
39
// Pattern for bare URLs.
//
// Accepts the schemes `http`, `https` and `ftp` (group 1), followed by `://` and a host/path
// made of characters other than whitespace, `<`, `>`, `[`, `]`, `(`, `)`, backslash, quotes
// and backticks. Optional port, path, query and fragment parts follow the same exclusions.
// The pattern is not anchored, so it finds URLs anywhere in the searched text.
static BARE_URL_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
    ).unwrap()
});
46
// Pattern for email addresses.
//
// Deliberately simple: an ASCII local part, `@`, an ASCII domain, and a final label of at
// least two ASCII letters. It is unanchored and has no capture groups.
static BARE_EMAIL_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap());
50
// Pattern for blockquote prefix in parse_list_blocks.
// Group 1 captures the whole prefix: optional leading whitespace, one or more consecutive
// `>` markers, and any whitespace that follows them.
static BLOCKQUOTE_PREFIX_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*>+\s*)").unwrap());
53
/// Pre-computed information about a line
///
/// `LintContext` stores one of these per source line in its `lines` vector, so rules can
/// query structural facts about a line without re-scanning the document.
#[derive(Debug, Clone)]
pub struct LineInfo {
    /// The actual line content (without newline)
    pub content: String,
    /// Byte offset where this line starts in the document
    pub byte_offset: usize,
    /// Number of leading spaces/tabs
    // NOTE(review): whether a tab counts as one unit or is expanded is not visible here — confirm.
    pub indent: usize,
    /// Whether the line is blank (empty or only whitespace)
    pub is_blank: bool,
    /// Whether this line is inside a code block
    pub in_code_block: bool,
    /// Whether this line is inside front matter
    pub in_front_matter: bool,
    /// Whether this line is inside an HTML block
    pub in_html_block: bool,
    /// Whether this line is inside an HTML comment
    pub in_html_comment: bool,
    /// List item information if this line starts a list item (`None` otherwise)
    pub list_item: Option<ListItemInfo>,
    /// Heading information if this line is a heading (`None` otherwise)
    pub heading: Option<HeadingInfo>,
    /// Blockquote information if this line is a blockquote (`None` otherwise)
    pub blockquote: Option<BlockquoteInfo>,
    /// Whether this line is inside a mkdocstrings autodoc block
    pub in_mkdocstrings: bool,
    /// Whether this line is part of an ESM import/export block (MDX only)
    pub in_esm_block: bool,
}
84
/// Information about a list item
///
/// Attached to a `LineInfo` through its `list_item` field when the line starts a list item.
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// The marker used (*, -, +, or number with . or ))
    pub marker: String,
    /// Whether it's ordered (true) or unordered (false)
    pub is_ordered: bool,
    /// The number for ordered lists
    // presumably `None` for unordered items — confirm against the list parser.
    pub number: Option<usize>,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where content after marker starts
    // presumably 0-based like `marker_column` — confirm.
    pub content_column: usize,
}
99
/// Heading style type
///
/// Distinguishes the syntactic form a heading was written in; stored in `HeadingInfo::style`.
#[derive(Debug, Clone, PartialEq)]
pub enum HeadingStyle {
    /// ATX style heading (# Heading)
    ATX,
    /// Setext style heading with = underline
    Setext1,
    /// Setext style heading with - underline
    Setext2,
}
110
/// Parsed link information
///
/// One entry per link found in the document; collected in `LintContext::links`.
#[derive(Debug, Clone)]
pub struct ParsedLink {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    // NOTE(review): inclusive vs. exclusive is not visible from this declaration — confirm.
    pub end_col: usize,
    /// Byte offset in document where the link starts
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Link text
    pub text: String,
    /// Link URL or reference
    pub url: String,
    /// Whether this is a reference link [text][ref] vs inline [text](url)
    pub is_reference: bool,
    /// Reference ID for reference links
    // presumably `None` for inline links — confirm against the link parser.
    pub reference_id: Option<String>,
}
133
/// Information about a broken link reported by pulldown-cmark
///
/// Collected in `LintContext::broken_links`.
#[derive(Debug, Clone)]
pub struct BrokenLinkInfo {
    /// The reference text that couldn't be resolved
    pub reference: String,
    /// Byte span in the source document (half-open, as with any `Range`)
    pub span: std::ops::Range<usize>,
}
142
/// Parsed image information
///
/// One entry per image found in the document; collected in `LintContext::images`.
/// The layout mirrors `ParsedLink`, with `alt_text` in place of the link text.
#[derive(Debug, Clone)]
pub struct ParsedImage {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    // NOTE(review): inclusive vs. exclusive is not visible from this declaration — confirm.
    pub end_col: usize,
    /// Byte offset in document where the image starts
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Alt text
    pub alt_text: String,
    /// Image URL or reference
    pub url: String,
    /// Whether this is a reference image ![alt][ref] vs inline ![alt](url)
    pub is_reference: bool,
    /// Reference ID for reference images
    // presumably `None` for inline images — confirm against the image parser.
    pub reference_id: Option<String>,
}
165
/// Reference definition [ref]: url "title"
///
/// Collected in `LintContext::reference_defs`. `LintContext::get_reference_url` lowercases
/// the requested ID before comparing it with `id`.
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// Line number (1-indexed)
    pub line: usize,
    /// Reference ID (normalized to lowercase)
    pub id: String,
    /// URL
    pub url: String,
    /// Optional title
    pub title: Option<String>,
    /// Byte offset where the reference definition starts
    pub byte_offset: usize,
    /// Byte offset where the reference definition ends
    /// (treated as exclusive by `LintContext::is_in_reference_def`)
    pub byte_end: usize,
}
182
/// Parsed code span information
///
/// Describes one inline code span; returned in bulk by `LintContext::code_spans`.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    /// (treated as exclusive by `LintContext::is_in_code_span`)
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    /// (treated as exclusive by `LintContext::is_in_code_block_or_span`)
    pub byte_end: usize,
    /// Number of backticks used (1, 2, 3, etc.)
    pub backtick_count: usize,
    /// Content inside the code span (without backticks)
    pub content: String,
}
201
/// Information about a heading
///
/// Attached to a `LineInfo` through its `heading` field when the line is a heading.
#[derive(Debug, Clone)]
pub struct HeadingInfo {
    /// Heading level (1-6 for ATX, 1-2 for Setext)
    pub level: u8,
    /// Style of heading
    pub style: HeadingStyle,
    /// The heading marker (# characters or underline)
    pub marker: String,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where heading text starts
    // presumably 0-based like `marker_column` — confirm.
    pub content_column: usize,
    /// The heading text (without markers and without custom ID syntax)
    pub text: String,
    /// Custom header ID if present (e.g., from {#custom-id} syntax)
    pub custom_id: Option<String>,
    /// Original heading text including custom ID syntax
    pub raw_text: String,
    /// Whether it has a closing sequence (for ATX)
    pub has_closing_sequence: bool,
    /// The closing sequence if present
    // presumably an empty string when `has_closing_sequence` is false — confirm.
    pub closing_sequence: String,
}
226
/// Information about a blockquote line
///
/// Attached to a `LineInfo` through its `blockquote` field when the line is a blockquote.
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Nesting level (1 for >, 2 for >>, etc.)
    pub nesting_level: usize,
    /// The indentation before the blockquote marker
    pub indent: String,
    /// Column where the first > starts (0-based)
    pub marker_column: usize,
    /// The blockquote prefix (e.g., "> ", ">> ", etc.)
    pub prefix: String,
    /// Content after the blockquote marker(s)
    pub content: String,
    /// Whether the line has no space after the marker
    pub has_no_space_after_marker: bool,
    /// Whether the line has multiple spaces after the marker
    pub has_multiple_spaces_after_marker: bool,
    /// Whether this is an empty blockquote line needing MD028 fix
    pub needs_md028_fix: bool,
}
247
/// Information about a list block
///
/// Collected in `LintContext::list_blocks`. `start_line..=end_line` is the inclusive line
/// range used by `LintContext::is_in_list_block` and `LintContext::list_block_for_line`.
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// Line number where the list starts (1-indexed)
    pub start_line: usize,
    /// Line number where the list ends (1-indexed, inclusive)
    pub end_line: usize,
    /// Whether it's ordered or unordered
    pub is_ordered: bool,
    /// The consistent marker for unordered lists (if any)
    pub marker: Option<String>,
    /// Blockquote prefix for this list (empty if not in blockquote)
    pub blockquote_prefix: String,
    /// Lines that are list items within this block
    // presumably 1-indexed line numbers like `start_line` — confirm.
    pub item_lines: Vec<usize>,
    /// Nesting level (0 for top-level lists)
    pub nesting_level: usize,
    /// Maximum marker width seen in this block (e.g., 3 for "1. ", 4 for "10. ")
    pub max_marker_width: usize,
}
268
269use std::sync::{Arc, Mutex};
270
/// Character frequency data for fast content analysis
///
/// Stored in `LintContext::char_frequency` and consulted by `has_char`, `char_count` and the
/// `likely_has_*` helpers so rules can cheaply rule out constructs that cannot be present.
/// `Default` yields all-zero counts.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Count of # characters (headings)
    pub hash_count: usize,
    /// Count of * characters (emphasis, lists, horizontal rules)
    pub asterisk_count: usize,
    /// Count of _ characters (emphasis, horizontal rules)
    pub underscore_count: usize,
    /// Count of - characters (lists, horizontal rules, setext headings)
    pub hyphen_count: usize,
    /// Count of + characters (lists)
    pub plus_count: usize,
    /// Count of > characters (blockquotes)
    pub gt_count: usize,
    /// Count of | characters (tables)
    pub pipe_count: usize,
    /// Count of [ characters (links, images)
    pub bracket_count: usize,
    /// Count of ` characters (code spans, code blocks)
    pub backtick_count: usize,
    /// Count of < characters (HTML tags, autolinks)
    pub lt_count: usize,
    /// Count of ! characters (images)
    pub exclamation_count: usize,
    /// Count of newline characters
    pub newline_count: usize,
}
299
/// Pre-parsed HTML tag information
///
/// Returned in bulk by `LintContext::html_tags`.
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    // NOTE(review): inclusive vs. exclusive is not visible from this declaration — confirm.
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Tag name (e.g., "div", "img", "br")
    pub tag_name: String,
    /// Whether it's a closing tag (`</tag>`)
    pub is_closing: bool,
    /// Whether it's self-closing (`<tag />`)
    pub is_self_closing: bool,
    /// Raw tag content
    pub raw_content: String,
}
322
/// Pre-parsed emphasis span information
///
/// Returned in bulk by `LintContext::emphasis_spans`.
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    // NOTE(review): inclusive vs. exclusive is not visible from this declaration — confirm.
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Type of emphasis ('*' or '_')
    pub marker: char,
    /// Number of markers (1 for italic, 2 for bold, 3+ for bold+italic)
    pub marker_count: usize,
    /// Content inside the emphasis
    pub content: String,
}
343
/// Pre-parsed table row information
///
/// Returned in bulk by `LintContext::table_rows`.
#[derive(Debug, Clone)]
pub struct TableRow {
    /// Line number (1-indexed)
    pub line: usize,
    /// Whether this is a separator row (contains only |, -, :, and spaces)
    pub is_separator: bool,
    /// Number of columns (pipe-separated cells)
    pub column_count: usize,
    /// Alignment info from separator row; each entry is one of
    /// "left", "center", "right" or "none"
    // presumably empty for non-separator rows — confirm against the table-row parser.
    pub column_alignments: Vec<String>, // "left", "center", "right", "none"
}
356
/// Pre-parsed bare URL information (not in links)
///
/// Returned in bulk by `LintContext::bare_urls`.
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    // NOTE(review): inclusive vs. exclusive is not visible from this declaration — confirm.
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// The URL string
    pub url: String,
    /// Type of URL ("http", "https", "ftp", "email")
    pub url_type: String,
}
375
/// Shared, pre-computed view of one Markdown document.
///
/// Built once by `LintContext::new` and then handed to rules, so that structural
/// information (line table, code blocks, links, images, lists, tables, ...) is parsed a
/// single time. Public fields are filled eagerly in `new`; the private `*_cache` fields sit
/// behind a `Mutex` and are filled on first access through the accessor of the same name
/// (the code-span cache is the exception: `new` seeds it up front).
pub struct LintContext<'a> {
    pub content: &'a str,
    pub line_offsets: Vec<usize>,
    pub code_blocks: Vec<(usize, usize)>, // Cached code block ranges (not including inline code spans)
    pub lines: Vec<LineInfo>,             // Pre-computed line information
    pub links: Vec<ParsedLink>,           // Pre-parsed links
    pub images: Vec<ParsedImage>,         // Pre-parsed images
    pub broken_links: Vec<BrokenLinkInfo>, // Broken/undefined references
    pub reference_defs: Vec<ReferenceDef>, // Reference definitions
    code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>, // Lazy-loaded inline code spans
    pub list_blocks: Vec<ListBlock>,      // Pre-parsed list blocks
    pub char_frequency: CharFrequency,    // Character frequency analysis
    html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>, // Lazy-loaded HTML tags
    emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>, // Lazy-loaded emphasis spans
    table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>, // Lazy-loaded table rows
    bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>, // Lazy-loaded bare URLs
    html_comment_ranges: Vec<crate::utils::skip_context::ByteRange>, // Pre-computed HTML comment ranges
    pub table_blocks: Vec<crate::utils::table_utils::TableBlock>, // Pre-computed table blocks
    pub line_index: crate::utils::range_utils::LineIndex, // Pre-computed line index for byte position calculations
    jinja_ranges: Vec<(usize, usize)>,    // Pre-computed Jinja template ranges ({{ }}, {% %})
    pub flavor: MarkdownFlavor,           // Markdown flavor being used
}
398
/// Borrowed pieces of a blockquote line, in source order.
///
/// Concatenating `indent`, `markers`, `spaces_after` and `content` reproduces the input line.
struct BlockquoteComponents<'a> {
    /// Leading spaces/tabs before the first `>`
    indent: &'a str,
    /// The run of consecutive `>` characters
    markers: &'a str,
    /// Spaces/tabs directly after the markers
    spaces_after: &'a str,
    /// Everything that follows
    content: &'a str,
}

/// Split a line into blockquote components without using a regex.
///
/// Returns `None` when the first character after any leading spaces/tabs is not `>`.
/// Only a contiguous run of `>` is treated as markers: in `"> > x"` the second `>` ends up
/// in `content`.
#[inline]
fn parse_blockquote_detailed(line: &str) -> Option<BlockquoteComponents<'_>> {
    const BLANKS: [char; 2] = [' ', '\t'];

    // Peel the line from the left one segment at a time; each `rest_*` is a suffix of `line`.
    let rest_after_indent = line.trim_start_matches(BLANKS);
    let rest_after_markers = rest_after_indent.trim_start_matches('>');
    if rest_after_markers.len() == rest_after_indent.len() {
        // No `>` was consumed, so this is not a blockquote line.
        return None;
    }
    let content = rest_after_markers.trim_start_matches(BLANKS);

    // Suffix lengths translate directly into split positions within `line`.
    let indent_end = line.len() - rest_after_indent.len();
    let markers_end = line.len() - rest_after_markers.len();
    let spaces_end = line.len() - content.len();

    Some(BlockquoteComponents {
        indent: &line[..indent_end],
        markers: &line[indent_end..markers_end],
        spaces_after: &line[markers_end..spaces_end],
        content,
    })
}
443
444impl<'a> LintContext<'a> {
445    pub fn new(content: &'a str, flavor: MarkdownFlavor) -> Self {
446        use std::time::Instant;
447        let profile = std::env::var("RUMDL_PROFILE_QUADRATIC").is_ok();
448
449        let start = Instant::now();
450        let mut line_offsets = vec![0];
451        for (i, c) in content.char_indices() {
452            if c == '\n' {
453                line_offsets.push(i + 1);
454            }
455        }
456        if profile {
457            eprintln!("[PROFILE] Line offsets: {:?}", start.elapsed());
458        }
459
460        // Detect code blocks once and cache them
461        let start = Instant::now();
462        let code_blocks = CodeBlockUtils::detect_code_blocks(content);
463        if profile {
464            eprintln!("[PROFILE] Code blocks: {:?}", start.elapsed());
465        }
466
467        // Pre-compute HTML comment ranges ONCE for all operations
468        let start = Instant::now();
469        let html_comment_ranges = crate::utils::skip_context::compute_html_comment_ranges(content);
470        if profile {
471            eprintln!("[PROFILE] HTML comment ranges: {:?}", start.elapsed());
472        }
473
474        // Pre-compute autodoc block ranges for MkDocs flavor (avoids O(n²) scaling)
475        let start = Instant::now();
476        let autodoc_ranges = if flavor == MarkdownFlavor::MkDocs {
477            crate::utils::mkdocstrings_refs::detect_autodoc_block_ranges(content)
478        } else {
479            Vec::new()
480        };
481        if profile {
482            eprintln!("[PROFILE] Autodoc block ranges: {:?}", start.elapsed());
483        }
484
485        // Pre-compute line information (without headings/blockquotes yet)
486        let start = Instant::now();
487        let mut lines = Self::compute_basic_line_info(
488            content,
489            &line_offsets,
490            &code_blocks,
491            flavor,
492            &html_comment_ranges,
493            &autodoc_ranges,
494        );
495        if profile {
496            eprintln!("[PROFILE] Basic line info: {:?}", start.elapsed());
497        }
498
499        // Detect HTML blocks BEFORE heading detection
500        let start = Instant::now();
501        Self::detect_html_blocks(&mut lines);
502        if profile {
503            eprintln!("[PROFILE] HTML blocks: {:?}", start.elapsed());
504        }
505
506        // Detect ESM import/export blocks in MDX files BEFORE heading detection
507        let start = Instant::now();
508        Self::detect_esm_blocks(&mut lines, flavor);
509        if profile {
510            eprintln!("[PROFILE] ESM blocks: {:?}", start.elapsed());
511        }
512
513        // Now detect headings and blockquotes
514        let start = Instant::now();
515        Self::detect_headings_and_blockquotes(content, &mut lines, flavor, &html_comment_ranges);
516        if profile {
517            eprintln!("[PROFILE] Headings & blockquotes: {:?}", start.elapsed());
518        }
519
520        // Parse code spans early so we can exclude them from link/image parsing
521        let start = Instant::now();
522        let code_spans = Self::parse_code_spans(content, &lines);
523        if profile {
524            eprintln!("[PROFILE] Code spans: {:?}", start.elapsed());
525        }
526
527        // Parse links, images, references, and list blocks
528        let start = Instant::now();
529        let (links, broken_links) =
530            Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor, &html_comment_ranges);
531        if profile {
532            eprintln!("[PROFILE] Links: {:?}", start.elapsed());
533        }
534
535        let start = Instant::now();
536        let images = Self::parse_images(content, &lines, &code_blocks, &code_spans, &html_comment_ranges);
537        if profile {
538            eprintln!("[PROFILE] Images: {:?}", start.elapsed());
539        }
540
541        let start = Instant::now();
542        let reference_defs = Self::parse_reference_defs(content, &lines);
543        if profile {
544            eprintln!("[PROFILE] Reference defs: {:?}", start.elapsed());
545        }
546
547        let start = Instant::now();
548        let list_blocks = Self::parse_list_blocks(&lines);
549        if profile {
550            eprintln!("[PROFILE] List blocks: {:?}", start.elapsed());
551        }
552
553        // Compute character frequency for fast content analysis
554        let start = Instant::now();
555        let char_frequency = Self::compute_char_frequency(content);
556        if profile {
557            eprintln!("[PROFILE] Char frequency: {:?}", start.elapsed());
558        }
559
560        // Pre-compute table blocks for rules that need them (MD013, MD055, MD056, MD058)
561        let start = Instant::now();
562        let table_blocks =
563            crate::utils::table_utils::TableUtils::find_table_blocks_with_code_info(content, &code_blocks, &code_spans);
564        if profile {
565            eprintln!("[PROFILE] Table blocks: {:?}", start.elapsed());
566        }
567
568        // Pre-compute LineIndex once for all rules (eliminates 46x content cloning)
569        let start = Instant::now();
570        let line_index = crate::utils::range_utils::LineIndex::new(content.to_string());
571        if profile {
572            eprintln!("[PROFILE] Line index: {:?}", start.elapsed());
573        }
574
575        // Pre-compute Jinja template ranges once for all rules (eliminates O(n×m) in MD011)
576        let start = Instant::now();
577        let jinja_ranges = crate::utils::jinja_utils::find_jinja_ranges(content);
578        if profile {
579            eprintln!("[PROFILE] Jinja ranges: {:?}", start.elapsed());
580        }
581
582        Self {
583            content,
584            line_offsets,
585            code_blocks,
586            lines,
587            links,
588            images,
589            broken_links,
590            reference_defs,
591            code_spans_cache: Mutex::new(Some(Arc::new(code_spans))),
592            list_blocks,
593            char_frequency,
594            html_tags_cache: Mutex::new(None),
595            emphasis_spans_cache: Mutex::new(None),
596            table_rows_cache: Mutex::new(None),
597            bare_urls_cache: Mutex::new(None),
598            html_comment_ranges,
599            table_blocks,
600            line_index,
601            jinja_ranges,
602            flavor,
603        }
604    }
605
606    /// Get code spans - computed lazily on first access
607    pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
608        let mut cache = self.code_spans_cache.lock().expect("Code spans cache mutex poisoned");
609
610        Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_code_spans(self.content, &self.lines))))
611    }
612
613    /// Get HTML tags - computed lazily on first access
614    pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
615        let mut cache = self.html_tags_cache.lock().expect("HTML tags cache mutex poisoned");
616
617        Arc::clone(cache.get_or_insert_with(|| {
618            Arc::new(Self::parse_html_tags(
619                self.content,
620                &self.lines,
621                &self.code_blocks,
622                self.flavor,
623            ))
624        }))
625    }
626
627    /// Get emphasis spans - computed lazily on first access
628    pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
629        let mut cache = self
630            .emphasis_spans_cache
631            .lock()
632            .expect("Emphasis spans cache mutex poisoned");
633
634        Arc::clone(
635            cache.get_or_insert_with(|| {
636                Arc::new(Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks))
637            }),
638        )
639    }
640
641    /// Get table rows - computed lazily on first access
642    pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
643        let mut cache = self.table_rows_cache.lock().expect("Table rows cache mutex poisoned");
644
645        Arc::clone(cache.get_or_insert_with(|| Arc::new(Self::parse_table_rows(&self.lines))))
646    }
647
648    /// Get bare URLs - computed lazily on first access
649    pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
650        let mut cache = self.bare_urls_cache.lock().expect("Bare URLs cache mutex poisoned");
651
652        Arc::clone(
653            cache.get_or_insert_with(|| Arc::new(Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks))),
654        )
655    }
656
657    /// Map a byte offset to (line, column)
658    pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
659        match self.line_offsets.binary_search(&offset) {
660            Ok(line) => (line + 1, 1),
661            Err(line) => {
662                let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
663                (line, offset - line_start + 1)
664            }
665        }
666    }
667
668    /// Check if a position is within a code block or code span
669    pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
670        // Check code blocks first
671        if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
672            return true;
673        }
674
675        // Check inline code spans (lazy load if needed)
676        self.code_spans()
677            .iter()
678            .any(|span| pos >= span.byte_offset && pos < span.byte_end)
679    }
680
681    /// Get line information by line number (1-indexed)
682    pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
683        if line_num > 0 {
684            self.lines.get(line_num - 1)
685        } else {
686            None
687        }
688    }
689
690    /// Get byte offset for a line number (1-indexed)
691    pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
692        self.line_info(line_num).map(|info| info.byte_offset)
693    }
694
695    /// Get URL for a reference link/image by its ID
696    pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
697        let normalized_id = ref_id.to_lowercase();
698        self.reference_defs
699            .iter()
700            .find(|def| def.id == normalized_id)
701            .map(|def| def.url.as_str())
702    }
703
704    /// Get links on a specific line
705    pub fn links_on_line(&self, line_num: usize) -> Vec<&ParsedLink> {
706        self.links.iter().filter(|link| link.line == line_num).collect()
707    }
708
709    /// Get images on a specific line
710    pub fn images_on_line(&self, line_num: usize) -> Vec<&ParsedImage> {
711        self.images.iter().filter(|img| img.line == line_num).collect()
712    }
713
714    /// Check if a line is part of a list block
715    pub fn is_in_list_block(&self, line_num: usize) -> bool {
716        self.list_blocks
717            .iter()
718            .any(|block| line_num >= block.start_line && line_num <= block.end_line)
719    }
720
721    /// Get the list block containing a specific line
722    pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
723        self.list_blocks
724            .iter()
725            .find(|block| line_num >= block.start_line && line_num <= block.end_line)
726    }
727
728    // Compatibility methods for DocumentStructure migration
729
730    /// Check if a line is within a code block
731    pub fn is_in_code_block(&self, line_num: usize) -> bool {
732        if line_num == 0 || line_num > self.lines.len() {
733            return false;
734        }
735        self.lines[line_num - 1].in_code_block
736    }
737
738    /// Check if a line is within front matter
739    pub fn is_in_front_matter(&self, line_num: usize) -> bool {
740        if line_num == 0 || line_num > self.lines.len() {
741            return false;
742        }
743        self.lines[line_num - 1].in_front_matter
744    }
745
746    /// Check if a line is within an HTML block
747    pub fn is_in_html_block(&self, line_num: usize) -> bool {
748        if line_num == 0 || line_num > self.lines.len() {
749            return false;
750        }
751        self.lines[line_num - 1].in_html_block
752    }
753
754    /// Check if a line and column is within a code span
755    pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
756        if line_num == 0 || line_num > self.lines.len() {
757            return false;
758        }
759
760        // Use the code spans cache to check
761        // Note: col is 1-indexed from caller, but span.start_col and span.end_col are 0-indexed
762        // Convert col to 0-indexed for comparison
763        let col_0indexed = if col > 0 { col - 1 } else { 0 };
764        let code_spans = self.code_spans();
765        code_spans
766            .iter()
767            .any(|span| span.line == line_num && col_0indexed >= span.start_col && col_0indexed < span.end_col)
768    }
769
770    /// Check if a byte position is within a reference definition
771    /// This is much faster than scanning the content with regex for each check (O(1) vs O(n))
772    #[inline]
773    pub fn is_in_reference_def(&self, byte_pos: usize) -> bool {
774        self.reference_defs
775            .iter()
776            .any(|ref_def| byte_pos >= ref_def.byte_offset && byte_pos < ref_def.byte_end)
777    }
778
779    /// Check if a byte position is within an HTML comment
780    /// This is much faster than scanning the content with regex for each check (O(k) vs O(n))
781    /// where k is the number of HTML comments (typically very small)
782    #[inline]
783    pub fn is_in_html_comment(&self, byte_pos: usize) -> bool {
784        self.html_comment_ranges
785            .iter()
786            .any(|range| byte_pos >= range.start && byte_pos < range.end)
787    }
788
789    /// Check if a byte position is within a Jinja template ({{ }} or {% %})
790    pub fn is_in_jinja_range(&self, byte_pos: usize) -> bool {
791        self.jinja_ranges
792            .iter()
793            .any(|(start, end)| byte_pos >= *start && byte_pos < *end)
794    }
795
796    /// Check if content has any instances of a specific character (fast)
797    pub fn has_char(&self, ch: char) -> bool {
798        match ch {
799            '#' => self.char_frequency.hash_count > 0,
800            '*' => self.char_frequency.asterisk_count > 0,
801            '_' => self.char_frequency.underscore_count > 0,
802            '-' => self.char_frequency.hyphen_count > 0,
803            '+' => self.char_frequency.plus_count > 0,
804            '>' => self.char_frequency.gt_count > 0,
805            '|' => self.char_frequency.pipe_count > 0,
806            '[' => self.char_frequency.bracket_count > 0,
807            '`' => self.char_frequency.backtick_count > 0,
808            '<' => self.char_frequency.lt_count > 0,
809            '!' => self.char_frequency.exclamation_count > 0,
810            '\n' => self.char_frequency.newline_count > 0,
811            _ => self.content.contains(ch), // Fallback for other characters
812        }
813    }
814
815    /// Get count of a specific character (fast)
816    pub fn char_count(&self, ch: char) -> usize {
817        match ch {
818            '#' => self.char_frequency.hash_count,
819            '*' => self.char_frequency.asterisk_count,
820            '_' => self.char_frequency.underscore_count,
821            '-' => self.char_frequency.hyphen_count,
822            '+' => self.char_frequency.plus_count,
823            '>' => self.char_frequency.gt_count,
824            '|' => self.char_frequency.pipe_count,
825            '[' => self.char_frequency.bracket_count,
826            '`' => self.char_frequency.backtick_count,
827            '<' => self.char_frequency.lt_count,
828            '!' => self.char_frequency.exclamation_count,
829            '\n' => self.char_frequency.newline_count,
830            _ => self.content.matches(ch).count(), // Fallback for other characters
831        }
832    }
833
834    /// Check if content likely contains headings (fast)
835    pub fn likely_has_headings(&self) -> bool {
836        self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 // Potential setext underlines
837    }
838
839    /// Check if content likely contains lists (fast)
840    pub fn likely_has_lists(&self) -> bool {
841        self.char_frequency.asterisk_count > 0
842            || self.char_frequency.hyphen_count > 0
843            || self.char_frequency.plus_count > 0
844    }
845
846    /// Check if content likely contains emphasis (fast)
847    pub fn likely_has_emphasis(&self) -> bool {
848        self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
849    }
850
851    /// Check if content likely contains tables (fast)
852    pub fn likely_has_tables(&self) -> bool {
853        self.char_frequency.pipe_count > 2
854    }
855
856    /// Check if content likely contains blockquotes (fast)
857    pub fn likely_has_blockquotes(&self) -> bool {
858        self.char_frequency.gt_count > 0
859    }
860
861    /// Check if content likely contains code (fast)
862    pub fn likely_has_code(&self) -> bool {
863        self.char_frequency.backtick_count > 0
864    }
865
866    /// Check if content likely contains links or images (fast)
867    pub fn likely_has_links_or_images(&self) -> bool {
868        self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
869    }
870
871    /// Check if content likely contains HTML (fast)
872    pub fn likely_has_html(&self) -> bool {
873        self.char_frequency.lt_count > 0
874    }
875
876    /// Get HTML tags on a specific line
877    pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
878        self.html_tags()
879            .iter()
880            .filter(|tag| tag.line == line_num)
881            .cloned()
882            .collect()
883    }
884
885    /// Get emphasis spans on a specific line
886    pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
887        self.emphasis_spans()
888            .iter()
889            .filter(|span| span.line == line_num)
890            .cloned()
891            .collect()
892    }
893
894    /// Get table rows on a specific line
895    pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
896        self.table_rows()
897            .iter()
898            .filter(|row| row.line == line_num)
899            .cloned()
900            .collect()
901    }
902
903    /// Get bare URLs on a specific line
904    pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
905        self.bare_urls()
906            .iter()
907            .filter(|url| url.line == line_num)
908            .cloned()
909            .collect()
910    }
911
912    /// Find the line index for a given byte offset using binary search.
913    /// Returns (line_index, line_number, column) where:
914    /// - line_index is the 0-based index in the lines array
915    /// - line_number is the 1-based line number
916    /// - column is the byte offset within that line
917    #[inline]
918    fn find_line_for_offset(lines: &[LineInfo], byte_offset: usize) -> (usize, usize, usize) {
919        // Binary search to find the line containing this byte offset
920        let idx = match lines.binary_search_by(|line| {
921            if byte_offset < line.byte_offset {
922                std::cmp::Ordering::Greater
923            } else if byte_offset > line.byte_offset + line.content.len() {
924                std::cmp::Ordering::Less
925            } else {
926                std::cmp::Ordering::Equal
927            }
928        }) {
929            Ok(idx) => idx,
930            Err(idx) => idx.saturating_sub(1),
931        };
932
933        let line = &lines[idx];
934        let line_num = idx + 1;
935        let col = byte_offset.saturating_sub(line.byte_offset);
936
937        (idx, line_num, col)
938    }
939
940    /// Check if a byte offset is within a code span using binary search
941    #[inline]
942    fn is_offset_in_code_span(code_spans: &[CodeSpan], offset: usize) -> bool {
943        // Since spans are sorted by byte_offset, use partition_point for binary search
944        let idx = code_spans.partition_point(|span| span.byte_offset <= offset);
945
946        // Check the span that starts at or before our offset
947        if idx > 0 {
948            let span = &code_spans[idx - 1];
949            if offset >= span.byte_offset && offset < span.byte_end {
950                return true;
951            }
952        }
953
954        false
955    }
956
957    /// Parse all links in the content
958    fn parse_links(
959        content: &str,
960        lines: &[LineInfo],
961        code_blocks: &[(usize, usize)],
962        code_spans: &[CodeSpan],
963        flavor: MarkdownFlavor,
964        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
965    ) -> (Vec<ParsedLink>, Vec<BrokenLinkInfo>) {
966        use crate::utils::skip_context::{is_in_html_comment_ranges, is_mkdocs_snippet_line};
967        use std::collections::HashSet;
968
969        let mut links = Vec::with_capacity(content.len() / 500);
970        let mut broken_links = Vec::new();
971
972        // Track byte positions of links found by pulldown-cmark
973        let mut found_positions = HashSet::new();
974
975        // Use pulldown-cmark's streaming parser with BrokenLink callback
976        // The callback captures undefined references: [text][undefined], [shortcut], [text][]
977        // This automatically handles:
978        // - Escaped links (won't generate events)
979        // - Links in code blocks/spans (won't generate Link events)
980        // - Images (generates Tag::Image instead)
981        // - Reference resolution (dest_url is already resolved!)
982        // - Broken references (callback is invoked)
983        let parser = Parser::new_with_broken_link_callback(
984            content,
985            pulldown_cmark::Options::empty(),
986            Some(|link: BrokenLink<'_>| {
987                broken_links.push(BrokenLinkInfo {
988                    reference: link.reference.to_string(),
989                    span: link.span.clone(),
990                });
991                None
992            }),
993        )
994        .into_offset_iter();
995
996        let mut link_stack: Vec<(usize, usize, String, LinkType, String)> = Vec::new();
997        let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
998
999        for (event, range) in parser {
1000            match event {
1001                Event::Start(Tag::Link {
1002                    link_type,
1003                    dest_url,
1004                    id,
1005                    ..
1006                }) => {
1007                    // Link start - record position, URL, and reference ID
1008                    link_stack.push((range.start, range.end, dest_url.to_string(), link_type, id.to_string()));
1009                    text_chunks.clear();
1010                }
1011                Event::Text(text) if !link_stack.is_empty() => {
1012                    // Track text content with its byte range
1013                    text_chunks.push((text.to_string(), range.start, range.end));
1014                }
1015                Event::Code(code) if !link_stack.is_empty() => {
1016                    // Include inline code in link text (with backticks)
1017                    let code_text = format!("`{code}`");
1018                    text_chunks.push((code_text, range.start, range.end));
1019                }
1020                Event::End(TagEnd::Link) => {
1021                    if let Some((start_pos, _link_start_end, url, link_type, ref_id)) = link_stack.pop() {
1022                        // Skip if in HTML comment
1023                        if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1024                            text_chunks.clear();
1025                            continue;
1026                        }
1027
1028                        // Find line and column information
1029                        let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1030
1031                        // Skip if this link is on a MkDocs snippet line
1032                        if is_mkdocs_snippet_line(&lines[line_idx].content, flavor) {
1033                            text_chunks.clear();
1034                            continue;
1035                        }
1036
1037                        let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1038
1039                        let is_reference = matches!(
1040                            link_type,
1041                            LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1042                        );
1043
1044                        // Extract link text directly from source bytes to preserve escaping
1045                        // Text events from pulldown-cmark unescape \] → ], which breaks MD039
1046                        let link_text = if start_pos < content.len() {
1047                            let link_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1048
1049                            // Find MATCHING ] by tracking bracket depth for nested brackets
1050                            // An unescaped bracket is one NOT preceded by an odd number of backslashes
1051                            let mut close_pos = None;
1052                            let mut depth = 0;
1053
1054                            for (i, &byte) in link_bytes.iter().enumerate().skip(1) {
1055                                // Count preceding backslashes
1056                                let mut backslash_count = 0;
1057                                let mut j = i;
1058                                while j > 0 && link_bytes[j - 1] == b'\\' {
1059                                    backslash_count += 1;
1060                                    j -= 1;
1061                                }
1062                                let is_escaped = backslash_count % 2 != 0;
1063
1064                                if !is_escaped {
1065                                    if byte == b'[' {
1066                                        depth += 1;
1067                                    } else if byte == b']' {
1068                                        if depth == 0 {
1069                                            // Found the matching closing bracket
1070                                            close_pos = Some(i);
1071                                            break;
1072                                        } else {
1073                                            depth -= 1;
1074                                        }
1075                                    }
1076                                }
1077                            }
1078
1079                            if let Some(pos) = close_pos {
1080                                std::str::from_utf8(&link_bytes[1..pos]).unwrap_or("").to_string()
1081                            } else {
1082                                String::new()
1083                            }
1084                        } else {
1085                            String::new()
1086                        };
1087
1088                        // For reference links, use the actual reference ID from pulldown-cmark
1089                        let reference_id = if is_reference && !ref_id.is_empty() {
1090                            Some(ref_id.to_lowercase())
1091                        } else if is_reference {
1092                            // For collapsed/shortcut references without explicit ID, use the link text
1093                            Some(link_text.to_lowercase())
1094                        } else {
1095                            None
1096                        };
1097
1098                        // WORKAROUND: pulldown-cmark bug with escaped brackets
1099                        // Check for escaped image syntax: \![text](url)
1100                        // The byte_offset points to the '[', so we check 2 bytes back for '\!'
1101                        let has_escaped_bang = start_pos >= 2
1102                            && content.as_bytes().get(start_pos - 2) == Some(&b'\\')
1103                            && content.as_bytes().get(start_pos - 1) == Some(&b'!');
1104
1105                        // Check for escaped bracket: \[text](url)
1106                        // The byte_offset points to the '[', so we check 1 byte back for '\'
1107                        let has_escaped_bracket =
1108                            start_pos >= 1 && content.as_bytes().get(start_pos - 1) == Some(&b'\\');
1109
1110                        if has_escaped_bang || has_escaped_bracket {
1111                            text_chunks.clear();
1112                            continue; // Skip: this is escaped markdown, not a real link
1113                        }
1114
1115                        // Track this position as found
1116                        found_positions.insert(start_pos);
1117
1118                        links.push(ParsedLink {
1119                            line: line_num,
1120                            start_col: col_start,
1121                            end_col: col_end,
1122                            byte_offset: start_pos,
1123                            byte_end: range.end,
1124                            text: link_text,
1125                            url,
1126                            is_reference,
1127                            reference_id,
1128                        });
1129
1130                        text_chunks.clear();
1131                    }
1132                }
1133                _ => {}
1134            }
1135        }
1136
1137        // Also find undefined references using regex
1138        // These are patterns like [text][ref] that pulldown-cmark didn't parse as links
1139        // because the reference is undefined
1140        for cap in LINK_PATTERN.captures_iter(content) {
1141            let full_match = cap.get(0).unwrap();
1142            let match_start = full_match.start();
1143            let match_end = full_match.end();
1144
1145            // Skip if this was already found by pulldown-cmark (it's a valid link)
1146            if found_positions.contains(&match_start) {
1147                continue;
1148            }
1149
1150            // Skip if escaped
1151            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1152                continue;
1153            }
1154
1155            // Skip if it's an image
1156            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
1157                continue;
1158            }
1159
1160            // Skip if in code block
1161            if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
1162                continue;
1163            }
1164
1165            // Skip if in code span
1166            if Self::is_offset_in_code_span(code_spans, match_start) {
1167                continue;
1168            }
1169
1170            // Skip if in HTML comment
1171            if is_in_html_comment_ranges(html_comment_ranges, match_start) {
1172                continue;
1173            }
1174
1175            // Find line and column information
1176            let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1177
1178            // Skip if this link is on a MkDocs snippet line
1179            if is_mkdocs_snippet_line(&lines[line_idx].content, flavor) {
1180                continue;
1181            }
1182
1183            let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1184
1185            let text = cap.get(1).map_or("", |m| m.as_str()).to_string();
1186
1187            // Only process reference links (group 6)
1188            if let Some(ref_id) = cap.get(6) {
1189                let ref_id_str = ref_id.as_str();
1190                let normalized_ref = if ref_id_str.is_empty() {
1191                    text.to_lowercase() // Implicit reference
1192                } else {
1193                    ref_id_str.to_lowercase()
1194                };
1195
1196                // This is an undefined reference (pulldown-cmark didn't parse it)
1197                links.push(ParsedLink {
1198                    line: line_num,
1199                    start_col: col_start,
1200                    end_col: col_end,
1201                    byte_offset: match_start,
1202                    byte_end: match_end,
1203                    text,
1204                    url: String::new(), // Empty URL indicates undefined reference
1205                    is_reference: true,
1206                    reference_id: Some(normalized_ref),
1207                });
1208            }
1209        }
1210
1211        (links, broken_links)
1212    }
1213
1214    /// Parse all images in the content
1215    fn parse_images(
1216        content: &str,
1217        lines: &[LineInfo],
1218        code_blocks: &[(usize, usize)],
1219        code_spans: &[CodeSpan],
1220        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1221    ) -> Vec<ParsedImage> {
1222        use crate::utils::skip_context::is_in_html_comment_ranges;
1223        use std::collections::HashSet;
1224
1225        // Pre-size based on a heuristic: images are less common than links
1226        let mut images = Vec::with_capacity(content.len() / 1000);
1227        let mut found_positions = HashSet::new();
1228
1229        // Use pulldown-cmark for parsing - more accurate and faster
1230        let parser = Parser::new(content).into_offset_iter();
1231        let mut image_stack: Vec<(usize, String, LinkType, String)> = Vec::new();
1232        let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1233
1234        for (event, range) in parser {
1235            match event {
1236                Event::Start(Tag::Image {
1237                    link_type,
1238                    dest_url,
1239                    id,
1240                    ..
1241                }) => {
1242                    image_stack.push((range.start, dest_url.to_string(), link_type, id.to_string()));
1243                    text_chunks.clear();
1244                }
1245                Event::Text(text) if !image_stack.is_empty() => {
1246                    text_chunks.push((text.to_string(), range.start, range.end));
1247                }
1248                Event::Code(code) if !image_stack.is_empty() => {
1249                    let code_text = format!("`{code}`");
1250                    text_chunks.push((code_text, range.start, range.end));
1251                }
1252                Event::End(TagEnd::Image) => {
1253                    if let Some((start_pos, url, link_type, ref_id)) = image_stack.pop() {
1254                        // Skip if in code block
1255                        if CodeBlockUtils::is_in_code_block(code_blocks, start_pos) {
1256                            continue;
1257                        }
1258
1259                        // Skip if in code span
1260                        if Self::is_offset_in_code_span(code_spans, start_pos) {
1261                            continue;
1262                        }
1263
1264                        // Skip if in HTML comment
1265                        if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1266                            continue;
1267                        }
1268
1269                        // Find line and column using binary search
1270                        let (_, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1271                        let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1272
1273                        let is_reference = matches!(
1274                            link_type,
1275                            LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1276                        );
1277
1278                        // Extract alt text directly from source bytes to preserve escaping
1279                        // Text events from pulldown-cmark unescape \] → ], which breaks rules that need escaping
1280                        let alt_text = if start_pos < content.len() {
1281                            let image_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1282
1283                            // Find MATCHING ] by tracking bracket depth for nested brackets
1284                            // An unescaped bracket is one NOT preceded by an odd number of backslashes
1285                            let mut close_pos = None;
1286                            let mut depth = 0;
1287
1288                            if image_bytes.len() > 2 {
1289                                for (i, &byte) in image_bytes.iter().enumerate().skip(2) {
1290                                    // Count preceding backslashes
1291                                    let mut backslash_count = 0;
1292                                    let mut j = i;
1293                                    while j > 0 && image_bytes[j - 1] == b'\\' {
1294                                        backslash_count += 1;
1295                                        j -= 1;
1296                                    }
1297                                    let is_escaped = backslash_count % 2 != 0;
1298
1299                                    if !is_escaped {
1300                                        if byte == b'[' {
1301                                            depth += 1;
1302                                        } else if byte == b']' {
1303                                            if depth == 0 {
1304                                                // Found the matching closing bracket
1305                                                close_pos = Some(i);
1306                                                break;
1307                                            } else {
1308                                                depth -= 1;
1309                                            }
1310                                        }
1311                                    }
1312                                }
1313                            }
1314
1315                            if let Some(pos) = close_pos {
1316                                std::str::from_utf8(&image_bytes[2..pos]).unwrap_or("").to_string()
1317                            } else {
1318                                String::new()
1319                            }
1320                        } else {
1321                            String::new()
1322                        };
1323
1324                        let reference_id = if is_reference && !ref_id.is_empty() {
1325                            Some(ref_id.to_lowercase())
1326                        } else if is_reference {
1327                            Some(alt_text.to_lowercase()) // Collapsed/shortcut references
1328                        } else {
1329                            None
1330                        };
1331
1332                        found_positions.insert(start_pos);
1333                        images.push(ParsedImage {
1334                            line: line_num,
1335                            start_col: col_start,
1336                            end_col: col_end,
1337                            byte_offset: start_pos,
1338                            byte_end: range.end,
1339                            alt_text,
1340                            url,
1341                            is_reference,
1342                            reference_id,
1343                        });
1344                    }
1345                }
1346                _ => {}
1347            }
1348        }
1349
1350        // Regex fallback for undefined references that pulldown-cmark treats as plain text
1351        for cap in IMAGE_PATTERN.captures_iter(content) {
1352            let full_match = cap.get(0).unwrap();
1353            let match_start = full_match.start();
1354            let match_end = full_match.end();
1355
1356            // Skip if already found by pulldown-cmark
1357            if found_positions.contains(&match_start) {
1358                continue;
1359            }
1360
1361            // Skip if the ! is escaped
1362            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1363                continue;
1364            }
1365
1366            // Skip if in code block, code span, or HTML comment
1367            if CodeBlockUtils::is_in_code_block(code_blocks, match_start)
1368                || Self::is_offset_in_code_span(code_spans, match_start)
1369                || is_in_html_comment_ranges(html_comment_ranges, match_start)
1370            {
1371                continue;
1372            }
1373
1374            // Only process reference images (undefined references not found by pulldown-cmark)
1375            if let Some(ref_id) = cap.get(6) {
1376                let (_, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1377                let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1378                let alt_text = cap.get(1).map_or("", |m| m.as_str()).to_string();
1379                let ref_id_str = ref_id.as_str();
1380                let normalized_ref = if ref_id_str.is_empty() {
1381                    alt_text.to_lowercase()
1382                } else {
1383                    ref_id_str.to_lowercase()
1384                };
1385
1386                images.push(ParsedImage {
1387                    line: line_num,
1388                    start_col: col_start,
1389                    end_col: col_end,
1390                    byte_offset: match_start,
1391                    byte_end: match_end,
1392                    alt_text,
1393                    url: String::new(),
1394                    is_reference: true,
1395                    reference_id: Some(normalized_ref),
1396                });
1397            }
1398        }
1399
1400        images
1401    }
1402
1403    /// Parse reference definitions
1404    fn parse_reference_defs(_content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
1405        // Pre-size based on lines count as reference definitions are line-based
1406        let mut refs = Vec::with_capacity(lines.len() / 20); // ~1 ref per 20 lines
1407
1408        for (line_idx, line_info) in lines.iter().enumerate() {
1409            // Skip lines in code blocks
1410            if line_info.in_code_block {
1411                continue;
1412            }
1413
1414            let line = &line_info.content;
1415            let line_num = line_idx + 1;
1416
1417            if let Some(cap) = REF_DEF_PATTERN.captures(line) {
1418                let id = cap.get(1).unwrap().as_str().to_lowercase();
1419                let url = cap.get(2).unwrap().as_str().to_string();
1420                let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
1421
1422                // Calculate byte positions
1423                // The match starts at the beginning of the line (0) and extends to the end
1424                let match_obj = cap.get(0).unwrap();
1425                let byte_offset = line_info.byte_offset + match_obj.start();
1426                let byte_end = line_info.byte_offset + match_obj.end();
1427
1428                refs.push(ReferenceDef {
1429                    line: line_num,
1430                    id,
1431                    url,
1432                    title,
1433                    byte_offset,
1434                    byte_end,
1435                });
1436            }
1437        }
1438
1439        refs
1440    }
1441
1442    /// Fast blockquote prefix parser - replaces regex for 5-10x speedup
1443    /// Matches: ^(\s*>\s*)(.*)
1444    /// Returns: Some((prefix_with_ws, content_after_prefix)) or None
1445    #[inline]
1446    fn parse_blockquote_prefix(line: &str) -> Option<(&str, &str)> {
1447        let trimmed_start = line.trim_start();
1448        if !trimmed_start.starts_with('>') {
1449            return None;
1450        }
1451
1452        let leading_ws_len = line.len() - trimmed_start.len();
1453        let after_gt = &trimmed_start[1..];
1454        let content = after_gt.trim_start();
1455        let ws_after_gt_len = after_gt.len() - content.len();
1456        let prefix_len = leading_ws_len + 1 + ws_after_gt_len;
1457
1458        Some((&line[..prefix_len], content))
1459    }
1460
1461    /// Fast unordered list parser - replaces regex for 5-10x speedup
1462    /// Matches: ^(\s*)([-*+])([ \t]*)(.*)
1463    /// Returns: Some((leading_ws, marker, spacing, content)) or None
1464    #[inline]
1465    fn parse_unordered_list(line: &str) -> Option<(&str, char, &str, &str)> {
1466        let bytes = line.as_bytes();
1467        let mut i = 0;
1468
1469        // Skip leading whitespace
1470        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1471            i += 1;
1472        }
1473
1474        // Check for marker
1475        if i >= bytes.len() {
1476            return None;
1477        }
1478        let marker = bytes[i] as char;
1479        if marker != '-' && marker != '*' && marker != '+' {
1480            return None;
1481        }
1482        let marker_pos = i;
1483        i += 1;
1484
1485        // Collect spacing after marker (space or tab only)
1486        let spacing_start = i;
1487        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1488            i += 1;
1489        }
1490
1491        Some((&line[..marker_pos], marker, &line[spacing_start..i], &line[i..]))
1492    }
1493
1494    /// Fast ordered list parser - replaces regex for 5-10x speedup
1495    /// Matches: ^(\s*)(\d+)([.)])([ \t]*)(.*)
1496    /// Returns: Some((leading_ws, number_str, delimiter, spacing, content)) or None
1497    #[inline]
1498    fn parse_ordered_list(line: &str) -> Option<(&str, &str, char, &str, &str)> {
1499        let bytes = line.as_bytes();
1500        let mut i = 0;
1501
1502        // Skip leading whitespace
1503        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1504            i += 1;
1505        }
1506
1507        // Collect digits
1508        let number_start = i;
1509        while i < bytes.len() && bytes[i].is_ascii_digit() {
1510            i += 1;
1511        }
1512        if i == number_start {
1513            return None; // No digits found
1514        }
1515
1516        // Check for delimiter
1517        if i >= bytes.len() {
1518            return None;
1519        }
1520        let delimiter = bytes[i] as char;
1521        if delimiter != '.' && delimiter != ')' {
1522            return None;
1523        }
1524        let delimiter_pos = i;
1525        i += 1;
1526
1527        // Collect spacing after delimiter (space or tab only)
1528        let spacing_start = i;
1529        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1530            i += 1;
1531        }
1532
1533        Some((
1534            &line[..number_start],
1535            &line[number_start..delimiter_pos],
1536            delimiter,
1537            &line[spacing_start..i],
1538            &line[i..],
1539        ))
1540    }
1541
1542    /// Pre-compute which lines are in code blocks - O(m*n) where m=code_blocks, n=lines
1543    /// Returns a Vec<bool> where index i indicates if line i is in a code block
1544    fn compute_code_block_line_map(content: &str, line_offsets: &[usize], code_blocks: &[(usize, usize)]) -> Vec<bool> {
1545        let num_lines = line_offsets.len();
1546        let mut in_code_block = vec![false; num_lines];
1547
1548        // For each code block, mark all lines within it
1549        for &(start, end) in code_blocks {
1550            // Ensure we're at valid UTF-8 boundaries
1551            let safe_start = if start > 0 && !content.is_char_boundary(start) {
1552                let mut boundary = start;
1553                while boundary > 0 && !content.is_char_boundary(boundary) {
1554                    boundary -= 1;
1555                }
1556                boundary
1557            } else {
1558                start
1559            };
1560
1561            let safe_end = if end < content.len() && !content.is_char_boundary(end) {
1562                let mut boundary = end;
1563                while boundary < content.len() && !content.is_char_boundary(boundary) {
1564                    boundary += 1;
1565                }
1566                boundary
1567            } else {
1568                end.min(content.len())
1569            };
1570
1571            // Trust the code blocks detected by CodeBlockUtils::detect_code_blocks()
1572            // That function now has proper list context awareness (see code_block_utils.rs)
1573            // and correctly distinguishes between:
1574            // - Fenced code blocks (``` or ~~~)
1575            // - Indented code blocks at document level (4 spaces + blank line before)
1576            // - List continuation paragraphs (NOT code blocks, even with 4 spaces)
1577            //
1578            // We no longer need to re-validate here. The original validation logic
1579            // was causing false positives by marking list continuation paragraphs as
1580            // code blocks when they have 4 spaces of indentation.
1581
1582            // Use binary search to find the first and last line indices
1583            // line_offsets is sorted, so we can use partition_point for O(log n) lookup
1584            // Use safe_start/safe_end (UTF-8 boundaries) for consistent line mapping
1585            let first_line = line_offsets.partition_point(|&offset| offset < safe_start);
1586            let last_line = line_offsets.partition_point(|&offset| offset < safe_end);
1587
1588            // Mark all lines in the range at once
1589            for flag in in_code_block.iter_mut().take(last_line).skip(first_line) {
1590                *flag = true;
1591            }
1592        }
1593
1594        in_code_block
1595    }
1596
1597    /// Pre-compute basic line information (without headings/blockquotes)
1598    fn compute_basic_line_info(
1599        content: &str,
1600        line_offsets: &[usize],
1601        code_blocks: &[(usize, usize)],
1602        flavor: MarkdownFlavor,
1603        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1604        autodoc_ranges: &[crate::utils::skip_context::ByteRange],
1605    ) -> Vec<LineInfo> {
1606        let content_lines: Vec<&str> = content.lines().collect();
1607        let mut lines = Vec::with_capacity(content_lines.len());
1608
1609        // Pre-compute which lines are in code blocks
1610        let code_block_map = Self::compute_code_block_line_map(content, line_offsets, code_blocks);
1611
1612        // Detect front matter boundaries FIRST, before any other parsing
1613        // Use FrontMatterUtils to detect all types of front matter (YAML, TOML, JSON, malformed)
1614        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1615
1616        for (i, line) in content_lines.iter().enumerate() {
1617            let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
1618            let indent = line.len() - line.trim_start().len();
1619
1620            // Parse blockquote prefix once and reuse it (avoid redundant parsing)
1621            let blockquote_parse = Self::parse_blockquote_prefix(line);
1622
1623            // For blank detection, consider blockquote context
1624            let is_blank = if let Some((_, content)) = blockquote_parse {
1625                // In blockquote context, check if content after prefix is blank
1626                content.trim().is_empty()
1627            } else {
1628                line.trim().is_empty()
1629            };
1630
1631            // Use pre-computed map for O(1) lookup instead of O(m) iteration
1632            let in_code_block = code_block_map.get(i).copied().unwrap_or(false);
1633
1634            // Detect list items (skip if in frontmatter, in mkdocstrings block, or in HTML comment)
1635            let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
1636                && crate::utils::mkdocstrings_refs::is_within_autodoc_block_ranges(autodoc_ranges, byte_offset);
1637            // Use pre-computed ranges for efficiency (O(log n) vs O(file_size))
1638            let in_html_comment =
1639                crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, byte_offset);
1640            let list_item = if !(in_code_block
1641                || is_blank
1642                || in_mkdocstrings
1643                || in_html_comment
1644                || (front_matter_end > 0 && i < front_matter_end))
1645            {
1646                // Strip blockquote prefix if present for list detection (reuse cached result)
1647                let (line_for_list_check, blockquote_prefix_len) = if let Some((prefix, content)) = blockquote_parse {
1648                    (content, prefix.len())
1649                } else {
1650                    (&**line, 0)
1651                };
1652
1653                if let Some((leading_spaces, marker, spacing, _content)) =
1654                    Self::parse_unordered_list(line_for_list_check)
1655                {
1656                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1657                    let content_column = marker_column + 1 + spacing.len();
1658
1659                    // According to CommonMark spec, unordered list items MUST have at least one space
1660                    // after the marker (-, *, or +). Without a space, it's not a list item.
1661                    // This also naturally handles cases like:
1662                    // - *emphasis* (not a list)
1663                    // - **bold** (not a list)
1664                    // - --- (horizontal rule, not a list)
1665                    if spacing.is_empty() {
1666                        None
1667                    } else {
1668                        Some(ListItemInfo {
1669                            marker: marker.to_string(),
1670                            is_ordered: false,
1671                            number: None,
1672                            marker_column,
1673                            content_column,
1674                        })
1675                    }
1676                } else if let Some((leading_spaces, number_str, delimiter, spacing, _content)) =
1677                    Self::parse_ordered_list(line_for_list_check)
1678                {
1679                    let marker = format!("{number_str}{delimiter}");
1680                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1681                    let content_column = marker_column + marker.len() + spacing.len();
1682
1683                    // According to CommonMark spec, ordered list items MUST have at least one space
1684                    // after the marker (period or parenthesis). Without a space, it's not a list item.
1685                    if spacing.is_empty() {
1686                        None
1687                    } else {
1688                        Some(ListItemInfo {
1689                            marker,
1690                            is_ordered: true,
1691                            number: number_str.parse().ok(),
1692                            marker_column,
1693                            content_column,
1694                        })
1695                    }
1696                } else {
1697                    None
1698                }
1699            } else {
1700                None
1701            };
1702
1703            lines.push(LineInfo {
1704                content: line.to_string(),
1705                byte_offset,
1706                indent,
1707                is_blank,
1708                in_code_block,
1709                in_front_matter: front_matter_end > 0 && i < front_matter_end,
1710                in_html_block: false, // Will be populated after line creation
1711                in_html_comment,
1712                list_item,
1713                heading: None,    // Will be populated in second pass for Setext headings
1714                blockquote: None, // Will be populated after line creation
1715                in_mkdocstrings,
1716                in_esm_block: false, // Will be populated after line creation for MDX files
1717            });
1718        }
1719
1720        lines
1721    }
1722
1723    /// Detect headings and blockquotes (called after HTML block detection)
1724    fn detect_headings_and_blockquotes(
1725        content: &str,
1726        lines: &mut [LineInfo],
1727        flavor: MarkdownFlavor,
1728        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1729    ) {
1730        // Regex for heading detection
1731        static ATX_HEADING_REGEX: LazyLock<regex::Regex> =
1732            LazyLock::new(|| regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap());
1733        static SETEXT_UNDERLINE_REGEX: LazyLock<regex::Regex> =
1734            LazyLock::new(|| regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap());
1735
1736        let content_lines: Vec<&str> = content.lines().collect();
1737
1738        // Detect front matter boundaries to skip those lines
1739        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1740
1741        // Detect headings (including Setext which needs look-ahead) and blockquotes
1742        for i in 0..lines.len() {
1743            if lines[i].in_code_block {
1744                continue;
1745            }
1746
1747            // Skip lines in front matter
1748            if front_matter_end > 0 && i < front_matter_end {
1749                continue;
1750            }
1751
1752            // Skip lines in HTML blocks - HTML content should not be parsed as markdown
1753            if lines[i].in_html_block {
1754                continue;
1755            }
1756
1757            let line = content_lines[i];
1758
1759            // Check for blockquotes (even on blank lines within blockquotes)
1760            if let Some(bq) = parse_blockquote_detailed(line) {
1761                let nesting_level = bq.markers.len(); // Each '>' is one level
1762                let marker_column = bq.indent.len();
1763
1764                // Build the prefix (indentation + markers + space)
1765                let prefix = format!("{}{}{}", bq.indent, bq.markers, bq.spaces_after);
1766
1767                // Check for various blockquote issues
1768                let has_no_space = bq.spaces_after.is_empty() && !bq.content.is_empty();
1769                // Consider tabs as multiple spaces, or actual multiple spaces
1770                let has_multiple_spaces = bq.spaces_after.len() > 1 || bq.spaces_after.contains('\t');
1771
1772                // Check if needs MD028 fix (empty blockquote line without proper spacing)
1773                // MD028 flags empty blockquote lines that don't have a single space after the marker
1774                // Lines like "> " or ">> " are already correct and don't need fixing
1775                let needs_md028_fix = bq.content.is_empty() && bq.spaces_after.is_empty();
1776
1777                lines[i].blockquote = Some(BlockquoteInfo {
1778                    nesting_level,
1779                    indent: bq.indent.to_string(),
1780                    marker_column,
1781                    prefix,
1782                    content: bq.content.to_string(),
1783                    has_no_space_after_marker: has_no_space,
1784                    has_multiple_spaces_after_marker: has_multiple_spaces,
1785                    needs_md028_fix,
1786                });
1787            }
1788
1789            // Skip heading detection for blank lines
1790            if lines[i].is_blank {
1791                continue;
1792            }
1793
1794            // Check for ATX headings (but skip MkDocs snippet lines)
1795            // In MkDocs flavor, lines like "# -8<- [start:name]" are snippet markers, not headings
1796            let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
1797                crate::utils::mkdocs_snippets::is_snippet_section_start(line)
1798                    || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
1799            } else {
1800                false
1801            };
1802
1803            if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1804                // Skip headings inside HTML comments (using pre-computed ranges for efficiency)
1805                if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset) {
1806                    continue;
1807                }
1808                let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1809                let hashes = caps.get(2).map_or("", |m| m.as_str());
1810                let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1811                let rest = caps.get(4).map_or("", |m| m.as_str());
1812
1813                let level = hashes.len() as u8;
1814                let marker_column = leading_spaces.len();
1815
1816                // Check for closing sequence, but handle custom IDs that might come after
1817                let (text, has_closing, closing_seq) = {
1818                    // First check if there's a custom ID at the end
1819                    let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1820                        // Check if this looks like a valid custom ID (ends with })
1821                        if rest[id_start..].trim_end().ends_with('}') {
1822                            // Split off the custom ID
1823                            (&rest[..id_start], &rest[id_start..])
1824                        } else {
1825                            (rest, "")
1826                        }
1827                    } else {
1828                        (rest, "")
1829                    };
1830
1831                    // Now look for closing hashes in the part before the custom ID
1832                    let trimmed_rest = rest_without_id.trim_end();
1833                    if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1834                        // Look for the start of the hash sequence
1835                        let mut start_of_hashes = last_hash_pos;
1836                        while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1837                            start_of_hashes -= 1;
1838                        }
1839
1840                        // Check if there's at least one space before the closing hashes
1841                        let has_space_before = start_of_hashes == 0
1842                            || trimmed_rest
1843                                .chars()
1844                                .nth(start_of_hashes - 1)
1845                                .is_some_and(|c| c.is_whitespace());
1846
1847                        // Check if this is a valid closing sequence (all hashes to end of trimmed part)
1848                        let potential_closing = &trimmed_rest[start_of_hashes..];
1849                        let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1850
1851                        if is_all_hashes && has_space_before {
1852                            // This is a closing sequence
1853                            let closing_hashes = potential_closing.to_string();
1854                            // The text is everything before the closing hashes
1855                            // Don't include the custom ID here - it will be extracted later
1856                            let text_part = if !custom_id_part.is_empty() {
1857                                // If we have a custom ID, append it back to get the full rest
1858                                // This allows the extract_header_id function to handle it properly
1859                                format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1860                            } else {
1861                                rest_without_id[..start_of_hashes].trim_end().to_string()
1862                            };
1863                            (text_part, true, closing_hashes)
1864                        } else {
1865                            // Not a valid closing sequence, return the full content
1866                            (rest.to_string(), false, String::new())
1867                        }
1868                    } else {
1869                        // No hashes found, return the full content
1870                        (rest.to_string(), false, String::new())
1871                    }
1872                };
1873
1874                let content_column = marker_column + hashes.len() + spaces_after.len();
1875
1876                // Extract custom header ID if present
1877                let raw_text = text.trim().to_string();
1878                let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1879
1880                // If no custom ID was found on the header line, check the next line for standalone attr-list
1881                if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1882                    let next_line = content_lines[i + 1];
1883                    if !lines[i + 1].in_code_block
1884                        && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1885                        && let Some(next_line_id) =
1886                            crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1887                    {
1888                        custom_id = Some(next_line_id);
1889                    }
1890                }
1891
1892                lines[i].heading = Some(HeadingInfo {
1893                    level,
1894                    style: HeadingStyle::ATX,
1895                    marker: hashes.to_string(),
1896                    marker_column,
1897                    content_column,
1898                    text: clean_text,
1899                    custom_id,
1900                    raw_text,
1901                    has_closing_sequence: has_closing,
1902                    closing_sequence: closing_seq,
1903                });
1904            }
1905            // Check for Setext headings (need to look at next line)
1906            else if i + 1 < content_lines.len() && i + 1 < lines.len() {
1907                let next_line = content_lines[i + 1];
1908                if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
1909                    // Skip if next line is front matter delimiter
1910                    if front_matter_end > 0 && i < front_matter_end {
1911                        continue;
1912                    }
1913
1914                    // Skip Setext headings inside HTML comments (using pre-computed ranges for efficiency)
1915                    if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset)
1916                    {
1917                        continue;
1918                    }
1919
1920                    let underline = next_line.trim();
1921
1922                    // Skip if the underline looks like YAML delimiter (exactly 3 or more dashes)
1923                    // YAML uses exactly `---` while Setext headings typically use longer underlines
1924                    if underline == "---" {
1925                        continue;
1926                    }
1927
1928                    // Skip if the current line looks like YAML key-value syntax
1929                    let current_line_trimmed = line.trim();
1930                    if current_line_trimmed.contains(':')
1931                        && !current_line_trimmed.starts_with('#')
1932                        && !current_line_trimmed.contains('[')
1933                        && !current_line_trimmed.contains("](")
1934                    {
1935                        // This looks like "key: value" which suggests YAML, not a heading
1936                        continue;
1937                    }
1938
1939                    let level = if underline.starts_with('=') { 1 } else { 2 };
1940                    let style = if level == 1 {
1941                        HeadingStyle::Setext1
1942                    } else {
1943                        HeadingStyle::Setext2
1944                    };
1945
1946                    // Extract custom header ID if present
1947                    let raw_text = line.trim().to_string();
1948                    let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1949
1950                    // If no custom ID was found on the header line, check the line after underline for standalone attr-list
1951                    if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
1952                        let attr_line = content_lines[i + 2];
1953                        if !lines[i + 2].in_code_block
1954                            && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
1955                            && let Some(attr_line_id) =
1956                                crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
1957                        {
1958                            custom_id = Some(attr_line_id);
1959                        }
1960                    }
1961
1962                    lines[i].heading = Some(HeadingInfo {
1963                        level,
1964                        style,
1965                        marker: underline.to_string(),
1966                        marker_column: next_line.len() - next_line.trim_start().len(),
1967                        content_column: lines[i].indent,
1968                        text: clean_text,
1969                        custom_id,
1970                        raw_text,
1971                        has_closing_sequence: false,
1972                        closing_sequence: String::new(),
1973                    });
1974                }
1975            }
1976        }
1977    }
1978
1979    /// Detect HTML blocks in the content
1980    fn detect_html_blocks(lines: &mut [LineInfo]) {
1981        // HTML block elements that trigger block context
1982        const BLOCK_ELEMENTS: &[&str] = &[
1983            "address",
1984            "article",
1985            "aside",
1986            "blockquote",
1987            "details",
1988            "dialog",
1989            "dd",
1990            "div",
1991            "dl",
1992            "dt",
1993            "fieldset",
1994            "figcaption",
1995            "figure",
1996            "footer",
1997            "form",
1998            "h1",
1999            "h2",
2000            "h3",
2001            "h4",
2002            "h5",
2003            "h6",
2004            "header",
2005            "hr",
2006            "li",
2007            "main",
2008            "nav",
2009            "ol",
2010            "p",
2011            "pre",
2012            "script",
2013            "section",
2014            "style",
2015            "table",
2016            "tbody",
2017            "td",
2018            "tfoot",
2019            "th",
2020            "thead",
2021            "tr",
2022            "ul",
2023        ];
2024
2025        let mut i = 0;
2026        while i < lines.len() {
2027            // Skip if already in code block or front matter
2028            if lines[i].in_code_block || lines[i].in_front_matter {
2029                i += 1;
2030                continue;
2031            }
2032
2033            let trimmed = lines[i].content.trim_start();
2034
2035            // Check if line starts with an HTML tag
2036            if trimmed.starts_with('<') && trimmed.len() > 1 {
2037                // Extract tag name safely
2038                let after_bracket = &trimmed[1..];
2039                let is_closing = after_bracket.starts_with('/');
2040                let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
2041
2042                // Extract tag name (stop at space, >, /, or end of string)
2043                let tag_name = tag_start
2044                    .chars()
2045                    .take_while(|c| c.is_ascii_alphabetic() || *c == '-')
2046                    .collect::<String>()
2047                    .to_lowercase();
2048
2049                // Check if it's a block element
2050                if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
2051                    // Mark this line as in HTML block
2052                    lines[i].in_html_block = true;
2053
2054                    // For simplicity, just mark lines until we find a closing tag or reach a blank line
2055                    // This avoids complex nesting logic that might cause infinite loops
2056                    if !is_closing {
2057                        let closing_tag = format!("</{tag_name}>");
2058                        // style and script tags can contain blank lines (CSS/JS formatting)
2059                        let allow_blank_lines = tag_name == "style" || tag_name == "script";
2060                        let mut j = i + 1;
2061                        while j < lines.len() && j < i + 100 {
2062                            // Limit search to 100 lines
2063                            // Stop at blank lines (except for style/script tags)
2064                            if !allow_blank_lines && lines[j].is_blank {
2065                                break;
2066                            }
2067
2068                            lines[j].in_html_block = true;
2069
2070                            // Check if this line contains the closing tag
2071                            if lines[j].content.contains(&closing_tag) {
2072                                break;
2073                            }
2074                            j += 1;
2075                        }
2076                    }
2077                }
2078            }
2079
2080            i += 1;
2081        }
2082    }
2083
2084    /// Detect ESM import/export blocks in MDX files
2085    /// ESM blocks consist of contiguous import/export statements at the top of the file
2086    fn detect_esm_blocks(lines: &mut [LineInfo], flavor: MarkdownFlavor) {
2087        // Only process MDX files
2088        if !flavor.supports_esm_blocks() {
2089            return;
2090        }
2091
2092        for line in lines.iter_mut() {
2093            // Skip blank lines and comments at the start
2094            if line.is_blank || line.in_html_comment {
2095                continue;
2096            }
2097
2098            // Check if line starts with import or export
2099            let trimmed = line.content.trim_start();
2100            if trimmed.starts_with("import ") || trimmed.starts_with("export ") {
2101                line.in_esm_block = true;
2102            } else {
2103                // Once we hit a non-ESM line, we're done with the ESM block
2104                break;
2105            }
2106        }
2107    }
2108
2109    /// Parse all inline code spans in the content using pulldown-cmark streaming parser
2110    fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
2111        let mut code_spans = Vec::new();
2112
2113        // Quick check - if no backticks, no code spans
2114        if !content.contains('`') {
2115            return code_spans;
2116        }
2117
2118        // Use pulldown-cmark's streaming parser with byte offsets
2119        let parser = Parser::new(content).into_offset_iter();
2120
2121        for (event, range) in parser {
2122            if let Event::Code(_) = event {
2123                let start_pos = range.start;
2124                let end_pos = range.end;
2125
2126                // The range includes the backticks, extract the actual content
2127                let full_span = &content[start_pos..end_pos];
2128                let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
2129
2130                // Extract content between backticks, preserving spaces
2131                let content_start = start_pos + backtick_count;
2132                let content_end = end_pos - backtick_count;
2133                let span_content = if content_start < content_end {
2134                    content[content_start..content_end].to_string()
2135                } else {
2136                    String::new()
2137                };
2138
2139                // Use binary search to find line number - O(log n) instead of O(n)
2140                // Find the rightmost line whose byte_offset <= start_pos
2141                let line_idx = lines
2142                    .partition_point(|line| line.byte_offset <= start_pos)
2143                    .saturating_sub(1);
2144                let line_num = line_idx + 1;
2145                let col_start = start_pos - lines[line_idx].byte_offset;
2146
2147                // Find end column using binary search
2148                let end_line_idx = lines
2149                    .partition_point(|line| line.byte_offset <= end_pos)
2150                    .saturating_sub(1);
2151                let col_end = end_pos - lines[end_line_idx].byte_offset;
2152
2153                code_spans.push(CodeSpan {
2154                    line: line_num,
2155                    start_col: col_start,
2156                    end_col: col_end,
2157                    byte_offset: start_pos,
2158                    byte_end: end_pos,
2159                    backtick_count,
2160                    content: span_content,
2161                });
2162            }
2163        }
2164
2165        // Sort by position to ensure consistent ordering
2166        code_spans.sort_by_key(|span| span.byte_offset);
2167
2168        code_spans
2169    }
2170
2171    /// Parse all list blocks in the content (legacy line-by-line approach)
2172    fn parse_list_blocks(lines: &[LineInfo]) -> Vec<ListBlock> {
2173        // Pre-size based on lines that could be list items
2174        let mut list_blocks = Vec::with_capacity(lines.len() / 10); // Estimate ~10% of lines might start list blocks
2175        let mut current_block: Option<ListBlock> = None;
2176        let mut last_list_item_line = 0;
2177        let mut current_indent_level = 0;
2178        let mut last_marker_width = 0;
2179
2180        for (line_idx, line_info) in lines.iter().enumerate() {
2181            let line_num = line_idx + 1;
2182
2183            // Enhanced code block handling using Design #3's context analysis
2184            if line_info.in_code_block {
2185                if let Some(ref mut block) = current_block {
2186                    // Calculate minimum indentation for list continuation
2187                    let min_continuation_indent = CodeBlockUtils::calculate_min_continuation_indent(lines, line_idx);
2188
2189                    // Analyze code block context using the three-tier classification
2190                    let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
2191
2192                    match context {
2193                        CodeBlockContext::Indented => {
2194                            // Code block is properly indented - continues the list
2195                            block.end_line = line_num;
2196                            continue;
2197                        }
2198                        CodeBlockContext::Standalone => {
2199                            // Code block separates lists - end current block
2200                            let completed_block = current_block.take().unwrap();
2201                            list_blocks.push(completed_block);
2202                            continue;
2203                        }
2204                        CodeBlockContext::Adjacent => {
2205                            // Edge case - use conservative behavior (continue list)
2206                            block.end_line = line_num;
2207                            continue;
2208                        }
2209                    }
2210                } else {
2211                    // No current list block - skip code block lines
2212                    continue;
2213                }
2214            }
2215
2216            // Extract blockquote prefix if any
2217            let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(&line_info.content) {
2218                caps.get(0).unwrap().as_str().to_string()
2219            } else {
2220                String::new()
2221            };
2222
2223            // Check if this line is a list item
2224            if let Some(list_item) = &line_info.list_item {
2225                // Calculate nesting level based on indentation
2226                let item_indent = list_item.marker_column;
2227                let nesting = item_indent / 2; // Assume 2-space indentation for nesting
2228
2229                if let Some(ref mut block) = current_block {
2230                    // Check if this continues the current block
2231                    // For nested lists, we need to check if this is a nested item (higher nesting level)
2232                    // or a continuation at the same or lower level
2233                    let is_nested = nesting > block.nesting_level;
2234                    let same_type =
2235                        (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
2236                    let same_context = block.blockquote_prefix == blockquote_prefix;
2237                    let reasonable_distance = line_num <= last_list_item_line + 2; // Allow one blank line
2238
2239                    // For unordered lists, also check marker consistency
2240                    let marker_compatible =
2241                        block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
2242
2243                    // Check if there's non-list content between the last item and this one
2244                    let has_non_list_content = {
2245                        let mut found_non_list = false;
2246                        // Use the last item from the current block, not the global last_list_item_line
2247                        let block_last_item_line = block.item_lines.last().copied().unwrap_or(block.end_line);
2248
2249                        // Debug: Special check for problematic line
2250                        if block_last_item_line > 0 && block_last_item_line <= lines.len() {
2251                            let last_line = &lines[block_last_item_line - 1];
2252                            if last_line.content.contains(r"`sqlalchemy`") && last_line.content.contains(r"\`") {
2253                                log::debug!(
2254                                    "After problematic line {}: checking lines {} to {} for non-list content",
2255                                    block_last_item_line,
2256                                    block_last_item_line + 1,
2257                                    line_num
2258                                );
2259                                // If they're consecutive list items, there's no content between
2260                                if line_num == block_last_item_line + 1 {
2261                                    log::debug!("Lines are consecutive, no content between");
2262                                }
2263                            }
2264                        }
2265
2266                        for check_line in (block_last_item_line + 1)..line_num {
2267                            let check_idx = check_line - 1;
2268                            if check_idx < lines.len() {
2269                                let check_info = &lines[check_idx];
2270                                // Check for content that breaks the list
2271                                let is_list_breaking_content = if check_info.in_code_block {
2272                                    // Use enhanced code block classification for list separation
2273                                    let last_item_marker_width =
2274                                        if block_last_item_line > 0 && block_last_item_line <= lines.len() {
2275                                            lines[block_last_item_line - 1]
2276                                                .list_item
2277                                                .as_ref()
2278                                                .map(|li| {
2279                                                    if li.is_ordered {
2280                                                        li.marker.len() + 1 // Add 1 for the space after ordered list markers
2281                                                    } else {
2282                                                        li.marker.len()
2283                                                    }
2284                                                })
2285                                                .unwrap_or(3) // fallback to 3 if no list item found
2286                                        } else {
2287                                            3 // fallback
2288                                        };
2289
2290                                    let min_continuation = if block.is_ordered { last_item_marker_width } else { 2 };
2291
2292                                    // Analyze code block context using our enhanced classification
2293                                    let context = CodeBlockUtils::analyze_code_block_context(
2294                                        lines,
2295                                        check_line - 1,
2296                                        min_continuation,
2297                                    );
2298
2299                                    // Standalone code blocks break lists, indented ones continue them
2300                                    matches!(context, CodeBlockContext::Standalone)
2301                                } else if !check_info.is_blank && check_info.list_item.is_none() {
2302                                    // Check for structural separators that should break lists (from issue #42)
2303                                    let line_content = check_info.content.trim();
2304
2305                                    // Any of these structural separators break lists
2306                                    if check_info.heading.is_some()
2307                                        || line_content.starts_with("---")
2308                                        || line_content.starts_with("***")
2309                                        || line_content.starts_with("___")
2310                                        || (line_content.contains('|')
2311                                            && !line_content.contains("](")
2312                                            && !line_content.contains("http")
2313                                            && (line_content.matches('|').count() > 1
2314                                                || line_content.starts_with('|')
2315                                                || line_content.ends_with('|')))
2316                                        || line_content.starts_with(">")
2317                                    {
2318                                        true
2319                                    }
2320                                    // Other non-list content - check if properly indented
2321                                    else {
2322                                        let last_item_marker_width =
2323                                            if block_last_item_line > 0 && block_last_item_line <= lines.len() {
2324                                                lines[block_last_item_line - 1]
2325                                                    .list_item
2326                                                    .as_ref()
2327                                                    .map(|li| {
2328                                                        if li.is_ordered {
2329                                                            li.marker.len() + 1 // Add 1 for the space after ordered list markers
2330                                                        } else {
2331                                                            li.marker.len()
2332                                                        }
2333                                                    })
2334                                                    .unwrap_or(3) // fallback to 3 if no list item found
2335                                            } else {
2336                                                3 // fallback
2337                                            };
2338
2339                                        let min_continuation =
2340                                            if block.is_ordered { last_item_marker_width } else { 2 };
2341                                        check_info.indent < min_continuation
2342                                    }
2343                                } else {
2344                                    false
2345                                };
2346
2347                                if is_list_breaking_content {
2348                                    // Not indented enough, so it breaks the list
2349                                    found_non_list = true;
2350                                    break;
2351                                }
2352                            }
2353                        }
2354                        found_non_list
2355                    };
2356
2357                    // A list continues if:
2358                    // 1. It's a nested item (indented more than the parent), OR
2359                    // 2. It's the same type at the same level with reasonable distance
2360                    let mut continues_list = if is_nested {
2361                        // Nested items always continue the list if they're in the same context
2362                        same_context && reasonable_distance && !has_non_list_content
2363                    } else {
2364                        // Same-level items need to match type and markers
2365                        let result = same_type
2366                            && same_context
2367                            && reasonable_distance
2368                            && marker_compatible
2369                            && !has_non_list_content;
2370
2371                        // Debug logging for lines after problematic content
2372                        if block.item_lines.last().is_some_and(|&last_line| {
2373                            last_line > 0
2374                                && last_line <= lines.len()
2375                                && lines[last_line - 1].content.contains(r"`sqlalchemy`")
2376                                && lines[last_line - 1].content.contains(r"\`")
2377                        }) {
2378                            log::debug!(
2379                                "List continuation check after problematic line at line {line_num}: same_type={same_type}, same_context={same_context}, reasonable_distance={reasonable_distance}, marker_compatible={marker_compatible}, has_non_list_content={has_non_list_content}, continues={result}"
2380                            );
2381                            if line_num > 0 && line_num <= lines.len() {
2382                                log::debug!("Current line content: {:?}", lines[line_num - 1].content);
2383                            }
2384                        }
2385
2386                        result
2387                    };
2388
2389                    // WORKAROUND: If items are truly consecutive (no blank lines), they MUST be in the same list
2390                    // This handles edge cases where content patterns might otherwise split lists incorrectly
2391                    if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
2392                        // Check if the previous line was a list item
2393                        if block.item_lines.contains(&(line_num - 1)) {
2394                            // They're consecutive list items - force them to be in the same list
2395                            continues_list = true;
2396                        }
2397                    }
2398
2399                    if continues_list {
2400                        // Extend current block
2401                        block.end_line = line_num;
2402                        block.item_lines.push(line_num);
2403
2404                        // Update max marker width
2405                        block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
2406                            list_item.marker.len() + 1
2407                        } else {
2408                            list_item.marker.len()
2409                        });
2410
2411                        // Update marker consistency for unordered lists
2412                        if !block.is_ordered
2413                            && block.marker.is_some()
2414                            && block.marker.as_ref() != Some(&list_item.marker)
2415                        {
2416                            // Mixed markers, clear the marker field
2417                            block.marker = None;
2418                        }
2419                    } else {
2420                        // End current block and start a new one
2421
2422                        list_blocks.push(block.clone());
2423
2424                        *block = ListBlock {
2425                            start_line: line_num,
2426                            end_line: line_num,
2427                            is_ordered: list_item.is_ordered,
2428                            marker: if list_item.is_ordered {
2429                                None
2430                            } else {
2431                                Some(list_item.marker.clone())
2432                            },
2433                            blockquote_prefix: blockquote_prefix.clone(),
2434                            item_lines: vec![line_num],
2435                            nesting_level: nesting,
2436                            max_marker_width: if list_item.is_ordered {
2437                                list_item.marker.len() + 1
2438                            } else {
2439                                list_item.marker.len()
2440                            },
2441                        };
2442                    }
2443                } else {
2444                    // Start a new block
2445                    current_block = Some(ListBlock {
2446                        start_line: line_num,
2447                        end_line: line_num,
2448                        is_ordered: list_item.is_ordered,
2449                        marker: if list_item.is_ordered {
2450                            None
2451                        } else {
2452                            Some(list_item.marker.clone())
2453                        },
2454                        blockquote_prefix,
2455                        item_lines: vec![line_num],
2456                        nesting_level: nesting,
2457                        max_marker_width: list_item.marker.len(),
2458                    });
2459                }
2460
2461                last_list_item_line = line_num;
2462                current_indent_level = item_indent;
2463                last_marker_width = if list_item.is_ordered {
2464                    list_item.marker.len() + 1 // Add 1 for the space after ordered list markers
2465                } else {
2466                    list_item.marker.len()
2467                };
2468            } else if let Some(ref mut block) = current_block {
2469                // Not a list item - check if it continues the current block
2470
2471                // For MD032 compatibility, we use a simple approach:
2472                // - Indented lines continue the list
2473                // - Blank lines followed by indented content continue the list
2474                // - Everything else ends the list
2475
2476                // Check if the last line in the list block ended with a backslash (hard line break)
2477                // This handles cases where list items use backslash for hard line breaks
2478                let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
2479                    lines[block.end_line - 1].content.trim_end().ends_with('\\')
2480                } else {
2481                    false
2482                };
2483
2484                // Calculate minimum indentation for list continuation
2485                // For ordered lists, use the last marker width (e.g., 3 for "1. ", 4 for "10. ")
2486                // For unordered lists like "- ", content starts at column 2, so continuations need at least 2 spaces
2487                let min_continuation_indent = if block.is_ordered {
2488                    current_indent_level + last_marker_width
2489                } else {
2490                    current_indent_level + 2 // Unordered lists need at least 2 spaces (e.g., "- " = 2 chars)
2491                };
2492
2493                if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
2494                    // Indented line or backslash continuation continues the list
2495                    block.end_line = line_num;
2496                } else if line_info.is_blank {
2497                    // Blank line - check if it's internal to the list or ending it
2498                    // We only include blank lines that are followed by more list content
2499                    let mut check_idx = line_idx + 1;
2500                    let mut found_continuation = false;
2501
2502                    // Skip additional blank lines
2503                    while check_idx < lines.len() && lines[check_idx].is_blank {
2504                        check_idx += 1;
2505                    }
2506
2507                    if check_idx < lines.len() {
2508                        let next_line = &lines[check_idx];
2509                        // Check if followed by indented content (list continuation)
2510                        if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
2511                            found_continuation = true;
2512                        }
2513                        // Check if followed by another list item at the same level
2514                        else if !next_line.in_code_block
2515                            && next_line.list_item.is_some()
2516                            && let Some(item) = &next_line.list_item
2517                        {
2518                            let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
2519                                .find(&next_line.content)
2520                                .map_or(String::new(), |m| m.as_str().to_string());
2521                            if item.marker_column == current_indent_level
2522                                && item.is_ordered == block.is_ordered
2523                                && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
2524                            {
2525                                // Check if there was meaningful content between the list items (unused now)
2526                                // This variable is kept for potential future use but is currently replaced by has_structural_separators
2527                                let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
2528                                    if let Some(between_line) = lines.get(idx) {
2529                                        let trimmed = between_line.content.trim();
2530                                        // Skip empty lines
2531                                        if trimmed.is_empty() {
2532                                            return false;
2533                                        }
2534                                        // Check for meaningful content
2535                                        let line_indent =
2536                                            between_line.content.len() - between_line.content.trim_start().len();
2537
2538                                        // Structural separators (code fences, headings, etc.) are meaningful and should BREAK lists
2539                                        if trimmed.starts_with("```")
2540                                            || trimmed.starts_with("~~~")
2541                                            || trimmed.starts_with("---")
2542                                            || trimmed.starts_with("***")
2543                                            || trimmed.starts_with("___")
2544                                            || trimmed.starts_with(">")
2545                                            || trimmed.contains('|') // Tables
2546                                            || between_line.heading.is_some()
2547                                        {
2548                                            return true; // These are structural separators - meaningful content that breaks lists
2549                                        }
2550
2551                                        // Only properly indented content continues the list
2552                                        line_indent >= min_continuation_indent
2553                                    } else {
2554                                        false
2555                                    }
2556                                });
2557
2558                                if block.is_ordered {
2559                                    // For ordered lists: don't continue if there are structural separators
2560                                    // Check if there are structural separators between the list items
2561                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2562                                        if let Some(between_line) = lines.get(idx) {
2563                                            let trimmed = between_line.content.trim();
2564                                            if trimmed.is_empty() {
2565                                                return false;
2566                                            }
2567                                            // Check for structural separators that break lists
2568                                            trimmed.starts_with("```")
2569                                                || trimmed.starts_with("~~~")
2570                                                || trimmed.starts_with("---")
2571                                                || trimmed.starts_with("***")
2572                                                || trimmed.starts_with("___")
2573                                                || trimmed.starts_with(">")
2574                                                || trimmed.contains('|') // Tables
2575                                                || between_line.heading.is_some()
2576                                        } else {
2577                                            false
2578                                        }
2579                                    });
2580                                    found_continuation = !has_structural_separators;
2581                                } else {
2582                                    // For unordered lists: also check for structural separators
2583                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2584                                        if let Some(between_line) = lines.get(idx) {
2585                                            let trimmed = between_line.content.trim();
2586                                            if trimmed.is_empty() {
2587                                                return false;
2588                                            }
2589                                            // Check for structural separators that break lists
2590                                            trimmed.starts_with("```")
2591                                                || trimmed.starts_with("~~~")
2592                                                || trimmed.starts_with("---")
2593                                                || trimmed.starts_with("***")
2594                                                || trimmed.starts_with("___")
2595                                                || trimmed.starts_with(">")
2596                                                || trimmed.contains('|') // Tables
2597                                                || between_line.heading.is_some()
2598                                        } else {
2599                                            false
2600                                        }
2601                                    });
2602                                    found_continuation = !has_structural_separators;
2603                                }
2604                            }
2605                        }
2606                    }
2607
2608                    if found_continuation {
2609                        // Include the blank line in the block
2610                        block.end_line = line_num;
2611                    } else {
2612                        // Blank line ends the list - don't include it
2613                        list_blocks.push(block.clone());
2614                        current_block = None;
2615                    }
2616                } else {
2617                    // Check for lazy continuation - non-indented line immediately after a list item
2618                    // But only if the line has sufficient indentation for the list type
2619                    let min_required_indent = if block.is_ordered {
2620                        current_indent_level + last_marker_width
2621                    } else {
2622                        current_indent_level + 2
2623                    };
2624
2625                    // For lazy continuation to apply, the line must either:
2626                    // 1. Have no indentation (true lazy continuation)
2627                    // 2. Have sufficient indentation for the list type
2628                    // BUT structural separators (headings, code blocks, etc.) should never be lazy continuations
2629                    let line_content = line_info.content.trim();
2630                    let is_structural_separator = line_info.heading.is_some()
2631                        || line_content.starts_with("```")
2632                        || line_content.starts_with("~~~")
2633                        || line_content.starts_with("---")
2634                        || line_content.starts_with("***")
2635                        || line_content.starts_with("___")
2636                        || line_content.starts_with(">")
2637                        || (line_content.contains('|')
2638                            && !line_content.contains("](")
2639                            && !line_content.contains("http")
2640                            && (line_content.matches('|').count() > 1
2641                                || line_content.starts_with('|')
2642                                || line_content.ends_with('|'))); // Tables
2643
2644                    // Allow lazy continuation if we're still within the same list block
2645                    // (not just immediately after a list item)
2646                    let is_lazy_continuation = !is_structural_separator
2647                        && !line_info.is_blank
2648                        && (line_info.indent == 0 || line_info.indent >= min_required_indent);
2649
2650                    if is_lazy_continuation {
2651                        // Additional check: if the line starts with uppercase and looks like a new sentence,
2652                        // it's probably not a continuation
2653                        let content_to_check = if !blockquote_prefix.is_empty() {
2654                            // Strip blockquote prefix to check the actual content
2655                            line_info
2656                                .content
2657                                .strip_prefix(&blockquote_prefix)
2658                                .unwrap_or(&line_info.content)
2659                                .trim()
2660                        } else {
2661                            line_info.content.trim()
2662                        };
2663
2664                        let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
2665
2666                        // If it starts with uppercase and the previous line ended with punctuation,
2667                        // it's likely a new paragraph, not a continuation
2668                        if starts_with_uppercase && last_list_item_line > 0 {
2669                            // This looks like a new paragraph
2670                            list_blocks.push(block.clone());
2671                            current_block = None;
2672                        } else {
2673                            // This is a lazy continuation line
2674                            block.end_line = line_num;
2675                        }
2676                    } else {
2677                        // Non-indented, non-blank line that's not a lazy continuation - end the block
2678                        list_blocks.push(block.clone());
2679                        current_block = None;
2680                    }
2681                }
2682            }
2683        }
2684
2685        // Don't forget the last block
2686        if let Some(block) = current_block {
2687            list_blocks.push(block);
2688        }
2689
2690        // Merge adjacent blocks that should be one
2691        merge_adjacent_list_blocks(&mut list_blocks, lines);
2692
2693        list_blocks
2694    }
2695
2696    /// Compute character frequency for fast content analysis
2697    fn compute_char_frequency(content: &str) -> CharFrequency {
2698        let mut frequency = CharFrequency::default();
2699
2700        for ch in content.chars() {
2701            match ch {
2702                '#' => frequency.hash_count += 1,
2703                '*' => frequency.asterisk_count += 1,
2704                '_' => frequency.underscore_count += 1,
2705                '-' => frequency.hyphen_count += 1,
2706                '+' => frequency.plus_count += 1,
2707                '>' => frequency.gt_count += 1,
2708                '|' => frequency.pipe_count += 1,
2709                '[' => frequency.bracket_count += 1,
2710                '`' => frequency.backtick_count += 1,
2711                '<' => frequency.lt_count += 1,
2712                '!' => frequency.exclamation_count += 1,
2713                '\n' => frequency.newline_count += 1,
2714                _ => {}
2715            }
2716        }
2717
2718        frequency
2719    }
2720
2721    /// Parse HTML tags in the content
2722    fn parse_html_tags(
2723        content: &str,
2724        lines: &[LineInfo],
2725        code_blocks: &[(usize, usize)],
2726        flavor: MarkdownFlavor,
2727    ) -> Vec<HtmlTag> {
2728        static HTML_TAG_REGEX: LazyLock<regex::Regex> =
2729            LazyLock::new(|| regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap());
2730
2731        let mut html_tags = Vec::with_capacity(content.matches('<').count());
2732
2733        for cap in HTML_TAG_REGEX.captures_iter(content) {
2734            let full_match = cap.get(0).unwrap();
2735            let match_start = full_match.start();
2736            let match_end = full_match.end();
2737
2738            // Skip if in code block
2739            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2740                continue;
2741            }
2742
2743            let is_closing = !cap.get(1).unwrap().as_str().is_empty();
2744            let tag_name_original = cap.get(2).unwrap().as_str();
2745            let tag_name = tag_name_original.to_lowercase();
2746            let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
2747
2748            // Skip JSX components in MDX files (tags starting with uppercase letter)
2749            // JSX components like <Chart />, <MyComponent> should not be treated as HTML
2750            if flavor.supports_jsx() && tag_name_original.chars().next().is_some_and(|c| c.is_uppercase()) {
2751                continue;
2752            }
2753
2754            // Find which line this tag is on
2755            let mut line_num = 1;
2756            let mut col_start = match_start;
2757            let mut col_end = match_end;
2758            for (idx, line_info) in lines.iter().enumerate() {
2759                if match_start >= line_info.byte_offset {
2760                    line_num = idx + 1;
2761                    col_start = match_start - line_info.byte_offset;
2762                    col_end = match_end - line_info.byte_offset;
2763                } else {
2764                    break;
2765                }
2766            }
2767
2768            html_tags.push(HtmlTag {
2769                line: line_num,
2770                start_col: col_start,
2771                end_col: col_end,
2772                byte_offset: match_start,
2773                byte_end: match_end,
2774                tag_name,
2775                is_closing,
2776                is_self_closing,
2777                raw_content: full_match.as_str().to_string(),
2778            });
2779        }
2780
2781        html_tags
2782    }
2783
2784    /// Parse emphasis spans in the content
2785    fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
2786        static EMPHASIS_REGEX: LazyLock<regex::Regex> =
2787            LazyLock::new(|| regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap());
2788
2789        let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
2790
2791        for cap in EMPHASIS_REGEX.captures_iter(content) {
2792            let full_match = cap.get(0).unwrap();
2793            let match_start = full_match.start();
2794            let match_end = full_match.end();
2795
2796            // Skip if in code block
2797            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2798                continue;
2799            }
2800
2801            let opening_markers = cap.get(1).unwrap().as_str();
2802            let content_part = cap.get(2).unwrap().as_str();
2803            let closing_markers = cap.get(3).unwrap().as_str();
2804
2805            // Validate matching markers
2806            if opening_markers.chars().next() != closing_markers.chars().next()
2807                || opening_markers.len() != closing_markers.len()
2808            {
2809                continue;
2810            }
2811
2812            let marker = opening_markers.chars().next().unwrap();
2813            let marker_count = opening_markers.len();
2814
2815            // Find which line this emphasis is on
2816            let mut line_num = 1;
2817            let mut col_start = match_start;
2818            let mut col_end = match_end;
2819            for (idx, line_info) in lines.iter().enumerate() {
2820                if match_start >= line_info.byte_offset {
2821                    line_num = idx + 1;
2822                    col_start = match_start - line_info.byte_offset;
2823                    col_end = match_end - line_info.byte_offset;
2824                } else {
2825                    break;
2826                }
2827            }
2828
2829            emphasis_spans.push(EmphasisSpan {
2830                line: line_num,
2831                start_col: col_start,
2832                end_col: col_end,
2833                byte_offset: match_start,
2834                byte_end: match_end,
2835                marker,
2836                marker_count,
2837                content: content_part.to_string(),
2838            });
2839        }
2840
2841        emphasis_spans
2842    }
2843
2844    /// Parse table rows in the content
2845    fn parse_table_rows(lines: &[LineInfo]) -> Vec<TableRow> {
2846        let mut table_rows = Vec::with_capacity(lines.len() / 20);
2847
2848        for (line_idx, line_info) in lines.iter().enumerate() {
2849            // Skip lines in code blocks or blank lines
2850            if line_info.in_code_block || line_info.is_blank {
2851                continue;
2852            }
2853
2854            let line = &line_info.content;
2855            let line_num = line_idx + 1;
2856
2857            // Check if this line contains pipes (potential table row)
2858            if !line.contains('|') {
2859                continue;
2860            }
2861
2862            // Count columns by splitting on pipes
2863            let parts: Vec<&str> = line.split('|').collect();
2864            let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
2865
2866            // Check if this is a separator row
2867            let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
2868            let mut column_alignments = Vec::new();
2869
2870            if is_separator {
2871                for part in &parts[1..parts.len() - 1] {
2872                    // Skip first and last empty parts
2873                    let trimmed = part.trim();
2874                    let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
2875                        "center".to_string()
2876                    } else if trimmed.ends_with(':') {
2877                        "right".to_string()
2878                    } else if trimmed.starts_with(':') {
2879                        "left".to_string()
2880                    } else {
2881                        "none".to_string()
2882                    };
2883                    column_alignments.push(alignment);
2884                }
2885            }
2886
2887            table_rows.push(TableRow {
2888                line: line_num,
2889                is_separator,
2890                column_count,
2891                column_alignments,
2892            });
2893        }
2894
2895        table_rows
2896    }
2897
2898    /// Parse bare URLs and emails in the content
2899    fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
2900        let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
2901
2902        // Check for bare URLs (not in angle brackets or markdown links)
2903        for cap in BARE_URL_PATTERN.captures_iter(content) {
2904            let full_match = cap.get(0).unwrap();
2905            let match_start = full_match.start();
2906            let match_end = full_match.end();
2907
2908            // Skip if in code block
2909            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2910                continue;
2911            }
2912
2913            // Skip if already in angle brackets or markdown links
2914            let preceding_char = if match_start > 0 {
2915                content.chars().nth(match_start - 1)
2916            } else {
2917                None
2918            };
2919            let following_char = content.chars().nth(match_end);
2920
2921            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2922                continue;
2923            }
2924            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2925                continue;
2926            }
2927
2928            let url = full_match.as_str();
2929            let url_type = if url.starts_with("https://") {
2930                "https"
2931            } else if url.starts_with("http://") {
2932                "http"
2933            } else if url.starts_with("ftp://") {
2934                "ftp"
2935            } else {
2936                "other"
2937            };
2938
2939            // Find which line this URL is on
2940            let mut line_num = 1;
2941            let mut col_start = match_start;
2942            let mut col_end = match_end;
2943            for (idx, line_info) in lines.iter().enumerate() {
2944                if match_start >= line_info.byte_offset {
2945                    line_num = idx + 1;
2946                    col_start = match_start - line_info.byte_offset;
2947                    col_end = match_end - line_info.byte_offset;
2948                } else {
2949                    break;
2950                }
2951            }
2952
2953            bare_urls.push(BareUrl {
2954                line: line_num,
2955                start_col: col_start,
2956                end_col: col_end,
2957                byte_offset: match_start,
2958                byte_end: match_end,
2959                url: url.to_string(),
2960                url_type: url_type.to_string(),
2961            });
2962        }
2963
2964        // Check for bare email addresses
2965        for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
2966            let full_match = cap.get(0).unwrap();
2967            let match_start = full_match.start();
2968            let match_end = full_match.end();
2969
2970            // Skip if in code block
2971            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2972                continue;
2973            }
2974
2975            // Skip if already in angle brackets or markdown links
2976            let preceding_char = if match_start > 0 {
2977                content.chars().nth(match_start - 1)
2978            } else {
2979                None
2980            };
2981            let following_char = content.chars().nth(match_end);
2982
2983            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2984                continue;
2985            }
2986            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2987                continue;
2988            }
2989
2990            let email = full_match.as_str();
2991
2992            // Find which line this email is on
2993            let mut line_num = 1;
2994            let mut col_start = match_start;
2995            let mut col_end = match_end;
2996            for (idx, line_info) in lines.iter().enumerate() {
2997                if match_start >= line_info.byte_offset {
2998                    line_num = idx + 1;
2999                    col_start = match_start - line_info.byte_offset;
3000                    col_end = match_end - line_info.byte_offset;
3001                } else {
3002                    break;
3003                }
3004            }
3005
3006            bare_urls.push(BareUrl {
3007                line: line_num,
3008                start_col: col_start,
3009                end_col: col_end,
3010                byte_offset: match_start,
3011                byte_end: match_end,
3012                url: email.to_string(),
3013                url_type: "email".to_string(),
3014            });
3015        }
3016
3017        bare_urls
3018    }
3019}
3020
3021/// Merge adjacent list blocks that should be treated as one
3022fn merge_adjacent_list_blocks(list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
3023    if list_blocks.len() < 2 {
3024        return;
3025    }
3026
3027    let mut merger = ListBlockMerger::new(lines);
3028    *list_blocks = merger.merge(list_blocks);
3029}
3030
/// Helper struct to manage the complex logic of merging list blocks.
///
/// It only borrows the document's per-line information; the blocks to merge are passed to
/// its methods.
struct ListBlockMerger<'a> {
    /// Per-line information for the document, looked up with `line_number - 1`
    /// when inspecting the lines that lie between two blocks.
    lines: &'a [LineInfo],
}
3035
3036impl<'a> ListBlockMerger<'a> {
3037    fn new(lines: &'a [LineInfo]) -> Self {
3038        Self { lines }
3039    }
3040
3041    fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
3042        let mut merged = Vec::with_capacity(list_blocks.len());
3043        let mut current = list_blocks[0].clone();
3044
3045        for next in list_blocks.iter().skip(1) {
3046            if self.should_merge_blocks(&current, next) {
3047                current = self.merge_two_blocks(current, next);
3048            } else {
3049                merged.push(current);
3050                current = next.clone();
3051            }
3052        }
3053
3054        merged.push(current);
3055        merged
3056    }
3057
3058    /// Determine if two adjacent list blocks should be merged
3059    fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
3060        // Basic compatibility checks
3061        if !self.blocks_are_compatible(current, next) {
3062            return false;
3063        }
3064
3065        // Check spacing and content between blocks
3066        let spacing = self.analyze_spacing_between(current, next);
3067        match spacing {
3068            BlockSpacing::Consecutive => true,
3069            BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
3070            BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
3071                self.can_merge_with_content_between(current, next)
3072            }
3073        }
3074    }
3075
3076    /// Check if blocks have compatible structure for merging
3077    fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
3078        current.is_ordered == next.is_ordered
3079            && current.blockquote_prefix == next.blockquote_prefix
3080            && current.nesting_level == next.nesting_level
3081    }
3082
3083    /// Analyze the spacing between two list blocks
3084    fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
3085        let gap = next.start_line - current.end_line;
3086
3087        match gap {
3088            1 => BlockSpacing::Consecutive,
3089            2 => BlockSpacing::SingleBlank,
3090            _ if gap > 2 => {
3091                if self.has_only_blank_lines_between(current, next) {
3092                    BlockSpacing::MultipleBlanks
3093                } else {
3094                    BlockSpacing::ContentBetween
3095                }
3096            }
3097            _ => BlockSpacing::Consecutive, // gap == 0, overlapping (shouldn't happen)
3098        }
3099    }
3100
3101    /// Check if unordered lists can be merged with a single blank line between
3102    fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3103        // Check if there are structural separators between the blocks
3104        // If has_meaningful_content_between returns true, it means there are structural separators
3105        if has_meaningful_content_between(current, next, self.lines) {
3106            return false; // Structural separators prevent merging
3107        }
3108
3109        // Only merge unordered lists with same marker across single blank
3110        !current.is_ordered && current.marker == next.marker
3111    }
3112
3113    /// Check if ordered lists can be merged when there's content between them
3114    fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3115        // Do not merge lists if there are structural separators between them
3116        if has_meaningful_content_between(current, next, self.lines) {
3117            return false; // Structural separators prevent merging
3118        }
3119
3120        // Only consider merging ordered lists if there's no structural content between
3121        current.is_ordered && next.is_ordered
3122    }
3123
3124    /// Check if there are only blank lines between blocks
3125    fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3126        for line_num in (current.end_line + 1)..next.start_line {
3127            if let Some(line_info) = self.lines.get(line_num - 1)
3128                && !line_info.content.trim().is_empty()
3129            {
3130                return false;
3131            }
3132        }
3133        true
3134    }
3135
3136    /// Merge two compatible list blocks into one
3137    fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
3138        current.end_line = next.end_line;
3139        current.item_lines.extend_from_slice(&next.item_lines);
3140
3141        // Update max marker width
3142        current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
3143
3144        // Handle marker consistency for unordered lists
3145        if !current.is_ordered && self.markers_differ(&current, next) {
3146            current.marker = None; // Mixed markers
3147        }
3148
3149        current
3150    }
3151
3152    /// Check if two blocks have different markers
3153    fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
3154        current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
3155    }
3156}
3157
/// Types of spacing between list blocks
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    /// No gap between blocks
    Consecutive,
    /// One blank line between blocks
    SingleBlank,
    /// Multiple blank lines but no content
    MultipleBlanks,
    /// Content exists between blocks
    ContentBetween,
}
3166
3167/// Check if there's meaningful content (not just blank lines) between two list blocks
3168fn has_meaningful_content_between(current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
3169    // Check lines between current.end_line and next.start_line
3170    for line_num in (current.end_line + 1)..next.start_line {
3171        if let Some(line_info) = lines.get(line_num - 1) {
3172            // Convert to 0-indexed
3173            let trimmed = line_info.content.trim();
3174
3175            // Skip empty lines
3176            if trimmed.is_empty() {
3177                continue;
3178            }
3179
3180            // Check for structural separators that should separate lists (CommonMark compliant)
3181
3182            // Headings separate lists
3183            if line_info.heading.is_some() {
3184                return true; // Has meaningful content - headings separate lists
3185            }
3186
3187            // Horizontal rules separate lists (---, ***, ___)
3188            if is_horizontal_rule(trimmed) {
3189                return true; // Has meaningful content - horizontal rules separate lists
3190            }
3191
3192            // Tables separate lists (lines containing | but not in URLs or code)
3193            // Simple heuristic: tables typically have | at start/end or multiple |
3194            if trimmed.contains('|') && trimmed.len() > 1 {
3195                // Don't treat URLs with | as tables
3196                if !trimmed.contains("](") && !trimmed.contains("http") {
3197                    // More robust check: tables usually have multiple | or | at edges
3198                    let pipe_count = trimmed.matches('|').count();
3199                    if pipe_count > 1 || trimmed.starts_with('|') || trimmed.ends_with('|') {
3200                        return true; // Has meaningful content - tables separate lists
3201                    }
3202                }
3203            }
3204
3205            // Blockquotes separate lists
3206            if trimmed.starts_with('>') {
3207                return true; // Has meaningful content - blockquotes separate lists
3208            }
3209
3210            // Code block fences separate lists (unless properly indented as list content)
3211            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
3212                let line_indent = line_info.content.len() - line_info.content.trim_start().len();
3213
3214                // Check if this code block is properly indented as list continuation
3215                let min_continuation_indent = if current.is_ordered {
3216                    current.nesting_level + current.max_marker_width + 1 // +1 for space after marker
3217                } else {
3218                    current.nesting_level + 2
3219                };
3220
3221                if line_indent < min_continuation_indent {
3222                    // This is a standalone code block that separates lists
3223                    return true; // Has meaningful content - standalone code blocks separate lists
3224                }
3225            }
3226
3227            // Check if this line has proper indentation for list continuation
3228            let line_indent = line_info.content.len() - line_info.content.trim_start().len();
3229
3230            // Calculate minimum indentation needed to be list continuation
3231            let min_indent = if current.is_ordered {
3232                current.nesting_level + current.max_marker_width
3233            } else {
3234                current.nesting_level + 2
3235            };
3236
3237            // If the line is not indented enough to be list continuation, it's meaningful content
3238            if line_indent < min_indent {
3239                return true; // Has meaningful content - content not indented as list continuation
3240            }
3241
3242            // If we reach here, the line is properly indented as list continuation
3243            // Continue checking other lines
3244        }
3245    }
3246
3247    // Only blank lines or properly indented list continuation content between blocks
3248    false
3249}
3250
/// Decide whether an already-trimmed line is a horizontal rule.
///
/// The line must start with `-`, `*` or `_`, contain that same character at least three
/// times, and contain nothing else apart from spaces and tabs.
fn is_horizontal_rule(trimmed: &str) -> bool {
    // Fewer than three bytes cannot hold three rule characters.
    if trimmed.len() < 3 {
        return false;
    }

    let rule_char = match trimmed.chars().next() {
        Some(c @ ('-' | '*' | '_')) => c,
        _ => return false,
    };

    let mut rule_chars_seen = 0usize;
    for ch in trimmed.chars() {
        match ch {
            c if c == rule_char => rule_chars_seen += 1,
            ' ' | '\t' => {}
            // Anything else, including a different rule character, disqualifies the line.
            _ => return false,
        }
    }

    rule_chars_seen >= 3
}
3274
/// Unit tests for `LintContext`: offset/line bookkeeping, per-line metadata, list item
/// detection and MDX ESM block detection.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::Standard);

        assert_eq!(ctx.content, "");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.lines.len(), 0);
    }

    #[test]
    fn test_single_line() {
        let ctx = LintContext::new("# Hello", MarkdownFlavor::Standard);

        assert_eq!(ctx.content, "# Hello");
        assert_eq!(ctx.line_offsets, vec![0]);
        for (offset, position) in [(0, (1, 1)), (3, (1, 4))] {
            assert_eq!(ctx.offset_to_line_col(offset), position, "offset {offset}");
        }
    }

    #[test]
    fn test_multi_line() {
        let ctx = LintContext::new("# Title\n\nSecond line\nThird line", MarkdownFlavor::Standard);
        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);

        // Test offset to line/col
        let expected = [
            (0, (1, 1)),  // start
            (8, (2, 1)),  // start of blank line
            (9, (3, 1)),  // start of 'Second line'
            (15, (3, 7)), // middle of 'Second line'
            (21, (4, 1)), // start of 'Third line'
        ];
        for (offset, position) in expected {
            assert_eq!(ctx.offset_to_line_col(offset), position, "offset {offset}");
        }
    }

    #[test]
    fn test_line_info() {
        let source = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let ctx = LintContext::new(source, MarkdownFlavor::Standard);

        assert_eq!(ctx.lines.len(), 7);

        // Line 1: "# Title"
        let title = &ctx.lines[0];
        assert_eq!(title.content, "# Title");
        assert_eq!(title.byte_offset, 0);
        assert_eq!(title.indent, 0);
        assert!(!title.is_blank);
        assert!(!title.in_code_block);
        assert!(title.list_item.is_none());

        // Line 2: "    indented"
        let indented = &ctx.lines[1];
        assert_eq!(indented.content, "    indented");
        assert_eq!(indented.byte_offset, 8);
        assert_eq!(indented.indent, 4);
        assert!(!indented.is_blank);

        // Line 3: "" (blank)
        let blank = &ctx.lines[2];
        assert_eq!(blank.content, "");
        assert!(blank.is_blank);

        // Helper methods take 1-indexed line numbers
        for (line_number, byte_offset, indent) in [(1, 0, 0), (2, 8, 4)] {
            assert_eq!(ctx.line_to_byte_offset(line_number), Some(byte_offset));
            assert_eq!(ctx.line_info(line_number).map(|l| l.indent), Some(indent));
        }
    }

    #[test]
    fn test_list_item_detection() {
        let source = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
        let ctx = LintContext::new(source, MarkdownFlavor::Standard);

        // Line 1: "- Unordered item"
        let dash_item = ctx.lines[0].list_item.as_ref().expect("line 1 should be a list item");
        assert_eq!(dash_item.marker, "-");
        assert!(!dash_item.is_ordered);
        assert_eq!(dash_item.marker_column, 0);
        assert_eq!(dash_item.content_column, 2);

        // Line 2: "  * Nested item"
        let star_item = ctx.lines[1].list_item.as_ref().expect("line 2 should be a list item");
        assert_eq!(star_item.marker, "*");
        assert_eq!(star_item.marker_column, 2);

        // Line 3: "1. Ordered item"
        let ordered_item = ctx.lines[2].list_item.as_ref().expect("line 3 should be a list item");
        assert_eq!(ordered_item.marker, "1.");
        assert!(ordered_item.is_ordered);
        assert_eq!(ordered_item.number, Some(1));

        // Line 6: "Not a list"
        assert!(ctx.lines[5].list_item.is_none());
    }

    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let ctx = LintContext::new("a\nb\nc", MarkdownFlavor::Standard);

        // line_offsets: [0, 2, 4]
        let expected = [
            (0, (1, 1)), // 'a'
            (1, (1, 2)), // after 'a'
            (2, (2, 1)), // 'b'
            (3, (2, 2)), // after 'b'
            (4, (3, 1)), // 'c'
            (5, (3, 2)), // after 'c'
        ];
        for (offset, position) in expected {
            assert_eq!(ctx.offset_to_line_col(offset), position, "offset {offset}");
        }
    }

    #[test]
    fn test_mdx_esm_blocks() {
        let content = r##"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall

In {year}, the snowfall was above average.
It was followed by a warm spring which caused
flood conditions in many of the nearby rivers.

<Chart color="#fcb32c" year={year} />
"##;

        let ctx = LintContext::new(content, MarkdownFlavor::MDX);
        assert_eq!(ctx.lines.len(), 10);

        // Only the leading import/export lines count as ESM blocks.
        let expectations = [
            (0, true, "Line 1 (import) should be in_esm_block"),
            (1, true, "Line 2 (export) should be in_esm_block"),
            (2, false, "Line 3 (blank) should NOT be in_esm_block"),
            (3, false, "Line 4 (heading) should NOT be in_esm_block"),
            (4, false, "Line 5 (blank) should NOT be in_esm_block"),
            (5, false, "Line 6 (text) should NOT be in_esm_block"),
        ];
        for (index, expected, message) in expectations {
            assert!(ctx.lines[index].in_esm_block == expected, "{message}");
        }
    }

    #[test]
    fn test_mdx_esm_blocks_not_detected_in_standard_flavor() {
        let content = r#"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall
"#;

        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        // ESM blocks should NOT be detected in Standard flavor
        let expectations = [
            (0, "Line 1 should NOT be in_esm_block in Standard flavor"),
            (1, "Line 2 should NOT be in_esm_block in Standard flavor"),
        ];
        for (index, message) in expectations {
            assert!(!ctx.lines[index].in_esm_block, "{message}");
        }
    }
}
rumdl_lib/lint_context.rs

rumdl_lib/
lint_context.rs