rumdl_lib/
lint_context.rs

1use crate::config::MarkdownFlavor;
2use crate::rules::front_matter_utils::FrontMatterUtils;
3use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
4use lazy_static::lazy_static;
5use pulldown_cmark::{Event, Parser};
6use regex::Regex;
7
lazy_static! {
    // Link pattern covering both inline links `[text](url "title")` and
    // reference links `[text][ref]`.
    // Flags: `s` makes `.` match newlines; `x` is verbose mode, which is why the
    // pattern can be laid out over several lines with `#` comments inside it.
    static ref LINK_PATTERN: Regex = Regex::new(
        r#"(?sx)
        \[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]          # Link text in group 1 (handles nested brackets)
        (?:
            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
            |
            \[([^\]]*)\]      # Reference ID in group 6
        )"#
    ).unwrap();

    // Image pattern: same shape and capture groups as LINK_PATTERN, but the
    // match must start with `!`.
    // Flags: `s` makes `.` match newlines; `x` is verbose mode.
    static ref IMAGE_PATTERN: Regex = Regex::new(
        r#"(?sx)
        !\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]         # Alt text in group 1 (handles nested brackets)
        (?:
            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
            |
            \[([^\]]*)\]      # Reference ID in group 6
        )"#
    ).unwrap();

    // Reference definition pattern `[id]: url "title"`, anchored per line (`m` flag).
    // Groups: 1 = id, 2 = url, 3 = double-quoted title, 4 = single-quoted title.
    // The id may be preceded by at most three spaces.
    static ref REF_DEF_PATTERN: Regex = Regex::new(
        r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#
    ).unwrap();

    // Backtick-run pattern: matches one or more consecutive backticks.
    // It has no capture groups and does not pair opening/closing runs itself.
    // NOTE(review): presumably the code-span parser pairs runs of equal length
    // using these matches - confirm against parse_code_spans.
    static ref CODE_SPAN_PATTERN: Regex = Regex::new(
        r"`+"
    ).unwrap();

    // Pattern for bare URLs with an http, https or ftp scheme (scheme in group 1)
    static ref BARE_URL_PATTERN: Regex = Regex::new(
        r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
    ).unwrap();

    // Pattern for email addresses (ASCII local part, domain with a 2+ letter TLD)
    static ref BARE_EMAIL_PATTERN: Regex = Regex::new(
        r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    ).unwrap();

    // Pattern for angle bracket links `<url>` / `<user@host.tld>`
    // (to exclude from bare URL detection); group 1 is the text between the brackets
    static ref ANGLE_BRACKET_PATTERN: Regex = Regex::new(
        r"<((?:https?|ftp)://[^>]+|[^@\s]+@[^@\s]+\.[^@\s>]+)>"
    ).unwrap();

    // Pattern for blockquote prefix in parse_list_blocks:
    // optional whitespace, one or more `>`, optional whitespace (all in group 1)
    static ref BLOCKQUOTE_PREFIX_REGEX: Regex = Regex::new(r"^(\s*>+\s*)").unwrap();
}
62
/// Pre-computed information about a single line of the document.
///
/// One entry exists per line; `LintContext::line_info` looks entries up by
/// 1-indexed line number.
#[derive(Debug, Clone)]
pub struct LineInfo {
    /// The actual line content (without newline)
    pub content: String,
    /// Byte offset where this line starts in the document
    pub byte_offset: usize,
    /// Number of leading spaces/tabs
    pub indent: usize,
    /// Whether the line is blank (empty or only whitespace)
    pub is_blank: bool,
    /// Whether this line is inside a code block
    pub in_code_block: bool,
    /// Whether this line is inside front matter
    pub in_front_matter: bool,
    /// Whether this line is inside an HTML block
    pub in_html_block: bool,
    /// Whether this line is inside an HTML comment
    pub in_html_comment: bool,
    /// List item information if this line starts a list item
    pub list_item: Option<ListItemInfo>,
    /// Heading information if this line is a heading
    pub heading: Option<HeadingInfo>,
    /// Blockquote information if this line is a blockquote
    pub blockquote: Option<BlockquoteInfo>,
    /// Whether this line is inside a mkdocstrings autodoc block
    pub in_mkdocstrings: bool,
    /// Whether this line is part of an ESM import/export block (MDX only)
    pub in_esm_block: bool,
}
93
/// Information about a list item, attached to the line on which the item starts
/// (see `LineInfo::list_item`).
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// The marker used (*, -, +, or number with . or ))
    pub marker: String,
    /// Whether it's ordered (true) or unordered (false)
    pub is_ordered: bool,
    /// The number for ordered lists
    // NOTE(review): presumably `None` for unordered items - confirm against the parser.
    pub number: Option<usize>,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where content after marker starts
    pub content_column: usize,
}
108
/// Heading style type.
///
/// Derives `Eq` in addition to `PartialEq`: equality on a fieldless enum is
/// total, and `Eq` lets the type be used wherever full equivalence is required.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum HeadingStyle {
    /// ATX style heading (# Heading)
    ATX,
    /// Setext style heading with = underline
    Setext1,
    /// Setext style heading with - underline
    Setext2,
}
119
/// Parsed link information, for both inline links `[text](url)` and
/// reference links `[text][ref]`.
#[derive(Debug, Clone)]
pub struct ParsedLink {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    // NOTE(review): presumably exclusive, like `CodeSpan::end_col` - confirm.
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Link text
    pub text: String,
    /// Link URL or reference
    pub url: String,
    /// Whether this is a reference link [text][ref] vs inline [text](url)
    pub is_reference: bool,
    /// Reference ID for reference links
    pub reference_id: Option<String>,
}
142
/// Parsed image information, for both inline images `![alt](url)` and
/// reference images `![alt][ref]`. Mirrors `ParsedLink` field for field.
#[derive(Debug, Clone)]
pub struct ParsedImage {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    // NOTE(review): presumably exclusive, like `CodeSpan::end_col` - confirm.
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Alt text
    pub alt_text: String,
    /// Image URL or reference
    pub url: String,
    /// Whether this is a reference image ![alt][ref] vs inline ![alt](url)
    pub is_reference: bool,
    /// Reference ID for reference images
    pub reference_id: Option<String>,
}
165
/// Reference definition `[ref]: url "title"`.
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// Line number (1-indexed)
    pub line: usize,
    /// Reference ID (normalized to lowercase); `LintContext::get_reference_url`
    /// lowercases its argument before comparing against this field
    pub id: String,
    /// URL
    pub url: String,
    /// Optional title
    pub title: Option<String>,
    /// Byte offset where the reference definition starts (inclusive)
    pub byte_offset: usize,
    /// Byte offset where the reference definition ends
    /// (exclusive, as tested by `LintContext::is_in_reference_def`)
    pub byte_end: usize,
}
182
/// Parsed inline code span information.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line; exclusive, as tested by
    /// `LintContext::is_in_code_span`
    pub end_col: usize,
    /// Byte offset in document (inclusive start)
    pub byte_offset: usize,
    /// End byte offset in document; exclusive, as tested by
    /// `LintContext::is_in_code_block_or_span`
    pub byte_end: usize,
    /// Number of backticks used (1, 2, 3, etc.)
    pub backtick_count: usize,
    /// Content inside the code span (without backticks)
    pub content: String,
}
201
/// Information about a heading, attached to the line that holds it
/// (see `LineInfo::heading`).
#[derive(Debug, Clone)]
pub struct HeadingInfo {
    /// Heading level (1-6 for ATX, 1-2 for Setext)
    pub level: u8,
    /// Style of heading
    pub style: HeadingStyle,
    /// The heading marker (# characters or underline)
    pub marker: String,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where heading text starts
    pub content_column: usize,
    /// The heading text (without markers and without custom ID syntax)
    pub text: String,
    /// Custom header ID if present (e.g., from {#custom-id} syntax)
    pub custom_id: Option<String>,
    /// Original heading text including custom ID syntax
    pub raw_text: String,
    /// Whether it has a closing sequence (for ATX)
    pub has_closing_sequence: bool,
    /// The closing sequence if present
    // NOTE(review): presumably empty when `has_closing_sequence` is false - confirm.
    pub closing_sequence: String,
}
226
/// Information about a blockquote line, attached to the line itself
/// (see `LineInfo::blockquote`).
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Nesting level (1 for >, 2 for >>, etc.)
    pub nesting_level: usize,
    /// The indentation before the blockquote marker
    pub indent: String,
    /// Column where the first > starts (0-based)
    pub marker_column: usize,
    /// The blockquote prefix (e.g., "> ", ">> ", etc.)
    pub prefix: String,
    /// Content after the blockquote marker(s)
    pub content: String,
    /// Whether the line has no space after the marker
    pub has_no_space_after_marker: bool,
    /// Whether the line has multiple spaces after the marker
    pub has_multiple_spaces_after_marker: bool,
    /// Whether this is an empty blockquote line needing MD028 fix
    pub needs_md028_fix: bool,
}
247
/// Information about a list block (a run of lines forming one list).
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// Line number where the list starts (1-indexed, inclusive)
    pub start_line: usize,
    /// Line number where the list ends (1-indexed, inclusive - see
    /// `LintContext::is_in_list_block`)
    pub end_line: usize,
    /// Whether it's ordered or unordered
    pub is_ordered: bool,
    /// The consistent marker for unordered lists (if any)
    pub marker: Option<String>,
    /// Blockquote prefix for this list (empty if not in blockquote)
    pub blockquote_prefix: String,
    /// Lines that are list items within this block
    // NOTE(review): presumably 1-indexed like `start_line` - confirm.
    pub item_lines: Vec<usize>,
    /// Nesting level (0 for top-level lists)
    pub nesting_level: usize,
    /// Maximum marker width seen in this block (e.g., 3 for "1. ", 4 for "10. ")
    pub max_marker_width: usize,
}
268
269use std::sync::{Arc, Mutex};
270
/// Character frequency data for fast content analysis.
///
/// These counters back `LintContext::has_char`, `LintContext::char_count` and
/// the `likely_has_*` heuristics. Characters without a counter here are
/// handled by scanning the content instead.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Count of # characters (headings)
    pub hash_count: usize,
    /// Count of * characters (emphasis, lists, horizontal rules)
    pub asterisk_count: usize,
    /// Count of _ characters (emphasis, horizontal rules)
    pub underscore_count: usize,
    /// Count of - characters (lists, horizontal rules, setext headings)
    pub hyphen_count: usize,
    /// Count of + characters (lists)
    pub plus_count: usize,
    /// Count of > characters (blockquotes)
    pub gt_count: usize,
    /// Count of | characters (tables)
    pub pipe_count: usize,
    /// Count of [ characters (links, images)
    pub bracket_count: usize,
    /// Count of ` characters (code spans, code blocks)
    pub backtick_count: usize,
    /// Count of < characters (HTML tags, autolinks)
    pub lt_count: usize,
    /// Count of ! characters (images)
    pub exclamation_count: usize,
    /// Count of newline characters
    pub newline_count: usize,
}
299
/// Pre-parsed HTML tag information, as returned by `LintContext::html_tags`.
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    // NOTE(review): presumably exclusive, like `CodeSpan::end_col` - confirm.
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Tag name (e.g., "div", "img", "br")
    pub tag_name: String,
    /// Whether it's a closing tag (`</tag>`)
    pub is_closing: bool,
    /// Whether it's self-closing (`<tag />`)
    pub is_self_closing: bool,
    /// Raw tag content
    pub raw_content: String,
}
322
/// Pre-parsed emphasis span information, as returned by
/// `LintContext::emphasis_spans`.
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    // NOTE(review): presumably exclusive, like `CodeSpan::end_col` - confirm.
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Type of emphasis ('*' or '_')
    pub marker: char,
    /// Number of markers (1 for italic, 2 for bold, 3+ for bold+italic)
    pub marker_count: usize,
    /// Content inside the emphasis
    pub content: String,
}
343
/// Pre-parsed table row information, as returned by `LintContext::table_rows`.
#[derive(Debug, Clone)]
pub struct TableRow {
    /// Line number (1-indexed)
    pub line: usize,
    /// Whether this is a separator row (contains only |, -, :, and spaces)
    pub is_separator: bool,
    /// Number of columns (pipe-separated cells)
    pub column_count: usize,
    /// Alignment info from separator row; each entry is one of
    /// "left", "center", "right", "none"
    // NOTE(review): presumably empty for non-separator rows - confirm.
    pub column_alignments: Vec<String>, // "left", "center", "right", "none"
}
356
/// Pre-parsed bare URL information (not in links), as returned by
/// `LintContext::bare_urls`.
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    // NOTE(review): presumably exclusive, like `CodeSpan::end_col` - confirm.
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// The URL string
    pub url: String,
    /// Type of URL ("http", "https", "ftp", "email")
    pub url_type: String,
}
375
/// Shared, pre-parsed view of one Markdown document.
///
/// `LintContext::new` computes most fields eagerly. The private `*_cache`
/// fields hold `Arc`-shared results behind a `Mutex` and are read through the
/// accessor methods of the same name (`code_spans()`, `html_tags()`, ...).
pub struct LintContext<'a> {
    /// The document text being linted
    pub content: &'a str,
    /// Byte offset at which each line starts; the first entry is always 0
    pub line_offsets: Vec<usize>,
    /// Cached code block ranges (not including inline code spans)
    pub code_blocks: Vec<(usize, usize)>,
    /// Pre-computed line information
    pub lines: Vec<LineInfo>,
    /// Pre-parsed links
    pub links: Vec<ParsedLink>,
    /// Pre-parsed images
    pub images: Vec<ParsedImage>,
    /// Reference definitions
    pub reference_defs: Vec<ReferenceDef>,
    /// Inline code spans; filled in by `new`, with a lazy fallback in `code_spans()`
    code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>,
    /// Pre-parsed list blocks
    pub list_blocks: Vec<ListBlock>,
    /// Character frequency analysis
    pub char_frequency: CharFrequency,
    /// Lazy-loaded HTML tags
    html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>,
    /// Lazy-loaded emphasis spans
    emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>,
    /// Lazy-loaded table rows
    table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>,
    /// Lazy-loaded bare URLs
    bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>,
    /// Pre-computed HTML comment ranges
    html_comment_ranges: Vec<crate::utils::skip_context::ByteRange>,
    /// Pre-computed table blocks
    pub table_blocks: Vec<crate::utils::table_utils::TableBlock>,
    /// Pre-computed line index for byte position calculations
    pub line_index: crate::utils::range_utils::LineIndex,
    /// Pre-computed Jinja template ranges ({{ }}, {% %}) as (start, end) byte offsets
    jinja_ranges: Vec<(usize, usize)>,
    /// Markdown flavor being used
    pub flavor: MarkdownFlavor,
}
397
/// Borrowed pieces of a blockquote line, in the order they occur in the text.
struct BlockquoteComponents<'a> {
    /// Spaces/tabs in front of the first `>`
    indent: &'a str,
    /// The contiguous run of `>` characters
    markers: &'a str,
    /// Spaces/tabs between the marker run and the content
    spaces_after: &'a str,
    /// Everything after `spaces_after`, up to the end of the line
    content: &'a str,
}

/// Splits `line` into indent, `>` markers, trailing whitespace and content.
///
/// Returns `None` when the first non-blank character is not `>` (this
/// includes empty and whitespace-only lines). Only a contiguous run of `>` is
/// treated as the marker: for `"> > x"` the markers are `">"` and the content
/// is `"> x"`.
#[inline]
fn parse_blockquote_detailed(line: &str) -> Option<BlockquoteComponents<'_>> {
    // Only spaces and tabs count as whitespace here
    const BLANKS: &[char] = &[' ', '\t'];

    // Peel each component off the front; the prefix length is the difference
    // between the string before and after trimming.
    let after_indent = line.trim_start_matches(BLANKS);
    let (indent, _) = line.split_at(line.len() - after_indent.len());

    let after_markers = after_indent.trim_start_matches('>');
    let (markers, _) = after_indent.split_at(after_indent.len() - after_markers.len());
    if markers.is_empty() {
        // Must have at least one '>' marker
        return None;
    }

    let content = after_markers.trim_start_matches(BLANKS);
    let (spaces_after, _) = after_markers.split_at(after_markers.len() - content.len());

    Some(BlockquoteComponents {
        indent,
        markers,
        spaces_after,
        content,
    })
}
442
443impl<'a> LintContext<'a> {
444    pub fn new(content: &'a str, flavor: MarkdownFlavor) -> Self {
445        use std::time::Instant;
446        let profile = std::env::var("RUMDL_PROFILE_QUADRATIC").is_ok();
447
448        let start = Instant::now();
449        let mut line_offsets = vec![0];
450        for (i, c) in content.char_indices() {
451            if c == '\n' {
452                line_offsets.push(i + 1);
453            }
454        }
455        if profile {
456            eprintln!("[PROFILE] Line offsets: {:?}", start.elapsed());
457        }
458
459        // Detect code blocks once and cache them
460        let start = Instant::now();
461        let code_blocks = CodeBlockUtils::detect_code_blocks(content);
462        if profile {
463            eprintln!("[PROFILE] Code blocks: {:?}", start.elapsed());
464        }
465
466        // Pre-compute HTML comment ranges ONCE for all operations
467        let start = Instant::now();
468        let html_comment_ranges = crate::utils::skip_context::compute_html_comment_ranges(content);
469        if profile {
470            eprintln!("[PROFILE] HTML comment ranges: {:?}", start.elapsed());
471        }
472
473        // Pre-compute autodoc block ranges for MkDocs flavor (avoids O(n²) scaling)
474        let start = Instant::now();
475        let autodoc_ranges = if flavor == MarkdownFlavor::MkDocs {
476            crate::utils::mkdocstrings_refs::detect_autodoc_block_ranges(content)
477        } else {
478            Vec::new()
479        };
480        if profile {
481            eprintln!("[PROFILE] Autodoc block ranges: {:?}", start.elapsed());
482        }
483
484        // Pre-compute line information (without headings/blockquotes yet)
485        let start = Instant::now();
486        let mut lines = Self::compute_basic_line_info(
487            content,
488            &line_offsets,
489            &code_blocks,
490            flavor,
491            &html_comment_ranges,
492            &autodoc_ranges,
493        );
494        if profile {
495            eprintln!("[PROFILE] Basic line info: {:?}", start.elapsed());
496        }
497
498        // Detect HTML blocks BEFORE heading detection
499        let start = Instant::now();
500        Self::detect_html_blocks(&mut lines);
501        if profile {
502            eprintln!("[PROFILE] HTML blocks: {:?}", start.elapsed());
503        }
504
505        // Detect ESM import/export blocks in MDX files BEFORE heading detection
506        let start = Instant::now();
507        Self::detect_esm_blocks(&mut lines, flavor);
508        if profile {
509            eprintln!("[PROFILE] ESM blocks: {:?}", start.elapsed());
510        }
511
512        // Now detect headings and blockquotes
513        let start = Instant::now();
514        Self::detect_headings_and_blockquotes(content, &mut lines, flavor, &html_comment_ranges);
515        if profile {
516            eprintln!("[PROFILE] Headings & blockquotes: {:?}", start.elapsed());
517        }
518
519        // Parse code spans early so we can exclude them from link/image parsing
520        let start = Instant::now();
521        let code_spans = Self::parse_code_spans(content, &lines);
522        if profile {
523            eprintln!("[PROFILE] Code spans: {:?}", start.elapsed());
524        }
525
526        // Parse links, images, references, and list blocks
527        let start = Instant::now();
528        let links = Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor, &html_comment_ranges);
529        if profile {
530            eprintln!("[PROFILE] Links: {:?}", start.elapsed());
531        }
532
533        let start = Instant::now();
534        let images = Self::parse_images(content, &lines, &code_blocks, &code_spans, &html_comment_ranges);
535        if profile {
536            eprintln!("[PROFILE] Images: {:?}", start.elapsed());
537        }
538
539        let start = Instant::now();
540        let reference_defs = Self::parse_reference_defs(content, &lines);
541        if profile {
542            eprintln!("[PROFILE] Reference defs: {:?}", start.elapsed());
543        }
544
545        let start = Instant::now();
546        let list_blocks = Self::parse_list_blocks(&lines);
547        if profile {
548            eprintln!("[PROFILE] List blocks: {:?}", start.elapsed());
549        }
550
551        // Compute character frequency for fast content analysis
552        let start = Instant::now();
553        let char_frequency = Self::compute_char_frequency(content);
554        if profile {
555            eprintln!("[PROFILE] Char frequency: {:?}", start.elapsed());
556        }
557
558        // Pre-compute table blocks for rules that need them (MD013, MD055, MD056, MD058)
559        let start = Instant::now();
560        let table_blocks =
561            crate::utils::table_utils::TableUtils::find_table_blocks_with_code_info(content, &code_blocks, &code_spans);
562        if profile {
563            eprintln!("[PROFILE] Table blocks: {:?}", start.elapsed());
564        }
565
566        // Pre-compute LineIndex once for all rules (eliminates 46x content cloning)
567        let start = Instant::now();
568        let line_index = crate::utils::range_utils::LineIndex::new(content.to_string());
569        if profile {
570            eprintln!("[PROFILE] Line index: {:?}", start.elapsed());
571        }
572
573        // Pre-compute Jinja template ranges once for all rules (eliminates O(n×m) in MD011)
574        let start = Instant::now();
575        let jinja_ranges = crate::utils::jinja_utils::find_jinja_ranges(content);
576        if profile {
577            eprintln!("[PROFILE] Jinja ranges: {:?}", start.elapsed());
578        }
579
580        Self {
581            content,
582            line_offsets,
583            code_blocks,
584            lines,
585            links,
586            images,
587            reference_defs,
588            code_spans_cache: Mutex::new(Some(Arc::new(code_spans))),
589            list_blocks,
590            char_frequency,
591            html_tags_cache: Mutex::new(None),
592            emphasis_spans_cache: Mutex::new(None),
593            table_rows_cache: Mutex::new(None),
594            bare_urls_cache: Mutex::new(None),
595            html_comment_ranges,
596            table_blocks,
597            line_index,
598            jinja_ranges,
599            flavor,
600        }
601    }
602
603    /// Get code spans - computed lazily on first access
604    pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
605        let mut cache = self.code_spans_cache.lock().unwrap();
606
607        // Check if we need to compute code spans
608        if cache.is_none() {
609            let code_spans = Self::parse_code_spans(self.content, &self.lines);
610            *cache = Some(Arc::new(code_spans));
611        }
612
613        // Return a reference to the cached code spans
614        cache.as_ref().unwrap().clone()
615    }
616
617    /// Get HTML tags - computed lazily on first access
618    pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
619        let mut cache = self.html_tags_cache.lock().unwrap();
620
621        if cache.is_none() {
622            let html_tags = Self::parse_html_tags(self.content, &self.lines, &self.code_blocks, self.flavor);
623            *cache = Some(Arc::new(html_tags));
624        }
625
626        cache.as_ref().unwrap().clone()
627    }
628
629    /// Get emphasis spans - computed lazily on first access
630    pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
631        let mut cache = self.emphasis_spans_cache.lock().unwrap();
632
633        if cache.is_none() {
634            let emphasis_spans = Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks);
635            *cache = Some(Arc::new(emphasis_spans));
636        }
637
638        cache.as_ref().unwrap().clone()
639    }
640
641    /// Get table rows - computed lazily on first access
642    pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
643        let mut cache = self.table_rows_cache.lock().unwrap();
644
645        if cache.is_none() {
646            let table_rows = Self::parse_table_rows(&self.lines);
647            *cache = Some(Arc::new(table_rows));
648        }
649
650        cache.as_ref().unwrap().clone()
651    }
652
653    /// Get bare URLs - computed lazily on first access
654    pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
655        let mut cache = self.bare_urls_cache.lock().unwrap();
656
657        if cache.is_none() {
658            let bare_urls = Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks);
659            *cache = Some(Arc::new(bare_urls));
660        }
661
662        cache.as_ref().unwrap().clone()
663    }
664
665    /// Map a byte offset to (line, column)
666    pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
667        match self.line_offsets.binary_search(&offset) {
668            Ok(line) => (line + 1, 1),
669            Err(line) => {
670                let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
671                (line, offset - line_start + 1)
672            }
673        }
674    }
675
676    /// Check if a position is within a code block or code span
677    pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
678        // Check code blocks first
679        if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
680            return true;
681        }
682
683        // Check inline code spans (lazy load if needed)
684        self.code_spans()
685            .iter()
686            .any(|span| pos >= span.byte_offset && pos < span.byte_end)
687    }
688
689    /// Get line information by line number (1-indexed)
690    pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
691        if line_num > 0 {
692            self.lines.get(line_num - 1)
693        } else {
694            None
695        }
696    }
697
698    /// Get byte offset for a line number (1-indexed)
699    pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
700        self.line_info(line_num).map(|info| info.byte_offset)
701    }
702
703    /// Get URL for a reference link/image by its ID
704    pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
705        let normalized_id = ref_id.to_lowercase();
706        self.reference_defs
707            .iter()
708            .find(|def| def.id == normalized_id)
709            .map(|def| def.url.as_str())
710    }
711
712    /// Get links on a specific line
713    pub fn links_on_line(&self, line_num: usize) -> Vec<&ParsedLink> {
714        self.links.iter().filter(|link| link.line == line_num).collect()
715    }
716
717    /// Get images on a specific line
718    pub fn images_on_line(&self, line_num: usize) -> Vec<&ParsedImage> {
719        self.images.iter().filter(|img| img.line == line_num).collect()
720    }
721
722    /// Check if a line is part of a list block
723    pub fn is_in_list_block(&self, line_num: usize) -> bool {
724        self.list_blocks
725            .iter()
726            .any(|block| line_num >= block.start_line && line_num <= block.end_line)
727    }
728
729    /// Get the list block containing a specific line
730    pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
731        self.list_blocks
732            .iter()
733            .find(|block| line_num >= block.start_line && line_num <= block.end_line)
734    }
735
736    // Compatibility methods for DocumentStructure migration
737
738    /// Check if a line is within a code block
739    pub fn is_in_code_block(&self, line_num: usize) -> bool {
740        if line_num == 0 || line_num > self.lines.len() {
741            return false;
742        }
743        self.lines[line_num - 1].in_code_block
744    }
745
746    /// Check if a line is within front matter
747    pub fn is_in_front_matter(&self, line_num: usize) -> bool {
748        if line_num == 0 || line_num > self.lines.len() {
749            return false;
750        }
751        self.lines[line_num - 1].in_front_matter
752    }
753
754    /// Check if a line is within an HTML block
755    pub fn is_in_html_block(&self, line_num: usize) -> bool {
756        if line_num == 0 || line_num > self.lines.len() {
757            return false;
758        }
759        self.lines[line_num - 1].in_html_block
760    }
761
762    /// Check if a line and column is within a code span
763    pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
764        if line_num == 0 || line_num > self.lines.len() {
765            return false;
766        }
767
768        // Use the code spans cache to check
769        // Note: col is 1-indexed from caller, but span.start_col and span.end_col are 0-indexed
770        // Convert col to 0-indexed for comparison
771        let col_0indexed = if col > 0 { col - 1 } else { 0 };
772        let code_spans = self.code_spans();
773        code_spans
774            .iter()
775            .any(|span| span.line == line_num && col_0indexed >= span.start_col && col_0indexed < span.end_col)
776    }
777
778    /// Check if a byte position is within a reference definition
779    /// This is much faster than scanning the content with regex for each check (O(1) vs O(n))
780    #[inline]
781    pub fn is_in_reference_def(&self, byte_pos: usize) -> bool {
782        self.reference_defs
783            .iter()
784            .any(|ref_def| byte_pos >= ref_def.byte_offset && byte_pos < ref_def.byte_end)
785    }
786
787    /// Check if a byte position is within an HTML comment
788    /// This is much faster than scanning the content with regex for each check (O(k) vs O(n))
789    /// where k is the number of HTML comments (typically very small)
790    #[inline]
791    pub fn is_in_html_comment(&self, byte_pos: usize) -> bool {
792        self.html_comment_ranges
793            .iter()
794            .any(|range| byte_pos >= range.start && byte_pos < range.end)
795    }
796
797    /// Check if a byte position is within a Jinja template ({{ }} or {% %})
798    pub fn is_in_jinja_range(&self, byte_pos: usize) -> bool {
799        self.jinja_ranges
800            .iter()
801            .any(|(start, end)| byte_pos >= *start && byte_pos < *end)
802    }
803
804    /// Check if content has any instances of a specific character (fast)
805    pub fn has_char(&self, ch: char) -> bool {
806        match ch {
807            '#' => self.char_frequency.hash_count > 0,
808            '*' => self.char_frequency.asterisk_count > 0,
809            '_' => self.char_frequency.underscore_count > 0,
810            '-' => self.char_frequency.hyphen_count > 0,
811            '+' => self.char_frequency.plus_count > 0,
812            '>' => self.char_frequency.gt_count > 0,
813            '|' => self.char_frequency.pipe_count > 0,
814            '[' => self.char_frequency.bracket_count > 0,
815            '`' => self.char_frequency.backtick_count > 0,
816            '<' => self.char_frequency.lt_count > 0,
817            '!' => self.char_frequency.exclamation_count > 0,
818            '\n' => self.char_frequency.newline_count > 0,
819            _ => self.content.contains(ch), // Fallback for other characters
820        }
821    }
822
823    /// Get count of a specific character (fast)
824    pub fn char_count(&self, ch: char) -> usize {
825        match ch {
826            '#' => self.char_frequency.hash_count,
827            '*' => self.char_frequency.asterisk_count,
828            '_' => self.char_frequency.underscore_count,
829            '-' => self.char_frequency.hyphen_count,
830            '+' => self.char_frequency.plus_count,
831            '>' => self.char_frequency.gt_count,
832            '|' => self.char_frequency.pipe_count,
833            '[' => self.char_frequency.bracket_count,
834            '`' => self.char_frequency.backtick_count,
835            '<' => self.char_frequency.lt_count,
836            '!' => self.char_frequency.exclamation_count,
837            '\n' => self.char_frequency.newline_count,
838            _ => self.content.matches(ch).count(), // Fallback for other characters
839        }
840    }
841
842    /// Check if content likely contains headings (fast)
843    pub fn likely_has_headings(&self) -> bool {
844        self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 // Potential setext underlines
845    }
846
847    /// Check if content likely contains lists (fast)
848    pub fn likely_has_lists(&self) -> bool {
849        self.char_frequency.asterisk_count > 0
850            || self.char_frequency.hyphen_count > 0
851            || self.char_frequency.plus_count > 0
852    }
853
854    /// Check if content likely contains emphasis (fast)
855    pub fn likely_has_emphasis(&self) -> bool {
856        self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
857    }
858
859    /// Check if content likely contains tables (fast)
860    pub fn likely_has_tables(&self) -> bool {
861        self.char_frequency.pipe_count > 2
862    }
863
864    /// Check if content likely contains blockquotes (fast)
865    pub fn likely_has_blockquotes(&self) -> bool {
866        self.char_frequency.gt_count > 0
867    }
868
869    /// Check if content likely contains code (fast)
870    pub fn likely_has_code(&self) -> bool {
871        self.char_frequency.backtick_count > 0
872    }
873
874    /// Check if content likely contains links or images (fast)
875    pub fn likely_has_links_or_images(&self) -> bool {
876        self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
877    }
878
879    /// Check if content likely contains HTML (fast)
880    pub fn likely_has_html(&self) -> bool {
881        self.char_frequency.lt_count > 0
882    }
883
884    /// Get HTML tags on a specific line
885    pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
886        self.html_tags()
887            .iter()
888            .filter(|tag| tag.line == line_num)
889            .cloned()
890            .collect()
891    }
892
893    /// Get emphasis spans on a specific line
894    pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
895        self.emphasis_spans()
896            .iter()
897            .filter(|span| span.line == line_num)
898            .cloned()
899            .collect()
900    }
901
902    /// Get table rows on a specific line
903    pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
904        self.table_rows()
905            .iter()
906            .filter(|row| row.line == line_num)
907            .cloned()
908            .collect()
909    }
910
911    /// Get bare URLs on a specific line
912    pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
913        self.bare_urls()
914            .iter()
915            .filter(|url| url.line == line_num)
916            .cloned()
917            .collect()
918    }
919
920    /// Find the line index for a given byte offset using binary search.
921    /// Returns (line_index, line_number, column) where:
922    /// - line_index is the 0-based index in the lines array
923    /// - line_number is the 1-based line number
924    /// - column is the byte offset within that line
925    #[inline]
926    fn find_line_for_offset(lines: &[LineInfo], byte_offset: usize) -> (usize, usize, usize) {
927        // Binary search to find the line containing this byte offset
928        let idx = match lines.binary_search_by(|line| {
929            if byte_offset < line.byte_offset {
930                std::cmp::Ordering::Greater
931            } else if byte_offset > line.byte_offset + line.content.len() {
932                std::cmp::Ordering::Less
933            } else {
934                std::cmp::Ordering::Equal
935            }
936        }) {
937            Ok(idx) => idx,
938            Err(idx) => idx.saturating_sub(1),
939        };
940
941        let line = &lines[idx];
942        let line_num = idx + 1;
943        let col = byte_offset.saturating_sub(line.byte_offset);
944
945        (idx, line_num, col)
946    }
947
948    /// Check if a byte offset is within a code span using binary search
949    #[inline]
950    fn is_offset_in_code_span(code_spans: &[CodeSpan], offset: usize) -> bool {
951        // Since spans are sorted by byte_offset, use partition_point for binary search
952        let idx = code_spans.partition_point(|span| span.byte_offset <= offset);
953
954        // Check the span that starts at or before our offset
955        if idx > 0 {
956            let span = &code_spans[idx - 1];
957            if offset >= span.byte_offset && offset < span.byte_end {
958                return true;
959            }
960        }
961
962        false
963    }
964
965    /// Parse all links in the content
966    fn parse_links(
967        content: &str,
968        lines: &[LineInfo],
969        code_blocks: &[(usize, usize)],
970        code_spans: &[CodeSpan],
971        flavor: MarkdownFlavor,
972        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
973    ) -> Vec<ParsedLink> {
974        use crate::utils::skip_context::{is_in_html_comment_ranges, is_mkdocs_snippet_line};
975
976        // Pre-size based on a heuristic: most markdown files have relatively few links
977        let mut links = Vec::with_capacity(content.len() / 500); // ~1 link per 500 chars
978
979        // Parse links across the entire content, not line by line
980        for cap in LINK_PATTERN.captures_iter(content) {
981            let full_match = cap.get(0).unwrap();
982            let match_start = full_match.start();
983            let match_end = full_match.end();
984
985            // Skip if the opening bracket is escaped (preceded by \)
986            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
987                continue;
988            }
989
990            // Skip if this is actually an image (preceded by !)
991            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
992                continue;
993            }
994
995            // Skip if in code block
996            if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
997                continue;
998            }
999
1000            // Skip if in code span
1001            if Self::is_offset_in_code_span(code_spans, match_start) {
1002                continue;
1003            }
1004
1005            // Skip if in HTML comment (using pre-computed ranges for efficiency)
1006            if is_in_html_comment_ranges(html_comment_ranges, match_start) {
1007                continue;
1008            }
1009
1010            // Use binary search to find the line this link is on
1011            let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1012
1013            // Skip if this link is on a MkDocs snippet line
1014            if is_mkdocs_snippet_line(&lines[line_idx].content, flavor) {
1015                continue;
1016            }
1017
1018            // Use binary search to find the end line
1019            let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1020
1021            let text = cap.get(1).map_or("", |m| m.as_str()).to_string();
1022
1023            // URL can be in group 2 (angle brackets) or group 3 (bare)
1024            let inline_url = cap.get(2).or_else(|| cap.get(3));
1025
1026            if let Some(url_match) = inline_url {
1027                // Inline link
1028                links.push(ParsedLink {
1029                    line: line_num,
1030                    start_col: col_start,
1031                    end_col: col_end,
1032                    byte_offset: match_start,
1033                    byte_end: match_end,
1034                    text,
1035                    url: url_match.as_str().to_string(),
1036                    is_reference: false,
1037                    reference_id: None,
1038                });
1039            } else if let Some(ref_id) = cap.get(6) {
1040                // Reference link
1041                let ref_id_str = ref_id.as_str();
1042                let normalized_ref = if ref_id_str.is_empty() {
1043                    text.to_lowercase() // Implicit reference
1044                } else {
1045                    ref_id_str.to_lowercase()
1046                };
1047
1048                links.push(ParsedLink {
1049                    line: line_num,
1050                    start_col: col_start,
1051                    end_col: col_end,
1052                    byte_offset: match_start,
1053                    byte_end: match_end,
1054                    text,
1055                    url: String::new(), // Will be resolved with reference_defs
1056                    is_reference: true,
1057                    reference_id: Some(normalized_ref),
1058                });
1059            }
1060        }
1061
1062        links
1063    }
1064
1065    /// Parse all images in the content
1066    fn parse_images(
1067        content: &str,
1068        lines: &[LineInfo],
1069        code_blocks: &[(usize, usize)],
1070        code_spans: &[CodeSpan],
1071        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1072    ) -> Vec<ParsedImage> {
1073        use crate::utils::skip_context::is_in_html_comment_ranges;
1074
1075        // Pre-size based on a heuristic: images are less common than links
1076        let mut images = Vec::with_capacity(content.len() / 1000); // ~1 image per 1000 chars
1077
1078        // Parse images across the entire content, not line by line
1079        for cap in IMAGE_PATTERN.captures_iter(content) {
1080            let full_match = cap.get(0).unwrap();
1081            let match_start = full_match.start();
1082            let match_end = full_match.end();
1083
1084            // Skip if the ! is escaped (preceded by \)
1085            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1086                continue;
1087            }
1088
1089            // Skip if in code block
1090            if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
1091                continue;
1092            }
1093
1094            // Skip if in code span
1095            if Self::is_offset_in_code_span(code_spans, match_start) {
1096                continue;
1097            }
1098
1099            // Skip if in HTML comment (using pre-computed ranges for efficiency)
1100            if is_in_html_comment_ranges(html_comment_ranges, match_start) {
1101                continue;
1102            }
1103
1104            // Use binary search to find the line this image is on
1105            let (_, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1106
1107            // Use binary search to find the end line
1108            let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1109
1110            let alt_text = cap.get(1).map_or("", |m| m.as_str()).to_string();
1111
1112            // URL can be in group 2 (angle brackets) or group 3 (bare)
1113            let inline_url = cap.get(2).or_else(|| cap.get(3));
1114
1115            if let Some(url_match) = inline_url {
1116                // Inline image
1117                images.push(ParsedImage {
1118                    line: line_num,
1119                    start_col: col_start,
1120                    end_col: col_end,
1121                    byte_offset: match_start,
1122                    byte_end: match_end,
1123                    alt_text,
1124                    url: url_match.as_str().to_string(),
1125                    is_reference: false,
1126                    reference_id: None,
1127                });
1128            } else if let Some(ref_id) = cap.get(6) {
1129                // Reference image
1130                let ref_id_str = ref_id.as_str();
1131                let normalized_ref = if ref_id_str.is_empty() {
1132                    alt_text.to_lowercase() // Implicit reference
1133                } else {
1134                    ref_id_str.to_lowercase()
1135                };
1136
1137                images.push(ParsedImage {
1138                    line: line_num,
1139                    start_col: col_start,
1140                    end_col: col_end,
1141                    byte_offset: match_start,
1142                    byte_end: match_end,
1143                    alt_text,
1144                    url: String::new(), // Will be resolved with reference_defs
1145                    is_reference: true,
1146                    reference_id: Some(normalized_ref),
1147                });
1148            }
1149        }
1150
1151        images
1152    }
1153
1154    /// Parse reference definitions
1155    fn parse_reference_defs(_content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
1156        // Pre-size based on lines count as reference definitions are line-based
1157        let mut refs = Vec::with_capacity(lines.len() / 20); // ~1 ref per 20 lines
1158
1159        for (line_idx, line_info) in lines.iter().enumerate() {
1160            // Skip lines in code blocks
1161            if line_info.in_code_block {
1162                continue;
1163            }
1164
1165            let line = &line_info.content;
1166            let line_num = line_idx + 1;
1167
1168            if let Some(cap) = REF_DEF_PATTERN.captures(line) {
1169                let id = cap.get(1).unwrap().as_str().to_lowercase();
1170                let url = cap.get(2).unwrap().as_str().to_string();
1171                let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
1172
1173                // Calculate byte positions
1174                // The match starts at the beginning of the line (0) and extends to the end
1175                let match_obj = cap.get(0).unwrap();
1176                let byte_offset = line_info.byte_offset + match_obj.start();
1177                let byte_end = line_info.byte_offset + match_obj.end();
1178
1179                refs.push(ReferenceDef {
1180                    line: line_num,
1181                    id,
1182                    url,
1183                    title,
1184                    byte_offset,
1185                    byte_end,
1186                });
1187            }
1188        }
1189
1190        refs
1191    }
1192
1193    /// Fast blockquote prefix parser - replaces regex for 5-10x speedup
1194    /// Matches: ^(\s*>\s*)(.*)
1195    /// Returns: Some((prefix_with_ws, content_after_prefix)) or None
1196    #[inline]
1197    fn parse_blockquote_prefix(line: &str) -> Option<(&str, &str)> {
1198        let trimmed_start = line.trim_start();
1199        if !trimmed_start.starts_with('>') {
1200            return None;
1201        }
1202
1203        let leading_ws_len = line.len() - trimmed_start.len();
1204        let after_gt = &trimmed_start[1..];
1205        let content = after_gt.trim_start();
1206        let ws_after_gt_len = after_gt.len() - content.len();
1207        let prefix_len = leading_ws_len + 1 + ws_after_gt_len;
1208
1209        Some((&line[..prefix_len], content))
1210    }
1211
1212    /// Fast unordered list parser - replaces regex for 5-10x speedup
1213    /// Matches: ^(\s*)([-*+])([ \t]*)(.*)
1214    /// Returns: Some((leading_ws, marker, spacing, content)) or None
1215    #[inline]
1216    fn parse_unordered_list(line: &str) -> Option<(&str, char, &str, &str)> {
1217        let bytes = line.as_bytes();
1218        let mut i = 0;
1219
1220        // Skip leading whitespace
1221        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1222            i += 1;
1223        }
1224
1225        // Check for marker
1226        if i >= bytes.len() {
1227            return None;
1228        }
1229        let marker = bytes[i] as char;
1230        if marker != '-' && marker != '*' && marker != '+' {
1231            return None;
1232        }
1233        let marker_pos = i;
1234        i += 1;
1235
1236        // Collect spacing after marker (space or tab only)
1237        let spacing_start = i;
1238        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1239            i += 1;
1240        }
1241
1242        Some((&line[..marker_pos], marker, &line[spacing_start..i], &line[i..]))
1243    }
1244
1245    /// Fast ordered list parser - replaces regex for 5-10x speedup
1246    /// Matches: ^(\s*)(\d+)([.)])([ \t]*)(.*)
1247    /// Returns: Some((leading_ws, number_str, delimiter, spacing, content)) or None
1248    #[inline]
1249    fn parse_ordered_list(line: &str) -> Option<(&str, &str, char, &str, &str)> {
1250        let bytes = line.as_bytes();
1251        let mut i = 0;
1252
1253        // Skip leading whitespace
1254        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1255            i += 1;
1256        }
1257
1258        // Collect digits
1259        let number_start = i;
1260        while i < bytes.len() && bytes[i].is_ascii_digit() {
1261            i += 1;
1262        }
1263        if i == number_start {
1264            return None; // No digits found
1265        }
1266
1267        // Check for delimiter
1268        if i >= bytes.len() {
1269            return None;
1270        }
1271        let delimiter = bytes[i] as char;
1272        if delimiter != '.' && delimiter != ')' {
1273            return None;
1274        }
1275        let delimiter_pos = i;
1276        i += 1;
1277
1278        // Collect spacing after delimiter (space or tab only)
1279        let spacing_start = i;
1280        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
1281            i += 1;
1282        }
1283
1284        Some((
1285            &line[..number_start],
1286            &line[number_start..delimiter_pos],
1287            delimiter,
1288            &line[spacing_start..i],
1289            &line[i..],
1290        ))
1291    }
1292
1293    /// Pre-compute which lines are in code blocks - O(m*n) where m=code_blocks, n=lines
1294    /// Returns a Vec<bool> where index i indicates if line i is in a code block
1295    fn compute_code_block_line_map(content: &str, line_offsets: &[usize], code_blocks: &[(usize, usize)]) -> Vec<bool> {
1296        let num_lines = line_offsets.len();
1297        let mut in_code_block = vec![false; num_lines];
1298
1299        // For each code block, mark all lines within it
1300        for &(start, end) in code_blocks {
1301            // Ensure we're at valid UTF-8 boundaries
1302            let safe_start = if start > 0 && !content.is_char_boundary(start) {
1303                let mut boundary = start;
1304                while boundary > 0 && !content.is_char_boundary(boundary) {
1305                    boundary -= 1;
1306                }
1307                boundary
1308            } else {
1309                start
1310            };
1311
1312            let safe_end = if end < content.len() && !content.is_char_boundary(end) {
1313                let mut boundary = end;
1314                while boundary < content.len() && !content.is_char_boundary(boundary) {
1315                    boundary += 1;
1316                }
1317                boundary
1318            } else {
1319                end.min(content.len())
1320            };
1321
1322            // Trust the code blocks detected by CodeBlockUtils::detect_code_blocks()
1323            // That function now has proper list context awareness (see code_block_utils.rs)
1324            // and correctly distinguishes between:
1325            // - Fenced code blocks (``` or ~~~)
1326            // - Indented code blocks at document level (4 spaces + blank line before)
1327            // - List continuation paragraphs (NOT code blocks, even with 4 spaces)
1328            //
1329            // We no longer need to re-validate here. The original validation logic
1330            // was causing false positives by marking list continuation paragraphs as
1331            // code blocks when they have 4 spaces of indentation.
1332
1333            // Use binary search to find the first and last line indices
1334            // line_offsets is sorted, so we can use partition_point for O(log n) lookup
1335            // Use safe_start/safe_end (UTF-8 boundaries) for consistent line mapping
1336            let first_line = line_offsets.partition_point(|&offset| offset < safe_start);
1337            let last_line = line_offsets.partition_point(|&offset| offset < safe_end);
1338
1339            // Mark all lines in the range at once
1340            for flag in in_code_block.iter_mut().take(last_line).skip(first_line) {
1341                *flag = true;
1342            }
1343        }
1344
1345        in_code_block
1346    }
1347
1348    /// Pre-compute basic line information (without headings/blockquotes)
1349    fn compute_basic_line_info(
1350        content: &str,
1351        line_offsets: &[usize],
1352        code_blocks: &[(usize, usize)],
1353        flavor: MarkdownFlavor,
1354        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1355        autodoc_ranges: &[crate::utils::skip_context::ByteRange],
1356    ) -> Vec<LineInfo> {
1357        let content_lines: Vec<&str> = content.lines().collect();
1358        let mut lines = Vec::with_capacity(content_lines.len());
1359
1360        // Pre-compute which lines are in code blocks
1361        let code_block_map = Self::compute_code_block_line_map(content, line_offsets, code_blocks);
1362
1363        // Detect front matter boundaries FIRST, before any other parsing
1364        // Use FrontMatterUtils to detect all types of front matter (YAML, TOML, JSON, malformed)
1365        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1366
1367        for (i, line) in content_lines.iter().enumerate() {
1368            let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
1369            let indent = line.len() - line.trim_start().len();
1370
1371            // Parse blockquote prefix once and reuse it (avoid redundant parsing)
1372            let blockquote_parse = Self::parse_blockquote_prefix(line);
1373
1374            // For blank detection, consider blockquote context
1375            let is_blank = if let Some((_, content)) = blockquote_parse {
1376                // In blockquote context, check if content after prefix is blank
1377                content.trim().is_empty()
1378            } else {
1379                line.trim().is_empty()
1380            };
1381
1382            // Use pre-computed map for O(1) lookup instead of O(m) iteration
1383            let in_code_block = code_block_map.get(i).copied().unwrap_or(false);
1384
1385            // Detect list items (skip if in frontmatter, in mkdocstrings block, or in HTML comment)
1386            let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
1387                && crate::utils::mkdocstrings_refs::is_within_autodoc_block_ranges(autodoc_ranges, byte_offset);
1388            // Use pre-computed ranges for efficiency (O(log n) vs O(file_size))
1389            let in_html_comment =
1390                crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, byte_offset);
1391            let list_item = if !(in_code_block
1392                || is_blank
1393                || in_mkdocstrings
1394                || in_html_comment
1395                || (front_matter_end > 0 && i < front_matter_end))
1396            {
1397                // Strip blockquote prefix if present for list detection (reuse cached result)
1398                let (line_for_list_check, blockquote_prefix_len) = if let Some((prefix, content)) = blockquote_parse {
1399                    (content, prefix.len())
1400                } else {
1401                    (&**line, 0)
1402                };
1403
1404                if let Some((leading_spaces, marker, spacing, _content)) =
1405                    Self::parse_unordered_list(line_for_list_check)
1406                {
1407                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1408                    let content_column = marker_column + 1 + spacing.len();
1409
1410                    // According to CommonMark spec, unordered list items MUST have at least one space
1411                    // after the marker (-, *, or +). Without a space, it's not a list item.
1412                    // This also naturally handles cases like:
1413                    // - *emphasis* (not a list)
1414                    // - **bold** (not a list)
1415                    // - --- (horizontal rule, not a list)
1416                    if spacing.is_empty() {
1417                        None
1418                    } else {
1419                        Some(ListItemInfo {
1420                            marker: marker.to_string(),
1421                            is_ordered: false,
1422                            number: None,
1423                            marker_column,
1424                            content_column,
1425                        })
1426                    }
1427                } else if let Some((leading_spaces, number_str, delimiter, spacing, _content)) =
1428                    Self::parse_ordered_list(line_for_list_check)
1429                {
1430                    let marker = format!("{number_str}{delimiter}");
1431                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1432                    let content_column = marker_column + marker.len() + spacing.len();
1433
1434                    // According to CommonMark spec, ordered list items MUST have at least one space
1435                    // after the marker (period or parenthesis). Without a space, it's not a list item.
1436                    if spacing.is_empty() {
1437                        None
1438                    } else {
1439                        Some(ListItemInfo {
1440                            marker,
1441                            is_ordered: true,
1442                            number: number_str.parse().ok(),
1443                            marker_column,
1444                            content_column,
1445                        })
1446                    }
1447                } else {
1448                    None
1449                }
1450            } else {
1451                None
1452            };
1453
1454            lines.push(LineInfo {
1455                content: line.to_string(),
1456                byte_offset,
1457                indent,
1458                is_blank,
1459                in_code_block,
1460                in_front_matter: front_matter_end > 0 && i < front_matter_end,
1461                in_html_block: false, // Will be populated after line creation
1462                in_html_comment,
1463                list_item,
1464                heading: None,    // Will be populated in second pass for Setext headings
1465                blockquote: None, // Will be populated after line creation
1466                in_mkdocstrings,
1467                in_esm_block: false, // Will be populated after line creation for MDX files
1468            });
1469        }
1470
1471        lines
1472    }
1473
1474    /// Detect headings and blockquotes (called after HTML block detection)
1475    fn detect_headings_and_blockquotes(
1476        content: &str,
1477        lines: &mut [LineInfo],
1478        flavor: MarkdownFlavor,
1479        html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1480    ) {
1481        lazy_static! {
1482
1483            // Regex for heading detection
1484            static ref ATX_HEADING_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap();
1485            static ref SETEXT_UNDERLINE_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap();
1486        }
1487
1488        let content_lines: Vec<&str> = content.lines().collect();
1489
1490        // Detect front matter boundaries to skip those lines
1491        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1492
1493        // Detect headings (including Setext which needs look-ahead) and blockquotes
1494        for i in 0..lines.len() {
1495            if lines[i].in_code_block {
1496                continue;
1497            }
1498
1499            // Skip lines in front matter
1500            if front_matter_end > 0 && i < front_matter_end {
1501                continue;
1502            }
1503
1504            // Skip lines in HTML blocks - HTML content should not be parsed as markdown
1505            if lines[i].in_html_block {
1506                continue;
1507            }
1508
1509            let line = content_lines[i];
1510
1511            // Check for blockquotes (even on blank lines within blockquotes)
1512            if let Some(bq) = parse_blockquote_detailed(line) {
1513                let nesting_level = bq.markers.len(); // Each '>' is one level
1514                let marker_column = bq.indent.len();
1515
1516                // Build the prefix (indentation + markers + space)
1517                let prefix = format!("{}{}{}", bq.indent, bq.markers, bq.spaces_after);
1518
1519                // Check for various blockquote issues
1520                let has_no_space = bq.spaces_after.is_empty() && !bq.content.is_empty();
1521                // Consider tabs as multiple spaces, or actual multiple spaces
1522                let has_multiple_spaces = bq.spaces_after.len() > 1 || bq.spaces_after.contains('\t');
1523
1524                // Check if needs MD028 fix (empty blockquote line without proper spacing)
1525                // MD028 flags empty blockquote lines that don't have a single space after the marker
1526                // Lines like "> " or ">> " are already correct and don't need fixing
1527                let needs_md028_fix = bq.content.is_empty() && bq.spaces_after.is_empty();
1528
1529                lines[i].blockquote = Some(BlockquoteInfo {
1530                    nesting_level,
1531                    indent: bq.indent.to_string(),
1532                    marker_column,
1533                    prefix,
1534                    content: bq.content.to_string(),
1535                    has_no_space_after_marker: has_no_space,
1536                    has_multiple_spaces_after_marker: has_multiple_spaces,
1537                    needs_md028_fix,
1538                });
1539            }
1540
1541            // Skip heading detection for blank lines
1542            if lines[i].is_blank {
1543                continue;
1544            }
1545
1546            // Check for ATX headings (but skip MkDocs snippet lines)
1547            // In MkDocs flavor, lines like "# -8<- [start:name]" are snippet markers, not headings
1548            let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
1549                crate::utils::mkdocs_snippets::is_snippet_section_start(line)
1550                    || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
1551            } else {
1552                false
1553            };
1554
1555            if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1556                // Skip headings inside HTML comments (using pre-computed ranges for efficiency)
1557                if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset) {
1558                    continue;
1559                }
1560                let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1561                let hashes = caps.get(2).map_or("", |m| m.as_str());
1562                let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1563                let rest = caps.get(4).map_or("", |m| m.as_str());
1564
1565                let level = hashes.len() as u8;
1566                let marker_column = leading_spaces.len();
1567
1568                // Check for closing sequence, but handle custom IDs that might come after
1569                let (text, has_closing, closing_seq) = {
1570                    // First check if there's a custom ID at the end
1571                    let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1572                        // Check if this looks like a valid custom ID (ends with })
1573                        if rest[id_start..].trim_end().ends_with('}') {
1574                            // Split off the custom ID
1575                            (&rest[..id_start], &rest[id_start..])
1576                        } else {
1577                            (rest, "")
1578                        }
1579                    } else {
1580                        (rest, "")
1581                    };
1582
1583                    // Now look for closing hashes in the part before the custom ID
1584                    let trimmed_rest = rest_without_id.trim_end();
1585                    if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1586                        // Look for the start of the hash sequence
1587                        let mut start_of_hashes = last_hash_pos;
1588                        while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1589                            start_of_hashes -= 1;
1590                        }
1591
1592                        // Check if there's at least one space before the closing hashes
1593                        let has_space_before = start_of_hashes == 0
1594                            || trimmed_rest
1595                                .chars()
1596                                .nth(start_of_hashes - 1)
1597                                .is_some_and(|c| c.is_whitespace());
1598
1599                        // Check if this is a valid closing sequence (all hashes to end of trimmed part)
1600                        let potential_closing = &trimmed_rest[start_of_hashes..];
1601                        let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1602
1603                        if is_all_hashes && has_space_before {
1604                            // This is a closing sequence
1605                            let closing_hashes = potential_closing.to_string();
1606                            // The text is everything before the closing hashes
1607                            // Don't include the custom ID here - it will be extracted later
1608                            let text_part = if !custom_id_part.is_empty() {
1609                                // If we have a custom ID, append it back to get the full rest
1610                                // This allows the extract_header_id function to handle it properly
1611                                format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1612                            } else {
1613                                rest_without_id[..start_of_hashes].trim_end().to_string()
1614                            };
1615                            (text_part, true, closing_hashes)
1616                        } else {
1617                            // Not a valid closing sequence, return the full content
1618                            (rest.to_string(), false, String::new())
1619                        }
1620                    } else {
1621                        // No hashes found, return the full content
1622                        (rest.to_string(), false, String::new())
1623                    }
1624                };
1625
1626                let content_column = marker_column + hashes.len() + spaces_after.len();
1627
1628                // Extract custom header ID if present
1629                let raw_text = text.trim().to_string();
1630                let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1631
1632                // If no custom ID was found on the header line, check the next line for standalone attr-list
1633                if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1634                    let next_line = content_lines[i + 1];
1635                    if !lines[i + 1].in_code_block
1636                        && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1637                        && let Some(next_line_id) =
1638                            crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1639                    {
1640                        custom_id = Some(next_line_id);
1641                    }
1642                }
1643
1644                lines[i].heading = Some(HeadingInfo {
1645                    level,
1646                    style: HeadingStyle::ATX,
1647                    marker: hashes.to_string(),
1648                    marker_column,
1649                    content_column,
1650                    text: clean_text,
1651                    custom_id,
1652                    raw_text,
1653                    has_closing_sequence: has_closing,
1654                    closing_sequence: closing_seq,
1655                });
1656            }
1657            // Check for Setext headings (need to look at next line)
1658            else if i + 1 < content_lines.len() && i + 1 < lines.len() {
1659                let next_line = content_lines[i + 1];
1660                if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
1661                    // Skip if next line is front matter delimiter
1662                    if front_matter_end > 0 && i < front_matter_end {
1663                        continue;
1664                    }
1665
1666                    // Skip Setext headings inside HTML comments (using pre-computed ranges for efficiency)
1667                    if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset)
1668                    {
1669                        continue;
1670                    }
1671
1672                    let underline = next_line.trim();
1673
1674                    // Skip if the underline looks like YAML delimiter (exactly 3 or more dashes)
1675                    // YAML uses exactly `---` while Setext headings typically use longer underlines
1676                    if underline == "---" {
1677                        continue;
1678                    }
1679
1680                    // Skip if the current line looks like YAML key-value syntax
1681                    let current_line_trimmed = line.trim();
1682                    if current_line_trimmed.contains(':')
1683                        && !current_line_trimmed.starts_with('#')
1684                        && !current_line_trimmed.contains('[')
1685                        && !current_line_trimmed.contains("](")
1686                    {
1687                        // This looks like "key: value" which suggests YAML, not a heading
1688                        continue;
1689                    }
1690
1691                    let level = if underline.starts_with('=') { 1 } else { 2 };
1692                    let style = if level == 1 {
1693                        HeadingStyle::Setext1
1694                    } else {
1695                        HeadingStyle::Setext2
1696                    };
1697
1698                    // Extract custom header ID if present
1699                    let raw_text = line.trim().to_string();
1700                    let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1701
1702                    // If no custom ID was found on the header line, check the line after underline for standalone attr-list
1703                    if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
1704                        let attr_line = content_lines[i + 2];
1705                        if !lines[i + 2].in_code_block
1706                            && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
1707                            && let Some(attr_line_id) =
1708                                crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
1709                        {
1710                            custom_id = Some(attr_line_id);
1711                        }
1712                    }
1713
1714                    lines[i].heading = Some(HeadingInfo {
1715                        level,
1716                        style,
1717                        marker: underline.to_string(),
1718                        marker_column: next_line.len() - next_line.trim_start().len(),
1719                        content_column: lines[i].indent,
1720                        text: clean_text,
1721                        custom_id,
1722                        raw_text,
1723                        has_closing_sequence: false,
1724                        closing_sequence: String::new(),
1725                    });
1726                }
1727            }
1728        }
1729    }
1730
1731    /// Detect HTML blocks in the content
1732    fn detect_html_blocks(lines: &mut [LineInfo]) {
1733        // HTML block elements that trigger block context
1734        const BLOCK_ELEMENTS: &[&str] = &[
1735            "address",
1736            "article",
1737            "aside",
1738            "blockquote",
1739            "details",
1740            "dialog",
1741            "dd",
1742            "div",
1743            "dl",
1744            "dt",
1745            "fieldset",
1746            "figcaption",
1747            "figure",
1748            "footer",
1749            "form",
1750            "h1",
1751            "h2",
1752            "h3",
1753            "h4",
1754            "h5",
1755            "h6",
1756            "header",
1757            "hr",
1758            "li",
1759            "main",
1760            "nav",
1761            "ol",
1762            "p",
1763            "pre",
1764            "script",
1765            "section",
1766            "style",
1767            "table",
1768            "tbody",
1769            "td",
1770            "tfoot",
1771            "th",
1772            "thead",
1773            "tr",
1774            "ul",
1775        ];
1776
1777        let mut i = 0;
1778        while i < lines.len() {
1779            // Skip if already in code block or front matter
1780            if lines[i].in_code_block || lines[i].in_front_matter {
1781                i += 1;
1782                continue;
1783            }
1784
1785            let trimmed = lines[i].content.trim_start();
1786
1787            // Check if line starts with an HTML tag
1788            if trimmed.starts_with('<') && trimmed.len() > 1 {
1789                // Extract tag name safely
1790                let after_bracket = &trimmed[1..];
1791                let is_closing = after_bracket.starts_with('/');
1792                let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
1793
1794                // Extract tag name (stop at space, >, /, or end of string)
1795                let tag_name = tag_start
1796                    .chars()
1797                    .take_while(|c| c.is_ascii_alphabetic() || *c == '-')
1798                    .collect::<String>()
1799                    .to_lowercase();
1800
1801                // Check if it's a block element
1802                if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
1803                    // Mark this line as in HTML block
1804                    lines[i].in_html_block = true;
1805
1806                    // For simplicity, just mark lines until we find a closing tag or reach a blank line
1807                    // This avoids complex nesting logic that might cause infinite loops
1808                    if !is_closing {
1809                        let closing_tag = format!("</{tag_name}>");
1810                        // style and script tags can contain blank lines (CSS/JS formatting)
1811                        let allow_blank_lines = tag_name == "style" || tag_name == "script";
1812                        let mut j = i + 1;
1813                        while j < lines.len() && j < i + 100 {
1814                            // Limit search to 100 lines
1815                            // Stop at blank lines (except for style/script tags)
1816                            if !allow_blank_lines && lines[j].is_blank {
1817                                break;
1818                            }
1819
1820                            lines[j].in_html_block = true;
1821
1822                            // Check if this line contains the closing tag
1823                            if lines[j].content.contains(&closing_tag) {
1824                                break;
1825                            }
1826                            j += 1;
1827                        }
1828                    }
1829                }
1830            }
1831
1832            i += 1;
1833        }
1834    }
1835
1836    /// Detect ESM import/export blocks in MDX files
1837    /// ESM blocks consist of contiguous import/export statements at the top of the file
1838    fn detect_esm_blocks(lines: &mut [LineInfo], flavor: MarkdownFlavor) {
1839        // Only process MDX files
1840        if !flavor.supports_esm_blocks() {
1841            return;
1842        }
1843
1844        for line in lines.iter_mut() {
1845            // Skip blank lines and comments at the start
1846            if line.is_blank || line.in_html_comment {
1847                continue;
1848            }
1849
1850            // Check if line starts with import or export
1851            let trimmed = line.content.trim_start();
1852            if trimmed.starts_with("import ") || trimmed.starts_with("export ") {
1853                line.in_esm_block = true;
1854            } else {
1855                // Once we hit a non-ESM line, we're done with the ESM block
1856                break;
1857            }
1858        }
1859    }
1860
1861    /// Parse all inline code spans in the content using pulldown-cmark streaming parser
1862    fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
1863        let mut code_spans = Vec::new();
1864
1865        // Quick check - if no backticks, no code spans
1866        if !content.contains('`') {
1867            return code_spans;
1868        }
1869
1870        // Use pulldown-cmark's streaming parser with byte offsets
1871        let parser = Parser::new(content).into_offset_iter();
1872
1873        for (event, range) in parser {
1874            if let Event::Code(_) = event {
1875                let start_pos = range.start;
1876                let end_pos = range.end;
1877
1878                // The range includes the backticks, extract the actual content
1879                let full_span = &content[start_pos..end_pos];
1880                let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
1881
1882                // Extract content between backticks, preserving spaces
1883                let content_start = start_pos + backtick_count;
1884                let content_end = end_pos - backtick_count;
1885                let span_content = if content_start < content_end {
1886                    content[content_start..content_end].to_string()
1887                } else {
1888                    String::new()
1889                };
1890
1891                // Use binary search to find line number - O(log n) instead of O(n)
1892                // Find the rightmost line whose byte_offset <= start_pos
1893                let line_idx = lines
1894                    .partition_point(|line| line.byte_offset <= start_pos)
1895                    .saturating_sub(1);
1896                let line_num = line_idx + 1;
1897                let col_start = start_pos - lines[line_idx].byte_offset;
1898
1899                // Find end column using binary search
1900                let end_line_idx = lines
1901                    .partition_point(|line| line.byte_offset <= end_pos)
1902                    .saturating_sub(1);
1903                let col_end = end_pos - lines[end_line_idx].byte_offset;
1904
1905                code_spans.push(CodeSpan {
1906                    line: line_num,
1907                    start_col: col_start,
1908                    end_col: col_end,
1909                    byte_offset: start_pos,
1910                    byte_end: end_pos,
1911                    backtick_count,
1912                    content: span_content,
1913                });
1914            }
1915        }
1916
1917        // Sort by position to ensure consistent ordering
1918        code_spans.sort_by_key(|span| span.byte_offset);
1919
1920        code_spans
1921    }
1922
1923    /// Parse all list blocks in the content (legacy line-by-line approach)
1924    fn parse_list_blocks(lines: &[LineInfo]) -> Vec<ListBlock> {
1925        // Pre-size based on lines that could be list items
1926        let mut list_blocks = Vec::with_capacity(lines.len() / 10); // Estimate ~10% of lines might start list blocks
1927        let mut current_block: Option<ListBlock> = None;
1928        let mut last_list_item_line = 0;
1929        let mut current_indent_level = 0;
1930        let mut last_marker_width = 0;
1931
1932        for (line_idx, line_info) in lines.iter().enumerate() {
1933            let line_num = line_idx + 1;
1934
1935            // Enhanced code block handling using Design #3's context analysis
1936            if line_info.in_code_block {
1937                if let Some(ref mut block) = current_block {
1938                    // Calculate minimum indentation for list continuation
1939                    let min_continuation_indent = CodeBlockUtils::calculate_min_continuation_indent(lines, line_idx);
1940
1941                    // Analyze code block context using the three-tier classification
1942                    let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
1943
1944                    match context {
1945                        CodeBlockContext::Indented => {
1946                            // Code block is properly indented - continues the list
1947                            block.end_line = line_num;
1948                            continue;
1949                        }
1950                        CodeBlockContext::Standalone => {
1951                            // Code block separates lists - end current block
1952                            let completed_block = current_block.take().unwrap();
1953                            list_blocks.push(completed_block);
1954                            continue;
1955                        }
1956                        CodeBlockContext::Adjacent => {
1957                            // Edge case - use conservative behavior (continue list)
1958                            block.end_line = line_num;
1959                            continue;
1960                        }
1961                    }
1962                } else {
1963                    // No current list block - skip code block lines
1964                    continue;
1965                }
1966            }
1967
1968            // Extract blockquote prefix if any
1969            let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(&line_info.content) {
1970                caps.get(0).unwrap().as_str().to_string()
1971            } else {
1972                String::new()
1973            };
1974
1975            // Check if this line is a list item
1976            if let Some(list_item) = &line_info.list_item {
1977                // Calculate nesting level based on indentation
1978                let item_indent = list_item.marker_column;
1979                let nesting = item_indent / 2; // Assume 2-space indentation for nesting
1980
1981                if let Some(ref mut block) = current_block {
1982                    // Check if this continues the current block
1983                    // For nested lists, we need to check if this is a nested item (higher nesting level)
1984                    // or a continuation at the same or lower level
1985                    let is_nested = nesting > block.nesting_level;
1986                    let same_type =
1987                        (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
1988                    let same_context = block.blockquote_prefix == blockquote_prefix;
1989                    let reasonable_distance = line_num <= last_list_item_line + 2; // Allow one blank line
1990
1991                    // For unordered lists, also check marker consistency
1992                    let marker_compatible =
1993                        block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
1994
1995                    // Check if there's non-list content between the last item and this one
1996                    let has_non_list_content = {
1997                        let mut found_non_list = false;
1998                        // Use the last item from the current block, not the global last_list_item_line
1999                        let block_last_item_line = block.item_lines.last().copied().unwrap_or(block.end_line);
2000
2001                        // Debug: Special check for problematic line
2002                        if block_last_item_line > 0 && block_last_item_line <= lines.len() {
2003                            let last_line = &lines[block_last_item_line - 1];
2004                            if last_line.content.contains(r"`sqlalchemy`") && last_line.content.contains(r"\`") {
2005                                log::debug!(
2006                                    "After problematic line {}: checking lines {} to {} for non-list content",
2007                                    block_last_item_line,
2008                                    block_last_item_line + 1,
2009                                    line_num
2010                                );
2011                                // If they're consecutive list items, there's no content between
2012                                if line_num == block_last_item_line + 1 {
2013                                    log::debug!("Lines are consecutive, no content between");
2014                                }
2015                            }
2016                        }
2017
2018                        for check_line in (block_last_item_line + 1)..line_num {
2019                            let check_idx = check_line - 1;
2020                            if check_idx < lines.len() {
2021                                let check_info = &lines[check_idx];
2022                                // Check for content that breaks the list
2023                                let is_list_breaking_content = if check_info.in_code_block {
2024                                    // Use enhanced code block classification for list separation
2025                                    let last_item_marker_width =
2026                                        if block_last_item_line > 0 && block_last_item_line <= lines.len() {
2027                                            lines[block_last_item_line - 1]
2028                                                .list_item
2029                                                .as_ref()
2030                                                .map(|li| {
2031                                                    if li.is_ordered {
2032                                                        li.marker.len() + 1 // Add 1 for the space after ordered list markers
2033                                                    } else {
2034                                                        li.marker.len()
2035                                                    }
2036                                                })
2037                                                .unwrap_or(3) // fallback to 3 if no list item found
2038                                        } else {
2039                                            3 // fallback
2040                                        };
2041
2042                                    let min_continuation = if block.is_ordered { last_item_marker_width } else { 2 };
2043
2044                                    // Analyze code block context using our enhanced classification
2045                                    let context = CodeBlockUtils::analyze_code_block_context(
2046                                        lines,
2047                                        check_line - 1,
2048                                        min_continuation,
2049                                    );
2050
2051                                    // Standalone code blocks break lists, indented ones continue them
2052                                    matches!(context, CodeBlockContext::Standalone)
2053                                } else if !check_info.is_blank && check_info.list_item.is_none() {
2054                                    // Check for structural separators that should break lists (from issue #42)
2055                                    let line_content = check_info.content.trim();
2056
2057                                    // Any of these structural separators break lists
2058                                    if check_info.heading.is_some()
2059                                        || line_content.starts_with("---")
2060                                        || line_content.starts_with("***")
2061                                        || line_content.starts_with("___")
2062                                        || (line_content.contains('|')
2063                                            && !line_content.contains("](")
2064                                            && !line_content.contains("http")
2065                                            && (line_content.matches('|').count() > 1
2066                                                || line_content.starts_with('|')
2067                                                || line_content.ends_with('|')))
2068                                        || line_content.starts_with(">")
2069                                    {
2070                                        true
2071                                    }
2072                                    // Other non-list content - check if properly indented
2073                                    else {
2074                                        let last_item_marker_width =
2075                                            if block_last_item_line > 0 && block_last_item_line <= lines.len() {
2076                                                lines[block_last_item_line - 1]
2077                                                    .list_item
2078                                                    .as_ref()
2079                                                    .map(|li| {
2080                                                        if li.is_ordered {
2081                                                            li.marker.len() + 1 // Add 1 for the space after ordered list markers
2082                                                        } else {
2083                                                            li.marker.len()
2084                                                        }
2085                                                    })
2086                                                    .unwrap_or(3) // fallback to 3 if no list item found
2087                                            } else {
2088                                                3 // fallback
2089                                            };
2090
2091                                        let min_continuation =
2092                                            if block.is_ordered { last_item_marker_width } else { 2 };
2093                                        check_info.indent < min_continuation
2094                                    }
2095                                } else {
2096                                    false
2097                                };
2098
2099                                if is_list_breaking_content {
2100                                    // Not indented enough, so it breaks the list
2101                                    found_non_list = true;
2102                                    break;
2103                                }
2104                            }
2105                        }
2106                        found_non_list
2107                    };
2108
2109                    // A list continues if:
2110                    // 1. It's a nested item (indented more than the parent), OR
2111                    // 2. It's the same type at the same level with reasonable distance
2112                    let mut continues_list = if is_nested {
2113                        // Nested items always continue the list if they're in the same context
2114                        same_context && reasonable_distance && !has_non_list_content
2115                    } else {
2116                        // Same-level items need to match type and markers
2117                        let result = same_type
2118                            && same_context
2119                            && reasonable_distance
2120                            && marker_compatible
2121                            && !has_non_list_content;
2122
2123                        // Debug logging for lines after problematic content
2124                        if block.item_lines.last().is_some_and(|&last_line| {
2125                            last_line > 0
2126                                && last_line <= lines.len()
2127                                && lines[last_line - 1].content.contains(r"`sqlalchemy`")
2128                                && lines[last_line - 1].content.contains(r"\`")
2129                        }) {
2130                            log::debug!(
2131                                "List continuation check after problematic line at line {line_num}: same_type={same_type}, same_context={same_context}, reasonable_distance={reasonable_distance}, marker_compatible={marker_compatible}, has_non_list_content={has_non_list_content}, continues={result}"
2132                            );
2133                            if line_num > 0 && line_num <= lines.len() {
2134                                log::debug!("Current line content: {:?}", lines[line_num - 1].content);
2135                            }
2136                        }
2137
2138                        result
2139                    };
2140
2141                    // WORKAROUND: If items are truly consecutive (no blank lines), they MUST be in the same list
2142                    // This handles edge cases where content patterns might otherwise split lists incorrectly
2143                    if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
2144                        // Check if the previous line was a list item
2145                        if block.item_lines.contains(&(line_num - 1)) {
2146                            // They're consecutive list items - force them to be in the same list
2147                            continues_list = true;
2148                        }
2149                    }
2150
2151                    if continues_list {
2152                        // Extend current block
2153                        block.end_line = line_num;
2154                        block.item_lines.push(line_num);
2155
2156                        // Update max marker width
2157                        block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
2158                            list_item.marker.len() + 1
2159                        } else {
2160                            list_item.marker.len()
2161                        });
2162
2163                        // Update marker consistency for unordered lists
2164                        if !block.is_ordered
2165                            && block.marker.is_some()
2166                            && block.marker.as_ref() != Some(&list_item.marker)
2167                        {
2168                            // Mixed markers, clear the marker field
2169                            block.marker = None;
2170                        }
2171                    } else {
2172                        // End current block and start a new one
2173
2174                        list_blocks.push(block.clone());
2175
2176                        *block = ListBlock {
2177                            start_line: line_num,
2178                            end_line: line_num,
2179                            is_ordered: list_item.is_ordered,
2180                            marker: if list_item.is_ordered {
2181                                None
2182                            } else {
2183                                Some(list_item.marker.clone())
2184                            },
2185                            blockquote_prefix: blockquote_prefix.clone(),
2186                            item_lines: vec![line_num],
2187                            nesting_level: nesting,
2188                            max_marker_width: if list_item.is_ordered {
2189                                list_item.marker.len() + 1
2190                            } else {
2191                                list_item.marker.len()
2192                            },
2193                        };
2194                    }
2195                } else {
2196                    // Start a new block
2197                    current_block = Some(ListBlock {
2198                        start_line: line_num,
2199                        end_line: line_num,
2200                        is_ordered: list_item.is_ordered,
2201                        marker: if list_item.is_ordered {
2202                            None
2203                        } else {
2204                            Some(list_item.marker.clone())
2205                        },
2206                        blockquote_prefix,
2207                        item_lines: vec![line_num],
2208                        nesting_level: nesting,
2209                        max_marker_width: list_item.marker.len(),
2210                    });
2211                }
2212
2213                last_list_item_line = line_num;
2214                current_indent_level = item_indent;
2215                last_marker_width = if list_item.is_ordered {
2216                    list_item.marker.len() + 1 // Add 1 for the space after ordered list markers
2217                } else {
2218                    list_item.marker.len()
2219                };
2220            } else if let Some(ref mut block) = current_block {
2221                // Not a list item - check if it continues the current block
2222
2223                // For MD032 compatibility, we use a simple approach:
2224                // - Indented lines continue the list
2225                // - Blank lines followed by indented content continue the list
2226                // - Everything else ends the list
2227
2228                // Check if the last line in the list block ended with a backslash (hard line break)
2229                // This handles cases where list items use backslash for hard line breaks
2230                let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
2231                    lines[block.end_line - 1].content.trim_end().ends_with('\\')
2232                } else {
2233                    false
2234                };
2235
2236                // Calculate minimum indentation for list continuation
2237                // For ordered lists, use the last marker width (e.g., 3 for "1. ", 4 for "10. ")
2238                // For unordered lists like "- ", content starts at column 2, so continuations need at least 2 spaces
2239                let min_continuation_indent = if block.is_ordered {
2240                    current_indent_level + last_marker_width
2241                } else {
2242                    current_indent_level + 2 // Unordered lists need at least 2 spaces (e.g., "- " = 2 chars)
2243                };
2244
2245                if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
2246                    // Indented line or backslash continuation continues the list
2247                    block.end_line = line_num;
2248                } else if line_info.is_blank {
2249                    // Blank line - check if it's internal to the list or ending it
2250                    // We only include blank lines that are followed by more list content
2251                    let mut check_idx = line_idx + 1;
2252                    let mut found_continuation = false;
2253
2254                    // Skip additional blank lines
2255                    while check_idx < lines.len() && lines[check_idx].is_blank {
2256                        check_idx += 1;
2257                    }
2258
2259                    if check_idx < lines.len() {
2260                        let next_line = &lines[check_idx];
2261                        // Check if followed by indented content (list continuation)
2262                        if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
2263                            found_continuation = true;
2264                        }
2265                        // Check if followed by another list item at the same level
2266                        else if !next_line.in_code_block
2267                            && next_line.list_item.is_some()
2268                            && let Some(item) = &next_line.list_item
2269                        {
2270                            let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
2271                                .find(&next_line.content)
2272                                .map_or(String::new(), |m| m.as_str().to_string());
2273                            if item.marker_column == current_indent_level
2274                                && item.is_ordered == block.is_ordered
2275                                && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
2276                            {
2277                                // Check if there was meaningful content between the list items (unused now)
2278                                // This variable is kept for potential future use but is currently replaced by has_structural_separators
2279                                let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
2280                                    if let Some(between_line) = lines.get(idx) {
2281                                        let trimmed = between_line.content.trim();
2282                                        // Skip empty lines
2283                                        if trimmed.is_empty() {
2284                                            return false;
2285                                        }
2286                                        // Check for meaningful content
2287                                        let line_indent =
2288                                            between_line.content.len() - between_line.content.trim_start().len();
2289
2290                                        // Structural separators (code fences, headings, etc.) are meaningful and should BREAK lists
2291                                        if trimmed.starts_with("```")
2292                                            || trimmed.starts_with("~~~")
2293                                            || trimmed.starts_with("---")
2294                                            || trimmed.starts_with("***")
2295                                            || trimmed.starts_with("___")
2296                                            || trimmed.starts_with(">")
2297                                            || trimmed.contains('|') // Tables
2298                                            || between_line.heading.is_some()
2299                                        {
2300                                            return true; // These are structural separators - meaningful content that breaks lists
2301                                        }
2302
2303                                        // Only properly indented content continues the list
2304                                        line_indent >= min_continuation_indent
2305                                    } else {
2306                                        false
2307                                    }
2308                                });
2309
2310                                if block.is_ordered {
2311                                    // For ordered lists: don't continue if there are structural separators
2312                                    // Check if there are structural separators between the list items
2313                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2314                                        if let Some(between_line) = lines.get(idx) {
2315                                            let trimmed = between_line.content.trim();
2316                                            if trimmed.is_empty() {
2317                                                return false;
2318                                            }
2319                                            // Check for structural separators that break lists
2320                                            trimmed.starts_with("```")
2321                                                || trimmed.starts_with("~~~")
2322                                                || trimmed.starts_with("---")
2323                                                || trimmed.starts_with("***")
2324                                                || trimmed.starts_with("___")
2325                                                || trimmed.starts_with(">")
2326                                                || trimmed.contains('|') // Tables
2327                                                || between_line.heading.is_some()
2328                                        } else {
2329                                            false
2330                                        }
2331                                    });
2332                                    found_continuation = !has_structural_separators;
2333                                } else {
2334                                    // For unordered lists: also check for structural separators
2335                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2336                                        if let Some(between_line) = lines.get(idx) {
2337                                            let trimmed = between_line.content.trim();
2338                                            if trimmed.is_empty() {
2339                                                return false;
2340                                            }
2341                                            // Check for structural separators that break lists
2342                                            trimmed.starts_with("```")
2343                                                || trimmed.starts_with("~~~")
2344                                                || trimmed.starts_with("---")
2345                                                || trimmed.starts_with("***")
2346                                                || trimmed.starts_with("___")
2347                                                || trimmed.starts_with(">")
2348                                                || trimmed.contains('|') // Tables
2349                                                || between_line.heading.is_some()
2350                                        } else {
2351                                            false
2352                                        }
2353                                    });
2354                                    found_continuation = !has_structural_separators;
2355                                }
2356                            }
2357                        }
2358                    }
2359
2360                    if found_continuation {
2361                        // Include the blank line in the block
2362                        block.end_line = line_num;
2363                    } else {
2364                        // Blank line ends the list - don't include it
2365                        list_blocks.push(block.clone());
2366                        current_block = None;
2367                    }
2368                } else {
2369                    // Check for lazy continuation - non-indented line immediately after a list item
2370                    // But only if the line has sufficient indentation for the list type
2371                    let min_required_indent = if block.is_ordered {
2372                        current_indent_level + last_marker_width
2373                    } else {
2374                        current_indent_level + 2
2375                    };
2376
2377                    // For lazy continuation to apply, the line must either:
2378                    // 1. Have no indentation (true lazy continuation)
2379                    // 2. Have sufficient indentation for the list type
2380                    // BUT structural separators (headings, code blocks, etc.) should never be lazy continuations
2381                    let line_content = line_info.content.trim();
2382                    let is_structural_separator = line_info.heading.is_some()
2383                        || line_content.starts_with("```")
2384                        || line_content.starts_with("~~~")
2385                        || line_content.starts_with("---")
2386                        || line_content.starts_with("***")
2387                        || line_content.starts_with("___")
2388                        || line_content.starts_with(">")
2389                        || (line_content.contains('|')
2390                            && !line_content.contains("](")
2391                            && !line_content.contains("http")
2392                            && (line_content.matches('|').count() > 1
2393                                || line_content.starts_with('|')
2394                                || line_content.ends_with('|'))); // Tables
2395
2396                    // Allow lazy continuation if we're still within the same list block
2397                    // (not just immediately after a list item)
2398                    let is_lazy_continuation = !is_structural_separator
2399                        && !line_info.is_blank
2400                        && (line_info.indent == 0 || line_info.indent >= min_required_indent);
2401
2402                    if is_lazy_continuation {
2403                        // Additional check: if the line starts with uppercase and looks like a new sentence,
2404                        // it's probably not a continuation
2405                        let content_to_check = if !blockquote_prefix.is_empty() {
2406                            // Strip blockquote prefix to check the actual content
2407                            line_info
2408                                .content
2409                                .strip_prefix(&blockquote_prefix)
2410                                .unwrap_or(&line_info.content)
2411                                .trim()
2412                        } else {
2413                            line_info.content.trim()
2414                        };
2415
2416                        let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
2417
2418                        // If it starts with uppercase and the previous line ended with punctuation,
2419                        // it's likely a new paragraph, not a continuation
2420                        if starts_with_uppercase && last_list_item_line > 0 {
2421                            // This looks like a new paragraph
2422                            list_blocks.push(block.clone());
2423                            current_block = None;
2424                        } else {
2425                            // This is a lazy continuation line
2426                            block.end_line = line_num;
2427                        }
2428                    } else {
2429                        // Non-indented, non-blank line that's not a lazy continuation - end the block
2430                        list_blocks.push(block.clone());
2431                        current_block = None;
2432                    }
2433                }
2434            }
2435        }
2436
2437        // Don't forget the last block
2438        if let Some(block) = current_block {
2439            list_blocks.push(block);
2440        }
2441
2442        // Merge adjacent blocks that should be one
2443        merge_adjacent_list_blocks(&mut list_blocks, lines);
2444
2445        list_blocks
2446    }
2447
2448    /// Compute character frequency for fast content analysis
2449    fn compute_char_frequency(content: &str) -> CharFrequency {
2450        let mut frequency = CharFrequency::default();
2451
2452        for ch in content.chars() {
2453            match ch {
2454                '#' => frequency.hash_count += 1,
2455                '*' => frequency.asterisk_count += 1,
2456                '_' => frequency.underscore_count += 1,
2457                '-' => frequency.hyphen_count += 1,
2458                '+' => frequency.plus_count += 1,
2459                '>' => frequency.gt_count += 1,
2460                '|' => frequency.pipe_count += 1,
2461                '[' => frequency.bracket_count += 1,
2462                '`' => frequency.backtick_count += 1,
2463                '<' => frequency.lt_count += 1,
2464                '!' => frequency.exclamation_count += 1,
2465                '\n' => frequency.newline_count += 1,
2466                _ => {}
2467            }
2468        }
2469
2470        frequency
2471    }
2472
2473    /// Parse HTML tags in the content
2474    fn parse_html_tags(
2475        content: &str,
2476        lines: &[LineInfo],
2477        code_blocks: &[(usize, usize)],
2478        flavor: MarkdownFlavor,
2479    ) -> Vec<HtmlTag> {
2480        lazy_static! {
2481            static ref HTML_TAG_REGEX: regex::Regex =
2482                regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap();
2483        }
2484
2485        let mut html_tags = Vec::with_capacity(content.matches('<').count());
2486
2487        for cap in HTML_TAG_REGEX.captures_iter(content) {
2488            let full_match = cap.get(0).unwrap();
2489            let match_start = full_match.start();
2490            let match_end = full_match.end();
2491
2492            // Skip if in code block
2493            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2494                continue;
2495            }
2496
2497            let is_closing = !cap.get(1).unwrap().as_str().is_empty();
2498            let tag_name_original = cap.get(2).unwrap().as_str();
2499            let tag_name = tag_name_original.to_lowercase();
2500            let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
2501
2502            // Skip JSX components in MDX files (tags starting with uppercase letter)
2503            // JSX components like <Chart />, <MyComponent> should not be treated as HTML
2504            if flavor.supports_jsx() && tag_name_original.chars().next().is_some_and(|c| c.is_uppercase()) {
2505                continue;
2506            }
2507
2508            // Find which line this tag is on
2509            let mut line_num = 1;
2510            let mut col_start = match_start;
2511            let mut col_end = match_end;
2512            for (idx, line_info) in lines.iter().enumerate() {
2513                if match_start >= line_info.byte_offset {
2514                    line_num = idx + 1;
2515                    col_start = match_start - line_info.byte_offset;
2516                    col_end = match_end - line_info.byte_offset;
2517                } else {
2518                    break;
2519                }
2520            }
2521
2522            html_tags.push(HtmlTag {
2523                line: line_num,
2524                start_col: col_start,
2525                end_col: col_end,
2526                byte_offset: match_start,
2527                byte_end: match_end,
2528                tag_name,
2529                is_closing,
2530                is_self_closing,
2531                raw_content: full_match.as_str().to_string(),
2532            });
2533        }
2534
2535        html_tags
2536    }
2537
2538    /// Parse emphasis spans in the content
2539    fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
2540        lazy_static! {
2541            static ref EMPHASIS_REGEX: regex::Regex =
2542                regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap();
2543        }
2544
2545        let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
2546
2547        for cap in EMPHASIS_REGEX.captures_iter(content) {
2548            let full_match = cap.get(0).unwrap();
2549            let match_start = full_match.start();
2550            let match_end = full_match.end();
2551
2552            // Skip if in code block
2553            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2554                continue;
2555            }
2556
2557            let opening_markers = cap.get(1).unwrap().as_str();
2558            let content_part = cap.get(2).unwrap().as_str();
2559            let closing_markers = cap.get(3).unwrap().as_str();
2560
2561            // Validate matching markers
2562            if opening_markers.chars().next() != closing_markers.chars().next()
2563                || opening_markers.len() != closing_markers.len()
2564            {
2565                continue;
2566            }
2567
2568            let marker = opening_markers.chars().next().unwrap();
2569            let marker_count = opening_markers.len();
2570
2571            // Find which line this emphasis is on
2572            let mut line_num = 1;
2573            let mut col_start = match_start;
2574            let mut col_end = match_end;
2575            for (idx, line_info) in lines.iter().enumerate() {
2576                if match_start >= line_info.byte_offset {
2577                    line_num = idx + 1;
2578                    col_start = match_start - line_info.byte_offset;
2579                    col_end = match_end - line_info.byte_offset;
2580                } else {
2581                    break;
2582                }
2583            }
2584
2585            emphasis_spans.push(EmphasisSpan {
2586                line: line_num,
2587                start_col: col_start,
2588                end_col: col_end,
2589                byte_offset: match_start,
2590                byte_end: match_end,
2591                marker,
2592                marker_count,
2593                content: content_part.to_string(),
2594            });
2595        }
2596
2597        emphasis_spans
2598    }
2599
2600    /// Parse table rows in the content
2601    fn parse_table_rows(lines: &[LineInfo]) -> Vec<TableRow> {
2602        let mut table_rows = Vec::with_capacity(lines.len() / 20);
2603
2604        for (line_idx, line_info) in lines.iter().enumerate() {
2605            // Skip lines in code blocks or blank lines
2606            if line_info.in_code_block || line_info.is_blank {
2607                continue;
2608            }
2609
2610            let line = &line_info.content;
2611            let line_num = line_idx + 1;
2612
2613            // Check if this line contains pipes (potential table row)
2614            if !line.contains('|') {
2615                continue;
2616            }
2617
2618            // Count columns by splitting on pipes
2619            let parts: Vec<&str> = line.split('|').collect();
2620            let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
2621
2622            // Check if this is a separator row
2623            let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
2624            let mut column_alignments = Vec::new();
2625
2626            if is_separator {
2627                for part in &parts[1..parts.len() - 1] {
2628                    // Skip first and last empty parts
2629                    let trimmed = part.trim();
2630                    let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
2631                        "center".to_string()
2632                    } else if trimmed.ends_with(':') {
2633                        "right".to_string()
2634                    } else if trimmed.starts_with(':') {
2635                        "left".to_string()
2636                    } else {
2637                        "none".to_string()
2638                    };
2639                    column_alignments.push(alignment);
2640                }
2641            }
2642
2643            table_rows.push(TableRow {
2644                line: line_num,
2645                is_separator,
2646                column_count,
2647                column_alignments,
2648            });
2649        }
2650
2651        table_rows
2652    }
2653
2654    /// Parse bare URLs and emails in the content
2655    fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
2656        let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
2657
2658        // Check for bare URLs (not in angle brackets or markdown links)
2659        for cap in BARE_URL_PATTERN.captures_iter(content) {
2660            let full_match = cap.get(0).unwrap();
2661            let match_start = full_match.start();
2662            let match_end = full_match.end();
2663
2664            // Skip if in code block
2665            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2666                continue;
2667            }
2668
2669            // Skip if already in angle brackets or markdown links
2670            let preceding_char = if match_start > 0 {
2671                content.chars().nth(match_start - 1)
2672            } else {
2673                None
2674            };
2675            let following_char = content.chars().nth(match_end);
2676
2677            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2678                continue;
2679            }
2680            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2681                continue;
2682            }
2683
2684            let url = full_match.as_str();
2685            let url_type = if url.starts_with("https://") {
2686                "https"
2687            } else if url.starts_with("http://") {
2688                "http"
2689            } else if url.starts_with("ftp://") {
2690                "ftp"
2691            } else {
2692                "other"
2693            };
2694
2695            // Find which line this URL is on
2696            let mut line_num = 1;
2697            let mut col_start = match_start;
2698            let mut col_end = match_end;
2699            for (idx, line_info) in lines.iter().enumerate() {
2700                if match_start >= line_info.byte_offset {
2701                    line_num = idx + 1;
2702                    col_start = match_start - line_info.byte_offset;
2703                    col_end = match_end - line_info.byte_offset;
2704                } else {
2705                    break;
2706                }
2707            }
2708
2709            bare_urls.push(BareUrl {
2710                line: line_num,
2711                start_col: col_start,
2712                end_col: col_end,
2713                byte_offset: match_start,
2714                byte_end: match_end,
2715                url: url.to_string(),
2716                url_type: url_type.to_string(),
2717            });
2718        }
2719
2720        // Check for bare email addresses
2721        for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
2722            let full_match = cap.get(0).unwrap();
2723            let match_start = full_match.start();
2724            let match_end = full_match.end();
2725
2726            // Skip if in code block
2727            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2728                continue;
2729            }
2730
2731            // Skip if already in angle brackets or markdown links
2732            let preceding_char = if match_start > 0 {
2733                content.chars().nth(match_start - 1)
2734            } else {
2735                None
2736            };
2737            let following_char = content.chars().nth(match_end);
2738
2739            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2740                continue;
2741            }
2742            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2743                continue;
2744            }
2745
2746            let email = full_match.as_str();
2747
2748            // Find which line this email is on
2749            let mut line_num = 1;
2750            let mut col_start = match_start;
2751            let mut col_end = match_end;
2752            for (idx, line_info) in lines.iter().enumerate() {
2753                if match_start >= line_info.byte_offset {
2754                    line_num = idx + 1;
2755                    col_start = match_start - line_info.byte_offset;
2756                    col_end = match_end - line_info.byte_offset;
2757                } else {
2758                    break;
2759                }
2760            }
2761
2762            bare_urls.push(BareUrl {
2763                line: line_num,
2764                start_col: col_start,
2765                end_col: col_end,
2766                byte_offset: match_start,
2767                byte_end: match_end,
2768                url: email.to_string(),
2769                url_type: "email".to_string(),
2770            });
2771        }
2772
2773        bare_urls
2774    }
2775}
2776
2777/// Merge adjacent list blocks that should be treated as one
2778fn merge_adjacent_list_blocks(list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
2779    if list_blocks.len() < 2 {
2780        return;
2781    }
2782
2783    let mut merger = ListBlockMerger::new(lines);
2784    *list_blocks = merger.merge(list_blocks);
2785}
2786
/// Helper struct to manage the complex logic of merging list blocks
///
/// Holds only a borrowed view of the line data; the blocks to merge are passed to its
/// methods rather than stored here.
struct ListBlockMerger<'a> {
    /// Borrowed slice of per-line information supplied to `new`.
    lines: &'a [LineInfo],
}
2791
2792impl<'a> ListBlockMerger<'a> {
2793    fn new(lines: &'a [LineInfo]) -> Self {
2794        Self { lines }
2795    }
2796
2797    fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
2798        let mut merged = Vec::with_capacity(list_blocks.len());
2799        let mut current = list_blocks[0].clone();
2800
2801        for next in list_blocks.iter().skip(1) {
2802            if self.should_merge_blocks(&current, next) {
2803                current = self.merge_two_blocks(current, next);
2804            } else {
2805                merged.push(current);
2806                current = next.clone();
2807            }
2808        }
2809
2810        merged.push(current);
2811        merged
2812    }
2813
2814    /// Determine if two adjacent list blocks should be merged
2815    fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
2816        // Basic compatibility checks
2817        if !self.blocks_are_compatible(current, next) {
2818            return false;
2819        }
2820
2821        // Check spacing and content between blocks
2822        let spacing = self.analyze_spacing_between(current, next);
2823        match spacing {
2824            BlockSpacing::Consecutive => true,
2825            BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
2826            BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
2827                self.can_merge_with_content_between(current, next)
2828            }
2829        }
2830    }
2831
2832    /// Check if blocks have compatible structure for merging
2833    fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
2834        current.is_ordered == next.is_ordered
2835            && current.blockquote_prefix == next.blockquote_prefix
2836            && current.nesting_level == next.nesting_level
2837    }
2838
2839    /// Analyze the spacing between two list blocks
2840    fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
2841        let gap = next.start_line - current.end_line;
2842
2843        match gap {
2844            1 => BlockSpacing::Consecutive,
2845            2 => BlockSpacing::SingleBlank,
2846            _ if gap > 2 => {
2847                if self.has_only_blank_lines_between(current, next) {
2848                    BlockSpacing::MultipleBlanks
2849                } else {
2850                    BlockSpacing::ContentBetween
2851                }
2852            }
2853            _ => BlockSpacing::Consecutive, // gap == 0, overlapping (shouldn't happen)
2854        }
2855    }
2856
2857    /// Check if unordered lists can be merged with a single blank line between
2858    fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2859        // Check if there are structural separators between the blocks
2860        // If has_meaningful_content_between returns true, it means there are structural separators
2861        if has_meaningful_content_between(current, next, self.lines) {
2862            return false; // Structural separators prevent merging
2863        }
2864
2865        // Only merge unordered lists with same marker across single blank
2866        !current.is_ordered && current.marker == next.marker
2867    }
2868
2869    /// Check if ordered lists can be merged when there's content between them
2870    fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2871        // Do not merge lists if there are structural separators between them
2872        if has_meaningful_content_between(current, next, self.lines) {
2873            return false; // Structural separators prevent merging
2874        }
2875
2876        // Only consider merging ordered lists if there's no structural content between
2877        current.is_ordered && next.is_ordered
2878    }
2879
2880    /// Check if there are only blank lines between blocks
2881    fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2882        for line_num in (current.end_line + 1)..next.start_line {
2883            if let Some(line_info) = self.lines.get(line_num - 1)
2884                && !line_info.content.trim().is_empty()
2885            {
2886                return false;
2887            }
2888        }
2889        true
2890    }
2891
2892    /// Merge two compatible list blocks into one
2893    fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
2894        current.end_line = next.end_line;
2895        current.item_lines.extend_from_slice(&next.item_lines);
2896
2897        // Update max marker width
2898        current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
2899
2900        // Handle marker consistency for unordered lists
2901        if !current.is_ordered && self.markers_differ(&current, next) {
2902            current.marker = None; // Mixed markers
2903        }
2904
2905        current
2906    }
2907
2908    /// Check if two blocks have different markers
2909    fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
2910        current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
2911    }
2912}
2913
/// Types of spacing between list blocks.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum BlockSpacing {
    /// No gap between blocks.
    Consecutive,
    /// One blank line between blocks.
    SingleBlank,
    /// Multiple blank lines but no content.
    MultipleBlanks,
    /// Content exists between blocks.
    ContentBetween,
}
2922
2923/// Check if there's meaningful content (not just blank lines) between two list blocks
2924fn has_meaningful_content_between(current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
2925    // Check lines between current.end_line and next.start_line
2926    for line_num in (current.end_line + 1)..next.start_line {
2927        if let Some(line_info) = lines.get(line_num - 1) {
2928            // Convert to 0-indexed
2929            let trimmed = line_info.content.trim();
2930
2931            // Skip empty lines
2932            if trimmed.is_empty() {
2933                continue;
2934            }
2935
2936            // Check for structural separators that should separate lists (CommonMark compliant)
2937
2938            // Headings separate lists
2939            if line_info.heading.is_some() {
2940                return true; // Has meaningful content - headings separate lists
2941            }
2942
2943            // Horizontal rules separate lists (---, ***, ___)
2944            if is_horizontal_rule(trimmed) {
2945                return true; // Has meaningful content - horizontal rules separate lists
2946            }
2947
2948            // Tables separate lists (lines containing | but not in URLs or code)
2949            // Simple heuristic: tables typically have | at start/end or multiple |
2950            if trimmed.contains('|') && trimmed.len() > 1 {
2951                // Don't treat URLs with | as tables
2952                if !trimmed.contains("](") && !trimmed.contains("http") {
2953                    // More robust check: tables usually have multiple | or | at edges
2954                    let pipe_count = trimmed.matches('|').count();
2955                    if pipe_count > 1 || trimmed.starts_with('|') || trimmed.ends_with('|') {
2956                        return true; // Has meaningful content - tables separate lists
2957                    }
2958                }
2959            }
2960
2961            // Blockquotes separate lists
2962            if trimmed.starts_with('>') {
2963                return true; // Has meaningful content - blockquotes separate lists
2964            }
2965
2966            // Code block fences separate lists (unless properly indented as list content)
2967            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
2968                let line_indent = line_info.content.len() - line_info.content.trim_start().len();
2969
2970                // Check if this code block is properly indented as list continuation
2971                let min_continuation_indent = if current.is_ordered {
2972                    current.nesting_level + current.max_marker_width + 1 // +1 for space after marker
2973                } else {
2974                    current.nesting_level + 2
2975                };
2976
2977                if line_indent < min_continuation_indent {
2978                    // This is a standalone code block that separates lists
2979                    return true; // Has meaningful content - standalone code blocks separate lists
2980                }
2981            }
2982
2983            // Check if this line has proper indentation for list continuation
2984            let line_indent = line_info.content.len() - line_info.content.trim_start().len();
2985
2986            // Calculate minimum indentation needed to be list continuation
2987            let min_indent = if current.is_ordered {
2988                current.nesting_level + current.max_marker_width
2989            } else {
2990                current.nesting_level + 2
2991            };
2992
2993            // If the line is not indented enough to be list continuation, it's meaningful content
2994            if line_indent < min_indent {
2995                return true; // Has meaningful content - content not indented as list continuation
2996            }
2997
2998            // If we reach here, the line is properly indented as list continuation
2999            // Continue checking other lines
3000        }
3001    }
3002
3003    // Only blank lines or properly indented list continuation content between blocks
3004    false
3005}
3006
/// Check if a line is a horizontal rule (---, ***, ___).
///
/// `trimmed` is expected to have surrounding whitespace already removed.
/// The line qualifies when it starts with `-`, `*` or `_`, contains at least
/// three occurrences of that same character, and holds nothing else except
/// spaces or tabs.
fn is_horizontal_rule(trimmed: &str) -> bool {
    // Fewer than three bytes can never hold three marker characters.
    if trimmed.len() < 3 {
        return false;
    }

    // The first character decides which marker the whole line must use.
    let marker = match trimmed.chars().next() {
        Some(ch @ ('-' | '*' | '_')) => ch,
        _ => return false,
    };

    let mut marker_count = 0usize;
    for ch in trimmed.chars() {
        match ch {
            c if c == marker => marker_count += 1,
            ' ' | '\t' => {}
            _ => return false, // Non-matching, non-whitespace character
        }
    }

    marker_count >= 3
}
3030
/// Unit tests for `LintContext`: offset/line bookkeeping, per-line metadata,
/// list item detection, and MDX ESM block detection.
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input has a single line offset and no line entries.
    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.lines.len(), 0);
    }

    /// A single line without trailing newline maps offsets to 1-based columns.
    #[test]
    fn test_single_line() {
        let ctx = LintContext::new("# Hello", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "# Hello");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(3), (1, 4));
    }

    /// Line offsets and offset-to-position conversion across several lines.
    #[test]
    fn test_multi_line() {
        let content = "# Title\n\nSecond line\nThird line";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);
        // Test offset to line/col
        assert_eq!(ctx.offset_to_line_col(0), (1, 1)); // start
        assert_eq!(ctx.offset_to_line_col(8), (2, 1)); // start of blank line
        assert_eq!(ctx.offset_to_line_col(9), (3, 1)); // start of 'Second line'
        assert_eq!(ctx.offset_to_line_col(15), (3, 7)); // middle of 'Second line'
        assert_eq!(ctx.offset_to_line_col(21), (4, 1)); // start of 'Third line'
    }

    /// Per-line metadata: content, byte offset, indent, blank/code/list flags.
    #[test]
    fn test_line_info() {
        let content = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        // Test line info
        assert_eq!(ctx.lines.len(), 7);

        // Line 1: "# Title"
        let line1 = &ctx.lines[0];
        assert_eq!(line1.content, "# Title");
        assert_eq!(line1.byte_offset, 0);
        assert_eq!(line1.indent, 0);
        assert!(!line1.is_blank);
        assert!(!line1.in_code_block);
        assert!(line1.list_item.is_none());

        // Line 2: "    indented"
        let line2 = &ctx.lines[1];
        assert_eq!(line2.content, "    indented");
        assert_eq!(line2.byte_offset, 8);
        assert_eq!(line2.indent, 4);
        assert!(!line2.is_blank);

        // Line 3: "" (blank)
        let line3 = &ctx.lines[2];
        assert_eq!(line3.content, "");
        assert!(line3.is_blank);

        // Test helper methods
        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
        assert_eq!(ctx.line_info(1).map(|l| l.indent), Some(0));
        assert_eq!(ctx.line_info(2).map(|l| l.indent), Some(4));
    }

    /// Unordered, nested and ordered list items are detected with their markers.
    #[test]
    fn test_list_item_detection() {
        let content = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        // Line 1: "- Unordered item"
        let line1 = &ctx.lines[0];
        assert!(line1.list_item.is_some());
        let list1 = line1.list_item.as_ref().unwrap();
        assert_eq!(list1.marker, "-");
        assert!(!list1.is_ordered);
        assert_eq!(list1.marker_column, 0);
        assert_eq!(list1.content_column, 2);

        // Line 2: "  * Nested item"
        let line2 = &ctx.lines[1];
        assert!(line2.list_item.is_some());
        let list2 = line2.list_item.as_ref().unwrap();
        assert_eq!(list2.marker, "*");
        assert_eq!(list2.marker_column, 2);

        // Line 3: "1. Ordered item"
        let line3 = &ctx.lines[2];
        assert!(line3.list_item.is_some());
        let list3 = line3.list_item.as_ref().unwrap();
        assert_eq!(list3.marker, "1.");
        assert!(list3.is_ordered);
        assert_eq!(list3.number, Some(1));

        // Line 6: "Not a list"
        let line6 = &ctx.lines[5];
        assert!(line6.list_item.is_none());
    }

    /// Offsets at and just past the end of each line, including end of input.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let content = "a\nb\nc";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        // line_offsets: [0, 2, 4]
        assert_eq!(ctx.offset_to_line_col(0), (1, 1)); // 'a'
        assert_eq!(ctx.offset_to_line_col(1), (1, 2)); // after 'a'
        assert_eq!(ctx.offset_to_line_col(2), (2, 1)); // 'b'
        assert_eq!(ctx.offset_to_line_col(3), (2, 2)); // after 'b'
        assert_eq!(ctx.offset_to_line_col(4), (3, 1)); // 'c'
        assert_eq!(ctx.offset_to_line_col(5), (3, 2)); // after 'c'
    }

    /// In the MDX flavor, leading import/export lines are flagged as ESM blocks.
    #[test]
    fn test_mdx_esm_blocks() {
        let content = r##"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall

In {year}, the snowfall was above average.
It was followed by a warm spring which caused
flood conditions in many of the nearby rivers.

<Chart color="#fcb32c" year={year} />
"##;

        let ctx = LintContext::new(content, MarkdownFlavor::MDX);

        // Check that lines 1 and 2 are marked as ESM blocks
        assert_eq!(ctx.lines.len(), 10);
        assert!(ctx.lines[0].in_esm_block, "Line 1 (import) should be in_esm_block");
        assert!(ctx.lines[1].in_esm_block, "Line 2 (export) should be in_esm_block");
        assert!(!ctx.lines[2].in_esm_block, "Line 3 (blank) should NOT be in_esm_block");
        assert!(
            !ctx.lines[3].in_esm_block,
            "Line 4 (heading) should NOT be in_esm_block"
        );
        assert!(!ctx.lines[4].in_esm_block, "Line 5 (blank) should NOT be in_esm_block");
        assert!(!ctx.lines[5].in_esm_block, "Line 6 (text) should NOT be in_esm_block");
    }

    /// The same import/export lines are plain text in the Standard flavor.
    #[test]
    fn test_mdx_esm_blocks_not_detected_in_standard_flavor() {
        let content = r#"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall
"#;

        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        // ESM blocks should NOT be detected in Standard flavor
        assert!(
            !ctx.lines[0].in_esm_block,
            "Line 1 should NOT be in_esm_block in Standard flavor"
        );
        assert!(
            !ctx.lines[1].in_esm_block,
            "Line 2 should NOT be in_esm_block in Standard flavor"
        );
    }
}
// rumdl_lib/lint_context.rs