rumdl_lib/
lint_context.rs

1use crate::config::MarkdownFlavor;
2use crate::rules::front_matter_utils::FrontMatterUtils;
3use crate::utils::ast_utils::get_cached_ast;
4use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
5use lazy_static::lazy_static;
6use markdown::mdast::Node;
7use regex::Regex;
8
// Regexes shared by the parsers in this module. `lazy_static!` defers
// compilation of each pattern until its first use.
lazy_static! {
    // Comprehensive link pattern that captures both inline and reference links.
    // Flags: (?s) makes `.` match newlines; (?x) is verbose mode, so whitespace and
    // the trailing `# ...` notes inside the pattern are ignored by the regex engine.
    // The pattern also matches the `[...]` part of an image; `parse_links` filters
    // those out by checking for a preceding `!`.
    static ref LINK_PATTERN: Regex = Regex::new(
        r#"(?sx)
        \[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]          # Link text in group 1 (handles nested brackets)
        (?:
            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
            |
            \[([^\]]*)\]      # Reference ID in group 6
        )"#
    ).unwrap();

    // Image pattern: identical to LINK_PATTERN except for the leading `!`.
    // Same flags: (?s) makes `.` match newlines, (?x) is verbose mode.
    static ref IMAGE_PATTERN: Regex = Regex::new(
        r#"(?sx)
        !\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]         # Alt text in group 1 (handles nested brackets)
        (?:
            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
            |
            \[([^\]]*)\]      # Reference ID in group 6
        )"#
    ).unwrap();

    // Reference definition pattern: `[id]: url "title"` with up to three leading spaces.
    // (?m) lets `^`/`$` match at line boundaries.
    // Groups: 1 = reference ID, 2 = URL, 3 = double-quoted title, 4 = single-quoted title.
    static ref REF_DEF_PATTERN: Regex = Regex::new(
        r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#
    ).unwrap();

    // Code span delimiter pattern: matches a run of one or more backticks.
    // It does not capture the span content itself; pairing opening and closing
    // runs is left to the code that uses this pattern.
    static ref CODE_SPAN_PATTERN: Regex = Regex::new(
        r"`+"
    ).unwrap();

    // Pattern for bare URLs (http, https or ftp scheme) with optional port,
    // path, query string and fragment.
    static ref BARE_URL_PATTERN: Regex = Regex::new(
        r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
    ).unwrap();

    // Pattern for email addresses (ASCII local part and domain, TLD of 2+ letters)
    static ref BARE_EMAIL_PATTERN: Regex = Regex::new(
        r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    ).unwrap();

    // Pattern for angle bracket links `<url>` / `<user@host.tld>`
    // (to exclude from bare URL detection). Group 1 is the text between the brackets.
    static ref ANGLE_BRACKET_PATTERN: Regex = Regex::new(
        r"<((?:https?|ftp)://[^>]+|[^@\s]+@[^@\s]+\.[^@\s>]+)>"
    ).unwrap();

    // Pattern for blockquote prefix in parse_list_blocks: optional leading
    // whitespace, one or more `>`, then optional whitespace (all in group 1).
    static ref BLOCKQUOTE_PREFIX_REGEX: Regex = Regex::new(r"^(\s*>+\s*)").unwrap();
}
63
/// Pre-computed information about a single line of the document.
///
/// Entries are stored in `LintContext::lines` and can be looked up by
/// 1-indexed line number through `LintContext::line_info`.
#[derive(Debug, Clone)]
pub struct LineInfo {
    /// The actual line content (without newline)
    pub content: String,
    /// Byte offset where this line starts in the document
    pub byte_offset: usize,
    /// Number of leading spaces/tabs
    pub indent: usize,
    /// Whether the line is blank (empty or only whitespace)
    pub is_blank: bool,
    /// Whether this line is inside a code block
    pub in_code_block: bool,
    /// Whether this line is inside front matter
    pub in_front_matter: bool,
    /// Whether this line is inside an HTML block
    // NOTE(review): presumably filled in by the separate `detect_html_blocks`
    // pass that `LintContext::new` runs over the lines — confirm.
    pub in_html_block: bool,
    /// Whether this line is inside an HTML comment
    pub in_html_comment: bool,
    /// List item information if this line starts a list item
    pub list_item: Option<ListItemInfo>,
    /// Heading information if this line is a heading
    pub heading: Option<HeadingInfo>,
    /// Blockquote information if this line is a blockquote
    pub blockquote: Option<BlockquoteInfo>,
    /// Whether this line is inside a mkdocstrings autodoc block
    pub in_mkdocstrings: bool,
}
92
/// Information about a list item.
///
/// Attached to a line through `LineInfo::list_item` when that line starts a list item.
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// The marker used (*, -, +, or number with . or ))
    pub marker: String,
    /// Whether it's ordered (true) or unordered (false)
    pub is_ordered: bool,
    /// The number for ordered lists (`None` is presumably used for unordered items — confirm)
    pub number: Option<usize>,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where content after marker starts
    pub content_column: usize,
}
107
/// Heading style type.
///
/// All variants are field-less, so equality is total and the type derives
/// `Eq` in addition to `PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum HeadingStyle {
    /// ATX style heading (# Heading)
    ATX,
    /// Setext style heading with = underline
    Setext1,
    /// Setext style heading with - underline
    Setext2,
}
118
/// Parsed link information.
///
/// Produced by `parse_links`. Columns are computed there as byte distances
/// from the start of the line, and the byte range is that of the whole regex match.
#[derive(Debug, Clone)]
pub struct ParsedLink {
    /// Line number (1-indexed) on which the link starts
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed); for a link spanning several lines this is
    /// measured on the line where the link ends
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Link text
    pub text: String,
    /// Link URL; left empty for reference links, which are resolved through the reference definitions
    pub url: String,
    /// Whether this is a reference link [text][ref] vs inline [text](url)
    pub is_reference: bool,
    /// Reference ID for reference links, lowercased; for `[text][]` the lowercased link text is used
    pub reference_id: Option<String>,
}
141
/// Parsed image information.
///
/// Mirrors `ParsedLink`, with `alt_text` in place of the link text.
#[derive(Debug, Clone)]
pub struct ParsedImage {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Alt text
    pub alt_text: String,
    /// Image URL or reference
    pub url: String,
    /// Whether this is a reference image ![alt][ref] vs inline ![alt](url)
    pub is_reference: bool,
    /// Reference ID for reference images
    // NOTE(review): presumably lowercased the same way as `ParsedLink::reference_id` — confirm in `parse_images`.
    pub reference_id: Option<String>,
}
164
/// Reference definition [ref]: url "title"
///
/// Looked up by `LintContext::get_reference_url`, which lowercases the
/// requested ID before comparing it with `id`.
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// Line number (1-indexed)
    pub line: usize,
    /// Reference ID (normalized to lowercase)
    pub id: String,
    /// URL
    pub url: String,
    /// Optional title
    pub title: Option<String>,
}
177
/// Parsed code span information.
///
/// Callers in this file treat both ranges as half-open: a position is inside
/// the span when `byte_offset <= pos < byte_end`, and a column is inside when
/// `start_col <= col < end_col`.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed, exclusive) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document (exclusive)
    pub byte_end: usize,
    /// Number of backticks used (1, 2, 3, etc.)
    pub backtick_count: usize,
    /// Content inside the code span (without backticks)
    pub content: String,
}
196
/// Information about a heading.
///
/// Attached to a line through `LineInfo::heading` when that line is a heading.
#[derive(Debug, Clone)]
pub struct HeadingInfo {
    /// Heading level (1-6 for ATX, 1-2 for Setext)
    pub level: u8,
    /// Style of heading
    pub style: HeadingStyle,
    /// The heading marker (# characters or underline)
    pub marker: String,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where heading text starts
    pub content_column: usize,
    /// The heading text (without markers and without custom ID syntax)
    pub text: String,
    /// Custom header ID if present (e.g., from {#custom-id} syntax)
    pub custom_id: Option<String>,
    /// Original heading text including custom ID syntax
    pub raw_text: String,
    /// Whether it has a closing sequence (for ATX)
    pub has_closing_sequence: bool,
    /// The closing sequence if present (presumably empty otherwise — confirm)
    pub closing_sequence: String,
}
221
/// Information about a blockquote line.
///
/// Attached to a line through `LineInfo::blockquote` when that line is a blockquote.
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Nesting level (1 for >, 2 for >>, etc.)
    pub nesting_level: usize,
    /// The indentation before the blockquote marker
    pub indent: String,
    /// Column where the first > starts (0-based)
    pub marker_column: usize,
    /// The blockquote prefix (e.g., "> ", ">> ", etc.)
    pub prefix: String,
    /// Content after the blockquote marker(s)
    pub content: String,
    /// Whether the line has no space after the marker
    pub has_no_space_after_marker: bool,
    /// Whether the line has multiple spaces after the marker
    pub has_multiple_spaces_after_marker: bool,
    /// Whether this is an empty blockquote line needing MD028 fix
    pub needs_md028_fix: bool,
}
242
/// Information about a list block.
///
/// Stored in `LintContext::list_blocks`. The lookups in this file treat
/// `start_line..=end_line` as an inclusive range of 1-indexed line numbers.
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// Line number where the list starts (1-indexed)
    pub start_line: usize,
    /// Line number where the list ends (1-indexed, inclusive)
    pub end_line: usize,
    /// Whether it's ordered or unordered
    pub is_ordered: bool,
    /// The consistent marker for unordered lists (if any)
    pub marker: Option<String>,
    /// Blockquote prefix for this list (empty if not in blockquote)
    pub blockquote_prefix: String,
    /// Lines that are list items within this block
    // NOTE(review): presumably 1-indexed line numbers like start_line/end_line — confirm in parse_list_blocks.
    pub item_lines: Vec<usize>,
    /// Nesting level (0 for top-level lists)
    pub nesting_level: usize,
    /// Maximum marker width seen in this block (e.g., 3 for "1. ", 4 for "10. ")
    pub max_marker_width: usize,
}
263
264use std::sync::{Arc, Mutex};
265
/// Character frequency data for fast content analysis.
///
/// Backs `LintContext::has_char`, `LintContext::char_count` and the
/// `likely_has_*` heuristics, which read these counters instead of
/// rescanning the document. `Default` yields all-zero counts.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Count of # characters (headings)
    pub hash_count: usize,
    /// Count of * characters (emphasis, lists, horizontal rules)
    pub asterisk_count: usize,
    /// Count of _ characters (emphasis, horizontal rules)
    pub underscore_count: usize,
    /// Count of - characters (lists, horizontal rules, setext headings)
    pub hyphen_count: usize,
    /// Count of + characters (lists)
    pub plus_count: usize,
    /// Count of > characters (blockquotes)
    pub gt_count: usize,
    /// Count of | characters (tables)
    pub pipe_count: usize,
    /// Count of [ characters (links, images); closing brackets are not tracked here
    pub bracket_count: usize,
    /// Count of ` characters (code spans, code blocks)
    pub backtick_count: usize,
    /// Count of < characters (HTML tags, autolinks)
    pub lt_count: usize,
    /// Count of ! characters (images)
    pub exclamation_count: usize,
    /// Count of newline characters
    pub newline_count: usize,
}
294
/// Pre-parsed HTML tag information.
///
/// Instances are produced on demand and cached by `LintContext::html_tags`.
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Tag name (e.g., "div", "img", "br")
    pub tag_name: String,
    /// Whether it's a closing tag (`</tag>`)
    pub is_closing: bool,
    /// Whether it's self-closing (`<tag />`)
    pub is_self_closing: bool,
    /// Raw tag content
    pub raw_content: String,
}
317
/// Pre-parsed emphasis span information.
///
/// Instances are produced on demand and cached by `LintContext::emphasis_spans`.
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Type of emphasis ('*' or '_')
    pub marker: char,
    /// Number of markers (1 for italic, 2 for bold, 3+ for bold+italic)
    pub marker_count: usize,
    /// Content inside the emphasis
    pub content: String,
}
338
/// Pre-parsed table row information.
///
/// Instances are produced on demand and cached by `LintContext::table_rows`.
#[derive(Debug, Clone)]
pub struct TableRow {
    /// Line number (1-indexed)
    pub line: usize,
    /// Whether this is a separator row (contains only |, -, :, and spaces)
    pub is_separator: bool,
    /// Number of columns (pipe-separated cells)
    pub column_count: usize,
    /// Alignment info from separator row, one entry per column
    pub column_alignments: Vec<String>, // each entry is "left", "center", "right" or "none"
}
351
/// Pre-parsed bare URL information (not in links).
///
/// Instances are produced on demand and cached by `LintContext::bare_urls`.
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// The URL string
    pub url: String,
    /// Type of URL ("http", "https", "ftp", "email")
    pub url_type: String,
}
370
/// Shared, pre-parsed view of one Markdown document.
///
/// `new` eagerly computes line information, code blocks, code spans, links,
/// images, reference definitions, list blocks and character frequencies.
/// HTML tags, emphasis spans, table rows, bare URLs and the AST are computed
/// on first access and memoized behind a `Mutex`, which is why the accessors
/// take `&self`.
pub struct LintContext<'a> {
    pub content: &'a str,                 // The document text being linted
    pub line_offsets: Vec<usize>,         // Byte offset of the start of each line (first entry is 0)
    pub code_blocks: Vec<(usize, usize)>, // Cached code block ranges (not including inline code spans)
    pub lines: Vec<LineInfo>,             // Pre-computed line information
    pub links: Vec<ParsedLink>,           // Pre-parsed links
    pub images: Vec<ParsedImage>,         // Pre-parsed images
    pub reference_defs: Vec<ReferenceDef>, // Reference definitions
    code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>, // Inline code spans; already populated by `new`
    pub list_blocks: Vec<ListBlock>,      // Pre-parsed list blocks
    pub char_frequency: CharFrequency,    // Character frequency analysis
    html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>, // Lazy-loaded HTML tags
    emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>, // Lazy-loaded emphasis spans
    table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>, // Lazy-loaded table rows
    bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>, // Lazy-loaded bare URLs
    ast_cache: Mutex<Option<Arc<Node>>>,  // Lazy-loaded AST
    pub flavor: MarkdownFlavor,           // Markdown flavor being used
}
389
390impl<'a> LintContext<'a> {
391    pub fn new(content: &'a str, flavor: MarkdownFlavor) -> Self {
392        let mut line_offsets = vec![0];
393        for (i, c) in content.char_indices() {
394            if c == '\n' {
395                line_offsets.push(i + 1);
396            }
397        }
398
399        // Detect code blocks once and cache them
400        let code_blocks = CodeBlockUtils::detect_code_blocks(content);
401
402        // Pre-compute line information
403        let mut lines = Self::compute_line_info(content, &line_offsets, &code_blocks, flavor);
404
405        // Parse code spans early so we can exclude them from link/image parsing
406        let ast = get_cached_ast(content);
407        let code_spans = Self::parse_code_spans(content, &lines, &ast);
408
409        // Parse links, images, references, and list blocks
410        let links = Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor);
411        let images = Self::parse_images(content, &lines, &code_blocks, &code_spans);
412        let reference_defs = Self::parse_reference_defs(content, &lines);
413        // Use line-by-line list parsing for MD032 compatibility
414        // TODO: Consider using AST-based parsing in the future when MD032 is updated
415        let list_blocks = Self::parse_list_blocks(&lines);
416
417        // Detect HTML blocks
418        Self::detect_html_blocks(&mut lines);
419
420        // Compute character frequency for fast content analysis
421        let char_frequency = Self::compute_char_frequency(content);
422
423        Self {
424            content,
425            line_offsets,
426            code_blocks,
427            lines,
428            links,
429            images,
430            reference_defs,
431            code_spans_cache: Mutex::new(Some(Arc::new(code_spans))),
432            list_blocks,
433            char_frequency,
434            html_tags_cache: Mutex::new(None),
435            emphasis_spans_cache: Mutex::new(None),
436            table_rows_cache: Mutex::new(None),
437            bare_urls_cache: Mutex::new(None),
438            ast_cache: Mutex::new(None),
439            flavor,
440        }
441    }
442
443    /// Get AST - uses global cache for deduplication
444    pub fn get_ast(&self) -> Arc<Node> {
445        let mut cache = self.ast_cache.lock().unwrap();
446
447        if cache.is_none() {
448            // Use global AST cache to avoid duplicate parsing
449            // MarkdownAst is just a type alias for Node, so no conversion needed
450            *cache = Some(get_cached_ast(self.content));
451        }
452
453        cache.as_ref().unwrap().clone()
454    }
455
456    /// Get code spans - computed lazily on first access
457    pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
458        let mut cache = self.code_spans_cache.lock().unwrap();
459
460        // Check if we need to compute code spans
461        if cache.is_none() {
462            let ast = self.get_ast();
463            let code_spans = Self::parse_code_spans(self.content, &self.lines, &ast);
464            *cache = Some(Arc::new(code_spans));
465        }
466
467        // Return a reference to the cached code spans
468        cache.as_ref().unwrap().clone()
469    }
470
471    /// Get HTML tags - computed lazily on first access
472    pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
473        let mut cache = self.html_tags_cache.lock().unwrap();
474
475        if cache.is_none() {
476            let html_tags = Self::parse_html_tags(self.content, &self.lines, &self.code_blocks);
477            *cache = Some(Arc::new(html_tags));
478        }
479
480        cache.as_ref().unwrap().clone()
481    }
482
483    /// Get emphasis spans - computed lazily on first access
484    pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
485        let mut cache = self.emphasis_spans_cache.lock().unwrap();
486
487        if cache.is_none() {
488            let emphasis_spans = Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks);
489            *cache = Some(Arc::new(emphasis_spans));
490        }
491
492        cache.as_ref().unwrap().clone()
493    }
494
495    /// Get table rows - computed lazily on first access
496    pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
497        let mut cache = self.table_rows_cache.lock().unwrap();
498
499        if cache.is_none() {
500            let table_rows = Self::parse_table_rows(&self.lines);
501            *cache = Some(Arc::new(table_rows));
502        }
503
504        cache.as_ref().unwrap().clone()
505    }
506
507    /// Get bare URLs - computed lazily on first access
508    pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
509        let mut cache = self.bare_urls_cache.lock().unwrap();
510
511        if cache.is_none() {
512            let bare_urls = Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks);
513            *cache = Some(Arc::new(bare_urls));
514        }
515
516        cache.as_ref().unwrap().clone()
517    }
518
519    /// Map a byte offset to (line, column)
520    pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
521        match self.line_offsets.binary_search(&offset) {
522            Ok(line) => (line + 1, 1),
523            Err(line) => {
524                let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
525                (line, offset - line_start + 1)
526            }
527        }
528    }
529
530    /// Check if a position is within a code block or code span
531    pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
532        // Check code blocks first
533        if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
534            return true;
535        }
536
537        // Check inline code spans (lazy load if needed)
538        self.code_spans()
539            .iter()
540            .any(|span| pos >= span.byte_offset && pos < span.byte_end)
541    }
542
543    /// Get line information by line number (1-indexed)
544    pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
545        if line_num > 0 {
546            self.lines.get(line_num - 1)
547        } else {
548            None
549        }
550    }
551
552    /// Get byte offset for a line number (1-indexed)
553    pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
554        self.line_info(line_num).map(|info| info.byte_offset)
555    }
556
557    /// Get URL for a reference link/image by its ID
558    pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
559        let normalized_id = ref_id.to_lowercase();
560        self.reference_defs
561            .iter()
562            .find(|def| def.id == normalized_id)
563            .map(|def| def.url.as_str())
564    }
565
566    /// Get links on a specific line
567    pub fn links_on_line(&self, line_num: usize) -> Vec<&ParsedLink> {
568        self.links.iter().filter(|link| link.line == line_num).collect()
569    }
570
571    /// Get images on a specific line
572    pub fn images_on_line(&self, line_num: usize) -> Vec<&ParsedImage> {
573        self.images.iter().filter(|img| img.line == line_num).collect()
574    }
575
576    /// Check if a line is part of a list block
577    pub fn is_in_list_block(&self, line_num: usize) -> bool {
578        self.list_blocks
579            .iter()
580            .any(|block| line_num >= block.start_line && line_num <= block.end_line)
581    }
582
583    /// Get the list block containing a specific line
584    pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
585        self.list_blocks
586            .iter()
587            .find(|block| line_num >= block.start_line && line_num <= block.end_line)
588    }
589
590    // Compatibility methods for DocumentStructure migration
591
592    /// Check if a line is within a code block
593    pub fn is_in_code_block(&self, line_num: usize) -> bool {
594        if line_num == 0 || line_num > self.lines.len() {
595            return false;
596        }
597        self.lines[line_num - 1].in_code_block
598    }
599
600    /// Check if a line is within front matter
601    pub fn is_in_front_matter(&self, line_num: usize) -> bool {
602        if line_num == 0 || line_num > self.lines.len() {
603            return false;
604        }
605        self.lines[line_num - 1].in_front_matter
606    }
607
608    /// Check if a line is within an HTML block
609    pub fn is_in_html_block(&self, line_num: usize) -> bool {
610        if line_num == 0 || line_num > self.lines.len() {
611            return false;
612        }
613        self.lines[line_num - 1].in_html_block
614    }
615
616    /// Check if a line and column is within a code span
617    pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
618        if line_num == 0 || line_num > self.lines.len() {
619            return false;
620        }
621
622        // Use the code spans cache to check
623        // Note: col is 1-indexed from caller, but span.start_col and span.end_col are 0-indexed
624        // Convert col to 0-indexed for comparison
625        let col_0indexed = if col > 0 { col - 1 } else { 0 };
626        let code_spans = self.code_spans();
627        code_spans
628            .iter()
629            .any(|span| span.line == line_num && col_0indexed >= span.start_col && col_0indexed < span.end_col)
630    }
631
632    /// Check if content has any instances of a specific character (fast)
633    pub fn has_char(&self, ch: char) -> bool {
634        match ch {
635            '#' => self.char_frequency.hash_count > 0,
636            '*' => self.char_frequency.asterisk_count > 0,
637            '_' => self.char_frequency.underscore_count > 0,
638            '-' => self.char_frequency.hyphen_count > 0,
639            '+' => self.char_frequency.plus_count > 0,
640            '>' => self.char_frequency.gt_count > 0,
641            '|' => self.char_frequency.pipe_count > 0,
642            '[' => self.char_frequency.bracket_count > 0,
643            '`' => self.char_frequency.backtick_count > 0,
644            '<' => self.char_frequency.lt_count > 0,
645            '!' => self.char_frequency.exclamation_count > 0,
646            '\n' => self.char_frequency.newline_count > 0,
647            _ => self.content.contains(ch), // Fallback for other characters
648        }
649    }
650
651    /// Get count of a specific character (fast)
652    pub fn char_count(&self, ch: char) -> usize {
653        match ch {
654            '#' => self.char_frequency.hash_count,
655            '*' => self.char_frequency.asterisk_count,
656            '_' => self.char_frequency.underscore_count,
657            '-' => self.char_frequency.hyphen_count,
658            '+' => self.char_frequency.plus_count,
659            '>' => self.char_frequency.gt_count,
660            '|' => self.char_frequency.pipe_count,
661            '[' => self.char_frequency.bracket_count,
662            '`' => self.char_frequency.backtick_count,
663            '<' => self.char_frequency.lt_count,
664            '!' => self.char_frequency.exclamation_count,
665            '\n' => self.char_frequency.newline_count,
666            _ => self.content.matches(ch).count(), // Fallback for other characters
667        }
668    }
669
670    /// Check if content likely contains headings (fast)
671    pub fn likely_has_headings(&self) -> bool {
672        self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 // Potential setext underlines
673    }
674
675    /// Check if content likely contains lists (fast)
676    pub fn likely_has_lists(&self) -> bool {
677        self.char_frequency.asterisk_count > 0
678            || self.char_frequency.hyphen_count > 0
679            || self.char_frequency.plus_count > 0
680    }
681
682    /// Check if content likely contains emphasis (fast)
683    pub fn likely_has_emphasis(&self) -> bool {
684        self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
685    }
686
687    /// Check if content likely contains tables (fast)
688    pub fn likely_has_tables(&self) -> bool {
689        self.char_frequency.pipe_count > 2
690    }
691
692    /// Check if content likely contains blockquotes (fast)
693    pub fn likely_has_blockquotes(&self) -> bool {
694        self.char_frequency.gt_count > 0
695    }
696
697    /// Check if content likely contains code (fast)
698    pub fn likely_has_code(&self) -> bool {
699        self.char_frequency.backtick_count > 0
700    }
701
702    /// Check if content likely contains links or images (fast)
703    pub fn likely_has_links_or_images(&self) -> bool {
704        self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
705    }
706
707    /// Check if content likely contains HTML (fast)
708    pub fn likely_has_html(&self) -> bool {
709        self.char_frequency.lt_count > 0
710    }
711
712    /// Get HTML tags on a specific line
713    pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
714        self.html_tags()
715            .iter()
716            .filter(|tag| tag.line == line_num)
717            .cloned()
718            .collect()
719    }
720
721    /// Get emphasis spans on a specific line
722    pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
723        self.emphasis_spans()
724            .iter()
725            .filter(|span| span.line == line_num)
726            .cloned()
727            .collect()
728    }
729
730    /// Get table rows on a specific line
731    pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
732        self.table_rows()
733            .iter()
734            .filter(|row| row.line == line_num)
735            .cloned()
736            .collect()
737    }
738
739    /// Get bare URLs on a specific line
740    pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
741        self.bare_urls()
742            .iter()
743            .filter(|url| url.line == line_num)
744            .cloned()
745            .collect()
746    }
747
748    /// Parse all links in the content
749    fn parse_links(
750        content: &str,
751        lines: &[LineInfo],
752        code_blocks: &[(usize, usize)],
753        code_spans: &[CodeSpan],
754        flavor: MarkdownFlavor,
755    ) -> Vec<ParsedLink> {
756        use crate::utils::skip_context::{is_in_html_comment, is_mkdocs_snippet_line};
757
758        // Pre-size based on a heuristic: most markdown files have relatively few links
759        let mut links = Vec::with_capacity(content.len() / 500); // ~1 link per 500 chars
760
761        // Parse links across the entire content, not line by line
762        for cap in LINK_PATTERN.captures_iter(content) {
763            let full_match = cap.get(0).unwrap();
764            let match_start = full_match.start();
765            let match_end = full_match.end();
766
767            // Skip if the opening bracket is escaped (preceded by \)
768            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
769                continue;
770            }
771
772            // Skip if this is actually an image (preceded by !)
773            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
774                continue;
775            }
776
777            // Skip if in code block
778            if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
779                continue;
780            }
781
782            // Skip if in code span
783            if code_spans
784                .iter()
785                .any(|span| match_start >= span.byte_offset && match_start < span.byte_end)
786            {
787                continue;
788            }
789
790            // Skip if in HTML comment
791            if is_in_html_comment(content, match_start) {
792                continue;
793            }
794
795            // Skip if this link is on a MkDocs snippet line
796            // Find which line this link is on
797            let line_idx = lines
798                .iter()
799                .position(|line| {
800                    match_start >= line.byte_offset && (match_start < line.byte_offset + line.content.len() + 1)
801                })
802                .unwrap_or(0);
803
804            if is_mkdocs_snippet_line(&lines[line_idx].content, flavor) {
805                continue;
806            }
807
808            // Find which line this link starts on
809            let mut line_num = 1;
810            let mut col_start = match_start;
811            for (idx, line_info) in lines.iter().enumerate() {
812                if match_start >= line_info.byte_offset {
813                    line_num = idx + 1;
814                    col_start = match_start - line_info.byte_offset;
815                } else {
816                    break;
817                }
818            }
819
820            // Find which line this link ends on (and calculate column on that line)
821            let mut end_line_num = 1;
822            let mut col_end = match_end;
823            for (idx, line_info) in lines.iter().enumerate() {
824                if match_end > line_info.byte_offset {
825                    end_line_num = idx + 1;
826                    col_end = match_end - line_info.byte_offset;
827                } else {
828                    break;
829                }
830            }
831
832            // For single-line links, use the same approach as before
833            if line_num == end_line_num {
834                // col_end is already correct
835            } else {
836                // For multi-line links, col_end represents the column on the ending line
837                // which is what we want
838            }
839
840            let text = cap.get(1).map_or("", |m| m.as_str()).to_string();
841
842            // URL can be in group 2 (angle brackets) or group 3 (bare)
843            let inline_url = cap.get(2).or_else(|| cap.get(3));
844
845            if let Some(url_match) = inline_url {
846                // Inline link
847                links.push(ParsedLink {
848                    line: line_num,
849                    start_col: col_start,
850                    end_col: col_end,
851                    byte_offset: match_start,
852                    byte_end: match_end,
853                    text,
854                    url: url_match.as_str().to_string(),
855                    is_reference: false,
856                    reference_id: None,
857                });
858            } else if let Some(ref_id) = cap.get(6) {
859                // Reference link
860                let ref_id_str = ref_id.as_str();
861                let normalized_ref = if ref_id_str.is_empty() {
862                    text.to_lowercase() // Implicit reference
863                } else {
864                    ref_id_str.to_lowercase()
865                };
866
867                links.push(ParsedLink {
868                    line: line_num,
869                    start_col: col_start,
870                    end_col: col_end,
871                    byte_offset: match_start,
872                    byte_end: match_end,
873                    text,
874                    url: String::new(), // Will be resolved with reference_defs
875                    is_reference: true,
876                    reference_id: Some(normalized_ref),
877                });
878            }
879        }
880
881        links
882    }
883
884    /// Parse all images in the content
885    fn parse_images(
886        content: &str,
887        lines: &[LineInfo],
888        code_blocks: &[(usize, usize)],
889        code_spans: &[CodeSpan],
890    ) -> Vec<ParsedImage> {
891        use crate::utils::skip_context::is_in_html_comment;
892
893        // Pre-size based on a heuristic: images are less common than links
894        let mut images = Vec::with_capacity(content.len() / 1000); // ~1 image per 1000 chars
895
896        // Parse images across the entire content, not line by line
897        for cap in IMAGE_PATTERN.captures_iter(content) {
898            let full_match = cap.get(0).unwrap();
899            let match_start = full_match.start();
900            let match_end = full_match.end();
901
902            // Skip if the ! is escaped (preceded by \)
903            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
904                continue;
905            }
906
907            // Skip if in code block
908            if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
909                continue;
910            }
911
912            // Skip if in code span
913            if code_spans
914                .iter()
915                .any(|span| match_start >= span.byte_offset && match_start < span.byte_end)
916            {
917                continue;
918            }
919
920            // Skip if in HTML comment
921            if is_in_html_comment(content, match_start) {
922                continue;
923            }
924
925            // Find which line this image starts on
926            let mut line_num = 1;
927            let mut col_start = match_start;
928            for (idx, line_info) in lines.iter().enumerate() {
929                if match_start >= line_info.byte_offset {
930                    line_num = idx + 1;
931                    col_start = match_start - line_info.byte_offset;
932                } else {
933                    break;
934                }
935            }
936
937            // Find which line this image ends on (and calculate column on that line)
938            let mut end_line_num = 1;
939            let mut col_end = match_end;
940            for (idx, line_info) in lines.iter().enumerate() {
941                if match_end > line_info.byte_offset {
942                    end_line_num = idx + 1;
943                    col_end = match_end - line_info.byte_offset;
944                } else {
945                    break;
946                }
947            }
948
949            // For single-line images, use the same approach as before
950            if line_num == end_line_num {
951                // col_end is already correct
952            } else {
953                // For multi-line images, col_end represents the column on the ending line
954                // which is what we want
955            }
956
957            let alt_text = cap.get(1).map_or("", |m| m.as_str()).to_string();
958
959            // URL can be in group 2 (angle brackets) or group 3 (bare)
960            let inline_url = cap.get(2).or_else(|| cap.get(3));
961
962            if let Some(url_match) = inline_url {
963                // Inline image
964                images.push(ParsedImage {
965                    line: line_num,
966                    start_col: col_start,
967                    end_col: col_end,
968                    byte_offset: match_start,
969                    byte_end: match_end,
970                    alt_text,
971                    url: url_match.as_str().to_string(),
972                    is_reference: false,
973                    reference_id: None,
974                });
975            } else if let Some(ref_id) = cap.get(6) {
976                // Reference image
977                let ref_id_str = ref_id.as_str();
978                let normalized_ref = if ref_id_str.is_empty() {
979                    alt_text.to_lowercase() // Implicit reference
980                } else {
981                    ref_id_str.to_lowercase()
982                };
983
984                images.push(ParsedImage {
985                    line: line_num,
986                    start_col: col_start,
987                    end_col: col_end,
988                    byte_offset: match_start,
989                    byte_end: match_end,
990                    alt_text,
991                    url: String::new(), // Will be resolved with reference_defs
992                    is_reference: true,
993                    reference_id: Some(normalized_ref),
994                });
995            }
996        }
997
998        images
999    }
1000
1001    /// Parse reference definitions
1002    fn parse_reference_defs(_content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
1003        // Pre-size based on lines count as reference definitions are line-based
1004        let mut refs = Vec::with_capacity(lines.len() / 20); // ~1 ref per 20 lines
1005
1006        for (line_idx, line_info) in lines.iter().enumerate() {
1007            // Skip lines in code blocks
1008            if line_info.in_code_block {
1009                continue;
1010            }
1011
1012            let line = &line_info.content;
1013            let line_num = line_idx + 1;
1014
1015            if let Some(cap) = REF_DEF_PATTERN.captures(line) {
1016                let id = cap.get(1).unwrap().as_str().to_lowercase();
1017                let url = cap.get(2).unwrap().as_str().to_string();
1018                let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
1019
1020                refs.push(ReferenceDef {
1021                    line: line_num,
1022                    id,
1023                    url,
1024                    title,
1025                });
1026            }
1027        }
1028
1029        refs
1030    }
1031
1032    /// Pre-compute line information
1033    fn compute_line_info(
1034        content: &str,
1035        line_offsets: &[usize],
1036        code_blocks: &[(usize, usize)],
1037        flavor: MarkdownFlavor,
1038    ) -> Vec<LineInfo> {
1039        lazy_static! {
1040            // Regex for list detection - allow any whitespace including no space (to catch malformed lists)
1041            static ref UNORDERED_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)([-*+])([ \t]*)(.*)").unwrap();
1042            static ref ORDERED_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(\d+)([.)])([ \t]*)(.*)").unwrap();
1043
1044            // Regex for blockquote prefix
1045            static ref BLOCKQUOTE_REGEX: regex::Regex = regex::Regex::new(r"^(\s*>\s*)(.*)").unwrap();
1046
1047            // Regex for heading detection
1048            static ref ATX_HEADING_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap();
1049            static ref SETEXT_UNDERLINE_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap();
1050
1051            // Regex for blockquote detection
1052            static ref BLOCKQUOTE_REGEX_FULL: regex::Regex = regex::Regex::new(r"^(\s*)(>+)(\s*)(.*)$").unwrap();
1053        }
1054
1055        let content_lines: Vec<&str> = content.lines().collect();
1056        let mut lines = Vec::with_capacity(content_lines.len());
1057
1058        // Detect front matter boundaries FIRST, before any other parsing
1059        // Use FrontMatterUtils to detect all types of front matter (YAML, TOML, JSON, malformed)
1060        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1061
1062        for (i, line) in content_lines.iter().enumerate() {
1063            let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
1064            let indent = line.len() - line.trim_start().len();
1065            // For blank detection, consider blockquote context
1066            let is_blank = if let Some(caps) = BLOCKQUOTE_REGEX.captures(line) {
1067                // In blockquote context, check if content after prefix is blank
1068                let after_prefix = caps.get(2).map_or("", |m| m.as_str());
1069                after_prefix.trim().is_empty()
1070            } else {
1071                line.trim().is_empty()
1072            };
1073            // Check if this line is inside a code block (not inline code span)
1074            // We only want to check for fenced/indented code blocks, not inline code
1075            let in_code_block = code_blocks.iter().any(|&(start, end)| {
1076                // Only consider ranges that span multiple lines (code blocks)
1077                // Inline code spans are typically on a single line
1078
1079                // Ensure we're at valid UTF-8 boundaries
1080                let safe_start = if start > 0 && !content.is_char_boundary(start) {
1081                    // Find the nearest valid boundary before start
1082                    let mut boundary = start;
1083                    while boundary > 0 && !content.is_char_boundary(boundary) {
1084                        boundary -= 1;
1085                    }
1086                    boundary
1087                } else {
1088                    start
1089                };
1090
1091                let safe_end = if end < content.len() && !content.is_char_boundary(end) {
1092                    // Find the nearest valid boundary after end
1093                    let mut boundary = end;
1094                    while boundary < content.len() && !content.is_char_boundary(boundary) {
1095                        boundary += 1;
1096                    }
1097                    boundary
1098                } else {
1099                    end.min(content.len())
1100                };
1101
1102                let block_content = &content[safe_start..safe_end];
1103                let is_multiline = block_content.contains('\n');
1104                let is_fenced = block_content.starts_with("```") || block_content.starts_with("~~~");
1105                let is_indented = !is_fenced
1106                    && block_content
1107                        .lines()
1108                        .all(|l| l.starts_with("    ") || l.starts_with("\t") || l.trim().is_empty());
1109
1110                byte_offset >= start && byte_offset < end && (is_multiline || is_fenced || is_indented)
1111            });
1112
1113            // Detect list items (skip if in frontmatter, in mkdocstrings block, or in HTML comment)
1114            let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
1115                && crate::utils::mkdocstrings_refs::is_within_autodoc_block(content, byte_offset);
1116            let in_html_comment = crate::utils::skip_context::is_in_html_comment(content, byte_offset);
1117            let list_item = if !(in_code_block
1118                || is_blank
1119                || in_mkdocstrings
1120                || in_html_comment
1121                || (front_matter_end > 0 && i < front_matter_end))
1122            {
1123                // Strip blockquote prefix if present for list detection
1124                let (line_for_list_check, blockquote_prefix_len) = if let Some(caps) = BLOCKQUOTE_REGEX.captures(line) {
1125                    let prefix = caps.get(1).unwrap().as_str();
1126                    let content = caps.get(2).unwrap().as_str();
1127                    (content, prefix.len())
1128                } else {
1129                    (&**line, 0)
1130                };
1131
1132                if let Some(caps) = UNORDERED_REGEX.captures(line_for_list_check) {
1133                    let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1134                    let marker = caps.get(2).map_or("", |m| m.as_str());
1135                    let spacing = caps.get(3).map_or("", |m| m.as_str());
1136                    let _content = caps.get(4).map_or("", |m| m.as_str());
1137                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1138                    let content_column = marker_column + marker.len() + spacing.len();
1139
1140                    // According to CommonMark spec, unordered list items MUST have at least one space
1141                    // after the marker (-, *, or +). Without a space, it's not a list item.
1142                    // This also naturally handles cases like:
1143                    // - *emphasis* (not a list)
1144                    // - **bold** (not a list)
1145                    // - --- (horizontal rule, not a list)
1146                    if spacing.is_empty() {
1147                        None
1148                    } else {
1149                        Some(ListItemInfo {
1150                            marker: marker.to_string(),
1151                            is_ordered: false,
1152                            number: None,
1153                            marker_column,
1154                            content_column,
1155                        })
1156                    }
1157                } else if let Some(caps) = ORDERED_REGEX.captures(line_for_list_check) {
1158                    let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1159                    let number_str = caps.get(2).map_or("", |m| m.as_str());
1160                    let delimiter = caps.get(3).map_or("", |m| m.as_str());
1161                    let spacing = caps.get(4).map_or("", |m| m.as_str());
1162                    let _content = caps.get(5).map_or("", |m| m.as_str());
1163                    let marker = format!("{number_str}{delimiter}");
1164                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1165                    let content_column = marker_column + marker.len() + spacing.len();
1166
1167                    // According to CommonMark spec, ordered list items MUST have at least one space
1168                    // after the marker (period or parenthesis). Without a space, it's not a list item.
1169                    if spacing.is_empty() {
1170                        None
1171                    } else {
1172                        Some(ListItemInfo {
1173                            marker,
1174                            is_ordered: true,
1175                            number: number_str.parse().ok(),
1176                            marker_column,
1177                            content_column,
1178                        })
1179                    }
1180                } else {
1181                    None
1182                }
1183            } else {
1184                None
1185            };
1186
1187            lines.push(LineInfo {
1188                content: line.to_string(),
1189                byte_offset,
1190                indent,
1191                is_blank,
1192                in_code_block,
1193                in_front_matter: front_matter_end > 0 && i < front_matter_end,
1194                in_html_block: false, // Will be populated after line creation
1195                in_html_comment,
1196                list_item,
1197                heading: None,    // Will be populated in second pass for Setext headings
1198                blockquote: None, // Will be populated after line creation
1199                in_mkdocstrings,
1200            });
1201        }
1202
1203        // Second pass: detect headings (including Setext which needs look-ahead) and blockquotes
1204        for i in 0..content_lines.len() {
1205            if lines[i].in_code_block {
1206                continue;
1207            }
1208
1209            // Skip lines in front matter
1210            if front_matter_end > 0 && i < front_matter_end {
1211                continue;
1212            }
1213
1214            let line = content_lines[i];
1215
1216            // Check for blockquotes (even on blank lines within blockquotes)
1217            if let Some(caps) = BLOCKQUOTE_REGEX_FULL.captures(line) {
1218                let indent_str = caps.get(1).map_or("", |m| m.as_str());
1219                let markers = caps.get(2).map_or("", |m| m.as_str());
1220                let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1221                let content = caps.get(4).map_or("", |m| m.as_str());
1222
1223                let nesting_level = markers.chars().filter(|&c| c == '>').count();
1224                let marker_column = indent_str.len();
1225
1226                // Build the prefix (indentation + markers + space)
1227                let prefix = format!("{indent_str}{markers}{spaces_after}");
1228
1229                // Check for various blockquote issues
1230                let has_no_space = spaces_after.is_empty() && !content.is_empty();
1231                // Consider tabs as multiple spaces, or actual multiple spaces
1232                let has_multiple_spaces = spaces_after.len() > 1 || spaces_after.contains('\t');
1233
1234                // Check if needs MD028 fix (empty blockquote line without proper spacing)
1235                // MD028 flags empty blockquote lines that don't have a single space after the marker
1236                // Lines like "> " or ">> " are already correct and don't need fixing
1237                let needs_md028_fix = content.is_empty() && spaces_after.is_empty();
1238
1239                lines[i].blockquote = Some(BlockquoteInfo {
1240                    nesting_level,
1241                    indent: indent_str.to_string(),
1242                    marker_column,
1243                    prefix,
1244                    content: content.to_string(),
1245                    has_no_space_after_marker: has_no_space,
1246                    has_multiple_spaces_after_marker: has_multiple_spaces,
1247                    needs_md028_fix,
1248                });
1249            }
1250
1251            // Skip heading detection for blank lines
1252            if lines[i].is_blank {
1253                continue;
1254            }
1255
1256            // Check for ATX headings (but skip MkDocs snippet lines)
1257            // In MkDocs flavor, lines like "# -8<- [start:name]" are snippet markers, not headings
1258            let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
1259                crate::utils::mkdocs_snippets::is_snippet_section_start(line)
1260                    || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
1261            } else {
1262                false
1263            };
1264
1265            if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1266                // Skip headings inside HTML comments
1267                if crate::utils::skip_context::is_in_html_comment(content, lines[i].byte_offset) {
1268                    continue;
1269                }
1270                let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1271                let hashes = caps.get(2).map_or("", |m| m.as_str());
1272                let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1273                let rest = caps.get(4).map_or("", |m| m.as_str());
1274
1275                let level = hashes.len() as u8;
1276                let marker_column = leading_spaces.len();
1277
1278                // Check for closing sequence, but handle custom IDs that might come after
1279                let (text, has_closing, closing_seq) = {
1280                    // First check if there's a custom ID at the end
1281                    let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1282                        // Check if this looks like a valid custom ID (ends with })
1283                        if rest[id_start..].trim_end().ends_with('}') {
1284                            // Split off the custom ID
1285                            (&rest[..id_start], &rest[id_start..])
1286                        } else {
1287                            (rest, "")
1288                        }
1289                    } else {
1290                        (rest, "")
1291                    };
1292
1293                    // Now look for closing hashes in the part before the custom ID
1294                    let trimmed_rest = rest_without_id.trim_end();
1295                    if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1296                        // Look for the start of the hash sequence
1297                        let mut start_of_hashes = last_hash_pos;
1298                        while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1299                            start_of_hashes -= 1;
1300                        }
1301
1302                        // Check if there's at least one space before the closing hashes
1303                        let has_space_before = start_of_hashes == 0
1304                            || trimmed_rest
1305                                .chars()
1306                                .nth(start_of_hashes - 1)
1307                                .is_some_and(|c| c.is_whitespace());
1308
1309                        // Check if this is a valid closing sequence (all hashes to end of trimmed part)
1310                        let potential_closing = &trimmed_rest[start_of_hashes..];
1311                        let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1312
1313                        if is_all_hashes && has_space_before {
1314                            // This is a closing sequence
1315                            let closing_hashes = potential_closing.to_string();
1316                            // The text is everything before the closing hashes
1317                            // Don't include the custom ID here - it will be extracted later
1318                            let text_part = if !custom_id_part.is_empty() {
1319                                // If we have a custom ID, append it back to get the full rest
1320                                // This allows the extract_header_id function to handle it properly
1321                                format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1322                            } else {
1323                                rest_without_id[..start_of_hashes].trim_end().to_string()
1324                            };
1325                            (text_part, true, closing_hashes)
1326                        } else {
1327                            // Not a valid closing sequence, return the full content
1328                            (rest.to_string(), false, String::new())
1329                        }
1330                    } else {
1331                        // No hashes found, return the full content
1332                        (rest.to_string(), false, String::new())
1333                    }
1334                };
1335
1336                let content_column = marker_column + hashes.len() + spaces_after.len();
1337
1338                // Extract custom header ID if present
1339                let raw_text = text.trim().to_string();
1340                let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1341
1342                // If no custom ID was found on the header line, check the next line for standalone attr-list
1343                if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1344                    let next_line = content_lines[i + 1];
1345                    if !lines[i + 1].in_code_block
1346                        && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1347                        && let Some(next_line_id) =
1348                            crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1349                    {
1350                        custom_id = Some(next_line_id);
1351                    }
1352                }
1353
1354                lines[i].heading = Some(HeadingInfo {
1355                    level,
1356                    style: HeadingStyle::ATX,
1357                    marker: hashes.to_string(),
1358                    marker_column,
1359                    content_column,
1360                    text: clean_text,
1361                    custom_id,
1362                    raw_text,
1363                    has_closing_sequence: has_closing,
1364                    closing_sequence: closing_seq,
1365                });
1366            }
1367            // Check for Setext headings (need to look at next line)
1368            else if i + 1 < content_lines.len() {
1369                let next_line = content_lines[i + 1];
1370                if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
1371                    // Skip if next line is front matter delimiter
1372                    if front_matter_end > 0 && i < front_matter_end {
1373                        continue;
1374                    }
1375
1376                    // Skip Setext headings inside HTML comments
1377                    if crate::utils::skip_context::is_in_html_comment(content, lines[i].byte_offset) {
1378                        continue;
1379                    }
1380
1381                    let underline = next_line.trim();
1382
1383                    // Skip if the underline looks like YAML delimiter (exactly 3 or more dashes)
1384                    // YAML uses exactly `---` while Setext headings typically use longer underlines
1385                    if underline == "---" {
1386                        continue;
1387                    }
1388
1389                    // Skip if the current line looks like YAML key-value syntax
1390                    let current_line_trimmed = line.trim();
1391                    if current_line_trimmed.contains(':')
1392                        && !current_line_trimmed.starts_with('#')
1393                        && !current_line_trimmed.contains('[')
1394                        && !current_line_trimmed.contains("](")
1395                    {
1396                        // This looks like "key: value" which suggests YAML, not a heading
1397                        continue;
1398                    }
1399
1400                    let level = if underline.starts_with('=') { 1 } else { 2 };
1401                    let style = if level == 1 {
1402                        HeadingStyle::Setext1
1403                    } else {
1404                        HeadingStyle::Setext2
1405                    };
1406
1407                    // Extract custom header ID if present
1408                    let raw_text = line.trim().to_string();
1409                    let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1410
1411                    // If no custom ID was found on the header line, check the line after underline for standalone attr-list
1412                    if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
1413                        let attr_line = content_lines[i + 2];
1414                        if !lines[i + 2].in_code_block
1415                            && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
1416                            && let Some(attr_line_id) =
1417                                crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
1418                        {
1419                            custom_id = Some(attr_line_id);
1420                        }
1421                    }
1422
1423                    lines[i].heading = Some(HeadingInfo {
1424                        level,
1425                        style,
1426                        marker: underline.to_string(),
1427                        marker_column: next_line.len() - next_line.trim_start().len(),
1428                        content_column: lines[i].indent,
1429                        text: clean_text,
1430                        custom_id,
1431                        raw_text,
1432                        has_closing_sequence: false,
1433                        closing_sequence: String::new(),
1434                    });
1435                }
1436            }
1437        }
1438
1439        lines
1440    }
1441
1442    /// Detect HTML blocks in the content
1443    fn detect_html_blocks(lines: &mut [LineInfo]) {
1444        // HTML block elements that trigger block context
1445        const BLOCK_ELEMENTS: &[&str] = &[
1446            "address",
1447            "article",
1448            "aside",
1449            "blockquote",
1450            "details",
1451            "dialog",
1452            "dd",
1453            "div",
1454            "dl",
1455            "dt",
1456            "fieldset",
1457            "figcaption",
1458            "figure",
1459            "footer",
1460            "form",
1461            "h1",
1462            "h2",
1463            "h3",
1464            "h4",
1465            "h5",
1466            "h6",
1467            "header",
1468            "hr",
1469            "li",
1470            "main",
1471            "nav",
1472            "ol",
1473            "p",
1474            "pre",
1475            "section",
1476            "table",
1477            "tbody",
1478            "td",
1479            "tfoot",
1480            "th",
1481            "thead",
1482            "tr",
1483            "ul",
1484        ];
1485
1486        let mut i = 0;
1487        while i < lines.len() {
1488            // Skip if already in code block or front matter
1489            if lines[i].in_code_block || lines[i].in_front_matter {
1490                i += 1;
1491                continue;
1492            }
1493
1494            let trimmed = lines[i].content.trim_start();
1495
1496            // Check if line starts with an HTML tag
1497            if trimmed.starts_with('<') && trimmed.len() > 1 {
1498                // Extract tag name safely
1499                let after_bracket = &trimmed[1..];
1500                let is_closing = after_bracket.starts_with('/');
1501                let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
1502
1503                // Extract tag name (stop at space, >, /, or end of string)
1504                let tag_name = tag_start
1505                    .chars()
1506                    .take_while(|c| c.is_ascii_alphabetic() || *c == '-')
1507                    .collect::<String>()
1508                    .to_lowercase();
1509
1510                // Check if it's a block element
1511                if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
1512                    // Mark this line as in HTML block
1513                    lines[i].in_html_block = true;
1514
1515                    // For simplicity, just mark lines until we find a closing tag or reach a blank line
1516                    // This avoids complex nesting logic that might cause infinite loops
1517                    if !is_closing {
1518                        let closing_tag = format!("</{tag_name}>");
1519                        let mut j = i + 1;
1520                        while j < lines.len() && j < i + 100 {
1521                            // Limit search to 100 lines
1522                            // Stop at blank lines
1523                            if lines[j].is_blank {
1524                                break;
1525                            }
1526
1527                            lines[j].in_html_block = true;
1528
1529                            // Check if this line contains the closing tag
1530                            if lines[j].content.contains(&closing_tag) {
1531                                break;
1532                            }
1533                            j += 1;
1534                        }
1535                    }
1536                }
1537            }
1538
1539            i += 1;
1540        }
1541    }
1542
1543    /// Parse all inline code spans in the content using AST
1544    fn parse_code_spans(content: &str, lines: &[LineInfo], ast: &Node) -> Vec<CodeSpan> {
1545        let mut code_spans = Vec::new();
1546
1547        // Quick check - if no backticks, no code spans
1548        if !content.contains('`') {
1549            return code_spans;
1550        }
1551
1552        // Helper function to recursively extract inline code spans from AST nodes
1553        fn extract_code_spans(node: &Node, content: &str, lines: &[LineInfo], spans: &mut Vec<CodeSpan>) {
1554            match node {
1555                Node::InlineCode(inline_code) => {
1556                    if let Some(pos) = &inline_code.position {
1557                        let start_pos = pos.start.offset;
1558                        let end_pos = pos.end.offset;
1559
1560                        // The position includes the backticks, extract the actual content
1561                        let full_span = &content[start_pos..end_pos];
1562                        let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
1563
1564                        // Extract content between backticks, preserving spaces
1565                        let content_start = start_pos + backtick_count;
1566                        let content_end = end_pos - backtick_count;
1567                        let span_content = if content_start < content_end {
1568                            content[content_start..content_end].to_string()
1569                        } else {
1570                            String::new()
1571                        };
1572
1573                        // Find which line this code span starts on
1574                        let mut line_num = 1;
1575                        let mut col_start = start_pos;
1576                        for (idx, line_info) in lines.iter().enumerate() {
1577                            if start_pos >= line_info.byte_offset {
1578                                line_num = idx + 1;
1579                                col_start = start_pos - line_info.byte_offset;
1580                            } else {
1581                                break;
1582                            }
1583                        }
1584
1585                        // Find end column
1586                        let mut col_end = end_pos;
1587                        for line_info in lines.iter() {
1588                            if end_pos > line_info.byte_offset {
1589                                col_end = end_pos - line_info.byte_offset;
1590                            } else {
1591                                break;
1592                            }
1593                        }
1594
1595                        spans.push(CodeSpan {
1596                            line: line_num,
1597                            start_col: col_start,
1598                            end_col: col_end,
1599                            byte_offset: start_pos,
1600                            byte_end: end_pos,
1601                            backtick_count,
1602                            content: span_content,
1603                        });
1604                    }
1605                }
1606                // Recursively process children
1607                Node::Root(root) => {
1608                    for child in &root.children {
1609                        extract_code_spans(child, content, lines, spans);
1610                    }
1611                }
1612                Node::Paragraph(para) => {
1613                    for child in &para.children {
1614                        extract_code_spans(child, content, lines, spans);
1615                    }
1616                }
1617                Node::Heading(heading) => {
1618                    for child in &heading.children {
1619                        extract_code_spans(child, content, lines, spans);
1620                    }
1621                }
1622                Node::List(list) => {
1623                    for child in &list.children {
1624                        extract_code_spans(child, content, lines, spans);
1625                    }
1626                }
1627                Node::ListItem(item) => {
1628                    for child in &item.children {
1629                        extract_code_spans(child, content, lines, spans);
1630                    }
1631                }
1632                Node::Blockquote(blockquote) => {
1633                    for child in &blockquote.children {
1634                        extract_code_spans(child, content, lines, spans);
1635                    }
1636                }
1637                Node::Table(table) => {
1638                    for child in &table.children {
1639                        extract_code_spans(child, content, lines, spans);
1640                    }
1641                }
1642                Node::TableRow(row) => {
1643                    for child in &row.children {
1644                        extract_code_spans(child, content, lines, spans);
1645                    }
1646                }
1647                Node::TableCell(cell) => {
1648                    for child in &cell.children {
1649                        extract_code_spans(child, content, lines, spans);
1650                    }
1651                }
1652                Node::Emphasis(emphasis) => {
1653                    for child in &emphasis.children {
1654                        extract_code_spans(child, content, lines, spans);
1655                    }
1656                }
1657                Node::Strong(strong) => {
1658                    for child in &strong.children {
1659                        extract_code_spans(child, content, lines, spans);
1660                    }
1661                }
1662                Node::Link(link) => {
1663                    for child in &link.children {
1664                        extract_code_spans(child, content, lines, spans);
1665                    }
1666                }
1667                Node::LinkReference(link_ref) => {
1668                    for child in &link_ref.children {
1669                        extract_code_spans(child, content, lines, spans);
1670                    }
1671                }
1672                Node::FootnoteDefinition(footnote) => {
1673                    for child in &footnote.children {
1674                        extract_code_spans(child, content, lines, spans);
1675                    }
1676                }
1677                Node::Delete(delete) => {
1678                    for child in &delete.children {
1679                        extract_code_spans(child, content, lines, spans);
1680                    }
1681                }
1682                // Terminal nodes or nodes without relevant children
1683                Node::Code(_)
1684                | Node::Text(_)
1685                | Node::Html(_)
1686                | Node::Image(_)
1687                | Node::ImageReference(_)
1688                | Node::FootnoteReference(_)
1689                | Node::Break(_)
1690                | Node::ThematicBreak(_)
1691                | Node::Definition(_)
1692                | Node::Yaml(_)
1693                | Node::Toml(_)
1694                | Node::Math(_)
1695                | Node::InlineMath(_)
1696                | Node::MdxJsxFlowElement(_)
1697                | Node::MdxFlowExpression(_)
1698                | Node::MdxJsxTextElement(_)
1699                | Node::MdxTextExpression(_)
1700                | Node::MdxjsEsm(_) => {
1701                    // No children to process or not relevant for code spans
1702                }
1703            }
1704        }
1705
1706        // Extract all code spans from the AST
1707        extract_code_spans(ast, content, lines, &mut code_spans);
1708
1709        // Sort by position to ensure consistent ordering
1710        code_spans.sort_by_key(|span| span.byte_offset);
1711
1712        code_spans
1713    }
1714
1715    /// Parse all list blocks in the content (legacy line-by-line approach)
1716    fn parse_list_blocks(lines: &[LineInfo]) -> Vec<ListBlock> {
1717        // Pre-size based on lines that could be list items
1718        let mut list_blocks = Vec::with_capacity(lines.len() / 10); // Estimate ~10% of lines might start list blocks
1719        let mut current_block: Option<ListBlock> = None;
1720        let mut last_list_item_line = 0;
1721        let mut current_indent_level = 0;
1722        let mut last_marker_width = 0;
1723
1724        for (line_idx, line_info) in lines.iter().enumerate() {
1725            let line_num = line_idx + 1;
1726
1727            // Enhanced code block handling using Design #3's context analysis
1728            if line_info.in_code_block {
1729                if let Some(ref mut block) = current_block {
1730                    // Calculate minimum indentation for list continuation
1731                    let min_continuation_indent = CodeBlockUtils::calculate_min_continuation_indent(lines, line_idx);
1732
1733                    // Analyze code block context using the three-tier classification
1734                    let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
1735
1736                    match context {
1737                        CodeBlockContext::Indented => {
1738                            // Code block is properly indented - continues the list
1739                            block.end_line = line_num;
1740                            continue;
1741                        }
1742                        CodeBlockContext::Standalone => {
1743                            // Code block separates lists - end current block
1744                            let completed_block = current_block.take().unwrap();
1745                            list_blocks.push(completed_block);
1746                            continue;
1747                        }
1748                        CodeBlockContext::Adjacent => {
1749                            // Edge case - use conservative behavior (continue list)
1750                            block.end_line = line_num;
1751                            continue;
1752                        }
1753                    }
1754                } else {
1755                    // No current list block - skip code block lines
1756                    continue;
1757                }
1758            }
1759
1760            // Extract blockquote prefix if any
1761            let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(&line_info.content) {
1762                caps.get(0).unwrap().as_str().to_string()
1763            } else {
1764                String::new()
1765            };
1766
1767            // Check if this line is a list item
1768            if let Some(list_item) = &line_info.list_item {
1769                // Calculate nesting level based on indentation
1770                let item_indent = list_item.marker_column;
1771                let nesting = item_indent / 2; // Assume 2-space indentation for nesting
1772
1773                if let Some(ref mut block) = current_block {
1774                    // Check if this continues the current block
1775                    // For nested lists, we need to check if this is a nested item (higher nesting level)
1776                    // or a continuation at the same or lower level
1777                    let is_nested = nesting > block.nesting_level;
1778                    let same_type =
1779                        (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
1780                    let same_context = block.blockquote_prefix == blockquote_prefix;
1781                    let reasonable_distance = line_num <= last_list_item_line + 2; // Allow one blank line
1782
1783                    // For unordered lists, also check marker consistency
1784                    let marker_compatible =
1785                        block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
1786
1787                    // Check if there's non-list content between the last item and this one
1788                    let has_non_list_content = {
1789                        let mut found_non_list = false;
1790                        // Use the last item from the current block, not the global last_list_item_line
1791                        let block_last_item_line = block.item_lines.last().copied().unwrap_or(block.end_line);
1792
1793                        // Debug: Special check for problematic line
1794                        if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1795                            let last_line = &lines[block_last_item_line - 1];
1796                            if last_line.content.contains(r"`sqlalchemy`") && last_line.content.contains(r"\`") {
1797                                log::debug!(
1798                                    "After problematic line {}: checking lines {} to {} for non-list content",
1799                                    block_last_item_line,
1800                                    block_last_item_line + 1,
1801                                    line_num
1802                                );
1803                                // If they're consecutive list items, there's no content between
1804                                if line_num == block_last_item_line + 1 {
1805                                    log::debug!("Lines are consecutive, no content between");
1806                                }
1807                            }
1808                        }
1809
1810                        for check_line in (block_last_item_line + 1)..line_num {
1811                            let check_idx = check_line - 1;
1812                            if check_idx < lines.len() {
1813                                let check_info = &lines[check_idx];
1814                                // Check for content that breaks the list
1815                                let is_list_breaking_content = if check_info.in_code_block {
1816                                    // Use enhanced code block classification for list separation
1817                                    let last_item_marker_width =
1818                                        if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1819                                            lines[block_last_item_line - 1]
1820                                                .list_item
1821                                                .as_ref()
1822                                                .map(|li| {
1823                                                    if li.is_ordered {
1824                                                        li.marker.len() + 1 // Add 1 for the space after ordered list markers
1825                                                    } else {
1826                                                        li.marker.len()
1827                                                    }
1828                                                })
1829                                                .unwrap_or(3) // fallback to 3 if no list item found
1830                                        } else {
1831                                            3 // fallback
1832                                        };
1833
1834                                    let min_continuation = if block.is_ordered { last_item_marker_width } else { 2 };
1835
1836                                    // Analyze code block context using our enhanced classification
1837                                    let context = CodeBlockUtils::analyze_code_block_context(
1838                                        lines,
1839                                        check_line - 1,
1840                                        min_continuation,
1841                                    );
1842
1843                                    // Standalone code blocks break lists, indented ones continue them
1844                                    matches!(context, CodeBlockContext::Standalone)
1845                                } else if !check_info.is_blank && check_info.list_item.is_none() {
1846                                    // Check for structural separators that should break lists (from issue #42)
1847                                    let line_content = check_info.content.trim();
1848
1849                                    // Any of these structural separators break lists
1850                                    if check_info.heading.is_some()
1851                                        || line_content.starts_with("---")
1852                                        || line_content.starts_with("***")
1853                                        || line_content.starts_with("___")
1854                                        || (line_content.contains('|')
1855                                            && !line_content.contains("](")
1856                                            && !line_content.contains("http")
1857                                            && (line_content.matches('|').count() > 1
1858                                                || line_content.starts_with('|')
1859                                                || line_content.ends_with('|')))
1860                                        || line_content.starts_with(">")
1861                                    {
1862                                        true
1863                                    }
1864                                    // Other non-list content - check if properly indented
1865                                    else {
1866                                        let last_item_marker_width =
1867                                            if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1868                                                lines[block_last_item_line - 1]
1869                                                    .list_item
1870                                                    .as_ref()
1871                                                    .map(|li| {
1872                                                        if li.is_ordered {
1873                                                            li.marker.len() + 1 // Add 1 for the space after ordered list markers
1874                                                        } else {
1875                                                            li.marker.len()
1876                                                        }
1877                                                    })
1878                                                    .unwrap_or(3) // fallback to 3 if no list item found
1879                                            } else {
1880                                                3 // fallback
1881                                            };
1882
1883                                        let min_continuation =
1884                                            if block.is_ordered { last_item_marker_width } else { 2 };
1885                                        check_info.indent < min_continuation
1886                                    }
1887                                } else {
1888                                    false
1889                                };
1890
1891                                if is_list_breaking_content {
1892                                    // Not indented enough, so it breaks the list
1893                                    found_non_list = true;
1894                                    break;
1895                                }
1896                            }
1897                        }
1898                        found_non_list
1899                    };
1900
1901                    // A list continues if:
1902                    // 1. It's a nested item (indented more than the parent), OR
1903                    // 2. It's the same type at the same level with reasonable distance
1904                    let mut continues_list = if is_nested {
1905                        // Nested items always continue the list if they're in the same context
1906                        same_context && reasonable_distance && !has_non_list_content
1907                    } else {
1908                        // Same-level items need to match type and markers
1909                        let result = same_type
1910                            && same_context
1911                            && reasonable_distance
1912                            && marker_compatible
1913                            && !has_non_list_content;
1914
1915                        // Debug logging for lines after problematic content
1916                        if block.item_lines.last().is_some_and(|&last_line| {
1917                            last_line > 0
1918                                && last_line <= lines.len()
1919                                && lines[last_line - 1].content.contains(r"`sqlalchemy`")
1920                                && lines[last_line - 1].content.contains(r"\`")
1921                        }) {
1922                            log::debug!(
1923                                "List continuation check after problematic line at line {line_num}: same_type={same_type}, same_context={same_context}, reasonable_distance={reasonable_distance}, marker_compatible={marker_compatible}, has_non_list_content={has_non_list_content}, continues={result}"
1924                            );
1925                            if line_num > 0 && line_num <= lines.len() {
1926                                log::debug!("Current line content: {:?}", lines[line_num - 1].content);
1927                            }
1928                        }
1929
1930                        result
1931                    };
1932
1933                    // WORKAROUND: If items are truly consecutive (no blank lines), they MUST be in the same list
1934                    // This handles edge cases where content patterns might otherwise split lists incorrectly
1935                    if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
1936                        // Check if the previous line was a list item
1937                        if block.item_lines.contains(&(line_num - 1)) {
1938                            // They're consecutive list items - force them to be in the same list
1939                            continues_list = true;
1940                        }
1941                    }
1942
1943                    if continues_list {
1944                        // Extend current block
1945                        block.end_line = line_num;
1946                        block.item_lines.push(line_num);
1947
1948                        // Update max marker width
1949                        block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
1950                            list_item.marker.len() + 1
1951                        } else {
1952                            list_item.marker.len()
1953                        });
1954
1955                        // Update marker consistency for unordered lists
1956                        if !block.is_ordered
1957                            && block.marker.is_some()
1958                            && block.marker.as_ref() != Some(&list_item.marker)
1959                        {
1960                            // Mixed markers, clear the marker field
1961                            block.marker = None;
1962                        }
1963                    } else {
1964                        // End current block and start a new one
1965
1966                        list_blocks.push(block.clone());
1967
1968                        *block = ListBlock {
1969                            start_line: line_num,
1970                            end_line: line_num,
1971                            is_ordered: list_item.is_ordered,
1972                            marker: if list_item.is_ordered {
1973                                None
1974                            } else {
1975                                Some(list_item.marker.clone())
1976                            },
1977                            blockquote_prefix: blockquote_prefix.clone(),
1978                            item_lines: vec![line_num],
1979                            nesting_level: nesting,
1980                            max_marker_width: if list_item.is_ordered {
1981                                list_item.marker.len() + 1
1982                            } else {
1983                                list_item.marker.len()
1984                            },
1985                        };
1986                    }
1987                } else {
1988                    // Start a new block
1989                    current_block = Some(ListBlock {
1990                        start_line: line_num,
1991                        end_line: line_num,
1992                        is_ordered: list_item.is_ordered,
1993                        marker: if list_item.is_ordered {
1994                            None
1995                        } else {
1996                            Some(list_item.marker.clone())
1997                        },
1998                        blockquote_prefix,
1999                        item_lines: vec![line_num],
2000                        nesting_level: nesting,
2001                        max_marker_width: list_item.marker.len(),
2002                    });
2003                }
2004
2005                last_list_item_line = line_num;
2006                current_indent_level = item_indent;
2007                last_marker_width = if list_item.is_ordered {
2008                    list_item.marker.len() + 1 // Add 1 for the space after ordered list markers
2009                } else {
2010                    list_item.marker.len()
2011                };
2012            } else if let Some(ref mut block) = current_block {
2013                // Not a list item - check if it continues the current block
2014
2015                // For MD032 compatibility, we use a simple approach:
2016                // - Indented lines continue the list
2017                // - Blank lines followed by indented content continue the list
2018                // - Everything else ends the list
2019
2020                // Check if the last line in the list block ended with a backslash (hard line break)
2021                // This handles cases where list items use backslash for hard line breaks
2022                let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
2023                    lines[block.end_line - 1].content.trim_end().ends_with('\\')
2024                } else {
2025                    false
2026                };
2027
2028                // Calculate minimum indentation for list continuation
2029                // For ordered lists, use the last marker width (e.g., 3 for "1. ", 4 for "10. ")
2030                // For unordered lists like "- ", content starts at column 2, so continuations need at least 2 spaces
2031                let min_continuation_indent = if block.is_ordered {
2032                    current_indent_level + last_marker_width
2033                } else {
2034                    current_indent_level + 2 // Unordered lists need at least 2 spaces (e.g., "- " = 2 chars)
2035                };
2036
2037                if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
2038                    // Indented line or backslash continuation continues the list
2039                    block.end_line = line_num;
2040                } else if line_info.is_blank {
2041                    // Blank line - check if it's internal to the list or ending it
2042                    // We only include blank lines that are followed by more list content
2043                    let mut check_idx = line_idx + 1;
2044                    let mut found_continuation = false;
2045
2046                    // Skip additional blank lines
2047                    while check_idx < lines.len() && lines[check_idx].is_blank {
2048                        check_idx += 1;
2049                    }
2050
2051                    if check_idx < lines.len() {
2052                        let next_line = &lines[check_idx];
2053                        // Check if followed by indented content (list continuation)
2054                        if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
2055                            found_continuation = true;
2056                        }
2057                        // Check if followed by another list item at the same level
2058                        else if !next_line.in_code_block
2059                            && next_line.list_item.is_some()
2060                            && let Some(item) = &next_line.list_item
2061                        {
2062                            let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
2063                                .find(&next_line.content)
2064                                .map_or(String::new(), |m| m.as_str().to_string());
2065                            if item.marker_column == current_indent_level
2066                                && item.is_ordered == block.is_ordered
2067                                && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
2068                            {
2069                                // Check if there was meaningful content between the list items (unused now)
2070                                // This variable is kept for potential future use but is currently replaced by has_structural_separators
2071                                let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
2072                                    if let Some(between_line) = lines.get(idx) {
2073                                        let trimmed = between_line.content.trim();
2074                                        // Skip empty lines
2075                                        if trimmed.is_empty() {
2076                                            return false;
2077                                        }
2078                                        // Check for meaningful content
2079                                        let line_indent =
2080                                            between_line.content.len() - between_line.content.trim_start().len();
2081
2082                                        // Structural separators (code fences, headings, etc.) are meaningful and should BREAK lists
2083                                        if trimmed.starts_with("```")
2084                                            || trimmed.starts_with("~~~")
2085                                            || trimmed.starts_with("---")
2086                                            || trimmed.starts_with("***")
2087                                            || trimmed.starts_with("___")
2088                                            || trimmed.starts_with(">")
2089                                            || trimmed.contains('|') // Tables
2090                                            || between_line.heading.is_some()
2091                                        {
2092                                            return true; // These are structural separators - meaningful content that breaks lists
2093                                        }
2094
2095                                        // Only properly indented content continues the list
2096                                        line_indent >= min_continuation_indent
2097                                    } else {
2098                                        false
2099                                    }
2100                                });
2101
2102                                if block.is_ordered {
2103                                    // For ordered lists: don't continue if there are structural separators
2104                                    // Check if there are structural separators between the list items
2105                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2106                                        if let Some(between_line) = lines.get(idx) {
2107                                            let trimmed = between_line.content.trim();
2108                                            if trimmed.is_empty() {
2109                                                return false;
2110                                            }
2111                                            // Check for structural separators that break lists
2112                                            trimmed.starts_with("```")
2113                                                || trimmed.starts_with("~~~")
2114                                                || trimmed.starts_with("---")
2115                                                || trimmed.starts_with("***")
2116                                                || trimmed.starts_with("___")
2117                                                || trimmed.starts_with(">")
2118                                                || trimmed.contains('|') // Tables
2119                                                || between_line.heading.is_some()
2120                                        } else {
2121                                            false
2122                                        }
2123                                    });
2124                                    found_continuation = !has_structural_separators;
2125                                } else {
2126                                    // For unordered lists: also check for structural separators
2127                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2128                                        if let Some(between_line) = lines.get(idx) {
2129                                            let trimmed = between_line.content.trim();
2130                                            if trimmed.is_empty() {
2131                                                return false;
2132                                            }
2133                                            // Check for structural separators that break lists
2134                                            trimmed.starts_with("```")
2135                                                || trimmed.starts_with("~~~")
2136                                                || trimmed.starts_with("---")
2137                                                || trimmed.starts_with("***")
2138                                                || trimmed.starts_with("___")
2139                                                || trimmed.starts_with(">")
2140                                                || trimmed.contains('|') // Tables
2141                                                || between_line.heading.is_some()
2142                                        } else {
2143                                            false
2144                                        }
2145                                    });
2146                                    found_continuation = !has_structural_separators;
2147                                }
2148                            }
2149                        }
2150                    }
2151
2152                    if found_continuation {
2153                        // Include the blank line in the block
2154                        block.end_line = line_num;
2155                    } else {
2156                        // Blank line ends the list - don't include it
2157                        list_blocks.push(block.clone());
2158                        current_block = None;
2159                    }
2160                } else {
2161                    // Check for lazy continuation - non-indented line immediately after a list item
2162                    // But only if the line has sufficient indentation for the list type
2163                    let min_required_indent = if block.is_ordered {
2164                        current_indent_level + last_marker_width
2165                    } else {
2166                        current_indent_level + 2
2167                    };
2168
2169                    // For lazy continuation to apply, the line must either:
2170                    // 1. Have no indentation (true lazy continuation)
2171                    // 2. Have sufficient indentation for the list type
2172                    // BUT structural separators (headings, code blocks, etc.) should never be lazy continuations
2173                    let line_content = line_info.content.trim();
2174                    let is_structural_separator = line_info.heading.is_some()
2175                        || line_content.starts_with("```")
2176                        || line_content.starts_with("~~~")
2177                        || line_content.starts_with("---")
2178                        || line_content.starts_with("***")
2179                        || line_content.starts_with("___")
2180                        || line_content.starts_with(">")
2181                        || (line_content.contains('|')
2182                            && !line_content.contains("](")
2183                            && !line_content.contains("http")
2184                            && (line_content.matches('|').count() > 1
2185                                || line_content.starts_with('|')
2186                                || line_content.ends_with('|'))); // Tables
2187
2188                    // Allow lazy continuation if we're still within the same list block
2189                    // (not just immediately after a list item)
2190                    let is_lazy_continuation = !is_structural_separator
2191                        && !line_info.is_blank
2192                        && (line_info.indent == 0 || line_info.indent >= min_required_indent);
2193
2194                    if is_lazy_continuation {
2195                        // Additional check: if the line starts with uppercase and looks like a new sentence,
2196                        // it's probably not a continuation
2197                        let content_to_check = if !blockquote_prefix.is_empty() {
2198                            // Strip blockquote prefix to check the actual content
2199                            line_info
2200                                .content
2201                                .strip_prefix(&blockquote_prefix)
2202                                .unwrap_or(&line_info.content)
2203                                .trim()
2204                        } else {
2205                            line_info.content.trim()
2206                        };
2207
2208                        let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
2209
2210                        // If it starts with uppercase and the previous line ended with punctuation,
2211                        // it's likely a new paragraph, not a continuation
2212                        if starts_with_uppercase && last_list_item_line > 0 {
2213                            // This looks like a new paragraph
2214                            list_blocks.push(block.clone());
2215                            current_block = None;
2216                        } else {
2217                            // This is a lazy continuation line
2218                            block.end_line = line_num;
2219                        }
2220                    } else {
2221                        // Non-indented, non-blank line that's not a lazy continuation - end the block
2222                        list_blocks.push(block.clone());
2223                        current_block = None;
2224                    }
2225                }
2226            }
2227        }
2228
2229        // Don't forget the last block
2230        if let Some(block) = current_block {
2231            list_blocks.push(block);
2232        }
2233
2234        // Merge adjacent blocks that should be one
2235        merge_adjacent_list_blocks(&mut list_blocks, lines);
2236
2237        list_blocks
2238    }
2239
2240    /// Compute character frequency for fast content analysis
2241    fn compute_char_frequency(content: &str) -> CharFrequency {
2242        let mut frequency = CharFrequency::default();
2243
2244        for ch in content.chars() {
2245            match ch {
2246                '#' => frequency.hash_count += 1,
2247                '*' => frequency.asterisk_count += 1,
2248                '_' => frequency.underscore_count += 1,
2249                '-' => frequency.hyphen_count += 1,
2250                '+' => frequency.plus_count += 1,
2251                '>' => frequency.gt_count += 1,
2252                '|' => frequency.pipe_count += 1,
2253                '[' => frequency.bracket_count += 1,
2254                '`' => frequency.backtick_count += 1,
2255                '<' => frequency.lt_count += 1,
2256                '!' => frequency.exclamation_count += 1,
2257                '\n' => frequency.newline_count += 1,
2258                _ => {}
2259            }
2260        }
2261
2262        frequency
2263    }
2264
2265    /// Parse HTML tags in the content
2266    fn parse_html_tags(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<HtmlTag> {
2267        lazy_static! {
2268            static ref HTML_TAG_REGEX: regex::Regex =
2269                regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap();
2270        }
2271
2272        let mut html_tags = Vec::with_capacity(content.matches('<').count());
2273
2274        for cap in HTML_TAG_REGEX.captures_iter(content) {
2275            let full_match = cap.get(0).unwrap();
2276            let match_start = full_match.start();
2277            let match_end = full_match.end();
2278
2279            // Skip if in code block
2280            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2281                continue;
2282            }
2283
2284            let is_closing = !cap.get(1).unwrap().as_str().is_empty();
2285            let tag_name = cap.get(2).unwrap().as_str().to_lowercase();
2286            let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
2287
2288            // Find which line this tag is on
2289            let mut line_num = 1;
2290            let mut col_start = match_start;
2291            let mut col_end = match_end;
2292            for (idx, line_info) in lines.iter().enumerate() {
2293                if match_start >= line_info.byte_offset {
2294                    line_num = idx + 1;
2295                    col_start = match_start - line_info.byte_offset;
2296                    col_end = match_end - line_info.byte_offset;
2297                } else {
2298                    break;
2299                }
2300            }
2301
2302            html_tags.push(HtmlTag {
2303                line: line_num,
2304                start_col: col_start,
2305                end_col: col_end,
2306                byte_offset: match_start,
2307                byte_end: match_end,
2308                tag_name,
2309                is_closing,
2310                is_self_closing,
2311                raw_content: full_match.as_str().to_string(),
2312            });
2313        }
2314
2315        html_tags
2316    }
2317
2318    /// Parse emphasis spans in the content
2319    fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
2320        lazy_static! {
2321            static ref EMPHASIS_REGEX: regex::Regex =
2322                regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap();
2323        }
2324
2325        let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
2326
2327        for cap in EMPHASIS_REGEX.captures_iter(content) {
2328            let full_match = cap.get(0).unwrap();
2329            let match_start = full_match.start();
2330            let match_end = full_match.end();
2331
2332            // Skip if in code block
2333            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2334                continue;
2335            }
2336
2337            let opening_markers = cap.get(1).unwrap().as_str();
2338            let content_part = cap.get(2).unwrap().as_str();
2339            let closing_markers = cap.get(3).unwrap().as_str();
2340
2341            // Validate matching markers
2342            if opening_markers.chars().next() != closing_markers.chars().next()
2343                || opening_markers.len() != closing_markers.len()
2344            {
2345                continue;
2346            }
2347
2348            let marker = opening_markers.chars().next().unwrap();
2349            let marker_count = opening_markers.len();
2350
2351            // Find which line this emphasis is on
2352            let mut line_num = 1;
2353            let mut col_start = match_start;
2354            let mut col_end = match_end;
2355            for (idx, line_info) in lines.iter().enumerate() {
2356                if match_start >= line_info.byte_offset {
2357                    line_num = idx + 1;
2358                    col_start = match_start - line_info.byte_offset;
2359                    col_end = match_end - line_info.byte_offset;
2360                } else {
2361                    break;
2362                }
2363            }
2364
2365            emphasis_spans.push(EmphasisSpan {
2366                line: line_num,
2367                start_col: col_start,
2368                end_col: col_end,
2369                byte_offset: match_start,
2370                byte_end: match_end,
2371                marker,
2372                marker_count,
2373                content: content_part.to_string(),
2374            });
2375        }
2376
2377        emphasis_spans
2378    }
2379
2380    /// Parse table rows in the content
2381    fn parse_table_rows(lines: &[LineInfo]) -> Vec<TableRow> {
2382        let mut table_rows = Vec::with_capacity(lines.len() / 20);
2383
2384        for (line_idx, line_info) in lines.iter().enumerate() {
2385            // Skip lines in code blocks or blank lines
2386            if line_info.in_code_block || line_info.is_blank {
2387                continue;
2388            }
2389
2390            let line = &line_info.content;
2391            let line_num = line_idx + 1;
2392
2393            // Check if this line contains pipes (potential table row)
2394            if !line.contains('|') {
2395                continue;
2396            }
2397
2398            // Count columns by splitting on pipes
2399            let parts: Vec<&str> = line.split('|').collect();
2400            let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
2401
2402            // Check if this is a separator row
2403            let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
2404            let mut column_alignments = Vec::new();
2405
2406            if is_separator {
2407                for part in &parts[1..parts.len() - 1] {
2408                    // Skip first and last empty parts
2409                    let trimmed = part.trim();
2410                    let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
2411                        "center".to_string()
2412                    } else if trimmed.ends_with(':') {
2413                        "right".to_string()
2414                    } else if trimmed.starts_with(':') {
2415                        "left".to_string()
2416                    } else {
2417                        "none".to_string()
2418                    };
2419                    column_alignments.push(alignment);
2420                }
2421            }
2422
2423            table_rows.push(TableRow {
2424                line: line_num,
2425                is_separator,
2426                column_count,
2427                column_alignments,
2428            });
2429        }
2430
2431        table_rows
2432    }
2433
2434    /// Parse bare URLs and emails in the content
2435    fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
2436        let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
2437
2438        // Check for bare URLs (not in angle brackets or markdown links)
2439        for cap in BARE_URL_PATTERN.captures_iter(content) {
2440            let full_match = cap.get(0).unwrap();
2441            let match_start = full_match.start();
2442            let match_end = full_match.end();
2443
2444            // Skip if in code block
2445            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2446                continue;
2447            }
2448
2449            // Skip if already in angle brackets or markdown links
2450            let preceding_char = if match_start > 0 {
2451                content.chars().nth(match_start - 1)
2452            } else {
2453                None
2454            };
2455            let following_char = content.chars().nth(match_end);
2456
2457            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2458                continue;
2459            }
2460            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2461                continue;
2462            }
2463
2464            let url = full_match.as_str();
2465            let url_type = if url.starts_with("https://") {
2466                "https"
2467            } else if url.starts_with("http://") {
2468                "http"
2469            } else if url.starts_with("ftp://") {
2470                "ftp"
2471            } else {
2472                "other"
2473            };
2474
2475            // Find which line this URL is on
2476            let mut line_num = 1;
2477            let mut col_start = match_start;
2478            let mut col_end = match_end;
2479            for (idx, line_info) in lines.iter().enumerate() {
2480                if match_start >= line_info.byte_offset {
2481                    line_num = idx + 1;
2482                    col_start = match_start - line_info.byte_offset;
2483                    col_end = match_end - line_info.byte_offset;
2484                } else {
2485                    break;
2486                }
2487            }
2488
2489            bare_urls.push(BareUrl {
2490                line: line_num,
2491                start_col: col_start,
2492                end_col: col_end,
2493                byte_offset: match_start,
2494                byte_end: match_end,
2495                url: url.to_string(),
2496                url_type: url_type.to_string(),
2497            });
2498        }
2499
2500        // Check for bare email addresses
2501        for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
2502            let full_match = cap.get(0).unwrap();
2503            let match_start = full_match.start();
2504            let match_end = full_match.end();
2505
2506            // Skip if in code block
2507            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2508                continue;
2509            }
2510
2511            // Skip if already in angle brackets or markdown links
2512            let preceding_char = if match_start > 0 {
2513                content.chars().nth(match_start - 1)
2514            } else {
2515                None
2516            };
2517            let following_char = content.chars().nth(match_end);
2518
2519            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2520                continue;
2521            }
2522            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2523                continue;
2524            }
2525
2526            let email = full_match.as_str();
2527
2528            // Find which line this email is on
2529            let mut line_num = 1;
2530            let mut col_start = match_start;
2531            let mut col_end = match_end;
2532            for (idx, line_info) in lines.iter().enumerate() {
2533                if match_start >= line_info.byte_offset {
2534                    line_num = idx + 1;
2535                    col_start = match_start - line_info.byte_offset;
2536                    col_end = match_end - line_info.byte_offset;
2537                } else {
2538                    break;
2539                }
2540            }
2541
2542            bare_urls.push(BareUrl {
2543                line: line_num,
2544                start_col: col_start,
2545                end_col: col_end,
2546                byte_offset: match_start,
2547                byte_end: match_end,
2548                url: email.to_string(),
2549                url_type: "email".to_string(),
2550            });
2551        }
2552
2553        bare_urls
2554    }
2555}
2556
2557/// Merge adjacent list blocks that should be treated as one
2558fn merge_adjacent_list_blocks(list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
2559    if list_blocks.len() < 2 {
2560        return;
2561    }
2562
2563    let mut merger = ListBlockMerger::new(lines);
2564    *list_blocks = merger.merge(list_blocks);
2565}
2566
2567/// Helper struct to manage the complex logic of merging list blocks
2568struct ListBlockMerger<'a> {
2569    lines: &'a [LineInfo],
2570}
2571
2572impl<'a> ListBlockMerger<'a> {
2573    fn new(lines: &'a [LineInfo]) -> Self {
2574        Self { lines }
2575    }
2576
2577    fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
2578        let mut merged = Vec::with_capacity(list_blocks.len());
2579        let mut current = list_blocks[0].clone();
2580
2581        for next in list_blocks.iter().skip(1) {
2582            if self.should_merge_blocks(&current, next) {
2583                current = self.merge_two_blocks(current, next);
2584            } else {
2585                merged.push(current);
2586                current = next.clone();
2587            }
2588        }
2589
2590        merged.push(current);
2591        merged
2592    }
2593
2594    /// Determine if two adjacent list blocks should be merged
2595    fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
2596        // Basic compatibility checks
2597        if !self.blocks_are_compatible(current, next) {
2598            return false;
2599        }
2600
2601        // Check spacing and content between blocks
2602        let spacing = self.analyze_spacing_between(current, next);
2603        match spacing {
2604            BlockSpacing::Consecutive => true,
2605            BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
2606            BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
2607                self.can_merge_with_content_between(current, next)
2608            }
2609        }
2610    }
2611
2612    /// Check if blocks have compatible structure for merging
2613    fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
2614        current.is_ordered == next.is_ordered
2615            && current.blockquote_prefix == next.blockquote_prefix
2616            && current.nesting_level == next.nesting_level
2617    }
2618
2619    /// Analyze the spacing between two list blocks
2620    fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
2621        let gap = next.start_line - current.end_line;
2622
2623        match gap {
2624            1 => BlockSpacing::Consecutive,
2625            2 => BlockSpacing::SingleBlank,
2626            _ if gap > 2 => {
2627                if self.has_only_blank_lines_between(current, next) {
2628                    BlockSpacing::MultipleBlanks
2629                } else {
2630                    BlockSpacing::ContentBetween
2631                }
2632            }
2633            _ => BlockSpacing::Consecutive, // gap == 0, overlapping (shouldn't happen)
2634        }
2635    }
2636
2637    /// Check if unordered lists can be merged with a single blank line between
2638    fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2639        // Check if there are structural separators between the blocks
2640        // If has_meaningful_content_between returns true, it means there are structural separators
2641        if has_meaningful_content_between(current, next, self.lines) {
2642            return false; // Structural separators prevent merging
2643        }
2644
2645        // Only merge unordered lists with same marker across single blank
2646        !current.is_ordered && current.marker == next.marker
2647    }
2648
2649    /// Check if ordered lists can be merged when there's content between them
2650    fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2651        // Do not merge lists if there are structural separators between them
2652        if has_meaningful_content_between(current, next, self.lines) {
2653            return false; // Structural separators prevent merging
2654        }
2655
2656        // Only consider merging ordered lists if there's no structural content between
2657        current.is_ordered && next.is_ordered
2658    }
2659
2660    /// Check if there are only blank lines between blocks
2661    fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2662        for line_num in (current.end_line + 1)..next.start_line {
2663            if let Some(line_info) = self.lines.get(line_num - 1)
2664                && !line_info.content.trim().is_empty()
2665            {
2666                return false;
2667            }
2668        }
2669        true
2670    }
2671
2672    /// Merge two compatible list blocks into one
2673    fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
2674        current.end_line = next.end_line;
2675        current.item_lines.extend_from_slice(&next.item_lines);
2676
2677        // Update max marker width
2678        current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
2679
2680        // Handle marker consistency for unordered lists
2681        if !current.is_ordered && self.markers_differ(&current, next) {
2682            current.marker = None; // Mixed markers
2683        }
2684
2685        current
2686    }
2687
2688    /// Check if two blocks have different markers
2689    fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
2690        current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
2691    }
2692}
2693
/// Types of spacing between list blocks
///
/// Produced by `ListBlockMerger::analyze_spacing_between`, which classifies
/// the line-number gap between the end of one block and the start of the next.
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    /// No gap between blocks: the next block starts on the line right after
    /// the current one ends (also used as the fallback for a zero gap).
    Consecutive,    // No gap between blocks
    /// Exactly one line lies between the blocks (decided from the line-number
    /// gap alone).
    SingleBlank,    // One blank line between blocks
    /// Two or more lines lie between the blocks and all of them are blank.
    MultipleBlanks, // Multiple blank lines but no content
    /// Two or more lines lie between the blocks and at least one is not blank.
    ContentBetween, // Content exists between blocks
}
2702
2703/// Check if there's meaningful content (not just blank lines) between two list blocks
2704fn has_meaningful_content_between(current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
2705    // Check lines between current.end_line and next.start_line
2706    for line_num in (current.end_line + 1)..next.start_line {
2707        if let Some(line_info) = lines.get(line_num - 1) {
2708            // Convert to 0-indexed
2709            let trimmed = line_info.content.trim();
2710
2711            // Skip empty lines
2712            if trimmed.is_empty() {
2713                continue;
2714            }
2715
2716            // Check for structural separators that should separate lists (CommonMark compliant)
2717
2718            // Headings separate lists
2719            if line_info.heading.is_some() {
2720                return true; // Has meaningful content - headings separate lists
2721            }
2722
2723            // Horizontal rules separate lists (---, ***, ___)
2724            if is_horizontal_rule(trimmed) {
2725                return true; // Has meaningful content - horizontal rules separate lists
2726            }
2727
2728            // Tables separate lists (lines containing | but not in URLs or code)
2729            // Simple heuristic: tables typically have | at start/end or multiple |
2730            if trimmed.contains('|') && trimmed.len() > 1 {
2731                // Don't treat URLs with | as tables
2732                if !trimmed.contains("](") && !trimmed.contains("http") {
2733                    // More robust check: tables usually have multiple | or | at edges
2734                    let pipe_count = trimmed.matches('|').count();
2735                    if pipe_count > 1 || trimmed.starts_with('|') || trimmed.ends_with('|') {
2736                        return true; // Has meaningful content - tables separate lists
2737                    }
2738                }
2739            }
2740
2741            // Blockquotes separate lists
2742            if trimmed.starts_with('>') {
2743                return true; // Has meaningful content - blockquotes separate lists
2744            }
2745
2746            // Code block fences separate lists (unless properly indented as list content)
2747            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
2748                let line_indent = line_info.content.len() - line_info.content.trim_start().len();
2749
2750                // Check if this code block is properly indented as list continuation
2751                let min_continuation_indent = if current.is_ordered {
2752                    current.nesting_level + current.max_marker_width + 1 // +1 for space after marker
2753                } else {
2754                    current.nesting_level + 2
2755                };
2756
2757                if line_indent < min_continuation_indent {
2758                    // This is a standalone code block that separates lists
2759                    return true; // Has meaningful content - standalone code blocks separate lists
2760                }
2761            }
2762
2763            // Check if this line has proper indentation for list continuation
2764            let line_indent = line_info.content.len() - line_info.content.trim_start().len();
2765
2766            // Calculate minimum indentation needed to be list continuation
2767            let min_indent = if current.is_ordered {
2768                current.nesting_level + current.max_marker_width
2769            } else {
2770                current.nesting_level + 2
2771            };
2772
2773            // If the line is not indented enough to be list continuation, it's meaningful content
2774            if line_indent < min_indent {
2775                return true; // Has meaningful content - content not indented as list continuation
2776            }
2777
2778            // If we reach here, the line is properly indented as list continuation
2779            // Continue checking other lines
2780        }
2781    }
2782
2783    // Only blank lines or properly indented list continuation content between blocks
2784    false
2785}
2786
/// Report whether `trimmed` is a horizontal rule such as `---`, `***` or `___`.
///
/// The line qualifies when it starts with `-`, `*` or `_`, contains that same
/// character at least three times, and holds nothing else apart from spaces
/// and tabs (so `- - -` also counts). The first character must be the marker
/// itself: a line that still carries leading whitespace is rejected.
fn is_horizontal_rule(trimmed: &str) -> bool {
    // Fewer than three bytes cannot hold three marker characters.
    if trimmed.len() < 3 {
        return false;
    }

    // The first character decides which marker the rest of the line must use.
    let marker = match trimmed.chars().next() {
        Some(c @ ('-' | '*' | '_')) => c,
        _ => return false,
    };

    // Anything other than the marker, a space, or a tab disqualifies the line.
    if trimmed.chars().any(|ch| ch != marker && ch != ' ' && ch != '\t') {
        return false;
    }

    trimmed.chars().filter(|&ch| ch == marker).count() >= 3
}
2810
/// Unit tests for `LintContext`: line offset tables, offset-to-position mapping,
/// per-line metadata, and list item detection.
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty document has a single line offset and no line records.
    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::Standard);

        assert_eq!(ctx.content, "");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.lines.len(), 0);
    }

    /// A document without newlines maps every offset onto line 1.
    #[test]
    fn test_single_line() {
        let source = "# Hello";
        let ctx = LintContext::new(source, MarkdownFlavor::Standard);

        assert_eq!(ctx.content, source);
        assert_eq!(ctx.line_offsets, vec![0]);

        for (offset, expected) in [(0, (1, 1)), (3, (1, 4))] {
            assert_eq!(ctx.offset_to_line_col(offset), expected, "offset {offset}");
        }
    }

    /// Offsets in a four-line document resolve to 1-based (line, column) pairs.
    #[test]
    fn test_multi_line() {
        let ctx = LintContext::new("# Title\n\nSecond line\nThird line", MarkdownFlavor::Standard);

        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);

        let expectations = [
            (0, (1, 1)),  // start
            (8, (2, 1)),  // start of blank line
            (9, (3, 1)),  // start of 'Second line'
            (15, (3, 7)), // middle of 'Second line'
            (21, (4, 1)), // start of 'Third line'
        ];
        for (offset, expected) in expectations {
            assert_eq!(ctx.offset_to_line_col(offset), expected, "offset {offset}");
        }
    }

    /// Per-line records expose content, byte offset, indentation and blank/code flags.
    #[test]
    fn test_line_info() {
        let ctx = LintContext::new(
            "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```",
            MarkdownFlavor::Standard,
        );

        assert_eq!(ctx.lines.len(), 7);

        // Line 1: "# Title"
        let heading = &ctx.lines[0];
        assert_eq!(heading.content, "# Title");
        assert_eq!(heading.byte_offset, 0);
        assert_eq!(heading.indent, 0);
        assert!(!heading.is_blank);
        assert!(!heading.in_code_block);
        assert!(heading.list_item.is_none());

        // Line 2: "    indented"
        let indented = &ctx.lines[1];
        assert_eq!(indented.content, "    indented");
        assert_eq!(indented.byte_offset, 8);
        assert_eq!(indented.indent, 4);
        assert!(!indented.is_blank);

        // Line 3: "" (blank)
        let blank = &ctx.lines[2];
        assert_eq!(blank.content, "");
        assert!(blank.is_blank);

        // Helper methods take 1-based line numbers.
        for (line, byte_offset) in [(1, 0), (2, 8)] {
            assert_eq!(ctx.line_to_byte_offset(line), Some(byte_offset));
        }
        for (line, indent) in [(1, 0), (2, 4)] {
            assert_eq!(ctx.line_info(line).map(|info| info.indent), Some(indent));
        }
    }

    /// Bullet and numbered markers are recognised; plain text is not.
    #[test]
    fn test_list_item_detection() {
        let ctx = LintContext::new(
            "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list",
            MarkdownFlavor::Standard,
        );

        // Line 1: "- Unordered item"
        let dash_item = ctx.lines[0]
            .list_item
            .as_ref()
            .expect("line 1 should be detected as a list item");
        assert_eq!(dash_item.marker, "-");
        assert!(!dash_item.is_ordered);
        assert_eq!(dash_item.marker_column, 0);
        assert_eq!(dash_item.content_column, 2);

        // Line 2: "  * Nested item"
        let star_item = ctx.lines[1]
            .list_item
            .as_ref()
            .expect("line 2 should be detected as a list item");
        assert_eq!(star_item.marker, "*");
        assert_eq!(star_item.marker_column, 2);

        // Line 3: "1. Ordered item"
        let numbered_item = ctx.lines[2]
            .list_item
            .as_ref()
            .expect("line 3 should be detected as a list item");
        assert_eq!(numbered_item.marker, "1.");
        assert!(numbered_item.is_ordered);
        assert_eq!(numbered_item.number, Some(1));

        // Line 6: "Not a list"
        assert!(ctx.lines[5].list_item.is_none());
    }

    /// Offsets at and just past each one-character line resolve to the expected columns.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let ctx = LintContext::new("a\nb\nc", MarkdownFlavor::Standard);

        // line_offsets: [0, 2, 4]
        let expectations = [
            (0, (1, 1)), // 'a'
            (1, (1, 2)), // after 'a'
            (2, (2, 1)), // 'b'
            (3, (2, 2)), // after 'b'
            (4, (3, 1)), // 'c'
            (5, (3, 2)), // after 'c'
        ];
        for (offset, expected) in expectations {
            assert_eq!(ctx.offset_to_line_col(offset), expected, "offset {offset}");
        }
    }
}
rumdl_lib/lint_context.rs

rumdl_lib/
lint_context.rs