rumdl_lib/
lint_context.rs

1use crate::config::MarkdownFlavor;
2use crate::rules::front_matter_utils::FrontMatterUtils;
3use crate::utils::ast_utils::get_cached_ast;
4use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
5use lazy_static::lazy_static;
6use markdown::mdast::Node;
7use regex::Regex;
8
lazy_static! {
    // Comprehensive link pattern that captures both inline links `[text](url "title")`
    // and reference links `[text][ref]`.
    // Flags: `s` makes `.` match newlines; `x` is verbose mode, so the layout whitespace
    // and the trailing `# ...` notes inside the pattern are ignored by the regex engine.
    // The pattern does not reject a leading `!`, so it also matches the bracket part of
    // images; `parse_links` filters those out by inspecting the preceding byte.
    static ref LINK_PATTERN: Regex = Regex::new(
        r#"(?sx)
        \[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]          # Link text in group 1 (handles nested brackets)
        (?:
            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
            |
            \[([^\]]*)\]      # Reference ID in group 6
        )"#
    ).unwrap();

    // Image pattern: same shape and capture groups as LINK_PATTERN, but requires the
    // leading `!`. Group 1 is the alt text.
    // Flags: `s` makes `.` match newlines; `x` enables verbose mode (see above).
    static ref IMAGE_PATTERN: Regex = Regex::new(
        r#"(?sx)
        !\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]         # Alt text in group 1 (handles nested brackets)
        (?:
            \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\)  # URL in group 2 (angle) or 3 (bare), title in 4/5
            |
            \[([^\]]*)\]      # Reference ID in group 6
        )"#
    ).unwrap();

    // Reference definition pattern: `[id]: url "title"` with up to three leading spaces.
    // `(?m)` makes `^`/`$` match at line boundaries.
    // Group 1 = reference ID, group 2 = URL, group 3/4 = optional title (double/single quoted).
    static ref REF_DEF_PATTERN: Regex = Regex::new(
        r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#
    ).unwrap();

    // Code span delimiter pattern - matches a run of one or more backticks.
    // It has no capture groups: it only locates backtick runs, it does not capture
    // the span content itself.
    static ref CODE_SPAN_PATTERN: Regex = Regex::new(
        r"`+"
    ).unwrap();

    // Pattern for bare URLs; group 1 is the scheme (`http`, `https` or `ftp`).
    // Optional port, path, query string and fragment are included in the match.
    static ref BARE_URL_PATTERN: Regex = Regex::new(
        r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
    ).unwrap();

    // Pattern for email addresses (local part, `@`, domain, alphabetic TLD of 2+ letters)
    static ref BARE_EMAIL_PATTERN: Regex = Regex::new(
        r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    ).unwrap();

    // Pattern for angle bracket links (to exclude from bare URL detection).
    // Matches `<url>` or `<email>`; group 1 is the content between the brackets.
    static ref ANGLE_BRACKET_PATTERN: Regex = Regex::new(
        r"<((?:https?|ftp)://[^>]+|[^@\s]+@[^@\s]+\.[^@\s>]+)>"
    ).unwrap();

    // Pattern for blockquote prefix in parse_list_blocks.
    // Group 1 = optional leading whitespace, one or more consecutive `>`, and any
    // whitespace that follows them.
    static ref BLOCKQUOTE_PREFIX_REGEX: Regex = Regex::new(r"^(\s*>+\s*)").unwrap();
}
63
/// Pre-computed information about a single line of the document.
///
/// One entry per line is stored in `LintContext::lines`; use
/// `LintContext::line_info` to look an entry up by 1-indexed line number.
#[derive(Debug, Clone)]
pub struct LineInfo {
    /// The actual line content (without newline)
    pub content: String,
    /// Byte offset where this line starts in the document
    pub byte_offset: usize,
    /// Number of leading spaces/tabs
    pub indent: usize,
    /// Whether the line is blank (empty or only whitespace)
    pub is_blank: bool,
    /// Whether this line is inside a code block
    pub in_code_block: bool,
    /// Whether this line is inside front matter
    pub in_front_matter: bool,
    /// Whether this line is inside an HTML block
    pub in_html_block: bool,
    /// Whether this line is inside an HTML comment
    pub in_html_comment: bool,
    /// List item information if this line starts a list item (`None` otherwise)
    pub list_item: Option<ListItemInfo>,
    /// Heading information if this line is a heading (`None` otherwise)
    pub heading: Option<HeadingInfo>,
    /// Blockquote information if this line is a blockquote (`None` otherwise)
    pub blockquote: Option<BlockquoteInfo>,
    /// Whether this line is inside a mkdocstrings autodoc block
    pub in_mkdocstrings: bool,
}
92
/// Information about a list item.
///
/// Attached to a line through `LineInfo::list_item` when that line starts a list item.
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// The marker used (*, -, +, or number with . or ))
    pub marker: String,
    /// Whether it's ordered (true) or unordered (false)
    pub is_ordered: bool,
    /// The number for ordered lists (`None` for unordered lists)
    pub number: Option<usize>,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where content after marker starts
    pub content_column: usize,
}
107
/// Heading style type, recorded in `HeadingInfo::style`.
#[derive(Debug, Clone, PartialEq)]
pub enum HeadingStyle {
    /// ATX style heading (# Heading)
    ATX,
    /// Setext style heading with = underline
    Setext1,
    /// Setext style heading with - underline
    Setext2,
}
118
/// Parsed link information, as produced by `LintContext::parse_links`.
///
/// Columns are byte offsets relative to the start of a line, not character counts.
#[derive(Debug, Clone)]
pub struct ParsedLink {
    /// Line number (1-indexed) of the line on which the link starts
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed); for a link spanning several lines this is
    /// measured on the line where the link ends, not on `line`
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Link text
    pub text: String,
    /// Link URL or reference
    pub url: String,
    /// Whether this is a reference link [text][ref] vs inline [text](url)
    pub is_reference: bool,
    /// Reference ID for reference links (`None` for inline links)
    pub reference_id: Option<String>,
}
141
/// Parsed image information.
///
/// Has the same fields as `ParsedLink`, with `alt_text` in place of `text`.
#[derive(Debug, Clone)]
pub struct ParsedImage {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Alt text
    pub alt_text: String,
    /// Image URL or reference
    pub url: String,
    /// Whether this is a reference image ![alt][ref] vs inline ![alt](url)
    pub is_reference: bool,
    /// Reference ID for reference images
    pub reference_id: Option<String>,
}
164
/// Reference definition [ref]: url "title"
///
/// `LintContext::get_reference_url` resolves a reference by comparing the
/// lowercased lookup key against `id`.
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// Line number (1-indexed)
    pub line: usize,
    /// Reference ID (normalized to lowercase)
    pub id: String,
    /// URL
    pub url: String,
    /// Optional title
    pub title: Option<String>,
}
177
/// Parsed code span information.
///
/// The lookups in `LintContext` treat both the column range
/// (`start_col..end_col`) and the byte range (`byte_offset..byte_end`) as
/// half-open: the end position itself is not considered inside the span.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Number of backticks used (1, 2, 3, etc.)
    pub backtick_count: usize,
    /// Content inside the code span (without backticks)
    pub content: String,
}
196
/// Information about a heading.
///
/// Attached to a line through `LineInfo::heading` when that line is a heading.
#[derive(Debug, Clone)]
pub struct HeadingInfo {
    /// Heading level (1-6 for ATX, 1-2 for Setext)
    pub level: u8,
    /// Style of heading
    pub style: HeadingStyle,
    /// The heading marker (# characters or underline)
    pub marker: String,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where heading text starts
    pub content_column: usize,
    /// The heading text (without markers and without custom ID syntax)
    pub text: String,
    /// Custom header ID if present (e.g., from {#custom-id} syntax)
    pub custom_id: Option<String>,
    /// Original heading text including custom ID syntax
    pub raw_text: String,
    /// Whether it has a closing sequence (for ATX)
    pub has_closing_sequence: bool,
    /// The closing sequence if present
    pub closing_sequence: String,
}
221
/// Information about a blockquote line.
///
/// Attached to a line through `LineInfo::blockquote` when that line is a blockquote.
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Nesting level (1 for >, 2 for >>, etc.)
    pub nesting_level: usize,
    /// The indentation before the blockquote marker
    pub indent: String,
    /// Column where the first > starts (0-based)
    pub marker_column: usize,
    /// The blockquote prefix (e.g., "> ", ">> ", etc.)
    pub prefix: String,
    /// Content after the blockquote marker(s)
    pub content: String,
    /// Whether the line has no space after the marker
    pub has_no_space_after_marker: bool,
    /// Whether the line has multiple spaces after the marker
    pub has_multiple_spaces_after_marker: bool,
    /// Whether this is an empty blockquote line needing MD028 fix
    pub needs_md028_fix: bool,
}
242
/// Information about a list block.
///
/// `start_line` and `end_line` are both inclusive: `LintContext::is_in_list_block`
/// and `LintContext::list_block_for_line` treat a line as part of the block when
/// `start_line <= line <= end_line`.
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// Line number where the list starts (1-indexed)
    pub start_line: usize,
    /// Line number where the list ends (1-indexed)
    pub end_line: usize,
    /// Whether it's ordered or unordered
    pub is_ordered: bool,
    /// The consistent marker for unordered lists (if any)
    pub marker: Option<String>,
    /// Blockquote prefix for this list (empty if not in blockquote)
    pub blockquote_prefix: String,
    /// Lines that are list items within this block
    pub item_lines: Vec<usize>,
    /// Nesting level (0 for top-level lists)
    pub nesting_level: usize,
    /// Maximum marker width seen in this block (e.g., 3 for "1. ", 4 for "10. ")
    pub max_marker_width: usize,
}
263
264use std::sync::{Arc, Mutex};
265
/// Character frequency data for fast content analysis.
///
/// Stored in `LintContext::char_frequency` and read by `has_char`, `char_count`
/// and the `likely_has_*` helpers. The parenthesised notes on each field name
/// the Markdown constructs that character can introduce; a non-zero count only
/// means the construct is possible, not that it is present.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Count of # characters (headings)
    pub hash_count: usize,
    /// Count of * characters (emphasis, lists, horizontal rules)
    pub asterisk_count: usize,
    /// Count of _ characters (emphasis, horizontal rules)
    pub underscore_count: usize,
    /// Count of - characters (lists, horizontal rules, setext headings)
    pub hyphen_count: usize,
    /// Count of + characters (lists)
    pub plus_count: usize,
    /// Count of > characters (blockquotes)
    pub gt_count: usize,
    /// Count of | characters (tables)
    pub pipe_count: usize,
    /// Count of [ characters (links, images)
    pub bracket_count: usize,
    /// Count of ` characters (code spans, code blocks)
    pub backtick_count: usize,
    /// Count of < characters (HTML tags, autolinks)
    pub lt_count: usize,
    /// Count of ! characters (images)
    pub exclamation_count: usize,
    /// Count of newline characters
    pub newline_count: usize,
}
294
/// Pre-parsed HTML tag information.
///
/// Returned by `LintContext::html_tags` and `LintContext::html_tags_on_line`.
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Tag name (e.g., "div", "img", "br")
    pub tag_name: String,
    /// Whether it's a closing tag (`</tag>`)
    pub is_closing: bool,
    /// Whether it's self-closing (`<tag />`)
    pub is_self_closing: bool,
    /// Raw tag content
    pub raw_content: String,
}
317
/// Pre-parsed emphasis span information.
///
/// Returned by `LintContext::emphasis_spans` and `LintContext::emphasis_spans_on_line`.
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Type of emphasis ('*' or '_')
    pub marker: char,
    /// Number of markers (1 for italic, 2 for bold, 3+ for bold+italic)
    pub marker_count: usize,
    /// Content inside the emphasis
    pub content: String,
}
338
/// Pre-parsed table row information.
///
/// Returned by `LintContext::table_rows` and `LintContext::table_rows_on_line`.
#[derive(Debug, Clone)]
pub struct TableRow {
    /// Line number (1-indexed)
    pub line: usize,
    /// Whether this is a separator row (contains only |, -, :, and spaces)
    pub is_separator: bool,
    /// Number of columns (pipe-separated cells)
    pub column_count: usize,
    /// Alignment info from separator row, one string per column
    pub column_alignments: Vec<String>, // "left", "center", "right", "none"
}
351
/// Pre-parsed bare URL information (not in links).
///
/// Returned by `LintContext::bare_urls` and `LintContext::bare_urls_on_line`.
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// The URL string
    pub url: String,
    /// Type of URL ("http", "https", "ftp", "email")
    pub url_type: String,
}
370
/// Shared, pre-parsed view of one Markdown document.
///
/// `LintContext::new` eagerly computes the public fields. The private
/// `*_cache` fields hold `Arc`-shared results behind a `Mutex`; their accessor
/// methods fill them on first use when they are still empty.
pub struct LintContext<'a> {
    pub content: &'a str,
    pub line_offsets: Vec<usize>, // Byte offset of the start of each line (first entry is 0)
    pub code_blocks: Vec<(usize, usize)>, // Cached code block ranges (not including inline code spans)
    pub lines: Vec<LineInfo>,             // Pre-computed line information
    pub links: Vec<ParsedLink>,           // Pre-parsed links
    pub images: Vec<ParsedImage>,         // Pre-parsed images
    pub reference_defs: Vec<ReferenceDef>, // Reference definitions
    code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>, // Inline code spans (pre-populated by `new`)
    pub list_blocks: Vec<ListBlock>,      // Pre-parsed list blocks
    pub char_frequency: CharFrequency,    // Character frequency analysis
    html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>, // Lazy-loaded HTML tags
    emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>, // Lazy-loaded emphasis spans
    table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>, // Lazy-loaded table rows
    bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>, // Lazy-loaded bare URLs
    ast_cache: Mutex<Option<Arc<Node>>>,  // Cached AST handle returned by `get_ast`
    pub flavor: MarkdownFlavor,           // Markdown flavor being used
}
389
390impl<'a> LintContext<'a> {
391    pub fn new(content: &'a str, flavor: MarkdownFlavor) -> Self {
392        let mut line_offsets = vec![0];
393        for (i, c) in content.char_indices() {
394            if c == '\n' {
395                line_offsets.push(i + 1);
396            }
397        }
398
399        // Detect code blocks once and cache them
400        let code_blocks = CodeBlockUtils::detect_code_blocks(content);
401
402        // Pre-compute line information (without headings/blockquotes yet)
403        let mut lines = Self::compute_basic_line_info(content, &line_offsets, &code_blocks, flavor);
404
405        // Detect HTML blocks BEFORE heading detection
406        // This ensures HTML content is never considered as markdown
407        Self::detect_html_blocks(&mut lines);
408
409        // Now detect headings and blockquotes, which will skip HTML blocks
410        Self::detect_headings_and_blockquotes(content, &mut lines, flavor);
411
412        // Parse code spans early so we can exclude them from link/image parsing
413        let ast = get_cached_ast(content);
414        let code_spans = Self::parse_code_spans(content, &lines, &ast);
415
416        // Parse links, images, references, and list blocks
417        let links = Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor);
418        let images = Self::parse_images(content, &lines, &code_blocks, &code_spans);
419        let reference_defs = Self::parse_reference_defs(content, &lines);
420        // Use line-by-line list parsing for MD032 compatibility
421        // TODO: Consider using AST-based parsing in the future when MD032 is updated
422        let list_blocks = Self::parse_list_blocks(&lines);
423
424        // Compute character frequency for fast content analysis
425        let char_frequency = Self::compute_char_frequency(content);
426
427        Self {
428            content,
429            line_offsets,
430            code_blocks,
431            lines,
432            links,
433            images,
434            reference_defs,
435            code_spans_cache: Mutex::new(Some(Arc::new(code_spans))),
436            list_blocks,
437            char_frequency,
438            html_tags_cache: Mutex::new(None),
439            emphasis_spans_cache: Mutex::new(None),
440            table_rows_cache: Mutex::new(None),
441            bare_urls_cache: Mutex::new(None),
442            ast_cache: Mutex::new(None),
443            flavor,
444        }
445    }
446
447    /// Get AST - uses global cache for deduplication
448    pub fn get_ast(&self) -> Arc<Node> {
449        let mut cache = self.ast_cache.lock().unwrap();
450
451        if cache.is_none() {
452            // Use global AST cache to avoid duplicate parsing
453            // MarkdownAst is just a type alias for Node, so no conversion needed
454            *cache = Some(get_cached_ast(self.content));
455        }
456
457        cache.as_ref().unwrap().clone()
458    }
459
460    /// Get code spans - computed lazily on first access
461    pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
462        let mut cache = self.code_spans_cache.lock().unwrap();
463
464        // Check if we need to compute code spans
465        if cache.is_none() {
466            let ast = self.get_ast();
467            let code_spans = Self::parse_code_spans(self.content, &self.lines, &ast);
468            *cache = Some(Arc::new(code_spans));
469        }
470
471        // Return a reference to the cached code spans
472        cache.as_ref().unwrap().clone()
473    }
474
475    /// Get HTML tags - computed lazily on first access
476    pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
477        let mut cache = self.html_tags_cache.lock().unwrap();
478
479        if cache.is_none() {
480            let html_tags = Self::parse_html_tags(self.content, &self.lines, &self.code_blocks);
481            *cache = Some(Arc::new(html_tags));
482        }
483
484        cache.as_ref().unwrap().clone()
485    }
486
487    /// Get emphasis spans - computed lazily on first access
488    pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
489        let mut cache = self.emphasis_spans_cache.lock().unwrap();
490
491        if cache.is_none() {
492            let emphasis_spans = Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks);
493            *cache = Some(Arc::new(emphasis_spans));
494        }
495
496        cache.as_ref().unwrap().clone()
497    }
498
499    /// Get table rows - computed lazily on first access
500    pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
501        let mut cache = self.table_rows_cache.lock().unwrap();
502
503        if cache.is_none() {
504            let table_rows = Self::parse_table_rows(&self.lines);
505            *cache = Some(Arc::new(table_rows));
506        }
507
508        cache.as_ref().unwrap().clone()
509    }
510
511    /// Get bare URLs - computed lazily on first access
512    pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
513        let mut cache = self.bare_urls_cache.lock().unwrap();
514
515        if cache.is_none() {
516            let bare_urls = Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks);
517            *cache = Some(Arc::new(bare_urls));
518        }
519
520        cache.as_ref().unwrap().clone()
521    }
522
523    /// Map a byte offset to (line, column)
524    pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
525        match self.line_offsets.binary_search(&offset) {
526            Ok(line) => (line + 1, 1),
527            Err(line) => {
528                let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
529                (line, offset - line_start + 1)
530            }
531        }
532    }
533
534    /// Check if a position is within a code block or code span
535    pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
536        // Check code blocks first
537        if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
538            return true;
539        }
540
541        // Check inline code spans (lazy load if needed)
542        self.code_spans()
543            .iter()
544            .any(|span| pos >= span.byte_offset && pos < span.byte_end)
545    }
546
547    /// Get line information by line number (1-indexed)
548    pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
549        if line_num > 0 {
550            self.lines.get(line_num - 1)
551        } else {
552            None
553        }
554    }
555
556    /// Get byte offset for a line number (1-indexed)
557    pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
558        self.line_info(line_num).map(|info| info.byte_offset)
559    }
560
561    /// Get URL for a reference link/image by its ID
562    pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
563        let normalized_id = ref_id.to_lowercase();
564        self.reference_defs
565            .iter()
566            .find(|def| def.id == normalized_id)
567            .map(|def| def.url.as_str())
568    }
569
570    /// Get links on a specific line
571    pub fn links_on_line(&self, line_num: usize) -> Vec<&ParsedLink> {
572        self.links.iter().filter(|link| link.line == line_num).collect()
573    }
574
575    /// Get images on a specific line
576    pub fn images_on_line(&self, line_num: usize) -> Vec<&ParsedImage> {
577        self.images.iter().filter(|img| img.line == line_num).collect()
578    }
579
580    /// Check if a line is part of a list block
581    pub fn is_in_list_block(&self, line_num: usize) -> bool {
582        self.list_blocks
583            .iter()
584            .any(|block| line_num >= block.start_line && line_num <= block.end_line)
585    }
586
587    /// Get the list block containing a specific line
588    pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
589        self.list_blocks
590            .iter()
591            .find(|block| line_num >= block.start_line && line_num <= block.end_line)
592    }
593
594    // Compatibility methods for DocumentStructure migration
595
596    /// Check if a line is within a code block
597    pub fn is_in_code_block(&self, line_num: usize) -> bool {
598        if line_num == 0 || line_num > self.lines.len() {
599            return false;
600        }
601        self.lines[line_num - 1].in_code_block
602    }
603
604    /// Check if a line is within front matter
605    pub fn is_in_front_matter(&self, line_num: usize) -> bool {
606        if line_num == 0 || line_num > self.lines.len() {
607            return false;
608        }
609        self.lines[line_num - 1].in_front_matter
610    }
611
612    /// Check if a line is within an HTML block
613    pub fn is_in_html_block(&self, line_num: usize) -> bool {
614        if line_num == 0 || line_num > self.lines.len() {
615            return false;
616        }
617        self.lines[line_num - 1].in_html_block
618    }
619
620    /// Check if a line and column is within a code span
621    pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
622        if line_num == 0 || line_num > self.lines.len() {
623            return false;
624        }
625
626        // Use the code spans cache to check
627        // Note: col is 1-indexed from caller, but span.start_col and span.end_col are 0-indexed
628        // Convert col to 0-indexed for comparison
629        let col_0indexed = if col > 0 { col - 1 } else { 0 };
630        let code_spans = self.code_spans();
631        code_spans
632            .iter()
633            .any(|span| span.line == line_num && col_0indexed >= span.start_col && col_0indexed < span.end_col)
634    }
635
636    /// Check if content has any instances of a specific character (fast)
637    pub fn has_char(&self, ch: char) -> bool {
638        match ch {
639            '#' => self.char_frequency.hash_count > 0,
640            '*' => self.char_frequency.asterisk_count > 0,
641            '_' => self.char_frequency.underscore_count > 0,
642            '-' => self.char_frequency.hyphen_count > 0,
643            '+' => self.char_frequency.plus_count > 0,
644            '>' => self.char_frequency.gt_count > 0,
645            '|' => self.char_frequency.pipe_count > 0,
646            '[' => self.char_frequency.bracket_count > 0,
647            '`' => self.char_frequency.backtick_count > 0,
648            '<' => self.char_frequency.lt_count > 0,
649            '!' => self.char_frequency.exclamation_count > 0,
650            '\n' => self.char_frequency.newline_count > 0,
651            _ => self.content.contains(ch), // Fallback for other characters
652        }
653    }
654
655    /// Get count of a specific character (fast)
656    pub fn char_count(&self, ch: char) -> usize {
657        match ch {
658            '#' => self.char_frequency.hash_count,
659            '*' => self.char_frequency.asterisk_count,
660            '_' => self.char_frequency.underscore_count,
661            '-' => self.char_frequency.hyphen_count,
662            '+' => self.char_frequency.plus_count,
663            '>' => self.char_frequency.gt_count,
664            '|' => self.char_frequency.pipe_count,
665            '[' => self.char_frequency.bracket_count,
666            '`' => self.char_frequency.backtick_count,
667            '<' => self.char_frequency.lt_count,
668            '!' => self.char_frequency.exclamation_count,
669            '\n' => self.char_frequency.newline_count,
670            _ => self.content.matches(ch).count(), // Fallback for other characters
671        }
672    }
673
674    /// Check if content likely contains headings (fast)
675    pub fn likely_has_headings(&self) -> bool {
676        self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 // Potential setext underlines
677    }
678
679    /// Check if content likely contains lists (fast)
680    pub fn likely_has_lists(&self) -> bool {
681        self.char_frequency.asterisk_count > 0
682            || self.char_frequency.hyphen_count > 0
683            || self.char_frequency.plus_count > 0
684    }
685
686    /// Check if content likely contains emphasis (fast)
687    pub fn likely_has_emphasis(&self) -> bool {
688        self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
689    }
690
691    /// Check if content likely contains tables (fast)
692    pub fn likely_has_tables(&self) -> bool {
693        self.char_frequency.pipe_count > 2
694    }
695
696    /// Check if content likely contains blockquotes (fast)
697    pub fn likely_has_blockquotes(&self) -> bool {
698        self.char_frequency.gt_count > 0
699    }
700
701    /// Check if content likely contains code (fast)
702    pub fn likely_has_code(&self) -> bool {
703        self.char_frequency.backtick_count > 0
704    }
705
706    /// Check if content likely contains links or images (fast)
707    pub fn likely_has_links_or_images(&self) -> bool {
708        self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
709    }
710
711    /// Check if content likely contains HTML (fast)
712    pub fn likely_has_html(&self) -> bool {
713        self.char_frequency.lt_count > 0
714    }
715
716    /// Get HTML tags on a specific line
717    pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
718        self.html_tags()
719            .iter()
720            .filter(|tag| tag.line == line_num)
721            .cloned()
722            .collect()
723    }
724
725    /// Get emphasis spans on a specific line
726    pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
727        self.emphasis_spans()
728            .iter()
729            .filter(|span| span.line == line_num)
730            .cloned()
731            .collect()
732    }
733
734    /// Get table rows on a specific line
735    pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
736        self.table_rows()
737            .iter()
738            .filter(|row| row.line == line_num)
739            .cloned()
740            .collect()
741    }
742
743    /// Get bare URLs on a specific line
744    pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
745        self.bare_urls()
746            .iter()
747            .filter(|url| url.line == line_num)
748            .cloned()
749            .collect()
750    }
751
752    /// Parse all links in the content
753    fn parse_links(
754        content: &str,
755        lines: &[LineInfo],
756        code_blocks: &[(usize, usize)],
757        code_spans: &[CodeSpan],
758        flavor: MarkdownFlavor,
759    ) -> Vec<ParsedLink> {
760        use crate::utils::skip_context::{is_in_html_comment, is_mkdocs_snippet_line};
761
762        // Pre-size based on a heuristic: most markdown files have relatively few links
763        let mut links = Vec::with_capacity(content.len() / 500); // ~1 link per 500 chars
764
765        // Parse links across the entire content, not line by line
766        for cap in LINK_PATTERN.captures_iter(content) {
767            let full_match = cap.get(0).unwrap();
768            let match_start = full_match.start();
769            let match_end = full_match.end();
770
771            // Skip if the opening bracket is escaped (preceded by \)
772            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
773                continue;
774            }
775
776            // Skip if this is actually an image (preceded by !)
777            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
778                continue;
779            }
780
781            // Skip if in code block
782            if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
783                continue;
784            }
785
786            // Skip if in code span
787            if code_spans
788                .iter()
789                .any(|span| match_start >= span.byte_offset && match_start < span.byte_end)
790            {
791                continue;
792            }
793
794            // Skip if in HTML comment
795            if is_in_html_comment(content, match_start) {
796                continue;
797            }
798
799            // Skip if this link is on a MkDocs snippet line
800            // Find which line this link is on
801            let line_idx = lines
802                .iter()
803                .position(|line| {
804                    match_start >= line.byte_offset && (match_start < line.byte_offset + line.content.len() + 1)
805                })
806                .unwrap_or(0);
807
808            if is_mkdocs_snippet_line(&lines[line_idx].content, flavor) {
809                continue;
810            }
811
812            // Find which line this link starts on
813            let mut line_num = 1;
814            let mut col_start = match_start;
815            for (idx, line_info) in lines.iter().enumerate() {
816                if match_start >= line_info.byte_offset {
817                    line_num = idx + 1;
818                    col_start = match_start - line_info.byte_offset;
819                } else {
820                    break;
821                }
822            }
823
824            // Find which line this link ends on (and calculate column on that line)
825            let mut end_line_num = 1;
826            let mut col_end = match_end;
827            for (idx, line_info) in lines.iter().enumerate() {
828                if match_end > line_info.byte_offset {
829                    end_line_num = idx + 1;
830                    col_end = match_end - line_info.byte_offset;
831                } else {
832                    break;
833                }
834            }
835
836            // For single-line links, use the same approach as before
837            if line_num == end_line_num {
838                // col_end is already correct
839            } else {
840                // For multi-line links, col_end represents the column on the ending line
841                // which is what we want
842            }
843
844            let text = cap.get(1).map_or("", |m| m.as_str()).to_string();
845
846            // URL can be in group 2 (angle brackets) or group 3 (bare)
847            let inline_url = cap.get(2).or_else(|| cap.get(3));
848
849            if let Some(url_match) = inline_url {
850                // Inline link
851                links.push(ParsedLink {
852                    line: line_num,
853                    start_col: col_start,
854                    end_col: col_end,
855                    byte_offset: match_start,
856                    byte_end: match_end,
857                    text,
858                    url: url_match.as_str().to_string(),
859                    is_reference: false,
860                    reference_id: None,
861                });
862            } else if let Some(ref_id) = cap.get(6) {
863                // Reference link
864                let ref_id_str = ref_id.as_str();
865                let normalized_ref = if ref_id_str.is_empty() {
866                    text.to_lowercase() // Implicit reference
867                } else {
868                    ref_id_str.to_lowercase()
869                };
870
871                links.push(ParsedLink {
872                    line: line_num,
873                    start_col: col_start,
874                    end_col: col_end,
875                    byte_offset: match_start,
876                    byte_end: match_end,
877                    text,
878                    url: String::new(), // Will be resolved with reference_defs
879                    is_reference: true,
880                    reference_id: Some(normalized_ref),
881                });
882            }
883        }
884
885        links
886    }
887
888    /// Parse all images in the content
889    fn parse_images(
890        content: &str,
891        lines: &[LineInfo],
892        code_blocks: &[(usize, usize)],
893        code_spans: &[CodeSpan],
894    ) -> Vec<ParsedImage> {
895        use crate::utils::skip_context::is_in_html_comment;
896
897        // Pre-size based on a heuristic: images are less common than links
898        let mut images = Vec::with_capacity(content.len() / 1000); // ~1 image per 1000 chars
899
900        // Parse images across the entire content, not line by line
901        for cap in IMAGE_PATTERN.captures_iter(content) {
902            let full_match = cap.get(0).unwrap();
903            let match_start = full_match.start();
904            let match_end = full_match.end();
905
906            // Skip if the ! is escaped (preceded by \)
907            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
908                continue;
909            }
910
911            // Skip if in code block
912            if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
913                continue;
914            }
915
916            // Skip if in code span
917            if code_spans
918                .iter()
919                .any(|span| match_start >= span.byte_offset && match_start < span.byte_end)
920            {
921                continue;
922            }
923
924            // Skip if in HTML comment
925            if is_in_html_comment(content, match_start) {
926                continue;
927            }
928
929            // Find which line this image starts on
930            let mut line_num = 1;
931            let mut col_start = match_start;
932            for (idx, line_info) in lines.iter().enumerate() {
933                if match_start >= line_info.byte_offset {
934                    line_num = idx + 1;
935                    col_start = match_start - line_info.byte_offset;
936                } else {
937                    break;
938                }
939            }
940
941            // Find which line this image ends on (and calculate column on that line)
942            let mut end_line_num = 1;
943            let mut col_end = match_end;
944            for (idx, line_info) in lines.iter().enumerate() {
945                if match_end > line_info.byte_offset {
946                    end_line_num = idx + 1;
947                    col_end = match_end - line_info.byte_offset;
948                } else {
949                    break;
950                }
951            }
952
953            // For single-line images, use the same approach as before
954            if line_num == end_line_num {
955                // col_end is already correct
956            } else {
957                // For multi-line images, col_end represents the column on the ending line
958                // which is what we want
959            }
960
961            let alt_text = cap.get(1).map_or("", |m| m.as_str()).to_string();
962
963            // URL can be in group 2 (angle brackets) or group 3 (bare)
964            let inline_url = cap.get(2).or_else(|| cap.get(3));
965
966            if let Some(url_match) = inline_url {
967                // Inline image
968                images.push(ParsedImage {
969                    line: line_num,
970                    start_col: col_start,
971                    end_col: col_end,
972                    byte_offset: match_start,
973                    byte_end: match_end,
974                    alt_text,
975                    url: url_match.as_str().to_string(),
976                    is_reference: false,
977                    reference_id: None,
978                });
979            } else if let Some(ref_id) = cap.get(6) {
980                // Reference image
981                let ref_id_str = ref_id.as_str();
982                let normalized_ref = if ref_id_str.is_empty() {
983                    alt_text.to_lowercase() // Implicit reference
984                } else {
985                    ref_id_str.to_lowercase()
986                };
987
988                images.push(ParsedImage {
989                    line: line_num,
990                    start_col: col_start,
991                    end_col: col_end,
992                    byte_offset: match_start,
993                    byte_end: match_end,
994                    alt_text,
995                    url: String::new(), // Will be resolved with reference_defs
996                    is_reference: true,
997                    reference_id: Some(normalized_ref),
998                });
999            }
1000        }
1001
1002        images
1003    }
1004
1005    /// Parse reference definitions
1006    fn parse_reference_defs(_content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
1007        // Pre-size based on lines count as reference definitions are line-based
1008        let mut refs = Vec::with_capacity(lines.len() / 20); // ~1 ref per 20 lines
1009
1010        for (line_idx, line_info) in lines.iter().enumerate() {
1011            // Skip lines in code blocks
1012            if line_info.in_code_block {
1013                continue;
1014            }
1015
1016            let line = &line_info.content;
1017            let line_num = line_idx + 1;
1018
1019            if let Some(cap) = REF_DEF_PATTERN.captures(line) {
1020                let id = cap.get(1).unwrap().as_str().to_lowercase();
1021                let url = cap.get(2).unwrap().as_str().to_string();
1022                let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
1023
1024                refs.push(ReferenceDef {
1025                    line: line_num,
1026                    id,
1027                    url,
1028                    title,
1029                });
1030            }
1031        }
1032
1033        refs
1034    }
1035
1036    /// Pre-compute basic line information (without headings/blockquotes)
1037    fn compute_basic_line_info(
1038        content: &str,
1039        line_offsets: &[usize],
1040        code_blocks: &[(usize, usize)],
1041        flavor: MarkdownFlavor,
1042    ) -> Vec<LineInfo> {
1043        lazy_static! {
1044            // Regex for list detection - allow any whitespace including no space (to catch malformed lists)
1045            static ref UNORDERED_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)([-*+])([ \t]*)(.*)").unwrap();
1046            static ref ORDERED_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(\d+)([.)])([ \t]*)(.*)").unwrap();
1047
1048            // Regex for blockquote prefix (used for blank line detection in blockquotes)
1049            static ref BLOCKQUOTE_REGEX: regex::Regex = regex::Regex::new(r"^(\s*>\s*)(.*)").unwrap();
1050        }
1051
1052        let content_lines: Vec<&str> = content.lines().collect();
1053        let mut lines = Vec::with_capacity(content_lines.len());
1054
1055        // Detect front matter boundaries FIRST, before any other parsing
1056        // Use FrontMatterUtils to detect all types of front matter (YAML, TOML, JSON, malformed)
1057        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1058
1059        for (i, line) in content_lines.iter().enumerate() {
1060            let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
1061            let indent = line.len() - line.trim_start().len();
1062            // For blank detection, consider blockquote context
1063            let is_blank = if let Some(caps) = BLOCKQUOTE_REGEX.captures(line) {
1064                // In blockquote context, check if content after prefix is blank
1065                let after_prefix = caps.get(2).map_or("", |m| m.as_str());
1066                after_prefix.trim().is_empty()
1067            } else {
1068                line.trim().is_empty()
1069            };
1070            // Check if this line is inside a code block (not inline code span)
1071            // We only want to check for fenced/indented code blocks, not inline code
1072            let in_code_block = code_blocks.iter().any(|&(start, end)| {
1073                // Only consider ranges that span multiple lines (code blocks)
1074                // Inline code spans are typically on a single line
1075
1076                // Ensure we're at valid UTF-8 boundaries
1077                let safe_start = if start > 0 && !content.is_char_boundary(start) {
1078                    // Find the nearest valid boundary before start
1079                    let mut boundary = start;
1080                    while boundary > 0 && !content.is_char_boundary(boundary) {
1081                        boundary -= 1;
1082                    }
1083                    boundary
1084                } else {
1085                    start
1086                };
1087
1088                let safe_end = if end < content.len() && !content.is_char_boundary(end) {
1089                    // Find the nearest valid boundary after end
1090                    let mut boundary = end;
1091                    while boundary < content.len() && !content.is_char_boundary(boundary) {
1092                        boundary += 1;
1093                    }
1094                    boundary
1095                } else {
1096                    end.min(content.len())
1097                };
1098
1099                let block_content = &content[safe_start..safe_end];
1100                let is_multiline = block_content.contains('\n');
1101                let is_fenced = block_content.starts_with("```") || block_content.starts_with("~~~");
1102                let is_indented = !is_fenced
1103                    && block_content
1104                        .lines()
1105                        .all(|l| l.starts_with("    ") || l.starts_with("\t") || l.trim().is_empty());
1106
1107                byte_offset >= start && byte_offset < end && (is_multiline || is_fenced || is_indented)
1108            });
1109
1110            // Detect list items (skip if in frontmatter, in mkdocstrings block, or in HTML comment)
1111            let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
1112                && crate::utils::mkdocstrings_refs::is_within_autodoc_block(content, byte_offset);
1113            let in_html_comment = crate::utils::skip_context::is_in_html_comment(content, byte_offset);
1114            let list_item = if !(in_code_block
1115                || is_blank
1116                || in_mkdocstrings
1117                || in_html_comment
1118                || (front_matter_end > 0 && i < front_matter_end))
1119            {
1120                // Strip blockquote prefix if present for list detection
1121                let (line_for_list_check, blockquote_prefix_len) = if let Some(caps) = BLOCKQUOTE_REGEX.captures(line) {
1122                    let prefix = caps.get(1).unwrap().as_str();
1123                    let content = caps.get(2).unwrap().as_str();
1124                    (content, prefix.len())
1125                } else {
1126                    (&**line, 0)
1127                };
1128
1129                if let Some(caps) = UNORDERED_REGEX.captures(line_for_list_check) {
1130                    let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1131                    let marker = caps.get(2).map_or("", |m| m.as_str());
1132                    let spacing = caps.get(3).map_or("", |m| m.as_str());
1133                    let _content = caps.get(4).map_or("", |m| m.as_str());
1134                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1135                    let content_column = marker_column + marker.len() + spacing.len();
1136
1137                    // According to CommonMark spec, unordered list items MUST have at least one space
1138                    // after the marker (-, *, or +). Without a space, it's not a list item.
1139                    // This also naturally handles cases like:
1140                    // - *emphasis* (not a list)
1141                    // - **bold** (not a list)
1142                    // - --- (horizontal rule, not a list)
1143                    if spacing.is_empty() {
1144                        None
1145                    } else {
1146                        Some(ListItemInfo {
1147                            marker: marker.to_string(),
1148                            is_ordered: false,
1149                            number: None,
1150                            marker_column,
1151                            content_column,
1152                        })
1153                    }
1154                } else if let Some(caps) = ORDERED_REGEX.captures(line_for_list_check) {
1155                    let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1156                    let number_str = caps.get(2).map_or("", |m| m.as_str());
1157                    let delimiter = caps.get(3).map_or("", |m| m.as_str());
1158                    let spacing = caps.get(4).map_or("", |m| m.as_str());
1159                    let _content = caps.get(5).map_or("", |m| m.as_str());
1160                    let marker = format!("{number_str}{delimiter}");
1161                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1162                    let content_column = marker_column + marker.len() + spacing.len();
1163
1164                    // According to CommonMark spec, ordered list items MUST have at least one space
1165                    // after the marker (period or parenthesis). Without a space, it's not a list item.
1166                    if spacing.is_empty() {
1167                        None
1168                    } else {
1169                        Some(ListItemInfo {
1170                            marker,
1171                            is_ordered: true,
1172                            number: number_str.parse().ok(),
1173                            marker_column,
1174                            content_column,
1175                        })
1176                    }
1177                } else {
1178                    None
1179                }
1180            } else {
1181                None
1182            };
1183
1184            lines.push(LineInfo {
1185                content: line.to_string(),
1186                byte_offset,
1187                indent,
1188                is_blank,
1189                in_code_block,
1190                in_front_matter: front_matter_end > 0 && i < front_matter_end,
1191                in_html_block: false, // Will be populated after line creation
1192                in_html_comment,
1193                list_item,
1194                heading: None,    // Will be populated in second pass for Setext headings
1195                blockquote: None, // Will be populated after line creation
1196                in_mkdocstrings,
1197            });
1198        }
1199
1200        lines
1201    }
1202
1203    /// Detect headings and blockquotes (called after HTML block detection)
1204    fn detect_headings_and_blockquotes(content: &str, lines: &mut [LineInfo], flavor: MarkdownFlavor) {
1205        lazy_static! {
1206            // Regex for blockquote prefix
1207            static ref BLOCKQUOTE_REGEX_FULL: regex::Regex = regex::Regex::new(r"^(\s*)(>+)(\s*)(.*)$").unwrap();
1208
1209            // Regex for heading detection
1210            static ref ATX_HEADING_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap();
1211            static ref SETEXT_UNDERLINE_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap();
1212        }
1213
1214        let content_lines: Vec<&str> = content.lines().collect();
1215
1216        // Detect front matter boundaries to skip those lines
1217        let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
1218
1219        // Detect headings (including Setext which needs look-ahead) and blockquotes
1220        for i in 0..lines.len() {
1221            if lines[i].in_code_block {
1222                continue;
1223            }
1224
1225            // Skip lines in front matter
1226            if front_matter_end > 0 && i < front_matter_end {
1227                continue;
1228            }
1229
1230            // Skip lines in HTML blocks - HTML content should not be parsed as markdown
1231            if lines[i].in_html_block {
1232                continue;
1233            }
1234
1235            let line = content_lines[i];
1236
1237            // Check for blockquotes (even on blank lines within blockquotes)
1238            if let Some(caps) = BLOCKQUOTE_REGEX_FULL.captures(line) {
1239                let indent_str = caps.get(1).map_or("", |m| m.as_str());
1240                let markers = caps.get(2).map_or("", |m| m.as_str());
1241                let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1242                let content = caps.get(4).map_or("", |m| m.as_str());
1243
1244                let nesting_level = markers.chars().filter(|&c| c == '>').count();
1245                let marker_column = indent_str.len();
1246
1247                // Build the prefix (indentation + markers + space)
1248                let prefix = format!("{indent_str}{markers}{spaces_after}");
1249
1250                // Check for various blockquote issues
1251                let has_no_space = spaces_after.is_empty() && !content.is_empty();
1252                // Consider tabs as multiple spaces, or actual multiple spaces
1253                let has_multiple_spaces = spaces_after.len() > 1 || spaces_after.contains('\t');
1254
1255                // Check if needs MD028 fix (empty blockquote line without proper spacing)
1256                // MD028 flags empty blockquote lines that don't have a single space after the marker
1257                // Lines like "> " or ">> " are already correct and don't need fixing
1258                let needs_md028_fix = content.is_empty() && spaces_after.is_empty();
1259
1260                lines[i].blockquote = Some(BlockquoteInfo {
1261                    nesting_level,
1262                    indent: indent_str.to_string(),
1263                    marker_column,
1264                    prefix,
1265                    content: content.to_string(),
1266                    has_no_space_after_marker: has_no_space,
1267                    has_multiple_spaces_after_marker: has_multiple_spaces,
1268                    needs_md028_fix,
1269                });
1270            }
1271
1272            // Skip heading detection for blank lines
1273            if lines[i].is_blank {
1274                continue;
1275            }
1276
1277            // Check for ATX headings (but skip MkDocs snippet lines)
1278            // In MkDocs flavor, lines like "# -8<- [start:name]" are snippet markers, not headings
1279            let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
1280                crate::utils::mkdocs_snippets::is_snippet_section_start(line)
1281                    || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
1282            } else {
1283                false
1284            };
1285
1286            if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1287                // Skip headings inside HTML comments
1288                if crate::utils::skip_context::is_in_html_comment(content, lines[i].byte_offset) {
1289                    continue;
1290                }
1291                let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1292                let hashes = caps.get(2).map_or("", |m| m.as_str());
1293                let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1294                let rest = caps.get(4).map_or("", |m| m.as_str());
1295
1296                let level = hashes.len() as u8;
1297                let marker_column = leading_spaces.len();
1298
1299                // Check for closing sequence, but handle custom IDs that might come after
1300                let (text, has_closing, closing_seq) = {
1301                    // First check if there's a custom ID at the end
1302                    let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1303                        // Check if this looks like a valid custom ID (ends with })
1304                        if rest[id_start..].trim_end().ends_with('}') {
1305                            // Split off the custom ID
1306                            (&rest[..id_start], &rest[id_start..])
1307                        } else {
1308                            (rest, "")
1309                        }
1310                    } else {
1311                        (rest, "")
1312                    };
1313
1314                    // Now look for closing hashes in the part before the custom ID
1315                    let trimmed_rest = rest_without_id.trim_end();
1316                    if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1317                        // Look for the start of the hash sequence
1318                        let mut start_of_hashes = last_hash_pos;
1319                        while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1320                            start_of_hashes -= 1;
1321                        }
1322
1323                        // Check if there's at least one space before the closing hashes
1324                        let has_space_before = start_of_hashes == 0
1325                            || trimmed_rest
1326                                .chars()
1327                                .nth(start_of_hashes - 1)
1328                                .is_some_and(|c| c.is_whitespace());
1329
1330                        // Check if this is a valid closing sequence (all hashes to end of trimmed part)
1331                        let potential_closing = &trimmed_rest[start_of_hashes..];
1332                        let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1333
1334                        if is_all_hashes && has_space_before {
1335                            // This is a closing sequence
1336                            let closing_hashes = potential_closing.to_string();
1337                            // The text is everything before the closing hashes
1338                            // Don't include the custom ID here - it will be extracted later
1339                            let text_part = if !custom_id_part.is_empty() {
1340                                // If we have a custom ID, append it back to get the full rest
1341                                // This allows the extract_header_id function to handle it properly
1342                                format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1343                            } else {
1344                                rest_without_id[..start_of_hashes].trim_end().to_string()
1345                            };
1346                            (text_part, true, closing_hashes)
1347                        } else {
1348                            // Not a valid closing sequence, return the full content
1349                            (rest.to_string(), false, String::new())
1350                        }
1351                    } else {
1352                        // No hashes found, return the full content
1353                        (rest.to_string(), false, String::new())
1354                    }
1355                };
1356
1357                let content_column = marker_column + hashes.len() + spaces_after.len();
1358
1359                // Extract custom header ID if present
1360                let raw_text = text.trim().to_string();
1361                let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1362
1363                // If no custom ID was found on the header line, check the next line for standalone attr-list
1364                if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1365                    let next_line = content_lines[i + 1];
1366                    if !lines[i + 1].in_code_block
1367                        && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1368                        && let Some(next_line_id) =
1369                            crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1370                    {
1371                        custom_id = Some(next_line_id);
1372                    }
1373                }
1374
1375                lines[i].heading = Some(HeadingInfo {
1376                    level,
1377                    style: HeadingStyle::ATX,
1378                    marker: hashes.to_string(),
1379                    marker_column,
1380                    content_column,
1381                    text: clean_text,
1382                    custom_id,
1383                    raw_text,
1384                    has_closing_sequence: has_closing,
1385                    closing_sequence: closing_seq,
1386                });
1387            }
1388            // Check for Setext headings (need to look at next line)
1389            else if i + 1 < content_lines.len() && i + 1 < lines.len() {
1390                let next_line = content_lines[i + 1];
1391                if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
1392                    // Skip if next line is front matter delimiter
1393                    if front_matter_end > 0 && i < front_matter_end {
1394                        continue;
1395                    }
1396
1397                    // Skip Setext headings inside HTML comments
1398                    if crate::utils::skip_context::is_in_html_comment(content, lines[i].byte_offset) {
1399                        continue;
1400                    }
1401
1402                    let underline = next_line.trim();
1403
1404                    // Skip if the underline looks like YAML delimiter (exactly 3 or more dashes)
1405                    // YAML uses exactly `---` while Setext headings typically use longer underlines
1406                    if underline == "---" {
1407                        continue;
1408                    }
1409
1410                    // Skip if the current line looks like YAML key-value syntax
1411                    let current_line_trimmed = line.trim();
1412                    if current_line_trimmed.contains(':')
1413                        && !current_line_trimmed.starts_with('#')
1414                        && !current_line_trimmed.contains('[')
1415                        && !current_line_trimmed.contains("](")
1416                    {
1417                        // This looks like "key: value" which suggests YAML, not a heading
1418                        continue;
1419                    }
1420
1421                    let level = if underline.starts_with('=') { 1 } else { 2 };
1422                    let style = if level == 1 {
1423                        HeadingStyle::Setext1
1424                    } else {
1425                        HeadingStyle::Setext2
1426                    };
1427
1428                    // Extract custom header ID if present
1429                    let raw_text = line.trim().to_string();
1430                    let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1431
1432                    // If no custom ID was found on the header line, check the line after underline for standalone attr-list
1433                    if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
1434                        let attr_line = content_lines[i + 2];
1435                        if !lines[i + 2].in_code_block
1436                            && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
1437                            && let Some(attr_line_id) =
1438                                crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
1439                        {
1440                            custom_id = Some(attr_line_id);
1441                        }
1442                    }
1443
1444                    lines[i].heading = Some(HeadingInfo {
1445                        level,
1446                        style,
1447                        marker: underline.to_string(),
1448                        marker_column: next_line.len() - next_line.trim_start().len(),
1449                        content_column: lines[i].indent,
1450                        text: clean_text,
1451                        custom_id,
1452                        raw_text,
1453                        has_closing_sequence: false,
1454                        closing_sequence: String::new(),
1455                    });
1456                }
1457            }
1458        }
1459    }
1460
1461    /// Detect HTML blocks in the content
1462    fn detect_html_blocks(lines: &mut [LineInfo]) {
1463        // HTML block elements that trigger block context
1464        const BLOCK_ELEMENTS: &[&str] = &[
1465            "address",
1466            "article",
1467            "aside",
1468            "blockquote",
1469            "details",
1470            "dialog",
1471            "dd",
1472            "div",
1473            "dl",
1474            "dt",
1475            "fieldset",
1476            "figcaption",
1477            "figure",
1478            "footer",
1479            "form",
1480            "h1",
1481            "h2",
1482            "h3",
1483            "h4",
1484            "h5",
1485            "h6",
1486            "header",
1487            "hr",
1488            "li",
1489            "main",
1490            "nav",
1491            "ol",
1492            "p",
1493            "pre",
1494            "script",
1495            "section",
1496            "style",
1497            "table",
1498            "tbody",
1499            "td",
1500            "tfoot",
1501            "th",
1502            "thead",
1503            "tr",
1504            "ul",
1505        ];
1506
1507        let mut i = 0;
1508        while i < lines.len() {
1509            // Skip if already in code block or front matter
1510            if lines[i].in_code_block || lines[i].in_front_matter {
1511                i += 1;
1512                continue;
1513            }
1514
1515            let trimmed = lines[i].content.trim_start();
1516
1517            // Check if line starts with an HTML tag
1518            if trimmed.starts_with('<') && trimmed.len() > 1 {
1519                // Extract tag name safely
1520                let after_bracket = &trimmed[1..];
1521                let is_closing = after_bracket.starts_with('/');
1522                let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
1523
1524                // Extract tag name (stop at space, >, /, or end of string)
1525                let tag_name = tag_start
1526                    .chars()
1527                    .take_while(|c| c.is_ascii_alphabetic() || *c == '-')
1528                    .collect::<String>()
1529                    .to_lowercase();
1530
1531                // Check if it's a block element
1532                if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
1533                    // Mark this line as in HTML block
1534                    lines[i].in_html_block = true;
1535
1536                    // For simplicity, just mark lines until we find a closing tag or reach a blank line
1537                    // This avoids complex nesting logic that might cause infinite loops
1538                    if !is_closing {
1539                        let closing_tag = format!("</{tag_name}>");
1540                        // style and script tags can contain blank lines (CSS/JS formatting)
1541                        let allow_blank_lines = tag_name == "style" || tag_name == "script";
1542                        let mut j = i + 1;
1543                        while j < lines.len() && j < i + 100 {
1544                            // Limit search to 100 lines
1545                            // Stop at blank lines (except for style/script tags)
1546                            if !allow_blank_lines && lines[j].is_blank {
1547                                break;
1548                            }
1549
1550                            lines[j].in_html_block = true;
1551
1552                            // Check if this line contains the closing tag
1553                            if lines[j].content.contains(&closing_tag) {
1554                                break;
1555                            }
1556                            j += 1;
1557                        }
1558                    }
1559                }
1560            }
1561
1562            i += 1;
1563        }
1564    }
1565
1566    /// Parse all inline code spans in the content using AST
1567    fn parse_code_spans(content: &str, lines: &[LineInfo], ast: &Node) -> Vec<CodeSpan> {
1568        let mut code_spans = Vec::new();
1569
1570        // Quick check - if no backticks, no code spans
1571        if !content.contains('`') {
1572            return code_spans;
1573        }
1574
1575        // Helper function to recursively extract inline code spans from AST nodes
1576        fn extract_code_spans(node: &Node, content: &str, lines: &[LineInfo], spans: &mut Vec<CodeSpan>) {
1577            match node {
1578                Node::InlineCode(inline_code) => {
1579                    if let Some(pos) = &inline_code.position {
1580                        let start_pos = pos.start.offset;
1581                        let end_pos = pos.end.offset;
1582
1583                        // The position includes the backticks, extract the actual content
1584                        let full_span = &content[start_pos..end_pos];
1585                        let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
1586
1587                        // Extract content between backticks, preserving spaces
1588                        let content_start = start_pos + backtick_count;
1589                        let content_end = end_pos - backtick_count;
1590                        let span_content = if content_start < content_end {
1591                            content[content_start..content_end].to_string()
1592                        } else {
1593                            String::new()
1594                        };
1595
1596                        // Find which line this code span starts on
1597                        let mut line_num = 1;
1598                        let mut col_start = start_pos;
1599                        for (idx, line_info) in lines.iter().enumerate() {
1600                            if start_pos >= line_info.byte_offset {
1601                                line_num = idx + 1;
1602                                col_start = start_pos - line_info.byte_offset;
1603                            } else {
1604                                break;
1605                            }
1606                        }
1607
1608                        // Find end column
1609                        let mut col_end = end_pos;
1610                        for line_info in lines.iter() {
1611                            if end_pos > line_info.byte_offset {
1612                                col_end = end_pos - line_info.byte_offset;
1613                            } else {
1614                                break;
1615                            }
1616                        }
1617
1618                        spans.push(CodeSpan {
1619                            line: line_num,
1620                            start_col: col_start,
1621                            end_col: col_end,
1622                            byte_offset: start_pos,
1623                            byte_end: end_pos,
1624                            backtick_count,
1625                            content: span_content,
1626                        });
1627                    }
1628                }
1629                // Recursively process children
1630                Node::Root(root) => {
1631                    for child in &root.children {
1632                        extract_code_spans(child, content, lines, spans);
1633                    }
1634                }
1635                Node::Paragraph(para) => {
1636                    for child in &para.children {
1637                        extract_code_spans(child, content, lines, spans);
1638                    }
1639                }
1640                Node::Heading(heading) => {
1641                    for child in &heading.children {
1642                        extract_code_spans(child, content, lines, spans);
1643                    }
1644                }
1645                Node::List(list) => {
1646                    for child in &list.children {
1647                        extract_code_spans(child, content, lines, spans);
1648                    }
1649                }
1650                Node::ListItem(item) => {
1651                    for child in &item.children {
1652                        extract_code_spans(child, content, lines, spans);
1653                    }
1654                }
1655                Node::Blockquote(blockquote) => {
1656                    for child in &blockquote.children {
1657                        extract_code_spans(child, content, lines, spans);
1658                    }
1659                }
1660                Node::Table(table) => {
1661                    for child in &table.children {
1662                        extract_code_spans(child, content, lines, spans);
1663                    }
1664                }
1665                Node::TableRow(row) => {
1666                    for child in &row.children {
1667                        extract_code_spans(child, content, lines, spans);
1668                    }
1669                }
1670                Node::TableCell(cell) => {
1671                    for child in &cell.children {
1672                        extract_code_spans(child, content, lines, spans);
1673                    }
1674                }
1675                Node::Emphasis(emphasis) => {
1676                    for child in &emphasis.children {
1677                        extract_code_spans(child, content, lines, spans);
1678                    }
1679                }
1680                Node::Strong(strong) => {
1681                    for child in &strong.children {
1682                        extract_code_spans(child, content, lines, spans);
1683                    }
1684                }
1685                Node::Link(link) => {
1686                    for child in &link.children {
1687                        extract_code_spans(child, content, lines, spans);
1688                    }
1689                }
1690                Node::LinkReference(link_ref) => {
1691                    for child in &link_ref.children {
1692                        extract_code_spans(child, content, lines, spans);
1693                    }
1694                }
1695                Node::FootnoteDefinition(footnote) => {
1696                    for child in &footnote.children {
1697                        extract_code_spans(child, content, lines, spans);
1698                    }
1699                }
1700                Node::Delete(delete) => {
1701                    for child in &delete.children {
1702                        extract_code_spans(child, content, lines, spans);
1703                    }
1704                }
1705                // Terminal nodes or nodes without relevant children
1706                Node::Code(_)
1707                | Node::Text(_)
1708                | Node::Html(_)
1709                | Node::Image(_)
1710                | Node::ImageReference(_)
1711                | Node::FootnoteReference(_)
1712                | Node::Break(_)
1713                | Node::ThematicBreak(_)
1714                | Node::Definition(_)
1715                | Node::Yaml(_)
1716                | Node::Toml(_)
1717                | Node::Math(_)
1718                | Node::InlineMath(_)
1719                | Node::MdxJsxFlowElement(_)
1720                | Node::MdxFlowExpression(_)
1721                | Node::MdxJsxTextElement(_)
1722                | Node::MdxTextExpression(_)
1723                | Node::MdxjsEsm(_) => {
1724                    // No children to process or not relevant for code spans
1725                }
1726            }
1727        }
1728
1729        // Extract all code spans from the AST
1730        extract_code_spans(ast, content, lines, &mut code_spans);
1731
1732        // Sort by position to ensure consistent ordering
1733        code_spans.sort_by_key(|span| span.byte_offset);
1734
1735        code_spans
1736    }
1737
1738    /// Parse all list blocks in the content (legacy line-by-line approach)
1739    fn parse_list_blocks(lines: &[LineInfo]) -> Vec<ListBlock> {
1740        // Pre-size based on lines that could be list items
1741        let mut list_blocks = Vec::with_capacity(lines.len() / 10); // Estimate ~10% of lines might start list blocks
1742        let mut current_block: Option<ListBlock> = None;
1743        let mut last_list_item_line = 0;
1744        let mut current_indent_level = 0;
1745        let mut last_marker_width = 0;
1746
1747        for (line_idx, line_info) in lines.iter().enumerate() {
1748            let line_num = line_idx + 1;
1749
1750            // Enhanced code block handling using Design #3's context analysis
1751            if line_info.in_code_block {
1752                if let Some(ref mut block) = current_block {
1753                    // Calculate minimum indentation for list continuation
1754                    let min_continuation_indent = CodeBlockUtils::calculate_min_continuation_indent(lines, line_idx);
1755
1756                    // Analyze code block context using the three-tier classification
1757                    let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
1758
1759                    match context {
1760                        CodeBlockContext::Indented => {
1761                            // Code block is properly indented - continues the list
1762                            block.end_line = line_num;
1763                            continue;
1764                        }
1765                        CodeBlockContext::Standalone => {
1766                            // Code block separates lists - end current block
1767                            let completed_block = current_block.take().unwrap();
1768                            list_blocks.push(completed_block);
1769                            continue;
1770                        }
1771                        CodeBlockContext::Adjacent => {
1772                            // Edge case - use conservative behavior (continue list)
1773                            block.end_line = line_num;
1774                            continue;
1775                        }
1776                    }
1777                } else {
1778                    // No current list block - skip code block lines
1779                    continue;
1780                }
1781            }
1782
1783            // Extract blockquote prefix if any
1784            let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(&line_info.content) {
1785                caps.get(0).unwrap().as_str().to_string()
1786            } else {
1787                String::new()
1788            };
1789
1790            // Check if this line is a list item
1791            if let Some(list_item) = &line_info.list_item {
1792                // Calculate nesting level based on indentation
1793                let item_indent = list_item.marker_column;
1794                let nesting = item_indent / 2; // Assume 2-space indentation for nesting
1795
1796                if let Some(ref mut block) = current_block {
1797                    // Check if this continues the current block
1798                    // For nested lists, we need to check if this is a nested item (higher nesting level)
1799                    // or a continuation at the same or lower level
1800                    let is_nested = nesting > block.nesting_level;
1801                    let same_type =
1802                        (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
1803                    let same_context = block.blockquote_prefix == blockquote_prefix;
1804                    let reasonable_distance = line_num <= last_list_item_line + 2; // Allow one blank line
1805
1806                    // For unordered lists, also check marker consistency
1807                    let marker_compatible =
1808                        block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
1809
1810                    // Check if there's non-list content between the last item and this one
1811                    let has_non_list_content = {
1812                        let mut found_non_list = false;
1813                        // Use the last item from the current block, not the global last_list_item_line
1814                        let block_last_item_line = block.item_lines.last().copied().unwrap_or(block.end_line);
1815
1816                        // Debug: Special check for problematic line
1817                        if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1818                            let last_line = &lines[block_last_item_line - 1];
1819                            if last_line.content.contains(r"`sqlalchemy`") && last_line.content.contains(r"\`") {
1820                                log::debug!(
1821                                    "After problematic line {}: checking lines {} to {} for non-list content",
1822                                    block_last_item_line,
1823                                    block_last_item_line + 1,
1824                                    line_num
1825                                );
1826                                // If they're consecutive list items, there's no content between
1827                                if line_num == block_last_item_line + 1 {
1828                                    log::debug!("Lines are consecutive, no content between");
1829                                }
1830                            }
1831                        }
1832
1833                        for check_line in (block_last_item_line + 1)..line_num {
1834                            let check_idx = check_line - 1;
1835                            if check_idx < lines.len() {
1836                                let check_info = &lines[check_idx];
1837                                // Check for content that breaks the list
1838                                let is_list_breaking_content = if check_info.in_code_block {
1839                                    // Use enhanced code block classification for list separation
1840                                    let last_item_marker_width =
1841                                        if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1842                                            lines[block_last_item_line - 1]
1843                                                .list_item
1844                                                .as_ref()
1845                                                .map(|li| {
1846                                                    if li.is_ordered {
1847                                                        li.marker.len() + 1 // Add 1 for the space after ordered list markers
1848                                                    } else {
1849                                                        li.marker.len()
1850                                                    }
1851                                                })
1852                                                .unwrap_or(3) // fallback to 3 if no list item found
1853                                        } else {
1854                                            3 // fallback
1855                                        };
1856
1857                                    let min_continuation = if block.is_ordered { last_item_marker_width } else { 2 };
1858
1859                                    // Analyze code block context using our enhanced classification
1860                                    let context = CodeBlockUtils::analyze_code_block_context(
1861                                        lines,
1862                                        check_line - 1,
1863                                        min_continuation,
1864                                    );
1865
1866                                    // Standalone code blocks break lists, indented ones continue them
1867                                    matches!(context, CodeBlockContext::Standalone)
1868                                } else if !check_info.is_blank && check_info.list_item.is_none() {
1869                                    // Check for structural separators that should break lists (from issue #42)
1870                                    let line_content = check_info.content.trim();
1871
1872                                    // Any of these structural separators break lists
1873                                    if check_info.heading.is_some()
1874                                        || line_content.starts_with("---")
1875                                        || line_content.starts_with("***")
1876                                        || line_content.starts_with("___")
1877                                        || (line_content.contains('|')
1878                                            && !line_content.contains("](")
1879                                            && !line_content.contains("http")
1880                                            && (line_content.matches('|').count() > 1
1881                                                || line_content.starts_with('|')
1882                                                || line_content.ends_with('|')))
1883                                        || line_content.starts_with(">")
1884                                    {
1885                                        true
1886                                    }
1887                                    // Other non-list content - check if properly indented
1888                                    else {
1889                                        let last_item_marker_width =
1890                                            if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1891                                                lines[block_last_item_line - 1]
1892                                                    .list_item
1893                                                    .as_ref()
1894                                                    .map(|li| {
1895                                                        if li.is_ordered {
1896                                                            li.marker.len() + 1 // Add 1 for the space after ordered list markers
1897                                                        } else {
1898                                                            li.marker.len()
1899                                                        }
1900                                                    })
1901                                                    .unwrap_or(3) // fallback to 3 if no list item found
1902                                            } else {
1903                                                3 // fallback
1904                                            };
1905
1906                                        let min_continuation =
1907                                            if block.is_ordered { last_item_marker_width } else { 2 };
1908                                        check_info.indent < min_continuation
1909                                    }
1910                                } else {
1911                                    false
1912                                };
1913
1914                                if is_list_breaking_content {
1915                                    // Not indented enough, so it breaks the list
1916                                    found_non_list = true;
1917                                    break;
1918                                }
1919                            }
1920                        }
1921                        found_non_list
1922                    };
1923
1924                    // A list continues if:
1925                    // 1. It's a nested item (indented more than the parent), OR
1926                    // 2. It's the same type at the same level with reasonable distance
1927                    let mut continues_list = if is_nested {
1928                        // Nested items always continue the list if they're in the same context
1929                        same_context && reasonable_distance && !has_non_list_content
1930                    } else {
1931                        // Same-level items need to match type and markers
1932                        let result = same_type
1933                            && same_context
1934                            && reasonable_distance
1935                            && marker_compatible
1936                            && !has_non_list_content;
1937
1938                        // Debug logging for lines after problematic content
1939                        if block.item_lines.last().is_some_and(|&last_line| {
1940                            last_line > 0
1941                                && last_line <= lines.len()
1942                                && lines[last_line - 1].content.contains(r"`sqlalchemy`")
1943                                && lines[last_line - 1].content.contains(r"\`")
1944                        }) {
1945                            log::debug!(
1946                                "List continuation check after problematic line at line {line_num}: same_type={same_type}, same_context={same_context}, reasonable_distance={reasonable_distance}, marker_compatible={marker_compatible}, has_non_list_content={has_non_list_content}, continues={result}"
1947                            );
1948                            if line_num > 0 && line_num <= lines.len() {
1949                                log::debug!("Current line content: {:?}", lines[line_num - 1].content);
1950                            }
1951                        }
1952
1953                        result
1954                    };
1955
1956                    // WORKAROUND: If items are truly consecutive (no blank lines), they MUST be in the same list
1957                    // This handles edge cases where content patterns might otherwise split lists incorrectly
1958                    if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
1959                        // Check if the previous line was a list item
1960                        if block.item_lines.contains(&(line_num - 1)) {
1961                            // They're consecutive list items - force them to be in the same list
1962                            continues_list = true;
1963                        }
1964                    }
1965
1966                    if continues_list {
1967                        // Extend current block
1968                        block.end_line = line_num;
1969                        block.item_lines.push(line_num);
1970
1971                        // Update max marker width
1972                        block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
1973                            list_item.marker.len() + 1
1974                        } else {
1975                            list_item.marker.len()
1976                        });
1977
1978                        // Update marker consistency for unordered lists
1979                        if !block.is_ordered
1980                            && block.marker.is_some()
1981                            && block.marker.as_ref() != Some(&list_item.marker)
1982                        {
1983                            // Mixed markers, clear the marker field
1984                            block.marker = None;
1985                        }
1986                    } else {
1987                        // End current block and start a new one
1988
1989                        list_blocks.push(block.clone());
1990
1991                        *block = ListBlock {
1992                            start_line: line_num,
1993                            end_line: line_num,
1994                            is_ordered: list_item.is_ordered,
1995                            marker: if list_item.is_ordered {
1996                                None
1997                            } else {
1998                                Some(list_item.marker.clone())
1999                            },
2000                            blockquote_prefix: blockquote_prefix.clone(),
2001                            item_lines: vec![line_num],
2002                            nesting_level: nesting,
2003                            max_marker_width: if list_item.is_ordered {
2004                                list_item.marker.len() + 1
2005                            } else {
2006                                list_item.marker.len()
2007                            },
2008                        };
2009                    }
2010                } else {
2011                    // Start a new block
2012                    current_block = Some(ListBlock {
2013                        start_line: line_num,
2014                        end_line: line_num,
2015                        is_ordered: list_item.is_ordered,
2016                        marker: if list_item.is_ordered {
2017                            None
2018                        } else {
2019                            Some(list_item.marker.clone())
2020                        },
2021                        blockquote_prefix,
2022                        item_lines: vec![line_num],
2023                        nesting_level: nesting,
2024                        max_marker_width: list_item.marker.len(),
2025                    });
2026                }
2027
2028                last_list_item_line = line_num;
2029                current_indent_level = item_indent;
2030                last_marker_width = if list_item.is_ordered {
2031                    list_item.marker.len() + 1 // Add 1 for the space after ordered list markers
2032                } else {
2033                    list_item.marker.len()
2034                };
2035            } else if let Some(ref mut block) = current_block {
2036                // Not a list item - check if it continues the current block
2037
2038                // For MD032 compatibility, we use a simple approach:
2039                // - Indented lines continue the list
2040                // - Blank lines followed by indented content continue the list
2041                // - Everything else ends the list
2042
2043                // Check if the last line in the list block ended with a backslash (hard line break)
2044                // This handles cases where list items use backslash for hard line breaks
2045                let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
2046                    lines[block.end_line - 1].content.trim_end().ends_with('\\')
2047                } else {
2048                    false
2049                };
2050
2051                // Calculate minimum indentation for list continuation
2052                // For ordered lists, use the last marker width (e.g., 3 for "1. ", 4 for "10. ")
2053                // For unordered lists like "- ", content starts at column 2, so continuations need at least 2 spaces
2054                let min_continuation_indent = if block.is_ordered {
2055                    current_indent_level + last_marker_width
2056                } else {
2057                    current_indent_level + 2 // Unordered lists need at least 2 spaces (e.g., "- " = 2 chars)
2058                };
2059
2060                if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
2061                    // Indented line or backslash continuation continues the list
2062                    block.end_line = line_num;
2063                } else if line_info.is_blank {
2064                    // Blank line - check if it's internal to the list or ending it
2065                    // We only include blank lines that are followed by more list content
2066                    let mut check_idx = line_idx + 1;
2067                    let mut found_continuation = false;
2068
2069                    // Skip additional blank lines
2070                    while check_idx < lines.len() && lines[check_idx].is_blank {
2071                        check_idx += 1;
2072                    }
2073
2074                    if check_idx < lines.len() {
2075                        let next_line = &lines[check_idx];
2076                        // Check if followed by indented content (list continuation)
2077                        if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
2078                            found_continuation = true;
2079                        }
2080                        // Check if followed by another list item at the same level
2081                        else if !next_line.in_code_block
2082                            && next_line.list_item.is_some()
2083                            && let Some(item) = &next_line.list_item
2084                        {
2085                            let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
2086                                .find(&next_line.content)
2087                                .map_or(String::new(), |m| m.as_str().to_string());
2088                            if item.marker_column == current_indent_level
2089                                && item.is_ordered == block.is_ordered
2090                                && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
2091                            {
2092                                // Check if there was meaningful content between the list items (unused now)
2093                                // This variable is kept for potential future use but is currently replaced by has_structural_separators
2094                                let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
2095                                    if let Some(between_line) = lines.get(idx) {
2096                                        let trimmed = between_line.content.trim();
2097                                        // Skip empty lines
2098                                        if trimmed.is_empty() {
2099                                            return false;
2100                                        }
2101                                        // Check for meaningful content
2102                                        let line_indent =
2103                                            between_line.content.len() - between_line.content.trim_start().len();
2104
2105                                        // Structural separators (code fences, headings, etc.) are meaningful and should BREAK lists
2106                                        if trimmed.starts_with("```")
2107                                            || trimmed.starts_with("~~~")
2108                                            || trimmed.starts_with("---")
2109                                            || trimmed.starts_with("***")
2110                                            || trimmed.starts_with("___")
2111                                            || trimmed.starts_with(">")
2112                                            || trimmed.contains('|') // Tables
2113                                            || between_line.heading.is_some()
2114                                        {
2115                                            return true; // These are structural separators - meaningful content that breaks lists
2116                                        }
2117
2118                                        // Only properly indented content continues the list
2119                                        line_indent >= min_continuation_indent
2120                                    } else {
2121                                        false
2122                                    }
2123                                });
2124
2125                                if block.is_ordered {
2126                                    // For ordered lists: don't continue if there are structural separators
2127                                    // Check if there are structural separators between the list items
2128                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2129                                        if let Some(between_line) = lines.get(idx) {
2130                                            let trimmed = between_line.content.trim();
2131                                            if trimmed.is_empty() {
2132                                                return false;
2133                                            }
2134                                            // Check for structural separators that break lists
2135                                            trimmed.starts_with("```")
2136                                                || trimmed.starts_with("~~~")
2137                                                || trimmed.starts_with("---")
2138                                                || trimmed.starts_with("***")
2139                                                || trimmed.starts_with("___")
2140                                                || trimmed.starts_with(">")
2141                                                || trimmed.contains('|') // Tables
2142                                                || between_line.heading.is_some()
2143                                        } else {
2144                                            false
2145                                        }
2146                                    });
2147                                    found_continuation = !has_structural_separators;
2148                                } else {
2149                                    // For unordered lists: also check for structural separators
2150                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2151                                        if let Some(between_line) = lines.get(idx) {
2152                                            let trimmed = between_line.content.trim();
2153                                            if trimmed.is_empty() {
2154                                                return false;
2155                                            }
2156                                            // Check for structural separators that break lists
2157                                            trimmed.starts_with("```")
2158                                                || trimmed.starts_with("~~~")
2159                                                || trimmed.starts_with("---")
2160                                                || trimmed.starts_with("***")
2161                                                || trimmed.starts_with("___")
2162                                                || trimmed.starts_with(">")
2163                                                || trimmed.contains('|') // Tables
2164                                                || between_line.heading.is_some()
2165                                        } else {
2166                                            false
2167                                        }
2168                                    });
2169                                    found_continuation = !has_structural_separators;
2170                                }
2171                            }
2172                        }
2173                    }
2174
2175                    if found_continuation {
2176                        // Include the blank line in the block
2177                        block.end_line = line_num;
2178                    } else {
2179                        // Blank line ends the list - don't include it
2180                        list_blocks.push(block.clone());
2181                        current_block = None;
2182                    }
2183                } else {
2184                    // Check for lazy continuation - non-indented line immediately after a list item
2185                    // But only if the line has sufficient indentation for the list type
2186                    let min_required_indent = if block.is_ordered {
2187                        current_indent_level + last_marker_width
2188                    } else {
2189                        current_indent_level + 2
2190                    };
2191
2192                    // For lazy continuation to apply, the line must either:
2193                    // 1. Have no indentation (true lazy continuation)
2194                    // 2. Have sufficient indentation for the list type
2195                    // BUT structural separators (headings, code blocks, etc.) should never be lazy continuations
2196                    let line_content = line_info.content.trim();
2197                    let is_structural_separator = line_info.heading.is_some()
2198                        || line_content.starts_with("```")
2199                        || line_content.starts_with("~~~")
2200                        || line_content.starts_with("---")
2201                        || line_content.starts_with("***")
2202                        || line_content.starts_with("___")
2203                        || line_content.starts_with(">")
2204                        || (line_content.contains('|')
2205                            && !line_content.contains("](")
2206                            && !line_content.contains("http")
2207                            && (line_content.matches('|').count() > 1
2208                                || line_content.starts_with('|')
2209                                || line_content.ends_with('|'))); // Tables
2210
2211                    // Allow lazy continuation if we're still within the same list block
2212                    // (not just immediately after a list item)
2213                    let is_lazy_continuation = !is_structural_separator
2214                        && !line_info.is_blank
2215                        && (line_info.indent == 0 || line_info.indent >= min_required_indent);
2216
2217                    if is_lazy_continuation {
2218                        // Additional check: if the line starts with uppercase and looks like a new sentence,
2219                        // it's probably not a continuation
2220                        let content_to_check = if !blockquote_prefix.is_empty() {
2221                            // Strip blockquote prefix to check the actual content
2222                            line_info
2223                                .content
2224                                .strip_prefix(&blockquote_prefix)
2225                                .unwrap_or(&line_info.content)
2226                                .trim()
2227                        } else {
2228                            line_info.content.trim()
2229                        };
2230
2231                        let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
2232
2233                        // If it starts with uppercase and the previous line ended with punctuation,
2234                        // it's likely a new paragraph, not a continuation
2235                        if starts_with_uppercase && last_list_item_line > 0 {
2236                            // This looks like a new paragraph
2237                            list_blocks.push(block.clone());
2238                            current_block = None;
2239                        } else {
2240                            // This is a lazy continuation line
2241                            block.end_line = line_num;
2242                        }
2243                    } else {
2244                        // Non-indented, non-blank line that's not a lazy continuation - end the block
2245                        list_blocks.push(block.clone());
2246                        current_block = None;
2247                    }
2248                }
2249            }
2250        }
2251
2252        // Don't forget the last block
2253        if let Some(block) = current_block {
2254            list_blocks.push(block);
2255        }
2256
2257        // Merge adjacent blocks that should be one
2258        merge_adjacent_list_blocks(&mut list_blocks, lines);
2259
2260        list_blocks
2261    }
2262
2263    /// Compute character frequency for fast content analysis
2264    fn compute_char_frequency(content: &str) -> CharFrequency {
2265        let mut frequency = CharFrequency::default();
2266
2267        for ch in content.chars() {
2268            match ch {
2269                '#' => frequency.hash_count += 1,
2270                '*' => frequency.asterisk_count += 1,
2271                '_' => frequency.underscore_count += 1,
2272                '-' => frequency.hyphen_count += 1,
2273                '+' => frequency.plus_count += 1,
2274                '>' => frequency.gt_count += 1,
2275                '|' => frequency.pipe_count += 1,
2276                '[' => frequency.bracket_count += 1,
2277                '`' => frequency.backtick_count += 1,
2278                '<' => frequency.lt_count += 1,
2279                '!' => frequency.exclamation_count += 1,
2280                '\n' => frequency.newline_count += 1,
2281                _ => {}
2282            }
2283        }
2284
2285        frequency
2286    }
2287
2288    /// Parse HTML tags in the content
2289    fn parse_html_tags(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<HtmlTag> {
2290        lazy_static! {
2291            static ref HTML_TAG_REGEX: regex::Regex =
2292                regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap();
2293        }
2294
2295        let mut html_tags = Vec::with_capacity(content.matches('<').count());
2296
2297        for cap in HTML_TAG_REGEX.captures_iter(content) {
2298            let full_match = cap.get(0).unwrap();
2299            let match_start = full_match.start();
2300            let match_end = full_match.end();
2301
2302            // Skip if in code block
2303            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2304                continue;
2305            }
2306
2307            let is_closing = !cap.get(1).unwrap().as_str().is_empty();
2308            let tag_name = cap.get(2).unwrap().as_str().to_lowercase();
2309            let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
2310
2311            // Find which line this tag is on
2312            let mut line_num = 1;
2313            let mut col_start = match_start;
2314            let mut col_end = match_end;
2315            for (idx, line_info) in lines.iter().enumerate() {
2316                if match_start >= line_info.byte_offset {
2317                    line_num = idx + 1;
2318                    col_start = match_start - line_info.byte_offset;
2319                    col_end = match_end - line_info.byte_offset;
2320                } else {
2321                    break;
2322                }
2323            }
2324
2325            html_tags.push(HtmlTag {
2326                line: line_num,
2327                start_col: col_start,
2328                end_col: col_end,
2329                byte_offset: match_start,
2330                byte_end: match_end,
2331                tag_name,
2332                is_closing,
2333                is_self_closing,
2334                raw_content: full_match.as_str().to_string(),
2335            });
2336        }
2337
2338        html_tags
2339    }
2340
2341    /// Parse emphasis spans in the content
2342    fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
2343        lazy_static! {
2344            static ref EMPHASIS_REGEX: regex::Regex =
2345                regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap();
2346        }
2347
2348        let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
2349
2350        for cap in EMPHASIS_REGEX.captures_iter(content) {
2351            let full_match = cap.get(0).unwrap();
2352            let match_start = full_match.start();
2353            let match_end = full_match.end();
2354
2355            // Skip if in code block
2356            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2357                continue;
2358            }
2359
2360            let opening_markers = cap.get(1).unwrap().as_str();
2361            let content_part = cap.get(2).unwrap().as_str();
2362            let closing_markers = cap.get(3).unwrap().as_str();
2363
2364            // Validate matching markers
2365            if opening_markers.chars().next() != closing_markers.chars().next()
2366                || opening_markers.len() != closing_markers.len()
2367            {
2368                continue;
2369            }
2370
2371            let marker = opening_markers.chars().next().unwrap();
2372            let marker_count = opening_markers.len();
2373
2374            // Find which line this emphasis is on
2375            let mut line_num = 1;
2376            let mut col_start = match_start;
2377            let mut col_end = match_end;
2378            for (idx, line_info) in lines.iter().enumerate() {
2379                if match_start >= line_info.byte_offset {
2380                    line_num = idx + 1;
2381                    col_start = match_start - line_info.byte_offset;
2382                    col_end = match_end - line_info.byte_offset;
2383                } else {
2384                    break;
2385                }
2386            }
2387
2388            emphasis_spans.push(EmphasisSpan {
2389                line: line_num,
2390                start_col: col_start,
2391                end_col: col_end,
2392                byte_offset: match_start,
2393                byte_end: match_end,
2394                marker,
2395                marker_count,
2396                content: content_part.to_string(),
2397            });
2398        }
2399
2400        emphasis_spans
2401    }
2402
2403    /// Parse table rows in the content
2404    fn parse_table_rows(lines: &[LineInfo]) -> Vec<TableRow> {
2405        let mut table_rows = Vec::with_capacity(lines.len() / 20);
2406
2407        for (line_idx, line_info) in lines.iter().enumerate() {
2408            // Skip lines in code blocks or blank lines
2409            if line_info.in_code_block || line_info.is_blank {
2410                continue;
2411            }
2412
2413            let line = &line_info.content;
2414            let line_num = line_idx + 1;
2415
2416            // Check if this line contains pipes (potential table row)
2417            if !line.contains('|') {
2418                continue;
2419            }
2420
2421            // Count columns by splitting on pipes
2422            let parts: Vec<&str> = line.split('|').collect();
2423            let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
2424
2425            // Check if this is a separator row
2426            let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
2427            let mut column_alignments = Vec::new();
2428
2429            if is_separator {
2430                for part in &parts[1..parts.len() - 1] {
2431                    // Skip first and last empty parts
2432                    let trimmed = part.trim();
2433                    let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
2434                        "center".to_string()
2435                    } else if trimmed.ends_with(':') {
2436                        "right".to_string()
2437                    } else if trimmed.starts_with(':') {
2438                        "left".to_string()
2439                    } else {
2440                        "none".to_string()
2441                    };
2442                    column_alignments.push(alignment);
2443                }
2444            }
2445
2446            table_rows.push(TableRow {
2447                line: line_num,
2448                is_separator,
2449                column_count,
2450                column_alignments,
2451            });
2452        }
2453
2454        table_rows
2455    }
2456
2457    /// Parse bare URLs and emails in the content
2458    fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
2459        let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
2460
2461        // Check for bare URLs (not in angle brackets or markdown links)
2462        for cap in BARE_URL_PATTERN.captures_iter(content) {
2463            let full_match = cap.get(0).unwrap();
2464            let match_start = full_match.start();
2465            let match_end = full_match.end();
2466
2467            // Skip if in code block
2468            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2469                continue;
2470            }
2471
2472            // Skip if already in angle brackets or markdown links
2473            let preceding_char = if match_start > 0 {
2474                content.chars().nth(match_start - 1)
2475            } else {
2476                None
2477            };
2478            let following_char = content.chars().nth(match_end);
2479
2480            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2481                continue;
2482            }
2483            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2484                continue;
2485            }
2486
2487            let url = full_match.as_str();
2488            let url_type = if url.starts_with("https://") {
2489                "https"
2490            } else if url.starts_with("http://") {
2491                "http"
2492            } else if url.starts_with("ftp://") {
2493                "ftp"
2494            } else {
2495                "other"
2496            };
2497
2498            // Find which line this URL is on
2499            let mut line_num = 1;
2500            let mut col_start = match_start;
2501            let mut col_end = match_end;
2502            for (idx, line_info) in lines.iter().enumerate() {
2503                if match_start >= line_info.byte_offset {
2504                    line_num = idx + 1;
2505                    col_start = match_start - line_info.byte_offset;
2506                    col_end = match_end - line_info.byte_offset;
2507                } else {
2508                    break;
2509                }
2510            }
2511
2512            bare_urls.push(BareUrl {
2513                line: line_num,
2514                start_col: col_start,
2515                end_col: col_end,
2516                byte_offset: match_start,
2517                byte_end: match_end,
2518                url: url.to_string(),
2519                url_type: url_type.to_string(),
2520            });
2521        }
2522
2523        // Check for bare email addresses
2524        for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
2525            let full_match = cap.get(0).unwrap();
2526            let match_start = full_match.start();
2527            let match_end = full_match.end();
2528
2529            // Skip if in code block
2530            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2531                continue;
2532            }
2533
2534            // Skip if already in angle brackets or markdown links
2535            let preceding_char = if match_start > 0 {
2536                content.chars().nth(match_start - 1)
2537            } else {
2538                None
2539            };
2540            let following_char = content.chars().nth(match_end);
2541
2542            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2543                continue;
2544            }
2545            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2546                continue;
2547            }
2548
2549            let email = full_match.as_str();
2550
2551            // Find which line this email is on
2552            let mut line_num = 1;
2553            let mut col_start = match_start;
2554            let mut col_end = match_end;
2555            for (idx, line_info) in lines.iter().enumerate() {
2556                if match_start >= line_info.byte_offset {
2557                    line_num = idx + 1;
2558                    col_start = match_start - line_info.byte_offset;
2559                    col_end = match_end - line_info.byte_offset;
2560                } else {
2561                    break;
2562                }
2563            }
2564
2565            bare_urls.push(BareUrl {
2566                line: line_num,
2567                start_col: col_start,
2568                end_col: col_end,
2569                byte_offset: match_start,
2570                byte_end: match_end,
2571                url: email.to_string(),
2572                url_type: "email".to_string(),
2573            });
2574        }
2575
2576        bare_urls
2577    }
2578}
2579
2580/// Merge adjacent list blocks that should be treated as one
2581fn merge_adjacent_list_blocks(list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
2582    if list_blocks.len() < 2 {
2583        return;
2584    }
2585
2586    let mut merger = ListBlockMerger::new(lines);
2587    *list_blocks = merger.merge(list_blocks);
2588}
2589
2590/// Helper struct to manage the complex logic of merging list blocks
2591struct ListBlockMerger<'a> {
2592    lines: &'a [LineInfo],
2593}
2594
2595impl<'a> ListBlockMerger<'a> {
2596    fn new(lines: &'a [LineInfo]) -> Self {
2597        Self { lines }
2598    }
2599
2600    fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
2601        let mut merged = Vec::with_capacity(list_blocks.len());
2602        let mut current = list_blocks[0].clone();
2603
2604        for next in list_blocks.iter().skip(1) {
2605            if self.should_merge_blocks(&current, next) {
2606                current = self.merge_two_blocks(current, next);
2607            } else {
2608                merged.push(current);
2609                current = next.clone();
2610            }
2611        }
2612
2613        merged.push(current);
2614        merged
2615    }
2616
2617    /// Determine if two adjacent list blocks should be merged
2618    fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
2619        // Basic compatibility checks
2620        if !self.blocks_are_compatible(current, next) {
2621            return false;
2622        }
2623
2624        // Check spacing and content between blocks
2625        let spacing = self.analyze_spacing_between(current, next);
2626        match spacing {
2627            BlockSpacing::Consecutive => true,
2628            BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
2629            BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
2630                self.can_merge_with_content_between(current, next)
2631            }
2632        }
2633    }
2634
2635    /// Check if blocks have compatible structure for merging
2636    fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
2637        current.is_ordered == next.is_ordered
2638            && current.blockquote_prefix == next.blockquote_prefix
2639            && current.nesting_level == next.nesting_level
2640    }
2641
2642    /// Analyze the spacing between two list blocks
2643    fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
2644        let gap = next.start_line - current.end_line;
2645
2646        match gap {
2647            1 => BlockSpacing::Consecutive,
2648            2 => BlockSpacing::SingleBlank,
2649            _ if gap > 2 => {
2650                if self.has_only_blank_lines_between(current, next) {
2651                    BlockSpacing::MultipleBlanks
2652                } else {
2653                    BlockSpacing::ContentBetween
2654                }
2655            }
2656            _ => BlockSpacing::Consecutive, // gap == 0, overlapping (shouldn't happen)
2657        }
2658    }
2659
2660    /// Check if unordered lists can be merged with a single blank line between
2661    fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2662        // Check if there are structural separators between the blocks
2663        // If has_meaningful_content_between returns true, it means there are structural separators
2664        if has_meaningful_content_between(current, next, self.lines) {
2665            return false; // Structural separators prevent merging
2666        }
2667
2668        // Only merge unordered lists with same marker across single blank
2669        !current.is_ordered && current.marker == next.marker
2670    }
2671
2672    /// Check if ordered lists can be merged when there's content between them
2673    fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2674        // Do not merge lists if there are structural separators between them
2675        if has_meaningful_content_between(current, next, self.lines) {
2676            return false; // Structural separators prevent merging
2677        }
2678
2679        // Only consider merging ordered lists if there's no structural content between
2680        current.is_ordered && next.is_ordered
2681    }
2682
2683    /// Check if there are only blank lines between blocks
2684    fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2685        for line_num in (current.end_line + 1)..next.start_line {
2686            if let Some(line_info) = self.lines.get(line_num - 1)
2687                && !line_info.content.trim().is_empty()
2688            {
2689                return false;
2690            }
2691        }
2692        true
2693    }
2694
2695    /// Merge two compatible list blocks into one
2696    fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
2697        current.end_line = next.end_line;
2698        current.item_lines.extend_from_slice(&next.item_lines);
2699
2700        // Update max marker width
2701        current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
2702
2703        // Handle marker consistency for unordered lists
2704        if !current.is_ordered && self.markers_differ(&current, next) {
2705            current.marker = None; // Mixed markers
2706        }
2707
2708        current
2709    }
2710
2711    /// Check if two blocks have different markers
2712    fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
2713        current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
2714    }
2715}
2716
/// Classification of the gap that separates two neighbouring list blocks
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    /// No gap between blocks
    Consecutive,
    /// One blank line between blocks
    SingleBlank,
    /// Multiple blank lines but no content
    MultipleBlanks,
    /// Content exists between blocks
    ContentBetween,
}
2725
2726/// Check if there's meaningful content (not just blank lines) between two list blocks
2727fn has_meaningful_content_between(current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
2728    // Check lines between current.end_line and next.start_line
2729    for line_num in (current.end_line + 1)..next.start_line {
2730        if let Some(line_info) = lines.get(line_num - 1) {
2731            // Convert to 0-indexed
2732            let trimmed = line_info.content.trim();
2733
2734            // Skip empty lines
2735            if trimmed.is_empty() {
2736                continue;
2737            }
2738
2739            // Check for structural separators that should separate lists (CommonMark compliant)
2740
2741            // Headings separate lists
2742            if line_info.heading.is_some() {
2743                return true; // Has meaningful content - headings separate lists
2744            }
2745
2746            // Horizontal rules separate lists (---, ***, ___)
2747            if is_horizontal_rule(trimmed) {
2748                return true; // Has meaningful content - horizontal rules separate lists
2749            }
2750
2751            // Tables separate lists (lines containing | but not in URLs or code)
2752            // Simple heuristic: tables typically have | at start/end or multiple |
2753            if trimmed.contains('|') && trimmed.len() > 1 {
2754                // Don't treat URLs with | as tables
2755                if !trimmed.contains("](") && !trimmed.contains("http") {
2756                    // More robust check: tables usually have multiple | or | at edges
2757                    let pipe_count = trimmed.matches('|').count();
2758                    if pipe_count > 1 || trimmed.starts_with('|') || trimmed.ends_with('|') {
2759                        return true; // Has meaningful content - tables separate lists
2760                    }
2761                }
2762            }
2763
2764            // Blockquotes separate lists
2765            if trimmed.starts_with('>') {
2766                return true; // Has meaningful content - blockquotes separate lists
2767            }
2768
2769            // Code block fences separate lists (unless properly indented as list content)
2770            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
2771                let line_indent = line_info.content.len() - line_info.content.trim_start().len();
2772
2773                // Check if this code block is properly indented as list continuation
2774                let min_continuation_indent = if current.is_ordered {
2775                    current.nesting_level + current.max_marker_width + 1 // +1 for space after marker
2776                } else {
2777                    current.nesting_level + 2
2778                };
2779
2780                if line_indent < min_continuation_indent {
2781                    // This is a standalone code block that separates lists
2782                    return true; // Has meaningful content - standalone code blocks separate lists
2783                }
2784            }
2785
2786            // Check if this line has proper indentation for list continuation
2787            let line_indent = line_info.content.len() - line_info.content.trim_start().len();
2788
2789            // Calculate minimum indentation needed to be list continuation
2790            let min_indent = if current.is_ordered {
2791                current.nesting_level + current.max_marker_width
2792            } else {
2793                current.nesting_level + 2
2794            };
2795
2796            // If the line is not indented enough to be list continuation, it's meaningful content
2797            if line_indent < min_indent {
2798                return true; // Has meaningful content - content not indented as list continuation
2799            }
2800
2801            // If we reach here, the line is properly indented as list continuation
2802            // Continue checking other lines
2803        }
2804    }
2805
2806    // Only blank lines or properly indented list continuation content between blocks
2807    false
2808}
2809
/// Check if a line is a horizontal rule (`---`, `***`, `___`).
///
/// `trimmed` is expected to have surrounding whitespace already removed: the first
/// character must itself be `-`, `*` or `_`. The line qualifies when it contains at
/// least three occurrences of that same character and nothing else except spaces or
/// tabs, so spaced forms such as `- - -` are accepted while mixed markers (`-*-`) or
/// trailing text (`--- x`) are rejected.
fn is_horizontal_rule(trimmed: &str) -> bool {
    // The first character selects the marker; an empty string has none.
    let Some(marker) = trimmed.chars().next() else {
        return false;
    };
    if !matches!(marker, '-' | '*' | '_') {
        return false;
    }

    // Walk the characters directly instead of collecting them into a temporary Vec.
    let mut marker_count = 0usize;
    for ch in trimmed.chars() {
        if ch == marker {
            marker_count += 1;
        } else if ch != ' ' && ch != '\t' {
            // Any character other than the marker or inline whitespace disqualifies the line.
            return false;
        }
    }

    // Fewer than three markers (e.g. `--` or `- -`) is not a rule.
    marker_count >= 3
}
2833
/// Unit tests for `LintContext`: line offsets, offset-to-position mapping,
/// per-line metadata, and list item detection.
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input: one line offset (`0`), no line records, offset 0 maps to (1, 1).
    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.lines.len(), 0);
    }

    /// A single line without a trailing newline.
    #[test]
    fn test_single_line() {
        let ctx = LintContext::new("# Hello", MarkdownFlavor::Standard);
        assert_eq!(ctx.content, "# Hello");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(3), (1, 4));
    }

    /// Several lines, including a blank one: offsets and position lookups.
    #[test]
    fn test_multi_line() {
        let content = "# Title\n\nSecond line\nThird line";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);

        // (byte offset, expected (line, column))
        let expectations = [
            (0, (1, 1)),  // start
            (8, (2, 1)),  // start of blank line
            (9, (3, 1)),  // start of 'Second line'
            (15, (3, 7)), // middle of 'Second line'
            (21, (4, 1)), // start of 'Third line'
        ];
        for (offset, position) in expectations {
            assert_eq!(ctx.offset_to_line_col(offset), position);
        }
    }

    /// Per-line records: content, byte offset, indent, and blank/code-block/list flags.
    #[test]
    fn test_line_info() {
        let content = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        assert_eq!(ctx.lines.len(), 7);

        // Line 1: "# Title"
        let title = &ctx.lines[0];
        assert_eq!(title.content, "# Title");
        assert_eq!(title.byte_offset, 0);
        assert_eq!(title.indent, 0);
        assert!(!title.is_blank);
        assert!(!title.in_code_block);
        assert!(title.list_item.is_none());

        // Line 2: "    indented"
        let indented = &ctx.lines[1];
        assert_eq!(indented.content, "    indented");
        assert_eq!(indented.byte_offset, 8);
        assert_eq!(indented.indent, 4);
        assert!(!indented.is_blank);

        // Line 3: "" (blank)
        let blank = &ctx.lines[2];
        assert_eq!(blank.content, "");
        assert!(blank.is_blank);

        // Lookup helpers take 1-indexed line numbers
        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
        assert_eq!(ctx.line_info(1).map(|info| info.indent), Some(0));
        assert_eq!(ctx.line_info(2).map(|info| info.indent), Some(4));
    }

    /// Unordered and ordered markers are recognised; plain text is not a list item.
    #[test]
    fn test_list_item_detection() {
        let content = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        // Line 1: "- Unordered item"
        let dash_line = &ctx.lines[0];
        assert!(dash_line.list_item.is_some());
        let dash_item = dash_line.list_item.as_ref().unwrap();
        assert_eq!(dash_item.marker, "-");
        assert!(!dash_item.is_ordered);
        assert_eq!(dash_item.marker_column, 0);
        assert_eq!(dash_item.content_column, 2);

        // Line 2: "  * Nested item"
        let star_line = &ctx.lines[1];
        assert!(star_line.list_item.is_some());
        let star_item = star_line.list_item.as_ref().unwrap();
        assert_eq!(star_item.marker, "*");
        assert_eq!(star_item.marker_column, 2);

        // Line 3: "1. Ordered item"
        let ordered_line = &ctx.lines[2];
        assert!(ordered_line.list_item.is_some());
        let ordered_item = ordered_line.list_item.as_ref().unwrap();
        assert_eq!(ordered_item.marker, "1.");
        assert!(ordered_item.is_ordered);
        assert_eq!(ordered_item.number, Some(1));

        // Line 6: "Not a list"
        let plain_line = &ctx.lines[5];
        assert!(plain_line.list_item.is_none());
    }

    /// Offsets at and just past each character of a three-line document.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let content = "a\nb\nc";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard);

        // line_offsets: [0, 2, 4]
        let expectations = [
            (0, (1, 1)), // 'a'
            (1, (1, 2)), // after 'a'
            (2, (2, 1)), // 'b'
            (3, (2, 2)), // after 'b'
            (4, (3, 1)), // 'c'
            (5, (3, 2)), // after 'c'
        ];
        for (offset, position) in expectations {
            assert_eq!(ctx.offset_to_line_col(offset), position);
        }
    }
}
rumdl_lib/lint_context.rs

rumdl_lib/
lint_context.rs