rumdl_lib/
lint_context.rs

1use crate::config::MarkdownFlavor;
2use crate::utils::ast_utils::get_cached_ast;
3use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
4use lazy_static::lazy_static;
5use markdown::mdast::Node;
6use regex::Regex;
7
lazy_static! {
    // Link pattern covering both inline `[text](url)` and reference `[text][id]` forms.
    // Flags: `s` lets `.` (used in the `\\.` escape alternative) match a newline, and
    // `x` enables verbose mode so the whitespace and `#` comments inside the pattern
    // are ignored by the regex engine.
    // Groups: 1 = link text, 2 = inline URL, 3 = reference ID (2 and 3 are mutually exclusive).
    // Note: this also matches the bracket part of an image; `parse_links` filters
    // those out by looking at the byte before the match.
    static ref LINK_PATTERN: Regex = Regex::new(
        r"(?sx)
        \[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]          # Link text in group 1 (handles nested brackets)
        (?:
            \(([^)]*)\)       # Inline URL in group 2 (can be empty)
            |
            \[([^\]]*)\]      # Reference ID in group 3
        )"
    ).unwrap();

    // Image pattern: identical to LINK_PATTERN but anchored on a leading `!`.
    // Same `s` / `x` flags; groups: 1 = alt text, 2 = inline URL, 3 = reference ID.
    static ref IMAGE_PATTERN: Regex = Regex::new(
        r"(?sx)
        !\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]         # Alt text in group 1 (handles nested brackets)
        (?:
            \(([^)]*)\)       # Inline URL in group 2 (can be empty)
            |
            \[([^\]]*)\]      # Reference ID in group 3
        )"
    ).unwrap();

    // Reference definition pattern: `[id]: url "title"` on its own line.
    // `(?m)` makes `^`/`$` match at line boundaries; up to three leading spaces are allowed.
    // Groups: 1 = ID, 2 = URL, 3 = double-quoted title, 4 = single-quoted title.
    static ref REF_DEF_PATTERN: Regex = Regex::new(
        r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#
    ).unwrap();

    // Backtick-run pattern: matches one maximal run of consecutive backticks.
    // It has no capture groups and does not match the span content itself; pairing an
    // opening run with its closing run is left to the code that uses this pattern.
    static ref CODE_SPAN_PATTERN: Regex = Regex::new(
        r"`+"
    ).unwrap();

    // Pattern for bare URLs with an http, https or ftp scheme.
    // Group 1 = scheme. Whitespace, angle brackets, square brackets, parentheses,
    // backslashes, quotes and backticks terminate the URL.
    static ref BARE_URL_PATTERN: Regex = Regex::new(
        r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
    ).unwrap();

    // Pattern for bare email addresses (local part, `@`, domain, TLD of at least two letters)
    static ref BARE_EMAIL_PATTERN: Regex = Regex::new(
        r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    ).unwrap();

    // Pattern for angle-bracket autolinks `<url>` / `<email>` (to exclude from bare URL detection).
    // Group 1 = the URL or email address between the brackets.
    static ref ANGLE_BRACKET_PATTERN: Regex = Regex::new(
        r"<((?:https?|ftp)://[^>]+|[^@\s]+@[^@\s]+\.[^@\s>]+)>"
    ).unwrap();

    // Pattern for blockquote prefix in parse_list_blocks.
    // Group 1 = optional leading whitespace, one or more consecutive `>`, and any
    // whitespace that follows them.
    static ref BLOCKQUOTE_PREFIX_REGEX: Regex = Regex::new(r"^(\s*>+\s*)").unwrap();
}
62
/// Pre-computed information about a single line of the document.
///
/// `LintContext::new` builds one entry per line and stores them, in document
/// order, in `LintContext::lines`.
#[derive(Debug, Clone)]
pub struct LineInfo {
    /// The actual line content (without newline)
    pub content: String,
    /// Byte offset where this line starts in the document
    pub byte_offset: usize,
    /// Number of leading spaces/tabs
    pub indent: usize,
    /// Whether the line is blank (empty or only whitespace)
    pub is_blank: bool,
    /// Whether this line is inside a code block
    pub in_code_block: bool,
    /// Whether this line is inside front matter
    pub in_front_matter: bool,
    /// Whether this line is inside an HTML block
    // NOTE(review): presumably filled in by `detect_html_blocks`, which `new` runs
    // after links/images/lists have been parsed — confirm before relying on it earlier.
    pub in_html_block: bool,
    /// List item information if this line starts a list item
    pub list_item: Option<ListItemInfo>,
    /// Heading information if this line is a heading
    pub heading: Option<HeadingInfo>,
    /// Blockquote information if this line is a blockquote
    pub blockquote: Option<BlockquoteInfo>,
}
87
/// Information about a list item, attached to the line that starts the item
/// (see `LineInfo::list_item`).
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// The marker used (*, -, +, or number with . or ))
    pub marker: String,
    /// Whether it's ordered (true) or unordered (false)
    pub is_ordered: bool,
    /// The number for ordered lists (`None` is expected for unordered items)
    pub number: Option<usize>,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where content after marker starts
    pub content_column: usize,
}
102
/// Syntactic style used to write a heading (see `HeadingInfo::style`).
#[derive(Debug, Clone, PartialEq)]
pub enum HeadingStyle {
    /// ATX style heading (# Heading)
    ATX,
    /// Setext style heading with = underline
    Setext1,
    /// Setext style heading with - underline
    Setext2,
}
113
/// A link found by `LintContext::parse_links`, either inline `[text](url)` or
/// reference-style `[text][ref]`.
#[derive(Debug, Clone)]
pub struct ParsedLink {
    /// Line number (1-indexed) of the line on which the link starts
    pub line: usize,
    /// Start column (0-indexed) in the line, measured in bytes from the line start
    pub start_col: usize,
    /// End column (0-indexed, exclusive), measured in bytes from the start of the
    /// line on which the link ends; for a multi-line link that is a later line than `line`
    pub end_col: usize,
    /// Byte offset in document where the link starts
    pub byte_offset: usize,
    /// End byte offset in document (exclusive)
    pub byte_end: usize,
    /// Link text (the raw content between the first pair of brackets)
    pub text: String,
    /// Raw content of the parentheses for inline links; empty for reference links,
    /// which are resolved through the reference definitions instead
    pub url: String,
    /// Whether this is a reference link [text][ref] vs inline [text](url)
    pub is_reference: bool,
    /// Lowercased reference ID for reference links; for the collapsed form `[text][]`
    /// this is the lowercased link text. `None` for inline links.
    pub reference_id: Option<String>,
}
136
/// An image found in the document, either inline `![alt](url)` or
/// reference-style `![alt][ref]`. Mirrors the layout of `ParsedLink`.
#[derive(Debug, Clone)]
pub struct ParsedImage {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Alt text
    pub alt_text: String,
    /// Image URL or reference
    pub url: String,
    /// Whether this is a reference image ![alt][ref] vs inline ![alt](url)
    pub is_reference: bool,
    /// Reference ID for reference images
    pub reference_id: Option<String>,
}
159
/// Reference definition `[ref]: url "title"`, the target that reference links
/// and images resolve against (see `LintContext::get_reference_url`).
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// Line number (1-indexed)
    pub line: usize,
    /// Reference ID (normalized to lowercase); lookups lowercase their key before comparing
    pub id: String,
    /// URL
    pub url: String,
    /// Optional title
    pub title: Option<String>,
}
172
/// Parsed inline code span information.
///
/// `byte_offset..byte_end` is treated as a half-open range by the position
/// checks on `LintContext`, as is `start_col..end_col`.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Number of backticks used (1, 2, 3, etc.)
    pub backtick_count: usize,
    /// Content inside the code span (without backticks)
    pub content: String,
}
191
/// Information about a heading, attached to the line that carries it
/// (see `LineInfo::heading`).
#[derive(Debug, Clone)]
pub struct HeadingInfo {
    /// Heading level (1-6 for ATX, 1-2 for Setext)
    pub level: u8,
    /// Style of heading
    pub style: HeadingStyle,
    /// The heading marker (# characters or underline)
    pub marker: String,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where heading text starts
    pub content_column: usize,
    /// The heading text (without markers and without custom ID syntax)
    pub text: String,
    /// Custom header ID if present (e.g., from {#custom-id} syntax)
    pub custom_id: Option<String>,
    /// Original heading text including custom ID syntax
    pub raw_text: String,
    /// Whether it has a closing sequence (for ATX)
    pub has_closing_sequence: bool,
    /// The closing sequence if present
    pub closing_sequence: String,
}
216
/// Information about a blockquote line, attached to the line that carries the
/// `>` marker(s) (see `LineInfo::blockquote`).
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Nesting level (1 for >, 2 for >>, etc.)
    pub nesting_level: usize,
    /// The indentation before the blockquote marker
    pub indent: String,
    /// Column where the first > starts (0-based)
    pub marker_column: usize,
    /// The blockquote prefix (e.g., "> ", ">> ", etc.)
    pub prefix: String,
    /// Content after the blockquote marker(s)
    pub content: String,
    /// Whether the line has no space after the marker
    pub has_no_space_after_marker: bool,
    /// Whether the line has multiple spaces after the marker
    pub has_multiple_spaces_after_marker: bool,
    /// Whether this is an empty blockquote line needing MD028 fix
    pub needs_md028_fix: bool,
}
237
/// Information about a list block: a contiguous range of lines
/// (`start_line..=end_line`) that forms one list.
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// Line number where the list starts (1-indexed)
    pub start_line: usize,
    /// Line number where the list ends (1-indexed, inclusive)
    pub end_line: usize,
    /// Whether it's ordered or unordered
    pub is_ordered: bool,
    /// The consistent marker for unordered lists (if any)
    pub marker: Option<String>,
    /// Blockquote prefix for this list (empty if not in blockquote)
    pub blockquote_prefix: String,
    /// Lines that are list items within this block
    pub item_lines: Vec<usize>,
    /// Nesting level (0 for top-level lists)
    pub nesting_level: usize,
    /// Maximum marker width seen in this block (e.g., 3 for "1. ", 4 for "10. ")
    pub max_marker_width: usize,
}
258
259use std::sync::{Arc, Mutex};
260
/// Character frequency data for fast content analysis.
///
/// Computed once per document and used by `LintContext::has_char`,
/// `LintContext::char_count` and the `likely_has_*` heuristics so that rules
/// can cheaply skip documents that cannot contain the syntax they check.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Count of # characters (headings)
    pub hash_count: usize,
    /// Count of * characters (emphasis, lists, horizontal rules)
    pub asterisk_count: usize,
    /// Count of _ characters (emphasis, horizontal rules)
    pub underscore_count: usize,
    /// Count of - characters (lists, horizontal rules, setext headings)
    pub hyphen_count: usize,
    /// Count of + characters (lists)
    pub plus_count: usize,
    /// Count of > characters (blockquotes)
    pub gt_count: usize,
    /// Count of | characters (tables)
    pub pipe_count: usize,
    /// Count of [ characters (links, images)
    pub bracket_count: usize,
    /// Count of ` characters (code spans, code blocks)
    pub backtick_count: usize,
    /// Count of < characters (HTML tags, autolinks)
    pub lt_count: usize,
    /// Count of ! characters (images)
    pub exclamation_count: usize,
    /// Count of newline characters
    pub newline_count: usize,
}
289
/// Pre-parsed HTML tag information, produced lazily by `LintContext::html_tags`.
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Tag name (e.g., "div", "img", "br")
    pub tag_name: String,
    /// Whether it's a closing tag (</tag>)
    pub is_closing: bool,
    /// Whether it's self-closing (<tag />)
    pub is_self_closing: bool,
    /// Raw tag content
    pub raw_content: String,
}
312
/// Pre-parsed emphasis span information, produced lazily by
/// `LintContext::emphasis_spans`.
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Type of emphasis ('*' or '_')
    pub marker: char,
    /// Number of markers (1 for italic, 2 for bold, 3+ for bold+italic)
    pub marker_count: usize,
    /// Content inside the emphasis
    pub content: String,
}
333
/// Pre-parsed table row information, produced lazily by `LintContext::table_rows`.
#[derive(Debug, Clone)]
pub struct TableRow {
    /// Line number (1-indexed)
    pub line: usize,
    /// Whether this is a separator row (contains only |, -, :, and spaces)
    pub is_separator: bool,
    /// Number of columns (pipe-separated cells)
    pub column_count: usize,
    /// Alignment info from separator row; each entry is one of
    /// "left", "center", "right" or "none"
    pub column_alignments: Vec<String>,
}
346
/// Pre-parsed bare URL information (not in links), produced lazily by
/// `LintContext::bare_urls`.
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// The URL string
    pub url: String,
    /// Type of URL ("http", "https", "ftp", "email")
    pub url_type: String,
}
365
/// Shared, pre-parsed view of one Markdown document that lint rules query.
///
/// The public fields are computed eagerly by `LintContext::new`. The private
/// `*_cache` fields hold results that are computed on first use by the
/// accessor of the same name and then shared as `Arc`s; each sits behind its
/// own `Mutex` so the accessors can fill it through `&self`.
pub struct LintContext<'a> {
    /// The document text this context was built from
    pub content: &'a str,
    /// Byte offset at which each line starts; entry 0 is always 0
    pub line_offsets: Vec<usize>,
    /// Cached code block ranges (not including inline code spans)
    pub code_blocks: Vec<(usize, usize)>,
    /// Pre-computed line information, one entry per line
    pub lines: Vec<LineInfo>,
    /// Pre-parsed links
    pub links: Vec<ParsedLink>,
    /// Pre-parsed images
    pub images: Vec<ParsedImage>,
    /// Reference definitions
    pub reference_defs: Vec<ReferenceDef>,
    /// Inline code spans; `new` fills this eagerly because link and image
    /// parsing needs the spans, and `code_spans()` recomputes only if it is empty
    code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>,
    /// Pre-parsed list blocks
    pub list_blocks: Vec<ListBlock>,
    /// Character frequency analysis
    pub char_frequency: CharFrequency,
    /// Lazy-loaded HTML tags
    html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>,
    /// Lazy-loaded emphasis spans
    emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>,
    /// Lazy-loaded table rows
    table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>,
    /// Lazy-loaded bare URLs
    bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>,
    /// Lazy-loaded AST
    ast_cache: Mutex<Option<Arc<Node>>>,
    /// Markdown flavor being used
    pub flavor: MarkdownFlavor,
}
384
385impl<'a> LintContext<'a> {
386    pub fn new(content: &'a str, flavor: MarkdownFlavor) -> Self {
387        let mut line_offsets = vec![0];
388        for (i, c) in content.char_indices() {
389            if c == '\n' {
390                line_offsets.push(i + 1);
391            }
392        }
393
394        // Detect code blocks once and cache them
395        let code_blocks = CodeBlockUtils::detect_code_blocks(content);
396
397        // Pre-compute line information
398        let mut lines = Self::compute_line_info(content, &line_offsets, &code_blocks, flavor);
399
400        // Parse code spans early so we can exclude them from link/image parsing
401        let ast = get_cached_ast(content);
402        let code_spans = Self::parse_code_spans(content, &lines, &ast);
403
404        // Parse links, images, references, and list blocks
405        let links = Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor);
406        let images = Self::parse_images(content, &lines, &code_blocks, &code_spans);
407        let reference_defs = Self::parse_reference_defs(content, &lines);
408        // Use line-by-line list parsing for MD032 compatibility
409        // TODO: Consider using AST-based parsing in the future when MD032 is updated
410        let list_blocks = Self::parse_list_blocks(&lines);
411
412        // Detect HTML blocks
413        Self::detect_html_blocks(&mut lines);
414
415        // Compute character frequency for fast content analysis
416        let char_frequency = Self::compute_char_frequency(content);
417
418        Self {
419            content,
420            line_offsets,
421            code_blocks,
422            lines,
423            links,
424            images,
425            reference_defs,
426            code_spans_cache: Mutex::new(Some(Arc::new(code_spans))),
427            list_blocks,
428            char_frequency,
429            html_tags_cache: Mutex::new(None),
430            emphasis_spans_cache: Mutex::new(None),
431            table_rows_cache: Mutex::new(None),
432            bare_urls_cache: Mutex::new(None),
433            ast_cache: Mutex::new(None),
434            flavor,
435        }
436    }
437
438    /// Get AST - uses global cache for deduplication
439    pub fn get_ast(&self) -> Arc<Node> {
440        let mut cache = self.ast_cache.lock().unwrap();
441
442        if cache.is_none() {
443            // Use global AST cache to avoid duplicate parsing
444            // MarkdownAst is just a type alias for Node, so no conversion needed
445            *cache = Some(get_cached_ast(self.content));
446        }
447
448        cache.as_ref().unwrap().clone()
449    }
450
451    /// Get code spans - computed lazily on first access
452    pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
453        let mut cache = self.code_spans_cache.lock().unwrap();
454
455        // Check if we need to compute code spans
456        if cache.is_none() {
457            let ast = self.get_ast();
458            let code_spans = Self::parse_code_spans(self.content, &self.lines, &ast);
459            *cache = Some(Arc::new(code_spans));
460        }
461
462        // Return a reference to the cached code spans
463        cache.as_ref().unwrap().clone()
464    }
465
466    /// Get HTML tags - computed lazily on first access
467    pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
468        let mut cache = self.html_tags_cache.lock().unwrap();
469
470        if cache.is_none() {
471            let html_tags = Self::parse_html_tags(self.content, &self.lines, &self.code_blocks);
472            *cache = Some(Arc::new(html_tags));
473        }
474
475        cache.as_ref().unwrap().clone()
476    }
477
478    /// Get emphasis spans - computed lazily on first access
479    pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
480        let mut cache = self.emphasis_spans_cache.lock().unwrap();
481
482        if cache.is_none() {
483            let emphasis_spans = Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks);
484            *cache = Some(Arc::new(emphasis_spans));
485        }
486
487        cache.as_ref().unwrap().clone()
488    }
489
490    /// Get table rows - computed lazily on first access
491    pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
492        let mut cache = self.table_rows_cache.lock().unwrap();
493
494        if cache.is_none() {
495            let table_rows = Self::parse_table_rows(&self.lines);
496            *cache = Some(Arc::new(table_rows));
497        }
498
499        cache.as_ref().unwrap().clone()
500    }
501
502    /// Get bare URLs - computed lazily on first access
503    pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
504        let mut cache = self.bare_urls_cache.lock().unwrap();
505
506        if cache.is_none() {
507            let bare_urls = Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks);
508            *cache = Some(Arc::new(bare_urls));
509        }
510
511        cache.as_ref().unwrap().clone()
512    }
513
514    /// Map a byte offset to (line, column)
515    pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
516        match self.line_offsets.binary_search(&offset) {
517            Ok(line) => (line + 1, 1),
518            Err(line) => {
519                let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
520                (line, offset - line_start + 1)
521            }
522        }
523    }
524
525    /// Check if a position is within a code block or code span
526    pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
527        // Check code blocks first
528        if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
529            return true;
530        }
531
532        // Check inline code spans (lazy load if needed)
533        self.code_spans()
534            .iter()
535            .any(|span| pos >= span.byte_offset && pos < span.byte_end)
536    }
537
538    /// Get line information by line number (1-indexed)
539    pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
540        if line_num > 0 {
541            self.lines.get(line_num - 1)
542        } else {
543            None
544        }
545    }
546
547    /// Get byte offset for a line number (1-indexed)
548    pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
549        self.line_info(line_num).map(|info| info.byte_offset)
550    }
551
552    /// Get URL for a reference link/image by its ID
553    pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
554        let normalized_id = ref_id.to_lowercase();
555        self.reference_defs
556            .iter()
557            .find(|def| def.id == normalized_id)
558            .map(|def| def.url.as_str())
559    }
560
561    /// Get links on a specific line
562    pub fn links_on_line(&self, line_num: usize) -> Vec<&ParsedLink> {
563        self.links.iter().filter(|link| link.line == line_num).collect()
564    }
565
566    /// Get images on a specific line
567    pub fn images_on_line(&self, line_num: usize) -> Vec<&ParsedImage> {
568        self.images.iter().filter(|img| img.line == line_num).collect()
569    }
570
571    /// Check if a line is part of a list block
572    pub fn is_in_list_block(&self, line_num: usize) -> bool {
573        self.list_blocks
574            .iter()
575            .any(|block| line_num >= block.start_line && line_num <= block.end_line)
576    }
577
578    /// Get the list block containing a specific line
579    pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
580        self.list_blocks
581            .iter()
582            .find(|block| line_num >= block.start_line && line_num <= block.end_line)
583    }
584
585    // Compatibility methods for DocumentStructure migration
586
587    /// Check if a line is within a code block
588    pub fn is_in_code_block(&self, line_num: usize) -> bool {
589        if line_num == 0 || line_num > self.lines.len() {
590            return false;
591        }
592        self.lines[line_num - 1].in_code_block
593    }
594
595    /// Check if a line is within front matter
596    pub fn is_in_front_matter(&self, line_num: usize) -> bool {
597        if line_num == 0 || line_num > self.lines.len() {
598            return false;
599        }
600        self.lines[line_num - 1].in_front_matter
601    }
602
603    /// Check if a line is within an HTML block
604    pub fn is_in_html_block(&self, line_num: usize) -> bool {
605        if line_num == 0 || line_num > self.lines.len() {
606            return false;
607        }
608        self.lines[line_num - 1].in_html_block
609    }
610
611    /// Check if a line and column is within a code span
612    pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
613        if line_num == 0 || line_num > self.lines.len() {
614            return false;
615        }
616
617        // Use the code spans cache to check
618        // Note: col is 1-indexed from caller, but span.start_col and span.end_col are 0-indexed
619        // Convert col to 0-indexed for comparison
620        let col_0indexed = if col > 0 { col - 1 } else { 0 };
621        let code_spans = self.code_spans();
622        code_spans
623            .iter()
624            .any(|span| span.line == line_num && col_0indexed >= span.start_col && col_0indexed < span.end_col)
625    }
626
627    /// Check if content has any instances of a specific character (fast)
628    pub fn has_char(&self, ch: char) -> bool {
629        match ch {
630            '#' => self.char_frequency.hash_count > 0,
631            '*' => self.char_frequency.asterisk_count > 0,
632            '_' => self.char_frequency.underscore_count > 0,
633            '-' => self.char_frequency.hyphen_count > 0,
634            '+' => self.char_frequency.plus_count > 0,
635            '>' => self.char_frequency.gt_count > 0,
636            '|' => self.char_frequency.pipe_count > 0,
637            '[' => self.char_frequency.bracket_count > 0,
638            '`' => self.char_frequency.backtick_count > 0,
639            '<' => self.char_frequency.lt_count > 0,
640            '!' => self.char_frequency.exclamation_count > 0,
641            '\n' => self.char_frequency.newline_count > 0,
642            _ => self.content.contains(ch), // Fallback for other characters
643        }
644    }
645
646    /// Get count of a specific character (fast)
647    pub fn char_count(&self, ch: char) -> usize {
648        match ch {
649            '#' => self.char_frequency.hash_count,
650            '*' => self.char_frequency.asterisk_count,
651            '_' => self.char_frequency.underscore_count,
652            '-' => self.char_frequency.hyphen_count,
653            '+' => self.char_frequency.plus_count,
654            '>' => self.char_frequency.gt_count,
655            '|' => self.char_frequency.pipe_count,
656            '[' => self.char_frequency.bracket_count,
657            '`' => self.char_frequency.backtick_count,
658            '<' => self.char_frequency.lt_count,
659            '!' => self.char_frequency.exclamation_count,
660            '\n' => self.char_frequency.newline_count,
661            _ => self.content.matches(ch).count(), // Fallback for other characters
662        }
663    }
664
665    /// Check if content likely contains headings (fast)
666    pub fn likely_has_headings(&self) -> bool {
667        self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 // Potential setext underlines
668    }
669
670    /// Check if content likely contains lists (fast)
671    pub fn likely_has_lists(&self) -> bool {
672        self.char_frequency.asterisk_count > 0
673            || self.char_frequency.hyphen_count > 0
674            || self.char_frequency.plus_count > 0
675    }
676
677    /// Check if content likely contains emphasis (fast)
678    pub fn likely_has_emphasis(&self) -> bool {
679        self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
680    }
681
682    /// Check if content likely contains tables (fast)
683    pub fn likely_has_tables(&self) -> bool {
684        self.char_frequency.pipe_count > 2
685    }
686
687    /// Check if content likely contains blockquotes (fast)
688    pub fn likely_has_blockquotes(&self) -> bool {
689        self.char_frequency.gt_count > 0
690    }
691
692    /// Check if content likely contains code (fast)
693    pub fn likely_has_code(&self) -> bool {
694        self.char_frequency.backtick_count > 0
695    }
696
697    /// Check if content likely contains links or images (fast)
698    pub fn likely_has_links_or_images(&self) -> bool {
699        self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
700    }
701
702    /// Check if content likely contains HTML (fast)
703    pub fn likely_has_html(&self) -> bool {
704        self.char_frequency.lt_count > 0
705    }
706
707    /// Get HTML tags on a specific line
708    pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
709        self.html_tags()
710            .iter()
711            .filter(|tag| tag.line == line_num)
712            .cloned()
713            .collect()
714    }
715
716    /// Get emphasis spans on a specific line
717    pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
718        self.emphasis_spans()
719            .iter()
720            .filter(|span| span.line == line_num)
721            .cloned()
722            .collect()
723    }
724
725    /// Get table rows on a specific line
726    pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
727        self.table_rows()
728            .iter()
729            .filter(|row| row.line == line_num)
730            .cloned()
731            .collect()
732    }
733
734    /// Get bare URLs on a specific line
735    pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
736        self.bare_urls()
737            .iter()
738            .filter(|url| url.line == line_num)
739            .cloned()
740            .collect()
741    }
742
743    /// Parse all links in the content
744    fn parse_links(
745        content: &str,
746        lines: &[LineInfo],
747        code_blocks: &[(usize, usize)],
748        code_spans: &[CodeSpan],
749        flavor: MarkdownFlavor,
750    ) -> Vec<ParsedLink> {
751        use crate::utils::skip_context::is_mkdocs_snippet_line;
752
753        // Pre-size based on a heuristic: most markdown files have relatively few links
754        let mut links = Vec::with_capacity(content.len() / 500); // ~1 link per 500 chars
755
756        // Parse links across the entire content, not line by line
757        for cap in LINK_PATTERN.captures_iter(content) {
758            let full_match = cap.get(0).unwrap();
759            let match_start = full_match.start();
760            let match_end = full_match.end();
761
762            // Skip if the opening bracket is escaped (preceded by \)
763            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
764                continue;
765            }
766
767            // Skip if this is actually an image (preceded by !)
768            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
769                continue;
770            }
771
772            // Skip if in code block
773            if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
774                continue;
775            }
776
777            // Skip if in code span
778            if code_spans
779                .iter()
780                .any(|span| match_start >= span.byte_offset && match_start < span.byte_end)
781            {
782                continue;
783            }
784
785            // Skip if this link is on a MkDocs snippet line
786            // Find which line this link is on
787            let line_idx = lines
788                .iter()
789                .position(|line| {
790                    match_start >= line.byte_offset && (match_start < line.byte_offset + line.content.len() + 1)
791                })
792                .unwrap_or(0);
793
794            if is_mkdocs_snippet_line(&lines[line_idx].content, flavor) {
795                continue;
796            }
797
798            // Find which line this link starts on
799            let mut line_num = 1;
800            let mut col_start = match_start;
801            for (idx, line_info) in lines.iter().enumerate() {
802                if match_start >= line_info.byte_offset {
803                    line_num = idx + 1;
804                    col_start = match_start - line_info.byte_offset;
805                } else {
806                    break;
807                }
808            }
809
810            // Find which line this link ends on (and calculate column on that line)
811            let mut end_line_num = 1;
812            let mut col_end = match_end;
813            for (idx, line_info) in lines.iter().enumerate() {
814                if match_end > line_info.byte_offset {
815                    end_line_num = idx + 1;
816                    col_end = match_end - line_info.byte_offset;
817                } else {
818                    break;
819                }
820            }
821
822            // For single-line links, use the same approach as before
823            if line_num == end_line_num {
824                // col_end is already correct
825            } else {
826                // For multi-line links, col_end represents the column on the ending line
827                // which is what we want
828            }
829
830            let text = cap.get(1).map_or("", |m| m.as_str()).to_string();
831
832            if let Some(inline_url) = cap.get(2) {
833                // Inline link
834                links.push(ParsedLink {
835                    line: line_num,
836                    start_col: col_start,
837                    end_col: col_end,
838                    byte_offset: match_start,
839                    byte_end: match_end,
840                    text,
841                    url: inline_url.as_str().to_string(),
842                    is_reference: false,
843                    reference_id: None,
844                });
845            } else if let Some(ref_id) = cap.get(3) {
846                // Reference link
847                let ref_id_str = ref_id.as_str();
848                let normalized_ref = if ref_id_str.is_empty() {
849                    text.to_lowercase() // Implicit reference
850                } else {
851                    ref_id_str.to_lowercase()
852                };
853
854                links.push(ParsedLink {
855                    line: line_num,
856                    start_col: col_start,
857                    end_col: col_end,
858                    byte_offset: match_start,
859                    byte_end: match_end,
860                    text,
861                    url: String::new(), // Will be resolved with reference_defs
862                    is_reference: true,
863                    reference_id: Some(normalized_ref),
864                });
865            }
866        }
867
868        links
869    }
870
871    /// Parse all images in the content
872    fn parse_images(
873        content: &str,
874        lines: &[LineInfo],
875        code_blocks: &[(usize, usize)],
876        code_spans: &[CodeSpan],
877    ) -> Vec<ParsedImage> {
878        // Pre-size based on a heuristic: images are less common than links
879        let mut images = Vec::with_capacity(content.len() / 1000); // ~1 image per 1000 chars
880
881        // Parse images across the entire content, not line by line
882        for cap in IMAGE_PATTERN.captures_iter(content) {
883            let full_match = cap.get(0).unwrap();
884            let match_start = full_match.start();
885            let match_end = full_match.end();
886
887            // Skip if the ! is escaped (preceded by \)
888            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
889                continue;
890            }
891
892            // Skip if in code block
893            if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
894                continue;
895            }
896
897            // Skip if in code span
898            if code_spans
899                .iter()
900                .any(|span| match_start >= span.byte_offset && match_start < span.byte_end)
901            {
902                continue;
903            }
904
905            // Find which line this image starts on
906            let mut line_num = 1;
907            let mut col_start = match_start;
908            for (idx, line_info) in lines.iter().enumerate() {
909                if match_start >= line_info.byte_offset {
910                    line_num = idx + 1;
911                    col_start = match_start - line_info.byte_offset;
912                } else {
913                    break;
914                }
915            }
916
917            // Find which line this image ends on (and calculate column on that line)
918            let mut end_line_num = 1;
919            let mut col_end = match_end;
920            for (idx, line_info) in lines.iter().enumerate() {
921                if match_end > line_info.byte_offset {
922                    end_line_num = idx + 1;
923                    col_end = match_end - line_info.byte_offset;
924                } else {
925                    break;
926                }
927            }
928
929            // For single-line images, use the same approach as before
930            if line_num == end_line_num {
931                // col_end is already correct
932            } else {
933                // For multi-line images, col_end represents the column on the ending line
934                // which is what we want
935            }
936
937            let alt_text = cap.get(1).map_or("", |m| m.as_str()).to_string();
938
939            if let Some(inline_url) = cap.get(2) {
940                // Inline image
941                images.push(ParsedImage {
942                    line: line_num,
943                    start_col: col_start,
944                    end_col: col_end,
945                    byte_offset: match_start,
946                    byte_end: match_end,
947                    alt_text,
948                    url: inline_url.as_str().to_string(),
949                    is_reference: false,
950                    reference_id: None,
951                });
952            } else if let Some(ref_id) = cap.get(3) {
953                // Reference image
954                let ref_id_str = ref_id.as_str();
955                let normalized_ref = if ref_id_str.is_empty() {
956                    alt_text.to_lowercase() // Implicit reference
957                } else {
958                    ref_id_str.to_lowercase()
959                };
960
961                images.push(ParsedImage {
962                    line: line_num,
963                    start_col: col_start,
964                    end_col: col_end,
965                    byte_offset: match_start,
966                    byte_end: match_end,
967                    alt_text,
968                    url: String::new(), // Will be resolved with reference_defs
969                    is_reference: true,
970                    reference_id: Some(normalized_ref),
971                });
972            }
973        }
974
975        images
976    }
977
978    /// Parse reference definitions
979    fn parse_reference_defs(_content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
980        // Pre-size based on lines count as reference definitions are line-based
981        let mut refs = Vec::with_capacity(lines.len() / 20); // ~1 ref per 20 lines
982
983        for (line_idx, line_info) in lines.iter().enumerate() {
984            // Skip lines in code blocks
985            if line_info.in_code_block {
986                continue;
987            }
988
989            let line = &line_info.content;
990            let line_num = line_idx + 1;
991
992            if let Some(cap) = REF_DEF_PATTERN.captures(line) {
993                let id = cap.get(1).unwrap().as_str().to_lowercase();
994                let url = cap.get(2).unwrap().as_str().to_string();
995                let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
996
997                refs.push(ReferenceDef {
998                    line: line_num,
999                    id,
1000                    url,
1001                    title,
1002                });
1003            }
1004        }
1005
1006        refs
1007    }
1008
1009    /// Pre-compute line information
1010    fn compute_line_info(
1011        content: &str,
1012        line_offsets: &[usize],
1013        code_blocks: &[(usize, usize)],
1014        flavor: MarkdownFlavor,
1015    ) -> Vec<LineInfo> {
1016        lazy_static! {
1017            // Regex for list detection - allow any whitespace including no space (to catch malformed lists)
1018            static ref UNORDERED_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)([-*+])([ \t]*)(.*)").unwrap();
1019            static ref ORDERED_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(\d+)([.)])([ \t]*)(.*)").unwrap();
1020
1021            // Regex for blockquote prefix
1022            static ref BLOCKQUOTE_REGEX: regex::Regex = regex::Regex::new(r"^(\s*>\s*)(.*)").unwrap();
1023
1024            // Regex for heading detection
1025            static ref ATX_HEADING_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap();
1026            static ref SETEXT_UNDERLINE_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap();
1027
1028            // Regex for blockquote detection
1029            static ref BLOCKQUOTE_REGEX_FULL: regex::Regex = regex::Regex::new(r"^(\s*)(>+)(\s*)(.*)$").unwrap();
1030        }
1031
1032        let content_lines: Vec<&str> = content.lines().collect();
1033        let mut lines = Vec::with_capacity(content_lines.len());
1034
1035        // Detect front matter boundaries FIRST, before any other parsing
1036        let mut in_front_matter = false;
1037        let mut front_matter_end = 0;
1038        if content_lines.first().map(|l| l.trim()) == Some("---") {
1039            in_front_matter = true;
1040            for (idx, line) in content_lines.iter().enumerate().skip(1) {
1041                if line.trim() == "---" {
1042                    front_matter_end = idx;
1043                    break;
1044                }
1045            }
1046        }
1047
1048        for (i, line) in content_lines.iter().enumerate() {
1049            let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
1050            let indent = line.len() - line.trim_start().len();
1051            // For blank detection, consider blockquote context
1052            let is_blank = if let Some(caps) = BLOCKQUOTE_REGEX.captures(line) {
1053                // In blockquote context, check if content after prefix is blank
1054                let after_prefix = caps.get(2).map_or("", |m| m.as_str());
1055                after_prefix.trim().is_empty()
1056            } else {
1057                line.trim().is_empty()
1058            };
1059            // Check if this line is inside a code block (not inline code span)
1060            // We only want to check for fenced/indented code blocks, not inline code
1061            let in_code_block = code_blocks.iter().any(|&(start, end)| {
1062                // Only consider ranges that span multiple lines (code blocks)
1063                // Inline code spans are typically on a single line
1064
1065                // Ensure we're at valid UTF-8 boundaries
1066                let safe_start = if start > 0 && !content.is_char_boundary(start) {
1067                    // Find the nearest valid boundary before start
1068                    let mut boundary = start;
1069                    while boundary > 0 && !content.is_char_boundary(boundary) {
1070                        boundary -= 1;
1071                    }
1072                    boundary
1073                } else {
1074                    start
1075                };
1076
1077                let safe_end = if end < content.len() && !content.is_char_boundary(end) {
1078                    // Find the nearest valid boundary after end
1079                    let mut boundary = end;
1080                    while boundary < content.len() && !content.is_char_boundary(boundary) {
1081                        boundary += 1;
1082                    }
1083                    boundary
1084                } else {
1085                    end.min(content.len())
1086                };
1087
1088                let block_content = &content[safe_start..safe_end];
1089                let is_multiline = block_content.contains('\n');
1090                let is_fenced = block_content.starts_with("```") || block_content.starts_with("~~~");
1091                let is_indented = !is_fenced
1092                    && block_content
1093                        .lines()
1094                        .all(|l| l.starts_with("    ") || l.starts_with("\t") || l.trim().is_empty());
1095
1096                byte_offset >= start && byte_offset < end && (is_multiline || is_fenced || is_indented)
1097            });
1098
1099            // Detect list items (skip if in frontmatter)
1100            let list_item = if !(in_code_block || is_blank || in_front_matter && i <= front_matter_end) {
1101                // Strip blockquote prefix if present for list detection
1102                let (line_for_list_check, blockquote_prefix_len) = if let Some(caps) = BLOCKQUOTE_REGEX.captures(line) {
1103                    let prefix = caps.get(1).unwrap().as_str();
1104                    let content = caps.get(2).unwrap().as_str();
1105                    (content, prefix.len())
1106                } else {
1107                    (&**line, 0)
1108                };
1109
1110                if let Some(caps) = UNORDERED_REGEX.captures(line_for_list_check) {
1111                    let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1112                    let marker = caps.get(2).map_or("", |m| m.as_str());
1113                    let spacing = caps.get(3).map_or("", |m| m.as_str());
1114                    let _content = caps.get(4).map_or("", |m| m.as_str());
1115                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1116                    let content_column = marker_column + marker.len() + spacing.len();
1117
1118                    // According to CommonMark spec, unordered list items MUST have at least one space
1119                    // after the marker (-, *, or +). Without a space, it's not a list item.
1120                    // This also naturally handles cases like:
1121                    // - *emphasis* (not a list)
1122                    // - **bold** (not a list)
1123                    // - --- (horizontal rule, not a list)
1124                    if spacing.is_empty() {
1125                        None
1126                    } else {
1127                        Some(ListItemInfo {
1128                            marker: marker.to_string(),
1129                            is_ordered: false,
1130                            number: None,
1131                            marker_column,
1132                            content_column,
1133                        })
1134                    }
1135                } else if let Some(caps) = ORDERED_REGEX.captures(line_for_list_check) {
1136                    let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1137                    let number_str = caps.get(2).map_or("", |m| m.as_str());
1138                    let delimiter = caps.get(3).map_or("", |m| m.as_str());
1139                    let spacing = caps.get(4).map_or("", |m| m.as_str());
1140                    let _content = caps.get(5).map_or("", |m| m.as_str());
1141                    let marker = format!("{number_str}{delimiter}");
1142                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1143                    let content_column = marker_column + marker.len() + spacing.len();
1144
1145                    // According to CommonMark spec, ordered list items MUST have at least one space
1146                    // after the marker (period or parenthesis). Without a space, it's not a list item.
1147                    if spacing.is_empty() {
1148                        None
1149                    } else {
1150                        Some(ListItemInfo {
1151                            marker,
1152                            is_ordered: true,
1153                            number: number_str.parse().ok(),
1154                            marker_column,
1155                            content_column,
1156                        })
1157                    }
1158                } else {
1159                    None
1160                }
1161            } else {
1162                None
1163            };
1164
1165            lines.push(LineInfo {
1166                content: line.to_string(),
1167                byte_offset,
1168                indent,
1169                is_blank,
1170                in_code_block,
1171                in_front_matter: in_front_matter && i <= front_matter_end,
1172                in_html_block: false, // Will be populated after line creation
1173                list_item,
1174                heading: None,    // Will be populated in second pass for Setext headings
1175                blockquote: None, // Will be populated after line creation
1176            });
1177        }
1178
1179        // Second pass: detect headings (including Setext which needs look-ahead) and blockquotes
1180        for i in 0..content_lines.len() {
1181            if lines[i].in_code_block {
1182                continue;
1183            }
1184
1185            // Skip lines in front matter
1186            if in_front_matter && i <= front_matter_end {
1187                continue;
1188            }
1189
1190            let line = content_lines[i];
1191
1192            // Check for blockquotes (even on blank lines within blockquotes)
1193            if let Some(caps) = BLOCKQUOTE_REGEX_FULL.captures(line) {
1194                let indent_str = caps.get(1).map_or("", |m| m.as_str());
1195                let markers = caps.get(2).map_or("", |m| m.as_str());
1196                let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1197                let content = caps.get(4).map_or("", |m| m.as_str());
1198
1199                let nesting_level = markers.chars().filter(|&c| c == '>').count();
1200                let marker_column = indent_str.len();
1201
1202                // Build the prefix (indentation + markers + space)
1203                let prefix = format!("{indent_str}{markers}{spaces_after}");
1204
1205                // Check for various blockquote issues
1206                let has_no_space = spaces_after.is_empty() && !content.is_empty();
1207                // Consider tabs as multiple spaces, or actual multiple spaces
1208                let has_multiple_spaces = spaces_after.len() > 1 || spaces_after.contains('\t');
1209
1210                // Check if needs MD028 fix (empty blockquote line without proper spacing)
1211                // MD028 flags empty blockquote lines that don't have a single space after the marker
1212                // Lines like "> " or ">> " are already correct and don't need fixing
1213                let needs_md028_fix = content.is_empty() && spaces_after.is_empty();
1214
1215                lines[i].blockquote = Some(BlockquoteInfo {
1216                    nesting_level,
1217                    indent: indent_str.to_string(),
1218                    marker_column,
1219                    prefix,
1220                    content: content.to_string(),
1221                    has_no_space_after_marker: has_no_space,
1222                    has_multiple_spaces_after_marker: has_multiple_spaces,
1223                    needs_md028_fix,
1224                });
1225            }
1226
1227            // Skip heading detection for blank lines
1228            if lines[i].is_blank {
1229                continue;
1230            }
1231
1232            // Check for ATX headings (but skip MkDocs snippet lines)
1233            // In MkDocs flavor, lines like "# -8<- [start:name]" are snippet markers, not headings
1234            let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
1235                crate::utils::mkdocs_snippets::is_snippet_section_start(line)
1236                    || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
1237            } else {
1238                false
1239            };
1240
1241            if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1242                // Skip headings inside HTML comments
1243                if crate::utils::skip_context::is_in_html_comment(content, lines[i].byte_offset) {
1244                    continue;
1245                }
1246                let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1247                let hashes = caps.get(2).map_or("", |m| m.as_str());
1248                let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1249                let rest = caps.get(4).map_or("", |m| m.as_str());
1250
1251                let level = hashes.len() as u8;
1252                let marker_column = leading_spaces.len();
1253
1254                // Check for closing sequence, but handle custom IDs that might come after
1255                let (text, has_closing, closing_seq) = {
1256                    // First check if there's a custom ID at the end
1257                    let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1258                        // Check if this looks like a valid custom ID (ends with })
1259                        if rest[id_start..].trim_end().ends_with('}') {
1260                            // Split off the custom ID
1261                            (&rest[..id_start], &rest[id_start..])
1262                        } else {
1263                            (rest, "")
1264                        }
1265                    } else {
1266                        (rest, "")
1267                    };
1268
1269                    // Now look for closing hashes in the part before the custom ID
1270                    let trimmed_rest = rest_without_id.trim_end();
1271                    if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1272                        // Look for the start of the hash sequence
1273                        let mut start_of_hashes = last_hash_pos;
1274                        while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1275                            start_of_hashes -= 1;
1276                        }
1277
1278                        // Check if there's at least one space before the closing hashes
1279                        let has_space_before = start_of_hashes == 0
1280                            || trimmed_rest
1281                                .chars()
1282                                .nth(start_of_hashes - 1)
1283                                .is_some_and(|c| c.is_whitespace());
1284
1285                        // Check if this is a valid closing sequence (all hashes to end of trimmed part)
1286                        let potential_closing = &trimmed_rest[start_of_hashes..];
1287                        let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1288
1289                        if is_all_hashes && has_space_before {
1290                            // This is a closing sequence
1291                            let closing_hashes = potential_closing.to_string();
1292                            // The text is everything before the closing hashes
1293                            // Don't include the custom ID here - it will be extracted later
1294                            let text_part = if !custom_id_part.is_empty() {
1295                                // If we have a custom ID, append it back to get the full rest
1296                                // This allows the extract_header_id function to handle it properly
1297                                format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1298                            } else {
1299                                rest_without_id[..start_of_hashes].trim_end().to_string()
1300                            };
1301                            (text_part, true, closing_hashes)
1302                        } else {
1303                            // Not a valid closing sequence, return the full content
1304                            (rest.to_string(), false, String::new())
1305                        }
1306                    } else {
1307                        // No hashes found, return the full content
1308                        (rest.to_string(), false, String::new())
1309                    }
1310                };
1311
1312                let content_column = marker_column + hashes.len() + spaces_after.len();
1313
1314                // Extract custom header ID if present
1315                let raw_text = text.trim().to_string();
1316                let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1317
1318                // If no custom ID was found on the header line, check the next line for standalone attr-list
1319                if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1320                    let next_line = content_lines[i + 1];
1321                    if !lines[i + 1].in_code_block
1322                        && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1323                        && let Some(next_line_id) =
1324                            crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1325                    {
1326                        custom_id = Some(next_line_id);
1327                    }
1328                }
1329
1330                lines[i].heading = Some(HeadingInfo {
1331                    level,
1332                    style: HeadingStyle::ATX,
1333                    marker: hashes.to_string(),
1334                    marker_column,
1335                    content_column,
1336                    text: clean_text,
1337                    custom_id,
1338                    raw_text,
1339                    has_closing_sequence: has_closing,
1340                    closing_sequence: closing_seq,
1341                });
1342            }
1343            // Check for Setext headings (need to look at next line)
1344            else if i + 1 < content_lines.len() {
1345                let next_line = content_lines[i + 1];
1346                if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
1347                    // Skip if next line is front matter delimiter
1348                    if in_front_matter && i < front_matter_end {
1349                        continue;
1350                    }
1351
1352                    // Skip Setext headings inside HTML comments
1353                    if crate::utils::skip_context::is_in_html_comment(content, lines[i].byte_offset) {
1354                        continue;
1355                    }
1356
1357                    let underline = next_line.trim();
1358
1359                    // Skip if the underline looks like YAML delimiter (exactly 3 or more dashes)
1360                    // YAML uses exactly `---` while Setext headings typically use longer underlines
1361                    if underline == "---" {
1362                        continue;
1363                    }
1364
1365                    // Skip if the current line looks like YAML key-value syntax
1366                    let current_line_trimmed = line.trim();
1367                    if current_line_trimmed.contains(':')
1368                        && !current_line_trimmed.starts_with('#')
1369                        && !current_line_trimmed.contains('[')
1370                        && !current_line_trimmed.contains("](")
1371                    {
1372                        // This looks like "key: value" which suggests YAML, not a heading
1373                        continue;
1374                    }
1375
1376                    let level = if underline.starts_with('=') { 1 } else { 2 };
1377                    let style = if level == 1 {
1378                        HeadingStyle::Setext1
1379                    } else {
1380                        HeadingStyle::Setext2
1381                    };
1382
1383                    // Extract custom header ID if present
1384                    let raw_text = line.trim().to_string();
1385                    let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1386
1387                    // If no custom ID was found on the header line, check the line after underline for standalone attr-list
1388                    if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
1389                        let attr_line = content_lines[i + 2];
1390                        if !lines[i + 2].in_code_block
1391                            && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
1392                            && let Some(attr_line_id) =
1393                                crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
1394                        {
1395                            custom_id = Some(attr_line_id);
1396                        }
1397                    }
1398
1399                    lines[i].heading = Some(HeadingInfo {
1400                        level,
1401                        style,
1402                        marker: underline.to_string(),
1403                        marker_column: next_line.len() - next_line.trim_start().len(),
1404                        content_column: lines[i].indent,
1405                        text: clean_text,
1406                        custom_id,
1407                        raw_text,
1408                        has_closing_sequence: false,
1409                        closing_sequence: String::new(),
1410                    });
1411                }
1412            }
1413        }
1414
1415        lines
1416    }
1417
1418    /// Detect HTML blocks in the content
1419    fn detect_html_blocks(lines: &mut [LineInfo]) {
1420        // HTML block elements that trigger block context
1421        const BLOCK_ELEMENTS: &[&str] = &[
1422            "address",
1423            "article",
1424            "aside",
1425            "blockquote",
1426            "details",
1427            "dialog",
1428            "dd",
1429            "div",
1430            "dl",
1431            "dt",
1432            "fieldset",
1433            "figcaption",
1434            "figure",
1435            "footer",
1436            "form",
1437            "h1",
1438            "h2",
1439            "h3",
1440            "h4",
1441            "h5",
1442            "h6",
1443            "header",
1444            "hr",
1445            "li",
1446            "main",
1447            "nav",
1448            "ol",
1449            "p",
1450            "pre",
1451            "section",
1452            "table",
1453            "tbody",
1454            "td",
1455            "tfoot",
1456            "th",
1457            "thead",
1458            "tr",
1459            "ul",
1460        ];
1461
1462        let mut i = 0;
1463        while i < lines.len() {
1464            // Skip if already in code block or front matter
1465            if lines[i].in_code_block || lines[i].in_front_matter {
1466                i += 1;
1467                continue;
1468            }
1469
1470            let trimmed = lines[i].content.trim_start();
1471
1472            // Check if line starts with an HTML tag
1473            if trimmed.starts_with('<') && trimmed.len() > 1 {
1474                // Extract tag name safely
1475                let after_bracket = &trimmed[1..];
1476                let is_closing = after_bracket.starts_with('/');
1477                let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
1478
1479                // Extract tag name (stop at space, >, /, or end of string)
1480                let tag_name = tag_start
1481                    .chars()
1482                    .take_while(|c| c.is_ascii_alphabetic() || *c == '-')
1483                    .collect::<String>()
1484                    .to_lowercase();
1485
1486                // Check if it's a block element
1487                if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
1488                    // Mark this line as in HTML block
1489                    lines[i].in_html_block = true;
1490
1491                    // For simplicity, just mark lines until we find a closing tag or reach a blank line
1492                    // This avoids complex nesting logic that might cause infinite loops
1493                    if !is_closing {
1494                        let closing_tag = format!("</{tag_name}>");
1495                        let mut j = i + 1;
1496                        while j < lines.len() && j < i + 100 {
1497                            // Limit search to 100 lines
1498                            // Stop at blank lines
1499                            if lines[j].is_blank {
1500                                break;
1501                            }
1502
1503                            lines[j].in_html_block = true;
1504
1505                            // Check if this line contains the closing tag
1506                            if lines[j].content.contains(&closing_tag) {
1507                                break;
1508                            }
1509                            j += 1;
1510                        }
1511                    }
1512                }
1513            }
1514
1515            i += 1;
1516        }
1517    }
1518
1519    /// Parse all inline code spans in the content using AST
1520    fn parse_code_spans(content: &str, lines: &[LineInfo], ast: &Node) -> Vec<CodeSpan> {
1521        let mut code_spans = Vec::new();
1522
1523        // Quick check - if no backticks, no code spans
1524        if !content.contains('`') {
1525            return code_spans;
1526        }
1527
1528        // Helper function to recursively extract inline code spans from AST nodes
1529        fn extract_code_spans(node: &Node, content: &str, lines: &[LineInfo], spans: &mut Vec<CodeSpan>) {
1530            match node {
1531                Node::InlineCode(inline_code) => {
1532                    if let Some(pos) = &inline_code.position {
1533                        let start_pos = pos.start.offset;
1534                        let end_pos = pos.end.offset;
1535
1536                        // The position includes the backticks, extract the actual content
1537                        let full_span = &content[start_pos..end_pos];
1538                        let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
1539
1540                        // Extract content between backticks, preserving spaces
1541                        let content_start = start_pos + backtick_count;
1542                        let content_end = end_pos - backtick_count;
1543                        let span_content = if content_start < content_end {
1544                            content[content_start..content_end].to_string()
1545                        } else {
1546                            String::new()
1547                        };
1548
1549                        // Find which line this code span starts on
1550                        let mut line_num = 1;
1551                        let mut col_start = start_pos;
1552                        for (idx, line_info) in lines.iter().enumerate() {
1553                            if start_pos >= line_info.byte_offset {
1554                                line_num = idx + 1;
1555                                col_start = start_pos - line_info.byte_offset;
1556                            } else {
1557                                break;
1558                            }
1559                        }
1560
1561                        // Find end column
1562                        let mut col_end = end_pos;
1563                        for line_info in lines.iter() {
1564                            if end_pos > line_info.byte_offset {
1565                                col_end = end_pos - line_info.byte_offset;
1566                            } else {
1567                                break;
1568                            }
1569                        }
1570
1571                        spans.push(CodeSpan {
1572                            line: line_num,
1573                            start_col: col_start,
1574                            end_col: col_end,
1575                            byte_offset: start_pos,
1576                            byte_end: end_pos,
1577                            backtick_count,
1578                            content: span_content,
1579                        });
1580                    }
1581                }
1582                // Recursively process children
1583                Node::Root(root) => {
1584                    for child in &root.children {
1585                        extract_code_spans(child, content, lines, spans);
1586                    }
1587                }
1588                Node::Paragraph(para) => {
1589                    for child in &para.children {
1590                        extract_code_spans(child, content, lines, spans);
1591                    }
1592                }
1593                Node::Heading(heading) => {
1594                    for child in &heading.children {
1595                        extract_code_spans(child, content, lines, spans);
1596                    }
1597                }
1598                Node::List(list) => {
1599                    for child in &list.children {
1600                        extract_code_spans(child, content, lines, spans);
1601                    }
1602                }
1603                Node::ListItem(item) => {
1604                    for child in &item.children {
1605                        extract_code_spans(child, content, lines, spans);
1606                    }
1607                }
1608                Node::Blockquote(blockquote) => {
1609                    for child in &blockquote.children {
1610                        extract_code_spans(child, content, lines, spans);
1611                    }
1612                }
1613                Node::Table(table) => {
1614                    for child in &table.children {
1615                        extract_code_spans(child, content, lines, spans);
1616                    }
1617                }
1618                Node::TableRow(row) => {
1619                    for child in &row.children {
1620                        extract_code_spans(child, content, lines, spans);
1621                    }
1622                }
1623                Node::TableCell(cell) => {
1624                    for child in &cell.children {
1625                        extract_code_spans(child, content, lines, spans);
1626                    }
1627                }
1628                Node::Emphasis(emphasis) => {
1629                    for child in &emphasis.children {
1630                        extract_code_spans(child, content, lines, spans);
1631                    }
1632                }
1633                Node::Strong(strong) => {
1634                    for child in &strong.children {
1635                        extract_code_spans(child, content, lines, spans);
1636                    }
1637                }
1638                Node::Link(link) => {
1639                    for child in &link.children {
1640                        extract_code_spans(child, content, lines, spans);
1641                    }
1642                }
1643                Node::LinkReference(link_ref) => {
1644                    for child in &link_ref.children {
1645                        extract_code_spans(child, content, lines, spans);
1646                    }
1647                }
1648                Node::FootnoteDefinition(footnote) => {
1649                    for child in &footnote.children {
1650                        extract_code_spans(child, content, lines, spans);
1651                    }
1652                }
1653                Node::Delete(delete) => {
1654                    for child in &delete.children {
1655                        extract_code_spans(child, content, lines, spans);
1656                    }
1657                }
1658                // Terminal nodes or nodes without relevant children
1659                Node::Code(_)
1660                | Node::Text(_)
1661                | Node::Html(_)
1662                | Node::Image(_)
1663                | Node::ImageReference(_)
1664                | Node::FootnoteReference(_)
1665                | Node::Break(_)
1666                | Node::ThematicBreak(_)
1667                | Node::Definition(_)
1668                | Node::Yaml(_)
1669                | Node::Toml(_)
1670                | Node::Math(_)
1671                | Node::InlineMath(_)
1672                | Node::MdxJsxFlowElement(_)
1673                | Node::MdxFlowExpression(_)
1674                | Node::MdxJsxTextElement(_)
1675                | Node::MdxTextExpression(_)
1676                | Node::MdxjsEsm(_) => {
1677                    // No children to process or not relevant for code spans
1678                }
1679            }
1680        }
1681
1682        // Extract all code spans from the AST
1683        extract_code_spans(ast, content, lines, &mut code_spans);
1684
1685        // Sort by position to ensure consistent ordering
1686        code_spans.sort_by_key(|span| span.byte_offset);
1687
1688        code_spans
1689    }
1690
1691    /// Parse all list blocks in the content (legacy line-by-line approach)
1692    fn parse_list_blocks(lines: &[LineInfo]) -> Vec<ListBlock> {
1693        // Pre-size based on lines that could be list items
1694        let mut list_blocks = Vec::with_capacity(lines.len() / 10); // Estimate ~10% of lines might start list blocks
1695        let mut current_block: Option<ListBlock> = None;
1696        let mut last_list_item_line = 0;
1697        let mut current_indent_level = 0;
1698        let mut last_marker_width = 0;
1699
1700        for (line_idx, line_info) in lines.iter().enumerate() {
1701            let line_num = line_idx + 1;
1702
1703            // Enhanced code block handling using Design #3's context analysis
1704            if line_info.in_code_block {
1705                if let Some(ref mut block) = current_block {
1706                    // Calculate minimum indentation for list continuation
1707                    let min_continuation_indent = CodeBlockUtils::calculate_min_continuation_indent(lines, line_idx);
1708
1709                    // Analyze code block context using the three-tier classification
1710                    let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
1711
1712                    match context {
1713                        CodeBlockContext::Indented => {
1714                            // Code block is properly indented - continues the list
1715                            block.end_line = line_num;
1716                            continue;
1717                        }
1718                        CodeBlockContext::Standalone => {
1719                            // Code block separates lists - end current block
1720                            let completed_block = current_block.take().unwrap();
1721                            list_blocks.push(completed_block);
1722                            continue;
1723                        }
1724                        CodeBlockContext::Adjacent => {
1725                            // Edge case - use conservative behavior (continue list)
1726                            block.end_line = line_num;
1727                            continue;
1728                        }
1729                    }
1730                } else {
1731                    // No current list block - skip code block lines
1732                    continue;
1733                }
1734            }
1735
1736            // Extract blockquote prefix if any
1737            let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(&line_info.content) {
1738                caps.get(0).unwrap().as_str().to_string()
1739            } else {
1740                String::new()
1741            };
1742
1743            // Check if this line is a list item
1744            if let Some(list_item) = &line_info.list_item {
1745                // Calculate nesting level based on indentation
1746                let item_indent = list_item.marker_column;
1747                let nesting = item_indent / 2; // Assume 2-space indentation for nesting
1748
1749                if let Some(ref mut block) = current_block {
1750                    // Check if this continues the current block
1751                    // For nested lists, we need to check if this is a nested item (higher nesting level)
1752                    // or a continuation at the same or lower level
1753                    let is_nested = nesting > block.nesting_level;
1754                    let same_type =
1755                        (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
1756                    let same_context = block.blockquote_prefix == blockquote_prefix;
1757                    let reasonable_distance = line_num <= last_list_item_line + 2; // Allow one blank line
1758
1759                    // For unordered lists, also check marker consistency
1760                    let marker_compatible =
1761                        block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
1762
1763                    // Check if there's non-list content between the last item and this one
1764                    let has_non_list_content = {
1765                        let mut found_non_list = false;
1766                        // Use the last item from the current block, not the global last_list_item_line
1767                        let block_last_item_line = block.item_lines.last().copied().unwrap_or(block.end_line);
1768
1769                        // Debug: Special check for problematic line
1770                        if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1771                            let last_line = &lines[block_last_item_line - 1];
1772                            if last_line.content.contains(r"`sqlalchemy`") && last_line.content.contains(r"\`") {
1773                                log::debug!(
1774                                    "After problematic line {}: checking lines {} to {} for non-list content",
1775                                    block_last_item_line,
1776                                    block_last_item_line + 1,
1777                                    line_num
1778                                );
1779                                // If they're consecutive list items, there's no content between
1780                                if line_num == block_last_item_line + 1 {
1781                                    log::debug!("Lines are consecutive, no content between");
1782                                }
1783                            }
1784                        }
1785
1786                        for check_line in (block_last_item_line + 1)..line_num {
1787                            let check_idx = check_line - 1;
1788                            if check_idx < lines.len() {
1789                                let check_info = &lines[check_idx];
1790                                // Check for content that breaks the list
1791                                let is_list_breaking_content = if check_info.in_code_block {
1792                                    // Use enhanced code block classification for list separation
1793                                    let last_item_marker_width =
1794                                        if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1795                                            lines[block_last_item_line - 1]
1796                                                .list_item
1797                                                .as_ref()
1798                                                .map(|li| {
1799                                                    if li.is_ordered {
1800                                                        li.marker.len() + 1 // Add 1 for the space after ordered list markers
1801                                                    } else {
1802                                                        li.marker.len()
1803                                                    }
1804                                                })
1805                                                .unwrap_or(3) // fallback to 3 if no list item found
1806                                        } else {
1807                                            3 // fallback
1808                                        };
1809
1810                                    let min_continuation = if block.is_ordered { last_item_marker_width } else { 2 };
1811
1812                                    // Analyze code block context using our enhanced classification
1813                                    let context = CodeBlockUtils::analyze_code_block_context(
1814                                        lines,
1815                                        check_line - 1,
1816                                        min_continuation,
1817                                    );
1818
1819                                    // Standalone code blocks break lists, indented ones continue them
1820                                    matches!(context, CodeBlockContext::Standalone)
1821                                } else if !check_info.is_blank && check_info.list_item.is_none() {
1822                                    // Check for structural separators that should break lists (from issue #42)
1823                                    let line_content = check_info.content.trim();
1824
1825                                    // Any of these structural separators break lists
1826                                    if check_info.heading.is_some()
1827                                        || line_content.starts_with("---")
1828                                        || line_content.starts_with("***")
1829                                        || line_content.starts_with("___")
1830                                        || (line_content.contains('|')
1831                                            && !line_content.contains("](")
1832                                            && !line_content.contains("http")
1833                                            && (line_content.matches('|').count() > 1
1834                                                || line_content.starts_with('|')
1835                                                || line_content.ends_with('|')))
1836                                        || line_content.starts_with(">")
1837                                    {
1838                                        true
1839                                    }
1840                                    // Other non-list content - check if properly indented
1841                                    else {
1842                                        let last_item_marker_width =
1843                                            if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1844                                                lines[block_last_item_line - 1]
1845                                                    .list_item
1846                                                    .as_ref()
1847                                                    .map(|li| {
1848                                                        if li.is_ordered {
1849                                                            li.marker.len() + 1 // Add 1 for the space after ordered list markers
1850                                                        } else {
1851                                                            li.marker.len()
1852                                                        }
1853                                                    })
1854                                                    .unwrap_or(3) // fallback to 3 if no list item found
1855                                            } else {
1856                                                3 // fallback
1857                                            };
1858
1859                                        let min_continuation =
1860                                            if block.is_ordered { last_item_marker_width } else { 2 };
1861                                        check_info.indent < min_continuation
1862                                    }
1863                                } else {
1864                                    false
1865                                };
1866
1867                                if is_list_breaking_content {
1868                                    // Not indented enough, so it breaks the list
1869                                    found_non_list = true;
1870                                    break;
1871                                }
1872                            }
1873                        }
1874                        found_non_list
1875                    };
1876
1877                    // A list continues if:
1878                    // 1. It's a nested item (indented more than the parent), OR
1879                    // 2. It's the same type at the same level with reasonable distance
1880                    let mut continues_list = if is_nested {
1881                        // Nested items always continue the list if they're in the same context
1882                        same_context && reasonable_distance && !has_non_list_content
1883                    } else {
1884                        // Same-level items need to match type and markers
1885                        let result = same_type
1886                            && same_context
1887                            && reasonable_distance
1888                            && marker_compatible
1889                            && !has_non_list_content;
1890
1891                        // Debug logging for lines after problematic content
1892                        if block.item_lines.last().is_some_and(|&last_line| {
1893                            last_line > 0
1894                                && last_line <= lines.len()
1895                                && lines[last_line - 1].content.contains(r"`sqlalchemy`")
1896                                && lines[last_line - 1].content.contains(r"\`")
1897                        }) {
1898                            log::debug!(
1899                                "List continuation check after problematic line at line {line_num}: same_type={same_type}, same_context={same_context}, reasonable_distance={reasonable_distance}, marker_compatible={marker_compatible}, has_non_list_content={has_non_list_content}, continues={result}"
1900                            );
1901                            if line_num > 0 && line_num <= lines.len() {
1902                                log::debug!("Current line content: {:?}", lines[line_num - 1].content);
1903                            }
1904                        }
1905
1906                        result
1907                    };
1908
1909                    // WORKAROUND: If items are truly consecutive (no blank lines), they MUST be in the same list
1910                    // This handles edge cases where content patterns might otherwise split lists incorrectly
1911                    if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
1912                        // Check if the previous line was a list item
1913                        if block.item_lines.contains(&(line_num - 1)) {
1914                            // They're consecutive list items - force them to be in the same list
1915                            continues_list = true;
1916                        }
1917                    }
1918
1919                    if continues_list {
1920                        // Extend current block
1921                        block.end_line = line_num;
1922                        block.item_lines.push(line_num);
1923
1924                        // Update max marker width
1925                        block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
1926                            list_item.marker.len() + 1
1927                        } else {
1928                            list_item.marker.len()
1929                        });
1930
1931                        // Update marker consistency for unordered lists
1932                        if !block.is_ordered
1933                            && block.marker.is_some()
1934                            && block.marker.as_ref() != Some(&list_item.marker)
1935                        {
1936                            // Mixed markers, clear the marker field
1937                            block.marker = None;
1938                        }
1939                    } else {
1940                        // End current block and start a new one
1941
1942                        list_blocks.push(block.clone());
1943
1944                        *block = ListBlock {
1945                            start_line: line_num,
1946                            end_line: line_num,
1947                            is_ordered: list_item.is_ordered,
1948                            marker: if list_item.is_ordered {
1949                                None
1950                            } else {
1951                                Some(list_item.marker.clone())
1952                            },
1953                            blockquote_prefix: blockquote_prefix.clone(),
1954                            item_lines: vec![line_num],
1955                            nesting_level: nesting,
1956                            max_marker_width: if list_item.is_ordered {
1957                                list_item.marker.len() + 1
1958                            } else {
1959                                list_item.marker.len()
1960                            },
1961                        };
1962                    }
1963                } else {
1964                    // Start a new block
1965                    current_block = Some(ListBlock {
1966                        start_line: line_num,
1967                        end_line: line_num,
1968                        is_ordered: list_item.is_ordered,
1969                        marker: if list_item.is_ordered {
1970                            None
1971                        } else {
1972                            Some(list_item.marker.clone())
1973                        },
1974                        blockquote_prefix,
1975                        item_lines: vec![line_num],
1976                        nesting_level: nesting,
1977                        max_marker_width: list_item.marker.len(),
1978                    });
1979                }
1980
1981                last_list_item_line = line_num;
1982                current_indent_level = item_indent;
1983                last_marker_width = if list_item.is_ordered {
1984                    list_item.marker.len() + 1 // Add 1 for the space after ordered list markers
1985                } else {
1986                    list_item.marker.len()
1987                };
1988            } else if let Some(ref mut block) = current_block {
1989                // Not a list item - check if it continues the current block
1990
1991                // For MD032 compatibility, we use a simple approach:
1992                // - Indented lines continue the list
1993                // - Blank lines followed by indented content continue the list
1994                // - Everything else ends the list
1995
1996                // Check if the last line in the list block ended with a backslash (hard line break)
1997                // This handles cases where list items use backslash for hard line breaks
1998                let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
1999                    lines[block.end_line - 1].content.trim_end().ends_with('\\')
2000                } else {
2001                    false
2002                };
2003
2004                // Calculate minimum indentation for list continuation
2005                // For ordered lists, use the last marker width (e.g., 3 for "1. ", 4 for "10. ")
2006                // For unordered lists like "- ", content starts at column 2, so continuations need at least 2 spaces
2007                let min_continuation_indent = if block.is_ordered {
2008                    current_indent_level + last_marker_width
2009                } else {
2010                    current_indent_level + 2 // Unordered lists need at least 2 spaces (e.g., "- " = 2 chars)
2011                };
2012
2013                if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
2014                    // Indented line or backslash continuation continues the list
2015                    block.end_line = line_num;
2016                } else if line_info.is_blank {
2017                    // Blank line - check if it's internal to the list or ending it
2018                    // We only include blank lines that are followed by more list content
2019                    let mut check_idx = line_idx + 1;
2020                    let mut found_continuation = false;
2021
2022                    // Skip additional blank lines
2023                    while check_idx < lines.len() && lines[check_idx].is_blank {
2024                        check_idx += 1;
2025                    }
2026
2027                    if check_idx < lines.len() {
2028                        let next_line = &lines[check_idx];
2029                        // Check if followed by indented content (list continuation)
2030                        if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
2031                            found_continuation = true;
2032                        }
2033                        // Check if followed by another list item at the same level
2034                        else if !next_line.in_code_block
2035                            && next_line.list_item.is_some()
2036                            && let Some(item) = &next_line.list_item
2037                        {
2038                            let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
2039                                .find(&next_line.content)
2040                                .map_or(String::new(), |m| m.as_str().to_string());
2041                            if item.marker_column == current_indent_level
2042                                && item.is_ordered == block.is_ordered
2043                                && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
2044                            {
2045                                // Check if there was meaningful content between the list items (unused now)
2046                                // This variable is kept for potential future use but is currently replaced by has_structural_separators
2047                                let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
2048                                    if let Some(between_line) = lines.get(idx) {
2049                                        let trimmed = between_line.content.trim();
2050                                        // Skip empty lines
2051                                        if trimmed.is_empty() {
2052                                            return false;
2053                                        }
2054                                        // Check for meaningful content
2055                                        let line_indent =
2056                                            between_line.content.len() - between_line.content.trim_start().len();
2057
2058                                        // Structural separators (code fences, headings, etc.) are meaningful and should BREAK lists
2059                                        if trimmed.starts_with("```")
2060                                            || trimmed.starts_with("~~~")
2061                                            || trimmed.starts_with("---")
2062                                            || trimmed.starts_with("***")
2063                                            || trimmed.starts_with("___")
2064                                            || trimmed.starts_with(">")
2065                                            || trimmed.contains('|') // Tables
2066                                            || between_line.heading.is_some()
2067                                        {
2068                                            return true; // These are structural separators - meaningful content that breaks lists
2069                                        }
2070
2071                                        // Only properly indented content continues the list
2072                                        line_indent >= min_continuation_indent
2073                                    } else {
2074                                        false
2075                                    }
2076                                });
2077
2078                                if block.is_ordered {
2079                                    // For ordered lists: don't continue if there are structural separators
2080                                    // Check if there are structural separators between the list items
2081                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2082                                        if let Some(between_line) = lines.get(idx) {
2083                                            let trimmed = between_line.content.trim();
2084                                            if trimmed.is_empty() {
2085                                                return false;
2086                                            }
2087                                            // Check for structural separators that break lists
2088                                            trimmed.starts_with("```")
2089                                                || trimmed.starts_with("~~~")
2090                                                || trimmed.starts_with("---")
2091                                                || trimmed.starts_with("***")
2092                                                || trimmed.starts_with("___")
2093                                                || trimmed.starts_with(">")
2094                                                || trimmed.contains('|') // Tables
2095                                                || between_line.heading.is_some()
2096                                        } else {
2097                                            false
2098                                        }
2099                                    });
2100                                    found_continuation = !has_structural_separators;
2101                                } else {
2102                                    // For unordered lists: also check for structural separators
2103                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2104                                        if let Some(between_line) = lines.get(idx) {
2105                                            let trimmed = between_line.content.trim();
2106                                            if trimmed.is_empty() {
2107                                                return false;
2108                                            }
2109                                            // Check for structural separators that break lists
2110                                            trimmed.starts_with("```")
2111                                                || trimmed.starts_with("~~~")
2112                                                || trimmed.starts_with("---")
2113                                                || trimmed.starts_with("***")
2114                                                || trimmed.starts_with("___")
2115                                                || trimmed.starts_with(">")
2116                                                || trimmed.contains('|') // Tables
2117                                                || between_line.heading.is_some()
2118                                        } else {
2119                                            false
2120                                        }
2121                                    });
2122                                    found_continuation = !has_structural_separators;
2123                                }
2124                            }
2125                        }
2126                    }
2127
2128                    if found_continuation {
2129                        // Include the blank line in the block
2130                        block.end_line = line_num;
2131                    } else {
2132                        // Blank line ends the list - don't include it
2133                        list_blocks.push(block.clone());
2134                        current_block = None;
2135                    }
2136                } else {
2137                    // Check for lazy continuation - non-indented line immediately after a list item
2138                    // But only if the line has sufficient indentation for the list type
2139                    let min_required_indent = if block.is_ordered {
2140                        current_indent_level + last_marker_width
2141                    } else {
2142                        current_indent_level + 2
2143                    };
2144
2145                    // For lazy continuation to apply, the line must either:
2146                    // 1. Have no indentation (true lazy continuation)
2147                    // 2. Have sufficient indentation for the list type
2148                    // BUT structural separators (headings, code blocks, etc.) should never be lazy continuations
2149                    let line_content = line_info.content.trim();
2150                    let is_structural_separator = line_info.heading.is_some()
2151                        || line_content.starts_with("```")
2152                        || line_content.starts_with("~~~")
2153                        || line_content.starts_with("---")
2154                        || line_content.starts_with("***")
2155                        || line_content.starts_with("___")
2156                        || line_content.starts_with(">")
2157                        || (line_content.contains('|')
2158                            && !line_content.contains("](")
2159                            && !line_content.contains("http")
2160                            && (line_content.matches('|').count() > 1
2161                                || line_content.starts_with('|')
2162                                || line_content.ends_with('|'))); // Tables
2163
2164                    // Allow lazy continuation if we're still within the same list block
2165                    // (not just immediately after a list item)
2166                    let is_lazy_continuation = !is_structural_separator
2167                        && !line_info.is_blank
2168                        && (line_info.indent == 0 || line_info.indent >= min_required_indent);
2169
2170                    if is_lazy_continuation {
2171                        // Additional check: if the line starts with uppercase and looks like a new sentence,
2172                        // it's probably not a continuation
2173                        let content_to_check = if !blockquote_prefix.is_empty() {
2174                            // Strip blockquote prefix to check the actual content
2175                            line_info
2176                                .content
2177                                .strip_prefix(&blockquote_prefix)
2178                                .unwrap_or(&line_info.content)
2179                                .trim()
2180                        } else {
2181                            line_info.content.trim()
2182                        };
2183
2184                        let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
2185
2186                        // If it starts with uppercase and the previous line ended with punctuation,
2187                        // it's likely a new paragraph, not a continuation
2188                        if starts_with_uppercase && last_list_item_line > 0 {
2189                            // This looks like a new paragraph
2190                            list_blocks.push(block.clone());
2191                            current_block = None;
2192                        } else {
2193                            // This is a lazy continuation line
2194                            block.end_line = line_num;
2195                        }
2196                    } else {
2197                        // Non-indented, non-blank line that's not a lazy continuation - end the block
2198                        list_blocks.push(block.clone());
2199                        current_block = None;
2200                    }
2201                }
2202            }
2203        }
2204
2205        // Don't forget the last block
2206        if let Some(block) = current_block {
2207            list_blocks.push(block);
2208        }
2209
2210        // Merge adjacent blocks that should be one
2211        merge_adjacent_list_blocks(&mut list_blocks, lines);
2212
2213        list_blocks
2214    }
2215
2216    /// Compute character frequency for fast content analysis
2217    fn compute_char_frequency(content: &str) -> CharFrequency {
2218        let mut frequency = CharFrequency::default();
2219
2220        for ch in content.chars() {
2221            match ch {
2222                '#' => frequency.hash_count += 1,
2223                '*' => frequency.asterisk_count += 1,
2224                '_' => frequency.underscore_count += 1,
2225                '-' => frequency.hyphen_count += 1,
2226                '+' => frequency.plus_count += 1,
2227                '>' => frequency.gt_count += 1,
2228                '|' => frequency.pipe_count += 1,
2229                '[' => frequency.bracket_count += 1,
2230                '`' => frequency.backtick_count += 1,
2231                '<' => frequency.lt_count += 1,
2232                '!' => frequency.exclamation_count += 1,
2233                '\n' => frequency.newline_count += 1,
2234                _ => {}
2235            }
2236        }
2237
2238        frequency
2239    }
2240
2241    /// Parse HTML tags in the content
2242    fn parse_html_tags(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<HtmlTag> {
2243        lazy_static! {
2244            static ref HTML_TAG_REGEX: regex::Regex =
2245                regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap();
2246        }
2247
2248        let mut html_tags = Vec::with_capacity(content.matches('<').count());
2249
2250        for cap in HTML_TAG_REGEX.captures_iter(content) {
2251            let full_match = cap.get(0).unwrap();
2252            let match_start = full_match.start();
2253            let match_end = full_match.end();
2254
2255            // Skip if in code block
2256            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2257                continue;
2258            }
2259
2260            let is_closing = !cap.get(1).unwrap().as_str().is_empty();
2261            let tag_name = cap.get(2).unwrap().as_str().to_lowercase();
2262            let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
2263
2264            // Find which line this tag is on
2265            let mut line_num = 1;
2266            let mut col_start = match_start;
2267            let mut col_end = match_end;
2268            for (idx, line_info) in lines.iter().enumerate() {
2269                if match_start >= line_info.byte_offset {
2270                    line_num = idx + 1;
2271                    col_start = match_start - line_info.byte_offset;
2272                    col_end = match_end - line_info.byte_offset;
2273                } else {
2274                    break;
2275                }
2276            }
2277
2278            html_tags.push(HtmlTag {
2279                line: line_num,
2280                start_col: col_start,
2281                end_col: col_end,
2282                byte_offset: match_start,
2283                byte_end: match_end,
2284                tag_name,
2285                is_closing,
2286                is_self_closing,
2287                raw_content: full_match.as_str().to_string(),
2288            });
2289        }
2290
2291        html_tags
2292    }
2293
2294    /// Parse emphasis spans in the content
2295    fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
2296        lazy_static! {
2297            static ref EMPHASIS_REGEX: regex::Regex =
2298                regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap();
2299        }
2300
2301        let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
2302
2303        for cap in EMPHASIS_REGEX.captures_iter(content) {
2304            let full_match = cap.get(0).unwrap();
2305            let match_start = full_match.start();
2306            let match_end = full_match.end();
2307
2308            // Skip if in code block
2309            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2310                continue;
2311            }
2312
2313            let opening_markers = cap.get(1).unwrap().as_str();
2314            let content_part = cap.get(2).unwrap().as_str();
2315            let closing_markers = cap.get(3).unwrap().as_str();
2316
2317            // Validate matching markers
2318            if opening_markers.chars().next() != closing_markers.chars().next()
2319                || opening_markers.len() != closing_markers.len()
2320            {
2321                continue;
2322            }
2323
2324            let marker = opening_markers.chars().next().unwrap();
2325            let marker_count = opening_markers.len();
2326
2327            // Find which line this emphasis is on
2328            let mut line_num = 1;
2329            let mut col_start = match_start;
2330            let mut col_end = match_end;
2331            for (idx, line_info) in lines.iter().enumerate() {
2332                if match_start >= line_info.byte_offset {
2333                    line_num = idx + 1;
2334                    col_start = match_start - line_info.byte_offset;
2335                    col_end = match_end - line_info.byte_offset;
2336                } else {
2337                    break;
2338                }
2339            }
2340
2341            emphasis_spans.push(EmphasisSpan {
2342                line: line_num,
2343                start_col: col_start,
2344                end_col: col_end,
2345                byte_offset: match_start,
2346                byte_end: match_end,
2347                marker,
2348                marker_count,
2349                content: content_part.to_string(),
2350            });
2351        }
2352
2353        emphasis_spans
2354    }
2355
2356    /// Parse table rows in the content
2357    fn parse_table_rows(lines: &[LineInfo]) -> Vec<TableRow> {
2358        let mut table_rows = Vec::with_capacity(lines.len() / 20);
2359
2360        for (line_idx, line_info) in lines.iter().enumerate() {
2361            // Skip lines in code blocks or blank lines
2362            if line_info.in_code_block || line_info.is_blank {
2363                continue;
2364            }
2365
2366            let line = &line_info.content;
2367            let line_num = line_idx + 1;
2368
2369            // Check if this line contains pipes (potential table row)
2370            if !line.contains('|') {
2371                continue;
2372            }
2373
2374            // Count columns by splitting on pipes
2375            let parts: Vec<&str> = line.split('|').collect();
2376            let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
2377
2378            // Check if this is a separator row
2379            let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
2380            let mut column_alignments = Vec::new();
2381
2382            if is_separator {
2383                for part in &parts[1..parts.len() - 1] {
2384                    // Skip first and last empty parts
2385                    let trimmed = part.trim();
2386                    let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
2387                        "center".to_string()
2388                    } else if trimmed.ends_with(':') {
2389                        "right".to_string()
2390                    } else if trimmed.starts_with(':') {
2391                        "left".to_string()
2392                    } else {
2393                        "none".to_string()
2394                    };
2395                    column_alignments.push(alignment);
2396                }
2397            }
2398
2399            table_rows.push(TableRow {
2400                line: line_num,
2401                is_separator,
2402                column_count,
2403                column_alignments,
2404            });
2405        }
2406
2407        table_rows
2408    }
2409
2410    /// Parse bare URLs and emails in the content
2411    fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
2412        let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
2413
2414        // Check for bare URLs (not in angle brackets or markdown links)
2415        for cap in BARE_URL_PATTERN.captures_iter(content) {
2416            let full_match = cap.get(0).unwrap();
2417            let match_start = full_match.start();
2418            let match_end = full_match.end();
2419
2420            // Skip if in code block
2421            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2422                continue;
2423            }
2424
2425            // Skip if already in angle brackets or markdown links
2426            let preceding_char = if match_start > 0 {
2427                content.chars().nth(match_start - 1)
2428            } else {
2429                None
2430            };
2431            let following_char = content.chars().nth(match_end);
2432
2433            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2434                continue;
2435            }
2436            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2437                continue;
2438            }
2439
2440            let url = full_match.as_str();
2441            let url_type = if url.starts_with("https://") {
2442                "https"
2443            } else if url.starts_with("http://") {
2444                "http"
2445            } else if url.starts_with("ftp://") {
2446                "ftp"
2447            } else {
2448                "other"
2449            };
2450
2451            // Find which line this URL is on
2452            let mut line_num = 1;
2453            let mut col_start = match_start;
2454            let mut col_end = match_end;
2455            for (idx, line_info) in lines.iter().enumerate() {
2456                if match_start >= line_info.byte_offset {
2457                    line_num = idx + 1;
2458                    col_start = match_start - line_info.byte_offset;
2459                    col_end = match_end - line_info.byte_offset;
2460                } else {
2461                    break;
2462                }
2463            }
2464
2465            bare_urls.push(BareUrl {
2466                line: line_num,
2467                start_col: col_start,
2468                end_col: col_end,
2469                byte_offset: match_start,
2470                byte_end: match_end,
2471                url: url.to_string(),
2472                url_type: url_type.to_string(),
2473            });
2474        }
2475
2476        // Check for bare email addresses
2477        for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
2478            let full_match = cap.get(0).unwrap();
2479            let match_start = full_match.start();
2480            let match_end = full_match.end();
2481
2482            // Skip if in code block
2483            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2484                continue;
2485            }
2486
2487            // Skip if already in angle brackets or markdown links
2488            let preceding_char = if match_start > 0 {
2489                content.chars().nth(match_start - 1)
2490            } else {
2491                None
2492            };
2493            let following_char = content.chars().nth(match_end);
2494
2495            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2496                continue;
2497            }
2498            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2499                continue;
2500            }
2501
2502            let email = full_match.as_str();
2503
2504            // Find which line this email is on
2505            let mut line_num = 1;
2506            let mut col_start = match_start;
2507            let mut col_end = match_end;
2508            for (idx, line_info) in lines.iter().enumerate() {
2509                if match_start >= line_info.byte_offset {
2510                    line_num = idx + 1;
2511                    col_start = match_start - line_info.byte_offset;
2512                    col_end = match_end - line_info.byte_offset;
2513                } else {
2514                    break;
2515                }
2516            }
2517
2518            bare_urls.push(BareUrl {
2519                line: line_num,
2520                start_col: col_start,
2521                end_col: col_end,
2522                byte_offset: match_start,
2523                byte_end: match_end,
2524                url: email.to_string(),
2525                url_type: "email".to_string(),
2526            });
2527        }
2528
2529        bare_urls
2530    }
2531}
2532
2533/// Merge adjacent list blocks that should be treated as one
2534fn merge_adjacent_list_blocks(list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
2535    if list_blocks.len() < 2 {
2536        return;
2537    }
2538
2539    let mut merger = ListBlockMerger::new(lines);
2540    *list_blocks = merger.merge(list_blocks);
2541}
2542
/// Helper struct that decides which neighbouring list blocks belong together and merges them
struct ListBlockMerger<'a> {
    /// Line records consulted when inspecting what lies between two blocks; a block's
    /// line number `n` is looked up at index `n - 1`
    lines: &'a [LineInfo],
}
2547
2548impl<'a> ListBlockMerger<'a> {
2549    fn new(lines: &'a [LineInfo]) -> Self {
2550        Self { lines }
2551    }
2552
2553    fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
2554        let mut merged = Vec::with_capacity(list_blocks.len());
2555        let mut current = list_blocks[0].clone();
2556
2557        for next in list_blocks.iter().skip(1) {
2558            if self.should_merge_blocks(&current, next) {
2559                current = self.merge_two_blocks(current, next);
2560            } else {
2561                merged.push(current);
2562                current = next.clone();
2563            }
2564        }
2565
2566        merged.push(current);
2567        merged
2568    }
2569
2570    /// Determine if two adjacent list blocks should be merged
2571    fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
2572        // Basic compatibility checks
2573        if !self.blocks_are_compatible(current, next) {
2574            return false;
2575        }
2576
2577        // Check spacing and content between blocks
2578        let spacing = self.analyze_spacing_between(current, next);
2579        match spacing {
2580            BlockSpacing::Consecutive => true,
2581            BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
2582            BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
2583                self.can_merge_with_content_between(current, next)
2584            }
2585        }
2586    }
2587
2588    /// Check if blocks have compatible structure for merging
2589    fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
2590        current.is_ordered == next.is_ordered
2591            && current.blockquote_prefix == next.blockquote_prefix
2592            && current.nesting_level == next.nesting_level
2593    }
2594
2595    /// Analyze the spacing between two list blocks
2596    fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
2597        let gap = next.start_line - current.end_line;
2598
2599        match gap {
2600            1 => BlockSpacing::Consecutive,
2601            2 => BlockSpacing::SingleBlank,
2602            _ if gap > 2 => {
2603                if self.has_only_blank_lines_between(current, next) {
2604                    BlockSpacing::MultipleBlanks
2605                } else {
2606                    BlockSpacing::ContentBetween
2607                }
2608            }
2609            _ => BlockSpacing::Consecutive, // gap == 0, overlapping (shouldn't happen)
2610        }
2611    }
2612
2613    /// Check if unordered lists can be merged with a single blank line between
2614    fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2615        // Check if there are structural separators between the blocks
2616        // If has_meaningful_content_between returns true, it means there are structural separators
2617        if has_meaningful_content_between(current, next, self.lines) {
2618            return false; // Structural separators prevent merging
2619        }
2620
2621        // Only merge unordered lists with same marker across single blank
2622        !current.is_ordered && current.marker == next.marker
2623    }
2624
2625    /// Check if ordered lists can be merged when there's content between them
2626    fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2627        // Do not merge lists if there are structural separators between them
2628        if has_meaningful_content_between(current, next, self.lines) {
2629            return false; // Structural separators prevent merging
2630        }
2631
2632        // Only consider merging ordered lists if there's no structural content between
2633        current.is_ordered && next.is_ordered
2634    }
2635
2636    /// Check if there are only blank lines between blocks
2637    fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2638        for line_num in (current.end_line + 1)..next.start_line {
2639            if let Some(line_info) = self.lines.get(line_num - 1)
2640                && !line_info.content.trim().is_empty()
2641            {
2642                return false;
2643            }
2644        }
2645        true
2646    }
2647
2648    /// Merge two compatible list blocks into one
2649    fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
2650        current.end_line = next.end_line;
2651        current.item_lines.extend_from_slice(&next.item_lines);
2652
2653        // Update max marker width
2654        current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
2655
2656        // Handle marker consistency for unordered lists
2657        if !current.is_ordered && self.markers_differ(&current, next) {
2658            current.marker = None; // Mixed markers
2659        }
2660
2661        current
2662    }
2663
2664    /// Check if two blocks have different markers
2665    fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
2666        current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
2667    }
2668}
2669
/// Types of spacing between two list blocks, as classified by `ListBlockMerger::analyze_spacing_between`
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    Consecutive,    // Next block starts on the line right after the current block ends
    SingleBlank,    // Exactly one line lies between the blocks (classified by line distance alone)
    MultipleBlanks, // Two or more lines between the blocks, all of them blank
    ContentBetween, // Two or more lines between the blocks, at least one of them non-blank
}
2678
2679/// Check if there's meaningful content (not just blank lines) between two list blocks
2680fn has_meaningful_content_between(current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
2681    // Check lines between current.end_line and next.start_line
2682    for line_num in (current.end_line + 1)..next.start_line {
2683        if let Some(line_info) = lines.get(line_num - 1) {
2684            // Convert to 0-indexed
2685            let trimmed = line_info.content.trim();
2686
2687            // Skip empty lines
2688            if trimmed.is_empty() {
2689                continue;
2690            }
2691
2692            // Check for structural separators that should separate lists (CommonMark compliant)
2693
2694            // Headings separate lists
2695            if line_info.heading.is_some() {
2696                return true; // Has meaningful content - headings separate lists
2697            }
2698
2699            // Horizontal rules separate lists (---, ***, ___)
2700            if is_horizontal_rule(trimmed) {
2701                return true; // Has meaningful content - horizontal rules separate lists
2702            }
2703
2704            // Tables separate lists (lines containing | but not in URLs or code)
2705            // Simple heuristic: tables typically have | at start/end or multiple |
2706            if trimmed.contains('|') && trimmed.len() > 1 {
2707                // Don't treat URLs with | as tables
2708                if !trimmed.contains("](") && !trimmed.contains("http") {
2709                    // More robust check: tables usually have multiple | or | at edges
2710                    let pipe_count = trimmed.matches('|').count();
2711                    if pipe_count > 1 || trimmed.starts_with('|') || trimmed.ends_with('|') {
2712                        return true; // Has meaningful content - tables separate lists
2713                    }
2714                }
2715            }
2716
2717            // Blockquotes separate lists
2718            if trimmed.starts_with('>') {
2719                return true; // Has meaningful content - blockquotes separate lists
2720            }
2721
2722            // Code block fences separate lists (unless properly indented as list content)
2723            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
2724                let line_indent = line_info.content.len() - line_info.content.trim_start().len();
2725
2726                // Check if this code block is properly indented as list continuation
2727                let min_continuation_indent = if current.is_ordered {
2728                    current.nesting_level + current.max_marker_width + 1 // +1 for space after marker
2729                } else {
2730                    current.nesting_level + 2
2731                };
2732
2733                if line_indent < min_continuation_indent {
2734                    // This is a standalone code block that separates lists
2735                    return true; // Has meaningful content - standalone code blocks separate lists
2736                }
2737            }
2738
2739            // Check if this line has proper indentation for list continuation
2740            let line_indent = line_info.content.len() - line_info.content.trim_start().len();
2741
2742            // Calculate minimum indentation needed to be list continuation
2743            let min_indent = if current.is_ordered {
2744                current.nesting_level + current.max_marker_width
2745            } else {
2746                current.nesting_level + 2
2747            };
2748
2749            // If the line is not indented enough to be list continuation, it's meaningful content
2750            if line_indent < min_indent {
2751                return true; // Has meaningful content - content not indented as list continuation
2752            }
2753
2754            // If we reach here, the line is properly indented as list continuation
2755            // Continue checking other lines
2756        }
2757    }
2758
2759    // Only blank lines or properly indented list continuation content between blocks
2760    false
2761}
2762
/// Check if a line is a horizontal rule (`---`, `***`, `___`).
///
/// `trimmed` is expected to have no surrounding whitespace. The line qualifies when it
/// starts with `-`, `*` or `_`, contains that same character at least three times, and
/// contains nothing else except spaces and tabs (so `- - -` qualifies, `--- text` and
/// `-*-` do not).
fn is_horizontal_rule(trimmed: &str) -> bool {
    // Fewer than three bytes cannot hold three marker characters.
    if trimmed.len() < 3 {
        return false;
    }

    let marker = match trimmed.chars().next() {
        Some(c) if matches!(c, '-' | '*' | '_') => c,
        _ => return false,
    };

    // Single pass over the characters; no intermediate `Vec<char>` is needed.
    let mut marker_count = 0usize;
    for ch in trimmed.chars() {
        if ch == marker {
            marker_count += 1;
        } else if ch != ' ' && ch != '\t' {
            return false; // Non-matching, non-whitespace character
        }
    }

    marker_count >= 3
}
2786
/// Unit tests for `LintContext`: line offsets, per-line info, list item
/// detection, and byte-offset to line/column mapping.
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input still yields a single line offset but no line entries.
    #[test]
    fn test_empty_content() {
        let context = LintContext::new("", MarkdownFlavor::Standard);
        assert_eq!(context.content, "");
        assert_eq!(context.line_offsets, vec![0]);
        assert_eq!(context.offset_to_line_col(0), (1, 1));
        assert_eq!(context.lines.len(), 0);
    }

    /// A single line without a trailing newline has exactly one offset.
    #[test]
    fn test_single_line() {
        let context = LintContext::new("# Hello", MarkdownFlavor::Standard);
        assert_eq!(context.content, "# Hello");
        assert_eq!(context.line_offsets, vec![0]);
        for (offset, position) in [(0, (1, 1)), (3, (1, 4))] {
            assert_eq!(context.offset_to_line_col(offset), position);
        }
    }

    /// Line offsets and offset-to-position mapping across several lines.
    #[test]
    fn test_multi_line() {
        let text = "# Title\n\nSecond line\nThird line";
        let context = LintContext::new(text, MarkdownFlavor::Standard);
        assert_eq!(context.line_offsets, vec![0, 8, 9, 21]);

        // (byte offset, expected 1-based (line, column))
        let cases = [
            (0, (1, 1)),  // start
            (8, (2, 1)),  // start of blank line
            (9, (3, 1)),  // start of 'Second line'
            (15, (3, 7)), // middle of 'Second line'
            (21, (4, 1)), // start of 'Third line'
        ];
        for (offset, position) in cases {
            assert_eq!(context.offset_to_line_col(offset), position);
        }
    }

    /// Per-line metadata (content, byte offset, indent, blank/code/list flags).
    #[test]
    fn test_line_info() {
        let text = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let context = LintContext::new(text, MarkdownFlavor::Standard);

        // Seven lines in total, including the fenced code block.
        assert_eq!(context.lines.len(), 7);

        // Line 1: "# Title"
        let heading = &context.lines[0];
        assert_eq!(heading.content, "# Title");
        assert_eq!(heading.byte_offset, 0);
        assert_eq!(heading.indent, 0);
        assert!(!heading.is_blank);
        assert!(!heading.in_code_block);
        assert!(heading.list_item.is_none());

        // Line 2: "    indented"
        let indented = &context.lines[1];
        assert_eq!(indented.content, "    indented");
        assert_eq!(indented.byte_offset, 8);
        assert_eq!(indented.indent, 4);
        assert!(!indented.is_blank);

        // Line 3: "" (blank)
        let blank = &context.lines[2];
        assert_eq!(blank.content, "");
        assert!(blank.is_blank);

        // Helper methods take 1-based line numbers.
        assert_eq!(context.line_to_byte_offset(1), Some(0));
        assert_eq!(context.line_to_byte_offset(2), Some(8));
        assert_eq!(context.line_info(1).map(|info| info.indent), Some(0));
        assert_eq!(context.line_info(2).map(|info| info.indent), Some(4));
    }

    /// Unordered and ordered markers are detected; plain text is not a list.
    #[test]
    fn test_list_item_detection() {
        let text = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
        let context = LintContext::new(text, MarkdownFlavor::Standard);

        // Line 1: "- Unordered item"
        let dash_line = &context.lines[0];
        assert!(dash_line.list_item.is_some());
        let dash_item = dash_line.list_item.as_ref().unwrap();
        assert_eq!(dash_item.marker, "-");
        assert!(!dash_item.is_ordered);
        assert_eq!(dash_item.marker_column, 0);
        assert_eq!(dash_item.content_column, 2);

        // Line 2: "  * Nested item"
        let star_line = &context.lines[1];
        assert!(star_line.list_item.is_some());
        let star_item = star_line.list_item.as_ref().unwrap();
        assert_eq!(star_item.marker, "*");
        assert_eq!(star_item.marker_column, 2);

        // Line 3: "1. Ordered item"
        let ordered_line = &context.lines[2];
        assert!(ordered_line.list_item.is_some());
        let ordered_item = ordered_line.list_item.as_ref().unwrap();
        assert_eq!(ordered_item.marker, "1.");
        assert!(ordered_item.is_ordered);
        assert_eq!(ordered_item.number, Some(1));

        // Line 6: "Not a list"
        let plain_line = &context.lines[5];
        assert!(plain_line.list_item.is_none());
    }

    /// Offsets at and just past each character of very short lines.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let text = "a\nb\nc";
        let context = LintContext::new(text, MarkdownFlavor::Standard);

        // line_offsets: [0, 2, 4]
        let cases = [
            (0, (1, 1)), // 'a'
            (1, (1, 2)), // after 'a'
            (2, (2, 1)), // 'b'
            (3, (2, 2)), // after 'b'
            (4, (3, 1)), // 'c'
            (5, (3, 2)), // after 'c'
        ];
        for (offset, position) in cases {
            assert_eq!(context.offset_to_line_col(offset), position);
        }
    }
}
rumdl_lib/lint_context.rs

rumdl_lib/
lint_context.rs