rumdl_lib/
lint_context.rs

1use crate::config::MarkdownFlavor;
2use crate::utils::ast_utils::get_cached_ast;
3use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
4use lazy_static::lazy_static;
5use markdown::mdast::Node;
6use regex::Regex;
7
lazy_static! {
    // Link pattern covering both inline links `[text](url)` and reference links `[text][ref]`.
    // Flags: (?s) makes `.` match newlines; (?x) is verbose mode, so the whitespace and the
    // `# ...` comments inside the pattern are ignored by the regex engine.
    // Note: this also matches the `[alt](url)` tail of an image; `parse_links` filters out
    // matches that are preceded by `!`.
    static ref LINK_PATTERN: Regex = Regex::new(
        r"(?sx)
        \[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]          # Link text in group 1 (handles nested brackets)
        (?:
            \(([^)]*)\)       # Inline URL in group 2 (can be empty)
            |
            \[([^\]]*)\]      # Reference ID in group 3
        )"
    ).unwrap();

    // Image pattern: same shape as LINK_PATTERN but requires a leading `!`.
    // Flags: (?s) makes `.` match newlines; (?x) enables verbose mode (see above).
    static ref IMAGE_PATTERN: Regex = Regex::new(
        r"(?sx)
        !\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]         # Alt text in group 1 (handles nested brackets)
        (?:
            \(([^)]*)\)       # Inline URL in group 2 (can be empty)
            |
            \[([^\]]*)\]      # Reference ID in group 3
        )"
    ).unwrap();

    // Reference definition pattern: `[id]: url "title"` (title optional, single or double quoted).
    // Groups: 1 = id, 2 = url, 3 = double-quoted title, 4 = single-quoted title.
    // (?m) makes `^`/`$` match at line boundaries; up to 3 leading spaces are allowed.
    static ref REF_DEF_PATTERN: Regex = Regex::new(
        r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#
    ).unwrap();

    // Backtick-run pattern: matches one or more consecutive backticks.
    // It has no capture groups and does not match the code span content itself; pairing an
    // opening run with its closing run is left to the code that uses this pattern.
    static ref CODE_SPAN_PATTERN: Regex = Regex::new(
        r"`+"
    ).unwrap();

    // Pattern for bare URLs with an http, https or ftp scheme (scheme captured in group 1),
    // followed by optional port, path, query and fragment parts.
    static ref BARE_URL_PATTERN: Regex = Regex::new(
        r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
    ).unwrap();

    // Pattern for email addresses (local part, `@`, domain, and a TLD of at least 2 letters)
    static ref BARE_EMAIL_PATTERN: Regex = Regex::new(
        r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    ).unwrap();

    // Pattern for angle bracket links such as `<https://example.com>` or `<user@example.com>`
    // (to exclude from bare URL detection); group 1 is the text between the brackets.
    static ref ANGLE_BRACKET_PATTERN: Regex = Regex::new(
        r"<((?:https?|ftp)://[^>]+|[^@\s]+@[^@\s]+\.[^@\s>]+)>"
    ).unwrap();

    // Pattern for blockquote prefix in parse_list_blocks: optional leading whitespace,
    // one or more `>` markers, and any whitespace that follows them (all in group 1).
    static ref BLOCKQUOTE_PREFIX_REGEX: Regex = Regex::new(r"^(\s*>+\s*)").unwrap();
}
62
/// Pre-computed information about a single line of the document.
///
/// Entries are stored in `LintContext::lines` and looked up by 1-indexed line
/// number through `LintContext::line_info`.
#[derive(Debug, Clone)]
pub struct LineInfo {
    /// The actual line content (without newline)
    pub content: String,
    /// Byte offset where this line starts in the document
    pub byte_offset: usize,
    /// Number of leading spaces/tabs
    pub indent: usize,
    /// Whether the line is blank (empty or only whitespace)
    pub is_blank: bool,
    /// Whether this line is inside a code block
    pub in_code_block: bool,
    /// Whether this line is inside front matter
    pub in_front_matter: bool,
    /// Whether this line is inside an HTML block
    pub in_html_block: bool,
    /// List item information if this line starts a list item (`None` otherwise)
    pub list_item: Option<ListItemInfo>,
    /// Heading information if this line is a heading (`None` otherwise)
    pub heading: Option<HeadingInfo>,
    /// Blockquote information if this line is a blockquote (`None` otherwise)
    pub blockquote: Option<BlockquoteInfo>,
}
87
/// Information about a list item.
///
/// Attached to the line that starts the item via `LineInfo::list_item`.
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// The marker used (*, -, +, or number with . or ))
    pub marker: String,
    /// Whether it's ordered (true) or unordered (false)
    pub is_ordered: bool,
    /// The number for ordered lists (`None` for unordered lists)
    pub number: Option<usize>,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where content after marker starts
    pub content_column: usize,
}
102
/// Heading style type, recorded per heading in `HeadingInfo::style`.
#[derive(Debug, Clone, PartialEq)]
pub enum HeadingStyle {
    /// ATX style heading (# Heading)
    ATX,
    /// Setext style heading with = underline
    Setext1,
    /// Setext style heading with - underline
    Setext2,
}
113
/// Parsed link information, as produced by `LintContext::parse_links`.
#[derive(Debug, Clone)]
pub struct ParsedLink {
    /// Line number (1-indexed) on which the link starts
    pub line: usize,
    /// Start column (0-indexed) in the line, measured in bytes from the line start
    pub start_col: usize,
    /// End column (0-indexed), measured in bytes from the start of the line on
    /// which the link ends (links may span several lines)
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Link text
    pub text: String,
    /// Link URL for inline links; left empty for reference links, whose target
    /// is resolved through the reference definitions
    pub url: String,
    /// Whether this is a reference link [text][ref] vs inline [text](url)
    pub is_reference: bool,
    /// Reference ID for reference links, lowercased; for an empty reference
    /// (`[text][]`) this is the lowercased link text. `None` for inline links.
    pub reference_id: Option<String>,
}
136
/// Parsed image information; the images of a document are collected in
/// `LintContext::images`.
#[derive(Debug, Clone)]
pub struct ParsedImage {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Alt text
    pub alt_text: String,
    /// Image URL or reference
    pub url: String,
    /// Whether this is a reference image ![alt][ref] vs inline ![alt](url)
    pub is_reference: bool,
    /// Reference ID for reference images
    pub reference_id: Option<String>,
}
159
/// Reference definition [ref]: url "title"
///
/// Looked up by `LintContext::get_reference_url`, which lowercases the
/// requested ID before comparing it with `id`.
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// Line number (1-indexed)
    pub line: usize,
    /// Reference ID (normalized to lowercase)
    pub id: String,
    /// URL
    pub url: String,
    /// Optional title
    pub title: Option<String>,
}
172
/// Parsed code span information.
///
/// `LintContext::is_in_code_block_or_span` treats `byte_offset..byte_end` as a
/// half-open byte range, while `LintContext::is_in_code_span` treats
/// `start_col..=end_col` as an inclusive column range on `line`.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Number of backticks used (1, 2, 3, etc.)
    pub backtick_count: usize,
    /// Content inside the code span (without backticks)
    pub content: String,
}
191
/// Information about a heading, attached to its line via `LineInfo::heading`.
#[derive(Debug, Clone)]
pub struct HeadingInfo {
    /// Heading level (1-6 for ATX, 1-2 for Setext)
    pub level: u8,
    /// Style of heading
    pub style: HeadingStyle,
    /// The heading marker (# characters or underline)
    pub marker: String,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where heading text starts
    pub content_column: usize,
    /// The heading text (without markers and without custom ID syntax)
    pub text: String,
    /// Custom header ID if present (e.g., from {#custom-id} syntax)
    pub custom_id: Option<String>,
    /// Original heading text including custom ID syntax
    pub raw_text: String,
    /// Whether it has a closing sequence (for ATX)
    pub has_closing_sequence: bool,
    /// The closing sequence if present
    pub closing_sequence: String,
}
216
/// Information about a blockquote line, attached to it via `LineInfo::blockquote`.
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Nesting level (1 for >, 2 for >>, etc.)
    pub nesting_level: usize,
    /// The indentation before the blockquote marker
    pub indent: String,
    /// Column where the first > starts (0-based)
    pub marker_column: usize,
    /// The blockquote prefix (e.g., "> ", ">> ", etc.)
    pub prefix: String,
    /// Content after the blockquote marker(s)
    pub content: String,
    /// Whether the line has no space after the marker
    pub has_no_space_after_marker: bool,
    /// Whether the line has multiple spaces after the marker
    pub has_multiple_spaces_after_marker: bool,
    /// Whether this is an empty blockquote line needing MD028 fix
    pub needs_md028_fix: bool,
}
237
/// Information about a list block.
///
/// `LintContext::is_in_list_block` and `LintContext::list_block_for_line`
/// treat `start_line..=end_line` as an inclusive range of line numbers.
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// Line number where the list starts (1-indexed)
    pub start_line: usize,
    /// Line number where the list ends (1-indexed)
    pub end_line: usize,
    /// Whether it's ordered or unordered
    pub is_ordered: bool,
    /// The consistent marker for unordered lists (if any)
    pub marker: Option<String>,
    /// Blockquote prefix for this list (empty if not in blockquote)
    pub blockquote_prefix: String,
    /// Lines that are list items within this block
    pub item_lines: Vec<usize>,
    /// Nesting level (0 for top-level lists)
    pub nesting_level: usize,
    /// Maximum marker width seen in this block (e.g., 3 for "1. ", 4 for "10. ")
    pub max_marker_width: usize,
}
258
259use std::sync::{Arc, Mutex};
260
/// Character frequency data for fast content analysis.
///
/// Backs the `has_char`, `char_count` and `likely_has_*` helpers on
/// `LintContext`; characters without a counter here are answered by scanning
/// the content directly.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Count of # characters (headings)
    pub hash_count: usize,
    /// Count of * characters (emphasis, lists, horizontal rules)
    pub asterisk_count: usize,
    /// Count of _ characters (emphasis, horizontal rules)
    pub underscore_count: usize,
    /// Count of - characters (lists, horizontal rules, setext headings)
    pub hyphen_count: usize,
    /// Count of + characters (lists)
    pub plus_count: usize,
    /// Count of > characters (blockquotes)
    pub gt_count: usize,
    /// Count of | characters (tables)
    pub pipe_count: usize,
    /// Count of [ characters (links, images)
    pub bracket_count: usize,
    /// Count of ` characters (code spans, code blocks)
    pub backtick_count: usize,
    /// Count of < characters (HTML tags, autolinks)
    pub lt_count: usize,
    /// Count of ! characters (images)
    pub exclamation_count: usize,
    /// Count of newline characters
    pub newline_count: usize,
}
289
/// Pre-parsed HTML tag information, exposed through `LintContext::html_tags`.
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Tag name (e.g., "div", "img", "br")
    pub tag_name: String,
    /// Whether it's a closing tag (</tag>)
    pub is_closing: bool,
    /// Whether it's self-closing (<tag />)
    pub is_self_closing: bool,
    /// Raw tag content
    pub raw_content: String,
}
312
/// Pre-parsed emphasis span information, exposed through
/// `LintContext::emphasis_spans`.
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Type of emphasis ('*' or '_')
    pub marker: char,
    /// Number of markers (1 for italic, 2 for bold, 3+ for bold+italic)
    pub marker_count: usize,
    /// Content inside the emphasis
    pub content: String,
}
333
/// Pre-parsed table row information, exposed through `LintContext::table_rows`.
#[derive(Debug, Clone)]
pub struct TableRow {
    /// Line number (1-indexed)
    pub line: usize,
    /// Whether this is a separator row (contains only |, -, :, and spaces)
    pub is_separator: bool,
    /// Number of columns (pipe-separated cells)
    pub column_count: usize,
    /// Alignment info from separator row; each entry is one of
    /// "left", "center", "right" or "none"
    pub column_alignments: Vec<String>,
}
346
/// Pre-parsed bare URL information (not in links), exposed through
/// `LintContext::bare_urls`.
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// The URL string
    pub url: String,
    /// Type of URL ("http", "https", "ftp", "email")
    pub url_type: String,
}
365
/// Pre-computed analysis of one Markdown document.
///
/// `new` eagerly computes the public fields; the private `*_cache` fields hold
/// results that are computed on demand by the accessor methods and then reused.
pub struct LintContext<'a> {
    /// The document text this context was built from
    pub content: &'a str,
    /// Byte offset at which each line starts (the first entry is always 0)
    pub line_offsets: Vec<usize>,
    /// Cached code block ranges (not including inline code spans)
    pub code_blocks: Vec<(usize, usize)>,
    /// Pre-computed line information, indexed by line number minus one
    pub lines: Vec<LineInfo>,
    /// Pre-parsed links
    pub links: Vec<ParsedLink>,
    /// Pre-parsed images
    pub images: Vec<ParsedImage>,
    /// Reference definitions
    pub reference_defs: Vec<ReferenceDef>,
    /// Cached inline code spans, served by `code_spans()`
    code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>,
    /// Pre-parsed list blocks
    pub list_blocks: Vec<ListBlock>,
    /// Character frequency analysis
    pub char_frequency: CharFrequency,
    /// Lazy-loaded HTML tags, served by `html_tags()`
    html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>,
    /// Lazy-loaded emphasis spans, served by `emphasis_spans()`
    emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>,
    /// Lazy-loaded table rows, served by `table_rows()`
    table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>,
    /// Lazy-loaded bare URLs, served by `bare_urls()`
    bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>,
    /// Cached AST handle, served by `get_ast()`
    ast_cache: Mutex<Option<Arc<Node>>>,
    /// Markdown flavor being used
    pub flavor: MarkdownFlavor,
}
384
385impl<'a> LintContext<'a> {
386    pub fn new(content: &'a str, flavor: MarkdownFlavor) -> Self {
387        let mut line_offsets = vec![0];
388        for (i, c) in content.char_indices() {
389            if c == '\n' {
390                line_offsets.push(i + 1);
391            }
392        }
393
394        // Detect code blocks once and cache them
395        let code_blocks = CodeBlockUtils::detect_code_blocks(content);
396
397        // Pre-compute line information
398        let mut lines = Self::compute_line_info(content, &line_offsets, &code_blocks, flavor);
399
400        // Parse code spans early so we can exclude them from link/image parsing
401        let ast = get_cached_ast(content);
402        let code_spans = Self::parse_code_spans(content, &lines, &ast);
403
404        // Parse links, images, references, and list blocks
405        let links = Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor);
406        let images = Self::parse_images(content, &lines, &code_blocks, &code_spans);
407        let reference_defs = Self::parse_reference_defs(content, &lines);
408        let list_blocks = Self::parse_list_blocks(&lines);
409
410        // Detect HTML blocks
411        Self::detect_html_blocks(&mut lines);
412
413        // Compute character frequency for fast content analysis
414        let char_frequency = Self::compute_char_frequency(content);
415
416        Self {
417            content,
418            line_offsets,
419            code_blocks,
420            lines,
421            links,
422            images,
423            reference_defs,
424            code_spans_cache: Mutex::new(Some(Arc::new(code_spans))),
425            list_blocks,
426            char_frequency,
427            html_tags_cache: Mutex::new(None),
428            emphasis_spans_cache: Mutex::new(None),
429            table_rows_cache: Mutex::new(None),
430            bare_urls_cache: Mutex::new(None),
431            ast_cache: Mutex::new(None),
432            flavor,
433        }
434    }
435
436    /// Get AST - uses global cache for deduplication
437    pub fn get_ast(&self) -> Arc<Node> {
438        let mut cache = self.ast_cache.lock().unwrap();
439
440        if cache.is_none() {
441            // Use global AST cache to avoid duplicate parsing
442            // MarkdownAst is just a type alias for Node, so no conversion needed
443            *cache = Some(get_cached_ast(self.content));
444        }
445
446        cache.as_ref().unwrap().clone()
447    }
448
449    /// Get code spans - computed lazily on first access
450    pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
451        let mut cache = self.code_spans_cache.lock().unwrap();
452
453        // Check if we need to compute code spans
454        if cache.is_none() {
455            let ast = self.get_ast();
456            let code_spans = Self::parse_code_spans(self.content, &self.lines, &ast);
457            *cache = Some(Arc::new(code_spans));
458        }
459
460        // Return a reference to the cached code spans
461        cache.as_ref().unwrap().clone()
462    }
463
464    /// Get HTML tags - computed lazily on first access
465    pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
466        let mut cache = self.html_tags_cache.lock().unwrap();
467
468        if cache.is_none() {
469            let html_tags = Self::parse_html_tags(self.content, &self.lines, &self.code_blocks);
470            *cache = Some(Arc::new(html_tags));
471        }
472
473        cache.as_ref().unwrap().clone()
474    }
475
476    /// Get emphasis spans - computed lazily on first access
477    pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
478        let mut cache = self.emphasis_spans_cache.lock().unwrap();
479
480        if cache.is_none() {
481            let emphasis_spans = Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks);
482            *cache = Some(Arc::new(emphasis_spans));
483        }
484
485        cache.as_ref().unwrap().clone()
486    }
487
488    /// Get table rows - computed lazily on first access
489    pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
490        let mut cache = self.table_rows_cache.lock().unwrap();
491
492        if cache.is_none() {
493            let table_rows = Self::parse_table_rows(&self.lines);
494            *cache = Some(Arc::new(table_rows));
495        }
496
497        cache.as_ref().unwrap().clone()
498    }
499
500    /// Get bare URLs - computed lazily on first access
501    pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
502        let mut cache = self.bare_urls_cache.lock().unwrap();
503
504        if cache.is_none() {
505            let bare_urls = Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks);
506            *cache = Some(Arc::new(bare_urls));
507        }
508
509        cache.as_ref().unwrap().clone()
510    }
511
512    /// Map a byte offset to (line, column)
513    pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
514        match self.line_offsets.binary_search(&offset) {
515            Ok(line) => (line + 1, 1),
516            Err(line) => {
517                let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
518                (line, offset - line_start + 1)
519            }
520        }
521    }
522
523    /// Check if a position is within a code block or code span
524    pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
525        // Check code blocks first
526        if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
527            return true;
528        }
529
530        // Check inline code spans (lazy load if needed)
531        self.code_spans()
532            .iter()
533            .any(|span| pos >= span.byte_offset && pos < span.byte_end)
534    }
535
536    /// Get line information by line number (1-indexed)
537    pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
538        if line_num > 0 {
539            self.lines.get(line_num - 1)
540        } else {
541            None
542        }
543    }
544
545    /// Get byte offset for a line number (1-indexed)
546    pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
547        self.line_info(line_num).map(|info| info.byte_offset)
548    }
549
550    /// Get URL for a reference link/image by its ID
551    pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
552        let normalized_id = ref_id.to_lowercase();
553        self.reference_defs
554            .iter()
555            .find(|def| def.id == normalized_id)
556            .map(|def| def.url.as_str())
557    }
558
559    /// Get links on a specific line
560    pub fn links_on_line(&self, line_num: usize) -> Vec<&ParsedLink> {
561        self.links.iter().filter(|link| link.line == line_num).collect()
562    }
563
564    /// Get images on a specific line
565    pub fn images_on_line(&self, line_num: usize) -> Vec<&ParsedImage> {
566        self.images.iter().filter(|img| img.line == line_num).collect()
567    }
568
569    /// Check if a line is part of a list block
570    pub fn is_in_list_block(&self, line_num: usize) -> bool {
571        self.list_blocks
572            .iter()
573            .any(|block| line_num >= block.start_line && line_num <= block.end_line)
574    }
575
576    /// Get the list block containing a specific line
577    pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
578        self.list_blocks
579            .iter()
580            .find(|block| line_num >= block.start_line && line_num <= block.end_line)
581    }
582
583    // Compatibility methods for DocumentStructure migration
584
585    /// Check if a line is within a code block
586    pub fn is_in_code_block(&self, line_num: usize) -> bool {
587        if line_num == 0 || line_num > self.lines.len() {
588            return false;
589        }
590        self.lines[line_num - 1].in_code_block
591    }
592
593    /// Check if a line is within front matter
594    pub fn is_in_front_matter(&self, line_num: usize) -> bool {
595        if line_num == 0 || line_num > self.lines.len() {
596            return false;
597        }
598        self.lines[line_num - 1].in_front_matter
599    }
600
601    /// Check if a line is within an HTML block
602    pub fn is_in_html_block(&self, line_num: usize) -> bool {
603        if line_num == 0 || line_num > self.lines.len() {
604            return false;
605        }
606        self.lines[line_num - 1].in_html_block
607    }
608
609    /// Check if a line and column is within a code span
610    pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
611        if line_num == 0 || line_num > self.lines.len() {
612            return false;
613        }
614
615        // Use the code spans cache to check
616        let code_spans = self.code_spans();
617        code_spans
618            .iter()
619            .any(|span| span.line == line_num && col >= span.start_col && col <= span.end_col)
620    }
621
622    /// Check if content has any instances of a specific character (fast)
623    pub fn has_char(&self, ch: char) -> bool {
624        match ch {
625            '#' => self.char_frequency.hash_count > 0,
626            '*' => self.char_frequency.asterisk_count > 0,
627            '_' => self.char_frequency.underscore_count > 0,
628            '-' => self.char_frequency.hyphen_count > 0,
629            '+' => self.char_frequency.plus_count > 0,
630            '>' => self.char_frequency.gt_count > 0,
631            '|' => self.char_frequency.pipe_count > 0,
632            '[' => self.char_frequency.bracket_count > 0,
633            '`' => self.char_frequency.backtick_count > 0,
634            '<' => self.char_frequency.lt_count > 0,
635            '!' => self.char_frequency.exclamation_count > 0,
636            '\n' => self.char_frequency.newline_count > 0,
637            _ => self.content.contains(ch), // Fallback for other characters
638        }
639    }
640
641    /// Get count of a specific character (fast)
642    pub fn char_count(&self, ch: char) -> usize {
643        match ch {
644            '#' => self.char_frequency.hash_count,
645            '*' => self.char_frequency.asterisk_count,
646            '_' => self.char_frequency.underscore_count,
647            '-' => self.char_frequency.hyphen_count,
648            '+' => self.char_frequency.plus_count,
649            '>' => self.char_frequency.gt_count,
650            '|' => self.char_frequency.pipe_count,
651            '[' => self.char_frequency.bracket_count,
652            '`' => self.char_frequency.backtick_count,
653            '<' => self.char_frequency.lt_count,
654            '!' => self.char_frequency.exclamation_count,
655            '\n' => self.char_frequency.newline_count,
656            _ => self.content.matches(ch).count(), // Fallback for other characters
657        }
658    }
659
660    /// Check if content likely contains headings (fast)
661    pub fn likely_has_headings(&self) -> bool {
662        self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 // Potential setext underlines
663    }
664
665    /// Check if content likely contains lists (fast)
666    pub fn likely_has_lists(&self) -> bool {
667        self.char_frequency.asterisk_count > 0
668            || self.char_frequency.hyphen_count > 0
669            || self.char_frequency.plus_count > 0
670    }
671
672    /// Check if content likely contains emphasis (fast)
673    pub fn likely_has_emphasis(&self) -> bool {
674        self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
675    }
676
677    /// Check if content likely contains tables (fast)
678    pub fn likely_has_tables(&self) -> bool {
679        self.char_frequency.pipe_count > 2
680    }
681
682    /// Check if content likely contains blockquotes (fast)
683    pub fn likely_has_blockquotes(&self) -> bool {
684        self.char_frequency.gt_count > 0
685    }
686
687    /// Check if content likely contains code (fast)
688    pub fn likely_has_code(&self) -> bool {
689        self.char_frequency.backtick_count > 0
690    }
691
692    /// Check if content likely contains links or images (fast)
693    pub fn likely_has_links_or_images(&self) -> bool {
694        self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
695    }
696
697    /// Check if content likely contains HTML (fast)
698    pub fn likely_has_html(&self) -> bool {
699        self.char_frequency.lt_count > 0
700    }
701
702    /// Get HTML tags on a specific line
703    pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
704        self.html_tags()
705            .iter()
706            .filter(|tag| tag.line == line_num)
707            .cloned()
708            .collect()
709    }
710
711    /// Get emphasis spans on a specific line
712    pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
713        self.emphasis_spans()
714            .iter()
715            .filter(|span| span.line == line_num)
716            .cloned()
717            .collect()
718    }
719
720    /// Get table rows on a specific line
721    pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
722        self.table_rows()
723            .iter()
724            .filter(|row| row.line == line_num)
725            .cloned()
726            .collect()
727    }
728
729    /// Get bare URLs on a specific line
730    pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
731        self.bare_urls()
732            .iter()
733            .filter(|url| url.line == line_num)
734            .cloned()
735            .collect()
736    }
737
738    /// Parse all links in the content
739    fn parse_links(
740        content: &str,
741        lines: &[LineInfo],
742        code_blocks: &[(usize, usize)],
743        code_spans: &[CodeSpan],
744        flavor: MarkdownFlavor,
745    ) -> Vec<ParsedLink> {
746        use crate::utils::skip_context::is_mkdocs_snippet_line;
747
748        // Pre-size based on a heuristic: most markdown files have relatively few links
749        let mut links = Vec::with_capacity(content.len() / 500); // ~1 link per 500 chars
750
751        // Parse links across the entire content, not line by line
752        for cap in LINK_PATTERN.captures_iter(content) {
753            let full_match = cap.get(0).unwrap();
754            let match_start = full_match.start();
755            let match_end = full_match.end();
756
757            // Skip if the opening bracket is escaped (preceded by \)
758            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
759                continue;
760            }
761
762            // Skip if this is actually an image (preceded by !)
763            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
764                continue;
765            }
766
767            // Skip if in code block
768            if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
769                continue;
770            }
771
772            // Skip if in code span
773            if code_spans
774                .iter()
775                .any(|span| match_start >= span.byte_offset && match_start < span.byte_end)
776            {
777                continue;
778            }
779
780            // Skip if this link is on a MkDocs snippet line
781            // Find which line this link is on
782            let line_idx = lines
783                .iter()
784                .position(|line| {
785                    match_start >= line.byte_offset && (match_start < line.byte_offset + line.content.len() + 1)
786                })
787                .unwrap_or(0);
788
789            if is_mkdocs_snippet_line(&lines[line_idx].content, flavor) {
790                continue;
791            }
792
793            // Find which line this link starts on
794            let mut line_num = 1;
795            let mut col_start = match_start;
796            for (idx, line_info) in lines.iter().enumerate() {
797                if match_start >= line_info.byte_offset {
798                    line_num = idx + 1;
799                    col_start = match_start - line_info.byte_offset;
800                } else {
801                    break;
802                }
803            }
804
805            // Find which line this link ends on (and calculate column on that line)
806            let mut end_line_num = 1;
807            let mut col_end = match_end;
808            for (idx, line_info) in lines.iter().enumerate() {
809                if match_end > line_info.byte_offset {
810                    end_line_num = idx + 1;
811                    col_end = match_end - line_info.byte_offset;
812                } else {
813                    break;
814                }
815            }
816
817            // For single-line links, use the same approach as before
818            if line_num == end_line_num {
819                // col_end is already correct
820            } else {
821                // For multi-line links, col_end represents the column on the ending line
822                // which is what we want
823            }
824
825            let text = cap.get(1).map_or("", |m| m.as_str()).to_string();
826
827            if let Some(inline_url) = cap.get(2) {
828                // Inline link
829                links.push(ParsedLink {
830                    line: line_num,
831                    start_col: col_start,
832                    end_col: col_end,
833                    byte_offset: match_start,
834                    byte_end: match_end,
835                    text,
836                    url: inline_url.as_str().to_string(),
837                    is_reference: false,
838                    reference_id: None,
839                });
840            } else if let Some(ref_id) = cap.get(3) {
841                // Reference link
842                let ref_id_str = ref_id.as_str();
843                let normalized_ref = if ref_id_str.is_empty() {
844                    text.to_lowercase() // Implicit reference
845                } else {
846                    ref_id_str.to_lowercase()
847                };
848
849                links.push(ParsedLink {
850                    line: line_num,
851                    start_col: col_start,
852                    end_col: col_end,
853                    byte_offset: match_start,
854                    byte_end: match_end,
855                    text,
856                    url: String::new(), // Will be resolved with reference_defs
857                    is_reference: true,
858                    reference_id: Some(normalized_ref),
859                });
860            }
861        }
862
863        links
864    }
865
866    /// Parse all images in the content
867    fn parse_images(
868        content: &str,
869        lines: &[LineInfo],
870        code_blocks: &[(usize, usize)],
871        code_spans: &[CodeSpan],
872    ) -> Vec<ParsedImage> {
873        // Pre-size based on a heuristic: images are less common than links
874        let mut images = Vec::with_capacity(content.len() / 1000); // ~1 image per 1000 chars
875
876        // Parse images across the entire content, not line by line
877        for cap in IMAGE_PATTERN.captures_iter(content) {
878            let full_match = cap.get(0).unwrap();
879            let match_start = full_match.start();
880            let match_end = full_match.end();
881
882            // Skip if the ! is escaped (preceded by \)
883            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
884                continue;
885            }
886
887            // Skip if in code block
888            if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
889                continue;
890            }
891
892            // Skip if in code span
893            if code_spans
894                .iter()
895                .any(|span| match_start >= span.byte_offset && match_start < span.byte_end)
896            {
897                continue;
898            }
899
900            // Find which line this image starts on
901            let mut line_num = 1;
902            let mut col_start = match_start;
903            for (idx, line_info) in lines.iter().enumerate() {
904                if match_start >= line_info.byte_offset {
905                    line_num = idx + 1;
906                    col_start = match_start - line_info.byte_offset;
907                } else {
908                    break;
909                }
910            }
911
912            // Find which line this image ends on (and calculate column on that line)
913            let mut end_line_num = 1;
914            let mut col_end = match_end;
915            for (idx, line_info) in lines.iter().enumerate() {
916                if match_end > line_info.byte_offset {
917                    end_line_num = idx + 1;
918                    col_end = match_end - line_info.byte_offset;
919                } else {
920                    break;
921                }
922            }
923
924            // For single-line images, use the same approach as before
925            if line_num == end_line_num {
926                // col_end is already correct
927            } else {
928                // For multi-line images, col_end represents the column on the ending line
929                // which is what we want
930            }
931
932            let alt_text = cap.get(1).map_or("", |m| m.as_str()).to_string();
933
934            if let Some(inline_url) = cap.get(2) {
935                // Inline image
936                images.push(ParsedImage {
937                    line: line_num,
938                    start_col: col_start,
939                    end_col: col_end,
940                    byte_offset: match_start,
941                    byte_end: match_end,
942                    alt_text,
943                    url: inline_url.as_str().to_string(),
944                    is_reference: false,
945                    reference_id: None,
946                });
947            } else if let Some(ref_id) = cap.get(3) {
948                // Reference image
949                let ref_id_str = ref_id.as_str();
950                let normalized_ref = if ref_id_str.is_empty() {
951                    alt_text.to_lowercase() // Implicit reference
952                } else {
953                    ref_id_str.to_lowercase()
954                };
955
956                images.push(ParsedImage {
957                    line: line_num,
958                    start_col: col_start,
959                    end_col: col_end,
960                    byte_offset: match_start,
961                    byte_end: match_end,
962                    alt_text,
963                    url: String::new(), // Will be resolved with reference_defs
964                    is_reference: true,
965                    reference_id: Some(normalized_ref),
966                });
967            }
968        }
969
970        images
971    }
972
973    /// Parse reference definitions
974    fn parse_reference_defs(_content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
975        // Pre-size based on lines count as reference definitions are line-based
976        let mut refs = Vec::with_capacity(lines.len() / 20); // ~1 ref per 20 lines
977
978        for (line_idx, line_info) in lines.iter().enumerate() {
979            // Skip lines in code blocks
980            if line_info.in_code_block {
981                continue;
982            }
983
984            let line = &line_info.content;
985            let line_num = line_idx + 1;
986
987            if let Some(cap) = REF_DEF_PATTERN.captures(line) {
988                let id = cap.get(1).unwrap().as_str().to_lowercase();
989                let url = cap.get(2).unwrap().as_str().to_string();
990                let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
991
992                refs.push(ReferenceDef {
993                    line: line_num,
994                    id,
995                    url,
996                    title,
997                });
998            }
999        }
1000
1001        refs
1002    }
1003
1004    /// Pre-compute line information
1005    fn compute_line_info(
1006        content: &str,
1007        line_offsets: &[usize],
1008        code_blocks: &[(usize, usize)],
1009        flavor: MarkdownFlavor,
1010    ) -> Vec<LineInfo> {
1011        lazy_static! {
1012            // Regex for list detection - allow any whitespace including no space (to catch malformed lists)
1013            static ref UNORDERED_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)([-*+])([ \t]*)(.*)").unwrap();
1014            static ref ORDERED_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(\d+)([.)])([ \t]*)(.*)").unwrap();
1015
1016            // Regex for blockquote prefix
1017            static ref BLOCKQUOTE_REGEX: regex::Regex = regex::Regex::new(r"^(\s*>\s*)(.*)").unwrap();
1018
1019            // Regex for heading detection
1020            static ref ATX_HEADING_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap();
1021            static ref SETEXT_UNDERLINE_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap();
1022
1023            // Regex for blockquote detection
1024            static ref BLOCKQUOTE_REGEX_FULL: regex::Regex = regex::Regex::new(r"^(\s*)(>+)(\s*)(.*)$").unwrap();
1025        }
1026
1027        let content_lines: Vec<&str> = content.lines().collect();
1028        let mut lines = Vec::with_capacity(content_lines.len());
1029
1030        // Detect front matter boundaries FIRST, before any other parsing
1031        let mut in_front_matter = false;
1032        let mut front_matter_end = 0;
1033        if content_lines.first().map(|l| l.trim()) == Some("---") {
1034            in_front_matter = true;
1035            for (idx, line) in content_lines.iter().enumerate().skip(1) {
1036                if line.trim() == "---" {
1037                    front_matter_end = idx;
1038                    break;
1039                }
1040            }
1041        }
1042
1043        for (i, line) in content_lines.iter().enumerate() {
1044            let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
1045            let indent = line.len() - line.trim_start().len();
1046            // For blank detection, consider blockquote context
1047            let is_blank = if let Some(caps) = BLOCKQUOTE_REGEX.captures(line) {
1048                // In blockquote context, check if content after prefix is blank
1049                let after_prefix = caps.get(2).map_or("", |m| m.as_str());
1050                after_prefix.trim().is_empty()
1051            } else {
1052                line.trim().is_empty()
1053            };
1054            // Check if this line is inside a code block (not inline code span)
1055            // We only want to check for fenced/indented code blocks, not inline code
1056            let in_code_block = code_blocks.iter().any(|&(start, end)| {
1057                // Only consider ranges that span multiple lines (code blocks)
1058                // Inline code spans are typically on a single line
1059
1060                // Ensure we're at valid UTF-8 boundaries
1061                let safe_start = if start > 0 && !content.is_char_boundary(start) {
1062                    // Find the nearest valid boundary before start
1063                    let mut boundary = start;
1064                    while boundary > 0 && !content.is_char_boundary(boundary) {
1065                        boundary -= 1;
1066                    }
1067                    boundary
1068                } else {
1069                    start
1070                };
1071
1072                let safe_end = if end < content.len() && !content.is_char_boundary(end) {
1073                    // Find the nearest valid boundary after end
1074                    let mut boundary = end;
1075                    while boundary < content.len() && !content.is_char_boundary(boundary) {
1076                        boundary += 1;
1077                    }
1078                    boundary
1079                } else {
1080                    end.min(content.len())
1081                };
1082
1083                let block_content = &content[safe_start..safe_end];
1084                let is_multiline = block_content.contains('\n');
1085                let is_fenced = block_content.starts_with("```") || block_content.starts_with("~~~");
1086                let is_indented = !is_fenced
1087                    && block_content
1088                        .lines()
1089                        .all(|l| l.starts_with("    ") || l.starts_with("\t") || l.trim().is_empty());
1090
1091                byte_offset >= start && byte_offset < end && (is_multiline || is_fenced || is_indented)
1092            });
1093
1094            // Detect list items (skip if in frontmatter)
1095            let list_item = if !(in_code_block || is_blank || in_front_matter && i <= front_matter_end) {
1096                // Strip blockquote prefix if present for list detection
1097                let (line_for_list_check, blockquote_prefix_len) = if let Some(caps) = BLOCKQUOTE_REGEX.captures(line) {
1098                    let prefix = caps.get(1).unwrap().as_str();
1099                    let content = caps.get(2).unwrap().as_str();
1100                    (content, prefix.len())
1101                } else {
1102                    (&**line, 0)
1103                };
1104
1105                if let Some(caps) = UNORDERED_REGEX.captures(line_for_list_check) {
1106                    let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1107                    let marker = caps.get(2).map_or("", |m| m.as_str());
1108                    let spacing = caps.get(3).map_or("", |m| m.as_str());
1109                    let _content = caps.get(4).map_or("", |m| m.as_str());
1110                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1111                    let content_column = marker_column + marker.len() + spacing.len();
1112
1113                    // According to CommonMark spec, unordered list items MUST have at least one space
1114                    // after the marker (-, *, or +). Without a space, it's not a list item.
1115                    // This also naturally handles cases like:
1116                    // - *emphasis* (not a list)
1117                    // - **bold** (not a list)
1118                    // - --- (horizontal rule, not a list)
1119                    if spacing.is_empty() {
1120                        None
1121                    } else {
1122                        Some(ListItemInfo {
1123                            marker: marker.to_string(),
1124                            is_ordered: false,
1125                            number: None,
1126                            marker_column,
1127                            content_column,
1128                        })
1129                    }
1130                } else if let Some(caps) = ORDERED_REGEX.captures(line_for_list_check) {
1131                    let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1132                    let number_str = caps.get(2).map_or("", |m| m.as_str());
1133                    let delimiter = caps.get(3).map_or("", |m| m.as_str());
1134                    let spacing = caps.get(4).map_or("", |m| m.as_str());
1135                    let _content = caps.get(5).map_or("", |m| m.as_str());
1136                    let marker = format!("{number_str}{delimiter}");
1137                    let marker_column = blockquote_prefix_len + leading_spaces.len();
1138                    let content_column = marker_column + marker.len() + spacing.len();
1139
1140                    // According to CommonMark spec, ordered list items MUST have at least one space
1141                    // after the marker (period or parenthesis). Without a space, it's not a list item.
1142                    if spacing.is_empty() {
1143                        None
1144                    } else {
1145                        Some(ListItemInfo {
1146                            marker,
1147                            is_ordered: true,
1148                            number: number_str.parse().ok(),
1149                            marker_column,
1150                            content_column,
1151                        })
1152                    }
1153                } else {
1154                    None
1155                }
1156            } else {
1157                None
1158            };
1159
1160            lines.push(LineInfo {
1161                content: line.to_string(),
1162                byte_offset,
1163                indent,
1164                is_blank,
1165                in_code_block,
1166                in_front_matter: in_front_matter && i <= front_matter_end,
1167                in_html_block: false, // Will be populated after line creation
1168                list_item,
1169                heading: None,    // Will be populated in second pass for Setext headings
1170                blockquote: None, // Will be populated after line creation
1171            });
1172        }
1173
1174        // Second pass: detect headings (including Setext which needs look-ahead) and blockquotes
1175        for i in 0..content_lines.len() {
1176            if lines[i].in_code_block {
1177                continue;
1178            }
1179
1180            // Skip lines in front matter
1181            if in_front_matter && i <= front_matter_end {
1182                continue;
1183            }
1184
1185            let line = content_lines[i];
1186
1187            // Check for blockquotes (even on blank lines within blockquotes)
1188            if let Some(caps) = BLOCKQUOTE_REGEX_FULL.captures(line) {
1189                let indent_str = caps.get(1).map_or("", |m| m.as_str());
1190                let markers = caps.get(2).map_or("", |m| m.as_str());
1191                let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1192                let content = caps.get(4).map_or("", |m| m.as_str());
1193
1194                let nesting_level = markers.chars().filter(|&c| c == '>').count();
1195                let marker_column = indent_str.len();
1196
1197                // Build the prefix (indentation + markers + space)
1198                let prefix = format!("{indent_str}{markers}{spaces_after}");
1199
1200                // Check for various blockquote issues
1201                let has_no_space = spaces_after.is_empty() && !content.is_empty();
1202                // Consider tabs as multiple spaces, or actual multiple spaces
1203                let has_multiple_spaces = spaces_after.len() > 1 || spaces_after.contains('\t');
1204
1205                // Check if needs MD028 fix (empty blockquote line without proper spacing)
1206                // MD028 flags empty blockquote lines that don't have a single space after the marker
1207                // Lines like "> " or ">> " are already correct and don't need fixing
1208                let needs_md028_fix = content.is_empty() && spaces_after.is_empty();
1209
1210                lines[i].blockquote = Some(BlockquoteInfo {
1211                    nesting_level,
1212                    indent: indent_str.to_string(),
1213                    marker_column,
1214                    prefix,
1215                    content: content.to_string(),
1216                    has_no_space_after_marker: has_no_space,
1217                    has_multiple_spaces_after_marker: has_multiple_spaces,
1218                    needs_md028_fix,
1219                });
1220            }
1221
1222            // Skip heading detection for blank lines
1223            if lines[i].is_blank {
1224                continue;
1225            }
1226
1227            // Check for ATX headings (but skip MkDocs snippet lines)
1228            // In MkDocs flavor, lines like "# -8<- [start:name]" are snippet markers, not headings
1229            let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
1230                crate::utils::mkdocs_snippets::is_snippet_section_start(line)
1231                    || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
1232            } else {
1233                false
1234            };
1235
1236            if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1237                // Skip headings inside HTML comments
1238                if crate::utils::skip_context::is_in_html_comment(content, lines[i].byte_offset) {
1239                    continue;
1240                }
1241                let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1242                let hashes = caps.get(2).map_or("", |m| m.as_str());
1243                let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1244                let rest = caps.get(4).map_or("", |m| m.as_str());
1245
1246                let level = hashes.len() as u8;
1247                let marker_column = leading_spaces.len();
1248
1249                // Check for closing sequence, but handle custom IDs that might come after
1250                let (text, has_closing, closing_seq) = {
1251                    // First check if there's a custom ID at the end
1252                    let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1253                        // Check if this looks like a valid custom ID (ends with })
1254                        if rest[id_start..].trim_end().ends_with('}') {
1255                            // Split off the custom ID
1256                            (&rest[..id_start], &rest[id_start..])
1257                        } else {
1258                            (rest, "")
1259                        }
1260                    } else {
1261                        (rest, "")
1262                    };
1263
1264                    // Now look for closing hashes in the part before the custom ID
1265                    let trimmed_rest = rest_without_id.trim_end();
1266                    if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1267                        // Look for the start of the hash sequence
1268                        let mut start_of_hashes = last_hash_pos;
1269                        while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1270                            start_of_hashes -= 1;
1271                        }
1272
1273                        // Check if there's at least one space before the closing hashes
1274                        let has_space_before = start_of_hashes == 0
1275                            || trimmed_rest
1276                                .chars()
1277                                .nth(start_of_hashes - 1)
1278                                .is_some_and(|c| c.is_whitespace());
1279
1280                        // Check if this is a valid closing sequence (all hashes to end of trimmed part)
1281                        let potential_closing = &trimmed_rest[start_of_hashes..];
1282                        let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1283
1284                        if is_all_hashes && has_space_before {
1285                            // This is a closing sequence
1286                            let closing_hashes = potential_closing.to_string();
1287                            // The text is everything before the closing hashes
1288                            // Don't include the custom ID here - it will be extracted later
1289                            let text_part = if !custom_id_part.is_empty() {
1290                                // If we have a custom ID, append it back to get the full rest
1291                                // This allows the extract_header_id function to handle it properly
1292                                format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1293                            } else {
1294                                rest_without_id[..start_of_hashes].trim_end().to_string()
1295                            };
1296                            (text_part, true, closing_hashes)
1297                        } else {
1298                            // Not a valid closing sequence, return the full content
1299                            (rest.to_string(), false, String::new())
1300                        }
1301                    } else {
1302                        // No hashes found, return the full content
1303                        (rest.to_string(), false, String::new())
1304                    }
1305                };
1306
1307                let content_column = marker_column + hashes.len() + spaces_after.len();
1308
1309                // Extract custom header ID if present
1310                let raw_text = text.trim().to_string();
1311                let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1312
1313                // If no custom ID was found on the header line, check the next line for standalone attr-list
1314                if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1315                    let next_line = content_lines[i + 1];
1316                    if !lines[i + 1].in_code_block
1317                        && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1318                        && let Some(next_line_id) =
1319                            crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1320                    {
1321                        custom_id = Some(next_line_id);
1322                    }
1323                }
1324
1325                lines[i].heading = Some(HeadingInfo {
1326                    level,
1327                    style: HeadingStyle::ATX,
1328                    marker: hashes.to_string(),
1329                    marker_column,
1330                    content_column,
1331                    text: clean_text,
1332                    custom_id,
1333                    raw_text,
1334                    has_closing_sequence: has_closing,
1335                    closing_sequence: closing_seq,
1336                });
1337            }
1338            // Check for Setext headings (need to look at next line)
1339            else if i + 1 < content_lines.len() {
1340                let next_line = content_lines[i + 1];
1341                if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
1342                    // Skip if next line is front matter delimiter
1343                    if in_front_matter && i < front_matter_end {
1344                        continue;
1345                    }
1346
1347                    // Skip Setext headings inside HTML comments
1348                    if crate::utils::skip_context::is_in_html_comment(content, lines[i].byte_offset) {
1349                        continue;
1350                    }
1351
1352                    let underline = next_line.trim();
1353
1354                    // Skip if the underline looks like YAML delimiter (exactly 3 or more dashes)
1355                    // YAML uses exactly `---` while Setext headings typically use longer underlines
1356                    if underline == "---" {
1357                        continue;
1358                    }
1359
1360                    // Skip if the current line looks like YAML key-value syntax
1361                    let current_line_trimmed = line.trim();
1362                    if current_line_trimmed.contains(':')
1363                        && !current_line_trimmed.starts_with('#')
1364                        && !current_line_trimmed.contains('[')
1365                        && !current_line_trimmed.contains("](")
1366                    {
1367                        // This looks like "key: value" which suggests YAML, not a heading
1368                        continue;
1369                    }
1370
1371                    let level = if underline.starts_with('=') { 1 } else { 2 };
1372                    let style = if level == 1 {
1373                        HeadingStyle::Setext1
1374                    } else {
1375                        HeadingStyle::Setext2
1376                    };
1377
1378                    // Extract custom header ID if present
1379                    let raw_text = line.trim().to_string();
1380                    let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1381
1382                    // If no custom ID was found on the header line, check the line after underline for standalone attr-list
1383                    if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
1384                        let attr_line = content_lines[i + 2];
1385                        if !lines[i + 2].in_code_block
1386                            && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
1387                            && let Some(attr_line_id) =
1388                                crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
1389                        {
1390                            custom_id = Some(attr_line_id);
1391                        }
1392                    }
1393
1394                    lines[i].heading = Some(HeadingInfo {
1395                        level,
1396                        style,
1397                        marker: underline.to_string(),
1398                        marker_column: next_line.len() - next_line.trim_start().len(),
1399                        content_column: lines[i].indent,
1400                        text: clean_text,
1401                        custom_id,
1402                        raw_text,
1403                        has_closing_sequence: false,
1404                        closing_sequence: String::new(),
1405                    });
1406                }
1407            }
1408        }
1409
1410        lines
1411    }
1412
1413    /// Detect HTML blocks in the content
1414    fn detect_html_blocks(lines: &mut [LineInfo]) {
1415        // HTML block elements that trigger block context
1416        const BLOCK_ELEMENTS: &[&str] = &[
1417            "address",
1418            "article",
1419            "aside",
1420            "blockquote",
1421            "details",
1422            "dialog",
1423            "dd",
1424            "div",
1425            "dl",
1426            "dt",
1427            "fieldset",
1428            "figcaption",
1429            "figure",
1430            "footer",
1431            "form",
1432            "h1",
1433            "h2",
1434            "h3",
1435            "h4",
1436            "h5",
1437            "h6",
1438            "header",
1439            "hr",
1440            "li",
1441            "main",
1442            "nav",
1443            "ol",
1444            "p",
1445            "pre",
1446            "section",
1447            "table",
1448            "tbody",
1449            "td",
1450            "tfoot",
1451            "th",
1452            "thead",
1453            "tr",
1454            "ul",
1455        ];
1456
1457        let mut i = 0;
1458        while i < lines.len() {
1459            // Skip if already in code block or front matter
1460            if lines[i].in_code_block || lines[i].in_front_matter {
1461                i += 1;
1462                continue;
1463            }
1464
1465            let trimmed = lines[i].content.trim_start();
1466
1467            // Check if line starts with an HTML tag
1468            if trimmed.starts_with('<') && trimmed.len() > 1 {
1469                // Extract tag name safely
1470                let after_bracket = &trimmed[1..];
1471                let is_closing = after_bracket.starts_with('/');
1472                let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
1473
1474                // Extract tag name (stop at space, >, /, or end of string)
1475                let tag_name = tag_start
1476                    .chars()
1477                    .take_while(|c| c.is_ascii_alphabetic() || *c == '-')
1478                    .collect::<String>()
1479                    .to_lowercase();
1480
1481                // Check if it's a block element
1482                if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
1483                    // Mark this line as in HTML block
1484                    lines[i].in_html_block = true;
1485
1486                    // For simplicity, just mark lines until we find a closing tag or reach a blank line
1487                    // This avoids complex nesting logic that might cause infinite loops
1488                    if !is_closing {
1489                        let closing_tag = format!("</{tag_name}>");
1490                        let mut j = i + 1;
1491                        while j < lines.len() && j < i + 100 {
1492                            // Limit search to 100 lines
1493                            // Stop at blank lines
1494                            if lines[j].is_blank {
1495                                break;
1496                            }
1497
1498                            lines[j].in_html_block = true;
1499
1500                            // Check if this line contains the closing tag
1501                            if lines[j].content.contains(&closing_tag) {
1502                                break;
1503                            }
1504                            j += 1;
1505                        }
1506                    }
1507                }
1508            }
1509
1510            i += 1;
1511        }
1512    }
1513
1514    /// Parse all inline code spans in the content using AST
1515    fn parse_code_spans(content: &str, lines: &[LineInfo], ast: &Node) -> Vec<CodeSpan> {
1516        let mut code_spans = Vec::new();
1517
1518        // Quick check - if no backticks, no code spans
1519        if !content.contains('`') {
1520            return code_spans;
1521        }
1522
1523        // Helper function to recursively extract inline code spans from AST nodes
1524        fn extract_code_spans(node: &Node, content: &str, lines: &[LineInfo], spans: &mut Vec<CodeSpan>) {
1525            match node {
1526                Node::InlineCode(inline_code) => {
1527                    if let Some(pos) = &inline_code.position {
1528                        let start_pos = pos.start.offset;
1529                        let end_pos = pos.end.offset;
1530
1531                        // The position includes the backticks, extract the actual content
1532                        let full_span = &content[start_pos..end_pos];
1533                        let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
1534
1535                        // Extract content between backticks, preserving spaces
1536                        let content_start = start_pos + backtick_count;
1537                        let content_end = end_pos - backtick_count;
1538                        let span_content = if content_start < content_end {
1539                            content[content_start..content_end].to_string()
1540                        } else {
1541                            String::new()
1542                        };
1543
1544                        // Find which line this code span starts on
1545                        let mut line_num = 1;
1546                        let mut col_start = start_pos;
1547                        for (idx, line_info) in lines.iter().enumerate() {
1548                            if start_pos >= line_info.byte_offset {
1549                                line_num = idx + 1;
1550                                col_start = start_pos - line_info.byte_offset;
1551                            } else {
1552                                break;
1553                            }
1554                        }
1555
1556                        // Find end column
1557                        let mut col_end = end_pos;
1558                        for line_info in lines.iter() {
1559                            if end_pos > line_info.byte_offset {
1560                                col_end = end_pos - line_info.byte_offset;
1561                            } else {
1562                                break;
1563                            }
1564                        }
1565
1566                        spans.push(CodeSpan {
1567                            line: line_num,
1568                            start_col: col_start,
1569                            end_col: col_end,
1570                            byte_offset: start_pos,
1571                            byte_end: end_pos,
1572                            backtick_count,
1573                            content: span_content,
1574                        });
1575                    }
1576                }
1577                // Recursively process children
1578                Node::Root(root) => {
1579                    for child in &root.children {
1580                        extract_code_spans(child, content, lines, spans);
1581                    }
1582                }
1583                Node::Paragraph(para) => {
1584                    for child in &para.children {
1585                        extract_code_spans(child, content, lines, spans);
1586                    }
1587                }
1588                Node::Heading(heading) => {
1589                    for child in &heading.children {
1590                        extract_code_spans(child, content, lines, spans);
1591                    }
1592                }
1593                Node::List(list) => {
1594                    for child in &list.children {
1595                        extract_code_spans(child, content, lines, spans);
1596                    }
1597                }
1598                Node::ListItem(item) => {
1599                    for child in &item.children {
1600                        extract_code_spans(child, content, lines, spans);
1601                    }
1602                }
1603                Node::Blockquote(blockquote) => {
1604                    for child in &blockquote.children {
1605                        extract_code_spans(child, content, lines, spans);
1606                    }
1607                }
1608                Node::Table(table) => {
1609                    for child in &table.children {
1610                        extract_code_spans(child, content, lines, spans);
1611                    }
1612                }
1613                Node::TableRow(row) => {
1614                    for child in &row.children {
1615                        extract_code_spans(child, content, lines, spans);
1616                    }
1617                }
1618                Node::TableCell(cell) => {
1619                    for child in &cell.children {
1620                        extract_code_spans(child, content, lines, spans);
1621                    }
1622                }
1623                Node::Emphasis(emphasis) => {
1624                    for child in &emphasis.children {
1625                        extract_code_spans(child, content, lines, spans);
1626                    }
1627                }
1628                Node::Strong(strong) => {
1629                    for child in &strong.children {
1630                        extract_code_spans(child, content, lines, spans);
1631                    }
1632                }
1633                Node::Link(link) => {
1634                    for child in &link.children {
1635                        extract_code_spans(child, content, lines, spans);
1636                    }
1637                }
1638                Node::LinkReference(link_ref) => {
1639                    for child in &link_ref.children {
1640                        extract_code_spans(child, content, lines, spans);
1641                    }
1642                }
1643                Node::FootnoteDefinition(footnote) => {
1644                    for child in &footnote.children {
1645                        extract_code_spans(child, content, lines, spans);
1646                    }
1647                }
1648                Node::Delete(delete) => {
1649                    for child in &delete.children {
1650                        extract_code_spans(child, content, lines, spans);
1651                    }
1652                }
1653                // Terminal nodes or nodes without relevant children
1654                Node::Code(_)
1655                | Node::Text(_)
1656                | Node::Html(_)
1657                | Node::Image(_)
1658                | Node::ImageReference(_)
1659                | Node::FootnoteReference(_)
1660                | Node::Break(_)
1661                | Node::ThematicBreak(_)
1662                | Node::Definition(_)
1663                | Node::Yaml(_)
1664                | Node::Toml(_)
1665                | Node::Math(_)
1666                | Node::InlineMath(_)
1667                | Node::MdxJsxFlowElement(_)
1668                | Node::MdxFlowExpression(_)
1669                | Node::MdxJsxTextElement(_)
1670                | Node::MdxTextExpression(_)
1671                | Node::MdxjsEsm(_) => {
1672                    // No children to process or not relevant for code spans
1673                }
1674            }
1675        }
1676
1677        // Extract all code spans from the AST
1678        extract_code_spans(ast, content, lines, &mut code_spans);
1679
1680        // Sort by position to ensure consistent ordering
1681        code_spans.sort_by_key(|span| span.byte_offset);
1682
1683        code_spans
1684    }
1685
1686    /// Parse all list blocks in the content
1687    fn parse_list_blocks(lines: &[LineInfo]) -> Vec<ListBlock> {
1688        // Pre-size based on lines that could be list items
1689        let mut list_blocks = Vec::with_capacity(lines.len() / 10); // Estimate ~10% of lines might start list blocks
1690        let mut current_block: Option<ListBlock> = None;
1691        let mut last_list_item_line = 0;
1692        let mut current_indent_level = 0;
1693        let mut last_marker_width = 0;
1694
1695        for (line_idx, line_info) in lines.iter().enumerate() {
1696            let line_num = line_idx + 1;
1697
1698            // Enhanced code block handling using Design #3's context analysis
1699            if line_info.in_code_block {
1700                if let Some(ref mut block) = current_block {
1701                    // Calculate minimum indentation for list continuation
1702                    let min_continuation_indent = CodeBlockUtils::calculate_min_continuation_indent(lines, line_idx);
1703
1704                    // Analyze code block context using the three-tier classification
1705                    let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
1706
1707                    match context {
1708                        CodeBlockContext::Indented => {
1709                            // Code block is properly indented - continues the list
1710                            block.end_line = line_num;
1711                            continue;
1712                        }
1713                        CodeBlockContext::Standalone => {
1714                            // Code block separates lists - end current block
1715                            let completed_block = current_block.take().unwrap();
1716                            list_blocks.push(completed_block);
1717                            continue;
1718                        }
1719                        CodeBlockContext::Adjacent => {
1720                            // Edge case - use conservative behavior (continue list)
1721                            block.end_line = line_num;
1722                            continue;
1723                        }
1724                    }
1725                } else {
1726                    // No current list block - skip code block lines
1727                    continue;
1728                }
1729            }
1730
1731            // Extract blockquote prefix if any
1732            let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(&line_info.content) {
1733                caps.get(0).unwrap().as_str().to_string()
1734            } else {
1735                String::new()
1736            };
1737
1738            // Check if this line is a list item
1739            if let Some(list_item) = &line_info.list_item {
1740                // Calculate nesting level based on indentation
1741                let item_indent = list_item.marker_column;
1742                let nesting = item_indent / 2; // Assume 2-space indentation for nesting
1743
1744                if let Some(ref mut block) = current_block {
1745                    // Check if this continues the current block
1746                    // For nested lists, we need to check if this is a nested item (higher nesting level)
1747                    // or a continuation at the same or lower level
1748                    let is_nested = nesting > block.nesting_level;
1749                    let same_type =
1750                        (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
1751                    let same_context = block.blockquote_prefix == blockquote_prefix;
1752                    let reasonable_distance = line_num <= last_list_item_line + 2; // Allow one blank line
1753
1754                    // For unordered lists, also check marker consistency
1755                    let marker_compatible =
1756                        block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
1757
1758                    // Check if there's non-list content between the last item and this one
1759                    let has_non_list_content = {
1760                        let mut found_non_list = false;
1761                        // Use the last item from the current block, not the global last_list_item_line
1762                        let block_last_item_line = block.item_lines.last().copied().unwrap_or(block.end_line);
1763
1764                        // Debug: Special check for problematic line
1765                        if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1766                            let last_line = &lines[block_last_item_line - 1];
1767                            if last_line.content.contains(r"`sqlalchemy`") && last_line.content.contains(r"\`") {
1768                                log::debug!(
1769                                    "After problematic line {}: checking lines {} to {} for non-list content",
1770                                    block_last_item_line,
1771                                    block_last_item_line + 1,
1772                                    line_num
1773                                );
1774                                // If they're consecutive list items, there's no content between
1775                                if line_num == block_last_item_line + 1 {
1776                                    log::debug!("Lines are consecutive, no content between");
1777                                }
1778                            }
1779                        }
1780
1781                        for check_line in (block_last_item_line + 1)..line_num {
1782                            let check_idx = check_line - 1;
1783                            if check_idx < lines.len() {
1784                                let check_info = &lines[check_idx];
1785                                // Check for content that breaks the list
1786                                let is_list_breaking_content = if check_info.in_code_block {
1787                                    // Use enhanced code block classification for list separation
1788                                    let last_item_marker_width =
1789                                        if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1790                                            lines[block_last_item_line - 1]
1791                                                .list_item
1792                                                .as_ref()
1793                                                .map(|li| {
1794                                                    if li.is_ordered {
1795                                                        li.marker.len() + 1 // Add 1 for the space after ordered list markers
1796                                                    } else {
1797                                                        li.marker.len()
1798                                                    }
1799                                                })
1800                                                .unwrap_or(3) // fallback to 3 if no list item found
1801                                        } else {
1802                                            3 // fallback
1803                                        };
1804
1805                                    let min_continuation = if block.is_ordered { last_item_marker_width } else { 2 };
1806
1807                                    // Analyze code block context using our enhanced classification
1808                                    let context = CodeBlockUtils::analyze_code_block_context(
1809                                        lines,
1810                                        check_line - 1,
1811                                        min_continuation,
1812                                    );
1813
1814                                    // Standalone code blocks break lists, indented ones continue them
1815                                    matches!(context, CodeBlockContext::Standalone)
1816                                } else if !check_info.is_blank && check_info.list_item.is_none() {
1817                                    // Check for structural separators that should break lists (from issue #42)
1818                                    let line_content = check_info.content.trim();
1819
1820                                    // Any of these structural separators break lists
1821                                    if check_info.heading.is_some()
1822                                        || line_content.starts_with("---")
1823                                        || line_content.starts_with("***")
1824                                        || line_content.starts_with("___")
1825                                        || (line_content.contains('|')
1826                                            && !line_content.contains("](")
1827                                            && !line_content.contains("http")
1828                                            && (line_content.matches('|').count() > 1
1829                                                || line_content.starts_with('|')
1830                                                || line_content.ends_with('|')))
1831                                        || line_content.starts_with(">")
1832                                    {
1833                                        true
1834                                    }
1835                                    // Other non-list content - check if properly indented
1836                                    else {
1837                                        let last_item_marker_width =
1838                                            if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1839                                                lines[block_last_item_line - 1]
1840                                                    .list_item
1841                                                    .as_ref()
1842                                                    .map(|li| {
1843                                                        if li.is_ordered {
1844                                                            li.marker.len() + 1 // Add 1 for the space after ordered list markers
1845                                                        } else {
1846                                                            li.marker.len()
1847                                                        }
1848                                                    })
1849                                                    .unwrap_or(3) // fallback to 3 if no list item found
1850                                            } else {
1851                                                3 // fallback
1852                                            };
1853
1854                                        let min_continuation =
1855                                            if block.is_ordered { last_item_marker_width } else { 2 };
1856                                        check_info.indent < min_continuation
1857                                    }
1858                                } else {
1859                                    false
1860                                };
1861
1862                                if is_list_breaking_content {
1863                                    // Not indented enough, so it breaks the list
1864                                    found_non_list = true;
1865                                    break;
1866                                }
1867                            }
1868                        }
1869                        found_non_list
1870                    };
1871
1872                    // A list continues if:
1873                    // 1. It's a nested item (indented more than the parent), OR
1874                    // 2. It's the same type at the same level with reasonable distance
1875                    let mut continues_list = if is_nested {
1876                        // Nested items always continue the list if they're in the same context
1877                        same_context && reasonable_distance && !has_non_list_content
1878                    } else {
1879                        // Same-level items need to match type and markers
1880                        let result = same_type
1881                            && same_context
1882                            && reasonable_distance
1883                            && marker_compatible
1884                            && !has_non_list_content;
1885
1886                        // Debug logging for lines after problematic content
1887                        if block.item_lines.last().is_some_and(|&last_line| {
1888                            last_line > 0
1889                                && last_line <= lines.len()
1890                                && lines[last_line - 1].content.contains(r"`sqlalchemy`")
1891                                && lines[last_line - 1].content.contains(r"\`")
1892                        }) {
1893                            log::debug!(
1894                                "List continuation check after problematic line at line {line_num}: same_type={same_type}, same_context={same_context}, reasonable_distance={reasonable_distance}, marker_compatible={marker_compatible}, has_non_list_content={has_non_list_content}, continues={result}"
1895                            );
1896                            if line_num > 0 && line_num <= lines.len() {
1897                                log::debug!("Current line content: {:?}", lines[line_num - 1].content);
1898                            }
1899                        }
1900
1901                        result
1902                    };
1903
1904                    // WORKAROUND: If items are truly consecutive (no blank lines), they MUST be in the same list
1905                    // This handles edge cases where content patterns might otherwise split lists incorrectly
1906                    if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
1907                        // Check if the previous line was a list item
1908                        if block.item_lines.contains(&(line_num - 1)) {
1909                            // They're consecutive list items - force them to be in the same list
1910                            continues_list = true;
1911                        }
1912                    }
1913
1914                    if continues_list {
1915                        // Extend current block
1916                        block.end_line = line_num;
1917                        block.item_lines.push(line_num);
1918
1919                        // Update max marker width
1920                        block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
1921                            list_item.marker.len() + 1
1922                        } else {
1923                            list_item.marker.len()
1924                        });
1925
1926                        // Update marker consistency for unordered lists
1927                        if !block.is_ordered
1928                            && block.marker.is_some()
1929                            && block.marker.as_ref() != Some(&list_item.marker)
1930                        {
1931                            // Mixed markers, clear the marker field
1932                            block.marker = None;
1933                        }
1934                    } else {
1935                        // End current block and start a new one
1936
1937                        list_blocks.push(block.clone());
1938
1939                        *block = ListBlock {
1940                            start_line: line_num,
1941                            end_line: line_num,
1942                            is_ordered: list_item.is_ordered,
1943                            marker: if list_item.is_ordered {
1944                                None
1945                            } else {
1946                                Some(list_item.marker.clone())
1947                            },
1948                            blockquote_prefix: blockquote_prefix.clone(),
1949                            item_lines: vec![line_num],
1950                            nesting_level: nesting,
1951                            max_marker_width: if list_item.is_ordered {
1952                                list_item.marker.len() + 1
1953                            } else {
1954                                list_item.marker.len()
1955                            },
1956                        };
1957                    }
1958                } else {
1959                    // Start a new block
1960                    current_block = Some(ListBlock {
1961                        start_line: line_num,
1962                        end_line: line_num,
1963                        is_ordered: list_item.is_ordered,
1964                        marker: if list_item.is_ordered {
1965                            None
1966                        } else {
1967                            Some(list_item.marker.clone())
1968                        },
1969                        blockquote_prefix,
1970                        item_lines: vec![line_num],
1971                        nesting_level: nesting,
1972                        max_marker_width: list_item.marker.len(),
1973                    });
1974                }
1975
1976                last_list_item_line = line_num;
1977                current_indent_level = item_indent;
1978                last_marker_width = if list_item.is_ordered {
1979                    list_item.marker.len() + 1 // Add 1 for the space after ordered list markers
1980                } else {
1981                    list_item.marker.len()
1982                };
1983            } else if let Some(ref mut block) = current_block {
1984                // Not a list item - check if it continues the current block
1985
1986                // For MD032 compatibility, we use a simple approach:
1987                // - Indented lines continue the list
1988                // - Blank lines followed by indented content continue the list
1989                // - Everything else ends the list
1990
1991                // Calculate minimum indentation for list continuation
1992                // For ordered lists, use the last marker width (e.g., 3 for "1. ", 4 for "10. ")
1993                // For unordered lists like "- ", content starts at column 2, so continuations need at least 2 spaces
1994                let min_continuation_indent = if block.is_ordered {
1995                    current_indent_level + last_marker_width
1996                } else {
1997                    current_indent_level + 2 // Unordered lists need at least 2 spaces (e.g., "- " = 2 chars)
1998                };
1999
2000                if line_info.indent >= min_continuation_indent {
2001                    // Indented line continues the list
2002                    block.end_line = line_num;
2003                } else if line_info.is_blank {
2004                    // Blank line - check if it's internal to the list or ending it
2005                    // We only include blank lines that are followed by more list content
2006                    let mut check_idx = line_idx + 1;
2007                    let mut found_continuation = false;
2008
2009                    // Skip additional blank lines
2010                    while check_idx < lines.len() && lines[check_idx].is_blank {
2011                        check_idx += 1;
2012                    }
2013
2014                    if check_idx < lines.len() {
2015                        let next_line = &lines[check_idx];
2016                        // Check if followed by indented content (list continuation)
2017                        if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
2018                            found_continuation = true;
2019                        }
2020                        // Check if followed by another list item at the same level
2021                        else if !next_line.in_code_block
2022                            && next_line.list_item.is_some()
2023                            && let Some(item) = &next_line.list_item
2024                        {
2025                            let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
2026                                .find(&next_line.content)
2027                                .map_or(String::new(), |m| m.as_str().to_string());
2028                            if item.marker_column == current_indent_level
2029                                && item.is_ordered == block.is_ordered
2030                                && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
2031                            {
2032                                // Check if there was meaningful content between the list items (unused now)
2033                                // This variable is kept for potential future use but is currently replaced by has_structural_separators
2034                                let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
2035                                    if let Some(between_line) = lines.get(idx) {
2036                                        let trimmed = between_line.content.trim();
2037                                        // Skip empty lines
2038                                        if trimmed.is_empty() {
2039                                            return false;
2040                                        }
2041                                        // Check for meaningful content
2042                                        let line_indent =
2043                                            between_line.content.len() - between_line.content.trim_start().len();
2044
2045                                        // Structural separators (code fences, headings, etc.) are meaningful and should BREAK lists
2046                                        if trimmed.starts_with("```")
2047                                            || trimmed.starts_with("~~~")
2048                                            || trimmed.starts_with("---")
2049                                            || trimmed.starts_with("***")
2050                                            || trimmed.starts_with("___")
2051                                            || trimmed.starts_with(">")
2052                                            || trimmed.contains('|') // Tables
2053                                            || between_line.heading.is_some()
2054                                        {
2055                                            return true; // These are structural separators - meaningful content that breaks lists
2056                                        }
2057
2058                                        // Only properly indented content continues the list
2059                                        line_indent >= min_continuation_indent
2060                                    } else {
2061                                        false
2062                                    }
2063                                });
2064
2065                                if block.is_ordered {
2066                                    // For ordered lists: don't continue if there are structural separators
2067                                    // Check if there are structural separators between the list items
2068                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2069                                        if let Some(between_line) = lines.get(idx) {
2070                                            let trimmed = between_line.content.trim();
2071                                            if trimmed.is_empty() {
2072                                                return false;
2073                                            }
2074                                            // Check for structural separators that break lists
2075                                            trimmed.starts_with("```")
2076                                                || trimmed.starts_with("~~~")
2077                                                || trimmed.starts_with("---")
2078                                                || trimmed.starts_with("***")
2079                                                || trimmed.starts_with("___")
2080                                                || trimmed.starts_with(">")
2081                                                || trimmed.contains('|') // Tables
2082                                                || between_line.heading.is_some()
2083                                        } else {
2084                                            false
2085                                        }
2086                                    });
2087                                    found_continuation = !has_structural_separators;
2088                                } else {
2089                                    // For unordered lists: also check for structural separators
2090                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
2091                                        if let Some(between_line) = lines.get(idx) {
2092                                            let trimmed = between_line.content.trim();
2093                                            if trimmed.is_empty() {
2094                                                return false;
2095                                            }
2096                                            // Check for structural separators that break lists
2097                                            trimmed.starts_with("```")
2098                                                || trimmed.starts_with("~~~")
2099                                                || trimmed.starts_with("---")
2100                                                || trimmed.starts_with("***")
2101                                                || trimmed.starts_with("___")
2102                                                || trimmed.starts_with(">")
2103                                                || trimmed.contains('|') // Tables
2104                                                || between_line.heading.is_some()
2105                                        } else {
2106                                            false
2107                                        }
2108                                    });
2109                                    found_continuation = !has_structural_separators;
2110                                }
2111                            }
2112                        }
2113                    }
2114
2115                    if found_continuation {
2116                        // Include the blank line in the block
2117                        block.end_line = line_num;
2118                    } else {
2119                        // Blank line ends the list - don't include it
2120                        list_blocks.push(block.clone());
2121                        current_block = None;
2122                    }
2123                } else {
2124                    // Check for lazy continuation - non-indented line immediately after a list item
2125                    // But only if the line has sufficient indentation for the list type
2126                    let min_required_indent = if block.is_ordered {
2127                        current_indent_level + last_marker_width
2128                    } else {
2129                        current_indent_level + 2
2130                    };
2131
2132                    // For lazy continuation to apply, the line must either:
2133                    // 1. Have no indentation (true lazy continuation)
2134                    // 2. Have sufficient indentation for the list type
2135                    // BUT structural separators (headings, code blocks, etc.) should never be lazy continuations
2136                    let line_content = line_info.content.trim();
2137                    let is_structural_separator = line_info.heading.is_some()
2138                        || line_content.starts_with("```")
2139                        || line_content.starts_with("~~~")
2140                        || line_content.starts_with("---")
2141                        || line_content.starts_with("***")
2142                        || line_content.starts_with("___")
2143                        || line_content.starts_with(">")
2144                        || (line_content.contains('|')
2145                            && !line_content.contains("](")
2146                            && !line_content.contains("http")
2147                            && (line_content.matches('|').count() > 1
2148                                || line_content.starts_with('|')
2149                                || line_content.ends_with('|'))); // Tables
2150
2151                    // Allow lazy continuation if we're still within the same list block
2152                    // (not just immediately after a list item)
2153                    let is_lazy_continuation = !is_structural_separator
2154                        && !line_info.is_blank
2155                        && (line_info.indent == 0 || line_info.indent >= min_required_indent);
2156
2157                    if is_lazy_continuation {
2158                        // Additional check: if the line starts with uppercase and looks like a new sentence,
2159                        // it's probably not a continuation
2160                        let content_to_check = if !blockquote_prefix.is_empty() {
2161                            // Strip blockquote prefix to check the actual content
2162                            line_info
2163                                .content
2164                                .strip_prefix(&blockquote_prefix)
2165                                .unwrap_or(&line_info.content)
2166                                .trim()
2167                        } else {
2168                            line_info.content.trim()
2169                        };
2170
2171                        let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
2172
2173                        // If it starts with uppercase and the previous line ended with punctuation,
2174                        // it's likely a new paragraph, not a continuation
2175                        if starts_with_uppercase && last_list_item_line > 0 {
2176                            // This looks like a new paragraph
2177                            list_blocks.push(block.clone());
2178                            current_block = None;
2179                        } else {
2180                            // This is a lazy continuation line
2181                            block.end_line = line_num;
2182                        }
2183                    } else {
2184                        // Non-indented, non-blank line that's not a lazy continuation - end the block
2185                        list_blocks.push(block.clone());
2186                        current_block = None;
2187                    }
2188                }
2189            }
2190        }
2191
2192        // Don't forget the last block
2193        if let Some(block) = current_block {
2194            list_blocks.push(block);
2195        }
2196
2197        // Merge adjacent blocks that should be one
2198        merge_adjacent_list_blocks(&mut list_blocks, lines);
2199
2200        list_blocks
2201    }
2202
2203    /// Compute character frequency for fast content analysis
2204    fn compute_char_frequency(content: &str) -> CharFrequency {
2205        let mut frequency = CharFrequency::default();
2206
2207        for ch in content.chars() {
2208            match ch {
2209                '#' => frequency.hash_count += 1,
2210                '*' => frequency.asterisk_count += 1,
2211                '_' => frequency.underscore_count += 1,
2212                '-' => frequency.hyphen_count += 1,
2213                '+' => frequency.plus_count += 1,
2214                '>' => frequency.gt_count += 1,
2215                '|' => frequency.pipe_count += 1,
2216                '[' => frequency.bracket_count += 1,
2217                '`' => frequency.backtick_count += 1,
2218                '<' => frequency.lt_count += 1,
2219                '!' => frequency.exclamation_count += 1,
2220                '\n' => frequency.newline_count += 1,
2221                _ => {}
2222            }
2223        }
2224
2225        frequency
2226    }
2227
2228    /// Parse HTML tags in the content
2229    fn parse_html_tags(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<HtmlTag> {
2230        lazy_static! {
2231            static ref HTML_TAG_REGEX: regex::Regex =
2232                regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap();
2233        }
2234
2235        let mut html_tags = Vec::with_capacity(content.matches('<').count());
2236
2237        for cap in HTML_TAG_REGEX.captures_iter(content) {
2238            let full_match = cap.get(0).unwrap();
2239            let match_start = full_match.start();
2240            let match_end = full_match.end();
2241
2242            // Skip if in code block
2243            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2244                continue;
2245            }
2246
2247            let is_closing = !cap.get(1).unwrap().as_str().is_empty();
2248            let tag_name = cap.get(2).unwrap().as_str().to_lowercase();
2249            let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
2250
2251            // Find which line this tag is on
2252            let mut line_num = 1;
2253            let mut col_start = match_start;
2254            let mut col_end = match_end;
2255            for (idx, line_info) in lines.iter().enumerate() {
2256                if match_start >= line_info.byte_offset {
2257                    line_num = idx + 1;
2258                    col_start = match_start - line_info.byte_offset;
2259                    col_end = match_end - line_info.byte_offset;
2260                } else {
2261                    break;
2262                }
2263            }
2264
2265            html_tags.push(HtmlTag {
2266                line: line_num,
2267                start_col: col_start,
2268                end_col: col_end,
2269                byte_offset: match_start,
2270                byte_end: match_end,
2271                tag_name,
2272                is_closing,
2273                is_self_closing,
2274                raw_content: full_match.as_str().to_string(),
2275            });
2276        }
2277
2278        html_tags
2279    }
2280
2281    /// Parse emphasis spans in the content
2282    fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
2283        lazy_static! {
2284            static ref EMPHASIS_REGEX: regex::Regex =
2285                regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap();
2286        }
2287
2288        let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
2289
2290        for cap in EMPHASIS_REGEX.captures_iter(content) {
2291            let full_match = cap.get(0).unwrap();
2292            let match_start = full_match.start();
2293            let match_end = full_match.end();
2294
2295            // Skip if in code block
2296            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2297                continue;
2298            }
2299
2300            let opening_markers = cap.get(1).unwrap().as_str();
2301            let content_part = cap.get(2).unwrap().as_str();
2302            let closing_markers = cap.get(3).unwrap().as_str();
2303
2304            // Validate matching markers
2305            if opening_markers.chars().next() != closing_markers.chars().next()
2306                || opening_markers.len() != closing_markers.len()
2307            {
2308                continue;
2309            }
2310
2311            let marker = opening_markers.chars().next().unwrap();
2312            let marker_count = opening_markers.len();
2313
2314            // Find which line this emphasis is on
2315            let mut line_num = 1;
2316            let mut col_start = match_start;
2317            let mut col_end = match_end;
2318            for (idx, line_info) in lines.iter().enumerate() {
2319                if match_start >= line_info.byte_offset {
2320                    line_num = idx + 1;
2321                    col_start = match_start - line_info.byte_offset;
2322                    col_end = match_end - line_info.byte_offset;
2323                } else {
2324                    break;
2325                }
2326            }
2327
2328            emphasis_spans.push(EmphasisSpan {
2329                line: line_num,
2330                start_col: col_start,
2331                end_col: col_end,
2332                byte_offset: match_start,
2333                byte_end: match_end,
2334                marker,
2335                marker_count,
2336                content: content_part.to_string(),
2337            });
2338        }
2339
2340        emphasis_spans
2341    }
2342
2343    /// Parse table rows in the content
2344    fn parse_table_rows(lines: &[LineInfo]) -> Vec<TableRow> {
2345        let mut table_rows = Vec::with_capacity(lines.len() / 20);
2346
2347        for (line_idx, line_info) in lines.iter().enumerate() {
2348            // Skip lines in code blocks or blank lines
2349            if line_info.in_code_block || line_info.is_blank {
2350                continue;
2351            }
2352
2353            let line = &line_info.content;
2354            let line_num = line_idx + 1;
2355
2356            // Check if this line contains pipes (potential table row)
2357            if !line.contains('|') {
2358                continue;
2359            }
2360
2361            // Count columns by splitting on pipes
2362            let parts: Vec<&str> = line.split('|').collect();
2363            let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
2364
2365            // Check if this is a separator row
2366            let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
2367            let mut column_alignments = Vec::new();
2368
2369            if is_separator {
2370                for part in &parts[1..parts.len() - 1] {
2371                    // Skip first and last empty parts
2372                    let trimmed = part.trim();
2373                    let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
2374                        "center".to_string()
2375                    } else if trimmed.ends_with(':') {
2376                        "right".to_string()
2377                    } else if trimmed.starts_with(':') {
2378                        "left".to_string()
2379                    } else {
2380                        "none".to_string()
2381                    };
2382                    column_alignments.push(alignment);
2383                }
2384            }
2385
2386            table_rows.push(TableRow {
2387                line: line_num,
2388                is_separator,
2389                column_count,
2390                column_alignments,
2391            });
2392        }
2393
2394        table_rows
2395    }
2396
2397    /// Parse bare URLs and emails in the content
2398    fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
2399        let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
2400
2401        // Check for bare URLs (not in angle brackets or markdown links)
2402        for cap in BARE_URL_PATTERN.captures_iter(content) {
2403            let full_match = cap.get(0).unwrap();
2404            let match_start = full_match.start();
2405            let match_end = full_match.end();
2406
2407            // Skip if in code block
2408            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2409                continue;
2410            }
2411
2412            // Skip if already in angle brackets or markdown links
2413            let preceding_char = if match_start > 0 {
2414                content.chars().nth(match_start - 1)
2415            } else {
2416                None
2417            };
2418            let following_char = content.chars().nth(match_end);
2419
2420            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2421                continue;
2422            }
2423            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2424                continue;
2425            }
2426
2427            let url = full_match.as_str();
2428            let url_type = if url.starts_with("https://") {
2429                "https"
2430            } else if url.starts_with("http://") {
2431                "http"
2432            } else if url.starts_with("ftp://") {
2433                "ftp"
2434            } else {
2435                "other"
2436            };
2437
2438            // Find which line this URL is on
2439            let mut line_num = 1;
2440            let mut col_start = match_start;
2441            let mut col_end = match_end;
2442            for (idx, line_info) in lines.iter().enumerate() {
2443                if match_start >= line_info.byte_offset {
2444                    line_num = idx + 1;
2445                    col_start = match_start - line_info.byte_offset;
2446                    col_end = match_end - line_info.byte_offset;
2447                } else {
2448                    break;
2449                }
2450            }
2451
2452            bare_urls.push(BareUrl {
2453                line: line_num,
2454                start_col: col_start,
2455                end_col: col_end,
2456                byte_offset: match_start,
2457                byte_end: match_end,
2458                url: url.to_string(),
2459                url_type: url_type.to_string(),
2460            });
2461        }
2462
2463        // Check for bare email addresses
2464        for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
2465            let full_match = cap.get(0).unwrap();
2466            let match_start = full_match.start();
2467            let match_end = full_match.end();
2468
2469            // Skip if in code block
2470            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2471                continue;
2472            }
2473
2474            // Skip if already in angle brackets or markdown links
2475            let preceding_char = if match_start > 0 {
2476                content.chars().nth(match_start - 1)
2477            } else {
2478                None
2479            };
2480            let following_char = content.chars().nth(match_end);
2481
2482            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2483                continue;
2484            }
2485            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2486                continue;
2487            }
2488
2489            let email = full_match.as_str();
2490
2491            // Find which line this email is on
2492            let mut line_num = 1;
2493            let mut col_start = match_start;
2494            let mut col_end = match_end;
2495            for (idx, line_info) in lines.iter().enumerate() {
2496                if match_start >= line_info.byte_offset {
2497                    line_num = idx + 1;
2498                    col_start = match_start - line_info.byte_offset;
2499                    col_end = match_end - line_info.byte_offset;
2500                } else {
2501                    break;
2502                }
2503            }
2504
2505            bare_urls.push(BareUrl {
2506                line: line_num,
2507                start_col: col_start,
2508                end_col: col_end,
2509                byte_offset: match_start,
2510                byte_end: match_end,
2511                url: email.to_string(),
2512                url_type: "email".to_string(),
2513            });
2514        }
2515
2516        bare_urls
2517    }
2518}
2519
2520/// Merge adjacent list blocks that should be treated as one
2521fn merge_adjacent_list_blocks(list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
2522    if list_blocks.len() < 2 {
2523        return;
2524    }
2525
2526    let mut merger = ListBlockMerger::new(lines);
2527    *list_blocks = merger.merge(list_blocks);
2528}
2529
2530/// Helper struct to manage the complex logic of merging list blocks
2531struct ListBlockMerger<'a> {
2532    lines: &'a [LineInfo],
2533}
2534
2535impl<'a> ListBlockMerger<'a> {
2536    fn new(lines: &'a [LineInfo]) -> Self {
2537        Self { lines }
2538    }
2539
2540    fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
2541        let mut merged = Vec::with_capacity(list_blocks.len());
2542        let mut current = list_blocks[0].clone();
2543
2544        for next in list_blocks.iter().skip(1) {
2545            if self.should_merge_blocks(&current, next) {
2546                current = self.merge_two_blocks(current, next);
2547            } else {
2548                merged.push(current);
2549                current = next.clone();
2550            }
2551        }
2552
2553        merged.push(current);
2554        merged
2555    }
2556
2557    /// Determine if two adjacent list blocks should be merged
2558    fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
2559        // Basic compatibility checks
2560        if !self.blocks_are_compatible(current, next) {
2561            return false;
2562        }
2563
2564        // Check spacing and content between blocks
2565        let spacing = self.analyze_spacing_between(current, next);
2566        match spacing {
2567            BlockSpacing::Consecutive => true,
2568            BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
2569            BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
2570                self.can_merge_with_content_between(current, next)
2571            }
2572        }
2573    }
2574
2575    /// Check if blocks have compatible structure for merging
2576    fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
2577        current.is_ordered == next.is_ordered
2578            && current.blockquote_prefix == next.blockquote_prefix
2579            && current.nesting_level == next.nesting_level
2580    }
2581
2582    /// Analyze the spacing between two list blocks
2583    fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
2584        let gap = next.start_line - current.end_line;
2585
2586        match gap {
2587            1 => BlockSpacing::Consecutive,
2588            2 => BlockSpacing::SingleBlank,
2589            _ if gap > 2 => {
2590                if self.has_only_blank_lines_between(current, next) {
2591                    BlockSpacing::MultipleBlanks
2592                } else {
2593                    BlockSpacing::ContentBetween
2594                }
2595            }
2596            _ => BlockSpacing::Consecutive, // gap == 0, overlapping (shouldn't happen)
2597        }
2598    }
2599
2600    /// Check if unordered lists can be merged with a single blank line between
2601    fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2602        // Check if there are structural separators between the blocks
2603        // If has_meaningful_content_between returns true, it means there are structural separators
2604        if has_meaningful_content_between(current, next, self.lines) {
2605            return false; // Structural separators prevent merging
2606        }
2607
2608        // Only merge unordered lists with same marker across single blank
2609        !current.is_ordered && current.marker == next.marker
2610    }
2611
2612    /// Check if ordered lists can be merged when there's content between them
2613    fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2614        // Do not merge lists if there are structural separators between them
2615        if has_meaningful_content_between(current, next, self.lines) {
2616            return false; // Structural separators prevent merging
2617        }
2618
2619        // Only consider merging ordered lists if there's no structural content between
2620        current.is_ordered && next.is_ordered
2621    }
2622
2623    /// Check if there are only blank lines between blocks
2624    fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2625        for line_num in (current.end_line + 1)..next.start_line {
2626            if let Some(line_info) = self.lines.get(line_num - 1)
2627                && !line_info.content.trim().is_empty()
2628            {
2629                return false;
2630            }
2631        }
2632        true
2633    }
2634
2635    /// Merge two compatible list blocks into one
2636    fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
2637        current.end_line = next.end_line;
2638        current.item_lines.extend_from_slice(&next.item_lines);
2639
2640        // Update max marker width
2641        current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
2642
2643        // Handle marker consistency for unordered lists
2644        if !current.is_ordered && self.markers_differ(&current, next) {
2645            current.marker = None; // Mixed markers
2646        }
2647
2648        current
2649    }
2650
2651    /// Check if two blocks have different markers
2652    fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
2653        current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
2654    }
2655}
2656
/// Types of spacing between list blocks
#[derive(PartialEq, Debug)]
enum BlockSpacing {
    /// No gap between blocks
    Consecutive,
    /// One blank line between blocks
    SingleBlank,
    /// Multiple blank lines but no content
    MultipleBlanks,
    /// Content exists between blocks
    ContentBetween,
}
2665
2666/// Check if there's meaningful content (not just blank lines) between two list blocks
2667fn has_meaningful_content_between(current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
2668    // Check lines between current.end_line and next.start_line
2669    for line_num in (current.end_line + 1)..next.start_line {
2670        if let Some(line_info) = lines.get(line_num - 1) {
2671            // Convert to 0-indexed
2672            let trimmed = line_info.content.trim();
2673
2674            // Skip empty lines
2675            if trimmed.is_empty() {
2676                continue;
2677            }
2678
2679            // Check for structural separators that should separate lists (CommonMark compliant)
2680
2681            // Headings separate lists
2682            if line_info.heading.is_some() {
2683                return true; // Has meaningful content - headings separate lists
2684            }
2685
2686            // Horizontal rules separate lists (---, ***, ___)
2687            if is_horizontal_rule(trimmed) {
2688                return true; // Has meaningful content - horizontal rules separate lists
2689            }
2690
2691            // Tables separate lists (lines containing | but not in URLs or code)
2692            // Simple heuristic: tables typically have | at start/end or multiple |
2693            if trimmed.contains('|') && trimmed.len() > 1 {
2694                // Don't treat URLs with | as tables
2695                if !trimmed.contains("](") && !trimmed.contains("http") {
2696                    // More robust check: tables usually have multiple | or | at edges
2697                    let pipe_count = trimmed.matches('|').count();
2698                    if pipe_count > 1 || trimmed.starts_with('|') || trimmed.ends_with('|') {
2699                        return true; // Has meaningful content - tables separate lists
2700                    }
2701                }
2702            }
2703
2704            // Blockquotes separate lists
2705            if trimmed.starts_with('>') {
2706                return true; // Has meaningful content - blockquotes separate lists
2707            }
2708
2709            // Code block fences separate lists (unless properly indented as list content)
2710            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
2711                let line_indent = line_info.content.len() - line_info.content.trim_start().len();
2712
2713                // Check if this code block is properly indented as list continuation
2714                let min_continuation_indent = if current.is_ordered {
2715                    current.nesting_level + current.max_marker_width + 1 // +1 for space after marker
2716                } else {
2717                    current.nesting_level + 2
2718                };
2719
2720                if line_indent < min_continuation_indent {
2721                    // This is a standalone code block that separates lists
2722                    return true; // Has meaningful content - standalone code blocks separate lists
2723                }
2724            }
2725
2726            // Check if this line has proper indentation for list continuation
2727            let line_indent = line_info.content.len() - line_info.content.trim_start().len();
2728
2729            // Calculate minimum indentation needed to be list continuation
2730            let min_indent = if current.is_ordered {
2731                current.nesting_level + current.max_marker_width
2732            } else {
2733                current.nesting_level + 2
2734            };
2735
2736            // If the line is not indented enough to be list continuation, it's meaningful content
2737            if line_indent < min_indent {
2738                return true; // Has meaningful content - content not indented as list continuation
2739            }
2740
2741            // If we reach here, the line is properly indented as list continuation
2742            // Continue checking other lines
2743        }
2744    }
2745
2746    // Only blank lines or properly indented list continuation content between blocks
2747    false
2748}
2749
/// Check if a line is a horizontal rule (---, ***, ___)
///
/// `trimmed` must begin with `-`, `*` or `_`, contain that same character at
/// least three times, and otherwise consist only of spaces and tabs.
fn is_horizontal_rule(trimmed: &str) -> bool {
    // Fewer than three bytes cannot hold three rule characters.
    if trimmed.len() < 3 {
        return false;
    }

    // The first character decides which rule character the line must use.
    let Some(rule_char) = trimmed.chars().next().filter(|c| matches!(c, '-' | '*' | '_')) else {
        return false;
    };

    let only_rule_and_blanks = trimmed.chars().all(|ch| ch == rule_char || ch == ' ' || ch == '\t');

    only_rule_and_blanks && trimmed.chars().filter(|&ch| ch == rule_char).count() >= 3
}
2773
/// Unit tests for `LintContext`: construction, offset-to-line/column mapping and per-line metadata.
2775#[cfg(test)]
2776mod tests {
2777    use super::*;
2778
#[test]
fn test_empty_content() {
    // An empty document has no line records, yet offset 0 still maps to line 1, column 1.
    let ctx = LintContext::new("", MarkdownFlavor::Standard);

    assert!(ctx.content.is_empty());
    assert_eq!(ctx.line_offsets, vec![0]);
    assert_eq!(ctx.offset_to_line_col(0), (1, 1));
    assert!(ctx.lines.is_empty());
}
2787
2788    #[test]
2789    fn test_single_line() {
2790        let ctx = LintContext::new("# Hello", MarkdownFlavor::Standard);
2791        assert_eq!(ctx.content, "# Hello");
2792        assert_eq!(ctx.line_offsets, vec![0]);
2793        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
2794        assert_eq!(ctx.offset_to_line_col(3), (1, 4));
2795    }
2796
2797    #[test]
2798    fn test_multi_line() {
2799        let content = "# Title\n\nSecond line\nThird line";
2800        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
2801        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);
2802        // Test offset to line/col
2803        assert_eq!(ctx.offset_to_line_col(0), (1, 1)); // start
2804        assert_eq!(ctx.offset_to_line_col(8), (2, 1)); // start of blank line
2805        assert_eq!(ctx.offset_to_line_col(9), (3, 1)); // start of 'Second line'
2806        assert_eq!(ctx.offset_to_line_col(15), (3, 7)); // middle of 'Second line'
2807        assert_eq!(ctx.offset_to_line_col(21), (4, 1)); // start of 'Third line'
2808    }
2809
2810    #[test]
2811    fn test_line_info() {
2812        let content = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
2813        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
2814
2815        // Test line info
2816        assert_eq!(ctx.lines.len(), 7);
2817
2818        // Line 1: "# Title"
2819        let line1 = &ctx.lines[0];
2820        assert_eq!(line1.content, "# Title");
2821        assert_eq!(line1.byte_offset, 0);
2822        assert_eq!(line1.indent, 0);
2823        assert!(!line1.is_blank);
2824        assert!(!line1.in_code_block);
2825        assert!(line1.list_item.is_none());
2826
2827        // Line 2: "    indented"
2828        let line2 = &ctx.lines[1];
2829        assert_eq!(line2.content, "    indented");
2830        assert_eq!(line2.byte_offset, 8);
2831        assert_eq!(line2.indent, 4);
2832        assert!(!line2.is_blank);
2833
2834        // Line 3: "" (blank)
2835        let line3 = &ctx.lines[2];
2836        assert_eq!(line3.content, "");
2837        assert!(line3.is_blank);
2838
2839        // Test helper methods
2840        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
2841        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
2842        assert_eq!(ctx.line_info(1).map(|l| l.indent), Some(0));
2843        assert_eq!(ctx.line_info(2).map(|l| l.indent), Some(4));
2844    }
2845
2846    #[test]
2847    fn test_list_item_detection() {
2848        let content = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
2849        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
2850
2851        // Line 1: "- Unordered item"
2852        let line1 = &ctx.lines[0];
2853        assert!(line1.list_item.is_some());
2854        let list1 = line1.list_item.as_ref().unwrap();
2855        assert_eq!(list1.marker, "-");
2856        assert!(!list1.is_ordered);
2857        assert_eq!(list1.marker_column, 0);
2858        assert_eq!(list1.content_column, 2);
2859
2860        // Line 2: "  * Nested item"
2861        let line2 = &ctx.lines[1];
2862        assert!(line2.list_item.is_some());
2863        let list2 = line2.list_item.as_ref().unwrap();
2864        assert_eq!(list2.marker, "*");
2865        assert_eq!(list2.marker_column, 2);
2866
2867        // Line 3: "1. Ordered item"
2868        let line3 = &ctx.lines[2];
2869        assert!(line3.list_item.is_some());
2870        let list3 = line3.list_item.as_ref().unwrap();
2871        assert_eq!(list3.marker, "1.");
2872        assert!(list3.is_ordered);
2873        assert_eq!(list3.number, Some(1));
2874
2875        // Line 6: "Not a list"
2876        let line6 = &ctx.lines[5];
2877        assert!(line6.list_item.is_none());
2878    }
2879
2880    #[test]
2881    fn test_offset_to_line_col_edge_cases() {
2882        let content = "a\nb\nc";
2883        let ctx = LintContext::new(content, MarkdownFlavor::Standard);
2884        // line_offsets: [0, 2, 4]
2885        assert_eq!(ctx.offset_to_line_col(0), (1, 1)); // 'a'
2886        assert_eq!(ctx.offset_to_line_col(1), (1, 2)); // after 'a'
2887        assert_eq!(ctx.offset_to_line_col(2), (2, 1)); // 'b'
2888        assert_eq!(ctx.offset_to_line_col(3), (2, 2)); // after 'b'
2889        assert_eq!(ctx.offset_to_line_col(4), (3, 1)); // 'c'
2890        assert_eq!(ctx.offset_to_line_col(5), (3, 2)); // after 'c'
2891    }
2892}
rumdl_lib/lint_context.rs

rumdl_lib/
lint_context.rs