rumdl_lib/
lint_context.rs

1use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
2use lazy_static::lazy_static;
3use regex::Regex;
4
lazy_static! {
    // Link pattern covering both inline links `[text](url)` and reference
    // links `[text][ref]`. Written in verbose mode (`x`), so whitespace and
    // `#` comments inside the pattern are ignored by the regex engine.
    // The `s` flag makes `.` match newlines, so an escaped newline inside the
    // link text is accepted and links may span lines.
    // Groups: 1 = link text, 2 = inline URL, 3 = reference ID.
    static ref LINK_PATTERN: Regex = Regex::new(
        r"(?sx)
        \[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]          # Link text in group 1 (handles nested brackets)
        (?:
            \(([^)]*)\)       # Inline URL in group 2 (can be empty)
            |
            \[([^\]]*)\]      # Reference ID in group 3
        )"
    ).unwrap();

    // Image pattern: identical to LINK_PATTERN except for the leading `!`.
    // Groups: 1 = alt text, 2 = inline URL, 3 = reference ID.
    // The `s` flag makes `.` match newlines.
    static ref IMAGE_PATTERN: Regex = Regex::new(
        r"(?sx)
        !\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]         # Alt text in group 1 (handles nested brackets)
        (?:
            \(([^)]*)\)       # Inline URL in group 2 (can be empty)
            |
            \[([^\]]*)\]      # Reference ID in group 3
        )"
    ).unwrap();

    // Reference definition pattern: `[label]: destination "title"`, indented
    // by at most three spaces and anchored to a whole line (`m` flag).
    // Groups: 1 = label, 2 = destination, 3 = double-quoted title,
    // 4 = single-quoted title (at most one of 3/4 participates).
    static ref REF_DEF_PATTERN: Regex = Regex::new(
        r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#
    ).unwrap();

    // Code span delimiter pattern: matches a run of one or more backticks.
    // It has no capture groups and does not match the span content itself;
    // pairing opening and closing runs is left to the code that uses it.
    static ref CODE_SPAN_PATTERN: Regex = Regex::new(
        r"`+"
    ).unwrap();

    // Pattern for bare URLs with an http, https or ftp scheme (group 1 is the
    // scheme). Whitespace, angle/square brackets, parentheses, backslashes,
    // quotes and backticks terminate the match.
    static ref BARE_URL_PATTERN: Regex = Regex::new(
        r#"(https?|ftp)://[^\s<>\[\]()\\'"`]+(?:\.[^\s<>\[\]()\\'"`]+)*(?::\d+)?(?:/[^\s<>\[\]()\\'"`]*)?(?:\?[^\s<>\[\]()\\'"`]*)?(?:#[^\s<>\[\]()\\'"`]*)?"#
    ).unwrap();

    // Pattern for email addresses (ASCII local part, domain, and a TLD of at
    // least two letters)
    static ref BARE_EMAIL_PATTERN: Regex = Regex::new(
        r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    ).unwrap();

    // Pattern for angle bracket links `<url>` / `<user@host.tld>` (to exclude
    // from bare URL detection). Group 1 is the text between the brackets.
    static ref ANGLE_BRACKET_PATTERN: Regex = Regex::new(
        r"<((?:https?|ftp)://[^>]+|[^@\s]+@[^@\s]+\.[^@\s>]+)>"
    ).unwrap();

    // Pattern for blockquote prefix in parse_list_blocks: optional leading
    // whitespace, one or more `>`, then optional whitespace (group 1).
    static ref BLOCKQUOTE_PREFIX_REGEX: Regex = Regex::new(r"^(\s*>+\s*)").unwrap();
}
59
/// Pre-computed information about a line
///
/// `LintContext::lines` holds one entry per line; entry `i` describes line
/// number `i + 1` (see `LintContext::line_info`).
#[derive(Debug, Clone)]
pub struct LineInfo {
    /// The actual line content (without newline)
    pub content: String,
    /// Byte offset where this line starts in the document.
    /// The link and image parsers subtract this from a match offset to obtain
    /// a byte column within the line.
    pub byte_offset: usize,
    /// Number of leading spaces/tabs
    pub indent: usize,
    /// Whether the line is blank (empty or only whitespace)
    pub is_blank: bool,
    /// Whether this line is inside a code block
    pub in_code_block: bool,
    /// Whether this line is inside front matter
    pub in_front_matter: bool,
    /// List item information if this line starts a list item
    pub list_item: Option<ListItemInfo>,
    /// Heading information if this line is a heading
    pub heading: Option<HeadingInfo>,
    /// Blockquote information if this line is a blockquote
    pub blockquote: Option<BlockquoteInfo>,
}
82
/// Information about a list item
///
/// Attached to a line through `LineInfo::list_item`.
#[derive(Debug, Clone)]
pub struct ListItemInfo {
    /// The marker used (*, -, +, or number with . or ))
    pub marker: String,
    /// Whether it's ordered (true) or unordered (false)
    pub is_ordered: bool,
    /// The number for ordered lists
    // NOTE(review): presumably `None` for unordered items - confirm against
    // the code that fills this struct.
    pub number: Option<usize>,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where content after marker starts
    // NOTE(review): presumably 0-based like `marker_column` - confirm.
    pub content_column: usize,
}
97
/// Heading style type
///
/// Distinguishes the syntax a heading was written in; stored in
/// `HeadingInfo::style`.
#[derive(Debug, Clone, PartialEq)]
pub enum HeadingStyle {
    /// ATX style heading (# Heading)
    ATX,
    /// Setext style heading with = underline
    Setext1,
    /// Setext style heading with - underline
    Setext2,
}
108
/// Parsed link information
///
/// Produced by `LintContext::parse_links` from `LINK_PATTERN` matches. A link
/// may span several lines; `line`/`start_col` describe where it starts and
/// `end_col` is relative to the line on which it ends.
#[derive(Debug, Clone)]
pub struct ParsedLink {
    /// Line number (1-indexed) on which the link starts
    pub line: usize,
    /// Start column (0-indexed, in bytes) in the line
    pub start_col: usize,
    /// End column (0-indexed, in bytes) in the line on which the link ends
    pub end_col: usize,
    /// Byte offset in document of the opening `[`
    pub byte_offset: usize,
    /// End byte offset in document (exclusive end of the regex match)
    pub byte_end: usize,
    /// Link text (the content between the first pair of brackets)
    pub text: String,
    /// Link URL for inline links; left empty for reference links, whose URL
    /// has to be looked up in the reference definitions
    pub url: String,
    /// Whether this is a reference link [text][ref] vs inline [text](url)
    pub is_reference: bool,
    /// Reference ID for reference links, lowercased. For the collapsed form
    /// `[text][]` this is the lowercased link text. `None` for inline links.
    pub reference_id: Option<String>,
}
131
/// Parsed image information
///
/// Produced by `LintContext::parse_images` from `IMAGE_PATTERN` matches. An
/// image may span several lines; `line`/`start_col` describe where it starts
/// and `end_col` is relative to the line on which it ends.
#[derive(Debug, Clone)]
pub struct ParsedImage {
    /// Line number (1-indexed) on which the image starts
    pub line: usize,
    /// Start column (0-indexed, in bytes) in the line
    pub start_col: usize,
    /// End column (0-indexed, in bytes) in the line on which the image ends
    pub end_col: usize,
    /// Byte offset in document of the leading `!`
    pub byte_offset: usize,
    /// End byte offset in document (exclusive end of the regex match)
    pub byte_end: usize,
    /// Alt text (the content between the first pair of brackets)
    pub alt_text: String,
    /// Image URL for inline images; left empty for reference images, whose
    /// URL has to be looked up in the reference definitions
    pub url: String,
    /// Whether this is a reference image ![alt][ref] vs inline ![alt](url)
    pub is_reference: bool,
    /// Reference ID for reference images, lowercased. For the collapsed form
    /// `![alt][]` this is the lowercased alt text. `None` for inline images.
    pub reference_id: Option<String>,
}
154
/// Reference definition [ref]: url "title"
///
/// `LintContext::get_reference_url` looks entries up by comparing `id` with
/// the lowercased query, so `id` must already be lowercase for a lookup to
/// succeed.
#[derive(Debug, Clone)]
pub struct ReferenceDef {
    /// Line number (1-indexed)
    pub line: usize,
    /// Reference ID (normalized to lowercase)
    pub id: String,
    /// URL
    pub url: String,
    /// Optional title
    pub title: Option<String>,
}
167
/// Parsed code span information
///
/// Code spans are computed lazily by `LintContext::code_spans`.
/// `LintContext::is_in_code_block_or_span` treats a span as the half-open
/// byte range `byte_offset..byte_end`.
#[derive(Debug, Clone)]
pub struct CodeSpan {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document (exclusive when testing containment)
    pub byte_end: usize,
    /// Number of backticks used (1, 2, 3, etc.)
    pub backtick_count: usize,
    /// Content inside the code span (without backticks)
    pub content: String,
}
186
/// Information about a heading
///
/// Attached to a line through `LineInfo::heading`.
#[derive(Debug, Clone)]
pub struct HeadingInfo {
    /// Heading level (1-6 for ATX, 1-2 for Setext)
    pub level: u8,
    /// Style of heading
    pub style: HeadingStyle,
    /// The heading marker (# characters or underline)
    pub marker: String,
    /// Column where the marker starts (0-based)
    pub marker_column: usize,
    /// Column where heading text starts
    // NOTE(review): presumably 0-based like `marker_column` - confirm.
    pub content_column: usize,
    /// The heading text (without markers and without custom ID syntax)
    pub text: String,
    /// Custom header ID if present (e.g., from {#custom-id} syntax)
    pub custom_id: Option<String>,
    /// Original heading text including custom ID syntax
    pub raw_text: String,
    /// Whether it has a closing sequence (for ATX)
    pub has_closing_sequence: bool,
    /// The closing sequence if present
    // NOTE(review): presumably an empty string when there is no closing
    // sequence - confirm against the heading parser.
    pub closing_sequence: String,
}
211
/// Information about a blockquote line
///
/// Attached to a line through `LineInfo::blockquote`.
#[derive(Debug, Clone)]
pub struct BlockquoteInfo {
    /// Nesting level (1 for >, 2 for >>, etc.)
    pub nesting_level: usize,
    /// The indentation before the blockquote marker
    pub indent: String,
    /// Column where the first > starts (0-based)
    pub marker_column: usize,
    /// The blockquote prefix (e.g., "> ", ">> ", etc.)
    pub prefix: String,
    /// Content after the blockquote marker(s)
    pub content: String,
    /// Whether the line has no space after the marker
    pub has_no_space_after_marker: bool,
    /// Whether the line has multiple spaces after the marker
    pub has_multiple_spaces_after_marker: bool,
    /// Whether this is an empty blockquote line needing MD028 fix
    pub needs_md028_fix: bool,
}
232
/// Information about a list block
///
/// Stored in `LintContext::list_blocks`. A line belongs to the block when its
/// 1-indexed number lies in `start_line..=end_line` (both ends inclusive, as
/// used by `LintContext::is_in_list_block`).
#[derive(Debug, Clone)]
pub struct ListBlock {
    /// Line number where the list starts (1-indexed)
    pub start_line: usize,
    /// Line number where the list ends (1-indexed, inclusive)
    pub end_line: usize,
    /// Whether it's ordered or unordered
    pub is_ordered: bool,
    /// The consistent marker for unordered lists (if any)
    pub marker: Option<String>,
    /// Blockquote prefix for this list (empty if not in blockquote)
    pub blockquote_prefix: String,
    /// Lines that are list items within this block
    // NOTE(review): presumably 1-indexed line numbers like `start_line` -
    // confirm against parse_list_blocks.
    pub item_lines: Vec<usize>,
    /// Nesting level (0 for top-level lists)
    pub nesting_level: usize,
    /// Maximum marker width seen in this block (e.g., 3 for "1. ", 4 for "10. ")
    pub max_marker_width: usize,
}
253
254use std::sync::{Arc, Mutex};
255
/// Character frequency data for fast content analysis
///
/// Computed once when a `LintContext` is built and consulted by
/// `LintContext::has_char`, `LintContext::char_count` and the `likely_has_*`
/// heuristics, which read these counters instead of rescanning the document.
#[derive(Debug, Clone, Default)]
pub struct CharFrequency {
    /// Count of # characters (headings)
    pub hash_count: usize,
    /// Count of * characters (emphasis, lists, horizontal rules)
    pub asterisk_count: usize,
    /// Count of _ characters (emphasis, horizontal rules)
    pub underscore_count: usize,
    /// Count of - characters (lists, horizontal rules, setext headings)
    pub hyphen_count: usize,
    /// Count of + characters (lists)
    pub plus_count: usize,
    /// Count of > characters (blockquotes)
    pub gt_count: usize,
    /// Count of | characters (tables)
    pub pipe_count: usize,
    /// Count of [ characters (links, images)
    pub bracket_count: usize,
    /// Count of ` characters (code spans, code blocks)
    pub backtick_count: usize,
    /// Count of < characters (HTML tags, autolinks)
    pub lt_count: usize,
    /// Count of ! characters (images)
    pub exclamation_count: usize,
    /// Count of newline characters
    pub newline_count: usize,
}
284
/// Pre-parsed HTML tag information
///
/// HTML tags are computed lazily by `LintContext::html_tags`.
#[derive(Debug, Clone)]
pub struct HtmlTag {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Tag name (e.g., "div", "img", "br")
    pub tag_name: String,
    /// Whether it's a closing tag (</tag>)
    pub is_closing: bool,
    /// Whether it's self-closing (<tag />)
    pub is_self_closing: bool,
    /// Raw tag content
    pub raw_content: String,
}
307
/// Pre-parsed emphasis span information
///
/// Emphasis spans are computed lazily by `LintContext::emphasis_spans`.
#[derive(Debug, Clone)]
pub struct EmphasisSpan {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// Type of emphasis ('*' or '_')
    pub marker: char,
    /// Number of markers (1 for italic, 2 for bold, 3+ for bold+italic)
    pub marker_count: usize,
    /// Content inside the emphasis
    pub content: String,
}
328
/// Pre-parsed table row information
///
/// Table rows are computed lazily by `LintContext::table_rows`.
#[derive(Debug, Clone)]
pub struct TableRow {
    /// Line number (1-indexed)
    pub line: usize,
    /// Whether this is a separator row (contains only |, -, :, and spaces)
    pub is_separator: bool,
    /// Number of columns (pipe-separated cells)
    pub column_count: usize,
    /// Alignment info from separator row, one string per column
    pub column_alignments: Vec<String>, // "left", "center", "right", "none"
}
341
/// Pre-parsed bare URL information (not in links)
///
/// Bare URLs are computed lazily by `LintContext::bare_urls`.
#[derive(Debug, Clone)]
pub struct BareUrl {
    /// Line number (1-indexed)
    pub line: usize,
    /// Start column (0-indexed) in the line
    pub start_col: usize,
    /// End column (0-indexed) in the line
    pub end_col: usize,
    /// Byte offset in document
    pub byte_offset: usize,
    /// End byte offset in document
    pub byte_end: usize,
    /// The URL string
    pub url: String,
    /// Type of URL ("http", "https", "ftp", "email")
    pub url_type: String,
}
360
/// Shared, pre-parsed view of one Markdown document.
///
/// `LintContext::new` eagerly computes the public fields. The private
/// `*_cache` fields start out as `None` and are filled the first time the
/// matching accessor (`code_spans`, `html_tags`, `emphasis_spans`,
/// `table_rows`, `bare_urls`) is called; each accessor hands out a cheap
/// `Arc` clone of the cached vector.
pub struct LintContext<'a> {
    /// The document text this context borrows.
    pub content: &'a str,
    /// Byte offset at which each line starts: entry 0 is always 0 and every
    /// later entry is the byte just after a `\n`.
    pub line_offsets: Vec<usize>,
    pub code_blocks: Vec<(usize, usize)>, // Cached code block ranges (not including inline code spans)
    pub lines: Vec<LineInfo>,             // Pre-computed line information
    pub links: Vec<ParsedLink>,           // Pre-parsed links
    pub images: Vec<ParsedImage>,         // Pre-parsed images
    pub reference_defs: Vec<ReferenceDef>, // Reference definitions
    code_spans_cache: Mutex<Option<Arc<Vec<CodeSpan>>>>, // Lazy-loaded inline code spans
    pub list_blocks: Vec<ListBlock>,      // Pre-parsed list blocks
    pub char_frequency: CharFrequency,    // Character frequency analysis
    html_tags_cache: Mutex<Option<Arc<Vec<HtmlTag>>>>, // Lazy-loaded HTML tags
    emphasis_spans_cache: Mutex<Option<Arc<Vec<EmphasisSpan>>>>, // Lazy-loaded emphasis spans
    table_rows_cache: Mutex<Option<Arc<Vec<TableRow>>>>, // Lazy-loaded table rows
    bare_urls_cache: Mutex<Option<Arc<Vec<BareUrl>>>>, // Lazy-loaded bare URLs
}
377
378impl<'a> LintContext<'a> {
379    pub fn new(content: &'a str) -> Self {
380        let mut line_offsets = vec![0];
381        for (i, c) in content.char_indices() {
382            if c == '\n' {
383                line_offsets.push(i + 1);
384            }
385        }
386
387        // Detect code blocks once and cache them
388        let code_blocks = CodeBlockUtils::detect_code_blocks(content);
389
390        // Pre-compute line information
391        let lines = Self::compute_line_info(content, &line_offsets, &code_blocks);
392
393        // Parse links, images, references, and list blocks
394        // Skip code spans - they'll be computed lazily
395        let links = Self::parse_links(content, &lines, &code_blocks);
396        let images = Self::parse_images(content, &lines, &code_blocks);
397        let reference_defs = Self::parse_reference_defs(content, &lines);
398        let list_blocks = Self::parse_list_blocks(&lines);
399
400        // Compute character frequency for fast content analysis
401        let char_frequency = Self::compute_char_frequency(content);
402
403        Self {
404            content,
405            line_offsets,
406            code_blocks,
407            lines,
408            links,
409            images,
410            reference_defs,
411            code_spans_cache: Mutex::new(None),
412            list_blocks,
413            char_frequency,
414            html_tags_cache: Mutex::new(None),
415            emphasis_spans_cache: Mutex::new(None),
416            table_rows_cache: Mutex::new(None),
417            bare_urls_cache: Mutex::new(None),
418        }
419    }
420
421    /// Get code spans - computed lazily on first access
422    pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
423        let mut cache = self.code_spans_cache.lock().unwrap();
424
425        // Check if we need to compute code spans
426        if cache.is_none() {
427            let code_spans = Self::parse_code_spans(self.content, &self.lines);
428            *cache = Some(Arc::new(code_spans));
429        }
430
431        // Return a reference to the cached code spans
432        cache.as_ref().unwrap().clone()
433    }
434
435    /// Get HTML tags - computed lazily on first access
436    pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
437        let mut cache = self.html_tags_cache.lock().unwrap();
438
439        if cache.is_none() {
440            let html_tags = Self::parse_html_tags(self.content, &self.lines, &self.code_blocks);
441            *cache = Some(Arc::new(html_tags));
442        }
443
444        cache.as_ref().unwrap().clone()
445    }
446
447    /// Get emphasis spans - computed lazily on first access
448    pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
449        let mut cache = self.emphasis_spans_cache.lock().unwrap();
450
451        if cache.is_none() {
452            let emphasis_spans = Self::parse_emphasis_spans(self.content, &self.lines, &self.code_blocks);
453            *cache = Some(Arc::new(emphasis_spans));
454        }
455
456        cache.as_ref().unwrap().clone()
457    }
458
459    /// Get table rows - computed lazily on first access
460    pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
461        let mut cache = self.table_rows_cache.lock().unwrap();
462
463        if cache.is_none() {
464            let table_rows = Self::parse_table_rows(&self.lines);
465            *cache = Some(Arc::new(table_rows));
466        }
467
468        cache.as_ref().unwrap().clone()
469    }
470
471    /// Get bare URLs - computed lazily on first access
472    pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
473        let mut cache = self.bare_urls_cache.lock().unwrap();
474
475        if cache.is_none() {
476            let bare_urls = Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks);
477            *cache = Some(Arc::new(bare_urls));
478        }
479
480        cache.as_ref().unwrap().clone()
481    }
482
483    /// Map a byte offset to (line, column)
484    pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
485        match self.line_offsets.binary_search(&offset) {
486            Ok(line) => (line + 1, 1),
487            Err(line) => {
488                let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
489                (line, offset - line_start + 1)
490            }
491        }
492    }
493
494    /// Check if a position is within a code block or code span
495    pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
496        // Check code blocks first
497        if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
498            return true;
499        }
500
501        // Check inline code spans (lazy load if needed)
502        self.code_spans()
503            .iter()
504            .any(|span| pos >= span.byte_offset && pos < span.byte_end)
505    }
506
507    /// Get line information by line number (1-indexed)
508    pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
509        if line_num > 0 {
510            self.lines.get(line_num - 1)
511        } else {
512            None
513        }
514    }
515
516    /// Get byte offset for a line number (1-indexed)
517    pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
518        self.line_info(line_num).map(|info| info.byte_offset)
519    }
520
521    /// Get URL for a reference link/image by its ID
522    pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
523        let normalized_id = ref_id.to_lowercase();
524        self.reference_defs
525            .iter()
526            .find(|def| def.id == normalized_id)
527            .map(|def| def.url.as_str())
528    }
529
530    /// Get links on a specific line
531    pub fn links_on_line(&self, line_num: usize) -> Vec<&ParsedLink> {
532        self.links.iter().filter(|link| link.line == line_num).collect()
533    }
534
535    /// Get images on a specific line
536    pub fn images_on_line(&self, line_num: usize) -> Vec<&ParsedImage> {
537        self.images.iter().filter(|img| img.line == line_num).collect()
538    }
539
540    /// Check if a line is part of a list block
541    pub fn is_in_list_block(&self, line_num: usize) -> bool {
542        self.list_blocks
543            .iter()
544            .any(|block| line_num >= block.start_line && line_num <= block.end_line)
545    }
546
547    /// Get the list block containing a specific line
548    pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
549        self.list_blocks
550            .iter()
551            .find(|block| line_num >= block.start_line && line_num <= block.end_line)
552    }
553
554    /// Check if content has any instances of a specific character (fast)
555    pub fn has_char(&self, ch: char) -> bool {
556        match ch {
557            '#' => self.char_frequency.hash_count > 0,
558            '*' => self.char_frequency.asterisk_count > 0,
559            '_' => self.char_frequency.underscore_count > 0,
560            '-' => self.char_frequency.hyphen_count > 0,
561            '+' => self.char_frequency.plus_count > 0,
562            '>' => self.char_frequency.gt_count > 0,
563            '|' => self.char_frequency.pipe_count > 0,
564            '[' => self.char_frequency.bracket_count > 0,
565            '`' => self.char_frequency.backtick_count > 0,
566            '<' => self.char_frequency.lt_count > 0,
567            '!' => self.char_frequency.exclamation_count > 0,
568            '\n' => self.char_frequency.newline_count > 0,
569            _ => self.content.contains(ch), // Fallback for other characters
570        }
571    }
572
573    /// Get count of a specific character (fast)
574    pub fn char_count(&self, ch: char) -> usize {
575        match ch {
576            '#' => self.char_frequency.hash_count,
577            '*' => self.char_frequency.asterisk_count,
578            '_' => self.char_frequency.underscore_count,
579            '-' => self.char_frequency.hyphen_count,
580            '+' => self.char_frequency.plus_count,
581            '>' => self.char_frequency.gt_count,
582            '|' => self.char_frequency.pipe_count,
583            '[' => self.char_frequency.bracket_count,
584            '`' => self.char_frequency.backtick_count,
585            '<' => self.char_frequency.lt_count,
586            '!' => self.char_frequency.exclamation_count,
587            '\n' => self.char_frequency.newline_count,
588            _ => self.content.matches(ch).count(), // Fallback for other characters
589        }
590    }
591
592    /// Check if content likely contains headings (fast)
593    pub fn likely_has_headings(&self) -> bool {
594        self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 // Potential setext underlines
595    }
596
597    /// Check if content likely contains lists (fast)
598    pub fn likely_has_lists(&self) -> bool {
599        self.char_frequency.asterisk_count > 0
600            || self.char_frequency.hyphen_count > 0
601            || self.char_frequency.plus_count > 0
602    }
603
604    /// Check if content likely contains emphasis (fast)
605    pub fn likely_has_emphasis(&self) -> bool {
606        self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
607    }
608
609    /// Check if content likely contains tables (fast)
610    pub fn likely_has_tables(&self) -> bool {
611        self.char_frequency.pipe_count > 2
612    }
613
614    /// Check if content likely contains blockquotes (fast)
615    pub fn likely_has_blockquotes(&self) -> bool {
616        self.char_frequency.gt_count > 0
617    }
618
619    /// Check if content likely contains code (fast)
620    pub fn likely_has_code(&self) -> bool {
621        self.char_frequency.backtick_count > 0
622    }
623
624    /// Check if content likely contains links or images (fast)
625    pub fn likely_has_links_or_images(&self) -> bool {
626        self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
627    }
628
629    /// Check if content likely contains HTML (fast)
630    pub fn likely_has_html(&self) -> bool {
631        self.char_frequency.lt_count > 0
632    }
633
634    /// Get HTML tags on a specific line
635    pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
636        self.html_tags()
637            .iter()
638            .filter(|tag| tag.line == line_num)
639            .cloned()
640            .collect()
641    }
642
643    /// Get emphasis spans on a specific line
644    pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
645        self.emphasis_spans()
646            .iter()
647            .filter(|span| span.line == line_num)
648            .cloned()
649            .collect()
650    }
651
652    /// Get table rows on a specific line
653    pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
654        self.table_rows()
655            .iter()
656            .filter(|row| row.line == line_num)
657            .cloned()
658            .collect()
659    }
660
661    /// Get bare URLs on a specific line
662    pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
663        self.bare_urls()
664            .iter()
665            .filter(|url| url.line == line_num)
666            .cloned()
667            .collect()
668    }
669
670    /// Parse all links in the content
671    fn parse_links(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<ParsedLink> {
672        // Pre-size based on a heuristic: most markdown files have relatively few links
673        let mut links = Vec::with_capacity(content.len() / 500); // ~1 link per 500 chars
674
675        // Parse links across the entire content, not line by line
676        for cap in LINK_PATTERN.captures_iter(content) {
677            let full_match = cap.get(0).unwrap();
678            let match_start = full_match.start();
679            let match_end = full_match.end();
680
681            // Skip if the opening bracket is escaped (preceded by \)
682            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
683                continue;
684            }
685
686            // Skip if this is actually an image (preceded by !)
687            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
688                continue;
689            }
690
691            // Skip if in code block or span
692            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
693                continue;
694            }
695
696            // Find which line this link starts on
697            let mut line_num = 1;
698            let mut col_start = match_start;
699            for (idx, line_info) in lines.iter().enumerate() {
700                if match_start >= line_info.byte_offset {
701                    line_num = idx + 1;
702                    col_start = match_start - line_info.byte_offset;
703                } else {
704                    break;
705                }
706            }
707
708            // Find which line this link ends on (and calculate column on that line)
709            let mut end_line_num = 1;
710            let mut col_end = match_end;
711            for (idx, line_info) in lines.iter().enumerate() {
712                if match_end > line_info.byte_offset {
713                    end_line_num = idx + 1;
714                    col_end = match_end - line_info.byte_offset;
715                } else {
716                    break;
717                }
718            }
719
720            // For single-line links, use the same approach as before
721            if line_num == end_line_num {
722                // col_end is already correct
723            } else {
724                // For multi-line links, col_end represents the column on the ending line
725                // which is what we want
726            }
727
728            let text = cap.get(1).map_or("", |m| m.as_str()).to_string();
729
730            if let Some(inline_url) = cap.get(2) {
731                // Inline link
732                links.push(ParsedLink {
733                    line: line_num,
734                    start_col: col_start,
735                    end_col: col_end,
736                    byte_offset: match_start,
737                    byte_end: match_end,
738                    text,
739                    url: inline_url.as_str().to_string(),
740                    is_reference: false,
741                    reference_id: None,
742                });
743            } else if let Some(ref_id) = cap.get(3) {
744                // Reference link
745                let ref_id_str = ref_id.as_str();
746                let normalized_ref = if ref_id_str.is_empty() {
747                    text.to_lowercase() // Implicit reference
748                } else {
749                    ref_id_str.to_lowercase()
750                };
751
752                links.push(ParsedLink {
753                    line: line_num,
754                    start_col: col_start,
755                    end_col: col_end,
756                    byte_offset: match_start,
757                    byte_end: match_end,
758                    text,
759                    url: String::new(), // Will be resolved with reference_defs
760                    is_reference: true,
761                    reference_id: Some(normalized_ref),
762                });
763            }
764        }
765
766        links
767    }
768
769    /// Parse all images in the content
770    fn parse_images(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<ParsedImage> {
771        // Pre-size based on a heuristic: images are less common than links
772        let mut images = Vec::with_capacity(content.len() / 1000); // ~1 image per 1000 chars
773
774        // Parse images across the entire content, not line by line
775        for cap in IMAGE_PATTERN.captures_iter(content) {
776            let full_match = cap.get(0).unwrap();
777            let match_start = full_match.start();
778            let match_end = full_match.end();
779
780            // Skip if the ! is escaped (preceded by \)
781            if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
782                continue;
783            }
784
785            // Skip if in code block or span
786            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
787                continue;
788            }
789
790            // Find which line this image starts on
791            let mut line_num = 1;
792            let mut col_start = match_start;
793            for (idx, line_info) in lines.iter().enumerate() {
794                if match_start >= line_info.byte_offset {
795                    line_num = idx + 1;
796                    col_start = match_start - line_info.byte_offset;
797                } else {
798                    break;
799                }
800            }
801
802            // Find which line this image ends on (and calculate column on that line)
803            let mut end_line_num = 1;
804            let mut col_end = match_end;
805            for (idx, line_info) in lines.iter().enumerate() {
806                if match_end > line_info.byte_offset {
807                    end_line_num = idx + 1;
808                    col_end = match_end - line_info.byte_offset;
809                } else {
810                    break;
811                }
812            }
813
814            // For single-line images, use the same approach as before
815            if line_num == end_line_num {
816                // col_end is already correct
817            } else {
818                // For multi-line images, col_end represents the column on the ending line
819                // which is what we want
820            }
821
822            let alt_text = cap.get(1).map_or("", |m| m.as_str()).to_string();
823
824            if let Some(inline_url) = cap.get(2) {
825                // Inline image
826                images.push(ParsedImage {
827                    line: line_num,
828                    start_col: col_start,
829                    end_col: col_end,
830                    byte_offset: match_start,
831                    byte_end: match_end,
832                    alt_text,
833                    url: inline_url.as_str().to_string(),
834                    is_reference: false,
835                    reference_id: None,
836                });
837            } else if let Some(ref_id) = cap.get(3) {
838                // Reference image
839                let ref_id_str = ref_id.as_str();
840                let normalized_ref = if ref_id_str.is_empty() {
841                    alt_text.to_lowercase() // Implicit reference
842                } else {
843                    ref_id_str.to_lowercase()
844                };
845
846                images.push(ParsedImage {
847                    line: line_num,
848                    start_col: col_start,
849                    end_col: col_end,
850                    byte_offset: match_start,
851                    byte_end: match_end,
852                    alt_text,
853                    url: String::new(), // Will be resolved with reference_defs
854                    is_reference: true,
855                    reference_id: Some(normalized_ref),
856                });
857            }
858        }
859
860        images
861    }
862
863    /// Parse reference definitions
864    fn parse_reference_defs(_content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
865        // Pre-size based on lines count as reference definitions are line-based
866        let mut refs = Vec::with_capacity(lines.len() / 20); // ~1 ref per 20 lines
867
868        for (line_idx, line_info) in lines.iter().enumerate() {
869            // Skip lines in code blocks
870            if line_info.in_code_block {
871                continue;
872            }
873
874            let line = &line_info.content;
875            let line_num = line_idx + 1;
876
877            if let Some(cap) = REF_DEF_PATTERN.captures(line) {
878                let id = cap.get(1).unwrap().as_str().to_lowercase();
879                let url = cap.get(2).unwrap().as_str().to_string();
880                let title = cap.get(3).or_else(|| cap.get(4)).map(|m| m.as_str().to_string());
881
882                refs.push(ReferenceDef {
883                    line: line_num,
884                    id,
885                    url,
886                    title,
887                });
888            }
889        }
890
891        refs
892    }
893
894    /// Pre-compute line information
895    fn compute_line_info(content: &str, line_offsets: &[usize], code_blocks: &[(usize, usize)]) -> Vec<LineInfo> {
896        lazy_static! {
897            // Regex for list detection - allow any whitespace including no space (to catch malformed lists)
898            static ref UNORDERED_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)([-*+])([ \t]*)(.*)").unwrap();
899            static ref ORDERED_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(\d+)([.)])([ \t]*)(.*)").unwrap();
900
901            // Regex for blockquote prefix
902            static ref BLOCKQUOTE_REGEX: regex::Regex = regex::Regex::new(r"^(\s*>\s*)(.*)").unwrap();
903
904            // Regex for heading detection
905            static ref ATX_HEADING_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap();
906            static ref SETEXT_UNDERLINE_REGEX: regex::Regex = regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap();
907
908            // Regex for blockquote detection
909            static ref BLOCKQUOTE_REGEX_FULL: regex::Regex = regex::Regex::new(r"^(\s*)(>+)(\s*)(.*)$").unwrap();
910        }
911
912        let content_lines: Vec<&str> = content.lines().collect();
913        let mut lines = Vec::with_capacity(content_lines.len());
914
915        // Detect front matter boundaries FIRST, before any other parsing
916        let mut in_front_matter = false;
917        let mut front_matter_end = 0;
918        if content_lines.first().map(|l| l.trim()) == Some("---") {
919            in_front_matter = true;
920            for (idx, line) in content_lines.iter().enumerate().skip(1) {
921                if line.trim() == "---" {
922                    front_matter_end = idx;
923                    break;
924                }
925            }
926        }
927
928        for (i, line) in content_lines.iter().enumerate() {
929            let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
930            let indent = line.len() - line.trim_start().len();
931            // For blank detection, consider blockquote context
932            let is_blank = if let Some(caps) = BLOCKQUOTE_REGEX.captures(line) {
933                // In blockquote context, check if content after prefix is blank
934                let after_prefix = caps.get(2).map_or("", |m| m.as_str());
935                after_prefix.trim().is_empty()
936            } else {
937                line.trim().is_empty()
938            };
939            // Check if this line is inside a code block (not inline code span)
940            // We only want to check for fenced/indented code blocks, not inline code
941            let in_code_block = code_blocks.iter().any(|&(start, end)| {
942                // Only consider ranges that span multiple lines (code blocks)
943                // Inline code spans are typically on a single line
944                let block_content = &content[start..end];
945                let is_multiline = block_content.contains('\n');
946                let is_fenced = block_content.starts_with("```") || block_content.starts_with("~~~");
947                let is_indented = !is_fenced
948                    && block_content
949                        .lines()
950                        .all(|l| l.starts_with("    ") || l.starts_with("\t") || l.trim().is_empty());
951
952                byte_offset >= start && byte_offset < end && (is_multiline || is_fenced || is_indented)
953            });
954
955            // Detect list items (skip if in frontmatter)
956            let list_item = if !(in_code_block || is_blank || in_front_matter && i <= front_matter_end) {
957                // Strip blockquote prefix if present for list detection
958                let (line_for_list_check, blockquote_prefix_len) = if let Some(caps) = BLOCKQUOTE_REGEX.captures(line) {
959                    let prefix = caps.get(1).unwrap().as_str();
960                    let content = caps.get(2).unwrap().as_str();
961                    (content, prefix.len())
962                } else {
963                    (&**line, 0)
964                };
965
966                if let Some(caps) = UNORDERED_REGEX.captures(line_for_list_check) {
967                    let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
968                    let marker = caps.get(2).map_or("", |m| m.as_str());
969                    let spacing = caps.get(3).map_or("", |m| m.as_str());
970                    let _content = caps.get(4).map_or("", |m| m.as_str());
971                    let marker_column = blockquote_prefix_len + leading_spaces.len();
972                    let content_column = marker_column + marker.len() + spacing.len();
973
974                    // According to CommonMark spec, unordered list items MUST have at least one space
975                    // after the marker (-, *, or +). Without a space, it's not a list item.
976                    // This also naturally handles cases like:
977                    // - *emphasis* (not a list)
978                    // - **bold** (not a list)
979                    // - --- (horizontal rule, not a list)
980                    if spacing.is_empty() {
981                        None
982                    } else {
983                        Some(ListItemInfo {
984                            marker: marker.to_string(),
985                            is_ordered: false,
986                            number: None,
987                            marker_column,
988                            content_column,
989                        })
990                    }
991                } else if let Some(caps) = ORDERED_REGEX.captures(line_for_list_check) {
992                    let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
993                    let number_str = caps.get(2).map_or("", |m| m.as_str());
994                    let delimiter = caps.get(3).map_or("", |m| m.as_str());
995                    let spacing = caps.get(4).map_or("", |m| m.as_str());
996                    let _content = caps.get(5).map_or("", |m| m.as_str());
997                    let marker = format!("{number_str}{delimiter}");
998                    let marker_column = blockquote_prefix_len + leading_spaces.len();
999                    let content_column = marker_column + marker.len() + spacing.len();
1000
1001                    // According to CommonMark spec, ordered list items MUST have at least one space
1002                    // after the marker (period or parenthesis). Without a space, it's not a list item.
1003                    if spacing.is_empty() {
1004                        None
1005                    } else {
1006                        Some(ListItemInfo {
1007                            marker,
1008                            is_ordered: true,
1009                            number: number_str.parse().ok(),
1010                            marker_column,
1011                            content_column,
1012                        })
1013                    }
1014                } else {
1015                    None
1016                }
1017            } else {
1018                None
1019            };
1020
1021            lines.push(LineInfo {
1022                content: line.to_string(),
1023                byte_offset,
1024                indent,
1025                is_blank,
1026                in_code_block,
1027                in_front_matter: in_front_matter && i <= front_matter_end,
1028                list_item,
1029                heading: None,    // Will be populated in second pass for Setext headings
1030                blockquote: None, // Will be populated after line creation
1031            });
1032        }
1033
1034        // Second pass: detect headings (including Setext which needs look-ahead) and blockquotes
1035        for i in 0..content_lines.len() {
1036            if lines[i].in_code_block {
1037                continue;
1038            }
1039
1040            // Skip lines in front matter
1041            if in_front_matter && i <= front_matter_end {
1042                continue;
1043            }
1044
1045            let line = content_lines[i];
1046
1047            // Check for blockquotes (even on blank lines within blockquotes)
1048            if let Some(caps) = BLOCKQUOTE_REGEX_FULL.captures(line) {
1049                let indent_str = caps.get(1).map_or("", |m| m.as_str());
1050                let markers = caps.get(2).map_or("", |m| m.as_str());
1051                let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1052                let content = caps.get(4).map_or("", |m| m.as_str());
1053
1054                let nesting_level = markers.chars().filter(|&c| c == '>').count();
1055                let marker_column = indent_str.len();
1056
1057                // Build the prefix (indentation + markers + space)
1058                let prefix = format!("{indent_str}{markers}{spaces_after}");
1059
1060                // Check for various blockquote issues
1061                let has_no_space = spaces_after.is_empty() && !content.is_empty();
1062                // Consider tabs as multiple spaces, or actual multiple spaces
1063                let has_multiple_spaces = spaces_after.len() > 1 || spaces_after.contains('\t');
1064
1065                // Check if needs MD028 fix (empty blockquote without proper spacing)
1066                let needs_md028_fix = content.trim().is_empty() && spaces_after.is_empty();
1067
1068                lines[i].blockquote = Some(BlockquoteInfo {
1069                    nesting_level,
1070                    indent: indent_str.to_string(),
1071                    marker_column,
1072                    prefix,
1073                    content: content.to_string(),
1074                    has_no_space_after_marker: has_no_space,
1075                    has_multiple_spaces_after_marker: has_multiple_spaces,
1076                    needs_md028_fix,
1077                });
1078            }
1079
1080            // Skip heading detection for blank lines
1081            if lines[i].is_blank {
1082                continue;
1083            }
1084
1085            // Check for ATX headings
1086            if let Some(caps) = ATX_HEADING_REGEX.captures(line) {
1087                let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
1088                let hashes = caps.get(2).map_or("", |m| m.as_str());
1089                let spaces_after = caps.get(3).map_or("", |m| m.as_str());
1090                let rest = caps.get(4).map_or("", |m| m.as_str());
1091
1092                let level = hashes.len() as u8;
1093                let marker_column = leading_spaces.len();
1094
1095                // Check for closing sequence, but handle custom IDs that might come after
1096                let (text, has_closing, closing_seq) = {
1097                    // First check if there's a custom ID at the end
1098                    let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
1099                        // Check if this looks like a valid custom ID (ends with })
1100                        if rest[id_start..].trim_end().ends_with('}') {
1101                            // Split off the custom ID
1102                            (&rest[..id_start], &rest[id_start..])
1103                        } else {
1104                            (rest, "")
1105                        }
1106                    } else {
1107                        (rest, "")
1108                    };
1109
1110                    // Now look for closing hashes in the part before the custom ID
1111                    let trimmed_rest = rest_without_id.trim_end();
1112                    if let Some(last_hash_pos) = trimmed_rest.rfind('#') {
1113                        // Look for the start of the hash sequence
1114                        let mut start_of_hashes = last_hash_pos;
1115                        while start_of_hashes > 0 && trimmed_rest.chars().nth(start_of_hashes - 1) == Some('#') {
1116                            start_of_hashes -= 1;
1117                        }
1118
1119                        // Check if there's at least one space before the closing hashes
1120                        let has_space_before = start_of_hashes == 0
1121                            || trimmed_rest
1122                                .chars()
1123                                .nth(start_of_hashes - 1)
1124                                .is_some_and(|c| c.is_whitespace());
1125
1126                        // Check if this is a valid closing sequence (all hashes to end of trimmed part)
1127                        let potential_closing = &trimmed_rest[start_of_hashes..];
1128                        let is_all_hashes = potential_closing.chars().all(|c| c == '#');
1129
1130                        if is_all_hashes && has_space_before {
1131                            // This is a closing sequence
1132                            let closing_hashes = potential_closing.to_string();
1133                            // The text is everything before the closing hashes
1134                            // Don't include the custom ID here - it will be extracted later
1135                            let text_part = if !custom_id_part.is_empty() {
1136                                // If we have a custom ID, append it back to get the full rest
1137                                // This allows the extract_header_id function to handle it properly
1138                                format!("{}{}", rest_without_id[..start_of_hashes].trim_end(), custom_id_part)
1139                            } else {
1140                                rest_without_id[..start_of_hashes].trim_end().to_string()
1141                            };
1142                            (text_part, true, closing_hashes)
1143                        } else {
1144                            // Not a valid closing sequence, return the full content
1145                            (rest.to_string(), false, String::new())
1146                        }
1147                    } else {
1148                        // No hashes found, return the full content
1149                        (rest.to_string(), false, String::new())
1150                    }
1151                };
1152
1153                let content_column = marker_column + hashes.len() + spaces_after.len();
1154
1155                // Extract custom header ID if present
1156                let raw_text = text.trim().to_string();
1157                let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1158
1159                // If no custom ID was found on the header line, check the next line for standalone attr-list
1160                if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
1161                    let next_line = content_lines[i + 1];
1162                    if !lines[i + 1].in_code_block
1163                        && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
1164                        && let Some(next_line_id) =
1165                            crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
1166                    {
1167                        custom_id = Some(next_line_id);
1168                    }
1169                }
1170
1171                lines[i].heading = Some(HeadingInfo {
1172                    level,
1173                    style: HeadingStyle::ATX,
1174                    marker: hashes.to_string(),
1175                    marker_column,
1176                    content_column,
1177                    text: clean_text,
1178                    custom_id,
1179                    raw_text,
1180                    has_closing_sequence: has_closing,
1181                    closing_sequence: closing_seq,
1182                });
1183            }
1184            // Check for Setext headings (need to look at next line)
1185            else if i + 1 < content_lines.len() {
1186                let next_line = content_lines[i + 1];
1187                if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
1188                    // Skip if next line is front matter delimiter
1189                    if in_front_matter && i < front_matter_end {
1190                        continue;
1191                    }
1192
1193                    let underline = next_line.trim();
1194                    let level = if underline.starts_with('=') { 1 } else { 2 };
1195                    let style = if level == 1 {
1196                        HeadingStyle::Setext1
1197                    } else {
1198                        HeadingStyle::Setext2
1199                    };
1200
1201                    // Extract custom header ID if present
1202                    let raw_text = line.trim().to_string();
1203                    let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);
1204
1205                    // If no custom ID was found on the header line, check the line after underline for standalone attr-list
1206                    if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
1207                        let attr_line = content_lines[i + 2];
1208                        if !lines[i + 2].in_code_block
1209                            && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
1210                            && let Some(attr_line_id) =
1211                                crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
1212                        {
1213                            custom_id = Some(attr_line_id);
1214                        }
1215                    }
1216
1217                    lines[i].heading = Some(HeadingInfo {
1218                        level,
1219                        style,
1220                        marker: underline.to_string(),
1221                        marker_column: next_line.len() - next_line.trim_start().len(),
1222                        content_column: lines[i].indent,
1223                        text: clean_text,
1224                        custom_id,
1225                        raw_text,
1226                        has_closing_sequence: false,
1227                        closing_sequence: String::new(),
1228                    });
1229                }
1230            }
1231        }
1232
1233        lines
1234    }
1235
1236    /// Parse all inline code spans in the content
1237    fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
1238        // Pre-size based on content - code spans are fairly common
1239        let mut code_spans = Vec::with_capacity(content.matches('`').count() / 2); // Each code span has 2 backticks
1240
1241        // Quick check - if no backticks, no code spans
1242        if !content.contains('`') {
1243            return code_spans;
1244        }
1245
1246        let mut pos = 0;
1247        let bytes = content.as_bytes();
1248
1249        while pos < bytes.len() {
1250            // Find the next backtick
1251            if let Some(backtick_start) = content[pos..].find('`') {
1252                let start_pos = pos + backtick_start;
1253
1254                // Skip if this backtick is inside a code block
1255                let mut in_code_block = false;
1256                for (line_idx, line_info) in lines.iter().enumerate() {
1257                    if start_pos >= line_info.byte_offset
1258                        && (line_idx + 1 >= lines.len() || start_pos < lines[line_idx + 1].byte_offset)
1259                    {
1260                        in_code_block = line_info.in_code_block;
1261                        break;
1262                    }
1263                }
1264
1265                if in_code_block {
1266                    pos = start_pos + 1;
1267                    continue;
1268                }
1269
1270                // Count consecutive backticks
1271                let mut backtick_count = 0;
1272                let mut i = start_pos;
1273                while i < bytes.len() && bytes[i] == b'`' {
1274                    backtick_count += 1;
1275                    i += 1;
1276                }
1277
1278                // Look for matching closing backticks
1279                let search_start = start_pos + backtick_count;
1280                let closing_pattern = &content[start_pos..start_pos + backtick_count];
1281
1282                if let Some(rel_end) = content[search_start..].find(closing_pattern) {
1283                    // Check that the closing backticks are not followed by more backticks
1284                    let end_pos = search_start + rel_end;
1285                    let check_pos = end_pos + backtick_count;
1286
1287                    // Make sure we have exactly the right number of backticks (not more)
1288                    if check_pos >= bytes.len() || bytes[check_pos] != b'`' {
1289                        // We found a valid code span
1290                        let content_start = start_pos + backtick_count;
1291                        let content_end = end_pos;
1292                        let span_content = content[content_start..content_end].to_string();
1293
1294                        // Find which line this code span starts on
1295                        let mut line_num = 1;
1296                        let mut col_start = start_pos;
1297                        for (idx, line_info) in lines.iter().enumerate() {
1298                            if start_pos >= line_info.byte_offset {
1299                                line_num = idx + 1;
1300                                col_start = start_pos - line_info.byte_offset;
1301                            } else {
1302                                break;
1303                            }
1304                        }
1305
1306                        // Find end column
1307                        let mut col_end = end_pos + backtick_count;
1308                        for line_info in lines.iter() {
1309                            if end_pos + backtick_count > line_info.byte_offset {
1310                                col_end = end_pos + backtick_count - line_info.byte_offset;
1311                            } else {
1312                                break;
1313                            }
1314                        }
1315
1316                        code_spans.push(CodeSpan {
1317                            line: line_num,
1318                            start_col: col_start,
1319                            end_col: col_end,
1320                            byte_offset: start_pos,
1321                            byte_end: end_pos + backtick_count,
1322                            backtick_count,
1323                            content: span_content,
1324                        });
1325
1326                        // Continue searching after this code span
1327                        pos = end_pos + backtick_count;
1328                        continue;
1329                    }
1330                }
1331
1332                // No matching closing backticks found, move past these opening backticks
1333                pos = start_pos + backtick_count;
1334            } else {
1335                // No more backticks found
1336                break;
1337            }
1338        }
1339
1340        code_spans
1341    }
1342
1343    /// Parse all list blocks in the content
1344    fn parse_list_blocks(lines: &[LineInfo]) -> Vec<ListBlock> {
1345        // Pre-size based on lines that could be list items
1346        let mut list_blocks = Vec::with_capacity(lines.len() / 10); // Estimate ~10% of lines might start list blocks
1347        let mut current_block: Option<ListBlock> = None;
1348        let mut last_list_item_line = 0;
1349        let mut current_indent_level = 0;
1350        let mut last_marker_width = 0;
1351
1352        for (line_idx, line_info) in lines.iter().enumerate() {
1353            let line_num = line_idx + 1;
1354
1355            // Enhanced code block handling using Design #3's context analysis
1356            if line_info.in_code_block {
1357                if let Some(ref mut block) = current_block {
1358                    // Calculate minimum indentation for list continuation
1359                    let min_continuation_indent = CodeBlockUtils::calculate_min_continuation_indent(lines, line_idx);
1360
1361                    // Analyze code block context using the three-tier classification
1362                    let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
1363
1364                    match context {
1365                        CodeBlockContext::Indented => {
1366                            // Code block is properly indented - continues the list
1367                            block.end_line = line_num;
1368                            continue;
1369                        }
1370                        CodeBlockContext::Standalone => {
1371                            // Code block separates lists - end current block
1372                            let completed_block = current_block.take().unwrap();
1373                            list_blocks.push(completed_block);
1374                            continue;
1375                        }
1376                        CodeBlockContext::Adjacent => {
1377                            // Edge case - use conservative behavior (continue list)
1378                            block.end_line = line_num;
1379                            continue;
1380                        }
1381                    }
1382                } else {
1383                    // No current list block - skip code block lines
1384                    continue;
1385                }
1386            }
1387
1388            // Extract blockquote prefix if any
1389            let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(&line_info.content) {
1390                caps.get(0).unwrap().as_str().to_string()
1391            } else {
1392                String::new()
1393            };
1394
1395            // Check if this line is a list item
1396            if let Some(list_item) = &line_info.list_item {
1397                // Calculate nesting level based on indentation
1398                let item_indent = list_item.marker_column;
1399                let nesting = item_indent / 2; // Assume 2-space indentation for nesting
1400
1401                if let Some(ref mut block) = current_block {
1402                    // Check if this continues the current block
1403                    // For nested lists, we need to check if this is a nested item (higher nesting level)
1404                    // or a continuation at the same or lower level
1405                    let is_nested = nesting > block.nesting_level;
1406                    let same_type =
1407                        (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
1408                    let same_context = block.blockquote_prefix == blockquote_prefix;
1409                    let reasonable_distance = line_num <= last_list_item_line + 2; // Allow one blank line
1410
1411                    // For unordered lists, also check marker consistency
1412                    let marker_compatible =
1413                        block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
1414
1415                    // Check if there's non-list content between the last item and this one
1416                    let has_non_list_content = {
1417                        let mut found_non_list = false;
1418                        // Use the last item from the current block, not the global last_list_item_line
1419                        let block_last_item_line = block.item_lines.last().copied().unwrap_or(block.end_line);
1420                        for check_line in (block_last_item_line + 1)..line_num {
1421                            let check_idx = check_line - 1;
1422                            if check_idx < lines.len() {
1423                                let check_info = &lines[check_idx];
1424                                // Check for content that breaks the list
1425                                let is_list_breaking_content = if check_info.in_code_block {
1426                                    // Use enhanced code block classification for list separation
1427                                    let last_item_marker_width =
1428                                        if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1429                                            lines[block_last_item_line - 1]
1430                                                .list_item
1431                                                .as_ref()
1432                                                .map(|li| {
1433                                                    if li.is_ordered {
1434                                                        li.marker.len() + 1 // Add 1 for the space after ordered list markers
1435                                                    } else {
1436                                                        li.marker.len()
1437                                                    }
1438                                                })
1439                                                .unwrap_or(3) // fallback to 3 if no list item found
1440                                        } else {
1441                                            3 // fallback
1442                                        };
1443
1444                                    let min_continuation = if block.is_ordered { last_item_marker_width } else { 2 };
1445
1446                                    // Analyze code block context using our enhanced classification
1447                                    let context = CodeBlockUtils::analyze_code_block_context(
1448                                        lines,
1449                                        check_line - 1,
1450                                        min_continuation,
1451                                    );
1452
1453                                    // Standalone code blocks break lists, indented ones continue them
1454                                    matches!(context, CodeBlockContext::Standalone)
1455                                } else if !check_info.is_blank && check_info.list_item.is_none() {
1456                                    // Check for structural separators that should break lists (from issue #42)
1457                                    let line_content = check_info.content.trim();
1458
1459                                    // Any of these structural separators break lists
1460                                    if check_info.heading.is_some()
1461                                        || line_content.starts_with("---")
1462                                        || line_content.starts_with("***")
1463                                        || line_content.starts_with("___")
1464                                        || line_content.contains('|')
1465                                        || line_content.starts_with(">")
1466                                    {
1467                                        true
1468                                    }
1469                                    // Other non-list content - check if properly indented
1470                                    else {
1471                                        let last_item_marker_width =
1472                                            if block_last_item_line > 0 && block_last_item_line <= lines.len() {
1473                                                lines[block_last_item_line - 1]
1474                                                    .list_item
1475                                                    .as_ref()
1476                                                    .map(|li| {
1477                                                        if li.is_ordered {
1478                                                            li.marker.len() + 1 // Add 1 for the space after ordered list markers
1479                                                        } else {
1480                                                            li.marker.len()
1481                                                        }
1482                                                    })
1483                                                    .unwrap_or(3) // fallback to 3 if no list item found
1484                                            } else {
1485                                                3 // fallback
1486                                            };
1487
1488                                        let min_continuation =
1489                                            if block.is_ordered { last_item_marker_width } else { 2 };
1490                                        check_info.indent < min_continuation
1491                                    }
1492                                } else {
1493                                    false
1494                                };
1495
1496                                if is_list_breaking_content {
1497                                    // Not indented enough, so it breaks the list
1498                                    found_non_list = true;
1499                                    break;
1500                                }
1501                            }
1502                        }
1503                        found_non_list
1504                    };
1505
1506                    // A list continues if:
1507                    // 1. It's a nested item (indented more than the parent), OR
1508                    // 2. It's the same type at the same level with reasonable distance
1509                    let continues_list = if is_nested {
1510                        // Nested items always continue the list if they're in the same context
1511                        same_context && reasonable_distance && !has_non_list_content
1512                    } else {
1513                        // Same-level items need to match type and markers
1514                        same_type && same_context && reasonable_distance && marker_compatible && !has_non_list_content
1515                    };
1516
1517                    if continues_list {
1518                        // Extend current block
1519                        block.end_line = line_num;
1520                        block.item_lines.push(line_num);
1521
1522                        // Update max marker width
1523                        block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
1524                            list_item.marker.len() + 1
1525                        } else {
1526                            list_item.marker.len()
1527                        });
1528
1529                        // Update marker consistency for unordered lists
1530                        if !block.is_ordered
1531                            && block.marker.is_some()
1532                            && block.marker.as_ref() != Some(&list_item.marker)
1533                        {
1534                            // Mixed markers, clear the marker field
1535                            block.marker = None;
1536                        }
1537                    } else {
1538                        // End current block and start a new one
1539                        list_blocks.push(block.clone());
1540
1541                        *block = ListBlock {
1542                            start_line: line_num,
1543                            end_line: line_num,
1544                            is_ordered: list_item.is_ordered,
1545                            marker: if list_item.is_ordered {
1546                                None
1547                            } else {
1548                                Some(list_item.marker.clone())
1549                            },
1550                            blockquote_prefix: blockquote_prefix.clone(),
1551                            item_lines: vec![line_num],
1552                            nesting_level: nesting,
1553                            max_marker_width: if list_item.is_ordered {
1554                                list_item.marker.len() + 1
1555                            } else {
1556                                list_item.marker.len()
1557                            },
1558                        };
1559                    }
1560                } else {
1561                    // Start a new block
1562                    current_block = Some(ListBlock {
1563                        start_line: line_num,
1564                        end_line: line_num,
1565                        is_ordered: list_item.is_ordered,
1566                        marker: if list_item.is_ordered {
1567                            None
1568                        } else {
1569                            Some(list_item.marker.clone())
1570                        },
1571                        blockquote_prefix,
1572                        item_lines: vec![line_num],
1573                        nesting_level: nesting,
1574                        max_marker_width: list_item.marker.len(),
1575                    });
1576                }
1577
1578                last_list_item_line = line_num;
1579                current_indent_level = item_indent;
1580                last_marker_width = if list_item.is_ordered {
1581                    list_item.marker.len() + 1 // Add 1 for the space after ordered list markers
1582                } else {
1583                    list_item.marker.len()
1584                };
1585            } else if let Some(ref mut block) = current_block {
1586                // Not a list item - check if it continues the current block
1587
1588                // For MD032 compatibility, we use a simple approach:
1589                // - Indented lines continue the list
1590                // - Blank lines followed by indented content continue the list
1591                // - Everything else ends the list
1592
1593                // Calculate minimum indentation for list continuation
1594                // For ordered lists, use the last marker width (e.g., 3 for "1. ", 4 for "10. ")
1595                // For unordered lists like "- ", content starts at column 2, so continuations need at least 2 spaces
1596                let min_continuation_indent = if block.is_ordered {
1597                    current_indent_level + last_marker_width
1598                } else {
1599                    current_indent_level + 2 // Unordered lists need at least 2 spaces (e.g., "- " = 2 chars)
1600                };
1601
1602                if line_info.indent >= min_continuation_indent {
1603                    // Indented line continues the list
1604                    block.end_line = line_num;
1605                } else if line_info.is_blank {
1606                    // Blank line - check if it's internal to the list or ending it
1607                    // We only include blank lines that are followed by more list content
1608                    let mut check_idx = line_idx + 1;
1609                    let mut found_continuation = false;
1610
1611                    // Skip additional blank lines
1612                    while check_idx < lines.len() && lines[check_idx].is_blank {
1613                        check_idx += 1;
1614                    }
1615
1616                    if check_idx < lines.len() {
1617                        let next_line = &lines[check_idx];
1618                        // Check if followed by indented content (list continuation)
1619                        if !next_line.in_code_block && next_line.indent >= min_continuation_indent {
1620                            found_continuation = true;
1621                        }
1622                        // Check if followed by another list item at the same level
1623                        else if !next_line.in_code_block
1624                            && next_line.list_item.is_some()
1625                            && let Some(item) = &next_line.list_item
1626                        {
1627                            let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
1628                                .find(&next_line.content)
1629                                .map_or(String::new(), |m| m.as_str().to_string());
1630                            if item.marker_column == current_indent_level
1631                                && item.is_ordered == block.is_ordered
1632                                && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
1633                            {
1634                                // Check if there was meaningful content between the list items (unused now)
1635                                // This variable is kept for potential future use but is currently replaced by has_structural_separators
1636                                let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
1637                                    if let Some(between_line) = lines.get(idx) {
1638                                        let trimmed = between_line.content.trim();
1639                                        // Skip empty lines
1640                                        if trimmed.is_empty() {
1641                                            return false;
1642                                        }
1643                                        // Check for meaningful content
1644                                        let line_indent =
1645                                            between_line.content.len() - between_line.content.trim_start().len();
1646
1647                                        // Structural separators (code fences, headings, etc.) are meaningful and should BREAK lists
1648                                        if trimmed.starts_with("```")
1649                                            || trimmed.starts_with("~~~")
1650                                            || trimmed.starts_with("---")
1651                                            || trimmed.starts_with("***")
1652                                            || trimmed.starts_with("___")
1653                                            || trimmed.starts_with(">")
1654                                            || trimmed.contains('|') // Tables
1655                                            || between_line.heading.is_some()
1656                                        {
1657                                            return true; // These are structural separators - meaningful content that breaks lists
1658                                        }
1659
1660                                        // Only properly indented content continues the list
1661                                        line_indent >= min_continuation_indent
1662                                    } else {
1663                                        false
1664                                    }
1665                                });
1666
1667                                if block.is_ordered {
1668                                    // For ordered lists: don't continue if there are structural separators
1669                                    // Check if there are structural separators between the list items
1670                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
1671                                        if let Some(between_line) = lines.get(idx) {
1672                                            let trimmed = between_line.content.trim();
1673                                            if trimmed.is_empty() {
1674                                                return false;
1675                                            }
1676                                            // Check for structural separators that break lists
1677                                            trimmed.starts_with("```")
1678                                                || trimmed.starts_with("~~~")
1679                                                || trimmed.starts_with("---")
1680                                                || trimmed.starts_with("***")
1681                                                || trimmed.starts_with("___")
1682                                                || trimmed.starts_with(">")
1683                                                || trimmed.contains('|') // Tables
1684                                                || between_line.heading.is_some()
1685                                        } else {
1686                                            false
1687                                        }
1688                                    });
1689                                    found_continuation = !has_structural_separators;
1690                                } else {
1691                                    // For unordered lists: also check for structural separators
1692                                    let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
1693                                        if let Some(between_line) = lines.get(idx) {
1694                                            let trimmed = between_line.content.trim();
1695                                            if trimmed.is_empty() {
1696                                                return false;
1697                                            }
1698                                            // Check for structural separators that break lists
1699                                            trimmed.starts_with("```")
1700                                                || trimmed.starts_with("~~~")
1701                                                || trimmed.starts_with("---")
1702                                                || trimmed.starts_with("***")
1703                                                || trimmed.starts_with("___")
1704                                                || trimmed.starts_with(">")
1705                                                || trimmed.contains('|') // Tables
1706                                                || between_line.heading.is_some()
1707                                        } else {
1708                                            false
1709                                        }
1710                                    });
1711                                    found_continuation = !has_structural_separators;
1712                                }
1713                            }
1714                        }
1715                    }
1716
1717                    if found_continuation {
1718                        // Include the blank line in the block
1719                        block.end_line = line_num;
1720                    } else {
1721                        // Blank line ends the list - don't include it
1722                        list_blocks.push(block.clone());
1723                        current_block = None;
1724                    }
1725                } else {
1726                    // Check for lazy continuation - non-indented line immediately after a list item
1727                    // But only if the line has sufficient indentation for the list type
1728                    let min_required_indent = if block.is_ordered {
1729                        current_indent_level + last_marker_width
1730                    } else {
1731                        current_indent_level + 2
1732                    };
1733
1734                    // For lazy continuation to apply, the line must either:
1735                    // 1. Have no indentation (true lazy continuation)
1736                    // 2. Have sufficient indentation for the list type
1737                    // BUT structural separators (headings, code blocks, etc.) should never be lazy continuations
1738                    let line_content = line_info.content.trim();
1739                    let is_structural_separator = line_info.heading.is_some()
1740                        || line_content.starts_with("```")
1741                        || line_content.starts_with("~~~")
1742                        || line_content.starts_with("---")
1743                        || line_content.starts_with("***")
1744                        || line_content.starts_with("___")
1745                        || line_content.starts_with(">")
1746                        || line_content.contains('|'); // Tables
1747
1748                    let is_lazy_continuation = last_list_item_line == line_num - 1
1749                        && !is_structural_separator
1750                        && !line_info.is_blank
1751                        && (line_info.indent == 0 || line_info.indent >= min_required_indent);
1752
1753                    if is_lazy_continuation {
1754                        // Additional check: if the line starts with uppercase and looks like a new sentence,
1755                        // it's probably not a continuation
1756                        let content_to_check = if !blockquote_prefix.is_empty() {
1757                            // Strip blockquote prefix to check the actual content
1758                            line_info
1759                                .content
1760                                .strip_prefix(&blockquote_prefix)
1761                                .unwrap_or(&line_info.content)
1762                                .trim()
1763                        } else {
1764                            line_info.content.trim()
1765                        };
1766
1767                        let starts_with_uppercase = content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
1768
1769                        // If it starts with uppercase and the previous line ended with punctuation,
1770                        // it's likely a new paragraph, not a continuation
1771                        if starts_with_uppercase && last_list_item_line > 0 {
1772                            // This looks like a new paragraph
1773                            list_blocks.push(block.clone());
1774                            current_block = None;
1775                        } else {
1776                            // This is a lazy continuation line
1777                            block.end_line = line_num;
1778                        }
1779                    } else {
1780                        // Non-indented, non-blank line that's not a lazy continuation - end the block
1781                        list_blocks.push(block.clone());
1782                        current_block = None;
1783                    }
1784                }
1785            }
1786        }
1787
1788        // Don't forget the last block
1789        if let Some(block) = current_block {
1790            list_blocks.push(block);
1791        }
1792
1793        // Merge adjacent blocks that should be one
1794        merge_adjacent_list_blocks(&mut list_blocks, lines);
1795
1796        list_blocks
1797    }
1798
1799    /// Compute character frequency for fast content analysis
1800    fn compute_char_frequency(content: &str) -> CharFrequency {
1801        let mut frequency = CharFrequency::default();
1802
1803        for ch in content.chars() {
1804            match ch {
1805                '#' => frequency.hash_count += 1,
1806                '*' => frequency.asterisk_count += 1,
1807                '_' => frequency.underscore_count += 1,
1808                '-' => frequency.hyphen_count += 1,
1809                '+' => frequency.plus_count += 1,
1810                '>' => frequency.gt_count += 1,
1811                '|' => frequency.pipe_count += 1,
1812                '[' => frequency.bracket_count += 1,
1813                '`' => frequency.backtick_count += 1,
1814                '<' => frequency.lt_count += 1,
1815                '!' => frequency.exclamation_count += 1,
1816                '\n' => frequency.newline_count += 1,
1817                _ => {}
1818            }
1819        }
1820
1821        frequency
1822    }
1823
1824    /// Parse HTML tags in the content
1825    fn parse_html_tags(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<HtmlTag> {
1826        lazy_static! {
1827            static ref HTML_TAG_REGEX: regex::Regex =
1828                regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9]*)\b[^>]*(/?)>").unwrap();
1829        }
1830
1831        let mut html_tags = Vec::with_capacity(content.matches('<').count());
1832
1833        for cap in HTML_TAG_REGEX.captures_iter(content) {
1834            let full_match = cap.get(0).unwrap();
1835            let match_start = full_match.start();
1836            let match_end = full_match.end();
1837
1838            // Skip if in code block
1839            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
1840                continue;
1841            }
1842
1843            let is_closing = !cap.get(1).unwrap().as_str().is_empty();
1844            let tag_name = cap.get(2).unwrap().as_str().to_lowercase();
1845            let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
1846
1847            // Find which line this tag is on
1848            let mut line_num = 1;
1849            let mut col_start = match_start;
1850            let mut col_end = match_end;
1851            for (idx, line_info) in lines.iter().enumerate() {
1852                if match_start >= line_info.byte_offset {
1853                    line_num = idx + 1;
1854                    col_start = match_start - line_info.byte_offset;
1855                    col_end = match_end - line_info.byte_offset;
1856                } else {
1857                    break;
1858                }
1859            }
1860
1861            html_tags.push(HtmlTag {
1862                line: line_num,
1863                start_col: col_start,
1864                end_col: col_end,
1865                byte_offset: match_start,
1866                byte_end: match_end,
1867                tag_name,
1868                is_closing,
1869                is_self_closing,
1870                raw_content: full_match.as_str().to_string(),
1871            });
1872        }
1873
1874        html_tags
1875    }
1876
1877    /// Parse emphasis spans in the content
1878    fn parse_emphasis_spans(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<EmphasisSpan> {
1879        lazy_static! {
1880            static ref EMPHASIS_REGEX: regex::Regex =
1881                regex::Regex::new(r"(\*{1,3}|_{1,3})([^*_\s][^*_]*?)(\*{1,3}|_{1,3})").unwrap();
1882        }
1883
1884        let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
1885
1886        for cap in EMPHASIS_REGEX.captures_iter(content) {
1887            let full_match = cap.get(0).unwrap();
1888            let match_start = full_match.start();
1889            let match_end = full_match.end();
1890
1891            // Skip if in code block
1892            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
1893                continue;
1894            }
1895
1896            let opening_markers = cap.get(1).unwrap().as_str();
1897            let content_part = cap.get(2).unwrap().as_str();
1898            let closing_markers = cap.get(3).unwrap().as_str();
1899
1900            // Validate matching markers
1901            if opening_markers.chars().next() != closing_markers.chars().next()
1902                || opening_markers.len() != closing_markers.len()
1903            {
1904                continue;
1905            }
1906
1907            let marker = opening_markers.chars().next().unwrap();
1908            let marker_count = opening_markers.len();
1909
1910            // Find which line this emphasis is on
1911            let mut line_num = 1;
1912            let mut col_start = match_start;
1913            let mut col_end = match_end;
1914            for (idx, line_info) in lines.iter().enumerate() {
1915                if match_start >= line_info.byte_offset {
1916                    line_num = idx + 1;
1917                    col_start = match_start - line_info.byte_offset;
1918                    col_end = match_end - line_info.byte_offset;
1919                } else {
1920                    break;
1921                }
1922            }
1923
1924            emphasis_spans.push(EmphasisSpan {
1925                line: line_num,
1926                start_col: col_start,
1927                end_col: col_end,
1928                byte_offset: match_start,
1929                byte_end: match_end,
1930                marker,
1931                marker_count,
1932                content: content_part.to_string(),
1933            });
1934        }
1935
1936        emphasis_spans
1937    }
1938
1939    /// Parse table rows in the content
1940    fn parse_table_rows(lines: &[LineInfo]) -> Vec<TableRow> {
1941        let mut table_rows = Vec::with_capacity(lines.len() / 20);
1942
1943        for (line_idx, line_info) in lines.iter().enumerate() {
1944            // Skip lines in code blocks or blank lines
1945            if line_info.in_code_block || line_info.is_blank {
1946                continue;
1947            }
1948
1949            let line = &line_info.content;
1950            let line_num = line_idx + 1;
1951
1952            // Check if this line contains pipes (potential table row)
1953            if !line.contains('|') {
1954                continue;
1955            }
1956
1957            // Count columns by splitting on pipes
1958            let parts: Vec<&str> = line.split('|').collect();
1959            let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
1960
1961            // Check if this is a separator row
1962            let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
1963            let mut column_alignments = Vec::new();
1964
1965            if is_separator {
1966                for part in &parts[1..parts.len() - 1] {
1967                    // Skip first and last empty parts
1968                    let trimmed = part.trim();
1969                    let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
1970                        "center".to_string()
1971                    } else if trimmed.ends_with(':') {
1972                        "right".to_string()
1973                    } else if trimmed.starts_with(':') {
1974                        "left".to_string()
1975                    } else {
1976                        "none".to_string()
1977                    };
1978                    column_alignments.push(alignment);
1979                }
1980            }
1981
1982            table_rows.push(TableRow {
1983                line: line_num,
1984                is_separator,
1985                column_count,
1986                column_alignments,
1987            });
1988        }
1989
1990        table_rows
1991    }
1992
1993    /// Parse bare URLs and emails in the content
1994    fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
1995        let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
1996
1997        // Check for bare URLs (not in angle brackets or markdown links)
1998        for cap in BARE_URL_PATTERN.captures_iter(content) {
1999            let full_match = cap.get(0).unwrap();
2000            let match_start = full_match.start();
2001            let match_end = full_match.end();
2002
2003            // Skip if in code block
2004            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2005                continue;
2006            }
2007
2008            // Skip if already in angle brackets or markdown links
2009            let preceding_char = if match_start > 0 {
2010                content.chars().nth(match_start - 1)
2011            } else {
2012                None
2013            };
2014            let following_char = content.chars().nth(match_end);
2015
2016            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2017                continue;
2018            }
2019            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2020                continue;
2021            }
2022
2023            let url = full_match.as_str();
2024            let url_type = if url.starts_with("https://") {
2025                "https"
2026            } else if url.starts_with("http://") {
2027                "http"
2028            } else if url.starts_with("ftp://") {
2029                "ftp"
2030            } else {
2031                "other"
2032            };
2033
2034            // Find which line this URL is on
2035            let mut line_num = 1;
2036            let mut col_start = match_start;
2037            let mut col_end = match_end;
2038            for (idx, line_info) in lines.iter().enumerate() {
2039                if match_start >= line_info.byte_offset {
2040                    line_num = idx + 1;
2041                    col_start = match_start - line_info.byte_offset;
2042                    col_end = match_end - line_info.byte_offset;
2043                } else {
2044                    break;
2045                }
2046            }
2047
2048            bare_urls.push(BareUrl {
2049                line: line_num,
2050                start_col: col_start,
2051                end_col: col_end,
2052                byte_offset: match_start,
2053                byte_end: match_end,
2054                url: url.to_string(),
2055                url_type: url_type.to_string(),
2056            });
2057        }
2058
2059        // Check for bare email addresses
2060        for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
2061            let full_match = cap.get(0).unwrap();
2062            let match_start = full_match.start();
2063            let match_end = full_match.end();
2064
2065            // Skip if in code block
2066            if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
2067                continue;
2068            }
2069
2070            // Skip if already in angle brackets or markdown links
2071            let preceding_char = if match_start > 0 {
2072                content.chars().nth(match_start - 1)
2073            } else {
2074                None
2075            };
2076            let following_char = content.chars().nth(match_end);
2077
2078            if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
2079                continue;
2080            }
2081            if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
2082                continue;
2083            }
2084
2085            let email = full_match.as_str();
2086
2087            // Find which line this email is on
2088            let mut line_num = 1;
2089            let mut col_start = match_start;
2090            let mut col_end = match_end;
2091            for (idx, line_info) in lines.iter().enumerate() {
2092                if match_start >= line_info.byte_offset {
2093                    line_num = idx + 1;
2094                    col_start = match_start - line_info.byte_offset;
2095                    col_end = match_end - line_info.byte_offset;
2096                } else {
2097                    break;
2098                }
2099            }
2100
2101            bare_urls.push(BareUrl {
2102                line: line_num,
2103                start_col: col_start,
2104                end_col: col_end,
2105                byte_offset: match_start,
2106                byte_end: match_end,
2107                url: email.to_string(),
2108                url_type: "email".to_string(),
2109            });
2110        }
2111
2112        bare_urls
2113    }
2114}
2115
2116/// Merge adjacent list blocks that should be treated as one
2117fn merge_adjacent_list_blocks(list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
2118    if list_blocks.len() < 2 {
2119        return;
2120    }
2121
2122    let mut merger = ListBlockMerger::new(lines);
2123    *list_blocks = merger.merge(list_blocks);
2124}
2125
2126/// Helper struct to manage the complex logic of merging list blocks
2127struct ListBlockMerger<'a> {
2128    lines: &'a [LineInfo],
2129}
2130
2131impl<'a> ListBlockMerger<'a> {
2132    fn new(lines: &'a [LineInfo]) -> Self {
2133        Self { lines }
2134    }
2135
2136    fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
2137        let mut merged = Vec::with_capacity(list_blocks.len());
2138        let mut current = list_blocks[0].clone();
2139
2140        for next in list_blocks.iter().skip(1) {
2141            if self.should_merge_blocks(&current, next) {
2142                current = self.merge_two_blocks(current, next);
2143            } else {
2144                merged.push(current);
2145                current = next.clone();
2146            }
2147        }
2148
2149        merged.push(current);
2150        merged
2151    }
2152
2153    /// Determine if two adjacent list blocks should be merged
2154    fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
2155        // Basic compatibility checks
2156        if !self.blocks_are_compatible(current, next) {
2157            return false;
2158        }
2159
2160        // Check spacing and content between blocks
2161        let spacing = self.analyze_spacing_between(current, next);
2162        match spacing {
2163            BlockSpacing::Consecutive => true,
2164            BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
2165            BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
2166                self.can_merge_with_content_between(current, next)
2167            }
2168        }
2169    }
2170
2171    /// Check if blocks have compatible structure for merging
2172    fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
2173        current.is_ordered == next.is_ordered
2174            && current.blockquote_prefix == next.blockquote_prefix
2175            && current.nesting_level == next.nesting_level
2176    }
2177
2178    /// Analyze the spacing between two list blocks
2179    fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
2180        let gap = next.start_line - current.end_line;
2181
2182        match gap {
2183            1 => BlockSpacing::Consecutive,
2184            2 => BlockSpacing::SingleBlank,
2185            _ if gap > 2 => {
2186                if self.has_only_blank_lines_between(current, next) {
2187                    BlockSpacing::MultipleBlanks
2188                } else {
2189                    BlockSpacing::ContentBetween
2190                }
2191            }
2192            _ => BlockSpacing::Consecutive, // gap == 0, overlapping (shouldn't happen)
2193        }
2194    }
2195
2196    /// Check if unordered lists can be merged with a single blank line between
2197    fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2198        // Check if there are structural separators between the blocks
2199        // If has_meaningful_content_between returns true, it means there are structural separators
2200        if has_meaningful_content_between(current, next, self.lines) {
2201            return false; // Structural separators prevent merging
2202        }
2203
2204        // Only merge unordered lists with same marker across single blank
2205        !current.is_ordered && current.marker == next.marker
2206    }
2207
2208    /// Check if ordered lists can be merged when there's content between them
2209    fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2210        // Do not merge lists if there are structural separators between them
2211        if has_meaningful_content_between(current, next, self.lines) {
2212            return false; // Structural separators prevent merging
2213        }
2214
2215        // Only consider merging ordered lists if there's no structural content between
2216        current.is_ordered && next.is_ordered
2217    }
2218
2219    /// Check if there are only blank lines between blocks
2220    fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
2221        for line_num in (current.end_line + 1)..next.start_line {
2222            if let Some(line_info) = self.lines.get(line_num - 1)
2223                && !line_info.content.trim().is_empty()
2224            {
2225                return false;
2226            }
2227        }
2228        true
2229    }
2230
2231    /// Merge two compatible list blocks into one
2232    fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
2233        current.end_line = next.end_line;
2234        current.item_lines.extend_from_slice(&next.item_lines);
2235
2236        // Update max marker width
2237        current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
2238
2239        // Handle marker consistency for unordered lists
2240        if !current.is_ordered && self.markers_differ(&current, next) {
2241            current.marker = None; // Mixed markers
2242        }
2243
2244        current
2245    }
2246
2247    /// Check if two blocks have different markers
2248    fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
2249        current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
2250    }
2251}
2252
/// How two neighbouring list blocks are separated from each other.
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    /// No gap between blocks.
    Consecutive,
    /// One blank line between blocks.
    SingleBlank,
    /// Multiple blank lines but no content.
    MultipleBlanks,
    /// Content exists between blocks.
    ContentBetween,
}
2261
2262/// Check if there's meaningful content (not just blank lines) between two list blocks
2263fn has_meaningful_content_between(current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
2264    // Check lines between current.end_line and next.start_line
2265    for line_num in (current.end_line + 1)..next.start_line {
2266        if let Some(line_info) = lines.get(line_num - 1) {
2267            // Convert to 0-indexed
2268            let trimmed = line_info.content.trim();
2269
2270            // Skip empty lines
2271            if trimmed.is_empty() {
2272                continue;
2273            }
2274
2275            // Check for structural separators that should separate lists (CommonMark compliant)
2276
2277            // Headings separate lists
2278            if line_info.heading.is_some() {
2279                return true; // Has meaningful content - headings separate lists
2280            }
2281
2282            // Horizontal rules separate lists (---, ***, ___)
2283            if is_horizontal_rule(trimmed) {
2284                return true; // Has meaningful content - horizontal rules separate lists
2285            }
2286
2287            // Tables separate lists (lines containing |)
2288            if trimmed.contains('|') && trimmed.len() > 1 {
2289                return true; // Has meaningful content - tables separate lists
2290            }
2291
2292            // Blockquotes separate lists
2293            if trimmed.starts_with('>') {
2294                return true; // Has meaningful content - blockquotes separate lists
2295            }
2296
2297            // Code block fences separate lists (unless properly indented as list content)
2298            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
2299                let line_indent = line_info.content.len() - line_info.content.trim_start().len();
2300
2301                // Check if this code block is properly indented as list continuation
2302                let min_continuation_indent = if current.is_ordered {
2303                    current.nesting_level + current.max_marker_width + 1 // +1 for space after marker
2304                } else {
2305                    current.nesting_level + 2
2306                };
2307
2308                if line_indent < min_continuation_indent {
2309                    // This is a standalone code block that separates lists
2310                    return true; // Has meaningful content - standalone code blocks separate lists
2311                }
2312            }
2313
2314            // Check if this line has proper indentation for list continuation
2315            let line_indent = line_info.content.len() - line_info.content.trim_start().len();
2316
2317            // Calculate minimum indentation needed to be list continuation
2318            let min_indent = if current.is_ordered {
2319                current.nesting_level + current.max_marker_width
2320            } else {
2321                current.nesting_level + 2
2322            };
2323
2324            // If the line is not indented enough to be list continuation, it's meaningful content
2325            if line_indent < min_indent {
2326                return true; // Has meaningful content - content not indented as list continuation
2327            }
2328
2329            // If we reach here, the line is properly indented as list continuation
2330            // Continue checking other lines
2331        }
2332    }
2333
2334    // Only blank lines or properly indented list continuation content between blocks
2335    false
2336}
2337
/// Check if a line is a horizontal rule (`---`, `***`, `___`).
///
/// `trimmed` is expected to have surrounding whitespace already removed. The
/// line qualifies when its first character is `-`, `*` or `_`, that same
/// character occurs at least three times, and every other character is a
/// space or a tab (so `- - -` is accepted, `--*` and `-a-` are not).
fn is_horizontal_rule(trimmed: &str) -> bool {
    // Fewer than three bytes can never hold three marker characters.
    if trimmed.len() < 3 {
        return false;
    }

    let marker = match trimmed.chars().next() {
        Some(ch @ ('-' | '*' | '_')) => ch,
        _ => return false,
    };

    let mut marker_count = 0usize;
    for ch in trimmed.chars() {
        if ch == marker {
            marker_count += 1;
        } else if ch != ' ' && ch != '\t' {
            // Non-matching, non-whitespace character
            return false;
        }
    }

    marker_count >= 3
}
2361
/// Unit tests for `LintContext`: offset bookkeeping, per-line information and
/// list item detection.
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input still has one line offset but no line entries.
    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("");

        assert_eq!(ctx.content, "");
        assert_eq!(ctx.lines.len(), 0);
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
    }

    /// A single line without a trailing newline.
    #[test]
    fn test_single_line() {
        let ctx = LintContext::new("# Hello");

        assert_eq!(ctx.content, "# Hello");
        assert_eq!(ctx.line_offsets, vec![0]);

        for (offset, position) in [(0, (1, 1)), (3, (1, 4))] {
            assert_eq!(ctx.offset_to_line_col(offset), position, "offset {offset}");
        }
    }

    /// Offsets on several lines, including a blank one, map to 1-based (line, col).
    #[test]
    fn test_multi_line() {
        let ctx = LintContext::new("# Title\n\nSecond line\nThird line");
        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);

        let cases = [
            (0, (1, 1)),  // start
            (8, (2, 1)),  // start of blank line
            (9, (3, 1)),  // start of 'Second line'
            (15, (3, 7)), // middle of 'Second line'
            (21, (4, 1)), // start of 'Third line'
        ];
        for (offset, position) in cases {
            assert_eq!(ctx.offset_to_line_col(offset), position, "offset {offset}");
        }
    }

    /// Per-line metadata and the line lookup helpers.
    #[test]
    fn test_line_info() {
        let ctx = LintContext::new("# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```");

        assert_eq!(ctx.lines.len(), 7);

        // Line 1: "# Title"
        let title = &ctx.lines[0];
        assert_eq!(title.content, "# Title");
        assert_eq!(title.byte_offset, 0);
        assert_eq!(title.indent, 0);
        assert!(!title.is_blank);
        assert!(!title.in_code_block);
        assert!(title.list_item.is_none());

        // Line 2: "    indented"
        let indented = &ctx.lines[1];
        assert_eq!(indented.content, "    indented");
        assert_eq!(indented.byte_offset, 8);
        assert_eq!(indented.indent, 4);
        assert!(!indented.is_blank);

        // Line 3: "" (blank)
        let blank = &ctx.lines[2];
        assert_eq!(blank.content, "");
        assert!(blank.is_blank);

        // Helper methods take 1-based line numbers
        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
        assert_eq!(ctx.line_info(1).map(|l| l.indent), Some(0));
        assert_eq!(ctx.line_info(2).map(|l| l.indent), Some(4));
    }

    /// Ordered and unordered markers are recognised; plain text is not.
    #[test]
    fn test_list_item_detection() {
        let ctx =
            LintContext::new("- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list");

        // Line 1: "- Unordered item"
        let dash_item = ctx.lines[0]
            .list_item
            .as_ref()
            .expect("line 1 should be a list item");
        assert_eq!(dash_item.marker, "-");
        assert!(!dash_item.is_ordered);
        assert_eq!(dash_item.marker_column, 0);
        assert_eq!(dash_item.content_column, 2);

        // Line 2: "  * Nested item"
        let star_item = ctx.lines[1]
            .list_item
            .as_ref()
            .expect("line 2 should be a list item");
        assert_eq!(star_item.marker, "*");
        assert_eq!(star_item.marker_column, 2);

        // Line 3: "1. Ordered item"
        let numbered_item = ctx.lines[2]
            .list_item
            .as_ref()
            .expect("line 3 should be a list item");
        assert_eq!(numbered_item.marker, "1.");
        assert!(numbered_item.is_ordered);
        assert_eq!(numbered_item.number, Some(1));

        // Line 6: "Not a list"
        assert!(ctx.lines[5].list_item.is_none());
    }

    /// Offsets at line starts and just past the last character of each line.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let ctx = LintContext::new("a\nb\nc");

        // line_offsets: [0, 2, 4]
        let cases = [
            (0, (1, 1)), // 'a'
            (1, (1, 2)), // after 'a'
            (2, (2, 1)), // 'b'
            (3, (2, 2)), // after 'b'
            (4, (3, 1)), // 'c'
            (5, (3, 2)), // after 'c'
        ];
        for (offset, position) in cases {
            assert_eq!(ctx.offset_to_line_col(offset), position, "offset {offset}");
        }
    }
}
rumdl_lib/lint_context.rs

rumdl_lib/
lint_context.rs