rumdl_lib/lint_context.rs
1use crate::config::MarkdownFlavor;
2use crate::rules::front_matter_utils::FrontMatterUtils;
3use crate::utils::code_block_utils::{CodeBlockContext, CodeBlockUtils};
4use crate::utils::element_cache::ElementCache;
5use crate::utils::regex_cache::URL_SIMPLE_REGEX;
6use pulldown_cmark::{BrokenLink, Event, LinkType, Options, Parser, Tag, TagEnd};
7use regex::Regex;
8use std::borrow::Cow;
9use std::path::PathBuf;
10use std::sync::LazyLock;
11
/// Times one section of `LintContext` construction (non-WASM builds).
///
/// Evaluates `$code`; when `$profile` is true the elapsed time is printed to stderr
/// labelled with `$name`. The macro evaluates to the value of `$code`.
#[cfg(not(target_arch = "wasm32"))]
macro_rules! profile_section {
    ($name:expr, $profile:expr, $code:expr) => {{
        let timer = std::time::Instant::now();
        let value = $code;
        if $profile {
            eprintln!("[PROFILE] {}: {:?}", $name, timer.elapsed());
        }
        value
    }};
}

/// WASM variant of `profile_section!`: no timing, it only evaluates `$code`.
#[cfg(target_arch = "wasm32")]
macro_rules! profile_section {
    ($name:expr, $profile:expr, $code:expr) => {{ $code }};
}
29
30// Comprehensive link pattern that captures both inline and reference links
31// Use (?s) flag to make . match newlines
32static LINK_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
33 Regex::new(
34 r#"(?sx)
35 \[((?:[^\[\]\\]|\\.)*)\] # Link text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
36 (?:
37 \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\) # URL in group 2 (angle) or 3 (bare), title in 4/5
38 |
39 \[([^\]]*)\] # Reference ID in group 6
40 )"#
41 ).unwrap()
42});
43
44// Image pattern (similar to links but with ! prefix)
45// Use (?s) flag to make . match newlines
46static IMAGE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
47 Regex::new(
48 r#"(?sx)
49 !\[((?:[^\[\]\\]|\\.)*)\] # Alt text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
50 (?:
51 \((?:<([^<>\n]*)>|([^)"']*))(?:\s+(?:"([^"]*)"|'([^']*)'))?\) # URL in group 2 (angle) or 3 (bare), title in 4/5
52 |
53 \[([^\]]*)\] # Reference ID in group 6
54 )"#
55 ).unwrap()
56});
57
58// Reference definition pattern
59static REF_DEF_PATTERN: LazyLock<Regex> =
60 LazyLock::new(|| Regex::new(r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#).unwrap());
61
// Bare URLs have no pattern of their own here: the shared `URL_SIMPLE_REGEX` from `regex_cache` is used.
63
64// Pattern for email addresses
65static BARE_EMAIL_PATTERN: LazyLock<Regex> =
66 LazyLock::new(|| Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap());
67
68// Pattern for blockquote prefix in parse_list_blocks
69static BLOCKQUOTE_PREFIX_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*>+\s*)").unwrap());
70
71/// Pre-computed information about a line
72#[derive(Debug, Clone)]
73pub struct LineInfo {
74 /// Byte offset where this line starts in the document
75 pub byte_offset: usize,
76 /// Length of the line in bytes (without newline)
77 pub byte_len: usize,
78 /// Number of bytes of leading whitespace (for substring extraction)
79 pub indent: usize,
80 /// Visual column width of leading whitespace (with proper tab expansion)
81 /// Per CommonMark, tabs expand to the next column that is a multiple of 4.
82 /// Use this for numeric comparisons like checking for indented code blocks (>= 4).
83 pub visual_indent: usize,
84 /// Whether the line is blank (empty or only whitespace)
85 pub is_blank: bool,
86 /// Whether this line is inside a code block
87 pub in_code_block: bool,
88 /// Whether this line is inside front matter
89 pub in_front_matter: bool,
90 /// Whether this line is inside an HTML block
91 pub in_html_block: bool,
92 /// Whether this line is inside an HTML comment
93 pub in_html_comment: bool,
94 /// List item information if this line starts a list item
95 pub list_item: Option<ListItemInfo>,
96 /// Heading information if this line is a heading
97 pub heading: Option<HeadingInfo>,
98 /// Blockquote information if this line is a blockquote
99 pub blockquote: Option<BlockquoteInfo>,
100 /// Whether this line is inside a mkdocstrings autodoc block
101 pub in_mkdocstrings: bool,
102 /// Whether this line is part of an ESM import/export block (MDX only)
103 pub in_esm_block: bool,
104 /// Whether this line is a continuation of a multi-line code span from a previous line
105 pub in_code_span_continuation: bool,
106 /// Whether this line is a horizontal rule (---, ***, ___, etc.)
107 /// Pre-computed for consistent detection across all rules
108 pub is_horizontal_rule: bool,
109 /// Whether this line is inside a math block ($$ ... $$)
110 pub in_math_block: bool,
111}
112
113impl LineInfo {
114 /// Get the line content as a string slice from the source document
115 pub fn content<'a>(&self, source: &'a str) -> &'a str {
116 &source[self.byte_offset..self.byte_offset + self.byte_len]
117 }
118}
119
/// Details of a single list item
#[derive(Clone, Debug)]
pub struct ListItemInfo {
    /// Marker text: `*`, `-`, `+`, or a number followed by `.` or `)`
    pub marker: String,
    /// `true` for ordered items, `false` for unordered ones
    pub is_ordered: bool,
    /// Item number (ordered lists)
    pub number: Option<usize>,
    /// 0-based column at which the marker begins
    pub marker_column: usize,
    /// Column at which the content following the marker begins
    pub content_column: usize,
}
134
/// Heading style type
///
/// Derives `Eq` alongside `PartialEq`: the enum has no fields, so equality is total
/// and the type can be used wherever a full equivalence relation is required.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum HeadingStyle {
    /// ATX style heading (# Heading)
    ATX,
    /// Setext style heading with = underline
    Setext1,
    /// Setext style heading with - underline
    Setext2,
}
145
146/// Parsed link information
147#[derive(Debug, Clone)]
148pub struct ParsedLink<'a> {
149 /// Line number (1-indexed)
150 pub line: usize,
151 /// Start column (0-indexed) in the line
152 pub start_col: usize,
153 /// End column (0-indexed) in the line
154 pub end_col: usize,
155 /// Byte offset in document
156 pub byte_offset: usize,
157 /// End byte offset in document
158 pub byte_end: usize,
159 /// Link text
160 pub text: Cow<'a, str>,
161 /// Link URL or reference
162 pub url: Cow<'a, str>,
163 /// Whether this is a reference link [text][ref] vs inline [text](url)
164 pub is_reference: bool,
165 /// Reference ID for reference links
166 pub reference_id: Option<Cow<'a, str>>,
167 /// Link type from pulldown-cmark
168 pub link_type: LinkType,
169}
170
/// A broken link as reported by pulldown-cmark
#[derive(Clone, Debug)]
pub struct BrokenLinkInfo {
    /// Reference text that could not be resolved
    pub reference: String,
    /// Byte range of the link in the source document
    pub span: std::ops::Range<usize>,
}
179
/// A footnote reference such as `[^1]` or `[^note]`
#[derive(Clone, Debug)]
pub struct FootnoteRef {
    /// Footnote ID with the `^` prefix removed
    pub id: String,
    /// 1-indexed line number
    pub line: usize,
    /// Start byte offset in the document
    pub byte_offset: usize,
    /// End byte offset in the document
    pub byte_end: usize,
}
192
193/// Parsed image information
194#[derive(Debug, Clone)]
195pub struct ParsedImage<'a> {
196 /// Line number (1-indexed)
197 pub line: usize,
198 /// Start column (0-indexed) in the line
199 pub start_col: usize,
200 /// End column (0-indexed) in the line
201 pub end_col: usize,
202 /// Byte offset in document
203 pub byte_offset: usize,
204 /// End byte offset in document
205 pub byte_end: usize,
206 /// Alt text
207 pub alt_text: Cow<'a, str>,
208 /// Image URL or reference
209 pub url: Cow<'a, str>,
210 /// Whether this is a reference image ![alt][ref] vs inline 
211 pub is_reference: bool,
212 /// Reference ID for reference images
213 pub reference_id: Option<Cow<'a, str>>,
214 /// Link type from pulldown-cmark
215 pub link_type: LinkType,
216}
217
/// A reference definition of the form `[ref]: url "title"`
#[derive(Clone, Debug)]
pub struct ReferenceDef {
    /// 1-indexed line number
    pub line: usize,
    /// Reference ID, normalized to lowercase
    pub id: String,
    /// Target URL
    pub url: String,
    /// Title, when one is given
    pub title: Option<String>,
    /// Byte offset at which the definition begins
    pub byte_offset: usize,
    /// Byte offset at which the definition ends
    pub byte_end: usize,
    /// Byte offset at which the title begins, quote included (when a title is present)
    pub title_byte_start: Option<usize>,
    /// Byte offset at which the title ends, quote included (when a title is present)
    pub title_byte_end: Option<usize>,
}
238
/// An inline code span
#[derive(Clone, Debug)]
pub struct CodeSpan {
    /// 1-indexed line on which the span starts
    pub line: usize,
    /// 1-indexed line on which the span ends
    pub end_line: usize,
    /// 0-indexed start column within the line
    pub start_col: usize,
    /// 0-indexed end column within the line
    pub end_col: usize,
    /// Byte offset in the document
    pub byte_offset: usize,
    /// End byte offset in the document
    pub byte_end: usize,
    /// How many backticks delimit the span (1, 2, 3, etc.)
    pub backtick_count: usize,
    /// Text between the backticks (backticks excluded)
    pub content: String,
}
259
260/// Information about a heading
261#[derive(Debug, Clone)]
262pub struct HeadingInfo {
263 /// Heading level (1-6 for ATX, 1-2 for Setext)
264 pub level: u8,
265 /// Style of heading
266 pub style: HeadingStyle,
267 /// The heading marker (# characters or underline)
268 pub marker: String,
269 /// Column where the marker starts (0-based)
270 pub marker_column: usize,
271 /// Column where heading text starts
272 pub content_column: usize,
273 /// The heading text (without markers and without custom ID syntax)
274 pub text: String,
275 /// Custom header ID if present (e.g., from {#custom-id} syntax)
276 pub custom_id: Option<String>,
277 /// Original heading text including custom ID syntax
278 pub raw_text: String,
279 /// Whether it has a closing sequence (for ATX)
280 pub has_closing_sequence: bool,
281 /// The closing sequence if present
282 pub closing_sequence: String,
283 /// Whether this is a valid CommonMark heading (ATX headings require space after #)
284 /// False for malformed headings like `#NoSpace` that MD018 should flag
285 pub is_valid: bool,
286}
287
288/// A valid heading from a filtered iteration
289///
290/// Only includes headings that are CommonMark-compliant (have space after #).
291/// Hashtag-like patterns (`#tag`, `#123`) are excluded.
292#[derive(Debug, Clone)]
293pub struct ValidHeading<'a> {
294 /// The 1-indexed line number in the document
295 pub line_num: usize,
296 /// Reference to the heading information
297 pub heading: &'a HeadingInfo,
298 /// Reference to the full line info (for rules that need additional context)
299 pub line_info: &'a LineInfo,
300}
301
302/// Iterator over valid CommonMark headings in a document
303///
304/// Filters out malformed headings like `#NoSpace` that should be flagged by MD018
305/// but should not be processed by other heading rules.
306pub struct ValidHeadingsIter<'a> {
307 lines: &'a [LineInfo],
308 current_index: usize,
309}
310
311impl<'a> ValidHeadingsIter<'a> {
312 fn new(lines: &'a [LineInfo]) -> Self {
313 Self {
314 lines,
315 current_index: 0,
316 }
317 }
318}
319
320impl<'a> Iterator for ValidHeadingsIter<'a> {
321 type Item = ValidHeading<'a>;
322
323 fn next(&mut self) -> Option<Self::Item> {
324 while self.current_index < self.lines.len() {
325 let idx = self.current_index;
326 self.current_index += 1;
327
328 let line_info = &self.lines[idx];
329 if let Some(heading) = &line_info.heading
330 && heading.is_valid
331 {
332 return Some(ValidHeading {
333 line_num: idx + 1, // Convert 0-indexed to 1-indexed
334 heading,
335 line_info,
336 });
337 }
338 }
339 None
340 }
341}
342
/// Details of a blockquote line
#[derive(Clone, Debug)]
pub struct BlockquoteInfo {
    /// Depth of nesting (1 for >, 2 for >>, etc.)
    pub nesting_level: usize,
    /// Indentation preceding the blockquote marker
    pub indent: String,
    /// 0-based column of the first `>`
    pub marker_column: usize,
    /// Blockquote prefix (e.g., "> ", ">> ", etc.)
    pub prefix: String,
    /// Text following the blockquote marker(s)
    pub content: String,
    /// True when no space follows the marker
    pub has_no_space_after_marker: bool,
    /// True when several spaces follow the marker
    pub has_multiple_spaces_after_marker: bool,
    /// True when this is an empty blockquote line that needs the MD028 fix
    pub needs_md028_fix: bool,
}
363
/// Details of a list block
#[derive(Clone, Debug)]
pub struct ListBlock {
    /// 1-indexed line on which the list starts
    pub start_line: usize,
    /// 1-indexed line on which the list ends
    pub end_line: usize,
    /// Ordered (`true`) or unordered (`false`)
    pub is_ordered: bool,
    /// Consistent marker for unordered lists, if there is one
    pub marker: Option<String>,
    /// Blockquote prefix of the list (empty outside a blockquote)
    pub blockquote_prefix: String,
    /// Lines within this block that are list items
    pub item_lines: Vec<usize>,
    /// Depth of nesting (0 for top-level lists)
    pub nesting_level: usize,
    /// Widest marker seen in this block (e.g., 3 for "1. ", 4 for "10. ")
    pub max_marker_width: usize,
}
384
385use std::sync::{Arc, OnceLock};
386
/// List item data keyed by the byte offset of the line.
/// Tuple layout: (is_ordered, marker, marker_column, content_column, number)
type ListItemMap = std::collections::HashMap<
    usize,
    (bool, String, usize, usize, Option<usize>),
>;
389
/// Per-character counts used for fast content analysis
#[derive(Clone, Debug, Default)]
pub struct CharFrequency {
    /// Number of `#` characters (headings)
    pub hash_count: usize,
    /// Number of `*` characters (emphasis, lists, horizontal rules)
    pub asterisk_count: usize,
    /// Number of `_` characters (emphasis, horizontal rules)
    pub underscore_count: usize,
    /// Number of `-` characters (lists, horizontal rules, setext headings)
    pub hyphen_count: usize,
    /// Number of `+` characters (lists)
    pub plus_count: usize,
    /// Number of `>` characters (blockquotes)
    pub gt_count: usize,
    /// Number of `|` characters (tables)
    pub pipe_count: usize,
    /// Number of `[` characters (links, images)
    pub bracket_count: usize,
    /// Number of backtick characters (code spans, code blocks)
    pub backtick_count: usize,
    /// Number of `<` characters (HTML tags, autolinks)
    pub lt_count: usize,
    /// Number of `!` characters (images)
    pub exclamation_count: usize,
    /// Number of newline characters
    pub newline_count: usize,
}
418
/// An HTML tag parsed ahead of time
#[derive(Clone, Debug)]
pub struct HtmlTag {
    /// 1-indexed line number
    pub line: usize,
    /// 0-indexed start column within the line
    pub start_col: usize,
    /// 0-indexed end column within the line
    pub end_col: usize,
    /// Byte offset in the document
    pub byte_offset: usize,
    /// End byte offset in the document
    pub byte_end: usize,
    /// Name of the tag (e.g., "div", "img", "br")
    pub tag_name: String,
    /// True for a closing tag (`</tag>`)
    pub is_closing: bool,
    /// True for a self-closing tag (`<tag />`)
    pub is_self_closing: bool,
    /// The tag as written
    pub raw_content: String,
}
441
/// An emphasis span parsed ahead of time
#[derive(Clone, Debug)]
pub struct EmphasisSpan {
    /// 1-indexed line number
    pub line: usize,
    /// 0-indexed start column within the line
    pub start_col: usize,
    /// 0-indexed end column within the line
    pub end_col: usize,
    /// Byte offset in the document
    pub byte_offset: usize,
    /// End byte offset in the document
    pub byte_end: usize,
    /// Emphasis character ('*' or '_')
    pub marker: char,
    /// How many markers are used (1 for italic, 2 for bold, 3+ for bold+italic)
    pub marker_count: usize,
    /// Text enclosed by the emphasis
    pub content: String,
}
462
/// A table row parsed ahead of time
#[derive(Clone, Debug)]
pub struct TableRow {
    /// 1-indexed line number
    pub line: usize,
    /// True for a separator row (only |, -, :, and spaces)
    pub is_separator: bool,
    /// How many columns (pipe-separated cells) the row has
    pub column_count: usize,
    /// Alignment taken from the separator row: "left", "center", "right", "none"
    pub column_alignments: Vec<String>,
}
475
/// A bare URL (one that is not part of a link) parsed ahead of time
#[derive(Clone, Debug)]
pub struct BareUrl {
    /// 1-indexed line number
    pub line: usize,
    /// 0-indexed start column within the line
    pub start_col: usize,
    /// 0-indexed end column within the line
    pub end_col: usize,
    /// Byte offset in the document
    pub byte_offset: usize,
    /// End byte offset in the document
    pub byte_end: usize,
    /// URL text
    pub url: String,
    /// Kind of URL ("http", "https", "ftp", "email")
    pub url_type: String,
}
494
495pub struct LintContext<'a> {
496 pub content: &'a str,
497 pub line_offsets: Vec<usize>,
498 pub code_blocks: Vec<(usize, usize)>, // Cached code block ranges (not including inline code spans)
499 pub lines: Vec<LineInfo>, // Pre-computed line information
500 pub links: Vec<ParsedLink<'a>>, // Pre-parsed links
501 pub images: Vec<ParsedImage<'a>>, // Pre-parsed images
502 pub broken_links: Vec<BrokenLinkInfo>, // Broken/undefined references
503 pub footnote_refs: Vec<FootnoteRef>, // Pre-parsed footnote references
504 pub reference_defs: Vec<ReferenceDef>, // Reference definitions
505 code_spans_cache: OnceLock<Arc<Vec<CodeSpan>>>, // Lazy-loaded inline code spans
506 pub list_blocks: Vec<ListBlock>, // Pre-parsed list blocks
507 pub char_frequency: CharFrequency, // Character frequency analysis
508 html_tags_cache: OnceLock<Arc<Vec<HtmlTag>>>, // Lazy-loaded HTML tags
509 emphasis_spans_cache: OnceLock<Arc<Vec<EmphasisSpan>>>, // Lazy-loaded emphasis spans
510 table_rows_cache: OnceLock<Arc<Vec<TableRow>>>, // Lazy-loaded table rows
511 bare_urls_cache: OnceLock<Arc<Vec<BareUrl>>>, // Lazy-loaded bare URLs
512 has_mixed_list_nesting_cache: OnceLock<bool>, // Cached result for mixed ordered/unordered list nesting detection
513 html_comment_ranges: Vec<crate::utils::skip_context::ByteRange>, // Pre-computed HTML comment ranges
514 pub table_blocks: Vec<crate::utils::table_utils::TableBlock>, // Pre-computed table blocks
515 pub line_index: crate::utils::range_utils::LineIndex<'a>, // Pre-computed line index for byte position calculations
516 jinja_ranges: Vec<(usize, usize)>, // Pre-computed Jinja template ranges ({{ }}, {% %})
517 pub flavor: MarkdownFlavor, // Markdown flavor being used
518 pub source_file: Option<PathBuf>, // Source file path (for rules that need file context)
519}
520
/// The four consecutive pieces of a blockquote line, each borrowed from the input
struct BlockquoteComponents<'a> {
    /// Spaces/tabs before the first `>`
    indent: &'a str,
    /// The run of consecutive `>` characters
    markers: &'a str,
    /// Spaces/tabs directly after the markers
    spaces_after: &'a str,
    /// Remainder of the line
    content: &'a str,
}

/// Split `line` into indent, `>` markers, whitespace after the markers, and content.
///
/// Returns `None` when the first character after the leading spaces/tabs is not `>`
/// (including empty and whitespace-only lines). Only an unbroken run of `>` counts as
/// markers, so in `> > x` the second `>` is part of the content.
#[inline]
fn parse_blockquote_detailed(line: &str) -> Option<BlockquoteComponents<'_>> {
    let is_gap = |c: char| c == ' ' || c == '\t';

    // Peel each component off the front; the length difference gives the piece removed.
    let after_indent = line.trim_start_matches(is_gap);
    let indent = &line[..line.len() - after_indent.len()];

    let after_markers = after_indent.trim_start_matches('>');
    let markers = &after_indent[..after_indent.len() - after_markers.len()];
    if markers.is_empty() {
        return None;
    }

    let content = after_markers.trim_start_matches(is_gap);
    let spaces_after = &after_markers[..after_markers.len() - content.len()];

    Some(BlockquoteComponents {
        indent,
        markers,
        spaces_after,
        content,
    })
}
565
566impl<'a> LintContext<'a> {
567 pub fn new(content: &'a str, flavor: MarkdownFlavor, source_file: Option<PathBuf>) -> Self {
568 #[cfg(not(target_arch = "wasm32"))]
569 let profile = std::env::var("RUMDL_PROFILE_QUADRATIC").is_ok();
570 #[cfg(target_arch = "wasm32")]
571 let profile = false;
572
573 let line_offsets = profile_section!("Line offsets", profile, {
574 let mut offsets = vec![0];
575 for (i, c) in content.char_indices() {
576 if c == '\n' {
577 offsets.push(i + 1);
578 }
579 }
580 offsets
581 });
582
583 // Detect code blocks once and cache them
584 let code_blocks = profile_section!("Code blocks", profile, CodeBlockUtils::detect_code_blocks(content));
585
586 // Pre-compute HTML comment ranges ONCE for all operations
587 let html_comment_ranges = profile_section!(
588 "HTML comment ranges",
589 profile,
590 crate::utils::skip_context::compute_html_comment_ranges(content)
591 );
592
593 // Pre-compute autodoc block ranges for MkDocs flavor (avoids O(n²) scaling)
594 let autodoc_ranges = profile_section!("Autodoc block ranges", profile, {
595 if flavor == MarkdownFlavor::MkDocs {
596 crate::utils::mkdocstrings_refs::detect_autodoc_block_ranges(content)
597 } else {
598 Vec::new()
599 }
600 });
601
602 // Pre-compute line information AND emphasis spans (without headings/blockquotes yet)
603 // Emphasis spans are captured during the same pulldown-cmark parse as list detection
604 let (mut lines, emphasis_spans) = profile_section!(
605 "Basic line info",
606 profile,
607 Self::compute_basic_line_info(
608 content,
609 &line_offsets,
610 &code_blocks,
611 flavor,
612 &html_comment_ranges,
613 &autodoc_ranges,
614 )
615 );
616
617 // Detect HTML blocks BEFORE heading detection
618 profile_section!("HTML blocks", profile, Self::detect_html_blocks(content, &mut lines));
619
620 // Detect ESM import/export blocks in MDX files BEFORE heading detection
621 profile_section!(
622 "ESM blocks",
623 profile,
624 Self::detect_esm_blocks(content, &mut lines, flavor)
625 );
626
627 // Collect link byte ranges early for heading detection (to skip lines inside link syntax)
628 let link_byte_ranges = profile_section!("Link byte ranges", profile, Self::collect_link_byte_ranges(content));
629
630 // Now detect headings and blockquotes
631 profile_section!(
632 "Headings & blockquotes",
633 profile,
634 Self::detect_headings_and_blockquotes(content, &mut lines, flavor, &html_comment_ranges, &link_byte_ranges)
635 );
636
637 // Parse code spans early so we can exclude them from link/image parsing
638 let code_spans = profile_section!("Code spans", profile, Self::parse_code_spans(content, &lines));
639
640 // Mark lines that are continuations of multi-line code spans
641 // This is needed for parse_list_blocks to correctly handle list items with multi-line code spans
642 for span in &code_spans {
643 if span.end_line > span.line {
644 // Mark lines after the first line as continuations
645 for line_num in (span.line + 1)..=span.end_line {
646 if let Some(line_info) = lines.get_mut(line_num - 1) {
647 line_info.in_code_span_continuation = true;
648 }
649 }
650 }
651 }
652
653 // Parse links, images, references, and list blocks
654 let (links, broken_links, footnote_refs) = profile_section!(
655 "Links",
656 profile,
657 Self::parse_links(content, &lines, &code_blocks, &code_spans, flavor, &html_comment_ranges)
658 );
659
660 let images = profile_section!(
661 "Images",
662 profile,
663 Self::parse_images(content, &lines, &code_blocks, &code_spans, &html_comment_ranges)
664 );
665
666 let reference_defs = profile_section!("Reference defs", profile, Self::parse_reference_defs(content, &lines));
667
668 let list_blocks = profile_section!("List blocks", profile, Self::parse_list_blocks(content, &lines));
669
670 // Compute character frequency for fast content analysis
671 let char_frequency = profile_section!("Char frequency", profile, Self::compute_char_frequency(content));
672
673 // Pre-compute table blocks for rules that need them (MD013, MD055, MD056, MD058, MD060)
674 let table_blocks = profile_section!(
675 "Table blocks",
676 profile,
677 crate::utils::table_utils::TableUtils::find_table_blocks_with_code_info(
678 content,
679 &code_blocks,
680 &code_spans,
681 &html_comment_ranges,
682 )
683 );
684
685 // Pre-compute LineIndex once for all rules (eliminates 46x content cloning)
686 let line_index = profile_section!(
687 "Line index",
688 profile,
689 crate::utils::range_utils::LineIndex::new(content)
690 );
691
692 // Pre-compute Jinja template ranges once for all rules (eliminates O(n×m) in MD011)
693 let jinja_ranges = profile_section!(
694 "Jinja ranges",
695 profile,
696 crate::utils::jinja_utils::find_jinja_ranges(content)
697 );
698
699 Self {
700 content,
701 line_offsets,
702 code_blocks,
703 lines,
704 links,
705 images,
706 broken_links,
707 footnote_refs,
708 reference_defs,
709 code_spans_cache: OnceLock::from(Arc::new(code_spans)),
710 list_blocks,
711 char_frequency,
712 html_tags_cache: OnceLock::new(),
713 emphasis_spans_cache: OnceLock::from(Arc::new(emphasis_spans)),
714 table_rows_cache: OnceLock::new(),
715 bare_urls_cache: OnceLock::new(),
716 has_mixed_list_nesting_cache: OnceLock::new(),
717 html_comment_ranges,
718 table_blocks,
719 line_index,
720 jinja_ranges,
721 flavor,
722 source_file,
723 }
724 }
725
726 /// Get code spans - computed lazily on first access
727 pub fn code_spans(&self) -> Arc<Vec<CodeSpan>> {
728 Arc::clone(
729 self.code_spans_cache
730 .get_or_init(|| Arc::new(Self::parse_code_spans(self.content, &self.lines))),
731 )
732 }
733
734 /// Get HTML comment ranges - pre-computed during LintContext construction
735 pub fn html_comment_ranges(&self) -> &[crate::utils::skip_context::ByteRange] {
736 &self.html_comment_ranges
737 }
738
739 /// Get HTML tags - computed lazily on first access
740 pub fn html_tags(&self) -> Arc<Vec<HtmlTag>> {
741 Arc::clone(self.html_tags_cache.get_or_init(|| {
742 Arc::new(Self::parse_html_tags(
743 self.content,
744 &self.lines,
745 &self.code_blocks,
746 self.flavor,
747 ))
748 }))
749 }
750
751 /// Get emphasis spans - pre-computed during construction
752 pub fn emphasis_spans(&self) -> Arc<Vec<EmphasisSpan>> {
753 Arc::clone(
754 self.emphasis_spans_cache
755 .get()
756 .expect("emphasis_spans_cache initialized during construction"),
757 )
758 }
759
760 /// Get table rows - computed lazily on first access
761 pub fn table_rows(&self) -> Arc<Vec<TableRow>> {
762 Arc::clone(
763 self.table_rows_cache
764 .get_or_init(|| Arc::new(Self::parse_table_rows(self.content, &self.lines))),
765 )
766 }
767
768 /// Get bare URLs - computed lazily on first access
769 pub fn bare_urls(&self) -> Arc<Vec<BareUrl>> {
770 Arc::clone(
771 self.bare_urls_cache
772 .get_or_init(|| Arc::new(Self::parse_bare_urls(self.content, &self.lines, &self.code_blocks))),
773 )
774 }
775
776 /// Check if document has mixed ordered/unordered list nesting.
777 /// Result is cached after first computation (document-level invariant).
778 /// This is used by MD007 for smart style auto-detection.
779 pub fn has_mixed_list_nesting(&self) -> bool {
780 *self
781 .has_mixed_list_nesting_cache
782 .get_or_init(|| self.compute_mixed_list_nesting())
783 }
784
785 /// Internal computation for mixed list nesting (only called once per LintContext).
786 fn compute_mixed_list_nesting(&self) -> bool {
787 // Track parent list items by their marker position and type
788 // Using marker_column instead of indent because it works correctly
789 // for blockquoted content where indent doesn't account for the prefix
790 // Stack stores: (marker_column, is_ordered)
791 let mut stack: Vec<(usize, bool)> = Vec::new();
792 let mut last_was_blank = false;
793
794 for line_info in &self.lines {
795 // Skip non-content lines (code blocks, frontmatter, HTML comments, etc.)
796 if line_info.in_code_block
797 || line_info.in_front_matter
798 || line_info.in_mkdocstrings
799 || line_info.in_html_comment
800 || line_info.in_esm_block
801 {
802 continue;
803 }
804
805 // OPTIMIZATION: Use pre-computed is_blank instead of content().trim()
806 if line_info.is_blank {
807 last_was_blank = true;
808 continue;
809 }
810
811 if let Some(list_item) = &line_info.list_item {
812 // Normalize column 1 to column 0 (consistent with MD007 check function)
813 let current_pos = if list_item.marker_column == 1 {
814 0
815 } else {
816 list_item.marker_column
817 };
818
819 // If there was a blank line and this item is at root level, reset stack
820 if last_was_blank && current_pos == 0 {
821 stack.clear();
822 }
823 last_was_blank = false;
824
825 // Pop items at same or greater position (they're siblings or deeper, not parents)
826 while let Some(&(pos, _)) = stack.last() {
827 if pos >= current_pos {
828 stack.pop();
829 } else {
830 break;
831 }
832 }
833
834 // Check if immediate parent has different type - this is mixed nesting
835 if let Some(&(_, parent_is_ordered)) = stack.last()
836 && parent_is_ordered != list_item.is_ordered
837 {
838 return true; // Found mixed nesting - early exit
839 }
840
841 stack.push((current_pos, list_item.is_ordered));
842 } else {
843 // Non-list line (but not blank) - could be paragraph or other content
844 last_was_blank = false;
845 }
846 }
847
848 false
849 }
850
851 /// Map a byte offset to (line, column)
852 pub fn offset_to_line_col(&self, offset: usize) -> (usize, usize) {
853 match self.line_offsets.binary_search(&offset) {
854 Ok(line) => (line + 1, 1),
855 Err(line) => {
856 let line_start = self.line_offsets.get(line.wrapping_sub(1)).copied().unwrap_or(0);
857 (line, offset - line_start + 1)
858 }
859 }
860 }
861
862 /// Check if a position is within a code block or code span
863 pub fn is_in_code_block_or_span(&self, pos: usize) -> bool {
864 // Check code blocks first
865 if CodeBlockUtils::is_in_code_block_or_span(&self.code_blocks, pos) {
866 return true;
867 }
868
869 // Check inline code spans (lazy load if needed)
870 self.code_spans()
871 .iter()
872 .any(|span| pos >= span.byte_offset && pos < span.byte_end)
873 }
874
875 /// Get line information by line number (1-indexed)
876 pub fn line_info(&self, line_num: usize) -> Option<&LineInfo> {
877 if line_num > 0 {
878 self.lines.get(line_num - 1)
879 } else {
880 None
881 }
882 }
883
884 /// Get byte offset for a line number (1-indexed)
885 pub fn line_to_byte_offset(&self, line_num: usize) -> Option<usize> {
886 self.line_info(line_num).map(|info| info.byte_offset)
887 }
888
889 /// Get URL for a reference link/image by its ID
890 pub fn get_reference_url(&self, ref_id: &str) -> Option<&str> {
891 let normalized_id = ref_id.to_lowercase();
892 self.reference_defs
893 .iter()
894 .find(|def| def.id == normalized_id)
895 .map(|def| def.url.as_str())
896 }
897
898 /// Check if a line is part of a list block
899 pub fn is_in_list_block(&self, line_num: usize) -> bool {
900 self.list_blocks
901 .iter()
902 .any(|block| line_num >= block.start_line && line_num <= block.end_line)
903 }
904
905 /// Get the list block containing a specific line
906 pub fn list_block_for_line(&self, line_num: usize) -> Option<&ListBlock> {
907 self.list_blocks
908 .iter()
909 .find(|block| line_num >= block.start_line && line_num <= block.end_line)
910 }
911
912 // Compatibility methods for DocumentStructure migration
913
914 /// Check if a line is within a code block
915 pub fn is_in_code_block(&self, line_num: usize) -> bool {
916 if line_num == 0 || line_num > self.lines.len() {
917 return false;
918 }
919 self.lines[line_num - 1].in_code_block
920 }
921
922 /// Check if a line is within front matter
923 pub fn is_in_front_matter(&self, line_num: usize) -> bool {
924 if line_num == 0 || line_num > self.lines.len() {
925 return false;
926 }
927 self.lines[line_num - 1].in_front_matter
928 }
929
930 /// Check if a line is within an HTML block
931 pub fn is_in_html_block(&self, line_num: usize) -> bool {
932 if line_num == 0 || line_num > self.lines.len() {
933 return false;
934 }
935 self.lines[line_num - 1].in_html_block
936 }
937
938 /// Check if a line and column is within a code span
939 pub fn is_in_code_span(&self, line_num: usize, col: usize) -> bool {
940 if line_num == 0 || line_num > self.lines.len() {
941 return false;
942 }
943
944 // Use the code spans cache to check
945 // Note: col is 1-indexed from caller, but span.start_col and span.end_col are 0-indexed
946 // Convert col to 0-indexed for comparison
947 let col_0indexed = if col > 0 { col - 1 } else { 0 };
948 let code_spans = self.code_spans();
949 code_spans.iter().any(|span| {
950 // Check if line is within the span's line range
951 if line_num < span.line || line_num > span.end_line {
952 return false;
953 }
954
955 if span.line == span.end_line {
956 // Single-line span: check column bounds
957 col_0indexed >= span.start_col && col_0indexed < span.end_col
958 } else if line_num == span.line {
959 // First line of multi-line span: anything after start_col is in span
960 col_0indexed >= span.start_col
961 } else if line_num == span.end_line {
962 // Last line of multi-line span: anything before end_col is in span
963 col_0indexed < span.end_col
964 } else {
965 // Middle line of multi-line span: entire line is in span
966 true
967 }
968 })
969 }
970
971 /// Check if a byte offset is within a code span
972 #[inline]
973 pub fn is_byte_offset_in_code_span(&self, byte_offset: usize) -> bool {
974 let code_spans = self.code_spans();
975 code_spans
976 .iter()
977 .any(|span| byte_offset >= span.byte_offset && byte_offset < span.byte_end)
978 }
979
980 /// Check if a byte position is within a reference definition
981 /// This is much faster than scanning the content with regex for each check (O(1) vs O(n))
982 #[inline]
983 pub fn is_in_reference_def(&self, byte_pos: usize) -> bool {
984 self.reference_defs
985 .iter()
986 .any(|ref_def| byte_pos >= ref_def.byte_offset && byte_pos < ref_def.byte_end)
987 }
988
989 /// Check if a byte position is within an HTML comment
990 /// This is much faster than scanning the content with regex for each check (O(k) vs O(n))
991 /// where k is the number of HTML comments (typically very small)
992 #[inline]
993 pub fn is_in_html_comment(&self, byte_pos: usize) -> bool {
994 self.html_comment_ranges
995 .iter()
996 .any(|range| byte_pos >= range.start && byte_pos < range.end)
997 }
998
999 /// Check if a byte position is within an HTML tag (including multiline tags)
1000 /// Uses the pre-parsed html_tags which correctly handles tags spanning multiple lines
1001 #[inline]
1002 pub fn is_in_html_tag(&self, byte_pos: usize) -> bool {
1003 self.html_tags()
1004 .iter()
1005 .any(|tag| byte_pos >= tag.byte_offset && byte_pos < tag.byte_end)
1006 }
1007
1008 /// Check if a byte position is within a Jinja template ({{ }} or {% %})
1009 pub fn is_in_jinja_range(&self, byte_pos: usize) -> bool {
1010 self.jinja_ranges
1011 .iter()
1012 .any(|(start, end)| byte_pos >= *start && byte_pos < *end)
1013 }
1014
1015 /// Check if a byte position is within a link reference definition title
1016 pub fn is_in_link_title(&self, byte_pos: usize) -> bool {
1017 self.reference_defs.iter().any(|def| {
1018 if let (Some(start), Some(end)) = (def.title_byte_start, def.title_byte_end) {
1019 byte_pos >= start && byte_pos < end
1020 } else {
1021 false
1022 }
1023 })
1024 }
1025
1026 /// Check if content has any instances of a specific character (fast)
1027 pub fn has_char(&self, ch: char) -> bool {
1028 match ch {
1029 '#' => self.char_frequency.hash_count > 0,
1030 '*' => self.char_frequency.asterisk_count > 0,
1031 '_' => self.char_frequency.underscore_count > 0,
1032 '-' => self.char_frequency.hyphen_count > 0,
1033 '+' => self.char_frequency.plus_count > 0,
1034 '>' => self.char_frequency.gt_count > 0,
1035 '|' => self.char_frequency.pipe_count > 0,
1036 '[' => self.char_frequency.bracket_count > 0,
1037 '`' => self.char_frequency.backtick_count > 0,
1038 '<' => self.char_frequency.lt_count > 0,
1039 '!' => self.char_frequency.exclamation_count > 0,
1040 '\n' => self.char_frequency.newline_count > 0,
1041 _ => self.content.contains(ch), // Fallback for other characters
1042 }
1043 }
1044
1045 /// Get count of a specific character (fast)
1046 pub fn char_count(&self, ch: char) -> usize {
1047 match ch {
1048 '#' => self.char_frequency.hash_count,
1049 '*' => self.char_frequency.asterisk_count,
1050 '_' => self.char_frequency.underscore_count,
1051 '-' => self.char_frequency.hyphen_count,
1052 '+' => self.char_frequency.plus_count,
1053 '>' => self.char_frequency.gt_count,
1054 '|' => self.char_frequency.pipe_count,
1055 '[' => self.char_frequency.bracket_count,
1056 '`' => self.char_frequency.backtick_count,
1057 '<' => self.char_frequency.lt_count,
1058 '!' => self.char_frequency.exclamation_count,
1059 '\n' => self.char_frequency.newline_count,
1060 _ => self.content.matches(ch).count(), // Fallback for other characters
1061 }
1062 }
1063
1064 /// Check if content likely contains headings (fast)
1065 pub fn likely_has_headings(&self) -> bool {
1066 self.char_frequency.hash_count > 0 || self.char_frequency.hyphen_count > 2 // Potential setext underlines
1067 }
1068
1069 /// Check if content likely contains lists (fast)
1070 pub fn likely_has_lists(&self) -> bool {
1071 self.char_frequency.asterisk_count > 0
1072 || self.char_frequency.hyphen_count > 0
1073 || self.char_frequency.plus_count > 0
1074 }
1075
1076 /// Check if content likely contains emphasis (fast)
1077 pub fn likely_has_emphasis(&self) -> bool {
1078 self.char_frequency.asterisk_count > 1 || self.char_frequency.underscore_count > 1
1079 }
1080
1081 /// Check if content likely contains tables (fast)
1082 pub fn likely_has_tables(&self) -> bool {
1083 self.char_frequency.pipe_count > 2
1084 }
1085
1086 /// Check if content likely contains blockquotes (fast)
1087 pub fn likely_has_blockquotes(&self) -> bool {
1088 self.char_frequency.gt_count > 0
1089 }
1090
1091 /// Check if content likely contains code (fast)
1092 pub fn likely_has_code(&self) -> bool {
1093 self.char_frequency.backtick_count > 0
1094 }
1095
1096 /// Check if content likely contains links or images (fast)
1097 pub fn likely_has_links_or_images(&self) -> bool {
1098 self.char_frequency.bracket_count > 0 || self.char_frequency.exclamation_count > 0
1099 }
1100
1101 /// Check if content likely contains HTML (fast)
1102 pub fn likely_has_html(&self) -> bool {
1103 self.char_frequency.lt_count > 0
1104 }
1105
1106 /// Get HTML tags on a specific line
1107 pub fn html_tags_on_line(&self, line_num: usize) -> Vec<HtmlTag> {
1108 self.html_tags()
1109 .iter()
1110 .filter(|tag| tag.line == line_num)
1111 .cloned()
1112 .collect()
1113 }
1114
1115 /// Get emphasis spans on a specific line
1116 pub fn emphasis_spans_on_line(&self, line_num: usize) -> Vec<EmphasisSpan> {
1117 self.emphasis_spans()
1118 .iter()
1119 .filter(|span| span.line == line_num)
1120 .cloned()
1121 .collect()
1122 }
1123
1124 /// Get table rows on a specific line
1125 pub fn table_rows_on_line(&self, line_num: usize) -> Vec<TableRow> {
1126 self.table_rows()
1127 .iter()
1128 .filter(|row| row.line == line_num)
1129 .cloned()
1130 .collect()
1131 }
1132
1133 /// Get bare URLs on a specific line
1134 pub fn bare_urls_on_line(&self, line_num: usize) -> Vec<BareUrl> {
1135 self.bare_urls()
1136 .iter()
1137 .filter(|url| url.line == line_num)
1138 .cloned()
1139 .collect()
1140 }
1141
1142 /// Find the line index for a given byte offset using binary search.
1143 /// Returns (line_index, line_number, column) where:
1144 /// - line_index is the 0-based index in the lines array
1145 /// - line_number is the 1-based line number
1146 /// - column is the byte offset within that line
1147 #[inline]
1148 fn find_line_for_offset(lines: &[LineInfo], byte_offset: usize) -> (usize, usize, usize) {
1149 // Binary search to find the line containing this byte offset
1150 let idx = match lines.binary_search_by(|line| {
1151 if byte_offset < line.byte_offset {
1152 std::cmp::Ordering::Greater
1153 } else if byte_offset > line.byte_offset + line.byte_len {
1154 std::cmp::Ordering::Less
1155 } else {
1156 std::cmp::Ordering::Equal
1157 }
1158 }) {
1159 Ok(idx) => idx,
1160 Err(idx) => idx.saturating_sub(1),
1161 };
1162
1163 let line = &lines[idx];
1164 let line_num = idx + 1;
1165 let col = byte_offset.saturating_sub(line.byte_offset);
1166
1167 (idx, line_num, col)
1168 }
1169
1170 /// Check if a byte offset is within a code span using binary search
1171 #[inline]
1172 fn is_offset_in_code_span(code_spans: &[CodeSpan], offset: usize) -> bool {
1173 // Since spans are sorted by byte_offset, use partition_point for binary search
1174 let idx = code_spans.partition_point(|span| span.byte_offset <= offset);
1175
1176 // Check the span that starts at or before our offset
1177 if idx > 0 {
1178 let span = &code_spans[idx - 1];
1179 if offset >= span.byte_offset && offset < span.byte_end {
1180 return true;
1181 }
1182 }
1183
1184 false
1185 }
1186
1187 /// Collect byte ranges of all links using pulldown-cmark
1188 /// This is used to skip heading detection for lines that fall within link syntax
1189 /// (e.g., multiline links like `[text](url\n#fragment)`)
1190 fn collect_link_byte_ranges(content: &str) -> Vec<(usize, usize)> {
1191 use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
1192
1193 let mut link_ranges = Vec::new();
1194 let mut options = Options::empty();
1195 options.insert(Options::ENABLE_WIKILINKS);
1196 options.insert(Options::ENABLE_FOOTNOTES);
1197
1198 let parser = Parser::new_ext(content, options).into_offset_iter();
1199 let mut link_stack: Vec<usize> = Vec::new();
1200
1201 for (event, range) in parser {
1202 match event {
1203 Event::Start(Tag::Link { .. }) => {
1204 link_stack.push(range.start);
1205 }
1206 Event::End(TagEnd::Link) => {
1207 if let Some(start_pos) = link_stack.pop() {
1208 link_ranges.push((start_pos, range.end));
1209 }
1210 }
1211 _ => {}
1212 }
1213 }
1214
1215 link_ranges
1216 }
1217
1218 /// Parse all links in the content
1219 fn parse_links(
1220 content: &'a str,
1221 lines: &[LineInfo],
1222 code_blocks: &[(usize, usize)],
1223 code_spans: &[CodeSpan],
1224 flavor: MarkdownFlavor,
1225 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1226 ) -> (Vec<ParsedLink<'a>>, Vec<BrokenLinkInfo>, Vec<FootnoteRef>) {
1227 use crate::utils::skip_context::{is_in_html_comment_ranges, is_mkdocs_snippet_line};
1228 use std::collections::HashSet;
1229
1230 let mut links = Vec::with_capacity(content.len() / 500);
1231 let mut broken_links = Vec::new();
1232 let mut footnote_refs = Vec::new();
1233
1234 // Track byte positions of links found by pulldown-cmark
1235 let mut found_positions = HashSet::new();
1236
1237 // Use pulldown-cmark's streaming parser with BrokenLink callback
1238 // The callback captures undefined references: [text][undefined], [shortcut], [text][]
1239 // This automatically handles:
1240 // - Escaped links (won't generate events)
1241 // - Links in code blocks/spans (won't generate Link events)
1242 // - Images (generates Tag::Image instead)
1243 // - Reference resolution (dest_url is already resolved!)
1244 // - Broken references (callback is invoked)
1245 // - Wiki-links (enabled via ENABLE_WIKILINKS)
1246 let mut options = Options::empty();
1247 options.insert(Options::ENABLE_WIKILINKS);
1248 options.insert(Options::ENABLE_FOOTNOTES);
1249
1250 let parser = Parser::new_with_broken_link_callback(
1251 content,
1252 options,
1253 Some(|link: BrokenLink<'_>| {
1254 broken_links.push(BrokenLinkInfo {
1255 reference: link.reference.to_string(),
1256 span: link.span.clone(),
1257 });
1258 None
1259 }),
1260 )
1261 .into_offset_iter();
1262
1263 let mut link_stack: Vec<(
1264 usize,
1265 usize,
1266 pulldown_cmark::CowStr<'a>,
1267 LinkType,
1268 pulldown_cmark::CowStr<'a>,
1269 )> = Vec::new();
1270 let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1271
1272 for (event, range) in parser {
1273 match event {
1274 Event::Start(Tag::Link {
1275 link_type,
1276 dest_url,
1277 id,
1278 ..
1279 }) => {
1280 // Link start - record position, URL, and reference ID
1281 link_stack.push((range.start, range.end, dest_url, link_type, id));
1282 text_chunks.clear();
1283 }
1284 Event::Text(text) if !link_stack.is_empty() => {
1285 // Track text content with its byte range
1286 text_chunks.push((text.to_string(), range.start, range.end));
1287 }
1288 Event::Code(code) if !link_stack.is_empty() => {
1289 // Include inline code in link text (with backticks)
1290 let code_text = format!("`{code}`");
1291 text_chunks.push((code_text, range.start, range.end));
1292 }
1293 Event::End(TagEnd::Link) => {
1294 if let Some((start_pos, _link_start_end, url, link_type, ref_id)) = link_stack.pop() {
1295 // Skip if in HTML comment
1296 if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1297 text_chunks.clear();
1298 continue;
1299 }
1300
1301 // Find line and column information
1302 let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1303
1304 // Skip if this link is on a MkDocs snippet line
1305 if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1306 text_chunks.clear();
1307 continue;
1308 }
1309
1310 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1311
1312 let is_reference = matches!(
1313 link_type,
1314 LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1315 );
1316
1317 // Extract link text directly from source bytes to preserve escaping
1318 // Text events from pulldown-cmark unescape \] → ], which breaks MD039
1319 let link_text = if start_pos < content.len() {
1320 let link_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1321
1322 // Find MATCHING ] by tracking bracket depth for nested brackets
1323 // An unescaped bracket is one NOT preceded by an odd number of backslashes
1324 // Brackets inside code spans (between backticks) should be ignored
1325 let mut close_pos = None;
1326 let mut depth = 0;
1327 let mut in_code_span = false;
1328
1329 for (i, &byte) in link_bytes.iter().enumerate().skip(1) {
1330 // Count preceding backslashes
1331 let mut backslash_count = 0;
1332 let mut j = i;
1333 while j > 0 && link_bytes[j - 1] == b'\\' {
1334 backslash_count += 1;
1335 j -= 1;
1336 }
1337 let is_escaped = backslash_count % 2 != 0;
1338
1339 // Track code spans - backticks toggle in/out of code
1340 if byte == b'`' && !is_escaped {
1341 in_code_span = !in_code_span;
1342 }
1343
1344 // Only count brackets when NOT in a code span
1345 if !is_escaped && !in_code_span {
1346 if byte == b'[' {
1347 depth += 1;
1348 } else if byte == b']' {
1349 if depth == 0 {
1350 // Found the matching closing bracket
1351 close_pos = Some(i);
1352 break;
1353 } else {
1354 depth -= 1;
1355 }
1356 }
1357 }
1358 }
1359
1360 if let Some(pos) = close_pos {
1361 Cow::Borrowed(std::str::from_utf8(&link_bytes[1..pos]).unwrap_or(""))
1362 } else {
1363 Cow::Borrowed("")
1364 }
1365 } else {
1366 Cow::Borrowed("")
1367 };
1368
1369 // For reference links, use the actual reference ID from pulldown-cmark
1370 let reference_id = if is_reference && !ref_id.is_empty() {
1371 Some(Cow::Owned(ref_id.to_lowercase()))
1372 } else if is_reference {
1373 // For collapsed/shortcut references without explicit ID, use the link text
1374 Some(Cow::Owned(link_text.to_lowercase()))
1375 } else {
1376 None
1377 };
1378
1379 // Track this position as found
1380 found_positions.insert(start_pos);
1381
1382 links.push(ParsedLink {
1383 line: line_num,
1384 start_col: col_start,
1385 end_col: col_end,
1386 byte_offset: start_pos,
1387 byte_end: range.end,
1388 text: link_text,
1389 url: Cow::Owned(url.to_string()),
1390 is_reference,
1391 reference_id,
1392 link_type,
1393 });
1394
1395 text_chunks.clear();
1396 }
1397 }
1398 Event::FootnoteReference(footnote_id) => {
1399 // Capture footnote references like [^1], [^note]
1400 // Skip if in HTML comment
1401 if is_in_html_comment_ranges(html_comment_ranges, range.start) {
1402 continue;
1403 }
1404
1405 let (_, line_num, _) = Self::find_line_for_offset(lines, range.start);
1406 footnote_refs.push(FootnoteRef {
1407 id: footnote_id.to_string(),
1408 line: line_num,
1409 byte_offset: range.start,
1410 byte_end: range.end,
1411 });
1412 }
1413 _ => {}
1414 }
1415 }
1416
1417 // Also find undefined references using regex
1418 // These are patterns like [text][ref] that pulldown-cmark didn't parse as links
1419 // because the reference is undefined
1420 for cap in LINK_PATTERN.captures_iter(content) {
1421 let full_match = cap.get(0).unwrap();
1422 let match_start = full_match.start();
1423 let match_end = full_match.end();
1424
1425 // Skip if this was already found by pulldown-cmark (it's a valid link)
1426 if found_positions.contains(&match_start) {
1427 continue;
1428 }
1429
1430 // Skip if escaped
1431 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1432 continue;
1433 }
1434
1435 // Skip if it's an image
1436 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
1437 continue;
1438 }
1439
1440 // Skip if in code block
1441 if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
1442 continue;
1443 }
1444
1445 // Skip if in code span
1446 if Self::is_offset_in_code_span(code_spans, match_start) {
1447 continue;
1448 }
1449
1450 // Skip if in HTML comment
1451 if is_in_html_comment_ranges(html_comment_ranges, match_start) {
1452 continue;
1453 }
1454
1455 // Find line and column information
1456 let (line_idx, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1457
1458 // Skip if this link is on a MkDocs snippet line
1459 if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
1460 continue;
1461 }
1462
1463 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1464
1465 let text = cap.get(1).map_or("", |m| m.as_str());
1466
1467 // Only process reference links (group 6)
1468 if let Some(ref_id) = cap.get(6) {
1469 let ref_id_str = ref_id.as_str();
1470 let normalized_ref = if ref_id_str.is_empty() {
1471 Cow::Owned(text.to_lowercase()) // Implicit reference
1472 } else {
1473 Cow::Owned(ref_id_str.to_lowercase())
1474 };
1475
1476 // This is an undefined reference (pulldown-cmark didn't parse it)
1477 links.push(ParsedLink {
1478 line: line_num,
1479 start_col: col_start,
1480 end_col: col_end,
1481 byte_offset: match_start,
1482 byte_end: match_end,
1483 text: Cow::Borrowed(text),
1484 url: Cow::Borrowed(""), // Empty URL indicates undefined reference
1485 is_reference: true,
1486 reference_id: Some(normalized_ref),
1487 link_type: LinkType::Reference, // Undefined references are reference-style
1488 });
1489 }
1490 }
1491
1492 (links, broken_links, footnote_refs)
1493 }
1494
1495 /// Parse all images in the content
1496 fn parse_images(
1497 content: &'a str,
1498 lines: &[LineInfo],
1499 code_blocks: &[(usize, usize)],
1500 code_spans: &[CodeSpan],
1501 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
1502 ) -> Vec<ParsedImage<'a>> {
1503 use crate::utils::skip_context::is_in_html_comment_ranges;
1504 use std::collections::HashSet;
1505
1506 // Pre-size based on a heuristic: images are less common than links
1507 let mut images = Vec::with_capacity(content.len() / 1000);
1508 let mut found_positions = HashSet::new();
1509
1510 // Use pulldown-cmark for parsing - more accurate and faster
1511 let parser = Parser::new(content).into_offset_iter();
1512 let mut image_stack: Vec<(usize, pulldown_cmark::CowStr<'a>, LinkType, pulldown_cmark::CowStr<'a>)> =
1513 Vec::new();
1514 let mut text_chunks: Vec<(String, usize, usize)> = Vec::new(); // (text, start, end)
1515
1516 for (event, range) in parser {
1517 match event {
1518 Event::Start(Tag::Image {
1519 link_type,
1520 dest_url,
1521 id,
1522 ..
1523 }) => {
1524 image_stack.push((range.start, dest_url, link_type, id));
1525 text_chunks.clear();
1526 }
1527 Event::Text(text) if !image_stack.is_empty() => {
1528 text_chunks.push((text.to_string(), range.start, range.end));
1529 }
1530 Event::Code(code) if !image_stack.is_empty() => {
1531 let code_text = format!("`{code}`");
1532 text_chunks.push((code_text, range.start, range.end));
1533 }
1534 Event::End(TagEnd::Image) => {
1535 if let Some((start_pos, url, link_type, ref_id)) = image_stack.pop() {
1536 // Skip if in code block
1537 if CodeBlockUtils::is_in_code_block(code_blocks, start_pos) {
1538 continue;
1539 }
1540
1541 // Skip if in code span
1542 if Self::is_offset_in_code_span(code_spans, start_pos) {
1543 continue;
1544 }
1545
1546 // Skip if in HTML comment
1547 if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
1548 continue;
1549 }
1550
1551 // Find line and column using binary search
1552 let (_, line_num, col_start) = Self::find_line_for_offset(lines, start_pos);
1553 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, range.end);
1554
1555 let is_reference = matches!(
1556 link_type,
1557 LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
1558 );
1559
1560 // Extract alt text directly from source bytes to preserve escaping
1561 // Text events from pulldown-cmark unescape \] → ], which breaks rules that need escaping
1562 let alt_text = if start_pos < content.len() {
1563 let image_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
1564
1565 // Find MATCHING ] by tracking bracket depth for nested brackets
1566 // An unescaped bracket is one NOT preceded by an odd number of backslashes
1567 let mut close_pos = None;
1568 let mut depth = 0;
1569
1570 if image_bytes.len() > 2 {
1571 for (i, &byte) in image_bytes.iter().enumerate().skip(2) {
1572 // Count preceding backslashes
1573 let mut backslash_count = 0;
1574 let mut j = i;
1575 while j > 0 && image_bytes[j - 1] == b'\\' {
1576 backslash_count += 1;
1577 j -= 1;
1578 }
1579 let is_escaped = backslash_count % 2 != 0;
1580
1581 if !is_escaped {
1582 if byte == b'[' {
1583 depth += 1;
1584 } else if byte == b']' {
1585 if depth == 0 {
1586 // Found the matching closing bracket
1587 close_pos = Some(i);
1588 break;
1589 } else {
1590 depth -= 1;
1591 }
1592 }
1593 }
1594 }
1595 }
1596
1597 if let Some(pos) = close_pos {
1598 Cow::Borrowed(std::str::from_utf8(&image_bytes[2..pos]).unwrap_or(""))
1599 } else {
1600 Cow::Borrowed("")
1601 }
1602 } else {
1603 Cow::Borrowed("")
1604 };
1605
1606 let reference_id = if is_reference && !ref_id.is_empty() {
1607 Some(Cow::Owned(ref_id.to_lowercase()))
1608 } else if is_reference {
1609 Some(Cow::Owned(alt_text.to_lowercase())) // Collapsed/shortcut references
1610 } else {
1611 None
1612 };
1613
1614 found_positions.insert(start_pos);
1615 images.push(ParsedImage {
1616 line: line_num,
1617 start_col: col_start,
1618 end_col: col_end,
1619 byte_offset: start_pos,
1620 byte_end: range.end,
1621 alt_text,
1622 url: Cow::Owned(url.to_string()),
1623 is_reference,
1624 reference_id,
1625 link_type,
1626 });
1627 }
1628 }
1629 _ => {}
1630 }
1631 }
1632
1633 // Regex fallback for undefined references that pulldown-cmark treats as plain text
1634 for cap in IMAGE_PATTERN.captures_iter(content) {
1635 let full_match = cap.get(0).unwrap();
1636 let match_start = full_match.start();
1637 let match_end = full_match.end();
1638
1639 // Skip if already found by pulldown-cmark
1640 if found_positions.contains(&match_start) {
1641 continue;
1642 }
1643
1644 // Skip if the ! is escaped
1645 if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
1646 continue;
1647 }
1648
1649 // Skip if in code block, code span, or HTML comment
1650 if CodeBlockUtils::is_in_code_block(code_blocks, match_start)
1651 || Self::is_offset_in_code_span(code_spans, match_start)
1652 || is_in_html_comment_ranges(html_comment_ranges, match_start)
1653 {
1654 continue;
1655 }
1656
1657 // Only process reference images (undefined references not found by pulldown-cmark)
1658 if let Some(ref_id) = cap.get(6) {
1659 let (_, line_num, col_start) = Self::find_line_for_offset(lines, match_start);
1660 let (_, _end_line_num, col_end) = Self::find_line_for_offset(lines, match_end);
1661 let alt_text = cap.get(1).map_or("", |m| m.as_str());
1662 let ref_id_str = ref_id.as_str();
1663 let normalized_ref = if ref_id_str.is_empty() {
1664 Cow::Owned(alt_text.to_lowercase())
1665 } else {
1666 Cow::Owned(ref_id_str.to_lowercase())
1667 };
1668
1669 images.push(ParsedImage {
1670 line: line_num,
1671 start_col: col_start,
1672 end_col: col_end,
1673 byte_offset: match_start,
1674 byte_end: match_end,
1675 alt_text: Cow::Borrowed(alt_text),
1676 url: Cow::Borrowed(""),
1677 is_reference: true,
1678 reference_id: Some(normalized_ref),
1679 link_type: LinkType::Reference, // Undefined references are reference-style
1680 });
1681 }
1682 }
1683
1684 images
1685 }
1686
1687 /// Parse reference definitions
1688 fn parse_reference_defs(content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
1689 // Pre-size based on lines count as reference definitions are line-based
1690 let mut refs = Vec::with_capacity(lines.len() / 20); // ~1 ref per 20 lines
1691
1692 for (line_idx, line_info) in lines.iter().enumerate() {
1693 // Skip lines in code blocks
1694 if line_info.in_code_block {
1695 continue;
1696 }
1697
1698 let line = line_info.content(content);
1699 let line_num = line_idx + 1;
1700
1701 if let Some(cap) = REF_DEF_PATTERN.captures(line) {
1702 let id = cap.get(1).unwrap().as_str().to_lowercase();
1703 let url = cap.get(2).unwrap().as_str().to_string();
1704 let title_match = cap.get(3).or_else(|| cap.get(4));
1705 let title = title_match.map(|m| m.as_str().to_string());
1706
1707 // Calculate byte positions
1708 // The match starts at the beginning of the line (0) and extends to the end
1709 let match_obj = cap.get(0).unwrap();
1710 let byte_offset = line_info.byte_offset + match_obj.start();
1711 let byte_end = line_info.byte_offset + match_obj.end();
1712
1713 // Calculate title byte positions (includes the quote character before content)
1714 let (title_byte_start, title_byte_end) = if let Some(m) = title_match {
1715 // The match is the content inside quotes, so we include the quote before
1716 let start = line_info.byte_offset + m.start().saturating_sub(1);
1717 let end = line_info.byte_offset + m.end() + 1; // Include closing quote
1718 (Some(start), Some(end))
1719 } else {
1720 (None, None)
1721 };
1722
1723 refs.push(ReferenceDef {
1724 line: line_num,
1725 id,
1726 url,
1727 title,
1728 byte_offset,
1729 byte_end,
1730 title_byte_start,
1731 title_byte_end,
1732 });
1733 }
1734 }
1735
1736 refs
1737 }
1738
/// Split a line into its blockquote prefix and the remaining content, without regex.
///
/// Handles nested blockquotes like `> > > content`. Each level consumes any
/// leading whitespace, the `>` marker, and at most one following space or tab.
///
/// Returns `Some((prefix_with_ws, content_after_prefix))`, or `None` when the
/// line (after leading whitespace) does not start with `>`.
#[inline]
fn parse_blockquote_prefix(line: &str) -> Option<(&str, &str)> {
    let mut rest = line;
    let mut saw_marker = false;

    // Peel one blockquote level per iteration.
    while let Some(after_marker) = rest.trim_start().strip_prefix('>') {
        saw_marker = true;
        // At most one space or tab after the marker belongs to the prefix.
        rest = after_marker
            .strip_prefix(' ')
            .or_else(|| after_marker.strip_prefix('\t'))
            .unwrap_or(after_marker);
    }

    if !saw_marker {
        return None;
    }

    // `rest` is always a suffix of `line`, so the prefix is whatever precedes it.
    let prefix_len = line.len() - rest.len();
    Some((&line[..prefix_len], rest))
}
1779
1780 /// Detect list items using pulldown-cmark for CommonMark-compliant parsing.
1781 ///
1782 /// Returns a HashMap keyed by line byte offset, containing:
1783 /// `(is_ordered, marker, marker_column, content_column, number)`
1784 ///
1785 /// ## Why pulldown-cmark?
1786 /// Using pulldown-cmark instead of regex ensures we only detect actual list items,
1787 /// not lines that merely look like lists (e.g., continuation paragraphs, code blocks).
1788 /// This fixes issue #253 where continuation lines were falsely detected.
1789 ///
1790 /// ## Tab indentation quirk
1791 /// Pulldown-cmark reports nested list items at the newline character position
1792 /// when tab indentation is used. For example, in `"* Item\n\t- Nested"`,
1793 /// the nested item is reported at byte 7 (the `\n`), not byte 8 (the `\t`).
1794 /// We detect this and advance to the correct line.
1795 ///
1796 /// ## HashMap key strategy
1797 /// We use `entry().or_insert()` because pulldown-cmark may emit multiple events
1798 /// that resolve to the same line (after newline adjustment). The first event
1799 /// for each line is authoritative.
1800 /// Detect list items and emphasis spans in a single pulldown-cmark pass.
1801 /// Returns both list items (for LineInfo) and emphasis spans (for MD030).
1802 /// This avoids a separate parse for emphasis detection.
1803 fn detect_list_items_and_emphasis_with_pulldown(
1804 content: &str,
1805 line_offsets: &[usize],
1806 flavor: MarkdownFlavor,
1807 front_matter_end: usize,
1808 code_blocks: &[(usize, usize)],
1809 ) -> (ListItemMap, Vec<EmphasisSpan>) {
1810 use std::collections::HashMap;
1811
1812 let mut list_items = HashMap::new();
1813 let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
1814
1815 let mut options = Options::empty();
1816 options.insert(Options::ENABLE_TABLES);
1817 options.insert(Options::ENABLE_FOOTNOTES);
1818 options.insert(Options::ENABLE_STRIKETHROUGH);
1819 options.insert(Options::ENABLE_TASKLISTS);
1820 // Always enable GFM features for consistency with existing behavior
1821 options.insert(Options::ENABLE_GFM);
1822
1823 // Suppress unused variable warning
1824 let _ = flavor;
1825
1826 let parser = Parser::new_ext(content, options).into_offset_iter();
1827 let mut list_depth: usize = 0;
1828 let mut list_stack: Vec<bool> = Vec::new();
1829
1830 for (event, range) in parser {
1831 match event {
1832 // Capture emphasis spans (for MD030's emphasis detection)
1833 Event::Start(Tag::Emphasis) | Event::Start(Tag::Strong) => {
1834 let marker_count = if matches!(event, Event::Start(Tag::Strong)) {
1835 2
1836 } else {
1837 1
1838 };
1839 let match_start = range.start;
1840 let match_end = range.end;
1841
1842 // Skip if in code block
1843 if !CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
1844 // Determine marker character by looking at the content at the start
1845 let marker = content[match_start..].chars().next().unwrap_or('*');
1846 if marker == '*' || marker == '_' {
1847 // Extract content between markers
1848 let content_start = match_start + marker_count;
1849 let content_end = if match_end >= marker_count {
1850 match_end - marker_count
1851 } else {
1852 match_end
1853 };
1854 let content_part = if content_start < content_end && content_end <= content.len() {
1855 &content[content_start..content_end]
1856 } else {
1857 ""
1858 };
1859
1860 // Find which line this emphasis is on using line_offsets
1861 let line_idx = match line_offsets.binary_search(&match_start) {
1862 Ok(idx) => idx,
1863 Err(idx) => idx.saturating_sub(1),
1864 };
1865 let line_num = line_idx + 1;
1866 let line_start = line_offsets.get(line_idx).copied().unwrap_or(0);
1867 let col_start = match_start - line_start;
1868 let col_end = match_end - line_start;
1869
1870 emphasis_spans.push(EmphasisSpan {
1871 line: line_num,
1872 start_col: col_start,
1873 end_col: col_end,
1874 byte_offset: match_start,
1875 byte_end: match_end,
1876 marker,
1877 marker_count,
1878 content: content_part.to_string(),
1879 });
1880 }
1881 }
1882 }
1883 Event::Start(Tag::List(start_number)) => {
1884 list_depth += 1;
1885 list_stack.push(start_number.is_some());
1886 }
1887 Event::End(TagEnd::List(_)) => {
1888 list_depth = list_depth.saturating_sub(1);
1889 list_stack.pop();
1890 }
1891 Event::Start(Tag::Item) if list_depth > 0 => {
1892 // Get the ordered state for the CURRENT (innermost) list
1893 let current_list_is_ordered = list_stack.last().copied().unwrap_or(false);
1894 // Find which line this byte offset corresponds to
1895 let item_start = range.start;
1896
1897 // Binary search to find the line number
1898 let mut line_idx = match line_offsets.binary_search(&item_start) {
1899 Ok(idx) => idx,
1900 Err(idx) => idx.saturating_sub(1),
1901 };
1902
1903 // Pulldown-cmark reports nested list items at the newline before the item
1904 // when using tab indentation (e.g., "* Item\n\t- Nested").
1905 // Advance to the actual content line in this case.
1906 if item_start < content.len() && content.as_bytes()[item_start] == b'\n' {
1907 line_idx += 1;
1908 }
1909
1910 // Skip list items in frontmatter (they are YAML/TOML syntax, not Markdown)
1911 if front_matter_end > 0 && line_idx < front_matter_end {
1912 continue;
1913 }
1914
1915 if line_idx < line_offsets.len() {
1916 let line_start_byte = line_offsets[line_idx];
1917 let line_end = line_offsets.get(line_idx + 1).copied().unwrap_or(content.len());
1918 let line = &content[line_start_byte..line_end.min(content.len())];
1919
1920 // Strip trailing newline
1921 let line = line
1922 .strip_suffix('\n')
1923 .or_else(|| line.strip_suffix("\r\n"))
1924 .unwrap_or(line);
1925
1926 // Strip blockquote prefix if present
1927 let blockquote_parse = Self::parse_blockquote_prefix(line);
1928 let (blockquote_prefix_len, line_to_parse) = if let Some((prefix, content)) = blockquote_parse {
1929 (prefix.len(), content)
1930 } else {
1931 (0, line)
1932 };
1933
1934 // Parse the list marker from the actual line
1935 if current_list_is_ordered {
1936 if let Some((leading_spaces, number_str, delimiter, spacing, _content)) =
1937 Self::parse_ordered_list(line_to_parse)
1938 {
1939 let marker = format!("{number_str}{delimiter}");
1940 let marker_column = blockquote_prefix_len + leading_spaces.len();
1941 let content_column = marker_column + marker.len() + spacing.len();
1942 let number = number_str.parse().ok();
1943
1944 list_items.entry(line_start_byte).or_insert((
1945 true,
1946 marker,
1947 marker_column,
1948 content_column,
1949 number,
1950 ));
1951 }
1952 } else if let Some((leading_spaces, marker, spacing, _content)) =
1953 Self::parse_unordered_list(line_to_parse)
1954 {
1955 let marker_column = blockquote_prefix_len + leading_spaces.len();
1956 let content_column = marker_column + 1 + spacing.len();
1957
1958 list_items.entry(line_start_byte).or_insert((
1959 false,
1960 marker.to_string(),
1961 marker_column,
1962 content_column,
1963 None,
1964 ));
1965 }
1966 }
1967 }
1968 _ => {}
1969 }
1970 }
1971
1972 (list_items, emphasis_spans)
1973 }
1974
/// Hand-written scanner for unordered list markers (used instead of a regex for speed)
/// Equivalent pattern: ^([ \t]*)([-*+])([ \t]*)(.*)
/// Returns: Some((leading_ws, marker, spacing, content)), or None when the first
/// byte after the leading spaces/tabs is not `-`, `*` or `+`
#[inline]
fn parse_unordered_list(line: &str) -> Option<(&str, char, &str, &str)> {
    let bytes = line.as_bytes();

    // The length of the leading run of spaces/tabs is also the marker's byte index
    let marker_pos = bytes.iter().take_while(|&&b| b == b' ' || b == b'\t').count();

    // The marker must be one of the three ASCII bullet characters
    let marker = match bytes.get(marker_pos).copied() {
        Some(b @ (b'-' | b'*' | b'+')) => b as char,
        _ => return None,
    };

    // Spaces/tabs directly after the marker form the spacing; the rest is content
    let spacing_start = marker_pos + 1;
    let spacing_len = bytes[spacing_start..]
        .iter()
        .take_while(|&&b| b == b' ' || b == b'\t')
        .count();
    let content_start = spacing_start + spacing_len;

    Some((
        &line[..marker_pos],
        marker,
        &line[spacing_start..content_start],
        &line[content_start..],
    ))
}
2007
/// Hand-written scanner for ordered list markers (used instead of a regex for speed)
/// Equivalent pattern: ^([ \t]*)(\d+)([.)])([ \t]*)(.*)
/// Returns: Some((leading_ws, number_str, delimiter, spacing, content)), or None when
/// the line does not start with ASCII digits followed by `.` or `)`
#[inline]
fn parse_ordered_list(line: &str) -> Option<(&str, &str, char, &str, &str)> {
    let bytes = line.as_bytes();

    // Number of consecutive spaces/tabs starting at byte index `from`
    let blank_run = |from: usize| {
        bytes[from..]
            .iter()
            .take_while(|&&b| b == b' ' || b == b'\t')
            .count()
    };

    // Leading whitespace, then at least one ASCII digit
    let number_start = blank_run(0);
    let digit_count = bytes[number_start..]
        .iter()
        .take_while(|b| b.is_ascii_digit())
        .count();
    if digit_count == 0 {
        return None; // No digits found
    }

    // The digits must be followed immediately by `.` or `)`
    let delimiter_pos = number_start + digit_count;
    let delimiter = match bytes.get(delimiter_pos).copied() {
        Some(b @ (b'.' | b')')) => b as char,
        _ => return None,
    };

    // Spaces/tabs after the delimiter form the spacing; the rest is content
    let spacing_start = delimiter_pos + 1;
    let content_start = spacing_start + blank_run(spacing_start);

    Some((
        &line[..number_start],
        &line[number_start..delimiter_pos],
        delimiter,
        &line[spacing_start..content_start],
        &line[content_start..],
    ))
}
2055
/// Build a per-line lookup table telling whether each line overlaps a code block.
///
/// `line_offsets` holds the (sorted) starting byte offset of every line and
/// `code_blocks` the `(start, end)` byte ranges of the detected code blocks.
/// Returns a Vec<bool> with one entry per element of `line_offsets`; entry i is
/// true when line i lies in the range of at least one code block.
fn compute_code_block_line_map(content: &str, line_offsets: &[usize], code_blocks: &[(usize, usize)]) -> Vec<bool> {
    let total_len = content.len();
    let mut in_code_block = vec![false; line_offsets.len()];

    for &(start, end) in code_blocks {
        // Snap the start backwards onto a UTF-8 character boundary
        let mut block_start = start;
        while block_start > 0 && !content.is_char_boundary(block_start) {
            block_start -= 1;
        }

        // Clamp the end to the content length, then snap it forwards onto a
        // UTF-8 character boundary
        let mut block_end = end.min(total_len);
        while block_end < total_len && !content.is_char_boundary(block_end) {
            block_end += 1;
        }

        // The ranges come from CodeBlockUtils::detect_code_blocks() and are taken
        // as-is: no re-validation of fences or indentation happens here, so list
        // continuation paragraphs are not mistaken for indented code.

        // First affected line: the one CONTAINING block_start, i.e. the last line
        // whose offset is <= block_start (partition_point yields the line after it).
        let first_line = line_offsets
            .partition_point(|&offset| offset <= block_start)
            .saturating_sub(1);
        // One past the last affected line: the first line starting at or after block_end.
        let last_line = line_offsets.partition_point(|&offset| offset < block_end);

        // Flag the whole run of lines in one go
        if first_line < last_line {
            in_code_block[first_line..last_line].fill(true);
        }
    }

    in_code_block
}
2115
/// Build a per-line lookup table for math blocks ($$ ... $$) in a single pass.
///
/// A line whose trimmed text is exactly `$$` toggles the "inside math" state and is
/// itself counted as part of the block; every line seen while the state is on is
/// flagged as well. Lines flagged in `code_block_map` are never marked and never
/// toggle the state, because `$$` inside code is literal text.
/// Returns a Vec<bool> where index i indicates if line i is in a math block
fn compute_math_block_line_map(content: &str, code_block_map: &[bool]) -> Vec<bool> {
    let mut inside_math = false;

    content
        .lines()
        .enumerate()
        .map(|(idx, line)| {
            // Code-block lines are skipped entirely (state stays untouched)
            if code_block_map.get(idx).copied().unwrap_or(false) {
                return false;
            }

            let is_delimiter = line.trim() == "$$";
            // Opening and closing delimiters both belong to the block,
            // as does everything between them
            let in_block = is_delimiter || inside_math;
            if is_delimiter {
                inside_math = !inside_math;
            }
            in_block
        })
        .collect()
}
2153
2154 /// Pre-compute basic line information (without headings/blockquotes)
2155 /// Also returns emphasis spans detected during the pulldown-cmark parse
2156 fn compute_basic_line_info(
2157 content: &str,
2158 line_offsets: &[usize],
2159 code_blocks: &[(usize, usize)],
2160 flavor: MarkdownFlavor,
2161 html_comment_ranges: &[crate::utils::skip_context::ByteRange],
2162 autodoc_ranges: &[crate::utils::skip_context::ByteRange],
2163 ) -> (Vec<LineInfo>, Vec<EmphasisSpan>) {
2164 let content_lines: Vec<&str> = content.lines().collect();
2165 let mut lines = Vec::with_capacity(content_lines.len());
2166
2167 // Pre-compute which lines are in code blocks
2168 let code_block_map = Self::compute_code_block_line_map(content, line_offsets, code_blocks);
2169
2170 // Pre-compute which lines are in math blocks ($$ ... $$)
2171 let math_block_map = Self::compute_math_block_line_map(content, &code_block_map);
2172
2173 // Detect front matter boundaries FIRST, before any other parsing
2174 // Use FrontMatterUtils to detect all types of front matter (YAML, TOML, JSON, malformed)
2175 let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);
2176
2177 // Use pulldown-cmark to detect list items AND emphasis spans in a single pass
2178 // (context-aware, eliminates false positives)
2179 let (list_item_map, emphasis_spans) = Self::detect_list_items_and_emphasis_with_pulldown(
2180 content,
2181 line_offsets,
2182 flavor,
2183 front_matter_end,
2184 code_blocks,
2185 );
2186
2187 for (i, line) in content_lines.iter().enumerate() {
2188 let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
2189 let indent = line.len() - line.trim_start().len();
2190 // Compute visual indent with proper CommonMark tab expansion
2191 let visual_indent = ElementCache::calculate_indentation_width_default(line);
2192
2193 // Parse blockquote prefix once and reuse it (avoid redundant parsing)
2194 let blockquote_parse = Self::parse_blockquote_prefix(line);
2195
2196 // For blank detection, consider blockquote context
2197 let is_blank = if let Some((_, content)) = blockquote_parse {
2198 // In blockquote context, check if content after prefix is blank
2199 content.trim().is_empty()
2200 } else {
2201 line.trim().is_empty()
2202 };
2203
2204 // Use pre-computed map for O(1) lookup instead of O(m) iteration
2205 let in_code_block = code_block_map.get(i).copied().unwrap_or(false);
2206
2207 // Detect list items (skip if in frontmatter, in mkdocstrings block, or in HTML comment)
2208 let in_mkdocstrings = flavor == MarkdownFlavor::MkDocs
2209 && crate::utils::mkdocstrings_refs::is_within_autodoc_block_ranges(autodoc_ranges, byte_offset);
2210 // Check if the ENTIRE line is within an HTML comment (not just the line start)
2211 // This ensures content after `-->` on the same line is not incorrectly skipped
2212 let line_end_offset = byte_offset + line.len();
2213 let in_html_comment = crate::utils::skip_context::is_line_entirely_in_html_comment(
2214 html_comment_ranges,
2215 byte_offset,
2216 line_end_offset,
2217 );
2218 // Use pulldown-cmark's list detection for context-aware parsing
2219 // This eliminates false positives on continuation lines (issue #253)
2220 let list_item =
2221 list_item_map
2222 .get(&byte_offset)
2223 .map(
2224 |(is_ordered, marker, marker_column, content_column, number)| ListItemInfo {
2225 marker: marker.clone(),
2226 is_ordered: *is_ordered,
2227 number: *number,
2228 marker_column: *marker_column,
2229 content_column: *content_column,
2230 },
2231 );
2232
2233 // Detect horizontal rules (only outside code blocks and frontmatter)
2234 // Uses CommonMark-compliant check including leading indentation validation
2235 let in_front_matter = front_matter_end > 0 && i < front_matter_end;
2236 let is_hr = !in_code_block && !in_front_matter && is_horizontal_rule_line(line);
2237
2238 // Get math block status for this line
2239 let in_math_block = math_block_map.get(i).copied().unwrap_or(false);
2240
2241 lines.push(LineInfo {
2242 byte_offset,
2243 byte_len: line.len(),
2244 indent,
2245 visual_indent,
2246 is_blank,
2247 in_code_block,
2248 in_front_matter,
2249 in_html_block: false, // Will be populated after line creation
2250 in_html_comment,
2251 list_item,
2252 heading: None, // Will be populated in second pass for Setext headings
2253 blockquote: None, // Will be populated after line creation
2254 in_mkdocstrings,
2255 in_esm_block: false, // Will be populated after line creation for MDX files
2256 in_code_span_continuation: false, // Will be populated after code spans are parsed
2257 is_horizontal_rule: is_hr,
2258 in_math_block,
2259 });
2260 }
2261
2262 (lines, emphasis_spans)
2263 }
2264
/// Detect headings and blockquotes (called after HTML block detection)
///
/// Walks every line of `content` and fills in `lines[i].blockquote` and
/// `lines[i].heading` in place. Lines that are in a code block, in front matter or
/// in an HTML block are skipped entirely.
///
/// For the remaining lines:
/// - a blockquote prefix (as parsed by `parse_blockquote_detailed`) produces a
///   `BlockquoteInfo`, even on otherwise blank lines;
/// - a non-blank line matching the ATX pattern produces an ATX `HeadingInfo`, unless
///   it is a MkDocs snippet marker, starts inside an HTML comment, or starts strictly
///   inside one of `link_byte_ranges`;
/// - otherwise, a non-blank line followed by a `===`/`---` underline produces a Setext
///   `HeadingInfo`, unless the line looks like another block construct (list item,
///   thematic break, ATX heading, blockquote, code fence, HTML).
///
/// A custom ID is taken from the heading text itself or, failing that, from a
/// standalone attr-list on the line that follows the heading (after the underline
/// for Setext headings).
fn detect_headings_and_blockquotes(
    content: &str,
    lines: &mut [LineInfo],
    flavor: MarkdownFlavor,
    html_comment_ranges: &[crate::utils::skip_context::ByteRange],
    link_byte_ranges: &[(usize, usize)],
) {
    // Regex for heading detection
    // ATX groups: 1 = leading whitespace, 2 = 1-6 hashes, 3 = whitespace after the hashes, 4 = rest of line
    static ATX_HEADING_REGEX: LazyLock<regex::Regex> =
        LazyLock::new(|| regex::Regex::new(r"^(\s*)(#{1,6})(\s*)(.*)$").unwrap());
    // Setext underline: optional indentation, a run of only `=` or only `-`, optional trailing whitespace
    static SETEXT_UNDERLINE_REGEX: LazyLock<regex::Regex> =
        LazyLock::new(|| regex::Regex::new(r"^(\s*)(=+|-+)\s*$").unwrap());

    let content_lines: Vec<&str> = content.lines().collect();

    // Detect front matter boundaries to skip those lines
    let front_matter_end = FrontMatterUtils::get_front_matter_end_line(content);

    // Detect headings (including Setext which needs look-ahead) and blockquotes
    for i in 0..lines.len() {
        if lines[i].in_code_block {
            continue;
        }

        // Skip lines in front matter
        if front_matter_end > 0 && i < front_matter_end {
            continue;
        }

        // Skip lines in HTML blocks - HTML content should not be parsed as markdown
        if lines[i].in_html_block {
            continue;
        }

        // NOTE(review): indexing assumes `lines` has no more entries than `content.lines()` - confirm with caller
        let line = content_lines[i];

        // Check for blockquotes (even on blank lines within blockquotes)
        if let Some(bq) = parse_blockquote_detailed(line) {
            let nesting_level = bq.markers.len(); // Each '>' is one level
            let marker_column = bq.indent.len();

            // Build the prefix (indentation + markers + space)
            let prefix = format!("{}{}{}", bq.indent, bq.markers, bq.spaces_after);

            // Check for various blockquote issues
            let has_no_space = bq.spaces_after.is_empty() && !bq.content.is_empty();
            // Only flag multiple literal spaces, not tabs
            // Tabs are handled by MD010 (no-hard-tabs), matching markdownlint behavior
            let has_multiple_spaces = bq.spaces_after.chars().filter(|&c| c == ' ').count() > 1;

            // Check if needs MD028 fix (empty blockquote line without proper spacing)
            // MD028 flags empty blockquote lines that don't have a single space after the marker
            // Lines like "> " or ">> " are already correct and don't need fixing
            let needs_md028_fix = bq.content.is_empty() && bq.spaces_after.is_empty();

            lines[i].blockquote = Some(BlockquoteInfo {
                nesting_level,
                indent: bq.indent.to_string(),
                marker_column,
                prefix,
                content: bq.content.to_string(),
                has_no_space_after_marker: has_no_space,
                has_multiple_spaces_after_marker: has_multiple_spaces,
                needs_md028_fix,
            });
        }

        // Skip heading detection for blank lines
        if lines[i].is_blank {
            continue;
        }

        // Check for ATX headings (but skip MkDocs snippet lines)
        // In MkDocs flavor, lines like "# -8<- [start:name]" are snippet markers, not headings
        let is_snippet_line = if flavor == MarkdownFlavor::MkDocs {
            crate::utils::mkdocs_snippets::is_snippet_section_start(line)
                || crate::utils::mkdocs_snippets::is_snippet_section_end(line)
        } else {
            false
        };

        if !is_snippet_line && let Some(caps) = ATX_HEADING_REGEX.captures(line) {
            // Skip headings inside HTML comments (using pre-computed ranges for efficiency)
            if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset) {
                continue;
            }
            // Skip lines that fall within link syntax (e.g., multiline links like `[text](url\n#fragment)`)
            // This prevents false positives where `#fragment` is detected as a heading
            // Both comparisons are strict: a line starting exactly at a range's start or end is still checked
            let line_offset = lines[i].byte_offset;
            if link_byte_ranges
                .iter()
                .any(|&(start, end)| line_offset > start && line_offset < end)
            {
                continue;
            }
            let leading_spaces = caps.get(1).map_or("", |m| m.as_str());
            let hashes = caps.get(2).map_or("", |m| m.as_str());
            let spaces_after = caps.get(3).map_or("", |m| m.as_str());
            let rest = caps.get(4).map_or("", |m| m.as_str());

            // The regex caps the hash run at 6, so the cast cannot truncate
            let level = hashes.len() as u8;
            let marker_column = leading_spaces.len();

            // Check for closing sequence, but handle custom IDs that might come after
            // Result: (heading text, whether a closing sequence exists, the closing hashes)
            let (text, has_closing, closing_seq) = {
                // First check if there's a custom ID at the end
                let (rest_without_id, custom_id_part) = if let Some(id_start) = rest.rfind(" {#") {
                    // Check if this looks like a valid custom ID (ends with })
                    if rest[id_start..].trim_end().ends_with('}') {
                        // Split off the custom ID
                        (&rest[..id_start], &rest[id_start..])
                    } else {
                        (rest, "")
                    }
                } else {
                    (rest, "")
                };

                // Now look for closing hashes in the part before the custom ID
                let trimmed_rest = rest_without_id.trim_end();
                if let Some(last_hash_byte_pos) = trimmed_rest.rfind('#') {
                    // Find the start of the hash sequence by walking backwards
                    // Use char_indices to get byte positions at char boundaries
                    let char_positions: Vec<(usize, char)> = trimmed_rest.char_indices().collect();

                    // Find which char index corresponds to last_hash_byte_pos
                    let last_hash_char_idx = char_positions
                        .iter()
                        .position(|(byte_pos, _)| *byte_pos == last_hash_byte_pos);

                    if let Some(mut char_idx) = last_hash_char_idx {
                        // Walk backwards to find start of hash sequence
                        while char_idx > 0 && char_positions[char_idx - 1].1 == '#' {
                            char_idx -= 1;
                        }

                        // Get the byte position of the start of hashes
                        let start_of_hashes = char_positions[char_idx].0;

                        // Check if there's at least one space before the closing hashes
                        // (a hash run at the very start of the text also counts)
                        let has_space_before = char_idx == 0 || char_positions[char_idx - 1].1.is_whitespace();

                        // Check if this is a valid closing sequence (all hashes to end of trimmed part)
                        // This fails when the last `#` is followed by other text, e.g. `# C# rocks`
                        let potential_closing = &trimmed_rest[start_of_hashes..];
                        let is_all_hashes = potential_closing.chars().all(|c| c == '#');

                        if is_all_hashes && has_space_before {
                            // This is a closing sequence
                            let closing_hashes = potential_closing.to_string();
                            // The text is everything before the closing hashes
                            // Don't include the custom ID here - it will be extracted later
                            let text_part = if !custom_id_part.is_empty() {
                                // If we have a custom ID, append it back to get the full rest
                                // This allows the extract_header_id function to handle it properly
                                format!("{}{}", trimmed_rest[..start_of_hashes].trim_end(), custom_id_part)
                            } else {
                                trimmed_rest[..start_of_hashes].trim_end().to_string()
                            };
                            (text_part, true, closing_hashes)
                        } else {
                            // Not a valid closing sequence, return the full content
                            (rest.to_string(), false, String::new())
                        }
                    } else {
                        // Couldn't find char boundary, return the full content
                        (rest.to_string(), false, String::new())
                    }
                } else {
                    // No hashes found, return the full content
                    (rest.to_string(), false, String::new())
                }
            };

            // Byte column where the heading text starts (after indent, hashes and spacing)
            let content_column = marker_column + hashes.len() + spaces_after.len();

            // Extract custom header ID if present
            let raw_text = text.trim().to_string();
            let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);

            // If no custom ID was found on the header line, check the next line for standalone attr-list
            if custom_id.is_none() && i + 1 < content_lines.len() && i + 1 < lines.len() {
                let next_line = content_lines[i + 1];
                if !lines[i + 1].in_code_block
                    && crate::utils::header_id_utils::is_standalone_attr_list(next_line)
                    && let Some(next_line_id) =
                        crate::utils::header_id_utils::extract_standalone_attr_list_id(next_line)
                {
                    custom_id = Some(next_line_id);
                }
            }

            // ATX heading is "valid" for processing by heading rules if:
            // 1. Has space after # (CommonMark compliant): `# Heading`
            // 2. Is empty (just hashes): `#`
            // 3. Has multiple hashes (##intro is likely intended heading, not hashtag)
            // 4. Content starts with uppercase (likely intended heading, not social hashtag)
            //
            // Invalid patterns (hashtag-like) are skipped by most heading rules:
            // - `#tag` - single # with lowercase (social hashtag)
            // - `#123` - single # with number (GitHub issue ref)
            let is_valid = !spaces_after.is_empty()
                || rest.is_empty()
                || level > 1
                || rest.trim().chars().next().is_some_and(|c| c.is_uppercase());

            lines[i].heading = Some(HeadingInfo {
                level,
                style: HeadingStyle::ATX,
                marker: hashes.to_string(),
                marker_column,
                content_column,
                text: clean_text,
                custom_id,
                raw_text,
                has_closing_sequence: has_closing,
                closing_sequence: closing_seq,
                is_valid,
            });
        }
        // Check for Setext headings (need to look at next line)
        else if i + 1 < content_lines.len() && i + 1 < lines.len() {
            let next_line = content_lines[i + 1];
            if !lines[i + 1].in_code_block && SETEXT_UNDERLINE_REGEX.is_match(next_line) {
                // Skip if next line is front matter delimiter
                // NOTE(review): this exact condition already triggered `continue` at the top of
                // the loop body, so it cannot be true here
                if front_matter_end > 0 && i < front_matter_end {
                    continue;
                }

                // Skip Setext headings inside HTML comments (using pre-computed ranges for efficiency)
                if crate::utils::skip_context::is_in_html_comment_ranges(html_comment_ranges, lines[i].byte_offset)
                {
                    continue;
                }

                // Per CommonMark spec 4.3, setext heading content cannot be interpretable as:
                // list item, ATX heading, block quote, thematic break, code fence, or HTML block
                let content_line = line.trim();

                // Skip list items (-, *, +) and thematic breaks (---, ***, etc.)
                // Any line starting with one of these characters is skipped, whatever follows it
                if content_line.starts_with('-') || content_line.starts_with('*') || content_line.starts_with('+') {
                    continue;
                }

                // Skip underscore thematic breaks (___)
                if content_line.starts_with('_') {
                    let non_ws: String = content_line.chars().filter(|c| !c.is_whitespace()).collect();
                    if non_ws.len() >= 3 && non_ws.chars().all(|c| c == '_') {
                        continue;
                    }
                }

                // Skip numbered lists (1. Item, 2. Item, etc.)
                if let Some(first_char) = content_line.chars().next()
                    && first_char.is_ascii_digit()
                {
                    // Leading digits are ASCII, so this char count equals their byte length
                    let num_end = content_line.chars().take_while(|c| c.is_ascii_digit()).count();
                    if num_end < content_line.len() {
                        let next = content_line.chars().nth(num_end);
                        if next == Some('.') || next == Some(')') {
                            continue;
                        }
                    }
                }

                // Skip ATX headings
                // (reachable when the ATX branch above was bypassed for a MkDocs snippet line)
                if ATX_HEADING_REGEX.is_match(line) {
                    continue;
                }

                // Skip blockquotes
                if content_line.starts_with('>') {
                    continue;
                }

                // Skip code fences
                let trimmed_start = line.trim_start();
                if trimmed_start.len() >= 3 {
                    let first_three: String = trimmed_start.chars().take(3).collect();
                    if first_three == "```" || first_three == "~~~" {
                        continue;
                    }
                }

                // Skip HTML blocks
                if content_line.starts_with('<') {
                    continue;
                }

                let underline = next_line.trim();

                // `=` underline means level 1; the regex leaves `-` as the only alternative (level 2)
                let level = if underline.starts_with('=') { 1 } else { 2 };
                let style = if level == 1 {
                    HeadingStyle::Setext1
                } else {
                    HeadingStyle::Setext2
                };

                // Extract custom header ID if present
                let raw_text = line.trim().to_string();
                let (clean_text, mut custom_id) = crate::utils::header_id_utils::extract_header_id(&raw_text);

                // If no custom ID was found on the header line, check the line after underline for standalone attr-list
                if custom_id.is_none() && i + 2 < content_lines.len() && i + 2 < lines.len() {
                    let attr_line = content_lines[i + 2];
                    if !lines[i + 2].in_code_block
                        && crate::utils::header_id_utils::is_standalone_attr_list(attr_line)
                        && let Some(attr_line_id) =
                            crate::utils::header_id_utils::extract_standalone_attr_list_id(attr_line)
                    {
                        custom_id = Some(attr_line_id);
                    }
                }

                // For Setext headings the marker is the underline text and marker_column is the
                // underline's indentation; content_column is the indentation of the text line
                lines[i].heading = Some(HeadingInfo {
                    level,
                    style,
                    marker: underline.to_string(),
                    marker_column: next_line.len() - next_line.trim_start().len(),
                    content_column: lines[i].indent,
                    text: clean_text,
                    custom_id,
                    raw_text,
                    has_closing_sequence: false,
                    closing_sequence: String::new(),
                    is_valid: true, // Setext headings are always valid
                });
            }
        }
    }
}
2596
2597 /// Detect HTML blocks in the content
2598 fn detect_html_blocks(content: &str, lines: &mut [LineInfo]) {
2599 // HTML block elements that trigger block context
2600 // Includes HTML5 media, embedded content, and interactive elements
2601 const BLOCK_ELEMENTS: &[&str] = &[
2602 "address",
2603 "article",
2604 "aside",
2605 "audio",
2606 "blockquote",
2607 "canvas",
2608 "details",
2609 "dialog",
2610 "dd",
2611 "div",
2612 "dl",
2613 "dt",
2614 "embed",
2615 "fieldset",
2616 "figcaption",
2617 "figure",
2618 "footer",
2619 "form",
2620 "h1",
2621 "h2",
2622 "h3",
2623 "h4",
2624 "h5",
2625 "h6",
2626 "header",
2627 "hr",
2628 "iframe",
2629 "li",
2630 "main",
2631 "menu",
2632 "nav",
2633 "noscript",
2634 "object",
2635 "ol",
2636 "p",
2637 "picture",
2638 "pre",
2639 "script",
2640 "search",
2641 "section",
2642 "source",
2643 "style",
2644 "summary",
2645 "svg",
2646 "table",
2647 "tbody",
2648 "td",
2649 "template",
2650 "textarea",
2651 "tfoot",
2652 "th",
2653 "thead",
2654 "tr",
2655 "track",
2656 "ul",
2657 "video",
2658 ];
2659
2660 let mut i = 0;
2661 while i < lines.len() {
2662 // Skip if already in code block or front matter
2663 if lines[i].in_code_block || lines[i].in_front_matter {
2664 i += 1;
2665 continue;
2666 }
2667
2668 let trimmed = lines[i].content(content).trim_start();
2669
2670 // Check if line starts with an HTML tag
2671 if trimmed.starts_with('<') && trimmed.len() > 1 {
2672 // Extract tag name safely
2673 let after_bracket = &trimmed[1..];
2674 let is_closing = after_bracket.starts_with('/');
2675 let tag_start = if is_closing { &after_bracket[1..] } else { after_bracket };
2676
2677 // Extract tag name (stop at space, >, /, or end of string)
2678 let tag_name = tag_start
2679 .chars()
2680 .take_while(|c| c.is_ascii_alphabetic() || *c == '-' || c.is_ascii_digit())
2681 .collect::<String>()
2682 .to_lowercase();
2683
2684 // Check if it's a block element
2685 if !tag_name.is_empty() && BLOCK_ELEMENTS.contains(&tag_name.as_str()) {
2686 // Mark this line as in HTML block
2687 lines[i].in_html_block = true;
2688
2689 // For simplicity, just mark lines until we find a closing tag or reach a blank line
2690 // This avoids complex nesting logic that might cause infinite loops
2691 if !is_closing {
2692 let closing_tag = format!("</{tag_name}>");
2693 // style and script tags can contain blank lines (CSS/JS formatting)
2694 let allow_blank_lines = tag_name == "style" || tag_name == "script";
2695 let mut j = i + 1;
2696 let mut found_closing_tag = false;
2697 while j < lines.len() && j < i + 100 {
2698 // Limit search to 100 lines
2699 // Stop at blank lines (except for style/script tags)
2700 if !allow_blank_lines && lines[j].is_blank {
2701 break;
2702 }
2703
2704 lines[j].in_html_block = true;
2705
2706 // Check if this line contains the closing tag
2707 if lines[j].content(content).contains(&closing_tag) {
2708 found_closing_tag = true;
2709 }
2710
2711 // After finding closing tag, continue marking lines as
2712 // in_html_block until blank line (per CommonMark spec)
2713 if found_closing_tag {
2714 j += 1;
2715 // Continue marking subsequent lines until blank
2716 while j < lines.len() && j < i + 100 {
2717 if lines[j].is_blank {
2718 break;
2719 }
2720 lines[j].in_html_block = true;
2721 j += 1;
2722 }
2723 break;
2724 }
2725 j += 1;
2726 }
2727 }
2728 }
2729 }
2730
2731 i += 1;
2732 }
2733 }
2734
2735 /// Detect ESM import/export blocks in MDX files
2736 /// ESM blocks consist of contiguous import/export statements at the top of the file
2737 fn detect_esm_blocks(content: &str, lines: &mut [LineInfo], flavor: MarkdownFlavor) {
2738 // Only process MDX files
2739 if !flavor.supports_esm_blocks() {
2740 return;
2741 }
2742
2743 let mut in_multiline_comment = false;
2744
2745 for line in lines.iter_mut() {
2746 // Skip blank lines and HTML comments
2747 if line.is_blank || line.in_html_comment {
2748 continue;
2749 }
2750
2751 let trimmed = line.content(content).trim_start();
2752
2753 // Handle continuation of multi-line JS comments
2754 if in_multiline_comment {
2755 if trimmed.contains("*/") {
2756 in_multiline_comment = false;
2757 }
2758 continue;
2759 }
2760
2761 // Skip single-line JS comments (// and ///)
2762 if trimmed.starts_with("//") {
2763 continue;
2764 }
2765
2766 // Handle start of multi-line JS comment
2767 if trimmed.starts_with("/*") {
2768 if !trimmed.contains("*/") {
2769 in_multiline_comment = true;
2770 }
2771 continue;
2772 }
2773
2774 // Check if line starts with import or export
2775 if trimmed.starts_with("import ") || trimmed.starts_with("export ") {
2776 line.in_esm_block = true;
2777 } else {
2778 // Once we hit a non-ESM, non-comment line, we're done with the ESM block
2779 break;
2780 }
2781 }
2782 }
2783
2784 /// Parse all inline code spans in the content using pulldown-cmark streaming parser
2785 fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
2786 let mut code_spans = Vec::new();
2787
2788 // Quick check - if no backticks, no code spans
2789 if !content.contains('`') {
2790 return code_spans;
2791 }
2792
2793 // Use pulldown-cmark's streaming parser with byte offsets
2794 let parser = Parser::new(content).into_offset_iter();
2795
2796 for (event, range) in parser {
2797 if let Event::Code(_) = event {
2798 let start_pos = range.start;
2799 let end_pos = range.end;
2800
2801 // The range includes the backticks, extract the actual content
2802 let full_span = &content[start_pos..end_pos];
2803 let backtick_count = full_span.chars().take_while(|&c| c == '`').count();
2804
2805 // Extract content between backticks, preserving spaces
2806 let content_start = start_pos + backtick_count;
2807 let content_end = end_pos - backtick_count;
2808 let span_content = if content_start < content_end {
2809 content[content_start..content_end].to_string()
2810 } else {
2811 String::new()
2812 };
2813
2814 // Use binary search to find line number - O(log n) instead of O(n)
2815 // Find the rightmost line whose byte_offset <= start_pos
2816 let line_idx = lines
2817 .partition_point(|line| line.byte_offset <= start_pos)
2818 .saturating_sub(1);
2819 let line_num = line_idx + 1;
2820 let byte_col_start = start_pos - lines[line_idx].byte_offset;
2821
2822 // Find end column using binary search
2823 let end_line_idx = lines
2824 .partition_point(|line| line.byte_offset <= end_pos)
2825 .saturating_sub(1);
2826 let byte_col_end = end_pos - lines[end_line_idx].byte_offset;
2827
2828 // Convert byte offsets to character positions for correct Unicode handling
2829 // This ensures consistency with warning.column which uses character positions
2830 let line_content = lines[line_idx].content(content);
2831 let col_start = if byte_col_start <= line_content.len() {
2832 line_content[..byte_col_start].chars().count()
2833 } else {
2834 line_content.chars().count()
2835 };
2836
2837 let end_line_content = lines[end_line_idx].content(content);
2838 let col_end = if byte_col_end <= end_line_content.len() {
2839 end_line_content[..byte_col_end].chars().count()
2840 } else {
2841 end_line_content.chars().count()
2842 };
2843
2844 code_spans.push(CodeSpan {
2845 line: line_num,
2846 end_line: end_line_idx + 1,
2847 start_col: col_start,
2848 end_col: col_end,
2849 byte_offset: start_pos,
2850 byte_end: end_pos,
2851 backtick_count,
2852 content: span_content,
2853 });
2854 }
2855 }
2856
2857 // Sort by position to ensure consistent ordering
2858 code_spans.sort_by_key(|span| span.byte_offset);
2859
2860 code_spans
2861 }
2862
2863 /// Parse all list blocks in the content (legacy line-by-line approach)
2864 ///
2865 /// Uses a forward-scanning O(n) algorithm that tracks two variables during iteration:
2866 /// - `has_list_breaking_content_since_last_item`: Set when encountering content that
2867 /// terminates a list (headings, horizontal rules, tables, insufficiently indented content)
2868 /// - `min_continuation_for_tracking`: Minimum indentation required for content to be
2869 /// treated as list continuation (based on the list marker width)
2870 ///
2871 /// When a new list item is encountered, we check if list-breaking content was seen
2872 /// since the last item. If so, we start a new list block.
2873 fn parse_list_blocks(content: &str, lines: &[LineInfo]) -> Vec<ListBlock> {
2874 // Minimum indentation for unordered list continuation per CommonMark spec
2875 const UNORDERED_LIST_MIN_CONTINUATION_INDENT: usize = 2;
2876
2877 /// Initialize or reset the forward-scanning tracking state.
2878 /// This helper eliminates code duplication across three initialization sites.
2879 #[inline]
2880 fn reset_tracking_state(
2881 list_item: &ListItemInfo,
2882 has_list_breaking_content: &mut bool,
2883 min_continuation: &mut usize,
2884 ) {
2885 *has_list_breaking_content = false;
2886 let marker_width = if list_item.is_ordered {
2887 list_item.marker.len() + 1 // Ordered markers need space after period/paren
2888 } else {
2889 list_item.marker.len()
2890 };
2891 *min_continuation = if list_item.is_ordered {
2892 marker_width
2893 } else {
2894 UNORDERED_LIST_MIN_CONTINUATION_INDENT
2895 };
2896 }
2897
2898 // Pre-size based on lines that could be list items
2899 let mut list_blocks = Vec::with_capacity(lines.len() / 10); // Estimate ~10% of lines might start list blocks
2900 let mut current_block: Option<ListBlock> = None;
2901 let mut last_list_item_line = 0;
2902 let mut current_indent_level = 0;
2903 let mut last_marker_width = 0;
2904
2905 // Track list-breaking content since last item (fixes O(n²) bottleneck from issue #148)
2906 let mut has_list_breaking_content_since_last_item = false;
2907 let mut min_continuation_for_tracking = 0;
2908
2909 for (line_idx, line_info) in lines.iter().enumerate() {
2910 let line_num = line_idx + 1;
2911
2912 // Enhanced code block handling using Design #3's context analysis
2913 if line_info.in_code_block {
2914 if let Some(ref mut block) = current_block {
2915 // Calculate minimum indentation for list continuation
2916 let min_continuation_indent =
2917 CodeBlockUtils::calculate_min_continuation_indent(content, lines, line_idx);
2918
2919 // Analyze code block context using the three-tier classification
2920 let context = CodeBlockUtils::analyze_code_block_context(lines, line_idx, min_continuation_indent);
2921
2922 match context {
2923 CodeBlockContext::Indented => {
2924 // Code block is properly indented - continues the list
2925 block.end_line = line_num;
2926 continue;
2927 }
2928 CodeBlockContext::Standalone => {
2929 // Code block separates lists - end current block
2930 let completed_block = current_block.take().unwrap();
2931 list_blocks.push(completed_block);
2932 continue;
2933 }
2934 CodeBlockContext::Adjacent => {
2935 // Edge case - use conservative behavior (continue list)
2936 block.end_line = line_num;
2937 continue;
2938 }
2939 }
2940 } else {
2941 // No current list block - skip code block lines
2942 continue;
2943 }
2944 }
2945
2946 // Extract blockquote prefix if any
2947 let blockquote_prefix = if let Some(caps) = BLOCKQUOTE_PREFIX_REGEX.captures(line_info.content(content)) {
2948 caps.get(0).unwrap().as_str().to_string()
2949 } else {
2950 String::new()
2951 };
2952
2953 // Track list-breaking content for non-list, non-blank lines (O(n) replacement for nested loop)
2954 // Skip lines that are continuations of multi-line code spans - they're part of the previous list item
2955 if let Some(ref block) = current_block
2956 && line_info.list_item.is_none()
2957 && !line_info.is_blank
2958 && !line_info.in_code_span_continuation
2959 {
2960 let line_content = line_info.content(content).trim();
2961
2962 // Check for structural separators that break lists
2963 // Note: Lazy continuation (indent=0) is valid in CommonMark and should NOT break lists.
2964 // Only lines with indent between 1 and min_continuation_for_tracking-1 break lists,
2965 // as they indicate improper indentation rather than lazy continuation.
2966 let is_lazy_continuation = line_info.indent == 0 && !line_info.is_blank;
2967
2968 // Check if blockquote context changes (different prefix than current block)
2969 // Lines within the SAME blockquote context don't break lists
2970 let blockquote_prefix_changes = blockquote_prefix.trim() != block.blockquote_prefix.trim();
2971
2972 let breaks_list = line_info.heading.is_some()
2973 || line_content.starts_with("---")
2974 || line_content.starts_with("***")
2975 || line_content.starts_with("___")
2976 || crate::utils::skip_context::is_table_line(line_content)
2977 || blockquote_prefix_changes
2978 || (line_info.indent > 0
2979 && line_info.indent < min_continuation_for_tracking
2980 && !is_lazy_continuation);
2981
2982 if breaks_list {
2983 has_list_breaking_content_since_last_item = true;
2984 }
2985 }
2986
2987 // If this line is a code span continuation within an active list block,
2988 // extend the block's end_line to include this line (maintains list continuity)
2989 if line_info.in_code_span_continuation
2990 && line_info.list_item.is_none()
2991 && let Some(ref mut block) = current_block
2992 {
2993 block.end_line = line_num;
2994 }
2995
2996 // Extend block.end_line for regular continuation lines (non-list-item, non-blank,
2997 // properly indented lines within the list). This ensures the workaround at line 2448
2998 // works correctly when there are multiple continuation lines before a nested list item.
2999 // Also include lazy continuation lines (indent=0) per CommonMark spec.
3000 // For blockquote lines, compute effective indent after stripping the prefix
3001 let effective_continuation_indent = if let Some(ref block) = current_block {
3002 let block_bq_level = block.blockquote_prefix.chars().filter(|&c| c == '>').count();
3003 let line_content = line_info.content(content);
3004 let line_bq_level = line_content
3005 .chars()
3006 .take_while(|c| *c == '>' || c.is_whitespace())
3007 .filter(|&c| c == '>')
3008 .count();
3009 if line_bq_level > 0 && line_bq_level == block_bq_level {
3010 // Compute indent after blockquote markers
3011 let mut pos = 0;
3012 let mut found_markers = 0;
3013 for c in line_content.chars() {
3014 pos += c.len_utf8();
3015 if c == '>' {
3016 found_markers += 1;
3017 if found_markers == line_bq_level {
3018 if line_content.get(pos..pos + 1) == Some(" ") {
3019 pos += 1;
3020 }
3021 break;
3022 }
3023 }
3024 }
3025 let after_bq = &line_content[pos..];
3026 after_bq.len() - after_bq.trim_start().len()
3027 } else {
3028 line_info.indent
3029 }
3030 } else {
3031 line_info.indent
3032 };
3033 let adjusted_min_continuation_for_tracking = if let Some(ref block) = current_block {
3034 let block_bq_level = block.blockquote_prefix.chars().filter(|&c| c == '>').count();
3035 if block_bq_level > 0 {
3036 if block.is_ordered { last_marker_width } else { 2 }
3037 } else {
3038 min_continuation_for_tracking
3039 }
3040 } else {
3041 min_continuation_for_tracking
3042 };
3043 let is_valid_continuation = effective_continuation_indent >= adjusted_min_continuation_for_tracking
3044 || (line_info.indent == 0 && !line_info.is_blank); // Lazy continuation
3045
3046 if std::env::var("RUMDL_DEBUG_LIST").is_ok() && line_info.list_item.is_none() && !line_info.is_blank {
3047 eprintln!(
3048 "[DEBUG] Line {}: checking continuation - indent={}, min_cont={}, is_valid={}, in_code_span={}, in_code_block={}, has_block={}",
3049 line_num,
3050 effective_continuation_indent,
3051 adjusted_min_continuation_for_tracking,
3052 is_valid_continuation,
3053 line_info.in_code_span_continuation,
3054 line_info.in_code_block,
3055 current_block.is_some()
3056 );
3057 }
3058
3059 if !line_info.in_code_span_continuation
3060 && line_info.list_item.is_none()
3061 && !line_info.is_blank
3062 && !line_info.in_code_block
3063 && is_valid_continuation
3064 && let Some(ref mut block) = current_block
3065 {
3066 if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3067 eprintln!(
3068 "[DEBUG] Line {}: extending block.end_line from {} to {}",
3069 line_num, block.end_line, line_num
3070 );
3071 }
3072 block.end_line = line_num;
3073 }
3074
3075 // Check if this line is a list item
3076 if let Some(list_item) = &line_info.list_item {
3077 // Calculate nesting level based on indentation
3078 let item_indent = list_item.marker_column;
3079 let nesting = item_indent / 2; // Assume 2-space indentation for nesting
3080
3081 if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3082 eprintln!(
3083 "[DEBUG] Line {}: list item found, marker={:?}, indent={}",
3084 line_num, list_item.marker, item_indent
3085 );
3086 }
3087
3088 if let Some(ref mut block) = current_block {
3089 // Check if this continues the current block
3090 // For nested lists, we need to check if this is a nested item (higher nesting level)
3091 // or a continuation at the same or lower level
3092 let is_nested = nesting > block.nesting_level;
3093 let same_type =
3094 (block.is_ordered && list_item.is_ordered) || (!block.is_ordered && !list_item.is_ordered);
3095 let same_context = block.blockquote_prefix == blockquote_prefix;
3096 // Allow one blank line after last item, or lines immediately after block content
3097 let reasonable_distance = line_num <= last_list_item_line + 2 || line_num == block.end_line + 1;
3098
3099 // For unordered lists, also check marker consistency
3100 let marker_compatible =
3101 block.is_ordered || block.marker.is_none() || block.marker.as_ref() == Some(&list_item.marker);
3102
3103 // O(1) check: Use the tracked variable instead of O(n) nested loop
3104 // This eliminates the quadratic bottleneck from issue #148
3105 let has_non_list_content = has_list_breaking_content_since_last_item;
3106
3107 // A list continues if:
3108 // 1. It's a nested item (indented more than the parent), OR
3109 // 2. It's the same type at the same level with reasonable distance
3110 let mut continues_list = if is_nested {
3111 // Nested items always continue the list if they're in the same context
3112 same_context && reasonable_distance && !has_non_list_content
3113 } else {
3114 // Same-level items need to match type and markers
3115 same_type && same_context && reasonable_distance && marker_compatible && !has_non_list_content
3116 };
3117
3118 if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3119 eprintln!(
3120 "[DEBUG] Line {}: continues_list={}, is_nested={}, same_type={}, same_context={}, reasonable_distance={}, marker_compatible={}, has_non_list_content={}, last_item={}, block.end_line={}",
3121 line_num,
3122 continues_list,
3123 is_nested,
3124 same_type,
3125 same_context,
3126 reasonable_distance,
3127 marker_compatible,
3128 has_non_list_content,
3129 last_list_item_line,
3130 block.end_line
3131 );
3132 }
3133
3134 // WORKAROUND: If items are truly consecutive (no blank lines), they MUST be in the same list
3135 // This handles edge cases where content patterns might otherwise split lists incorrectly
3136 if !continues_list && reasonable_distance && line_num > 0 && block.end_line == line_num - 1 {
3137 // Check if the previous line was a list item or a continuation of a list item
3138 // (including lazy continuation lines)
3139 if block.item_lines.contains(&(line_num - 1)) {
3140 // They're consecutive list items - force them to be in the same list
3141 continues_list = true;
3142 } else {
3143 // Previous line is a continuation line within this block
3144 // (e.g., lazy continuation with indent=0)
3145 // Since block.end_line == line_num - 1, we know line_num - 1 is part of this block
3146 continues_list = true;
3147 }
3148 }
3149
3150 if continues_list {
3151 // Extend current block
3152 block.end_line = line_num;
3153 block.item_lines.push(line_num);
3154
3155 // Update max marker width
3156 block.max_marker_width = block.max_marker_width.max(if list_item.is_ordered {
3157 list_item.marker.len() + 1
3158 } else {
3159 list_item.marker.len()
3160 });
3161
3162 // Update marker consistency for unordered lists
3163 if !block.is_ordered
3164 && block.marker.is_some()
3165 && block.marker.as_ref() != Some(&list_item.marker)
3166 {
3167 // Mixed markers, clear the marker field
3168 block.marker = None;
3169 }
3170
3171 // Reset tracked state for issue #148 optimization
3172 reset_tracking_state(
3173 list_item,
3174 &mut has_list_breaking_content_since_last_item,
3175 &mut min_continuation_for_tracking,
3176 );
3177 } else {
3178 // End current block and start a new one
3179
3180 list_blocks.push(block.clone());
3181
3182 *block = ListBlock {
3183 start_line: line_num,
3184 end_line: line_num,
3185 is_ordered: list_item.is_ordered,
3186 marker: if list_item.is_ordered {
3187 None
3188 } else {
3189 Some(list_item.marker.clone())
3190 },
3191 blockquote_prefix: blockquote_prefix.clone(),
3192 item_lines: vec![line_num],
3193 nesting_level: nesting,
3194 max_marker_width: if list_item.is_ordered {
3195 list_item.marker.len() + 1
3196 } else {
3197 list_item.marker.len()
3198 },
3199 };
3200
3201 // Initialize tracked state for new block (issue #148 optimization)
3202 reset_tracking_state(
3203 list_item,
3204 &mut has_list_breaking_content_since_last_item,
3205 &mut min_continuation_for_tracking,
3206 );
3207 }
3208 } else {
3209 // Start a new block
3210 current_block = Some(ListBlock {
3211 start_line: line_num,
3212 end_line: line_num,
3213 is_ordered: list_item.is_ordered,
3214 marker: if list_item.is_ordered {
3215 None
3216 } else {
3217 Some(list_item.marker.clone())
3218 },
3219 blockquote_prefix,
3220 item_lines: vec![line_num],
3221 nesting_level: nesting,
3222 max_marker_width: list_item.marker.len(),
3223 });
3224
3225 // Initialize tracked state for new block (issue #148 optimization)
3226 reset_tracking_state(
3227 list_item,
3228 &mut has_list_breaking_content_since_last_item,
3229 &mut min_continuation_for_tracking,
3230 );
3231 }
3232
3233 last_list_item_line = line_num;
3234 current_indent_level = item_indent;
3235 last_marker_width = if list_item.is_ordered {
3236 list_item.marker.len() + 1 // Add 1 for the space after ordered list markers
3237 } else {
3238 list_item.marker.len()
3239 };
3240 } else if let Some(ref mut block) = current_block {
3241 // Not a list item - check if it continues the current block
3242 if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3243 eprintln!(
3244 "[DEBUG] Line {}: non-list-item, is_blank={}, block exists",
3245 line_num, line_info.is_blank
3246 );
3247 }
3248
3249 // For MD032 compatibility, we use a simple approach:
3250 // - Indented lines continue the list
3251 // - Blank lines followed by indented content continue the list
3252 // - Everything else ends the list
3253
3254 // Check if the last line in the list block ended with a backslash (hard line break)
3255 // This handles cases where list items use backslash for hard line breaks
3256 let prev_line_ends_with_backslash = if block.end_line > 0 && block.end_line - 1 < lines.len() {
3257 lines[block.end_line - 1].content(content).trim_end().ends_with('\\')
3258 } else {
3259 false
3260 };
3261
3262 // Calculate minimum indentation for list continuation
3263 // For ordered lists, use the last marker width (e.g., 3 for "1. ", 4 for "10. ")
3264 // For unordered lists like "- ", content starts at column 2, so continuations need at least 2 spaces
3265 let min_continuation_indent = if block.is_ordered {
3266 current_indent_level + last_marker_width
3267 } else {
3268 current_indent_level + 2 // Unordered lists need at least 2 spaces (e.g., "- " = 2 chars)
3269 };
3270
3271 if prev_line_ends_with_backslash || line_info.indent >= min_continuation_indent {
3272 // Indented line or backslash continuation continues the list
3273 if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3274 eprintln!(
3275 "[DEBUG] Line {}: indented continuation (indent={}, min={})",
3276 line_num, line_info.indent, min_continuation_indent
3277 );
3278 }
3279 block.end_line = line_num;
3280 } else if line_info.is_blank {
3281 // Blank line - check if it's internal to the list or ending it
3282 // We only include blank lines that are followed by more list content
3283 if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3284 eprintln!("[DEBUG] Line {line_num}: entering blank line handling");
3285 }
3286 let mut check_idx = line_idx + 1;
3287 let mut found_continuation = false;
3288
3289 // Skip additional blank lines
3290 while check_idx < lines.len() && lines[check_idx].is_blank {
3291 check_idx += 1;
3292 }
3293
3294 if check_idx < lines.len() {
3295 let next_line = &lines[check_idx];
3296 // For blockquote lines, compute indent AFTER stripping the blockquote prefix
3297 let next_content = next_line.content(content);
3298 // Use blockquote level (count of >) to compare, not the full prefix
3299 // This avoids issues where the regex captures extra whitespace
3300 let block_bq_level_for_indent = block.blockquote_prefix.chars().filter(|&c| c == '>').count();
3301 let next_bq_level_for_indent = next_content
3302 .chars()
3303 .take_while(|c| *c == '>' || c.is_whitespace())
3304 .filter(|&c| c == '>')
3305 .count();
3306 let effective_indent =
3307 if next_bq_level_for_indent > 0 && next_bq_level_for_indent == block_bq_level_for_indent {
3308 // For lines in the same blockquote context, compute indent after the blockquote marker(s)
3309 // Find position after ">" and one space
3310 let mut pos = 0;
3311 let mut found_markers = 0;
3312 for c in next_content.chars() {
3313 pos += c.len_utf8();
3314 if c == '>' {
3315 found_markers += 1;
3316 if found_markers == next_bq_level_for_indent {
3317 // Skip optional space after last >
3318 if next_content.get(pos..pos + 1) == Some(" ") {
3319 pos += 1;
3320 }
3321 break;
3322 }
3323 }
3324 }
3325 let after_blockquote_marker = &next_content[pos..];
3326 after_blockquote_marker.len() - after_blockquote_marker.trim_start().len()
3327 } else {
3328 next_line.indent
3329 };
3330 // Also adjust min_continuation_indent for blockquote lists
3331 // The marker_column includes blockquote prefix, so subtract it
3332 let adjusted_min_continuation = if block_bq_level_for_indent > 0 {
3333 // For blockquote lists, the continuation is relative to blockquote content
3334 // current_indent_level includes blockquote prefix (2 for "> "), so use just 2 for unordered
3335 if block.is_ordered { last_marker_width } else { 2 }
3336 } else {
3337 min_continuation_indent
3338 };
3339 // Check if followed by indented content (list continuation)
3340 if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3341 eprintln!(
3342 "[DEBUG] Blank line {} checking next line {}: effective_indent={}, adjusted_min={}, next_is_list={}, in_code_block={}",
3343 line_num,
3344 check_idx + 1,
3345 effective_indent,
3346 adjusted_min_continuation,
3347 next_line.list_item.is_some(),
3348 next_line.in_code_block
3349 );
3350 }
3351 if !next_line.in_code_block && effective_indent >= adjusted_min_continuation {
3352 found_continuation = true;
3353 }
3354 // Check if followed by another list item at the same level
3355 else if !next_line.in_code_block
3356 && next_line.list_item.is_some()
3357 && let Some(item) = &next_line.list_item
3358 {
3359 let next_blockquote_prefix = BLOCKQUOTE_PREFIX_REGEX
3360 .find(next_line.content(content))
3361 .map_or(String::new(), |m| m.as_str().to_string());
3362 if item.marker_column == current_indent_level
3363 && item.is_ordered == block.is_ordered
3364 && block.blockquote_prefix.trim() == next_blockquote_prefix.trim()
3365 {
3366 // Check if there was meaningful content between the list items (unused now)
3367 // This variable is kept for potential future use but is currently replaced by has_structural_separators
3368 // Pre-compute block's blockquote level for use in closures
3369 let block_bq_level = block.blockquote_prefix.chars().filter(|&c| c == '>').count();
3370 let _has_meaningful_content = (line_idx + 1..check_idx).any(|idx| {
3371 if let Some(between_line) = lines.get(idx) {
3372 let between_content = between_line.content(content);
3373 let trimmed = between_content.trim();
3374 // Skip empty lines
3375 if trimmed.is_empty() {
3376 return false;
3377 }
3378 // Check for meaningful content
3379 let line_indent = between_content.len() - between_content.trim_start().len();
3380
3381 // Check if blockquote level changed (not just if line starts with ">")
3382 let between_bq_prefix = BLOCKQUOTE_PREFIX_REGEX
3383 .find(between_content)
3384 .map_or(String::new(), |m| m.as_str().to_string());
3385 let between_bq_level = between_bq_prefix.chars().filter(|&c| c == '>').count();
3386 let blockquote_level_changed =
3387 trimmed.starts_with(">") && between_bq_level != block_bq_level;
3388
3389 // Structural separators (code fences, headings, etc.) are meaningful and should BREAK lists
3390 if trimmed.starts_with("```")
3391 || trimmed.starts_with("~~~")
3392 || trimmed.starts_with("---")
3393 || trimmed.starts_with("***")
3394 || trimmed.starts_with("___")
3395 || blockquote_level_changed
3396 || crate::utils::skip_context::is_table_line(trimmed)
3397 || between_line.heading.is_some()
3398 {
3399 return true; // These are structural separators - meaningful content that breaks lists
3400 }
3401
3402 // Only properly indented content continues the list
3403 line_indent >= min_continuation_indent
3404 } else {
3405 false
3406 }
3407 });
3408
3409 if block.is_ordered {
3410 // For ordered lists: don't continue if there are structural separators
3411 // Check if there are structural separators between the list items
3412 let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
3413 if let Some(between_line) = lines.get(idx) {
3414 let between_content = between_line.content(content);
3415 let trimmed = between_content.trim();
3416 if trimmed.is_empty() {
3417 return false;
3418 }
3419 // Check if blockquote level changed (not just if line starts with ">")
3420 let between_bq_prefix = BLOCKQUOTE_PREFIX_REGEX
3421 .find(between_content)
3422 .map_or(String::new(), |m| m.as_str().to_string());
3423 let between_bq_level =
3424 between_bq_prefix.chars().filter(|&c| c == '>').count();
3425 let blockquote_level_changed =
3426 trimmed.starts_with(">") && between_bq_level != block_bq_level;
3427 // Check for structural separators that break lists
3428 trimmed.starts_with("```")
3429 || trimmed.starts_with("~~~")
3430 || trimmed.starts_with("---")
3431 || trimmed.starts_with("***")
3432 || trimmed.starts_with("___")
3433 || blockquote_level_changed
3434 || crate::utils::skip_context::is_table_line(trimmed)
3435 || between_line.heading.is_some()
3436 } else {
3437 false
3438 }
3439 });
3440 found_continuation = !has_structural_separators;
3441 } else {
3442 // For unordered lists: also check for structural separators
3443 let has_structural_separators = (line_idx + 1..check_idx).any(|idx| {
3444 if let Some(between_line) = lines.get(idx) {
3445 let between_content = between_line.content(content);
3446 let trimmed = between_content.trim();
3447 if trimmed.is_empty() {
3448 return false;
3449 }
3450 // Check if blockquote level changed (not just if line starts with ">")
3451 let between_bq_prefix = BLOCKQUOTE_PREFIX_REGEX
3452 .find(between_content)
3453 .map_or(String::new(), |m| m.as_str().to_string());
3454 let between_bq_level =
3455 between_bq_prefix.chars().filter(|&c| c == '>').count();
3456 let blockquote_level_changed =
3457 trimmed.starts_with(">") && between_bq_level != block_bq_level;
3458 // Check for structural separators that break lists
3459 trimmed.starts_with("```")
3460 || trimmed.starts_with("~~~")
3461 || trimmed.starts_with("---")
3462 || trimmed.starts_with("***")
3463 || trimmed.starts_with("___")
3464 || blockquote_level_changed
3465 || crate::utils::skip_context::is_table_line(trimmed)
3466 || between_line.heading.is_some()
3467 } else {
3468 false
3469 }
3470 });
3471 found_continuation = !has_structural_separators;
3472 }
3473 }
3474 }
3475 }
3476
3477 if std::env::var("RUMDL_DEBUG_LIST").is_ok() {
3478 eprintln!("[DEBUG] Blank line {line_num} final: found_continuation={found_continuation}");
3479 }
3480 if found_continuation {
3481 // Include the blank line in the block
3482 block.end_line = line_num;
3483 } else {
3484 // Blank line ends the list - don't include it
3485 list_blocks.push(block.clone());
3486 current_block = None;
3487 }
3488 } else {
3489 // Check for lazy continuation - non-indented line immediately after a list item
3490 // But only if the line has sufficient indentation for the list type
3491 let min_required_indent = if block.is_ordered {
3492 current_indent_level + last_marker_width
3493 } else {
3494 current_indent_level + 2
3495 };
3496
3497 // For lazy continuation to apply, the line must either:
3498 // 1. Have no indentation (true lazy continuation)
3499 // 2. Have sufficient indentation for the list type
3500 // BUT structural separators (headings, code blocks, etc.) should never be lazy continuations
3501 let line_content = line_info.content(content).trim();
3502
3503 // Check for table-like patterns
3504 let looks_like_table = crate::utils::skip_context::is_table_line(line_content);
3505
3506 // Check if blockquote level changed (not just if line starts with ">")
3507 // Lines within the same blockquote level are NOT structural separators
3508 let block_bq_level = block.blockquote_prefix.chars().filter(|&c| c == '>').count();
3509 let current_bq_level = blockquote_prefix.chars().filter(|&c| c == '>').count();
3510 let blockquote_level_changed = line_content.starts_with(">") && current_bq_level != block_bq_level;
3511
3512 let is_structural_separator = line_info.heading.is_some()
3513 || line_content.starts_with("```")
3514 || line_content.starts_with("~~~")
3515 || line_content.starts_with("---")
3516 || line_content.starts_with("***")
3517 || line_content.starts_with("___")
3518 || blockquote_level_changed
3519 || looks_like_table;
3520
3521 // Allow lazy continuation if we're still within the same list block
3522 // (not just immediately after a list item)
3523 let is_lazy_continuation = !is_structural_separator
3524 && !line_info.is_blank
3525 && (line_info.indent == 0 || line_info.indent >= min_required_indent);
3526
3527 if is_lazy_continuation {
3528 // Additional check: if the line starts with uppercase and looks like a new sentence,
3529 // it's probably not a continuation
3530 // BUT: for blockquote lines with sufficient effective indent, always treat as continuation
3531 let line_content_raw = line_info.content(content);
3532 let block_bq_level_lazy = block.blockquote_prefix.chars().filter(|&c| c == '>').count();
3533 let line_bq_level_lazy = line_content_raw
3534 .chars()
3535 .take_while(|c| *c == '>' || c.is_whitespace())
3536 .filter(|&c| c == '>')
3537 .count();
3538 let has_proper_blockquote_indent =
3539 if line_bq_level_lazy > 0 && line_bq_level_lazy == block_bq_level_lazy {
3540 // Compute effective indent after blockquote markers
3541 let mut pos = 0;
3542 let mut found_markers = 0;
3543 for c in line_content_raw.chars() {
3544 pos += c.len_utf8();
3545 if c == '>' {
3546 found_markers += 1;
3547 if found_markers == line_bq_level_lazy {
3548 if line_content_raw.get(pos..pos + 1) == Some(" ") {
3549 pos += 1;
3550 }
3551 break;
3552 }
3553 }
3554 }
3555 let after_bq = &line_content_raw[pos..];
3556 let effective_indent_lazy = after_bq.len() - after_bq.trim_start().len();
3557 let min_required_for_bq = if block.is_ordered { last_marker_width } else { 2 };
3558 effective_indent_lazy >= min_required_for_bq
3559 } else {
3560 false
3561 };
3562
3563 // If it has proper blockquote indent, it's a continuation regardless of uppercase
3564 if has_proper_blockquote_indent {
3565 block.end_line = line_num;
3566 } else {
3567 let content_to_check = if !blockquote_prefix.is_empty() {
3568 // Strip blockquote prefix to check the actual content
3569 line_info
3570 .content(content)
3571 .strip_prefix(&blockquote_prefix)
3572 .unwrap_or(line_info.content(content))
3573 .trim()
3574 } else {
3575 line_info.content(content).trim()
3576 };
3577
3578 let starts_with_uppercase =
3579 content_to_check.chars().next().is_some_and(|c| c.is_uppercase());
3580
3581 // If it starts with uppercase and the previous line ended with punctuation,
3582 // it's likely a new paragraph, not a continuation
3583 if starts_with_uppercase && last_list_item_line > 0 {
3584 // This looks like a new paragraph
3585 list_blocks.push(block.clone());
3586 current_block = None;
3587 } else {
3588 // This is a lazy continuation line
3589 block.end_line = line_num;
3590 }
3591 }
3592 } else {
3593 // Non-indented, non-blank line that's not a lazy continuation - end the block
3594 list_blocks.push(block.clone());
3595 current_block = None;
3596 }
3597 }
3598 }
3599 }
3600
3601 // Don't forget the last block
3602 if let Some(block) = current_block {
3603 list_blocks.push(block);
3604 }
3605
3606 // Merge adjacent blocks that should be one
3607 merge_adjacent_list_blocks(content, &mut list_blocks, lines);
3608
3609 list_blocks
3610 }
3611
3612 /// Compute character frequency for fast content analysis
3613 fn compute_char_frequency(content: &str) -> CharFrequency {
3614 let mut frequency = CharFrequency::default();
3615
3616 for ch in content.chars() {
3617 match ch {
3618 '#' => frequency.hash_count += 1,
3619 '*' => frequency.asterisk_count += 1,
3620 '_' => frequency.underscore_count += 1,
3621 '-' => frequency.hyphen_count += 1,
3622 '+' => frequency.plus_count += 1,
3623 '>' => frequency.gt_count += 1,
3624 '|' => frequency.pipe_count += 1,
3625 '[' => frequency.bracket_count += 1,
3626 '`' => frequency.backtick_count += 1,
3627 '<' => frequency.lt_count += 1,
3628 '!' => frequency.exclamation_count += 1,
3629 '\n' => frequency.newline_count += 1,
3630 _ => {}
3631 }
3632 }
3633
3634 frequency
3635 }
3636
3637 /// Parse HTML tags in the content
3638 fn parse_html_tags(
3639 content: &str,
3640 lines: &[LineInfo],
3641 code_blocks: &[(usize, usize)],
3642 flavor: MarkdownFlavor,
3643 ) -> Vec<HtmlTag> {
3644 static HTML_TAG_REGEX: LazyLock<regex::Regex> =
3645 LazyLock::new(|| regex::Regex::new(r"(?i)<(/?)([a-zA-Z][a-zA-Z0-9-]*)(?:\s+[^>]*?)?\s*(/?)>").unwrap());
3646
3647 let mut html_tags = Vec::with_capacity(content.matches('<').count());
3648
3649 for cap in HTML_TAG_REGEX.captures_iter(content) {
3650 let full_match = cap.get(0).unwrap();
3651 let match_start = full_match.start();
3652 let match_end = full_match.end();
3653
3654 // Skip if in code block
3655 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3656 continue;
3657 }
3658
3659 let is_closing = !cap.get(1).unwrap().as_str().is_empty();
3660 let tag_name_original = cap.get(2).unwrap().as_str();
3661 let tag_name = tag_name_original.to_lowercase();
3662 let is_self_closing = !cap.get(3).unwrap().as_str().is_empty();
3663
3664 // Skip JSX components in MDX files (tags starting with uppercase letter)
3665 // JSX components like <Chart />, <MyComponent> should not be treated as HTML
3666 if flavor.supports_jsx() && tag_name_original.chars().next().is_some_and(|c| c.is_uppercase()) {
3667 continue;
3668 }
3669
3670 // Find which line this tag is on
3671 let mut line_num = 1;
3672 let mut col_start = match_start;
3673 let mut col_end = match_end;
3674 for (idx, line_info) in lines.iter().enumerate() {
3675 if match_start >= line_info.byte_offset {
3676 line_num = idx + 1;
3677 col_start = match_start - line_info.byte_offset;
3678 col_end = match_end - line_info.byte_offset;
3679 } else {
3680 break;
3681 }
3682 }
3683
3684 html_tags.push(HtmlTag {
3685 line: line_num,
3686 start_col: col_start,
3687 end_col: col_end,
3688 byte_offset: match_start,
3689 byte_end: match_end,
3690 tag_name,
3691 is_closing,
3692 is_self_closing,
3693 raw_content: full_match.as_str().to_string(),
3694 });
3695 }
3696
3697 html_tags
3698 }
3699
3700 /// Parse table rows in the content
3701 fn parse_table_rows(content: &str, lines: &[LineInfo]) -> Vec<TableRow> {
3702 let mut table_rows = Vec::with_capacity(lines.len() / 20);
3703
3704 for (line_idx, line_info) in lines.iter().enumerate() {
3705 // Skip lines in code blocks or blank lines
3706 if line_info.in_code_block || line_info.is_blank {
3707 continue;
3708 }
3709
3710 let line = line_info.content(content);
3711 let line_num = line_idx + 1;
3712
3713 // Check if this line contains pipes (potential table row)
3714 if !line.contains('|') {
3715 continue;
3716 }
3717
3718 // Count columns by splitting on pipes
3719 let parts: Vec<&str> = line.split('|').collect();
3720 let column_count = if parts.len() > 2 { parts.len() - 2 } else { parts.len() };
3721
3722 // Check if this is a separator row
3723 let is_separator = line.chars().all(|c| "|:-+ \t".contains(c));
3724 let mut column_alignments = Vec::new();
3725
3726 if is_separator {
3727 for part in &parts[1..parts.len() - 1] {
3728 // Skip first and last empty parts
3729 let trimmed = part.trim();
3730 let alignment = if trimmed.starts_with(':') && trimmed.ends_with(':') {
3731 "center".to_string()
3732 } else if trimmed.ends_with(':') {
3733 "right".to_string()
3734 } else if trimmed.starts_with(':') {
3735 "left".to_string()
3736 } else {
3737 "none".to_string()
3738 };
3739 column_alignments.push(alignment);
3740 }
3741 }
3742
3743 table_rows.push(TableRow {
3744 line: line_num,
3745 is_separator,
3746 column_count,
3747 column_alignments,
3748 });
3749 }
3750
3751 table_rows
3752 }
3753
3754 /// Parse bare URLs and emails in the content
3755 fn parse_bare_urls(content: &str, lines: &[LineInfo], code_blocks: &[(usize, usize)]) -> Vec<BareUrl> {
3756 let mut bare_urls = Vec::with_capacity(content.matches("http").count() + content.matches('@').count());
3757
3758 // Check for bare URLs (not in angle brackets or markdown links)
3759 for cap in URL_SIMPLE_REGEX.captures_iter(content) {
3760 let full_match = cap.get(0).unwrap();
3761 let match_start = full_match.start();
3762 let match_end = full_match.end();
3763
3764 // Skip if in code block
3765 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3766 continue;
3767 }
3768
3769 // Skip if already in angle brackets or markdown links
3770 let preceding_char = if match_start > 0 {
3771 content.chars().nth(match_start - 1)
3772 } else {
3773 None
3774 };
3775 let following_char = content.chars().nth(match_end);
3776
3777 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
3778 continue;
3779 }
3780 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
3781 continue;
3782 }
3783
3784 let url = full_match.as_str();
3785 let url_type = if url.starts_with("https://") {
3786 "https"
3787 } else if url.starts_with("http://") {
3788 "http"
3789 } else if url.starts_with("ftp://") {
3790 "ftp"
3791 } else {
3792 "other"
3793 };
3794
3795 // Find which line this URL is on
3796 let mut line_num = 1;
3797 let mut col_start = match_start;
3798 let mut col_end = match_end;
3799 for (idx, line_info) in lines.iter().enumerate() {
3800 if match_start >= line_info.byte_offset {
3801 line_num = idx + 1;
3802 col_start = match_start - line_info.byte_offset;
3803 col_end = match_end - line_info.byte_offset;
3804 } else {
3805 break;
3806 }
3807 }
3808
3809 bare_urls.push(BareUrl {
3810 line: line_num,
3811 start_col: col_start,
3812 end_col: col_end,
3813 byte_offset: match_start,
3814 byte_end: match_end,
3815 url: url.to_string(),
3816 url_type: url_type.to_string(),
3817 });
3818 }
3819
3820 // Check for bare email addresses
3821 for cap in BARE_EMAIL_PATTERN.captures_iter(content) {
3822 let full_match = cap.get(0).unwrap();
3823 let match_start = full_match.start();
3824 let match_end = full_match.end();
3825
3826 // Skip if in code block
3827 if CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
3828 continue;
3829 }
3830
3831 // Skip if already in angle brackets or markdown links
3832 let preceding_char = if match_start > 0 {
3833 content.chars().nth(match_start - 1)
3834 } else {
3835 None
3836 };
3837 let following_char = content.chars().nth(match_end);
3838
3839 if preceding_char == Some('<') || preceding_char == Some('(') || preceding_char == Some('[') {
3840 continue;
3841 }
3842 if following_char == Some('>') || following_char == Some(')') || following_char == Some(']') {
3843 continue;
3844 }
3845
3846 let email = full_match.as_str();
3847
3848 // Find which line this email is on
3849 let mut line_num = 1;
3850 let mut col_start = match_start;
3851 let mut col_end = match_end;
3852 for (idx, line_info) in lines.iter().enumerate() {
3853 if match_start >= line_info.byte_offset {
3854 line_num = idx + 1;
3855 col_start = match_start - line_info.byte_offset;
3856 col_end = match_end - line_info.byte_offset;
3857 } else {
3858 break;
3859 }
3860 }
3861
3862 bare_urls.push(BareUrl {
3863 line: line_num,
3864 start_col: col_start,
3865 end_col: col_end,
3866 byte_offset: match_start,
3867 byte_end: match_end,
3868 url: email.to_string(),
3869 url_type: "email".to_string(),
3870 });
3871 }
3872
3873 bare_urls
3874 }
3875
    /// Get an iterator over valid CommonMark headings
    ///
    /// This iterator filters out malformed headings like `#NoSpace` (hashtag-like patterns)
    /// that should be flagged by MD018 but should not be processed by other heading rules.
    ///
    /// The iterator is built from a borrow of this context's line data, so it cannot
    /// outlive `self`.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use rumdl_lib::lint_context::LintContext;
    /// use rumdl_lib::config::MarkdownFlavor;
    ///
    /// let content = "# Valid Heading\n#NoSpace\n## Another Valid";
    /// let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
    ///
    /// for heading in ctx.valid_headings() {
    ///     println!("Line {}: {} (level {})", heading.line_num, heading.heading.text, heading.heading.level);
    /// }
    /// // Only prints valid headings, skips `#NoSpace`
    /// ```
    #[must_use]
    pub fn valid_headings(&self) -> ValidHeadingsIter<'_> {
        ValidHeadingsIter::new(&self.lines)
    }
3899
3900 /// Check if the document contains any valid CommonMark headings
3901 ///
3902 /// Returns `true` if there is at least one heading with proper space after `#`.
3903 #[must_use]
3904 pub fn has_valid_headings(&self) -> bool {
3905 self.lines
3906 .iter()
3907 .any(|line| line.heading.as_ref().is_some_and(|h| h.is_valid))
3908 }
3909}
3910
3911/// Merge adjacent list blocks that should be treated as one
3912fn merge_adjacent_list_blocks(content: &str, list_blocks: &mut Vec<ListBlock>, lines: &[LineInfo]) {
3913 if list_blocks.len() < 2 {
3914 return;
3915 }
3916
3917 let mut merger = ListBlockMerger::new(content, lines);
3918 *list_blocks = merger.merge(list_blocks);
3919}
3920
3921/// Helper struct to manage the complex logic of merging list blocks
3922struct ListBlockMerger<'a> {
3923 content: &'a str,
3924 lines: &'a [LineInfo],
3925}
3926
3927impl<'a> ListBlockMerger<'a> {
3928 fn new(content: &'a str, lines: &'a [LineInfo]) -> Self {
3929 Self { content, lines }
3930 }
3931
3932 fn merge(&mut self, list_blocks: &[ListBlock]) -> Vec<ListBlock> {
3933 let mut merged = Vec::with_capacity(list_blocks.len());
3934 let mut current = list_blocks[0].clone();
3935
3936 for next in list_blocks.iter().skip(1) {
3937 if self.should_merge_blocks(¤t, next) {
3938 current = self.merge_two_blocks(current, next);
3939 } else {
3940 merged.push(current);
3941 current = next.clone();
3942 }
3943 }
3944
3945 merged.push(current);
3946 merged
3947 }
3948
3949 /// Determine if two adjacent list blocks should be merged
3950 fn should_merge_blocks(&self, current: &ListBlock, next: &ListBlock) -> bool {
3951 // Basic compatibility checks
3952 if !self.blocks_are_compatible(current, next) {
3953 return false;
3954 }
3955
3956 // Check spacing and content between blocks
3957 let spacing = self.analyze_spacing_between(current, next);
3958 match spacing {
3959 BlockSpacing::Consecutive => true,
3960 BlockSpacing::SingleBlank => self.can_merge_with_blank_between(current, next),
3961 BlockSpacing::MultipleBlanks | BlockSpacing::ContentBetween => {
3962 self.can_merge_with_content_between(current, next)
3963 }
3964 }
3965 }
3966
3967 /// Check if blocks have compatible structure for merging
3968 fn blocks_are_compatible(&self, current: &ListBlock, next: &ListBlock) -> bool {
3969 current.is_ordered == next.is_ordered
3970 && current.blockquote_prefix == next.blockquote_prefix
3971 && current.nesting_level == next.nesting_level
3972 }
3973
3974 /// Analyze the spacing between two list blocks
3975 fn analyze_spacing_between(&self, current: &ListBlock, next: &ListBlock) -> BlockSpacing {
3976 let gap = next.start_line - current.end_line;
3977
3978 match gap {
3979 1 => BlockSpacing::Consecutive,
3980 2 => BlockSpacing::SingleBlank,
3981 _ if gap > 2 => {
3982 if self.has_only_blank_lines_between(current, next) {
3983 BlockSpacing::MultipleBlanks
3984 } else {
3985 BlockSpacing::ContentBetween
3986 }
3987 }
3988 _ => BlockSpacing::Consecutive, // gap == 0, overlapping (shouldn't happen)
3989 }
3990 }
3991
3992 /// Check if unordered lists can be merged with a single blank line between
3993 fn can_merge_with_blank_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
3994 // Check if there are structural separators between the blocks
3995 // If has_meaningful_content_between returns true, it means there are structural separators
3996 if has_meaningful_content_between(self.content, current, next, self.lines) {
3997 return false; // Structural separators prevent merging
3998 }
3999
4000 // Only merge unordered lists with same marker across single blank
4001 !current.is_ordered && current.marker == next.marker
4002 }
4003
4004 /// Check if ordered lists can be merged when there's content between them
4005 fn can_merge_with_content_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
4006 // Do not merge lists if there are structural separators between them
4007 if has_meaningful_content_between(self.content, current, next, self.lines) {
4008 return false; // Structural separators prevent merging
4009 }
4010
4011 // Only consider merging ordered lists if there's no structural content between
4012 current.is_ordered && next.is_ordered
4013 }
4014
4015 /// Check if there are only blank lines between blocks
4016 fn has_only_blank_lines_between(&self, current: &ListBlock, next: &ListBlock) -> bool {
4017 for line_num in (current.end_line + 1)..next.start_line {
4018 if let Some(line_info) = self.lines.get(line_num - 1)
4019 && !line_info.content(self.content).trim().is_empty()
4020 {
4021 return false;
4022 }
4023 }
4024 true
4025 }
4026
4027 /// Merge two compatible list blocks into one
4028 fn merge_two_blocks(&self, mut current: ListBlock, next: &ListBlock) -> ListBlock {
4029 current.end_line = next.end_line;
4030 current.item_lines.extend_from_slice(&next.item_lines);
4031
4032 // Update max marker width
4033 current.max_marker_width = current.max_marker_width.max(next.max_marker_width);
4034
4035 // Handle marker consistency for unordered lists
4036 if !current.is_ordered && self.markers_differ(¤t, next) {
4037 current.marker = None; // Mixed markers
4038 }
4039
4040 current
4041 }
4042
4043 /// Check if two blocks have different markers
4044 fn markers_differ(&self, current: &ListBlock, next: &ListBlock) -> bool {
4045 current.marker.is_some() && next.marker.is_some() && current.marker != next.marker
4046 }
4047}
4048
/// Types of spacing between list blocks
#[derive(Debug, PartialEq)]
enum BlockSpacing {
    /// No gap between blocks: the next block starts on the line right after the
    /// current block ends.
    Consecutive,
    /// One blank line between blocks
    SingleBlank,
    /// Multiple blank lines but no content
    MultipleBlanks,
    /// Content exists between blocks
    ContentBetween,
}
4057
4058/// Check if there's meaningful content (not just blank lines) between two list blocks
4059fn has_meaningful_content_between(content: &str, current: &ListBlock, next: &ListBlock, lines: &[LineInfo]) -> bool {
4060 // Check lines between current.end_line and next.start_line
4061 for line_num in (current.end_line + 1)..next.start_line {
4062 if let Some(line_info) = lines.get(line_num - 1) {
4063 // Convert to 0-indexed
4064 let trimmed = line_info.content(content).trim();
4065
4066 // Skip empty lines
4067 if trimmed.is_empty() {
4068 continue;
4069 }
4070
4071 // Check for structural separators that should separate lists (CommonMark compliant)
4072
4073 // Headings separate lists
4074 if line_info.heading.is_some() {
4075 return true; // Has meaningful content - headings separate lists
4076 }
4077
4078 // Horizontal rules separate lists (---, ***, ___)
4079 if is_horizontal_rule(trimmed) {
4080 return true; // Has meaningful content - horizontal rules separate lists
4081 }
4082
4083 // Tables separate lists
4084 if crate::utils::skip_context::is_table_line(trimmed) {
4085 return true; // Has meaningful content - tables separate lists
4086 }
4087
4088 // Blockquotes separate lists
4089 if trimmed.starts_with('>') {
4090 return true; // Has meaningful content - blockquotes separate lists
4091 }
4092
4093 // Code block fences separate lists (unless properly indented as list content)
4094 if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
4095 let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
4096
4097 // Check if this code block is properly indented as list continuation
4098 let min_continuation_indent = if current.is_ordered {
4099 current.nesting_level + current.max_marker_width + 1 // +1 for space after marker
4100 } else {
4101 current.nesting_level + 2
4102 };
4103
4104 if line_indent < min_continuation_indent {
4105 // This is a standalone code block that separates lists
4106 return true; // Has meaningful content - standalone code blocks separate lists
4107 }
4108 }
4109
4110 // Check if this line has proper indentation for list continuation
4111 let line_indent = line_info.byte_len - line_info.content(content).trim_start().len();
4112
4113 // Calculate minimum indentation needed to be list continuation
4114 let min_indent = if current.is_ordered {
4115 current.nesting_level + current.max_marker_width
4116 } else {
4117 current.nesting_level + 2
4118 };
4119
4120 // If the line is not indented enough to be list continuation, it's meaningful content
4121 if line_indent < min_indent {
4122 return true; // Has meaningful content - content not indented as list continuation
4123 }
4124
4125 // If we reach here, the line is properly indented as list continuation
4126 // Continue checking other lines
4127 }
4128 }
4129
4130 // Only blank lines or properly indented list continuation content between blocks
4131 false
4132}
4133
4134/// Check if a line is a horizontal rule (---, ***, ___) per CommonMark spec.
4135/// CommonMark rules for thematic breaks (horizontal rules):
4136/// - May have 0-3 spaces of leading indentation (but NOT tabs)
4137/// - Must have 3+ of the same character (-, *, or _)
4138/// - May have spaces between characters
4139/// - No other characters allowed
4140pub fn is_horizontal_rule_line(line: &str) -> bool {
4141 // CommonMark: HRs can have 0-3 spaces of leading indentation, not tabs
4142 let leading_spaces = line.len() - line.trim_start_matches(' ').len();
4143 if leading_spaces > 3 || line.starts_with('\t') {
4144 return false;
4145 }
4146
4147 is_horizontal_rule_content(line.trim())
4148}
4149
/// Check if trimmed content matches horizontal rule pattern.
/// Use `is_horizontal_rule_line` for full CommonMark compliance including indentation check.
///
/// The content matches when its first character is `-`, `*` or `_`, every other
/// character is either that same marker or a space/tab, and the marker occurs at
/// least three times.
pub fn is_horizontal_rule_content(trimmed: &str) -> bool {
    // The first character selects the marker; anything else cannot be a rule.
    let marker = match trimmed.chars().next() {
        Some(c @ ('-' | '*' | '_')) => c,
        _ => return false,
    };

    // Single pass, no intermediate allocation: count markers and bail out on the first
    // character that is neither the marker nor blank.
    let mut marker_count = 0usize;
    for ch in trimmed.chars() {
        if ch == marker {
            marker_count += 1;
        } else if ch != ' ' && ch != '\t' {
            return false;
        }
    }

    marker_count >= 3
}
4174
/// Backwards-compatible alias for `is_horizontal_rule_content`
///
/// Forwards `trimmed` unchanged and returns that function's result; no indentation
/// check is performed here.
pub fn is_horizontal_rule(trimmed: &str) -> bool {
    is_horizontal_rule_content(trimmed)
}
4179
/// Unit tests for `LintContext`: offset/line mapping, per-line info, list item
/// detection, and MDX ESM block detection.
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input: a single line offset, no line infos.
    #[test]
    fn test_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::Standard, None);
        assert_eq!(ctx.content, "");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.lines.len(), 0);
    }

    /// Single line without a trailing newline.
    #[test]
    fn test_single_line() {
        let ctx = LintContext::new("# Hello", MarkdownFlavor::Standard, None);
        assert_eq!(ctx.content, "# Hello");
        assert_eq!(ctx.line_offsets, vec![0]);
        assert_eq!(ctx.offset_to_line_col(0), (1, 1));
        assert_eq!(ctx.offset_to_line_col(3), (1, 4));
    }

    /// Line offsets and offset -> (line, column) mapping across several lines.
    #[test]
    fn test_multi_line() {
        let content = "# Title\n\nSecond line\nThird line";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
        assert_eq!(ctx.line_offsets, vec![0, 8, 9, 21]);
        // Test offset to line/col
        assert_eq!(ctx.offset_to_line_col(0), (1, 1)); // start
        assert_eq!(ctx.offset_to_line_col(8), (2, 1)); // start of blank line
        assert_eq!(ctx.offset_to_line_col(9), (3, 1)); // start of 'Second line'
        assert_eq!(ctx.offset_to_line_col(15), (3, 7)); // middle of 'Second line'
        assert_eq!(ctx.offset_to_line_col(21), (4, 1)); // start of 'Third line'
    }

    /// Per-line metadata: content, byte offset, indentation and blank detection.
    #[test]
    fn test_line_info() {
        // Line 2 is indented by four spaces so that the `indent == 4` assertions hold.
        let content = "# Title\n    indented\n\ncode:\n```rust\nfn main() {}\n```";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);

        // Test line info
        assert_eq!(ctx.lines.len(), 7);

        // Line 1: "# Title"
        let line1 = &ctx.lines[0];
        assert_eq!(line1.content(ctx.content), "# Title");
        assert_eq!(line1.byte_offset, 0);
        assert_eq!(line1.indent, 0);
        assert!(!line1.is_blank);
        assert!(!line1.in_code_block);
        assert!(line1.list_item.is_none());

        // Line 2: "    indented"
        let line2 = &ctx.lines[1];
        assert_eq!(line2.content(ctx.content), "    indented");
        assert_eq!(line2.byte_offset, 8);
        assert_eq!(line2.indent, 4);
        assert!(!line2.is_blank);

        // Line 3: "" (blank)
        let line3 = &ctx.lines[2];
        assert_eq!(line3.content(ctx.content), "");
        assert!(line3.is_blank);

        // Test helper methods
        assert_eq!(ctx.line_to_byte_offset(1), Some(0));
        assert_eq!(ctx.line_to_byte_offset(2), Some(8));
        assert_eq!(ctx.line_info(1).map(|l| l.indent), Some(0));
        assert_eq!(ctx.line_info(2).map(|l| l.indent), Some(4));
    }

    /// List item detection for unordered, nested and ordered markers.
    #[test]
    fn test_list_item_detection() {
        // The nested items carry real indentation (two and three spaces) so that the
        // `marker_column == 2` assertion on line 2 holds.
        let content = "- Unordered item\n  * Nested item\n1. Ordered item\n   2) Nested ordered\n\nNot a list";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);

        // Line 1: "- Unordered item"
        let line1 = &ctx.lines[0];
        assert!(line1.list_item.is_some());
        let list1 = line1.list_item.as_ref().unwrap();
        assert_eq!(list1.marker, "-");
        assert!(!list1.is_ordered);
        assert_eq!(list1.marker_column, 0);
        assert_eq!(list1.content_column, 2);

        // Line 2: "  * Nested item"
        let line2 = &ctx.lines[1];
        assert!(line2.list_item.is_some());
        let list2 = line2.list_item.as_ref().unwrap();
        assert_eq!(list2.marker, "*");
        assert_eq!(list2.marker_column, 2);

        // Line 3: "1. Ordered item"
        let line3 = &ctx.lines[2];
        assert!(line3.list_item.is_some());
        let list3 = line3.list_item.as_ref().unwrap();
        assert_eq!(list3.marker, "1.");
        assert!(list3.is_ordered);
        assert_eq!(list3.number, Some(1));

        // Line 6: "Not a list"
        let line6 = &ctx.lines[5];
        assert!(line6.list_item.is_none());
    }

    /// Offsets at line starts, line ends and one past the final character.
    #[test]
    fn test_offset_to_line_col_edge_cases() {
        let content = "a\nb\nc";
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
        // line_offsets: [0, 2, 4]
        assert_eq!(ctx.offset_to_line_col(0), (1, 1)); // 'a'
        assert_eq!(ctx.offset_to_line_col(1), (1, 2)); // after 'a'
        assert_eq!(ctx.offset_to_line_col(2), (2, 1)); // 'b'
        assert_eq!(ctx.offset_to_line_col(3), (2, 2)); // after 'b'
        assert_eq!(ctx.offset_to_line_col(4), (3, 1)); // 'c'
        assert_eq!(ctx.offset_to_line_col(5), (3, 2)); // after 'c'
    }

    /// In MDX flavor, leading `import` / `export` lines are flagged as ESM blocks.
    #[test]
    fn test_mdx_esm_blocks() {
        let content = r##"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall

In {year}, the snowfall was above average.
It was followed by a warm spring which caused
flood conditions in many of the nearby rivers.

<Chart color="#fcb32c" year={year} />
"##;

        let ctx = LintContext::new(content, MarkdownFlavor::MDX, None);

        // Check that lines 1 and 2 are marked as ESM blocks
        assert_eq!(ctx.lines.len(), 10);
        assert!(ctx.lines[0].in_esm_block, "Line 1 (import) should be in_esm_block");
        assert!(ctx.lines[1].in_esm_block, "Line 2 (export) should be in_esm_block");
        assert!(!ctx.lines[2].in_esm_block, "Line 3 (blank) should NOT be in_esm_block");
        assert!(
            !ctx.lines[3].in_esm_block,
            "Line 4 (heading) should NOT be in_esm_block"
        );
        assert!(!ctx.lines[4].in_esm_block, "Line 5 (blank) should NOT be in_esm_block");
        assert!(!ctx.lines[5].in_esm_block, "Line 6 (text) should NOT be in_esm_block");
    }

    /// The same `import` / `export` lines are plain text in Standard flavor.
    #[test]
    fn test_mdx_esm_blocks_not_detected_in_standard_flavor() {
        let content = r#"import {Chart} from './snowfall.js'
export const year = 2023

# Last year's snowfall
"#;

        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);

        // ESM blocks should NOT be detected in Standard flavor
        assert!(
            !ctx.lines[0].in_esm_block,
            "Line 1 should NOT be in_esm_block in Standard flavor"
        );
        assert!(
            !ctx.lines[1].in_esm_block,
            "Line 2 should NOT be in_esm_block in Standard flavor"
        );
    }
}