rumdl_lib/utils/
code_block_utils.rs

1//!
2//! Utility functions for detecting and handling code blocks and code spans in Markdown for rumdl.
3//!
4//! Code block detection is delegated to pulldown-cmark, which correctly implements the
5//! CommonMark specification. This handles edge cases like:
6//! - Backtick fences with backticks in the info string (invalid per spec)
7//! - Nested fences (longer fence contains shorter fence as content)
8//! - Mixed fence types (tilde fence contains backticks as content)
9//! - Indented code blocks with proper list context handling
10
11use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
12
/// How a code block relates to the list structure around it.
///
/// Produced by [`CodeBlockUtils::analyze_code_block_context`].
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum CodeBlockContext {
    /// The block is not indented enough to belong to a list and has a blank
    /// line on at least one side, so it separates the surrounding lists.
    Standalone,
    /// The block is indented far enough to count as a continuation of the
    /// enclosing list item.
    Indented,
    /// The block touches list content without blank-line separation. This is
    /// the conservative default for edge cases and is treated as non-breaking.
    Adjacent,
}
23
/// Namespace type for the Markdown code block helpers.
///
/// The struct carries no data; every helper is an associated function.
pub struct CodeBlockUtils;
26
27impl CodeBlockUtils {
28    /// Detect all code blocks in the content (NOT including inline code spans)
29    ///
30    /// Uses pulldown-cmark for spec-compliant CommonMark parsing. This correctly handles:
31    /// - Fenced code blocks (``` and ~~~)
32    /// - Indented code blocks (4 spaces or tab)
33    /// - Code blocks inside lists, blockquotes, and other containers
34    /// - Edge cases like backticks in info strings (which invalidate the fence)
35    ///
36    /// Returns a sorted vector of (start, end) byte offset tuples.
37    pub fn detect_code_blocks(content: &str) -> Vec<(usize, usize)> {
38        let mut blocks = Vec::new();
39        let mut code_block_start: Option<usize> = None;
40
41        // Use pulldown-cmark with all extensions for maximum compatibility
42        let options = Options::all();
43        let parser = Parser::new_ext(content, options).into_offset_iter();
44
45        for (event, range) in parser {
46            match event {
47                Event::Start(Tag::CodeBlock(_)) => {
48                    // Record start position of code block
49                    code_block_start = Some(range.start);
50                }
51                Event::End(TagEnd::CodeBlock) => {
52                    // Complete the code block range
53                    if let Some(start) = code_block_start.take() {
54                        blocks.push((start, range.end));
55                    }
56                }
57                _ => {}
58            }
59        }
60
61        // Handle edge case: unclosed code block at end of content
62        // pulldown-cmark should handle this, but be defensive
63        if let Some(start) = code_block_start {
64            blocks.push((start, content.len()));
65        }
66
67        // Sort by start position (should already be sorted, but ensure consistency)
68        blocks.sort_by_key(|&(start, _)| start);
69        blocks
70    }
71
72    /// Check if a position is within a code block (for compatibility)
73    pub fn is_in_code_block_or_span(blocks: &[(usize, usize)], pos: usize) -> bool {
74        // This is a compatibility function - it only checks code blocks now, not spans
75        blocks.iter().any(|&(start, end)| pos >= start && pos < end)
76    }
77
78    /// Check if a position is within a code block (NOT including inline code spans)
79    pub fn is_in_code_block(blocks: &[(usize, usize)], pos: usize) -> bool {
80        blocks.iter().any(|&(start, end)| pos >= start && pos < end)
81    }
82
83    /// Analyze code block context relative to list parsing
84    /// This is the core function implementing Design #3's three-tier classification
85    pub fn analyze_code_block_context(
86        lines: &[crate::lint_context::LineInfo],
87        line_idx: usize,
88        min_continuation_indent: usize,
89    ) -> CodeBlockContext {
90        if let Some(line_info) = lines.get(line_idx) {
91            // Rule 1: Indentation Analysis - Is it sufficiently indented for list continuation?
92            if line_info.indent >= min_continuation_indent {
93                return CodeBlockContext::Indented;
94            }
95
96            // Rule 2: Blank Line Context - Check for structural separation indicators
97            let (prev_blanks, next_blanks) = Self::count_surrounding_blank_lines(lines, line_idx);
98
99            // Rule 3: Standalone Detection - Insufficient indentation + blank line separation
100            // This is the key fix: root-level code blocks with blank lines separate lists
101            if prev_blanks > 0 || next_blanks > 0 {
102                return CodeBlockContext::Standalone;
103            }
104
105            // Rule 4: Default - Adjacent (conservative, non-breaking for edge cases)
106            CodeBlockContext::Adjacent
107        } else {
108            // Fallback for invalid line index
109            CodeBlockContext::Adjacent
110        }
111    }
112
113    /// Count blank lines before and after the given line index
114    fn count_surrounding_blank_lines(lines: &[crate::lint_context::LineInfo], line_idx: usize) -> (usize, usize) {
115        let mut prev_blanks = 0;
116        let mut next_blanks = 0;
117
118        // Count blank lines before (look backwards)
119        for i in (0..line_idx).rev() {
120            if let Some(line) = lines.get(i) {
121                if line.is_blank {
122                    prev_blanks += 1;
123                } else {
124                    break;
125                }
126            } else {
127                break;
128            }
129        }
130
131        // Count blank lines after (look forwards)
132        for i in (line_idx + 1)..lines.len() {
133            if let Some(line) = lines.get(i) {
134                if line.is_blank {
135                    next_blanks += 1;
136                } else {
137                    break;
138                }
139            } else {
140                break;
141            }
142        }
143
144        (prev_blanks, next_blanks)
145    }
146
147    /// Calculate minimum indentation required for code block to continue a list
148    /// Based on the most recent list item's marker width
149    pub fn calculate_min_continuation_indent(
150        content: &str,
151        lines: &[crate::lint_context::LineInfo],
152        current_line_idx: usize,
153    ) -> usize {
154        // Look backwards to find the most recent list item
155        for i in (0..current_line_idx).rev() {
156            if let Some(line_info) = lines.get(i) {
157                if let Some(list_item) = &line_info.list_item {
158                    // Calculate minimum continuation indent for this list item
159                    return if list_item.is_ordered {
160                        list_item.marker_column + list_item.marker.len() + 1 // +1 for space after marker
161                    } else {
162                        list_item.marker_column + 2 // Unordered lists need marker + space (min 2)
163                    };
164                }
165
166                // Stop at structural separators that would break list context
167                if line_info.heading.is_some() || Self::is_structural_separator(line_info.content(content)) {
168                    break;
169                }
170            }
171        }
172
173        0 // No list context found
174    }
175
176    /// Check if content is a structural separator (headings, horizontal rules, etc.)
177    fn is_structural_separator(content: &str) -> bool {
178        let trimmed = content.trim();
179        trimmed.starts_with("---")
180            || trimmed.starts_with("***")
181            || trimmed.starts_with("___")
182            || crate::utils::skip_context::is_table_line(trimmed)
183            || trimmed.starts_with(">") // Blockquotes
184    }
185}
186
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns the part of `content` covered by a detected `(start, end)` range.
    fn block_text(content: &str, (start, end): (usize, usize)) -> &str {
        &content[start..end]
    }

    #[test]
    fn test_detect_fenced_code_blocks() {
        // Only code blocks are reported; inline code spans are never part of the result.

        // Basic fenced code block with backticks
        let content = "Some text\n```\ncode here\n```\nMore text";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);

        // The reported range covers the code, not just a fence line
        assert!(
            blocks
                .iter()
                .any(|&block| block.1 - block.0 > 10 && block_text(content, block).contains("code here"))
        );

        // Fenced code block with tildes: the range spans both fences
        let content = "Some text\n~~~\ncode here\n~~~\nMore text";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(block_text(content, blocks[0]), "~~~\ncode here\n~~~");

        // Multiple code blocks, one per fence style
        let content = "Text\n```\ncode1\n```\nMiddle\n~~~\ncode2\n~~~\nEnd";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 2);
    }

    #[test]
    fn test_detect_code_blocks_with_language() {
        // Code block with language identifier
        let content = "Text\n```rust\nfn main() {}\n```\nMore";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        // The range includes the code itself
        assert!(blocks.iter().any(|&block| block_text(content, block).contains("fn main")));
    }

    #[test]
    fn test_unclosed_code_block() {
        // Unclosed code block should extend to end of content
        let content = "Text\n```\ncode here\nno closing fence";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].1, content.len());
    }

    #[test]
    fn test_indented_code_blocks() {
        // Basic indented code block
        let content = "Paragraph\n\n    code line 1\n    code line 2\n\nMore text";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert!(block_text(content, blocks[0]).contains("code line 1"));
        assert!(block_text(content, blocks[0]).contains("code line 2"));

        // Indented code with tabs
        let content = "Paragraph\n\n\tcode with tab\n\tanother line\n\nText";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
    }

    #[test]
    fn test_indented_code_requires_blank_line() {
        // Indented lines without preceding blank line are not code blocks
        let content = "Paragraph\n    indented but not code\nMore text";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert!(blocks.is_empty());

        // With blank line, it becomes a code block
        let content = "Paragraph\n\n    now it's code\nMore text";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
    }

    #[test]
    fn test_indented_content_with_list_markers_is_code_block() {
        // Per CommonMark spec: 4-space indented content after blank line IS a code block,
        // even if the content looks like list markers. The indentation takes precedence.
        // Verified with: echo 'List:\n\n    - Item 1' | npx commonmark
        // Output: <pre><code>- Item 1</code></pre>
        let content = "List:\n\n    - Item 1\n    - Item 2\n    * Item 3\n    + Item 4";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1); // This IS a code block per spec

        // Same for numbered list markers
        let content = "List:\n\n    1. First\n    2. Second";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1); // This IS a code block per spec
    }

    #[test]
    fn test_actual_list_items_not_code_blocks() {
        // Actual list items (no preceding blank line + 4 spaces) are NOT code blocks
        let content = "- Item 1\n- Item 2\n* Item 3";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert!(blocks.is_empty());

        // Nested list items
        let content = "- Item 1\n  - Nested item\n- Item 2";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert!(blocks.is_empty());
    }

    #[test]
    fn test_inline_code_spans_not_detected() {
        // Inline code spans should NOT be detected as code blocks
        let content = "Text with `inline code` here";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert!(blocks.is_empty());

        // Multiple backtick code span
        let content = "Text with ``code with ` backtick`` here";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert!(blocks.is_empty());

        // Multiple code spans
        let content = "Has `code1` and `code2` spans";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert!(blocks.is_empty());
    }

    #[test]
    fn test_unclosed_code_span() {
        // Unclosed code span should not be detected
        let content = "Text with `unclosed code span";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert!(blocks.is_empty());

        // Mismatched backticks
        let content = "Text with ``one style` different close";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert!(blocks.is_empty());
    }

    #[test]
    fn test_mixed_code_blocks_and_spans() {
        let content = "Has `span1` text\n```\nblock\n```\nand `span2`";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        // Should only detect the fenced block, NOT the inline spans
        assert_eq!(blocks.len(), 1);

        // The fenced block is reported
        assert!(blocks.iter().any(|&block| block_text(content, block).contains("block")));
        // The inline spans are not
        assert!(!blocks.iter().any(|&block| block_text(content, block) == "`span1`"));
        assert!(!blocks.iter().any(|&block| block_text(content, block) == "`span2`"));
    }

    #[test]
    fn test_is_in_code_block_or_span() {
        let blocks = vec![(10, 20), (30, 40), (50, 60)];

        // Test positions inside blocks
        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 15));
        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 35));
        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 55));

        // Test positions at boundaries
        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 10)); // Start is inclusive
        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 20)); // End is exclusive

        // Test positions outside blocks
        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 5));
        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 25));
        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 65));
    }

    #[test]
    fn test_is_in_code_block() {
        let blocks = vec![(10, 20), (30, 40), (50, 60)];

        // Inside a block
        assert!(CodeBlockUtils::is_in_code_block(&blocks, 15));
        assert!(CodeBlockUtils::is_in_code_block(&blocks, 59));

        // Boundaries: start is inclusive, end is exclusive
        assert!(CodeBlockUtils::is_in_code_block(&blocks, 30));
        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 40));

        // Outside every block
        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 0));
        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 45));

        // No blocks at all
        assert!(!CodeBlockUtils::is_in_code_block(&[], 0));
    }

    #[test]
    fn test_empty_content() {
        let blocks = CodeBlockUtils::detect_code_blocks("");
        assert!(blocks.is_empty());
    }

    #[test]
    fn test_code_block_at_start() {
        let content = "```\ncode\n```\nText after";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].0, 0); // Fenced block starts at 0
    }

    #[test]
    fn test_code_block_at_end() {
        let content = "Text before\n```\ncode\n```";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        // The range includes the code itself
        assert!(blocks.iter().any(|&block| block_text(content, block).contains("code")));
    }

    #[test]
    fn test_nested_fence_markers() {
        // Code block containing fence markers as content: the inner ``` lines are
        // plain content of the outer ```` block, so exactly one block is reported.
        let content = "Text\n````\n```\nnested\n```\n````\nAfter";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert!(block_text(content, blocks[0]).contains("nested"));
    }

    #[test]
    fn test_indented_code_with_blank_lines() {
        // Indented code blocks can contain blank lines. CommonMark treats this input as
        // a single block; the test stays lenient about the count and only requires that
        // both code lines are covered by the reported ranges.
        let content = "Text\n\n    line1\n\n    line2\n\nAfter";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert!(!blocks.is_empty());

        let covered: String = blocks.iter().map(|&block| block_text(content, block)).collect();
        assert!(covered.contains("line1"));
        assert!(covered.contains("line2"));
    }

    #[test]
    fn test_code_span_with_spaces() {
        // Code spans should NOT be detected as code blocks
        let content = "Text ` code with spaces ` more";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert!(blocks.is_empty());
    }

    #[test]
    fn test_fenced_block_with_info_string() {
        // Fenced code blocks with complex info strings
        let content = "```rust,no_run,should_panic\ncode\n```";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].0, 0);
    }

    #[test]
    fn test_indented_fences_are_code_blocks() {
        // Fence markers indented by fewer than 4 spaces still open and close a fenced block
        let content = "Text\n  ```\n  code\n  ```\nAfter";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
    }

    // Issue #175: Backticks in info string invalidate the fence
    #[test]
    fn test_backticks_in_info_string_not_code_block() {
        // Per CommonMark spec: "If the info string comes after a backtick fence,
        // it may not contain any backtick characters."
        // So ```something``` is NOT a valid fence - the backticks are treated as inline code.
        // Verified with: echo '```something```' | npx commonmark
        // Output: <p><code>something</code></p>
        let content = "```something```\n\n```bash\n# comment\n```";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        // Should find only the valid ```bash block, NOT the invalid ```something```
        assert_eq!(blocks.len(), 1);
        // The valid block should contain "# comment"
        assert!(block_text(content, blocks[0]).contains("# comment"));
    }

    #[test]
    fn test_issue_175_reproduction() {
        // Full reproduction of issue #175
        let content = "```something```\n\n```bash\n# Have a parrot\necho \"🦜\"\n```";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        // Only the bash block is a code block
        assert_eq!(blocks.len(), 1);
        assert!(block_text(content, blocks[0]).contains("Have a parrot"));
    }

    #[test]
    fn test_tilde_fence_allows_tildes_in_info_string() {
        // Tilde fences CAN have tildes in the info string (the restriction only applies
        // to backticks in backtick fences). `~~~abc~~~` is therefore a valid opening
        // fence with info string "abc~~~", and the final `~~~` line closes the block.
        let content = "~~~abc~~~\ncode content\n~~~";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
    }

    #[test]
    fn test_nested_longer_fence_contains_shorter() {
        // Longer fence (````) can contain shorter fence (```) as content
        let content = "````\n```\nnested content\n```\n````";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert!(block_text(content, blocks[0]).contains("nested content"));
    }

    #[test]
    fn test_mixed_fence_types() {
        // Tilde fence contains backtick markers as content
        let content = "~~~\n```\nmixed content\n~~~";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert!(block_text(content, blocks[0]).contains("mixed content"));
    }
}