rumdl_lib/utils/
code_block_utils.rs

//!
//! Utility functions for detecting and handling code blocks and code spans in Markdown for rumdl.
3
4use crate::rules::blockquote_utils::BlockquoteUtils;
5use lazy_static::lazy_static;
6use regex::Regex;
7
/// How a code block relates to the list content around it.
///
/// Produced by `CodeBlockUtils::analyze_code_block_context`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum CodeBlockContext {
    /// The block is not indented enough to continue a list and has at least
    /// one blank line directly before or after it, so it separates lists.
    Standalone,
    /// The block is indented at least as far as the list continuation column,
    /// so it continues the list.
    Indented,
    /// The block sits right next to list content without enough indentation
    /// and without blank-line separation (edge case, treated as non-breaking).
    Adjacent,
}
18
19lazy_static! {
20    static ref CODE_BLOCK_PATTERN: Regex = Regex::new(r"^(```|~~~)").unwrap();
21    static ref CODE_SPAN_PATTERN: Regex = Regex::new(r"`+").unwrap();
22}
23
/// Namespace type for the Markdown code block helpers in this module.
///
/// It carries no data; every helper is an associated function.
pub struct CodeBlockUtils;
26
27impl CodeBlockUtils {
28    /// Detect all code blocks in the content (NOT including inline code spans)
29    pub fn detect_code_blocks(content: &str) -> Vec<(usize, usize)> {
30        let mut blocks = Vec::new();
31        let mut in_code_block = false;
32        let mut code_block_start = 0;
33        let mut opening_fence_char = ' ';
34        let mut opening_fence_len = 0;
35
36        // Pre-compute line positions for efficient offset calculation
37        let lines: Vec<&str> = content.lines().collect();
38        let mut line_positions = Vec::with_capacity(lines.len());
39        let mut pos = 0;
40        for line in &lines {
41            line_positions.push(pos);
42            pos += line.len() + 1; // +1 for newline
43        }
44
45        // Find fenced code blocks
46        for (i, line) in lines.iter().enumerate() {
47            let line_start = line_positions[i];
48
49            // Strip ALL blockquote prefixes to properly detect fenced code blocks inside blockquotes
50            // This handles nested blockquotes by recursively stripping '>' markers
51            let mut line_without_blockquote = line.to_string();
52            while BlockquoteUtils::is_blockquote(&line_without_blockquote) {
53                line_without_blockquote = BlockquoteUtils::extract_content(&line_without_blockquote);
54            }
55
56            let trimmed = line_without_blockquote.trim_start();
57
58            // Check if this line could be a code fence
59            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
60                let fence_char = trimmed.chars().next().unwrap();
61                let fence_len = trimmed.chars().take_while(|&c| c == fence_char).count();
62
63                if !in_code_block && fence_len >= 3 {
64                    // Opening fence
65                    code_block_start = line_start;
66                    in_code_block = true;
67                    opening_fence_char = fence_char;
68                    opening_fence_len = fence_len;
69                } else if in_code_block && fence_char == opening_fence_char && fence_len >= opening_fence_len {
70                    // Closing fence - must match opening fence character and be at least as long
71                    let code_block_end = line_start + line.len();
72                    blocks.push((code_block_start, code_block_end));
73                    in_code_block = false;
74                    opening_fence_char = ' ';
75                    opening_fence_len = 0;
76                }
77                // If we're in a code block but the fence doesn't match, it's just content
78            }
79        }
80
81        // Handle unclosed code blocks
82        if in_code_block {
83            blocks.push((code_block_start, content.len()));
84        }
85
86        // Find indented code blocks (4+ spaces or tab at start of line)
87        // According to CommonMark, indented code blocks must be preceded by a blank line
88        // (unless they're at the start of the document or after a block-level element)
89        //
90        // IMPORTANT: We must handle list contexts correctly:
91        // - At document level: 4 spaces + blank line before = code block
92        // - In a list context: 4 spaces = continuation paragraph (NOT a code block)
93        // - In a list context: 8+ spaces (depending on list marker) = code block
94        let mut in_indented_block = false;
95        let mut indented_block_start = 0;
96        let mut in_list_context = false;
97        let mut list_continuation_indent: usize = 0;
98
99        for (line_idx, line) in lines.iter().enumerate() {
100            let line_start = if line_idx < line_positions.len() {
101                line_positions[line_idx]
102            } else {
103                0
104            };
105
106            // Strip ALL blockquote prefixes to properly detect indented code blocks inside blockquotes
107            let mut line_without_blockquote = line.to_string();
108            while BlockquoteUtils::is_blockquote(&line_without_blockquote) {
109                line_without_blockquote = BlockquoteUtils::extract_content(&line_without_blockquote);
110            }
111
112            // Calculate the indent level
113            let indent_level = line_without_blockquote.len() - line_without_blockquote.trim_start().len();
114            let is_indented = line_without_blockquote.starts_with("    ") || line_without_blockquote.starts_with("\t");
115
116            // Check if this looks like a list item (has list marker after indentation)
117            let trimmed = line_without_blockquote.trim_start();
118
119            // Check for ordered list marker: 1-9 digits followed by . or )
120            // Must be followed by at least one space
121            let is_ordered_list = {
122                let mut chars = trimmed.chars();
123                let first_char = chars.next();
124                if !first_char.is_some_and(|c| c.is_numeric()) {
125                    false
126                } else {
127                    // Find delimiter position (. or ))
128                    let delimiter_pos = trimmed.chars().position(|c| c == '.' || c == ')');
129                    match delimiter_pos {
130                        Some(pos) if pos > 0 => {
131                            // All chars before delimiter must be digits
132                            let all_digits = trimmed[..pos].chars().all(|c| c.is_numeric());
133                            // Must be followed by space or tab
134                            let has_space = trimmed.chars().nth(pos + 1).is_some_and(|c| c == ' ' || c == '\t');
135                            all_digits && has_space
136                        }
137                        _ => false,
138                    }
139                }
140            };
141
142            let is_list_item =
143                trimmed.starts_with("- ") || trimmed.starts_with("* ") || trimmed.starts_with("+ ") || is_ordered_list;
144
145            // Check if previous line was blank (after stripping blockquote markers)
146            let prev_line_without_blockquote = if line_idx > 0 {
147                let mut prev = lines[line_idx - 1].to_string();
148                while BlockquoteUtils::is_blockquote(&prev) {
149                    prev = BlockquoteUtils::extract_content(&prev);
150                }
151                prev
152            } else {
153                String::new()
154            };
155            let prev_blank = line_idx > 0 && prev_line_without_blockquote.trim().is_empty();
156
157            // Update list context tracking
158            if is_list_item {
159                // We're starting or continuing a list
160                in_list_context = true;
161
162                // Calculate continuation indent per CommonMark spec:
163                // "The spaces of indentation after the list marker determine how much
164                // relative indentation is needed. The first continuation block must be
165                // indented to the column of the first character other than a space after the marker."
166
167                let marker_column = indent_level;
168                let marker_width =
169                    if trimmed.starts_with("- ") || trimmed.starts_with("* ") || trimmed.starts_with("+ ") {
170                        1 // Single character marker (-, *, +)
171                    } else {
172                        // Ordered list marker: count digits + delimiter (. or ))
173                        trimmed.chars().take_while(|c| c.is_numeric()).count() + 1
174                    };
175
176                // Count actual spaces/tabs after marker (CommonMark allows 1-4 spaces)
177                // Find the first non-space character after the marker
178                let after_marker = &trimmed[marker_width..];
179                let spaces_after_marker = after_marker.chars().take_while(|c| *c == ' ' || *c == '\t').count();
180
181                // Continuation indent = marker column + marker width + actual spaces
182                // This is the column where the first content character appears
183                list_continuation_indent = marker_column + marker_width + spaces_after_marker;
184            } else if in_list_context
185                && !line_without_blockquote.trim().is_empty()
186                && indent_level < list_continuation_indent
187            {
188                // Outdented non-empty line ends the list context
189                in_list_context = false;
190                list_continuation_indent = 0;
191            }
192
193            // Determine if this indented line is:
194            // 1. A list continuation paragraph (indent >= continuation_indent, < continuation_indent + 4)
195            // 2. A code block within a list (indent >= continuation_indent + 4)
196            // 3. A document-level code block (not in list context)
197
198            let is_list_continuation_paragraph = in_list_context
199                && indent_level >= list_continuation_indent
200                && indent_level < (list_continuation_indent + 4);
201
202            let is_code_block_in_list = in_list_context && indent_level >= (list_continuation_indent + 4);
203
204            // Handle indented code blocks
205            if is_indented && !line_without_blockquote.trim().is_empty() && !is_list_item {
206                if is_code_block_in_list {
207                    // Code block within list (CommonMark Example 270, 273, 274)
208                    // Requires continuation_indent + 4 spaces, and must have blank line before
209                    if !in_indented_block && prev_blank {
210                        in_indented_block = true;
211                        indented_block_start = line_start;
212                    }
213                } else if !is_list_continuation_paragraph {
214                    // Document-level indented code block (not in list)
215                    if !in_indented_block && prev_blank {
216                        in_indented_block = true;
217                        indented_block_start = line_start;
218                    }
219                }
220                // If is_list_continuation_paragraph, don't treat as code block
221            } else if in_indented_block {
222                // End of indented code block
223                let block_end = if line_idx > 0 && line_idx - 1 < line_positions.len() {
224                    line_positions[line_idx - 1] + lines[line_idx - 1].len()
225                } else {
226                    line_start
227                };
228                blocks.push((indented_block_start, block_end));
229                in_indented_block = false;
230            }
231        }
232
233        // Handle indented block that goes to end of file
234        if in_indented_block {
235            blocks.push((indented_block_start, content.len()));
236        }
237
238        // Note: We DO NOT include inline code spans here - they are not code blocks!
239        // Inline code spans are handled separately by the code span parser.
240
241        blocks.sort_by(|a, b| a.0.cmp(&b.0));
242        blocks
243    }
244
245    /// Check if a position is within a code block (for compatibility)
246    pub fn is_in_code_block_or_span(blocks: &[(usize, usize)], pos: usize) -> bool {
247        // This is a compatibility function - it only checks code blocks now, not spans
248        blocks.iter().any(|&(start, end)| pos >= start && pos < end)
249    }
250
251    /// Check if a position is within a code block (NOT including inline code spans)
252    pub fn is_in_code_block(blocks: &[(usize, usize)], pos: usize) -> bool {
253        blocks.iter().any(|&(start, end)| pos >= start && pos < end)
254    }
255
256    /// Analyze code block context relative to list parsing
257    /// This is the core function implementing Design #3's three-tier classification
258    pub fn analyze_code_block_context(
259        lines: &[crate::lint_context::LineInfo],
260        line_idx: usize,
261        min_continuation_indent: usize,
262    ) -> CodeBlockContext {
263        if let Some(line_info) = lines.get(line_idx) {
264            // Rule 1: Indentation Analysis - Is it sufficiently indented for list continuation?
265            if line_info.indent >= min_continuation_indent {
266                return CodeBlockContext::Indented;
267            }
268
269            // Rule 2: Blank Line Context - Check for structural separation indicators
270            let (prev_blanks, next_blanks) = Self::count_surrounding_blank_lines(lines, line_idx);
271
272            // Rule 3: Standalone Detection - Insufficient indentation + blank line separation
273            // This is the key fix: root-level code blocks with blank lines separate lists
274            if prev_blanks > 0 || next_blanks > 0 {
275                return CodeBlockContext::Standalone;
276            }
277
278            // Rule 4: Default - Adjacent (conservative, non-breaking for edge cases)
279            CodeBlockContext::Adjacent
280        } else {
281            // Fallback for invalid line index
282            CodeBlockContext::Adjacent
283        }
284    }
285
286    /// Count blank lines before and after the given line index
287    fn count_surrounding_blank_lines(lines: &[crate::lint_context::LineInfo], line_idx: usize) -> (usize, usize) {
288        let mut prev_blanks = 0;
289        let mut next_blanks = 0;
290
291        // Count blank lines before (look backwards)
292        for i in (0..line_idx).rev() {
293            if let Some(line) = lines.get(i) {
294                if line.is_blank {
295                    prev_blanks += 1;
296                } else {
297                    break;
298                }
299            } else {
300                break;
301            }
302        }
303
304        // Count blank lines after (look forwards)
305        for i in (line_idx + 1)..lines.len() {
306            if let Some(line) = lines.get(i) {
307                if line.is_blank {
308                    next_blanks += 1;
309                } else {
310                    break;
311                }
312            } else {
313                break;
314            }
315        }
316
317        (prev_blanks, next_blanks)
318    }
319
320    /// Calculate minimum indentation required for code block to continue a list
321    /// Based on the most recent list item's marker width
322    pub fn calculate_min_continuation_indent(
323        lines: &[crate::lint_context::LineInfo],
324        current_line_idx: usize,
325    ) -> usize {
326        // Look backwards to find the most recent list item
327        for i in (0..current_line_idx).rev() {
328            if let Some(line_info) = lines.get(i) {
329                if let Some(list_item) = &line_info.list_item {
330                    // Calculate minimum continuation indent for this list item
331                    return if list_item.is_ordered {
332                        list_item.marker_column + list_item.marker.len() + 1 // +1 for space after marker
333                    } else {
334                        list_item.marker_column + 2 // Unordered lists need marker + space (min 2)
335                    };
336                }
337
338                // Stop at structural separators that would break list context
339                if line_info.heading.is_some() || Self::is_structural_separator(&line_info.content) {
340                    break;
341                }
342            }
343        }
344
345        0 // No list context found
346    }
347
348    /// Check if content is a structural separator (headings, horizontal rules, etc.)
349    fn is_structural_separator(content: &str) -> bool {
350        let trimmed = content.trim();
351        trimmed.starts_with("---")
352            || trimmed.starts_with("***")
353            || trimmed.starts_with("___")
354            || trimmed.contains('|') // Tables
355            || trimmed.starts_with(">") // Blockquotes
356    }
357}
358
#[cfg(test)]
mod tests {
    //! Unit tests for `CodeBlockUtils`.
    //!
    //! `detect_code_blocks` reports fenced and indented code blocks only; inline
    //! code spans must never appear in its output.

    use super::*;

    #[test]
    fn test_detect_fenced_code_blocks() {
        // Basic fenced code block with backticks
        let content = "Some text\n```\ncode here\n```\nMore text";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(&content[blocks[0].0..blocks[0].1], "```\ncode here\n```");

        // Fenced code block with tildes
        let content = "Some text\n~~~\ncode here\n~~~\nMore text";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(&content[blocks[0].0..blocks[0].1], "~~~\ncode here\n~~~");

        // Multiple code blocks: one backtick fence, one tilde fence
        let content = "Text\n```\ncode1\n```\nMiddle\n~~~\ncode2\n~~~\nEnd";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 2);
    }

    #[test]
    fn test_detect_code_blocks_with_language() {
        // Code block with language identifier
        let content = "Text\n```rust\nfn main() {}\n```\nMore";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        // The single block must span the code between the fences
        assert!(content[blocks[0].0..blocks[0].1].contains("fn main"));
    }

    #[test]
    fn test_unclosed_code_block() {
        // Unclosed code block should extend to end of content
        let content = "Text\n```\ncode here\nno closing fence";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].1, content.len());
    }

    #[test]
    fn test_indented_code_blocks() {
        // Basic indented code block
        let content = "Paragraph\n\n    code line 1\n    code line 2\n\nMore text";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert!(content[blocks[0].0..blocks[0].1].contains("code line 1"));
        assert!(content[blocks[0].0..blocks[0].1].contains("code line 2"));

        // Indented code with tabs
        let content = "Paragraph\n\n\tcode with tab\n\tanother line\n\nText";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
    }

    #[test]
    fn test_indented_code_requires_blank_line() {
        // Indented lines without preceding blank line are not code blocks
        let content = "Paragraph\n    indented but not code\nMore text";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);

        // With blank line, it becomes a code block
        let content = "Paragraph\n\n    now it's code\nMore text";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
    }

    #[test]
    fn test_list_items_not_code_blocks() {
        // Indented bullet list items should not be detected as code blocks
        let content = "List:\n\n    - Item 1\n    - Item 2\n    * Item 3\n    + Item 4";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);

        // Numbered lists, with both `.` and `)` delimiters
        let content = "List:\n\n    1. First\n    2. Second\n    1) Also first";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);
    }

    #[test]
    fn test_inline_code_spans_not_detected() {
        // Inline code spans should NOT be detected as code blocks
        let content = "Text with `inline code` here";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);

        // Multiple backtick code span
        let content = "Text with ``code with ` backtick`` here";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);

        // Multiple code spans
        let content = "Has `code1` and `code2` spans";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);
    }

    #[test]
    fn test_unclosed_code_span() {
        // Unclosed code span should not be detected
        let content = "Text with `unclosed code span";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);

        // Mismatched backticks
        let content = "Text with ``one style` different close";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);
    }

    #[test]
    fn test_mixed_code_blocks_and_spans() {
        let content = "Has `span1` text\n```\nblock\n```\nand `span2`";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        // Should only detect the fenced block, NOT the inline spans
        assert_eq!(blocks.len(), 1);

        assert!(blocks.iter().any(|(s, e)| content[*s..*e].contains("block")));
        assert!(!blocks.iter().any(|(s, e)| &content[*s..*e] == "`span1`"));
        assert!(!blocks.iter().any(|(s, e)| &content[*s..*e] == "`span2`"));
    }

    #[test]
    fn test_is_in_code_block_or_span() {
        let blocks = vec![(10, 20), (30, 40), (50, 60)];

        // Test positions inside blocks
        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 15));
        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 35));
        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 55));

        // Test positions at boundaries
        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 10)); // Start is inclusive
        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 20)); // End is exclusive

        // Test positions outside blocks
        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 5));
        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 25));
        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 65));
    }

    #[test]
    fn test_is_in_code_block() {
        let blocks = vec![(10, 20), (30, 40)];

        assert!(CodeBlockUtils::is_in_code_block(&blocks, 10)); // Start is inclusive
        assert!(CodeBlockUtils::is_in_code_block(&blocks, 39));
        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 20)); // End is exclusive
        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 25));

        // No blocks at all: nothing is inside a block
        assert!(!CodeBlockUtils::is_in_code_block(&[], 0));
    }

    #[test]
    fn test_empty_content() {
        let blocks = CodeBlockUtils::detect_code_blocks("");
        assert_eq!(blocks.len(), 0);
    }

    #[test]
    fn test_code_block_at_start() {
        let content = "```\ncode\n```\nText after";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].0, 0); // Fenced block starts at 0
    }

    #[test]
    fn test_code_block_at_end() {
        let content = "Text before\n```\ncode\n```";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert!(content[blocks[0].0..blocks[0].1].contains("code"));
    }

    #[test]
    fn test_nested_fence_markers() {
        // Shorter fences inside a longer fence are content, not fences
        let content = "Text\n````\n```\nnested\n```\n````\nAfter";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        // Exactly one block: the outer four-backtick fence, inner markers included
        assert_eq!(blocks.len(), 1);
        assert_eq!(&content[blocks[0].0..blocks[0].1], "````\n```\nnested\n```\n````");
    }

    #[test]
    fn test_indented_code_with_blank_lines() {
        // Indented code separated by a blank line
        let content = "Text\n\n    line1\n\n    line2\n\nAfter";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        // The blank line may split the code into more than one block, so only
        // require that both indented lines are covered by some block
        assert!(!blocks.is_empty());
        assert!(blocks.iter().any(|(s, e)| content[*s..*e].contains("line1")));
        assert!(blocks.iter().any(|(s, e)| content[*s..*e].contains("line2")));
    }

    #[test]
    fn test_code_span_with_spaces() {
        // Code spans should NOT be detected as code blocks
        let content = "Text ` code with spaces ` more";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);
    }

    #[test]
    fn test_fenced_block_with_info_string() {
        // Fenced code blocks with complex info strings
        let content = "```rust,no_run,should_panic\ncode\n```";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].0, 0);
    }

    #[test]
    fn test_indented_fences_still_detected() {
        // Fence markers indented by fewer than four spaces still work as fences
        let content = "Text\n  ```\n  code\n  ```\nAfter";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
    }
}