rumdl_lib/utils/
code_block_utils.rs

1//!
2//! Utility functions for detecting and handling code blocks and code spans in Markdown for rumdl.
3
4use crate::rules::blockquote_utils::BlockquoteUtils;
5
/// How a code block relates to the list content around it
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum CodeBlockContext {
    /// Root-level block set off by blank lines; it separates the lists on either side
    Standalone,
    /// Block indented far enough to continue the enclosing list
    Indented,
    /// Block touching list content (edge case); treated as non-breaking by default
    Adjacent,
}
16
/// Utility functions for detecting and handling code blocks in Markdown
///
/// This is a unit struct used as a namespace: it carries no data, and the helpers
/// implemented on it are associated functions that take no `self`.
pub struct CodeBlockUtils;
19
20impl CodeBlockUtils {
21    /// Detect all code blocks in the content (NOT including inline code spans)
22    pub fn detect_code_blocks(content: &str) -> Vec<(usize, usize)> {
23        let mut blocks = Vec::new();
24        let mut in_code_block = false;
25        let mut code_block_start = 0;
26        let mut opening_fence_char = ' ';
27        let mut opening_fence_len = 0;
28
29        // Pre-compute line positions for efficient offset calculation
30        let lines: Vec<&str> = content.lines().collect();
31        let mut line_positions = Vec::with_capacity(lines.len());
32        let mut pos = 0;
33        for line in &lines {
34            line_positions.push(pos);
35            pos += line.len() + 1; // +1 for newline
36        }
37
38        // Find fenced code blocks
39        for (i, line) in lines.iter().enumerate() {
40            let line_start = line_positions[i];
41
42            // Strip ALL blockquote prefixes to properly detect fenced code blocks inside blockquotes
43            // This handles nested blockquotes by recursively stripping '>' markers
44            let mut line_without_blockquote = line.to_string();
45            while BlockquoteUtils::is_blockquote(&line_without_blockquote) {
46                line_without_blockquote = BlockquoteUtils::extract_content(&line_without_blockquote);
47            }
48
49            let trimmed = line_without_blockquote.trim_start();
50
51            // Check if this line could be a code fence
52            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
53                let fence_char = trimmed.chars().next().unwrap();
54                let fence_len = trimmed.chars().take_while(|&c| c == fence_char).count();
55
56                if !in_code_block && fence_len >= 3 {
57                    // Opening fence
58                    code_block_start = line_start;
59                    in_code_block = true;
60                    opening_fence_char = fence_char;
61                    opening_fence_len = fence_len;
62                } else if in_code_block && fence_char == opening_fence_char && fence_len >= opening_fence_len {
63                    // Closing fence - must match opening fence character and be at least as long
64                    let code_block_end = line_start + line.len();
65                    blocks.push((code_block_start, code_block_end));
66                    in_code_block = false;
67                    opening_fence_char = ' ';
68                    opening_fence_len = 0;
69                }
70                // If we're in a code block but the fence doesn't match, it's just content
71            }
72        }
73
74        // Handle unclosed code blocks
75        if in_code_block {
76            blocks.push((code_block_start, content.len()));
77        }
78
79        // Find indented code blocks (4+ spaces or tab at start of line)
80        // According to CommonMark, indented code blocks must be preceded by a blank line
81        // (unless they're at the start of the document or after a block-level element)
82        //
83        // IMPORTANT: We must handle list contexts correctly:
84        // - At document level: 4 spaces + blank line before = code block
85        // - In a list context: 4 spaces = continuation paragraph (NOT a code block)
86        // - In a list context: 8+ spaces (depending on list marker) = code block
87        let mut in_indented_block = false;
88        let mut indented_block_start = 0;
89        let mut in_list_context = false;
90        let mut list_continuation_indent: usize = 0;
91
92        for (line_idx, line) in lines.iter().enumerate() {
93            let line_start = if line_idx < line_positions.len() {
94                line_positions[line_idx]
95            } else {
96                0
97            };
98
99            // Strip ALL blockquote prefixes to properly detect indented code blocks inside blockquotes
100            let mut line_without_blockquote = line.to_string();
101            while BlockquoteUtils::is_blockquote(&line_without_blockquote) {
102                line_without_blockquote = BlockquoteUtils::extract_content(&line_without_blockquote);
103            }
104
105            // Calculate the indent level
106            let indent_level = line_without_blockquote.len() - line_without_blockquote.trim_start().len();
107            let is_indented = line_without_blockquote.starts_with("    ") || line_without_blockquote.starts_with("\t");
108
109            // Check if this looks like a list item (has list marker after indentation)
110            let trimmed = line_without_blockquote.trim_start();
111
112            // Check for ordered list marker: 1-9 digits followed by . or )
113            // Must be followed by at least one space
114            let is_ordered_list = {
115                let mut chars = trimmed.chars();
116                let first_char = chars.next();
117                if !first_char.is_some_and(|c| c.is_numeric()) {
118                    false
119                } else {
120                    // Find delimiter position (. or ))
121                    let delimiter_char_pos = trimmed.chars().position(|c| c == '.' || c == ')');
122                    match delimiter_char_pos {
123                        Some(char_pos) if char_pos > 0 => {
124                            // Convert character position to byte position for slicing
125                            let byte_pos = trimmed.char_indices().nth(char_pos).map(|(i, _)| i);
126                            if let Some(byte_pos) = byte_pos {
127                                // All chars before delimiter must be digits
128                                let all_digits = trimmed[..byte_pos].chars().all(|c| c.is_numeric());
129                                // Must be followed by space or tab
130                                let has_space =
131                                    trimmed.chars().nth(char_pos + 1).is_some_and(|c| c == ' ' || c == '\t');
132                                all_digits && has_space
133                            } else {
134                                false
135                            }
136                        }
137                        _ => false,
138                    }
139                }
140            };
141
142            let is_list_item =
143                trimmed.starts_with("- ") || trimmed.starts_with("* ") || trimmed.starts_with("+ ") || is_ordered_list;
144
145            // Check if previous line was blank (after stripping blockquote markers)
146            let prev_line_without_blockquote = if line_idx > 0 {
147                let mut prev = lines[line_idx - 1].to_string();
148                while BlockquoteUtils::is_blockquote(&prev) {
149                    prev = BlockquoteUtils::extract_content(&prev);
150                }
151                prev
152            } else {
153                String::new()
154            };
155            let prev_blank = line_idx > 0 && prev_line_without_blockquote.trim().is_empty();
156
157            // Update list context tracking
158            if is_list_item {
159                // We're starting or continuing a list
160                in_list_context = true;
161
162                // Calculate continuation indent per CommonMark spec:
163                // "The spaces of indentation after the list marker determine how much
164                // relative indentation is needed. The first continuation block must be
165                // indented to the column of the first character other than a space after the marker."
166
167                let marker_column = indent_level;
168                let marker_width =
169                    if trimmed.starts_with("- ") || trimmed.starts_with("* ") || trimmed.starts_with("+ ") {
170                        1 // Single character marker (-, *, +)
171                    } else {
172                        // Ordered list marker: count digits + delimiter (. or ))
173                        trimmed.chars().take_while(|c| c.is_numeric()).count() + 1
174                    };
175
176                // Count actual spaces/tabs after marker (CommonMark allows 1-4 spaces)
177                // Find the first non-space character after the marker
178                let after_marker = &trimmed[marker_width..];
179                let spaces_after_marker = after_marker.chars().take_while(|c| *c == ' ' || *c == '\t').count();
180
181                // Continuation indent = marker column + marker width + actual spaces
182                // This is the column where the first content character appears
183                list_continuation_indent = marker_column + marker_width + spaces_after_marker;
184            } else if in_list_context
185                && !line_without_blockquote.trim().is_empty()
186                && indent_level < list_continuation_indent
187            {
188                // Outdented non-empty line ends the list context
189                in_list_context = false;
190                list_continuation_indent = 0;
191            }
192
193            // Determine if this indented line is:
194            // 1. A list continuation paragraph (indent >= continuation_indent, < continuation_indent + 4)
195            // 2. A code block within a list (indent >= continuation_indent + 4)
196            // 3. A document-level code block (not in list context)
197
198            let is_list_continuation_paragraph = in_list_context
199                && indent_level >= list_continuation_indent
200                && indent_level < (list_continuation_indent + 4);
201
202            let is_code_block_in_list = in_list_context && indent_level >= (list_continuation_indent + 4);
203
204            // Handle indented code blocks
205            if is_indented && !line_without_blockquote.trim().is_empty() && !is_list_item {
206                if is_code_block_in_list {
207                    // Code block within list (CommonMark Example 270, 273, 274)
208                    // Requires continuation_indent + 4 spaces, and must have blank line before
209                    if !in_indented_block && prev_blank {
210                        in_indented_block = true;
211                        indented_block_start = line_start;
212                    }
213                } else if !is_list_continuation_paragraph {
214                    // Document-level indented code block (not in list)
215                    if !in_indented_block && prev_blank {
216                        in_indented_block = true;
217                        indented_block_start = line_start;
218                    }
219                }
220                // If is_list_continuation_paragraph, don't treat as code block
221            } else if in_indented_block {
222                // End of indented code block
223                let block_end = if line_idx > 0 && line_idx - 1 < line_positions.len() {
224                    line_positions[line_idx - 1] + lines[line_idx - 1].len()
225                } else {
226                    line_start
227                };
228                blocks.push((indented_block_start, block_end));
229                in_indented_block = false;
230            }
231        }
232
233        // Handle indented block that goes to end of file
234        if in_indented_block {
235            blocks.push((indented_block_start, content.len()));
236        }
237
238        // Note: We DO NOT include inline code spans here - they are not code blocks!
239        // Inline code spans are handled separately by the code span parser.
240
241        blocks.sort_by(|a, b| a.0.cmp(&b.0));
242        blocks
243    }
244
245    /// Check if a position is within a code block (for compatibility)
246    pub fn is_in_code_block_or_span(blocks: &[(usize, usize)], pos: usize) -> bool {
247        // This is a compatibility function - it only checks code blocks now, not spans
248        blocks.iter().any(|&(start, end)| pos >= start && pos < end)
249    }
250
251    /// Check if a position is within a code block (NOT including inline code spans)
252    pub fn is_in_code_block(blocks: &[(usize, usize)], pos: usize) -> bool {
253        blocks.iter().any(|&(start, end)| pos >= start && pos < end)
254    }
255
256    /// Analyze code block context relative to list parsing
257    /// This is the core function implementing Design #3's three-tier classification
258    pub fn analyze_code_block_context(
259        lines: &[crate::lint_context::LineInfo],
260        line_idx: usize,
261        min_continuation_indent: usize,
262    ) -> CodeBlockContext {
263        if let Some(line_info) = lines.get(line_idx) {
264            // Rule 1: Indentation Analysis - Is it sufficiently indented for list continuation?
265            if line_info.indent >= min_continuation_indent {
266                return CodeBlockContext::Indented;
267            }
268
269            // Rule 2: Blank Line Context - Check for structural separation indicators
270            let (prev_blanks, next_blanks) = Self::count_surrounding_blank_lines(lines, line_idx);
271
272            // Rule 3: Standalone Detection - Insufficient indentation + blank line separation
273            // This is the key fix: root-level code blocks with blank lines separate lists
274            if prev_blanks > 0 || next_blanks > 0 {
275                return CodeBlockContext::Standalone;
276            }
277
278            // Rule 4: Default - Adjacent (conservative, non-breaking for edge cases)
279            CodeBlockContext::Adjacent
280        } else {
281            // Fallback for invalid line index
282            CodeBlockContext::Adjacent
283        }
284    }
285
286    /// Count blank lines before and after the given line index
287    fn count_surrounding_blank_lines(lines: &[crate::lint_context::LineInfo], line_idx: usize) -> (usize, usize) {
288        let mut prev_blanks = 0;
289        let mut next_blanks = 0;
290
291        // Count blank lines before (look backwards)
292        for i in (0..line_idx).rev() {
293            if let Some(line) = lines.get(i) {
294                if line.is_blank {
295                    prev_blanks += 1;
296                } else {
297                    break;
298                }
299            } else {
300                break;
301            }
302        }
303
304        // Count blank lines after (look forwards)
305        for i in (line_idx + 1)..lines.len() {
306            if let Some(line) = lines.get(i) {
307                if line.is_blank {
308                    next_blanks += 1;
309                } else {
310                    break;
311                }
312            } else {
313                break;
314            }
315        }
316
317        (prev_blanks, next_blanks)
318    }
319
320    /// Calculate minimum indentation required for code block to continue a list
321    /// Based on the most recent list item's marker width
322    pub fn calculate_min_continuation_indent(
323        lines: &[crate::lint_context::LineInfo],
324        current_line_idx: usize,
325    ) -> usize {
326        // Look backwards to find the most recent list item
327        for i in (0..current_line_idx).rev() {
328            if let Some(line_info) = lines.get(i) {
329                if let Some(list_item) = &line_info.list_item {
330                    // Calculate minimum continuation indent for this list item
331                    return if list_item.is_ordered {
332                        list_item.marker_column + list_item.marker.len() + 1 // +1 for space after marker
333                    } else {
334                        list_item.marker_column + 2 // Unordered lists need marker + space (min 2)
335                    };
336                }
337
338                // Stop at structural separators that would break list context
339                if line_info.heading.is_some() || Self::is_structural_separator(&line_info.content) {
340                    break;
341                }
342            }
343        }
344
345        0 // No list context found
346    }
347
348    /// Check if content is a structural separator (headings, horizontal rules, etc.)
349    fn is_structural_separator(content: &str) -> bool {
350        let trimmed = content.trim();
351        trimmed.starts_with("---")
352            || trimmed.starts_with("***")
353            || trimmed.starts_with("___")
354            || trimmed.contains('|') // Tables
355            || trimmed.starts_with(">") // Blockquotes
356    }
357}
358
#[cfg(test)]
mod tests {
    //! Tests for code block detection and the position helpers.
    //!
    //! `detect_code_blocks` reports fenced and indented blocks only; inline code spans are
    //! never part of its result, and several tests below pin that down.

    use super::*;

    #[test]
    fn test_detect_fenced_code_blocks() {
        // Basic fenced code block with backticks
        let content = "Some text\n```\ncode here\n```\nMore text";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        // The range runs from the opening fence through the closing fence
        assert_eq!(&content[blocks[0].0..blocks[0].1], "```\ncode here\n```");

        // Fenced code block with tildes
        let content = "Some text\n~~~\ncode here\n~~~\nMore text";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(&content[blocks[0].0..blocks[0].1], "~~~\ncode here\n~~~");

        // Multiple code blocks, returned in document order
        let content = "Text\n```\ncode1\n```\nMiddle\n~~~\ncode2\n~~~\nEnd";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 2);
        assert_eq!(&content[blocks[0].0..blocks[0].1], "```\ncode1\n```");
        assert_eq!(&content[blocks[1].0..blocks[1].1], "~~~\ncode2\n~~~");
    }

    #[test]
    fn test_detect_code_blocks_with_language() {
        // Code block with language identifier on the opening fence
        let content = "Text\n```rust\nfn main() {}\n```\nMore";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        // The block includes the opening fence line with its info string
        assert_eq!(&content[blocks[0].0..blocks[0].1], "```rust\nfn main() {}\n```");
    }

    #[test]
    fn test_unclosed_code_block() {
        // Unclosed code block should extend to end of content
        let content = "Text\n```\ncode here\nno closing fence";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].0, 5); // Starts at the opening fence
        assert_eq!(blocks[0].1, content.len());
    }

    #[test]
    fn test_indented_code_blocks() {
        // Basic indented code block
        let content = "Paragraph\n\n    code line 1\n    code line 2\n\nMore text";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert!(content[blocks[0].0..blocks[0].1].contains("code line 1"));
        assert!(content[blocks[0].0..blocks[0].1].contains("code line 2"));

        // Indented code with tabs
        let content = "Paragraph\n\n\tcode with tab\n\tanother line\n\nText";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
    }

    #[test]
    fn test_indented_code_requires_blank_line() {
        // Indented lines without preceding blank line are not code blocks
        let content = "Paragraph\n    indented but not code\nMore text";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);

        // With blank line, it becomes a code block
        let content = "Paragraph\n\n    now it's code\nMore text";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
    }

    #[test]
    fn test_list_items_not_code_blocks() {
        // Indented list items should not be detected as code blocks
        let content = "List:\n\n    - Item 1\n    - Item 2\n    * Item 3\n    + Item 4";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);

        // Numbered lists, with both `.` and `)` delimiters
        let content = "List:\n\n    1. First\n    2. Second\n    1) Also first";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);
    }

    #[test]
    fn test_inline_code_spans_not_detected() {
        // Inline code spans should NOT be detected as code blocks
        let content = "Text with `inline code` here";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);

        // Multiple backtick code span
        let content = "Text with ``code with ` backtick`` here";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);

        // Multiple code spans
        let content = "Has `code1` and `code2` spans";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);
    }

    #[test]
    fn test_unclosed_code_span() {
        // Unclosed code span should not be detected
        let content = "Text with `unclosed code span";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);

        // Mismatched backticks
        let content = "Text with ``one style` different close";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);
    }

    #[test]
    fn test_mixed_code_blocks_and_spans() {
        let content = "Has `span1` text\n```\nblock\n```\nand `span2`";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        // Should only detect the fenced block, NOT the inline spans
        assert_eq!(blocks.len(), 1);

        // Check we have the fenced block only
        assert!(blocks.iter().any(|(s, e)| content[*s..*e].contains("block")));
        // Should NOT detect inline spans
        assert!(!blocks.iter().any(|(s, e)| &content[*s..*e] == "`span1`"));
        assert!(!blocks.iter().any(|(s, e)| &content[*s..*e] == "`span2`"));
    }

    #[test]
    fn test_is_in_code_block_or_span() {
        let blocks = vec![(10, 20), (30, 40), (50, 60)];

        // Test positions inside blocks
        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 15));
        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 35));
        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 55));

        // Test positions at boundaries
        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 10)); // Start is inclusive
        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 20)); // End is exclusive

        // Test positions outside blocks
        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 5));
        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 25));
        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 65));
    }

    #[test]
    fn test_is_in_code_block() {
        let blocks = vec![(10, 20), (30, 40)];

        // Inside, and at the inclusive start
        assert!(CodeBlockUtils::is_in_code_block(&blocks, 10));
        assert!(CodeBlockUtils::is_in_code_block(&blocks, 39));

        // At the exclusive end, between blocks, and with no blocks at all
        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 20));
        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 25));
        assert!(!CodeBlockUtils::is_in_code_block(&[], 0));
    }

    #[test]
    fn test_empty_content() {
        let blocks = CodeBlockUtils::detect_code_blocks("");
        assert_eq!(blocks.len(), 0);
    }

    #[test]
    fn test_code_block_at_start() {
        let content = "```\ncode\n```\nText after";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].0, 0); // Fenced block starts at 0
    }

    #[test]
    fn test_code_block_at_end() {
        let content = "Text before\n```\ncode\n```";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        // The closing fence is the last line, so the block ends with the content
        assert_eq!(blocks[0].1, content.len());
        assert!(content[blocks[0].0..blocks[0].1].contains("code"));
    }

    #[test]
    fn test_nested_fence_markers() {
        // Code block containing shorter fence markers as content: the inner ``` lines are
        // shorter than the opening ```` fence, so they must not close the block
        let content = "Text\n````\n```\nnested\n```\n````\nAfter";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(&content[blocks[0].0..blocks[0].1], "````\n```\nnested\n```\n````");
    }

    #[test]
    fn test_indented_code_with_blank_lines() {
        // Indented code separated by a blank line; the detector may report it as one block
        // or as several, so only the captured content is checked
        let content = "Text\n\n    line1\n\n    line2\n\nAfter";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert!(!blocks.is_empty());
        let all_content: String = blocks
            .iter()
            .map(|(s, e)| &content[*s..*e])
            .collect::<Vec<_>>()
            .join("");
        assert!(all_content.contains("line1"));
        assert!(all_content.contains("line2"));
        // The surrounding paragraphs are never part of a block
        assert!(!all_content.contains("Text"));
        assert!(!all_content.contains("After"));
    }

    #[test]
    fn test_code_span_with_spaces() {
        // Code spans should NOT be detected as code blocks
        let content = "Text ` code with spaces ` more";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);
    }

    #[test]
    fn test_fenced_block_with_info_string() {
        // Fenced code blocks with complex info strings
        let content = "```rust,no_run,should_panic\ncode\n```";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].0, 0);
    }

    #[test]
    fn test_indented_fences_not_code_blocks() {
        // Fence markers indented by fewer than four spaces still work as fences, and the
        // indented content between them is not reported as a separate indented block
        let content = "Text\n  ```\n  code\n  ```\nAfter";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
    }
}