rumdl_lib/utils/
code_block_utils.rs

1//!
2//! Utility functions for detecting and handling code blocks and code spans in Markdown for rumdl.
3
4use crate::rules::blockquote_utils::BlockquoteUtils;
5use lazy_static::lazy_static;
6use regex::Regex;
7
/// How a code block relates to the list structure surrounding it.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum CodeBlockContext {
    /// Root-level block set apart by blank lines; it separates the lists around it.
    Standalone,
    /// Block indented far enough to count as a continuation of the enclosing list item.
    Indented,
    /// Block directly next to list content; an edge case treated as non-breaking by default.
    Adjacent,
}
18
// Regexes compiled lazily on first use.
// NOTE(review): neither pattern is referenced by the code visible in this file — confirm
// whether they are still needed before removing them.
lazy_static! {
    // Matches three backticks or three tildes at the very start of the searched text.
    static ref CODE_BLOCK_PATTERN: Regex = Regex::new(r"^(```|~~~)").unwrap();
    // Matches any run of one or more backticks.
    static ref CODE_SPAN_PATTERN: Regex = Regex::new(r"`+").unwrap();
}
23
24/// Utility functions for detecting and handling code blocks in Markdown
25pub struct CodeBlockUtils;
26
27impl CodeBlockUtils {
28    /// Detect all code blocks in the content (NOT including inline code spans)
29    pub fn detect_code_blocks(content: &str) -> Vec<(usize, usize)> {
30        let mut blocks = Vec::new();
31        let mut in_code_block = false;
32        let mut code_block_start = 0;
33        let mut opening_fence_char = ' ';
34        let mut opening_fence_len = 0;
35
36        // Pre-compute line positions for efficient offset calculation
37        let lines: Vec<&str> = content.lines().collect();
38        let mut line_positions = Vec::with_capacity(lines.len());
39        let mut pos = 0;
40        for line in &lines {
41            line_positions.push(pos);
42            pos += line.len() + 1; // +1 for newline
43        }
44
45        // Find fenced code blocks
46        for (i, line) in lines.iter().enumerate() {
47            let line_start = line_positions[i];
48
49            // Strip ALL blockquote prefixes to properly detect fenced code blocks inside blockquotes
50            // This handles nested blockquotes by recursively stripping '>' markers
51            let mut line_without_blockquote = line.to_string();
52            while BlockquoteUtils::is_blockquote(&line_without_blockquote) {
53                line_without_blockquote = BlockquoteUtils::extract_content(&line_without_blockquote);
54            }
55
56            let trimmed = line_without_blockquote.trim_start();
57
58            // Check if this line could be a code fence
59            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
60                let fence_char = trimmed.chars().next().unwrap();
61                let fence_len = trimmed.chars().take_while(|&c| c == fence_char).count();
62
63                if !in_code_block && fence_len >= 3 {
64                    // Opening fence
65                    code_block_start = line_start;
66                    in_code_block = true;
67                    opening_fence_char = fence_char;
68                    opening_fence_len = fence_len;
69                } else if in_code_block && fence_char == opening_fence_char && fence_len >= opening_fence_len {
70                    // Closing fence - must match opening fence character and be at least as long
71                    let code_block_end = line_start + line.len();
72                    blocks.push((code_block_start, code_block_end));
73                    in_code_block = false;
74                    opening_fence_char = ' ';
75                    opening_fence_len = 0;
76                }
77                // If we're in a code block but the fence doesn't match, it's just content
78            }
79        }
80
81        // Handle unclosed code blocks
82        if in_code_block {
83            blocks.push((code_block_start, content.len()));
84        }
85
86        // Find indented code blocks (4+ spaces or tab at start of line)
87        // According to CommonMark, indented code blocks must be preceded by a blank line
88        // (unless they're at the start of the document or after a block-level element)
89        let mut in_indented_block = false;
90        let mut indented_block_start = 0;
91
92        for (line_idx, line) in lines.iter().enumerate() {
93            let line_start = if line_idx < line_positions.len() {
94                line_positions[line_idx]
95            } else {
96                0
97            };
98
99            // Strip ALL blockquote prefixes to properly detect indented code blocks inside blockquotes
100            let mut line_without_blockquote = line.to_string();
101            while BlockquoteUtils::is_blockquote(&line_without_blockquote) {
102                line_without_blockquote = BlockquoteUtils::extract_content(&line_without_blockquote);
103            }
104
105            // Check if this line is indented code (after stripping blockquote markers)
106            let is_indented = line_without_blockquote.starts_with("    ") || line_without_blockquote.starts_with("\t");
107
108            // Check if this looks like a list item (has list marker after indentation)
109            let trimmed = line_without_blockquote.trim_start();
110            let is_list_item = trimmed.starts_with("- ")
111                || trimmed.starts_with("* ")
112                || trimmed.starts_with("+ ")
113                || trimmed.chars().next().is_some_and(|c| c.is_numeric())
114                    && trimmed.chars().nth(1).is_some_and(|c| c == '.' || c == ')');
115
116            // Check if previous line was blank (after stripping blockquote markers)
117            let prev_line_without_blockquote = if line_idx > 0 {
118                let mut prev = lines[line_idx - 1].to_string();
119                while BlockquoteUtils::is_blockquote(&prev) {
120                    prev = BlockquoteUtils::extract_content(&prev);
121                }
122                prev
123            } else {
124                String::new()
125            };
126            let prev_blank = line_idx > 0 && prev_line_without_blockquote.trim().is_empty();
127
128            if is_indented && !line_without_blockquote.trim().is_empty() && !is_list_item {
129                if !in_indented_block {
130                    // Only start an indented code block if preceded by a blank line
131                    if prev_blank {
132                        in_indented_block = true;
133                        indented_block_start = line_start;
134                    }
135                    // Otherwise, this is just an indented line, not a code block
136                }
137            } else if in_indented_block {
138                // End of indented code block
139                let block_end = if line_idx > 0 && line_idx - 1 < line_positions.len() {
140                    line_positions[line_idx - 1] + lines[line_idx - 1].len()
141                } else {
142                    line_start
143                };
144                blocks.push((indented_block_start, block_end));
145                in_indented_block = false;
146            }
147        }
148
149        // Handle indented block that goes to end of file
150        if in_indented_block {
151            blocks.push((indented_block_start, content.len()));
152        }
153
154        // Note: We DO NOT include inline code spans here - they are not code blocks!
155        // Inline code spans are handled separately by the code span parser.
156
157        blocks.sort_by(|a, b| a.0.cmp(&b.0));
158        blocks
159    }
160
161    /// Check if a position is within a code block (for compatibility)
162    pub fn is_in_code_block_or_span(blocks: &[(usize, usize)], pos: usize) -> bool {
163        // This is a compatibility function - it only checks code blocks now, not spans
164        blocks.iter().any(|&(start, end)| pos >= start && pos < end)
165    }
166
167    /// Check if a position is within a code block (NOT including inline code spans)
168    pub fn is_in_code_block(blocks: &[(usize, usize)], pos: usize) -> bool {
169        blocks.iter().any(|&(start, end)| pos >= start && pos < end)
170    }
171
172    /// Analyze code block context relative to list parsing
173    /// This is the core function implementing Design #3's three-tier classification
174    pub fn analyze_code_block_context(
175        lines: &[crate::lint_context::LineInfo],
176        line_idx: usize,
177        min_continuation_indent: usize,
178    ) -> CodeBlockContext {
179        if let Some(line_info) = lines.get(line_idx) {
180            // Rule 1: Indentation Analysis - Is it sufficiently indented for list continuation?
181            if line_info.indent >= min_continuation_indent {
182                return CodeBlockContext::Indented;
183            }
184
185            // Rule 2: Blank Line Context - Check for structural separation indicators
186            let (prev_blanks, next_blanks) = Self::count_surrounding_blank_lines(lines, line_idx);
187
188            // Rule 3: Standalone Detection - Insufficient indentation + blank line separation
189            // This is the key fix: root-level code blocks with blank lines separate lists
190            if prev_blanks > 0 || next_blanks > 0 {
191                return CodeBlockContext::Standalone;
192            }
193
194            // Rule 4: Default - Adjacent (conservative, non-breaking for edge cases)
195            CodeBlockContext::Adjacent
196        } else {
197            // Fallback for invalid line index
198            CodeBlockContext::Adjacent
199        }
200    }
201
202    /// Count blank lines before and after the given line index
203    fn count_surrounding_blank_lines(lines: &[crate::lint_context::LineInfo], line_idx: usize) -> (usize, usize) {
204        let mut prev_blanks = 0;
205        let mut next_blanks = 0;
206
207        // Count blank lines before (look backwards)
208        for i in (0..line_idx).rev() {
209            if let Some(line) = lines.get(i) {
210                if line.is_blank {
211                    prev_blanks += 1;
212                } else {
213                    break;
214                }
215            } else {
216                break;
217            }
218        }
219
220        // Count blank lines after (look forwards)
221        for i in (line_idx + 1)..lines.len() {
222            if let Some(line) = lines.get(i) {
223                if line.is_blank {
224                    next_blanks += 1;
225                } else {
226                    break;
227                }
228            } else {
229                break;
230            }
231        }
232
233        (prev_blanks, next_blanks)
234    }
235
236    /// Calculate minimum indentation required for code block to continue a list
237    /// Based on the most recent list item's marker width
238    pub fn calculate_min_continuation_indent(
239        lines: &[crate::lint_context::LineInfo],
240        current_line_idx: usize,
241    ) -> usize {
242        // Look backwards to find the most recent list item
243        for i in (0..current_line_idx).rev() {
244            if let Some(line_info) = lines.get(i) {
245                if let Some(list_item) = &line_info.list_item {
246                    // Calculate minimum continuation indent for this list item
247                    return if list_item.is_ordered {
248                        list_item.marker_column + list_item.marker.len() + 1 // +1 for space after marker
249                    } else {
250                        list_item.marker_column + 2 // Unordered lists need marker + space (min 2)
251                    };
252                }
253
254                // Stop at structural separators that would break list context
255                if line_info.heading.is_some() || Self::is_structural_separator(&line_info.content) {
256                    break;
257                }
258            }
259        }
260
261        0 // No list context found
262    }
263
264    /// Check if content is a structural separator (headings, horizontal rules, etc.)
265    fn is_structural_separator(content: &str) -> bool {
266        let trimmed = content.trim();
267        trimmed.starts_with("---")
268            || trimmed.starts_with("***")
269            || trimmed.starts_with("___")
270            || trimmed.contains('|') // Tables
271            || trimmed.starts_with(">") // Blockquotes
272    }
273}
274
#[cfg(test)]
mod tests {
    use super::*;

    /// Text covered by a detected block.
    fn block_text(content: &str, block: (usize, usize)) -> &str {
        &content[block.0..block.1]
    }

    #[test]
    fn test_detect_fenced_code_blocks() {
        // Only block-level code is reported; inline code spans are never returned.

        // Basic fenced code block with backticks
        let content = "Some text\n```\ncode here\n```\nMore text";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(block_text(content, blocks[0]), "```\ncode here\n```");

        // Fenced code block with tildes
        let content = "Some text\n~~~\ncode here\n~~~\nMore text";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(block_text(content, blocks[0]), "~~~\ncode here\n~~~");

        // Multiple code blocks are returned in document order
        let content = "Text\n```\ncode1\n```\nMiddle\n~~~\ncode2\n~~~\nEnd";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 2);
        assert_eq!(block_text(content, blocks[0]), "```\ncode1\n```");
        assert_eq!(block_text(content, blocks[1]), "~~~\ncode2\n~~~");
    }

    #[test]
    fn test_detect_code_blocks_with_language() {
        // Code block with language identifier
        let content = "Text\n```rust\nfn main() {}\n```\nMore";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        // The block spans both fence lines, including the info string
        assert_eq!(block_text(content, blocks[0]), "```rust\nfn main() {}\n```");
    }

    #[test]
    fn test_unclosed_code_block() {
        // Unclosed code block should extend to end of content
        let content = "Text\n```\ncode here\nno closing fence";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0], (5, content.len()));
    }

    #[test]
    fn test_indented_code_blocks() {
        // Basic indented code block
        let content = "Paragraph\n\n    code line 1\n    code line 2\n\nMore text";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(block_text(content, blocks[0]), "    code line 1\n    code line 2");

        // Indented code with tabs
        let content = "Paragraph\n\n\tcode with tab\n\tanother line\n\nText";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
    }

    #[test]
    fn test_indented_code_requires_blank_line() {
        // Indented lines without preceding blank line are not code blocks
        let content = "Paragraph\n    indented but not code\nMore text";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);

        // With blank line, it becomes a code block
        let content = "Paragraph\n\n    now it's code\nMore text";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
    }

    #[test]
    fn test_list_items_not_code_blocks() {
        // List items should not be detected as code blocks
        let content = "List:\n\n    - Item 1\n    - Item 2\n    * Item 3\n    + Item 4";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);

        // Numbered lists
        let content = "List:\n\n    1. First\n    2. Second\n    1) Also first";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);
    }

    #[test]
    fn test_inline_code_spans_not_detected() {
        // Inline code spans should NOT be detected as code blocks
        let content = "Text with `inline code` here";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);

        // Multiple backtick code span
        let content = "Text with ``code with ` backtick`` here";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);

        // Multiple code spans
        let content = "Has `code1` and `code2` spans";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);
    }

    #[test]
    fn test_unclosed_code_span() {
        // Unclosed code span should not be detected
        let content = "Text with `unclosed code span";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);

        // Mismatched backticks
        let content = "Text with ``one style` different close";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);
    }

    #[test]
    fn test_mixed_code_blocks_and_spans() {
        let content = "Has `span1` text\n```\nblock\n```\nand `span2`";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        // Should only detect the fenced block, NOT the inline spans
        assert_eq!(blocks.len(), 1);
        assert_eq!(block_text(content, blocks[0]), "```\nblock\n```");

        // Should NOT detect inline spans
        assert!(!blocks.iter().any(|&b| block_text(content, b) == "`span1`"));
        assert!(!blocks.iter().any(|&b| block_text(content, b) == "`span2`"));
    }

    #[test]
    fn test_is_in_code_block_or_span() {
        let blocks = vec![(10, 20), (30, 40), (50, 60)];

        // Test positions inside blocks
        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 15));
        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 35));
        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 55));

        // Test positions at boundaries
        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 10)); // Start is inclusive
        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 20)); // End is exclusive

        // Test positions outside blocks
        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 5));
        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 25));
        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 65));

        // The compatibility wrapper and the plain check must always agree
        for pos in [0, 10, 19, 20, 29, 30, 59, 60, 100] {
            assert_eq!(
                CodeBlockUtils::is_in_code_block_or_span(&blocks, pos),
                CodeBlockUtils::is_in_code_block(&blocks, pos)
            );
        }
    }

    #[test]
    fn test_empty_content() {
        let blocks = CodeBlockUtils::detect_code_blocks("");
        assert_eq!(blocks.len(), 0);
    }

    #[test]
    fn test_code_block_at_start() {
        let content = "```\ncode\n```\nText after";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].0, 0); // Fenced block starts at 0
    }

    #[test]
    fn test_code_block_at_end() {
        let content = "Text before\n```\ncode\n```";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        // The block ends exactly at the end of the content
        assert_eq!(blocks[0].1, content.len());
        assert!(block_text(content, blocks[0]).contains("code"));
    }

    #[test]
    fn test_nested_fence_markers() {
        // Shorter fences inside a longer fence are content, not block boundaries
        let content = "Text\n````\n```\nnested\n```\n````\nAfter";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        // Only the outer four-backtick block is reported
        assert_eq!(blocks.len(), 1);
        assert_eq!(block_text(content, blocks[0]), "````\n```\nnested\n```\n````");
    }

    #[test]
    fn test_indented_code_with_blank_lines() {
        // Indented code interrupted by a blank line; it may be reported as several blocks
        let content = "Text\n\n    line1\n\n    line2\n\nAfter";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert!(!blocks.is_empty());
        // Every indented line must be covered by some block
        let all_content: String = blocks.iter().map(|&b| block_text(content, b)).collect();
        assert!(all_content.contains("line1"));
        assert!(all_content.contains("line2"));
    }

    #[test]
    fn test_code_span_with_spaces() {
        // Code spans should NOT be detected as code blocks
        let content = "Text ` code with spaces ` more";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);
    }

    #[test]
    fn test_fenced_block_with_info_string() {
        // Fenced code blocks with complex info strings
        let content = "```rust,no_run,should_panic\ncode\n```";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].0, 0);
    }

    #[test]
    fn test_indented_fences_still_detected() {
        // Fence markers indented by fewer than four spaces still open and close a block
        let content = "Text\n  ```\n  code\n  ```\nAfter";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
    }
}
515}