rumdl_lib/utils/
code_block_utils.rs

1//!
2//! Utility functions for detecting and handling code blocks and code spans in Markdown for rumdl.
3
4use lazy_static::lazy_static;
5use regex::Regex;
6
/// How a code block relates to the list content around it.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum CodeBlockContext {
    /// Root-level block set off by blank lines; it separates the lists on either side.
    Standalone,
    /// Block indented far enough to count as a continuation of the list.
    Indented,
    /// Block directly next to list content; an edge case that defaults to non-breaking.
    Adjacent,
}
17
lazy_static! {
    // Anchored at the start of the input: matches three backticks or three tildes,
    // i.e. the beginning of a fence marker.
    // NOTE(review): no use of this pattern is visible in this file — confirm it is still needed.
    static ref CODE_BLOCK_PATTERN: Regex = Regex::new(r"^(```|~~~)").unwrap();
    // Matches a run of one or more consecutive backticks.
    // NOTE(review): no use of this pattern is visible in this file — confirm it is still needed.
    static ref CODE_SPAN_PATTERN: Regex = Regex::new(r"`+").unwrap();
}
22
/// Utility functions for detecting and handling code blocks in Markdown.
///
/// This is a unit struct with no fields; its functionality is exposed as associated
/// functions such as `CodeBlockUtils::detect_code_blocks`.
pub struct CodeBlockUtils;
25
26impl CodeBlockUtils {
27    /// Detect all code blocks in the content (NOT including inline code spans)
28    pub fn detect_code_blocks(content: &str) -> Vec<(usize, usize)> {
29        let mut blocks = Vec::new();
30        let mut in_code_block = false;
31        let mut code_block_start = 0;
32        let mut opening_fence_char = ' ';
33        let mut opening_fence_len = 0;
34
35        // Pre-compute line positions for efficient offset calculation
36        let lines: Vec<&str> = content.lines().collect();
37        let mut line_positions = Vec::with_capacity(lines.len());
38        let mut pos = 0;
39        for line in &lines {
40            line_positions.push(pos);
41            pos += line.len() + 1; // +1 for newline
42        }
43
44        // Find fenced code blocks
45        for (i, line) in lines.iter().enumerate() {
46            let line_start = line_positions[i];
47            let trimmed = line.trim_start();
48
49            // Check if this line could be a code fence
50            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
51                let fence_char = trimmed.chars().next().unwrap();
52                let fence_len = trimmed.chars().take_while(|&c| c == fence_char).count();
53
54                if !in_code_block && fence_len >= 3 {
55                    // Opening fence
56                    code_block_start = line_start;
57                    in_code_block = true;
58                    opening_fence_char = fence_char;
59                    opening_fence_len = fence_len;
60                } else if in_code_block && fence_char == opening_fence_char && fence_len >= opening_fence_len {
61                    // Closing fence - must match opening fence character and be at least as long
62                    let code_block_end = line_start + line.len();
63                    blocks.push((code_block_start, code_block_end));
64                    in_code_block = false;
65                    opening_fence_char = ' ';
66                    opening_fence_len = 0;
67                }
68                // If we're in a code block but the fence doesn't match, it's just content
69            }
70        }
71
72        // Handle unclosed code blocks
73        if in_code_block {
74            blocks.push((code_block_start, content.len()));
75        }
76
77        // Find indented code blocks (4+ spaces or tab at start of line)
78        // According to CommonMark, indented code blocks must be preceded by a blank line
79        // (unless they're at the start of the document or after a block-level element)
80        let mut in_indented_block = false;
81        let mut indented_block_start = 0;
82
83        for (line_idx, line) in lines.iter().enumerate() {
84            let line_start = if line_idx < line_positions.len() {
85                line_positions[line_idx]
86            } else {
87                0
88            };
89
90            // Check if this line is indented code
91            let is_indented = line.starts_with("    ") || line.starts_with("\t");
92
93            // Check if this looks like a list item (has list marker after indentation)
94            let trimmed = line.trim_start();
95            let is_list_item = trimmed.starts_with("- ")
96                || trimmed.starts_with("* ")
97                || trimmed.starts_with("+ ")
98                || trimmed.chars().next().is_some_and(|c| c.is_numeric())
99                    && trimmed.chars().nth(1).is_some_and(|c| c == '.' || c == ')');
100
101            // Check if previous line was blank
102            let prev_blank = line_idx > 0 && lines[line_idx - 1].trim().is_empty();
103
104            if is_indented && !line.trim().is_empty() && !is_list_item {
105                if !in_indented_block {
106                    // Only start an indented code block if preceded by a blank line
107                    if prev_blank {
108                        in_indented_block = true;
109                        indented_block_start = line_start;
110                    }
111                    // Otherwise, this is just an indented line, not a code block
112                }
113            } else if in_indented_block {
114                // End of indented code block
115                let block_end = if line_idx > 0 && line_idx - 1 < line_positions.len() {
116                    line_positions[line_idx - 1] + lines[line_idx - 1].len()
117                } else {
118                    line_start
119                };
120                blocks.push((indented_block_start, block_end));
121                in_indented_block = false;
122            }
123        }
124
125        // Handle indented block that goes to end of file
126        if in_indented_block {
127            blocks.push((indented_block_start, content.len()));
128        }
129
130        // Note: We DO NOT include inline code spans here - they are not code blocks!
131        // Inline code spans are handled separately by the code span parser.
132
133        blocks.sort_by(|a, b| a.0.cmp(&b.0));
134        blocks
135    }
136
137    /// Check if a position is within a code block (for compatibility)
138    pub fn is_in_code_block_or_span(blocks: &[(usize, usize)], pos: usize) -> bool {
139        // This is a compatibility function - it only checks code blocks now, not spans
140        blocks.iter().any(|&(start, end)| pos >= start && pos < end)
141    }
142
143    /// Check if a position is within a code block (NOT including inline code spans)
144    pub fn is_in_code_block(blocks: &[(usize, usize)], pos: usize) -> bool {
145        blocks.iter().any(|&(start, end)| pos >= start && pos < end)
146    }
147
148    /// Analyze code block context relative to list parsing
149    /// This is the core function implementing Design #3's three-tier classification
150    pub fn analyze_code_block_context(
151        lines: &[crate::lint_context::LineInfo],
152        line_idx: usize,
153        min_continuation_indent: usize,
154    ) -> CodeBlockContext {
155        if let Some(line_info) = lines.get(line_idx) {
156            // Rule 1: Indentation Analysis - Is it sufficiently indented for list continuation?
157            if line_info.indent >= min_continuation_indent {
158                return CodeBlockContext::Indented;
159            }
160
161            // Rule 2: Blank Line Context - Check for structural separation indicators
162            let (prev_blanks, next_blanks) = Self::count_surrounding_blank_lines(lines, line_idx);
163
164            // Rule 3: Standalone Detection - Insufficient indentation + blank line separation
165            // This is the key fix: root-level code blocks with blank lines separate lists
166            if prev_blanks > 0 || next_blanks > 0 {
167                return CodeBlockContext::Standalone;
168            }
169
170            // Rule 4: Default - Adjacent (conservative, non-breaking for edge cases)
171            CodeBlockContext::Adjacent
172        } else {
173            // Fallback for invalid line index
174            CodeBlockContext::Adjacent
175        }
176    }
177
178    /// Count blank lines before and after the given line index
179    fn count_surrounding_blank_lines(lines: &[crate::lint_context::LineInfo], line_idx: usize) -> (usize, usize) {
180        let mut prev_blanks = 0;
181        let mut next_blanks = 0;
182
183        // Count blank lines before (look backwards)
184        for i in (0..line_idx).rev() {
185            if let Some(line) = lines.get(i) {
186                if line.is_blank {
187                    prev_blanks += 1;
188                } else {
189                    break;
190                }
191            } else {
192                break;
193            }
194        }
195
196        // Count blank lines after (look forwards)
197        for i in (line_idx + 1)..lines.len() {
198            if let Some(line) = lines.get(i) {
199                if line.is_blank {
200                    next_blanks += 1;
201                } else {
202                    break;
203                }
204            } else {
205                break;
206            }
207        }
208
209        (prev_blanks, next_blanks)
210    }
211
212    /// Calculate minimum indentation required for code block to continue a list
213    /// Based on the most recent list item's marker width
214    pub fn calculate_min_continuation_indent(
215        lines: &[crate::lint_context::LineInfo],
216        current_line_idx: usize,
217    ) -> usize {
218        // Look backwards to find the most recent list item
219        for i in (0..current_line_idx).rev() {
220            if let Some(line_info) = lines.get(i) {
221                if let Some(list_item) = &line_info.list_item {
222                    // Calculate minimum continuation indent for this list item
223                    return if list_item.is_ordered {
224                        list_item.marker_column + list_item.marker.len() + 1 // +1 for space after marker
225                    } else {
226                        list_item.marker_column + 2 // Unordered lists need marker + space (min 2)
227                    };
228                }
229
230                // Stop at structural separators that would break list context
231                if line_info.heading.is_some() || Self::is_structural_separator(&line_info.content) {
232                    break;
233                }
234            }
235        }
236
237        0 // No list context found
238    }
239
240    /// Check if content is a structural separator (headings, horizontal rules, etc.)
241    fn is_structural_separator(content: &str) -> bool {
242        let trimmed = content.trim();
243        trimmed.starts_with("---")
244            || trimmed.starts_with("***")
245            || trimmed.starts_with("___")
246            || trimmed.contains('|') // Tables
247            || trimmed.starts_with(">") // Blockquotes
248    }
249}
250
#[cfg(test)]
mod tests {
    use super::*;

    /// Slice `content` by a detected block range.
    fn block_text(content: &str, block: (usize, usize)) -> &str {
        &content[block.0..block.1]
    }

    #[test]
    fn test_detect_fenced_code_blocks() {
        // Basic fenced code block with backticks: exactly one block, spanning both fences.
        let content = "Some text\n```\ncode here\n```\nMore text";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(block_text(content, blocks[0]), "```\ncode here\n```");

        // Fenced code block with tildes
        let content = "Some text\n~~~\ncode here\n~~~\nMore text";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(block_text(content, blocks[0]), "~~~\ncode here\n~~~");

        // Multiple code blocks are reported in document order
        let content = "Text\n```\ncode1\n```\nMiddle\n~~~\ncode2\n~~~\nEnd";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 2);
        assert_eq!(block_text(content, blocks[0]), "```\ncode1\n```");
        assert_eq!(block_text(content, blocks[1]), "~~~\ncode2\n~~~");
    }

    #[test]
    fn test_detect_code_blocks_with_language() {
        // Code block with language identifier
        let content = "Text\n```rust\nfn main() {}\n```\nMore";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(block_text(content, blocks[0]), "```rust\nfn main() {}\n```");
    }

    #[test]
    fn test_unclosed_code_block() {
        // Unclosed code block should extend to end of content
        let content = "Text\n```\ncode here\nno closing fence";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].0, "Text\n".len());
        assert_eq!(blocks[0].1, content.len());
    }

    #[test]
    fn test_indented_code_blocks() {
        // Basic indented code block
        let content = "Paragraph\n\n    code line 1\n    code line 2\n\nMore text";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(block_text(content, blocks[0]), "    code line 1\n    code line 2");

        // Indented code with tabs
        let content = "Paragraph\n\n\tcode with tab\n\tanother line\n\nText";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(block_text(content, blocks[0]), "\tcode with tab\n\tanother line");
    }

    #[test]
    fn test_indented_code_requires_blank_line() {
        // Indented lines without preceding blank line are not code blocks
        let content = "Paragraph\n    indented but not code\nMore text";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);

        // With blank line, it becomes a code block
        let content = "Paragraph\n\n    now it's code\nMore text";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(block_text(content, blocks[0]), "    now it's code");
    }

    #[test]
    fn test_list_items_not_code_blocks() {
        // List items should not be detected as code blocks
        let content = "List:\n\n    - Item 1\n    - Item 2\n    * Item 3\n    + Item 4";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);

        // Numbered lists
        let content = "List:\n\n    1. First\n    2. Second\n    1) Also first";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);
    }

    #[test]
    fn test_inline_code_spans_not_detected() {
        // Inline code spans should NOT be detected as code blocks
        let content = "Text with `inline code` here";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);

        // Multiple backtick code span
        let content = "Text with ``code with ` backtick`` here";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);

        // Multiple code spans
        let content = "Has `code1` and `code2` spans";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);
    }

    #[test]
    fn test_unclosed_code_span() {
        // Unclosed code span should not be detected
        let content = "Text with `unclosed code span";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);

        // Mismatched backticks
        let content = "Text with ``one style` different close";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);
    }

    #[test]
    fn test_mixed_code_blocks_and_spans() {
        let content = "Has `span1` text\n```\nblock\n```\nand `span2`";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        // Only the fenced block is reported; the inline spans are ignored
        assert_eq!(blocks.len(), 1);
        assert_eq!(block_text(content, blocks[0]), "```\nblock\n```");

        assert!(!blocks.iter().any(|&b| block_text(content, b) == "`span1`"));
        assert!(!blocks.iter().any(|&b| block_text(content, b) == "`span2`"));
    }

    #[test]
    fn test_is_in_code_block_or_span() {
        let blocks = vec![(10, 20), (30, 40), (50, 60)];

        // Test positions inside blocks
        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 15));
        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 35));
        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 55));

        // Test positions at boundaries
        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 10)); // Start is inclusive
        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 20)); // End is exclusive

        // Test positions outside blocks
        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 5));
        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 25));
        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 65));
    }

    #[test]
    fn test_is_in_code_block() {
        let blocks = vec![(10, 20), (30, 40)];

        assert!(CodeBlockUtils::is_in_code_block(&blocks, 10)); // Start is inclusive
        assert!(CodeBlockUtils::is_in_code_block(&blocks, 39));
        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 20)); // End is exclusive
        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 25));
        assert!(!CodeBlockUtils::is_in_code_block(&[], 0));
    }

    #[test]
    fn test_is_structural_separator() {
        assert!(CodeBlockUtils::is_structural_separator("---"));
        assert!(CodeBlockUtils::is_structural_separator("  ***  "));
        assert!(CodeBlockUtils::is_structural_separator("___"));
        assert!(CodeBlockUtils::is_structural_separator("| a | b |"));
        assert!(CodeBlockUtils::is_structural_separator("> quote"));
        assert!(!CodeBlockUtils::is_structural_separator("plain text"));
        assert!(!CodeBlockUtils::is_structural_separator(""));
    }

    #[test]
    fn test_empty_content() {
        let blocks = CodeBlockUtils::detect_code_blocks("");
        assert_eq!(blocks.len(), 0);
    }

    #[test]
    fn test_code_block_at_start() {
        let content = "```\ncode\n```\nText after";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].0, 0); // Fenced block starts at 0
        assert_eq!(block_text(content, blocks[0]), "```\ncode\n```");
    }

    #[test]
    fn test_code_block_at_end() {
        let content = "Text before\n```\ncode\n```";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(block_text(content, blocks[0]), "```\ncode\n```");
        assert_eq!(blocks[0].1, content.len());
    }

    #[test]
    fn test_nested_fence_markers() {
        // Shorter fence markers inside a longer fence are content, not fences:
        // only the outer four-backtick block is reported.
        let content = "Text\n````\n```\nnested\n```\n````\nAfter";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(block_text(content, blocks[0]), "````\n```\nnested\n```\n````");
    }

    #[test]
    fn test_indented_code_with_blank_lines() {
        // Indented code separated by a blank line may be reported as one block or several;
        // either way every code line must be covered by some block.
        let content = "Text\n\n    line1\n\n    line2\n\nAfter";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert!(!blocks.is_empty());
        for needle in ["line1", "line2"] {
            assert!(
                blocks.iter().any(|&b| block_text(content, b).contains(needle)),
                "{needle} is not covered by any detected block"
            );
        }
        // Surrounding prose must stay outside every block
        assert!(!blocks.iter().any(|&b| block_text(content, b).contains("After")));
    }

    #[test]
    fn test_code_span_with_spaces() {
        // Code spans should NOT be detected as code blocks
        let content = "Text ` code with spaces ` more";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);
    }

    #[test]
    fn test_fenced_block_with_info_string() {
        // Fenced code blocks with complex info strings
        let content = "```rust,no_run,should_panic\ncode\n```";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0], (0, content.len()));
    }

    #[test]
    fn test_indented_fences_not_code_blocks() {
        // Fences indented by fewer than four spaces are still fences: they yield a single
        // fenced block (starting at the indented opening line), not an indented code block.
        let content = "Text\n  ```\n  code\n  ```\nAfter";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(block_text(content, blocks[0]), "  ```\n  code\n  ```");
    }
}