rumdl_lib/utils/
code_block_utils.rs

1//!
2//! Utility functions for detecting and handling code blocks and code spans in Markdown for rumdl.
3
4use lazy_static::lazy_static;
5use regex::Regex;
6
/// How a code block relates to the list content around it.
///
/// Produced by `CodeBlockUtils::analyze_code_block_context`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum CodeBlockContext {
    /// The block is not indented far enough to continue a list and has at
    /// least one blank line directly before or after it, so it separates lists.
    Standalone,
    /// The block is indented at least as far as the list continuation indent,
    /// so it is treated as part of the list.
    Indented,
    /// The block is neither sufficiently indented nor set apart by blank lines.
    /// This is the conservative default and does not break the list.
    Adjacent,
}
17
18lazy_static! {
19    static ref CODE_BLOCK_PATTERN: Regex = Regex::new(r"^(```|~~~)").unwrap();
20    static ref CODE_SPAN_PATTERN: Regex = Regex::new(r"`+").unwrap();
21}
22
/// Namespace type for the Markdown code-block helpers in this module.
///
/// It carries no data; every helper is an associated function.
pub struct CodeBlockUtils;
25
26impl CodeBlockUtils {
27    /// Detect all code blocks in the content
28    pub fn detect_code_blocks(content: &str) -> Vec<(usize, usize)> {
29        let mut blocks = Vec::new();
30        let mut in_code_block = false;
31        let mut code_block_start = 0;
32        let mut opening_fence_char = ' ';
33        let mut opening_fence_len = 0;
34
35        // Pre-compute line positions for efficient offset calculation
36        let lines: Vec<&str> = content.lines().collect();
37        let mut line_positions = Vec::with_capacity(lines.len());
38        let mut pos = 0;
39        for line in &lines {
40            line_positions.push(pos);
41            pos += line.len() + 1; // +1 for newline
42        }
43
44        // Find fenced code blocks
45        for (i, line) in lines.iter().enumerate() {
46            let line_start = line_positions[i];
47            let trimmed = line.trim_start();
48
49            // Check if this line could be a code fence
50            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
51                let fence_char = trimmed.chars().next().unwrap();
52                let fence_len = trimmed.chars().take_while(|&c| c == fence_char).count();
53
54                if !in_code_block && fence_len >= 3 {
55                    // Opening fence
56                    code_block_start = line_start;
57                    in_code_block = true;
58                    opening_fence_char = fence_char;
59                    opening_fence_len = fence_len;
60                } else if in_code_block && fence_char == opening_fence_char && fence_len >= opening_fence_len {
61                    // Closing fence - must match opening fence character and be at least as long
62                    let code_block_end = line_start + line.len();
63                    blocks.push((code_block_start, code_block_end));
64                    in_code_block = false;
65                    opening_fence_char = ' ';
66                    opening_fence_len = 0;
67                }
68                // If we're in a code block but the fence doesn't match, it's just content
69            }
70        }
71
72        // Handle unclosed code blocks
73        if in_code_block {
74            blocks.push((code_block_start, content.len()));
75        }
76
77        // Find indented code blocks (4+ spaces or tab at start of line)
78        // According to CommonMark, indented code blocks must be preceded by a blank line
79        // (unless they're at the start of the document or after a block-level element)
80        let mut in_indented_block = false;
81        let mut indented_block_start = 0;
82
83        for (line_idx, line) in lines.iter().enumerate() {
84            let line_start = if line_idx < line_positions.len() {
85                line_positions[line_idx]
86            } else {
87                0
88            };
89
90            // Check if this line is indented code
91            let is_indented = line.starts_with("    ") || line.starts_with("\t");
92
93            // Check if this looks like a list item (has list marker after indentation)
94            let trimmed = line.trim_start();
95            let is_list_item = trimmed.starts_with("- ")
96                || trimmed.starts_with("* ")
97                || trimmed.starts_with("+ ")
98                || trimmed.chars().next().is_some_and(|c| c.is_numeric())
99                    && trimmed.chars().nth(1).is_some_and(|c| c == '.' || c == ')');
100
101            // Check if previous line was blank
102            let prev_blank = line_idx > 0 && lines[line_idx - 1].trim().is_empty();
103
104            if is_indented && !line.trim().is_empty() && !is_list_item {
105                if !in_indented_block {
106                    // Only start an indented code block if preceded by a blank line
107                    if prev_blank {
108                        in_indented_block = true;
109                        indented_block_start = line_start;
110                    }
111                    // Otherwise, this is just an indented line, not a code block
112                }
113            } else if in_indented_block {
114                // End of indented code block
115                let block_end = if line_idx > 0 && line_idx - 1 < line_positions.len() {
116                    line_positions[line_idx - 1] + lines[line_idx - 1].len()
117                } else {
118                    line_start
119                };
120                blocks.push((indented_block_start, block_end));
121                in_indented_block = false;
122            }
123        }
124
125        // Handle indented block that goes to end of file
126        if in_indented_block {
127            blocks.push((indented_block_start, content.len()));
128        }
129
130        // Find inline code spans
131        let mut i = 0;
132        while i < content.len() {
133            if let Some(m) = CODE_SPAN_PATTERN.find_at(content, i) {
134                let backtick_length = m.end() - m.start();
135                let start = m.start();
136
137                // Find matching closing backticks
138                if let Some(end_pos) = content[m.end()..].find(&"`".repeat(backtick_length)) {
139                    let end = m.end() + end_pos + backtick_length;
140                    blocks.push((start, end));
141                    i = end;
142                } else {
143                    i = m.end();
144                }
145            } else {
146                break;
147            }
148        }
149
150        blocks.sort_by(|a, b| a.0.cmp(&b.0));
151        blocks
152    }
153
154    /// Check if a position is within a code block or code span
155    pub fn is_in_code_block_or_span(blocks: &[(usize, usize)], pos: usize) -> bool {
156        blocks.iter().any(|&(start, end)| pos >= start && pos < end)
157    }
158
159    /// Analyze code block context relative to list parsing
160    /// This is the core function implementing Design #3's three-tier classification
161    pub fn analyze_code_block_context(
162        lines: &[crate::lint_context::LineInfo],
163        line_idx: usize,
164        min_continuation_indent: usize,
165    ) -> CodeBlockContext {
166        if let Some(line_info) = lines.get(line_idx) {
167            // Rule 1: Indentation Analysis - Is it sufficiently indented for list continuation?
168            if line_info.indent >= min_continuation_indent {
169                return CodeBlockContext::Indented;
170            }
171
172            // Rule 2: Blank Line Context - Check for structural separation indicators
173            let (prev_blanks, next_blanks) = Self::count_surrounding_blank_lines(lines, line_idx);
174
175            // Rule 3: Standalone Detection - Insufficient indentation + blank line separation
176            // This is the key fix: root-level code blocks with blank lines separate lists
177            if prev_blanks > 0 || next_blanks > 0 {
178                return CodeBlockContext::Standalone;
179            }
180
181            // Rule 4: Default - Adjacent (conservative, non-breaking for edge cases)
182            CodeBlockContext::Adjacent
183        } else {
184            // Fallback for invalid line index
185            CodeBlockContext::Adjacent
186        }
187    }
188
189    /// Count blank lines before and after the given line index
190    fn count_surrounding_blank_lines(lines: &[crate::lint_context::LineInfo], line_idx: usize) -> (usize, usize) {
191        let mut prev_blanks = 0;
192        let mut next_blanks = 0;
193
194        // Count blank lines before (look backwards)
195        for i in (0..line_idx).rev() {
196            if let Some(line) = lines.get(i) {
197                if line.is_blank {
198                    prev_blanks += 1;
199                } else {
200                    break;
201                }
202            } else {
203                break;
204            }
205        }
206
207        // Count blank lines after (look forwards)
208        for i in (line_idx + 1)..lines.len() {
209            if let Some(line) = lines.get(i) {
210                if line.is_blank {
211                    next_blanks += 1;
212                } else {
213                    break;
214                }
215            } else {
216                break;
217            }
218        }
219
220        (prev_blanks, next_blanks)
221    }
222
223    /// Calculate minimum indentation required for code block to continue a list
224    /// Based on the most recent list item's marker width
225    pub fn calculate_min_continuation_indent(
226        lines: &[crate::lint_context::LineInfo],
227        current_line_idx: usize,
228    ) -> usize {
229        // Look backwards to find the most recent list item
230        for i in (0..current_line_idx).rev() {
231            if let Some(line_info) = lines.get(i) {
232                if let Some(list_item) = &line_info.list_item {
233                    // Calculate minimum continuation indent for this list item
234                    return if list_item.is_ordered {
235                        list_item.marker_column + list_item.marker.len() + 1 // +1 for space after marker
236                    } else {
237                        list_item.marker_column + 2 // Unordered lists need marker + space (min 2)
238                    };
239                }
240
241                // Stop at structural separators that would break list context
242                if line_info.heading.is_some() || Self::is_structural_separator(&line_info.content) {
243                    break;
244                }
245            }
246        }
247
248        0 // No list context found
249    }
250
251    /// Check if content is a structural separator (headings, horizontal rules, etc.)
252    fn is_structural_separator(content: &str) -> bool {
253        let trimmed = content.trim();
254        trimmed.starts_with("---")
255            || trimmed.starts_with("***")
256            || trimmed.starts_with("___")
257            || trimmed.contains('|') // Tables
258            || trimmed.starts_with(">") // Blockquotes
259    }
260}
261
#[cfg(test)]
mod tests {
    use super::*;

    // NOTE: `detect_code_blocks` reports fenced blocks, indented blocks AND inline
    // spans. A backtick fence is therefore counted twice in the tests below: once
    // as a fenced block and once as a span formed by its opening and closing runs.

    #[test]
    fn test_detect_fenced_code_blocks() {
        // Basic fenced code block with backticks: 1 fenced block + 1 span.
        let content = "Some text\n```\ncode here\n```\nMore text";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 2);

        // The fenced block itself must be among the results.
        let fenced_block = blocks
            .iter()
            .find(|(start, end)| end - start > 10 && content[*start..*end].contains("code here"));
        assert!(fenced_block.is_some());

        // Tilde fences are never picked up by the backtick span scan.
        let content = "Some text\n~~~\ncode here\n~~~\nMore text";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(&content[blocks[0].0..blocks[0].1], "~~~\ncode here\n~~~");

        // Two fenced blocks + 1 span for the backtick fence.
        let content = "Text\n```\ncode1\n```\nMiddle\n~~~\ncode2\n~~~\nEnd";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 3);
    }

    #[test]
    fn test_detect_code_blocks_with_language() {
        // A language identifier after the fence does not prevent detection.
        let content = "Text\n```rust\nfn main() {}\n```\nMore";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        // 1 fenced block + 1 span.
        assert_eq!(blocks.len(), 2);
        let fenced = blocks.iter().find(|(s, e)| content[*s..*e].contains("fn main"));
        assert!(fenced.is_some());
    }

    #[test]
    fn test_unclosed_code_block() {
        // An unclosed fence extends to the end of the content. The lone ``` run
        // has no partner, so no span is reported.
        let content = "Text\n```\ncode here\nno closing fence";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].1, content.len());
    }

    #[test]
    fn test_indented_code_blocks() {
        // Four-space indented block after a blank line.
        let content = "Paragraph\n\n    code line 1\n    code line 2\n\nMore text";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        let block_text = &content[blocks[0].0..blocks[0].1];
        assert!(block_text.contains("code line 1"));
        assert!(block_text.contains("code line 2"));

        // Tab-indented block.
        let content = "Paragraph\n\n\tcode with tab\n\tanother line\n\nText";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
    }

    #[test]
    fn test_indented_code_requires_blank_line() {
        // Without a preceding blank line an indented line is not a code block.
        let content = "Paragraph\n    indented but not code\nMore text";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);

        // With the blank line it is.
        let content = "Paragraph\n\n    now it's code\nMore text";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
    }

    #[test]
    fn test_list_items_not_code_blocks() {
        // Indented bullet items are not code blocks.
        let content = "List:\n\n    - Item 1\n    - Item 2\n    * Item 3\n    + Item 4";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);

        // Neither are indented numbered items.
        let content = "List:\n\n    1. First\n    2. Second\n    1) Also first";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);
    }

    #[test]
    fn test_inline_code_spans() {
        // Single-backtick span.
        let content = "Text with `inline code` here";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(&content[blocks[0].0..blocks[0].1], "`inline code`");

        // Double-backtick span containing a single backtick.
        let content = "Text with ``code with ` backtick`` here";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(&content[blocks[0].0..blocks[0].1], "``code with ` backtick``");

        // Several spans on one line.
        let content = "Has `code1` and `code2` spans";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 2);
    }

    #[test]
    fn test_unclosed_code_span() {
        // A span without a closing run is not reported.
        let content = "Text with `unclosed code span";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);

        // Opening and closing runs of different lengths do not pair up.
        let content = "Text with ``one style` different close";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);
    }

    #[test]
    fn test_mixed_code_blocks_and_spans() {
        let content = "Has `span1` text\n```\nblock\n```\nand `span2`";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        // Overlaps are allowed, so expect at least: span1, the fenced block, span2.
        assert!(blocks.len() >= 3);

        assert!(blocks.iter().any(|&(s, e)| &content[s..e] == "`span1`"));
        assert!(blocks.iter().any(|&(s, e)| &content[s..e] == "`span2`"));
        assert!(blocks.iter().any(|&(s, e)| content[s..e].contains("block")));

        // Results are ordered by start offset (equal starts are allowed).
        assert!(blocks.windows(2).all(|pair| pair[0].0 <= pair[1].0));
    }

    #[test]
    fn test_is_in_code_block_or_span() {
        let blocks = vec![(10, 20), (30, 40), (50, 60)];

        // Inside a range.
        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 15));
        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 35));
        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 55));

        // Boundaries: start is inclusive, end is exclusive.
        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 10));
        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 20));

        // Outside every range.
        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 5));
        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 25));
        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 65));
    }

    #[test]
    fn test_empty_content() {
        let blocks = CodeBlockUtils::detect_code_blocks("");
        assert_eq!(blocks.len(), 0);
    }

    #[test]
    fn test_code_block_at_start() {
        let content = "```\ncode\n```\nText after";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        // 1 fenced block + 1 span.
        assert_eq!(blocks.len(), 2);
        // The fenced block starts at offset 0.
        assert_eq!(blocks[0].0, 0);
    }

    #[test]
    fn test_code_block_at_end() {
        let content = "Text before\n```\ncode\n```";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        // 1 fenced block + 1 span.
        assert_eq!(blocks.len(), 2);
        let fenced = blocks.iter().find(|(s, e)| content[*s..*e].contains("code"));
        assert!(fenced.is_some());
    }

    #[test]
    fn test_nested_fence_markers() {
        // A four-backtick fence containing three-backtick fences as content.
        let content = "Text\n````\n```\nnested\n```\n````\nAfter";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert!(!blocks.is_empty());

        // The shorter inner fences must not close the outer block: the detected
        // block has to run from the opening ```` to the closing ````.
        assert!(blocks
            .iter()
            .any(|&(s, e)| &content[s..e] == "````\n```\nnested\n```\n````"));
        // No detected range may stop at an inner fence.
        assert!(blocks.iter().all(|&(s, e)| content[s..e].contains("nested")));
    }

    #[test]
    fn test_indented_code_with_blank_lines() {
        // Indented code separated by a blank line. The detector may report it as
        // one block or as several, so only coverage is checked, not the count.
        let content = "Text\n\n    line1\n\n    line2\n\nAfter";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert!(!blocks.is_empty());

        // Both indented lines must be covered by some detected block.
        assert!(blocks.iter().any(|&(s, e)| content[s..e].contains("line1")));
        assert!(blocks.iter().any(|&(s, e)| content[s..e].contains("line2")));
        // The surrounding prose must not be covered.
        assert!(blocks.iter().all(|&(s, e)| !content[s..e].contains("After")));
    }

    #[test]
    fn test_code_span_with_spaces() {
        // Leading and trailing spaces inside a span are kept in the range.
        let content = "Text ` code with spaces ` more";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(&content[blocks[0].0..blocks[0].1], "` code with spaces `");
    }

    #[test]
    fn test_fenced_block_with_info_string() {
        // A complex info string after the fence does not prevent detection.
        let content = "```rust,no_run,should_panic\ncode\n```";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        // 1 fenced block + 1 span.
        assert_eq!(blocks.len(), 2);
        assert_eq!(blocks[0].0, 0);
    }

    #[test]
    fn test_indented_fences_not_code_blocks() {
        // Despite the name, this checks that fence markers indented by fewer than
        // four spaces are still recognised as fences (and not as indented code).
        let content = "Text\n  ```\n  code\n  ```\nAfter";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        // 1 fenced block + 1 span.
        assert_eq!(blocks.len(), 2);
    }
}