rumdl_lib/utils/
code_block_utils.rs

1//!
2//! Utility functions for detecting and handling code blocks and code spans in Markdown for rumdl.
3
4use lazy_static::lazy_static;
5use regex::Regex;
6
/// How a code block relates to the list content around it.
///
/// Produced by [`CodeBlockUtils::analyze_code_block_context`].
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum CodeBlockContext {
    /// The block is not indented enough to continue a list and has at least one
    /// blank line directly before or after it, so it separates lists.
    Standalone,
    /// The block is indented at least as far as the list continuation indent,
    /// so it continues the list.
    Indented,
    /// The block touches list content without blank-line separation (edge case;
    /// treated conservatively as non-breaking).
    Adjacent,
}
17
// Regexes compiled once on first use.
lazy_static! {
    /// Matches a fence opener (three backticks or three tildes) at the very start of the input.
    static ref CODE_BLOCK_PATTERN: Regex = Regex::new(r"^(```|~~~)").unwrap();
    /// Matches a run of one or more consecutive backticks.
    static ref CODE_SPAN_PATTERN: Regex = Regex::new(r"`+").unwrap();
}
22
/// Utility functions for detecting and handling code blocks in Markdown.
///
/// This is a stateless unit struct; all functionality is exposed as associated functions.
pub struct CodeBlockUtils;
25
26impl CodeBlockUtils {
27    /// Detect all code blocks in the content
28    pub fn detect_code_blocks(content: &str) -> Vec<(usize, usize)> {
29        let mut blocks = Vec::new();
30        let mut in_code_block = false;
31        let mut code_block_start = 0;
32        let mut opening_fence_char = ' ';
33        let mut opening_fence_len = 0;
34
35        // Pre-compute line positions for efficient offset calculation
36        let lines: Vec<&str> = content.lines().collect();
37        let mut line_positions = Vec::with_capacity(lines.len());
38        let mut pos = 0;
39        for line in &lines {
40            line_positions.push(pos);
41            pos += line.len() + 1; // +1 for newline
42        }
43
44        // Find fenced code blocks
45        for (i, line) in lines.iter().enumerate() {
46            let line_start = line_positions[i];
47            let trimmed = line.trim_start();
48
49            // Check if this line could be a code fence
50            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
51                let fence_char = trimmed.chars().next().unwrap();
52                let fence_len = trimmed.chars().take_while(|&c| c == fence_char).count();
53
54                if !in_code_block && fence_len >= 3 {
55                    // Opening fence
56                    code_block_start = line_start;
57                    in_code_block = true;
58                    opening_fence_char = fence_char;
59                    opening_fence_len = fence_len;
60                } else if in_code_block && fence_char == opening_fence_char && fence_len >= opening_fence_len {
61                    // Closing fence - must match opening fence character and be at least as long
62                    let code_block_end = line_start + line.len();
63                    blocks.push((code_block_start, code_block_end));
64                    in_code_block = false;
65                    opening_fence_char = ' ';
66                    opening_fence_len = 0;
67                }
68                // If we're in a code block but the fence doesn't match, it's just content
69            }
70        }
71
72        // Handle unclosed code blocks
73        if in_code_block {
74            blocks.push((code_block_start, content.len()));
75        }
76
77        // Find indented code blocks (4+ spaces or tab at start of line)
78        // According to CommonMark, indented code blocks must be preceded by a blank line
79        // (unless they're at the start of the document or after a block-level element)
80        let mut in_indented_block = false;
81        let mut indented_block_start = 0;
82
83        for (line_idx, line) in lines.iter().enumerate() {
84            let line_start = if line_idx < line_positions.len() {
85                line_positions[line_idx]
86            } else {
87                0
88            };
89
90            // Check if this line is indented code
91            let is_indented = line.starts_with("    ") || line.starts_with("\t");
92
93            // Check if this looks like a list item (has list marker after indentation)
94            let trimmed = line.trim_start();
95            let is_list_item = trimmed.starts_with("- ")
96                || trimmed.starts_with("* ")
97                || trimmed.starts_with("+ ")
98                || trimmed.chars().next().is_some_and(|c| c.is_numeric())
99                    && trimmed.chars().nth(1).is_some_and(|c| c == '.' || c == ')');
100
101            // Check if previous line was blank
102            let prev_blank = line_idx > 0 && lines[line_idx - 1].trim().is_empty();
103
104            if is_indented && !line.trim().is_empty() && !is_list_item {
105                if !in_indented_block {
106                    // Only start an indented code block if preceded by a blank line
107                    if prev_blank {
108                        in_indented_block = true;
109                        indented_block_start = line_start;
110                    }
111                    // Otherwise, this is just an indented line, not a code block
112                }
113            } else if in_indented_block {
114                // End of indented code block
115                let block_end = if line_idx > 0 && line_idx - 1 < line_positions.len() {
116                    line_positions[line_idx - 1] + lines[line_idx - 1].len()
117                } else {
118                    line_start
119                };
120                blocks.push((indented_block_start, block_end));
121                in_indented_block = false;
122            }
123        }
124
125        // Handle indented block that goes to end of file
126        if in_indented_block {
127            blocks.push((indented_block_start, content.len()));
128        }
129
130        // Find inline code spans
131        let mut i = 0;
132        while i < content.len() {
133            if let Some(m) = CODE_SPAN_PATTERN.find_at(content, i) {
134                let backtick_length = m.end() - m.start();
135                let start = m.start();
136
137                // Check if this is a fence marker (3+ backticks at start of line)
138                if backtick_length >= 3 {
139                    // Check if it's at the start of a line
140                    let at_line_start = start == 0 || content.as_bytes()[start - 1] == b'\n';
141                    if at_line_start {
142                        // This is a fence, not an inline code span - skip it
143                        i = m.end();
144                        continue;
145                    }
146                }
147
148                // Check if these backticks are escaped (preceded by backslash)
149                // In Markdown, \` is an escaped backtick and should not start a code span
150                let is_escaped = start > 0 && content.as_bytes()[start - 1] == b'\\';
151
152                if is_escaped {
153                    // Skip escaped backticks
154                    i = m.end();
155                    continue;
156                }
157
158                // Find matching closing backticks (that are also not escaped)
159                let search_str = &content[m.end()..];
160                let backtick_pattern = "`".repeat(backtick_length);
161
162                // Look for unescaped closing backticks
163                let mut search_pos = 0;
164                let mut found_end = None;
165                while search_pos < search_str.len() {
166                    if let Some(pos) = search_str[search_pos..].find(&backtick_pattern) {
167                        let absolute_pos = m.end() + search_pos + pos;
168                        // Check if these closing backticks are escaped
169                        if absolute_pos > 0 && content.as_bytes()[absolute_pos - 1] == b'\\' {
170                            // These are escaped, keep searching
171                            // Advance past the escaped backticks, but at least by 1
172                            let advance = (pos + backtick_length).max(1);
173                            search_pos += advance;
174                        } else {
175                            // Found unescaped closing backticks
176                            found_end = Some(search_pos + pos);
177                            break;
178                        }
179                    } else {
180                        break;
181                    }
182                }
183
184                if let Some(end_pos) = found_end {
185                    let end = m.end() + end_pos + backtick_length;
186                    blocks.push((start, end));
187                    i = end;
188                } else {
189                    i = m.end();
190                }
191            } else {
192                break;
193            }
194        }
195
196        blocks.sort_by(|a, b| a.0.cmp(&b.0));
197        blocks
198    }
199
200    /// Check if a position is within a code block or code span
201    pub fn is_in_code_block_or_span(blocks: &[(usize, usize)], pos: usize) -> bool {
202        blocks.iter().any(|&(start, end)| pos >= start && pos < end)
203    }
204
205    /// Analyze code block context relative to list parsing
206    /// This is the core function implementing Design #3's three-tier classification
207    pub fn analyze_code_block_context(
208        lines: &[crate::lint_context::LineInfo],
209        line_idx: usize,
210        min_continuation_indent: usize,
211    ) -> CodeBlockContext {
212        if let Some(line_info) = lines.get(line_idx) {
213            // Rule 1: Indentation Analysis - Is it sufficiently indented for list continuation?
214            if line_info.indent >= min_continuation_indent {
215                return CodeBlockContext::Indented;
216            }
217
218            // Rule 2: Blank Line Context - Check for structural separation indicators
219            let (prev_blanks, next_blanks) = Self::count_surrounding_blank_lines(lines, line_idx);
220
221            // Rule 3: Standalone Detection - Insufficient indentation + blank line separation
222            // This is the key fix: root-level code blocks with blank lines separate lists
223            if prev_blanks > 0 || next_blanks > 0 {
224                return CodeBlockContext::Standalone;
225            }
226
227            // Rule 4: Default - Adjacent (conservative, non-breaking for edge cases)
228            CodeBlockContext::Adjacent
229        } else {
230            // Fallback for invalid line index
231            CodeBlockContext::Adjacent
232        }
233    }
234
235    /// Count blank lines before and after the given line index
236    fn count_surrounding_blank_lines(lines: &[crate::lint_context::LineInfo], line_idx: usize) -> (usize, usize) {
237        let mut prev_blanks = 0;
238        let mut next_blanks = 0;
239
240        // Count blank lines before (look backwards)
241        for i in (0..line_idx).rev() {
242            if let Some(line) = lines.get(i) {
243                if line.is_blank {
244                    prev_blanks += 1;
245                } else {
246                    break;
247                }
248            } else {
249                break;
250            }
251        }
252
253        // Count blank lines after (look forwards)
254        for i in (line_idx + 1)..lines.len() {
255            if let Some(line) = lines.get(i) {
256                if line.is_blank {
257                    next_blanks += 1;
258                } else {
259                    break;
260                }
261            } else {
262                break;
263            }
264        }
265
266        (prev_blanks, next_blanks)
267    }
268
269    /// Calculate minimum indentation required for code block to continue a list
270    /// Based on the most recent list item's marker width
271    pub fn calculate_min_continuation_indent(
272        lines: &[crate::lint_context::LineInfo],
273        current_line_idx: usize,
274    ) -> usize {
275        // Look backwards to find the most recent list item
276        for i in (0..current_line_idx).rev() {
277            if let Some(line_info) = lines.get(i) {
278                if let Some(list_item) = &line_info.list_item {
279                    // Calculate minimum continuation indent for this list item
280                    return if list_item.is_ordered {
281                        list_item.marker_column + list_item.marker.len() + 1 // +1 for space after marker
282                    } else {
283                        list_item.marker_column + 2 // Unordered lists need marker + space (min 2)
284                    };
285                }
286
287                // Stop at structural separators that would break list context
288                if line_info.heading.is_some() || Self::is_structural_separator(&line_info.content) {
289                    break;
290                }
291            }
292        }
293
294        0 // No list context found
295    }
296
297    /// Check if content is a structural separator (headings, horizontal rules, etc.)
298    fn is_structural_separator(content: &str) -> bool {
299        let trimmed = content.trim();
300        trimmed.starts_with("---")
301            || trimmed.starts_with("***")
302            || trimmed.starts_with("___")
303            || trimmed.contains('|') // Tables
304            || trimmed.starts_with(">") // Blockquotes
305    }
306}
307
// Unit tests for `CodeBlockUtils::detect_code_blocks` and
// `CodeBlockUtils::is_in_code_block_or_span`.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_detect_fenced_code_blocks() {
        // The function detects fenced blocks and inline code spans.
        // Fence markers (``` at line start) are skipped by the inline span scan,
        // so a fenced block at column 0 is reported exactly once.

        // Basic fenced code block with backticks
        let content = "Some text\n```\ncode here\n```\nMore text";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        // Should find exactly one region: the fenced block
        assert_eq!(blocks.len(), 1);

        // Check that we have the fenced block
        let fenced_block = blocks
            .iter()
            .find(|(start, end)| end - start > 10 && content[*start..*end].contains("code here"));
        assert!(fenced_block.is_some());

        // Fenced code block with tildes (the inline span scan only looks at backticks)
        let content = "Some text\n~~~\ncode here\n~~~\nMore text";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        // The region runs from the opening fence through the end of the closing fence
        assert_eq!(&content[blocks[0].0..blocks[0].1], "~~~\ncode here\n~~~");

        // Multiple code blocks
        let content = "Text\n```\ncode1\n```\nMiddle\n~~~\ncode2\n~~~\nEnd";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        // 2 fenced blocks, one per fence style
        assert_eq!(blocks.len(), 2);
    }

    #[test]
    fn test_detect_code_blocks_with_language() {
        // Code block with language identifier after the opening fence
        let content = "Text\n```rust\nfn main() {}\n```\nMore";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        // 1 fenced block (fence markers at line start are not treated as inline spans)
        assert_eq!(blocks.len(), 1);
        // Check we have the full fenced block
        let fenced = blocks.iter().find(|(s, e)| content[*s..*e].contains("fn main"));
        assert!(fenced.is_some());
    }

    #[test]
    fn test_unclosed_code_block() {
        // Unclosed code block should extend to end of content
        let content = "Text\n```\ncode here\nno closing fence";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].1, content.len());
    }

    #[test]
    fn test_indented_code_blocks() {
        // Basic indented code block (four spaces, preceded by a blank line)
        let content = "Paragraph\n\n    code line 1\n    code line 2\n\nMore text";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert!(content[blocks[0].0..blocks[0].1].contains("code line 1"));
        assert!(content[blocks[0].0..blocks[0].1].contains("code line 2"));

        // Indented code with tabs
        let content = "Paragraph\n\n\tcode with tab\n\tanother line\n\nText";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
    }

    #[test]
    fn test_indented_code_requires_blank_line() {
        // Indented lines without preceding blank line are not code blocks
        let content = "Paragraph\n    indented but not code\nMore text";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);

        // With blank line, it becomes a code block
        let content = "Paragraph\n\n    now it's code\nMore text";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
    }

    #[test]
    fn test_list_items_not_code_blocks() {
        // Indented bullet list items should not be detected as code blocks
        let content = "List:\n\n    - Item 1\n    - Item 2\n    * Item 3\n    + Item 4";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);

        // Numbered lists, with both `.` and `)` delimiters
        let content = "List:\n\n    1. First\n    2. Second\n    1) Also first";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);
    }

    #[test]
    fn test_inline_code_spans() {
        // Single backtick code span; the region includes both delimiters
        let content = "Text with `inline code` here";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(&content[blocks[0].0..blocks[0].1], "`inline code`");

        // Double backtick code span containing a single backtick
        let content = "Text with ``code with ` backtick`` here";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(&content[blocks[0].0..blocks[0].1], "``code with ` backtick``");

        // Multiple code spans
        let content = "Has `code1` and `code2` spans";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 2);
    }

    #[test]
    fn test_unclosed_code_span() {
        // Unclosed code span should not be detected
        let content = "Text with `unclosed code span";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);

        // Mismatched backticks: a double-backtick opener is not closed by a single backtick
        let content = "Text with ``one style` different close";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 0);
    }

    #[test]
    fn test_mixed_code_blocks_and_spans() {
        let content = "Has `span1` text\n```\nblock\n```\nand `span2`";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        // Regions of different kinds may overlap, so only a lower bound is asserted.
        // We should have at least: span1, fenced block, span2
        assert!(blocks.len() >= 3);

        // Check we have the expected elements
        assert!(blocks.iter().any(|(s, e)| &content[*s..*e] == "`span1`"));
        assert!(blocks.iter().any(|(s, e)| &content[*s..*e] == "`span2`"));
        assert!(blocks.iter().any(|(s, e)| content[*s..*e].contains("block")));

        // Verify they're sorted by start position (allowing duplicates/overlaps)
        for i in 1..blocks.len() {
            assert!(blocks[i - 1].0 <= blocks[i].0);
        }
    }

    #[test]
    fn test_is_in_code_block_or_span() {
        let blocks = vec![(10, 20), (30, 40), (50, 60)];

        // Test positions inside blocks
        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 15));
        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 35));
        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 55));

        // Test positions at boundaries
        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 10)); // Start is inclusive
        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 20)); // End is exclusive

        // Test positions outside blocks
        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 5));
        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 25));
        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 65));
    }

    #[test]
    fn test_empty_content() {
        // Empty input yields no regions
        let blocks = CodeBlockUtils::detect_code_blocks("");
        assert_eq!(blocks.len(), 0);
    }

    #[test]
    fn test_code_block_at_start() {
        let content = "```\ncode\n```\nText after";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        // 1 fenced block (fence markers at line start are not treated as inline spans)
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].0, 0); // Fenced block starts at 0
    }

    #[test]
    fn test_code_block_at_end() {
        let content = "Text before\n```\ncode\n```";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        // 1 fenced block (fence markers at line start are not treated as inline spans)
        assert_eq!(blocks.len(), 1);
        // Check we have the fenced block
        let fenced = blocks.iter().find(|(s, e)| content[*s..*e].contains("code"));
        assert!(fenced.is_some());
    }

    #[test]
    fn test_nested_fence_markers() {
        // Code block containing fence markers as content
        let content = "Text\n````\n```\nnested\n```\n````\nAfter";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        // The inner ``` lines are shorter than the opening ```` fence, so they cannot
        // close it and are treated as content of the outer block.
        assert!(!blocks.is_empty());
        // Check we have the outer block
        let outer = blocks.iter().find(|(s, e)| content[*s..*e].contains("nested"));
        assert!(outer.is_some());
    }

    #[test]
    fn test_indented_code_with_blank_lines() {
        // Indented code separated by a blank line
        let content = "Text\n\n    line1\n\n    line2\n\nAfter";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        // A blank line ends the current indented block, so `line1` and `line2` may be
        // reported as separate regions; only non-emptiness is asserted here.
        assert!(!blocks.is_empty());
        // Check that we captured the indented code
        let all_content: String = blocks
            .iter()
            .map(|(s, e)| &content[*s..*e])
            .collect::<Vec<_>>()
            .join("");
        assert!(all_content.contains("line1") || content[blocks[0].0..blocks[0].1].contains("line1"));
    }

    #[test]
    fn test_code_span_with_spaces() {
        // Code spans can have leading/trailing spaces
        let content = "Text ` code with spaces ` more";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        assert_eq!(blocks.len(), 1);
        assert_eq!(&content[blocks[0].0..blocks[0].1], "` code with spaces `");
    }

    #[test]
    fn test_fenced_block_with_info_string() {
        // Fenced code blocks with complex info strings
        let content = "```rust,no_run,should_panic\ncode\n```";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        // 1 fenced block (fence markers at line start are not treated as inline spans)
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].0, 0);
    }

    #[test]
    fn test_indented_fences_not_code_blocks() {
        // Indented fence markers still work as fences: the fence check ignores
        // leading whitespace.
        let content = "Text\n  ```\n  code\n  ```\nAfter";
        let blocks = CodeBlockUtils::detect_code_blocks(content);
        // 1 fenced block + 1 inline span: the span scan only skips backtick runs that
        // start a line, so the indented ``` pair is also reported as a span.
        assert_eq!(blocks.len(), 2);
    }
}