Skip to main content

rumdl_lib/utils/
code_block_utils.rs

1//!
2//! Utility functions for detecting and handling code blocks and code spans in Markdown for rumdl.
3//!
4//! Code block detection is delegated to pulldown-cmark, which correctly implements the
5//! CommonMark specification. This handles edge cases like:
6//! - Backtick fences with backticks in the info string (invalid per spec)
7//! - Nested fences (longer fence contains shorter fence as content)
8//! - Mixed fence types (tilde fence contains backticks as content)
9//! - Indented code blocks with proper list context handling
10
11use pulldown_cmark::{CodeBlockKind, Event, Parser, Tag, TagEnd};
12
13use super::parser_options::rumdl_parser_options;
14
/// Pair of byte-range lists: code block ranges first, inline code span ranges second.
///
/// Every range is a `(start, end)` tuple of byte offsets.
pub type CodeRanges = (Vec<(usize, usize)>, Vec<(usize, usize)>);
17
/// Per-block record produced while parsing: where a code block sits and what kind it is.
#[derive(Debug, Clone)]
pub struct CodeBlockDetail {
    /// Byte offset of the first byte of the block
    pub start: usize,
    /// Byte offset just past the end of the block
    pub end: usize,
    /// `true` for a fenced block, `false` for an indented one
    pub is_fenced: bool,
    /// Info string of a fenced block (e.g. "rust" for ```rust); empty for indented blocks
    pub info_string: String,
}
30
/// Location and marker style of one strong-emphasis span found during parsing.
#[derive(Debug, Clone)]
pub struct StrongSpanDetail {
    /// Byte offset of the span start, opening marker included
    pub start: usize,
    /// Byte offset of the span end, closing marker included
    pub end: usize,
    /// `true` when the span is written with `**`, `false` otherwise (i.e. `__`)
    pub is_asterisk: bool,
}
41
/// Ordered-list membership: 1-indexed line number -> ID of the ordered list it belongs to
pub type LineToListMap = std::collections::HashMap<usize, usize>;
/// Ordered-list numbering: list ID -> number the list starts at
pub type ListStartValues = std::collections::HashMap<usize, u64>;
46
47/// Result of the central pulldown-cmark parse, capturing all data needed by individual rules
48pub struct ParseResult {
49    /// Code block byte ranges (start, end)
50    pub code_blocks: Vec<(usize, usize)>,
51    /// Inline code span byte ranges (start, end)
52    pub code_spans: Vec<(usize, usize)>,
53    /// Detailed code block info (fenced vs indented, info string)
54    pub code_block_details: Vec<CodeBlockDetail>,
55    /// Strong emphasis span details
56    pub strong_spans: Vec<StrongSpanDetail>,
57    /// Ordered list membership: maps line number (1-indexed) to list ID
58    pub line_to_list: LineToListMap,
59    /// Ordered list start values: maps list ID to start value
60    pub list_start_values: ListStartValues,
61}
62
/// How a code block relates to the list structure around it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CodeBlockContext {
    /// Root-level block set off by blank lines; it separates the lists around it
    Standalone,
    /// Block indented far enough to count as a continuation of a list item
    Indented,
    /// Block touching list content without blank lines (edge case, treated as non-breaking)
    Adjacent,
}
73
/// Namespace type for the Markdown code-block helpers; all functionality lives in
/// associated functions, so it carries no data.
pub struct CodeBlockUtils;
76
77impl CodeBlockUtils {
78    /// Detect all code blocks in the content (NOT including inline code spans)
79    ///
80    /// Uses pulldown-cmark for spec-compliant CommonMark parsing. This correctly handles:
81    /// - Fenced code blocks (``` and ~~~)
82    /// - Indented code blocks (4 spaces or tab)
83    /// - Code blocks inside lists, blockquotes, and other containers
84    /// - Edge cases like backticks in info strings (which invalidate the fence)
85    ///
86    /// Returns a sorted vector of (start, end) byte offset tuples.
87    pub fn detect_code_blocks(content: &str) -> Vec<(usize, usize)> {
88        Self::detect_code_blocks_and_spans(content).code_blocks
89    }
90
91    /// Returns code block ranges, inline code span ranges, and detailed code block info
92    /// in a single pulldown-cmark pass.
93    pub fn detect_code_blocks_and_spans(content: &str) -> ParseResult {
94        let mut blocks = Vec::new();
95        let mut spans = Vec::new();
96        let mut details = Vec::new();
97        let mut strong_spans = Vec::new();
98        let mut code_block_start: Option<(usize, bool, String)> = None;
99
100        // List membership tracking for ordered lists
101        let mut line_to_list = LineToListMap::new();
102        let mut list_start_values = ListStartValues::new();
103        let mut list_stack: Vec<(usize, bool, u64)> = Vec::new(); // (list_id, is_ordered, start_value)
104        let mut next_list_id: usize = 0;
105
106        // Pre-compute line start offsets for byte-to-line conversion
107        let line_starts: Vec<usize> = std::iter::once(0)
108            .chain(content.match_indices('\n').map(|(i, _)| i + 1))
109            .collect();
110
111        let byte_to_line = |byte_offset: usize| -> usize { line_starts.partition_point(|&start| start <= byte_offset) };
112
113        let options = rumdl_parser_options();
114        let parser = Parser::new_ext(content, options).into_offset_iter();
115
116        for (event, range) in parser {
117            match event {
118                Event::Start(Tag::CodeBlock(kind)) => {
119                    let (is_fenced, info_string) = match &kind {
120                        CodeBlockKind::Fenced(info) => (true, info.to_string()),
121                        CodeBlockKind::Indented => (false, String::new()),
122                    };
123                    code_block_start = Some((range.start, is_fenced, info_string));
124                }
125                Event::End(TagEnd::CodeBlock) => {
126                    if let Some((start, is_fenced, info_string)) = code_block_start.take() {
127                        blocks.push((start, range.end));
128                        details.push(CodeBlockDetail {
129                            start,
130                            end: range.end,
131                            is_fenced,
132                            info_string,
133                        });
134                    }
135                }
136                Event::Start(Tag::Strong) => {
137                    if range.start + 2 <= content.len() {
138                        let is_asterisk = &content[range.start..range.start + 2] == "**";
139                        strong_spans.push(StrongSpanDetail {
140                            start: range.start,
141                            end: range.end,
142                            is_asterisk,
143                        });
144                    }
145                }
146                Event::Start(Tag::List(start_num)) => {
147                    let is_ordered = start_num.is_some();
148                    let start_value = start_num.unwrap_or(1);
149                    list_stack.push((next_list_id, is_ordered, start_value));
150                    if is_ordered {
151                        list_start_values.insert(next_list_id, start_value);
152                    }
153                    next_list_id += 1;
154                }
155                Event::End(TagEnd::List(_)) => {
156                    list_stack.pop();
157                }
158                Event::Start(Tag::Item) => {
159                    if let Some(&(list_id, is_ordered, _)) = list_stack.last()
160                        && is_ordered
161                    {
162                        let line_num = byte_to_line(range.start);
163                        line_to_list.insert(line_num, list_id);
164                    }
165                }
166                Event::Code(_) => {
167                    spans.push((range.start, range.end));
168                }
169                _ => {}
170            }
171        }
172
173        // Handle edge case: unclosed code block at end of content
174        // pulldown-cmark should handle this, but be defensive
175        if let Some((start, is_fenced, info_string)) = code_block_start {
176            blocks.push((start, content.len()));
177            details.push(CodeBlockDetail {
178                start,
179                end: content.len(),
180                is_fenced,
181                info_string,
182            });
183        }
184
185        // Sort by start position (should already be sorted, but ensure consistency)
186        blocks.sort_by_key(|&(start, _)| start);
187        spans.sort_by_key(|&(start, _)| start);
188        details.sort_by_key(|d| d.start);
189        strong_spans.sort_by_key(|s| s.start);
190        ParseResult {
191            code_blocks: blocks,
192            code_spans: spans,
193            code_block_details: details,
194            strong_spans,
195            line_to_list,
196            list_start_values,
197        }
198    }
199
200    /// Check if a position is within a code block (for compatibility)
201    pub fn is_in_code_block_or_span(blocks: &[(usize, usize)], pos: usize) -> bool {
202        Self::is_in_code_block(blocks, pos)
203    }
204
205    /// Check if a byte position falls within any of the given sorted, non-overlapping ranges.
206    ///
207    /// Uses binary search on the sorted block ranges for O(log n) lookup.
208    /// The blocks slice must be sorted by start position (as returned by
209    /// `detect_code_blocks` and `detect_code_blocks_and_spans`).
210    pub fn is_in_code_block(blocks: &[(usize, usize)], pos: usize) -> bool {
211        // Binary search: find the last block whose start <= pos
212        let idx = blocks.partition_point(|&(start, _)| start <= pos);
213        // partition_point returns the first index where start > pos,
214        // so the candidate is at idx - 1
215        idx > 0 && pos < blocks[idx - 1].1
216    }
217
218    /// Analyze code block context relative to list parsing
219    /// This is the core function implementing Design #3's three-tier classification
220    pub fn analyze_code_block_context(
221        lines: &[crate::lint_context::LineInfo],
222        line_idx: usize,
223        min_continuation_indent: usize,
224    ) -> CodeBlockContext {
225        if let Some(line_info) = lines.get(line_idx) {
226            // Rule 1: Indentation Analysis - Is it sufficiently indented for list continuation?
227            if line_info.indent >= min_continuation_indent {
228                return CodeBlockContext::Indented;
229            }
230
231            // Rule 2: Blank Line Context - Check for structural separation indicators
232            let (prev_blanks, next_blanks) = Self::count_surrounding_blank_lines(lines, line_idx);
233
234            // Rule 3: Standalone Detection - Insufficient indentation + blank line separation
235            // This is the key fix: root-level code blocks with blank lines separate lists
236            if prev_blanks > 0 || next_blanks > 0 {
237                return CodeBlockContext::Standalone;
238            }
239
240            // Rule 4: Default - Adjacent (conservative, non-breaking for edge cases)
241            CodeBlockContext::Adjacent
242        } else {
243            // Fallback for invalid line index
244            CodeBlockContext::Adjacent
245        }
246    }
247
248    /// Count blank lines before and after the given line index
249    fn count_surrounding_blank_lines(lines: &[crate::lint_context::LineInfo], line_idx: usize) -> (usize, usize) {
250        let mut prev_blanks = 0;
251        let mut next_blanks = 0;
252
253        // Count blank lines before (look backwards)
254        for i in (0..line_idx).rev() {
255            if let Some(line) = lines.get(i) {
256                if line.is_blank {
257                    prev_blanks += 1;
258                } else {
259                    break;
260                }
261            } else {
262                break;
263            }
264        }
265
266        // Count blank lines after (look forwards)
267        for i in (line_idx + 1)..lines.len() {
268            if let Some(line) = lines.get(i) {
269                if line.is_blank {
270                    next_blanks += 1;
271                } else {
272                    break;
273                }
274            } else {
275                break;
276            }
277        }
278
279        (prev_blanks, next_blanks)
280    }
281
282    /// Calculate minimum indentation required for code block to continue a list
283    /// Based on the most recent list item's marker width
284    pub fn calculate_min_continuation_indent(
285        content: &str,
286        lines: &[crate::lint_context::LineInfo],
287        current_line_idx: usize,
288    ) -> usize {
289        // Look backwards to find the most recent list item
290        for i in (0..current_line_idx).rev() {
291            if let Some(line_info) = lines.get(i) {
292                if let Some(list_item) = &line_info.list_item {
293                    // Calculate minimum continuation indent for this list item
294                    return if list_item.is_ordered {
295                        list_item.marker_column + list_item.marker.len() + 1 // +1 for space after marker
296                    } else {
297                        list_item.marker_column + 2 // Unordered lists need marker + space (min 2)
298                    };
299                }
300
301                // Stop at structural separators that would break list context
302                if line_info.heading.is_some() || Self::is_structural_separator(line_info.content(content)) {
303                    break;
304                }
305            }
306        }
307
308        0 // No list context found
309    }
310
311    /// Check if content is a structural separator (headings, horizontal rules, etc.)
312    fn is_structural_separator(content: &str) -> bool {
313        let trimmed = content.trim();
314        trimmed.starts_with("---")
315            || trimmed.starts_with("***")
316            || trimmed.starts_with("___")
317            || crate::utils::skip_context::is_table_line(trimmed)
318            || trimmed.starts_with(">") // Blockquotes
319    }
320
321    /// Detect fenced code blocks with markdown/md language tag.
322    ///
323    /// Returns a vector of `MarkdownCodeBlock` containing byte ranges for the
324    /// content between the fences (excluding the fence lines themselves).
325    ///
326    /// Only detects fenced code blocks (``` or ~~~), not indented code blocks,
327    /// since indented blocks don't have a language tag.
328    pub fn detect_markdown_code_blocks(content: &str) -> Vec<MarkdownCodeBlock> {
329        use pulldown_cmark::{CodeBlockKind, Event, Parser, Tag, TagEnd};
330
331        let mut blocks = Vec::new();
332        let mut current_block: Option<MarkdownCodeBlockBuilder> = None;
333
334        let options = rumdl_parser_options();
335        let parser = Parser::new_ext(content, options).into_offset_iter();
336
337        for (event, range) in parser {
338            match event {
339                Event::Start(Tag::CodeBlock(CodeBlockKind::Fenced(info))) => {
340                    // Check if language is markdown or md (first word of info string)
341                    let language = info.split_whitespace().next().unwrap_or("");
342                    if language.eq_ignore_ascii_case("markdown") || language.eq_ignore_ascii_case("md") {
343                        // Find where content starts (after the opening fence line)
344                        let block_start = range.start;
345                        let content_start = content[block_start..]
346                            .find('\n')
347                            .map(|i| block_start + i + 1)
348                            .unwrap_or(content.len());
349
350                        current_block = Some(MarkdownCodeBlockBuilder { content_start });
351                    }
352                }
353                Event::End(TagEnd::CodeBlock) => {
354                    if let Some(builder) = current_block.take() {
355                        // Find where content ends (before the closing fence line)
356                        let block_end = range.end;
357
358                        // Validate range before slicing
359                        if builder.content_start > block_end || builder.content_start > content.len() {
360                            continue;
361                        }
362
363                        let search_range = &content[builder.content_start..block_end.min(content.len())];
364                        let content_end = search_range
365                            .rfind('\n')
366                            .map(|i| builder.content_start + i)
367                            .unwrap_or(builder.content_start);
368
369                        // Only add block if it has valid content range
370                        if content_end >= builder.content_start {
371                            blocks.push(MarkdownCodeBlock {
372                                content_start: builder.content_start,
373                                content_end,
374                            });
375                        }
376                    }
377                }
378                _ => {}
379            }
380        }
381
382        blocks
383    }
384}
385
/// Content range of a markdown-tagged fenced code block, used for recursive formatting.
#[derive(Debug, Clone)]
pub struct MarkdownCodeBlock {
    /// Byte offset of the first content byte (just after the opening fence line)
    pub content_start: usize,
    /// Byte offset where the content stops (before the closing fence line)
    pub content_end: usize,
}
394
/// Parse-time scratch state for a `MarkdownCodeBlock`: holds the content start offset
/// until the end of the block is seen.
struct MarkdownCodeBlockBuilder {
    content_start: usize,
}
399
400#[cfg(test)]
401mod tests {
402    use super::*;
403
404    #[test]
405    fn test_detect_fenced_code_blocks() {
406        // The function detects fenced blocks and inline code spans
407        // Fence markers (``` at line start) are now skipped in inline span detection
408
409        // Basic fenced code block with backticks
410        let content = "Some text\n```\ncode here\n```\nMore text";
411        let blocks = CodeBlockUtils::detect_code_blocks(content);
412        // Should find: 1 fenced block (fences are no longer detected as inline spans)
413        assert_eq!(blocks.len(), 1);
414
415        // Check that we have the fenced block
416        let fenced_block = blocks
417            .iter()
418            .find(|(start, end)| end - start > 10 && content[*start..*end].contains("code here"));
419        assert!(fenced_block.is_some());
420
421        // Fenced code block with tildes (no inline code detection for ~)
422        let content = "Some text\n~~~\ncode here\n~~~\nMore text";
423        let blocks = CodeBlockUtils::detect_code_blocks(content);
424        assert_eq!(blocks.len(), 1);
425        assert_eq!(&content[blocks[0].0..blocks[0].1], "~~~\ncode here\n~~~");
426
427        // Multiple code blocks
428        let content = "Text\n```\ncode1\n```\nMiddle\n~~~\ncode2\n~~~\nEnd";
429        let blocks = CodeBlockUtils::detect_code_blocks(content);
430        // 2 fenced blocks (fence markers no longer detected as inline spans)
431        assert_eq!(blocks.len(), 2);
432    }
433
434    #[test]
435    fn test_detect_code_blocks_with_language() {
436        // Code block with language identifier
437        let content = "Text\n```rust\nfn main() {}\n```\nMore";
438        let blocks = CodeBlockUtils::detect_code_blocks(content);
439        // 1 fenced block (fence markers no longer detected as inline spans)
440        assert_eq!(blocks.len(), 1);
441        // Check we have the full fenced block
442        let fenced = blocks.iter().find(|(s, e)| content[*s..*e].contains("fn main"));
443        assert!(fenced.is_some());
444    }
445
446    #[test]
447    fn test_unclosed_code_block() {
448        // Unclosed code block should extend to end of content
449        let content = "Text\n```\ncode here\nno closing fence";
450        let blocks = CodeBlockUtils::detect_code_blocks(content);
451        assert_eq!(blocks.len(), 1);
452        assert_eq!(blocks[0].1, content.len());
453    }
454
455    #[test]
456    fn test_indented_code_blocks() {
457        // Basic indented code block
458        let content = "Paragraph\n\n    code line 1\n    code line 2\n\nMore text";
459        let blocks = CodeBlockUtils::detect_code_blocks(content);
460        assert_eq!(blocks.len(), 1);
461        assert!(content[blocks[0].0..blocks[0].1].contains("code line 1"));
462        assert!(content[blocks[0].0..blocks[0].1].contains("code line 2"));
463
464        // Indented code with tabs
465        let content = "Paragraph\n\n\tcode with tab\n\tanother line\n\nText";
466        let blocks = CodeBlockUtils::detect_code_blocks(content);
467        assert_eq!(blocks.len(), 1);
468    }
469
470    #[test]
471    fn test_indented_code_requires_blank_line() {
472        // Indented lines without preceding blank line are not code blocks
473        let content = "Paragraph\n    indented but not code\nMore text";
474        let blocks = CodeBlockUtils::detect_code_blocks(content);
475        assert_eq!(blocks.len(), 0);
476
477        // With blank line, it becomes a code block
478        let content = "Paragraph\n\n    now it's code\nMore text";
479        let blocks = CodeBlockUtils::detect_code_blocks(content);
480        assert_eq!(blocks.len(), 1);
481    }
482
483    #[test]
484    fn test_indented_content_with_list_markers_is_code_block() {
485        // Per CommonMark spec: 4-space indented content after blank line IS a code block,
486        // even if the content looks like list markers. The indentation takes precedence.
487        // Verified with: echo 'List:\n\n    - Item 1' | npx commonmark
488        // Output: <pre><code>- Item 1</code></pre>
489        let content = "List:\n\n    - Item 1\n    - Item 2\n    * Item 3\n    + Item 4";
490        let blocks = CodeBlockUtils::detect_code_blocks(content);
491        assert_eq!(blocks.len(), 1); // This IS a code block per spec
492
493        // Same for numbered list markers
494        let content = "List:\n\n    1. First\n    2. Second";
495        let blocks = CodeBlockUtils::detect_code_blocks(content);
496        assert_eq!(blocks.len(), 1); // This IS a code block per spec
497    }
498
499    #[test]
500    fn test_actual_list_items_not_code_blocks() {
501        // Actual list items (no preceding blank line + 4 spaces) are NOT code blocks
502        let content = "- Item 1\n- Item 2\n* Item 3";
503        let blocks = CodeBlockUtils::detect_code_blocks(content);
504        assert_eq!(blocks.len(), 0);
505
506        // Nested list items
507        let content = "- Item 1\n  - Nested item\n- Item 2";
508        let blocks = CodeBlockUtils::detect_code_blocks(content);
509        assert_eq!(blocks.len(), 0);
510    }
511
512    #[test]
513    fn test_inline_code_spans_not_detected() {
514        // Inline code spans should NOT be detected as code blocks
515        let content = "Text with `inline code` here";
516        let blocks = CodeBlockUtils::detect_code_blocks(content);
517        assert_eq!(blocks.len(), 0); // No blocks, only inline spans
518
519        // Multiple backtick code span
520        let content = "Text with ``code with ` backtick`` here";
521        let blocks = CodeBlockUtils::detect_code_blocks(content);
522        assert_eq!(blocks.len(), 0); // No blocks, only inline spans
523
524        // Multiple code spans
525        let content = "Has `code1` and `code2` spans";
526        let blocks = CodeBlockUtils::detect_code_blocks(content);
527        assert_eq!(blocks.len(), 0); // No blocks, only inline spans
528    }
529
530    #[test]
531    fn test_unclosed_code_span() {
532        // Unclosed code span should not be detected
533        let content = "Text with `unclosed code span";
534        let blocks = CodeBlockUtils::detect_code_blocks(content);
535        assert_eq!(blocks.len(), 0);
536
537        // Mismatched backticks
538        let content = "Text with ``one style` different close";
539        let blocks = CodeBlockUtils::detect_code_blocks(content);
540        assert_eq!(blocks.len(), 0);
541    }
542
543    #[test]
544    fn test_mixed_code_blocks_and_spans() {
545        let content = "Has `span1` text\n```\nblock\n```\nand `span2`";
546        let blocks = CodeBlockUtils::detect_code_blocks(content);
547        // Should only detect the fenced block, NOT the inline spans
548        assert_eq!(blocks.len(), 1);
549
550        // Check we have the fenced block only
551        assert!(blocks.iter().any(|(s, e)| content[*s..*e].contains("block")));
552        // Should NOT detect inline spans
553        assert!(!blocks.iter().any(|(s, e)| &content[*s..*e] == "`span1`"));
554        assert!(!blocks.iter().any(|(s, e)| &content[*s..*e] == "`span2`"));
555    }
556
557    #[test]
558    fn test_is_in_code_block_or_span() {
559        let blocks = vec![(10, 20), (30, 40), (50, 60)];
560
561        // Test positions inside blocks
562        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 15));
563        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 35));
564        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 55));
565
566        // Test positions at boundaries
567        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 10)); // Start is inclusive
568        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 20)); // End is exclusive
569
570        // Test positions outside blocks
571        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 5));
572        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 25));
573        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 65));
574    }
575
576    #[test]
577    fn test_empty_content() {
578        let blocks = CodeBlockUtils::detect_code_blocks("");
579        assert_eq!(blocks.len(), 0);
580    }
581
582    #[test]
583    fn test_code_block_at_start() {
584        let content = "```\ncode\n```\nText after";
585        let blocks = CodeBlockUtils::detect_code_blocks(content);
586        // 1 fenced block (fence markers no longer detected as inline spans)
587        assert_eq!(blocks.len(), 1);
588        assert_eq!(blocks[0].0, 0); // Fenced block starts at 0
589    }
590
591    #[test]
592    fn test_code_block_at_end() {
593        let content = "Text before\n```\ncode\n```";
594        let blocks = CodeBlockUtils::detect_code_blocks(content);
595        // 1 fenced block (fence markers no longer detected as inline spans)
596        assert_eq!(blocks.len(), 1);
597        // Check we have the fenced block
598        let fenced = blocks.iter().find(|(s, e)| content[*s..*e].contains("code"));
599        assert!(fenced.is_some());
600    }
601
602    #[test]
603    fn test_nested_fence_markers() {
604        // Code block containing fence markers as content
605        let content = "Text\n````\n```\nnested\n```\n````\nAfter";
606        let blocks = CodeBlockUtils::detect_code_blocks(content);
607        // Should detect: outer block, inner ```, outer ````
608        assert!(!blocks.is_empty());
609        // Check we have the outer block
610        let outer = blocks.iter().find(|(s, e)| content[*s..*e].contains("nested"));
611        assert!(outer.is_some());
612    }
613
614    #[test]
615    fn test_indented_code_with_blank_lines() {
616        // Indented code blocks can contain blank lines
617        let content = "Text\n\n    line1\n\n    line2\n\nAfter";
618        let blocks = CodeBlockUtils::detect_code_blocks(content);
619        // May have multiple blocks due to blank line handling
620        assert!(!blocks.is_empty());
621        // Check that we captured the indented code
622        let all_content: String = blocks
623            .iter()
624            .map(|(s, e)| &content[*s..*e])
625            .collect::<Vec<_>>()
626            .join("");
627        assert!(all_content.contains("line1") || content[blocks[0].0..blocks[0].1].contains("line1"));
628    }
629
630    #[test]
631    fn test_code_span_with_spaces() {
632        // Code spans should NOT be detected as code blocks
633        let content = "Text ` code with spaces ` more";
634        let blocks = CodeBlockUtils::detect_code_blocks(content);
635        assert_eq!(blocks.len(), 0); // No blocks, only inline span
636    }
637
638    #[test]
639    fn test_fenced_block_with_info_string() {
640        // Fenced code blocks with complex info strings
641        let content = "```rust,no_run,should_panic\ncode\n```";
642        let blocks = CodeBlockUtils::detect_code_blocks(content);
643        // 1 fenced block (fence markers no longer detected as inline spans)
644        assert_eq!(blocks.len(), 1);
645        assert_eq!(blocks[0].0, 0);
646    }
647
648    #[test]
649    fn test_indented_fences_not_code_blocks() {
650        // Indented fence markers should still work as fences
651        let content = "Text\n  ```\n  code\n  ```\nAfter";
652        let blocks = CodeBlockUtils::detect_code_blocks(content);
653        // Only 1 fenced block (indented fences still work)
654        assert_eq!(blocks.len(), 1);
655    }
656
657    // Issue #175: Backticks in info string invalidate the fence
658    #[test]
659    fn test_backticks_in_info_string_not_code_block() {
660        // Per CommonMark spec: "If the info string comes after a backtick fence,
661        // it may not contain any backtick characters."
662        // So ```something``` is NOT a valid fence - the backticks are treated as inline code.
663        // Verified with: echo '```something```' | npx commonmark
664        // Output: <p><code>something</code></p>
665        let content = "```something```\n\n```bash\n# comment\n```";
666        let blocks = CodeBlockUtils::detect_code_blocks(content);
667        // Should find only the valid ```bash block, NOT the invalid ```something```
668        assert_eq!(blocks.len(), 1);
669        // The valid block should contain "# comment"
670        assert!(content[blocks[0].0..blocks[0].1].contains("# comment"));
671    }
672
673    #[test]
674    fn test_issue_175_reproduction() {
675        // Full reproduction of issue #175
676        let content = "```something```\n\n```bash\n# Have a parrot\necho \"🦜\"\n```";
677        let blocks = CodeBlockUtils::detect_code_blocks(content);
678        // Only the bash block is a code block
679        assert_eq!(blocks.len(), 1);
680        assert!(content[blocks[0].0..blocks[0].1].contains("Have a parrot"));
681    }
682
683    #[test]
684    fn test_tilde_fence_allows_tildes_in_info_string() {
685        // Tilde fences CAN have tildes in info string (only backtick restriction exists)
686        // ~~~abc~~~ opens an unclosed code block with info string "abc~~~"
687        let content = "~~~abc~~~\ncode content\n~~~";
688        let blocks = CodeBlockUtils::detect_code_blocks(content);
689        // This is a valid tilde fence that opens and closes
690        assert_eq!(blocks.len(), 1);
691    }
692
693    #[test]
694    fn test_nested_longer_fence_contains_shorter() {
695        // Longer fence (````) can contain shorter fence (```) as content
696        let content = "````\n```\nnested content\n```\n````";
697        let blocks = CodeBlockUtils::detect_code_blocks(content);
698        assert_eq!(blocks.len(), 1);
699        assert!(content[blocks[0].0..blocks[0].1].contains("nested content"));
700    }
701
702    #[test]
703    fn test_mixed_fence_types() {
704        // Tilde fence contains backtick markers as content
705        let content = "~~~\n```\nmixed content\n~~~";
706        let blocks = CodeBlockUtils::detect_code_blocks(content);
707        assert_eq!(blocks.len(), 1);
708        assert!(content[blocks[0].0..blocks[0].1].contains("mixed content"));
709    }
710
711    #[test]
712    fn test_indented_code_in_list_issue_276() {
713        // Issue #276: Indented code block inside a list should be detected by pulldown-cmark
714        let content = r#"1. First item
7152. Second item with code:
716
717        # This is a code block in a list
718        print("Hello, world!")
719
7204. Third item"#;
721
722        let blocks = CodeBlockUtils::detect_code_blocks(content);
723        // pulldown-cmark SHOULD detect this indented code block inside the list
724        assert!(!blocks.is_empty(), "Should detect indented code block inside list");
725
726        // Verify the detected block contains our code
727        let all_content: String = blocks
728            .iter()
729            .map(|(s, e)| &content[*s..*e])
730            .collect::<Vec<_>>()
731            .join("");
732        assert!(
733            all_content.contains("code block in a list") || all_content.contains("print"),
734            "Detected block should contain the code content: {all_content:?}"
735        );
736    }
737
738    #[test]
739    fn test_detect_markdown_code_blocks() {
740        let content = r#"# Example
741
742```markdown
743# Heading
744Content here
745```
746
747```md
748Another heading
749More content
750```
751
752```rust
753// Not markdown
754fn main() {}
755```
756"#;
757
758        let blocks = CodeBlockUtils::detect_markdown_code_blocks(content);
759
760        // Should detect 2 blocks (markdown and md, not rust)
761        assert_eq!(
762            blocks.len(),
763            2,
764            "Should detect exactly 2 markdown blocks, got {blocks:?}"
765        );
766
767        // First block should be the ```markdown block
768        let first = &blocks[0];
769        let first_content = &content[first.content_start..first.content_end];
770        assert!(
771            first_content.contains("# Heading"),
772            "First block should contain '# Heading', got: {first_content:?}"
773        );
774
775        // Second block should be the ```md block
776        let second = &blocks[1];
777        let second_content = &content[second.content_start..second.content_end];
778        assert!(
779            second_content.contains("Another heading"),
780            "Second block should contain 'Another heading', got: {second_content:?}"
781        );
782    }
783
784    #[test]
785    fn test_detect_markdown_code_blocks_empty() {
786        let content = "# Just a heading\n\nNo code blocks here\n";
787        let blocks = CodeBlockUtils::detect_markdown_code_blocks(content);
788        assert_eq!(blocks.len(), 0);
789    }
790
791    #[test]
792    fn test_detect_markdown_code_blocks_case_insensitive() {
793        let content = "```MARKDOWN\nContent\n```\n";
794        let blocks = CodeBlockUtils::detect_markdown_code_blocks(content);
795        assert_eq!(blocks.len(), 1);
796    }
797
798    #[test]
799    fn test_detect_markdown_code_blocks_at_eof_no_trailing_newline() {
800        // Block at end of file without trailing newline after closing fence
801        let content = "# Doc\n\n```markdown\nContent\n```";
802        let blocks = CodeBlockUtils::detect_markdown_code_blocks(content);
803        assert_eq!(blocks.len(), 1);
804        // Content should be extractable without panic
805        let block_content = &content[blocks[0].content_start..blocks[0].content_end];
806        assert!(block_content.contains("Content"));
807    }
808
809    #[test]
810    fn test_detect_markdown_code_blocks_single_line_content() {
811        // Single line of content, no extra newlines
812        let content = "```markdown\nX\n```\n";
813        let blocks = CodeBlockUtils::detect_markdown_code_blocks(content);
814        assert_eq!(blocks.len(), 1);
815        let block_content = &content[blocks[0].content_start..blocks[0].content_end];
816        assert_eq!(block_content, "X");
817    }
818
819    #[test]
820    fn test_detect_markdown_code_blocks_empty_content() {
821        // Block with no content between fences
822        let content = "```markdown\n```\n";
823        let blocks = CodeBlockUtils::detect_markdown_code_blocks(content);
824        // Should detect block but with empty range or not at all
825        // Either behavior is acceptable as long as no panic
826        if !blocks.is_empty() {
827            // If detected, content range should be valid
828            assert!(blocks[0].content_start <= blocks[0].content_end);
829        }
830    }
831
832    #[test]
833    fn test_detect_markdown_code_blocks_validates_ranges() {
834        // Ensure no panic on various edge cases
835        let test_cases = [
836            "",                             // Empty content
837            "```markdown",                  // Unclosed block
838            "```markdown\n",                // Unclosed block with newline
839            "```\n```",                     // Non-markdown block
840            "```markdown\n```",             // Empty markdown block
841            "   ```markdown\n   X\n   ```", // Indented block
842        ];
843
844        for content in test_cases {
845            // Should not panic
846            let blocks = CodeBlockUtils::detect_markdown_code_blocks(content);
847            // All detected blocks should have valid ranges
848            for block in &blocks {
849                assert!(
850                    block.content_start <= block.content_end,
851                    "Invalid range in content: {content:?}"
852                );
853                assert!(
854                    block.content_end <= content.len(),
855                    "Range exceeds content length in: {content:?}"
856                );
857            }
858        }
859    }
860
861    // ── is_in_code_block binary search tests ─────────────────────────────
862
863    #[test]
864    fn test_is_in_code_block_empty_blocks() {
865        assert!(!CodeBlockUtils::is_in_code_block(&[], 0));
866        assert!(!CodeBlockUtils::is_in_code_block(&[], 100));
867        assert!(!CodeBlockUtils::is_in_code_block(&[], usize::MAX));
868    }
869
870    #[test]
871    fn test_is_in_code_block_single_range() {
872        let blocks = [(10, 20)];
873        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 0));
874        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 9));
875        assert!(CodeBlockUtils::is_in_code_block(&blocks, 10));
876        assert!(CodeBlockUtils::is_in_code_block(&blocks, 15));
877        assert!(CodeBlockUtils::is_in_code_block(&blocks, 19));
878        // end is exclusive
879        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 20));
880        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 21));
881    }
882
883    #[test]
884    fn test_is_in_code_block_multiple_ranges() {
885        let blocks = [(5, 10), (20, 30), (50, 60)];
886        // Before all
887        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 0));
888        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 4));
889        // In first
890        assert!(CodeBlockUtils::is_in_code_block(&blocks, 5));
891        assert!(CodeBlockUtils::is_in_code_block(&blocks, 9));
892        // Gap between first and second
893        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 10));
894        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 15));
895        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 19));
896        // In second
897        assert!(CodeBlockUtils::is_in_code_block(&blocks, 20));
898        assert!(CodeBlockUtils::is_in_code_block(&blocks, 29));
899        // Gap between second and third
900        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 30));
901        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 49));
902        // In third
903        assert!(CodeBlockUtils::is_in_code_block(&blocks, 50));
904        assert!(CodeBlockUtils::is_in_code_block(&blocks, 59));
905        // After all
906        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 60));
907        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 1000));
908    }
909
910    #[test]
911    fn test_is_in_code_block_adjacent_ranges() {
912        // Ranges that are exactly adjacent (end of one == start of next)
913        let blocks = [(0, 10), (10, 20), (20, 30)];
914        assert!(CodeBlockUtils::is_in_code_block(&blocks, 0));
915        assert!(CodeBlockUtils::is_in_code_block(&blocks, 9));
916        assert!(CodeBlockUtils::is_in_code_block(&blocks, 10));
917        assert!(CodeBlockUtils::is_in_code_block(&blocks, 19));
918        assert!(CodeBlockUtils::is_in_code_block(&blocks, 20));
919        assert!(CodeBlockUtils::is_in_code_block(&blocks, 29));
920        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 30));
921    }
922
923    #[test]
924    fn test_is_in_code_block_single_byte_range() {
925        let blocks = [(5, 6)];
926        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 4));
927        assert!(CodeBlockUtils::is_in_code_block(&blocks, 5));
928        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 6));
929    }
930
931    #[test]
932    fn test_is_in_code_block_matches_linear_scan() {
933        // Verify binary search produces identical results to linear scan
934        // for a realistic document layout
935        let content = "# Heading\n\n```rust\nlet x = 1;\nlet y = 2;\n```\n\nSome text\n\n```\nmore code\n```\n\nEnd\n";
936        let blocks = CodeBlockUtils::detect_code_blocks(content);
937
938        for pos in 0..content.len() {
939            let binary = CodeBlockUtils::is_in_code_block(&blocks, pos);
940            let linear = blocks.iter().any(|&(s, e)| pos >= s && pos < e);
941            assert_eq!(
942                binary, linear,
943                "Mismatch at pos {pos}: binary={binary}, linear={linear}, blocks={blocks:?}"
944            );
945        }
946    }
947
948    #[test]
949    fn test_is_in_code_block_at_range_boundaries() {
950        // Exhaustive boundary testing for every block start/end
951        let blocks = [(100, 200), (300, 400), (500, 600)];
952        for &(start, end) in &blocks {
953            assert!(
954                !CodeBlockUtils::is_in_code_block(&blocks, start - 1),
955                "pos={} should be outside",
956                start - 1
957            );
958            assert!(
959                CodeBlockUtils::is_in_code_block(&blocks, start),
960                "pos={start} should be inside"
961            );
962            assert!(
963                CodeBlockUtils::is_in_code_block(&blocks, end - 1),
964                "pos={} should be inside",
965                end - 1
966            );
967            assert!(
968                !CodeBlockUtils::is_in_code_block(&blocks, end),
969                "pos={end} should be outside"
970            );
971        }
972    }
973}