Skip to main content

rumdl_lib/utils/
code_block_utils.rs

1//!
2//! Utility functions for detecting and handling code blocks and code spans in Markdown for rumdl.
3//!
4//! Code block detection is delegated to pulldown-cmark, which correctly implements the
5//! CommonMark specification. This handles edge cases like:
6//! - Backtick fences with backticks in the info string (invalid per spec)
7//! - Nested fences (longer fence contains shorter fence as content)
8//! - Mixed fence types (tilde fence contains backticks as content)
9//! - Indented code blocks with proper list context handling
10
11use pulldown_cmark::{CodeBlockKind, Event, Options, Parser, Tag, TagEnd};
12
/// Pair of byte-range lists: `.0` holds code block ranges, `.1` holds inline code span ranges.
pub type CodeRanges = (Vec<(usize, usize)>, Vec<(usize, usize)>);
15
/// Detailed information about a single code block captured during parsing.
///
/// Implements `PartialEq`/`Eq` so that whole details can be compared directly,
/// for example in test assertions.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CodeBlockDetail {
    /// Byte offset where this code block starts
    pub start: usize,
    /// Byte offset where this code block ends (taken from the end of the parser's range)
    pub end: usize,
    /// `true` for a fenced code block, `false` for an indented one
    pub is_fenced: bool,
    /// The info string of a fenced block (e.g., "rust" from ```rust); empty for indented blocks
    pub info_string: String,
}
28
/// A strong emphasis span captured during parsing.
///
/// Implements `PartialEq`/`Eq` so that spans can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct StrongSpanDetail {
    /// Byte offset where the strong span starts (including the opening marker)
    pub start: usize,
    /// Byte offset where the strong span ends (including the closing marker)
    pub end: usize,
    /// `true` when the span opens with `**`; `false` otherwise (i.e. `__`)
    pub is_asterisk: bool,
}
39
/// Lookup from a 1-indexed line number to the ID of the ordered list that line belongs to.
pub type LineToListMap = std::collections::HashMap<usize, usize>;
/// Lookup from an ordered list's ID to the number that list starts at.
pub type ListStartValues = std::collections::HashMap<usize, u64>;
44
45/// Result of the central pulldown-cmark parse, capturing all data needed by individual rules
46pub struct ParseResult {
47    /// Code block byte ranges (start, end)
48    pub code_blocks: Vec<(usize, usize)>,
49    /// Inline code span byte ranges (start, end)
50    pub code_spans: Vec<(usize, usize)>,
51    /// Detailed code block info (fenced vs indented, info string)
52    pub code_block_details: Vec<CodeBlockDetail>,
53    /// Strong emphasis span details
54    pub strong_spans: Vec<StrongSpanDetail>,
55    /// Ordered list membership: maps line number (1-indexed) to list ID
56    pub line_to_list: LineToListMap,
57    /// Ordered list start values: maps list ID to start value
58    pub list_start_values: ListStartValues,
59}
60
/// How a code block relates to the list content around it.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum CodeBlockContext {
    /// Not indented enough to continue a list and bordered by at least one blank line;
    /// such a block separates lists.
    Standalone,
    /// Indented far enough to count as a continuation of a list item.
    Indented,
    /// Neither of the above: sits directly next to other content. Used as the
    /// conservative, non-breaking default.
    Adjacent,
}
71
/// Namespace type whose associated functions detect and classify Markdown code blocks.
pub struct CodeBlockUtils;
74
75impl CodeBlockUtils {
76    /// Detect all code blocks in the content (NOT including inline code spans)
77    ///
78    /// Uses pulldown-cmark for spec-compliant CommonMark parsing. This correctly handles:
79    /// - Fenced code blocks (``` and ~~~)
80    /// - Indented code blocks (4 spaces or tab)
81    /// - Code blocks inside lists, blockquotes, and other containers
82    /// - Edge cases like backticks in info strings (which invalidate the fence)
83    ///
84    /// Returns a sorted vector of (start, end) byte offset tuples.
85    pub fn detect_code_blocks(content: &str) -> Vec<(usize, usize)> {
86        Self::detect_code_blocks_and_spans(content).code_blocks
87    }
88
89    /// Returns code block ranges, inline code span ranges, and detailed code block info
90    /// in a single pulldown-cmark pass.
91    pub fn detect_code_blocks_and_spans(content: &str) -> ParseResult {
92        let mut blocks = Vec::new();
93        let mut spans = Vec::new();
94        let mut details = Vec::new();
95        let mut strong_spans = Vec::new();
96        let mut code_block_start: Option<(usize, bool, String)> = None;
97
98        // List membership tracking for ordered lists
99        let mut line_to_list = LineToListMap::new();
100        let mut list_start_values = ListStartValues::new();
101        let mut list_stack: Vec<(usize, bool, u64)> = Vec::new(); // (list_id, is_ordered, start_value)
102        let mut next_list_id: usize = 0;
103
104        // Pre-compute line start offsets for byte-to-line conversion
105        let line_starts: Vec<usize> = std::iter::once(0)
106            .chain(content.match_indices('\n').map(|(i, _)| i + 1))
107            .collect();
108
109        let byte_to_line = |byte_offset: usize| -> usize { line_starts.partition_point(|&start| start <= byte_offset) };
110
111        // Use pulldown-cmark with all extensions for maximum compatibility
112        let options = Options::all();
113        let parser = Parser::new_ext(content, options).into_offset_iter();
114
115        for (event, range) in parser {
116            match event {
117                Event::Start(Tag::CodeBlock(kind)) => {
118                    let (is_fenced, info_string) = match &kind {
119                        CodeBlockKind::Fenced(info) => (true, info.to_string()),
120                        CodeBlockKind::Indented => (false, String::new()),
121                    };
122                    code_block_start = Some((range.start, is_fenced, info_string));
123                }
124                Event::End(TagEnd::CodeBlock) => {
125                    if let Some((start, is_fenced, info_string)) = code_block_start.take() {
126                        blocks.push((start, range.end));
127                        details.push(CodeBlockDetail {
128                            start,
129                            end: range.end,
130                            is_fenced,
131                            info_string,
132                        });
133                    }
134                }
135                Event::Start(Tag::Strong) => {
136                    if range.start + 2 <= content.len() {
137                        let is_asterisk = &content[range.start..range.start + 2] == "**";
138                        strong_spans.push(StrongSpanDetail {
139                            start: range.start,
140                            end: range.end,
141                            is_asterisk,
142                        });
143                    }
144                }
145                Event::Start(Tag::List(start_num)) => {
146                    let is_ordered = start_num.is_some();
147                    let start_value = start_num.unwrap_or(1);
148                    list_stack.push((next_list_id, is_ordered, start_value));
149                    if is_ordered {
150                        list_start_values.insert(next_list_id, start_value);
151                    }
152                    next_list_id += 1;
153                }
154                Event::End(TagEnd::List(_)) => {
155                    list_stack.pop();
156                }
157                Event::Start(Tag::Item) => {
158                    if let Some(&(list_id, is_ordered, _)) = list_stack.last()
159                        && is_ordered
160                    {
161                        let line_num = byte_to_line(range.start);
162                        line_to_list.insert(line_num, list_id);
163                    }
164                }
165                Event::Code(_) => {
166                    spans.push((range.start, range.end));
167                }
168                _ => {}
169            }
170        }
171
172        // Handle edge case: unclosed code block at end of content
173        // pulldown-cmark should handle this, but be defensive
174        if let Some((start, is_fenced, info_string)) = code_block_start {
175            blocks.push((start, content.len()));
176            details.push(CodeBlockDetail {
177                start,
178                end: content.len(),
179                is_fenced,
180                info_string,
181            });
182        }
183
184        // Sort by start position (should already be sorted, but ensure consistency)
185        blocks.sort_by_key(|&(start, _)| start);
186        spans.sort_by_key(|&(start, _)| start);
187        details.sort_by_key(|d| d.start);
188        strong_spans.sort_by_key(|s| s.start);
189        ParseResult {
190            code_blocks: blocks,
191            code_spans: spans,
192            code_block_details: details,
193            strong_spans,
194            line_to_list,
195            list_start_values,
196        }
197    }
198
199    /// Check if a position is within a code block (for compatibility)
200    pub fn is_in_code_block_or_span(blocks: &[(usize, usize)], pos: usize) -> bool {
201        Self::is_in_code_block(blocks, pos)
202    }
203
204    /// Check if a byte position falls within any of the given sorted, non-overlapping ranges.
205    ///
206    /// Uses binary search on the sorted block ranges for O(log n) lookup.
207    /// The blocks slice must be sorted by start position (as returned by
208    /// `detect_code_blocks` and `detect_code_blocks_and_spans`).
209    pub fn is_in_code_block(blocks: &[(usize, usize)], pos: usize) -> bool {
210        // Binary search: find the last block whose start <= pos
211        let idx = blocks.partition_point(|&(start, _)| start <= pos);
212        // partition_point returns the first index where start > pos,
213        // so the candidate is at idx - 1
214        idx > 0 && pos < blocks[idx - 1].1
215    }
216
217    /// Analyze code block context relative to list parsing
218    /// This is the core function implementing Design #3's three-tier classification
219    pub fn analyze_code_block_context(
220        lines: &[crate::lint_context::LineInfo],
221        line_idx: usize,
222        min_continuation_indent: usize,
223    ) -> CodeBlockContext {
224        if let Some(line_info) = lines.get(line_idx) {
225            // Rule 1: Indentation Analysis - Is it sufficiently indented for list continuation?
226            if line_info.indent >= min_continuation_indent {
227                return CodeBlockContext::Indented;
228            }
229
230            // Rule 2: Blank Line Context - Check for structural separation indicators
231            let (prev_blanks, next_blanks) = Self::count_surrounding_blank_lines(lines, line_idx);
232
233            // Rule 3: Standalone Detection - Insufficient indentation + blank line separation
234            // This is the key fix: root-level code blocks with blank lines separate lists
235            if prev_blanks > 0 || next_blanks > 0 {
236                return CodeBlockContext::Standalone;
237            }
238
239            // Rule 4: Default - Adjacent (conservative, non-breaking for edge cases)
240            CodeBlockContext::Adjacent
241        } else {
242            // Fallback for invalid line index
243            CodeBlockContext::Adjacent
244        }
245    }
246
247    /// Count blank lines before and after the given line index
248    fn count_surrounding_blank_lines(lines: &[crate::lint_context::LineInfo], line_idx: usize) -> (usize, usize) {
249        let mut prev_blanks = 0;
250        let mut next_blanks = 0;
251
252        // Count blank lines before (look backwards)
253        for i in (0..line_idx).rev() {
254            if let Some(line) = lines.get(i) {
255                if line.is_blank {
256                    prev_blanks += 1;
257                } else {
258                    break;
259                }
260            } else {
261                break;
262            }
263        }
264
265        // Count blank lines after (look forwards)
266        for i in (line_idx + 1)..lines.len() {
267            if let Some(line) = lines.get(i) {
268                if line.is_blank {
269                    next_blanks += 1;
270                } else {
271                    break;
272                }
273            } else {
274                break;
275            }
276        }
277
278        (prev_blanks, next_blanks)
279    }
280
281    /// Calculate minimum indentation required for code block to continue a list
282    /// Based on the most recent list item's marker width
283    pub fn calculate_min_continuation_indent(
284        content: &str,
285        lines: &[crate::lint_context::LineInfo],
286        current_line_idx: usize,
287    ) -> usize {
288        // Look backwards to find the most recent list item
289        for i in (0..current_line_idx).rev() {
290            if let Some(line_info) = lines.get(i) {
291                if let Some(list_item) = &line_info.list_item {
292                    // Calculate minimum continuation indent for this list item
293                    return if list_item.is_ordered {
294                        list_item.marker_column + list_item.marker.len() + 1 // +1 for space after marker
295                    } else {
296                        list_item.marker_column + 2 // Unordered lists need marker + space (min 2)
297                    };
298                }
299
300                // Stop at structural separators that would break list context
301                if line_info.heading.is_some() || Self::is_structural_separator(line_info.content(content)) {
302                    break;
303                }
304            }
305        }
306
307        0 // No list context found
308    }
309
310    /// Check if content is a structural separator (headings, horizontal rules, etc.)
311    fn is_structural_separator(content: &str) -> bool {
312        let trimmed = content.trim();
313        trimmed.starts_with("---")
314            || trimmed.starts_with("***")
315            || trimmed.starts_with("___")
316            || crate::utils::skip_context::is_table_line(trimmed)
317            || trimmed.starts_with(">") // Blockquotes
318    }
319
320    /// Detect fenced code blocks with markdown/md language tag.
321    ///
322    /// Returns a vector of `MarkdownCodeBlock` containing byte ranges for the
323    /// content between the fences (excluding the fence lines themselves).
324    ///
325    /// Only detects fenced code blocks (``` or ~~~), not indented code blocks,
326    /// since indented blocks don't have a language tag.
327    pub fn detect_markdown_code_blocks(content: &str) -> Vec<MarkdownCodeBlock> {
328        use pulldown_cmark::{CodeBlockKind, Event, Options, Parser, Tag, TagEnd};
329
330        let mut blocks = Vec::new();
331        let mut current_block: Option<MarkdownCodeBlockBuilder> = None;
332
333        let options = Options::all();
334        let parser = Parser::new_ext(content, options).into_offset_iter();
335
336        for (event, range) in parser {
337            match event {
338                Event::Start(Tag::CodeBlock(CodeBlockKind::Fenced(info))) => {
339                    // Check if language is markdown or md (first word of info string)
340                    let language = info.split_whitespace().next().unwrap_or("");
341                    if language.eq_ignore_ascii_case("markdown") || language.eq_ignore_ascii_case("md") {
342                        // Find where content starts (after the opening fence line)
343                        let block_start = range.start;
344                        let content_start = content[block_start..]
345                            .find('\n')
346                            .map(|i| block_start + i + 1)
347                            .unwrap_or(content.len());
348
349                        current_block = Some(MarkdownCodeBlockBuilder { content_start });
350                    }
351                }
352                Event::End(TagEnd::CodeBlock) => {
353                    if let Some(builder) = current_block.take() {
354                        // Find where content ends (before the closing fence line)
355                        let block_end = range.end;
356
357                        // Validate range before slicing
358                        if builder.content_start > block_end || builder.content_start > content.len() {
359                            continue;
360                        }
361
362                        let search_range = &content[builder.content_start..block_end.min(content.len())];
363                        let content_end = search_range
364                            .rfind('\n')
365                            .map(|i| builder.content_start + i)
366                            .unwrap_or(builder.content_start);
367
368                        // Only add block if it has valid content range
369                        if content_end >= builder.content_start {
370                            blocks.push(MarkdownCodeBlock {
371                                content_start: builder.content_start,
372                                content_end,
373                            });
374                        }
375                    }
376                }
377                _ => {}
378            }
379        }
380
381        blocks
382    }
383}
384
/// Location of the content of a markdown code block, used for recursive formatting.
///
/// Implements `PartialEq`/`Eq` so that detected blocks can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MarkdownCodeBlock {
    /// Byte offset where the content starts (after opening fence line)
    pub content_start: usize,
    /// Byte offset where the content ends (before closing fence line)
    pub content_end: usize,
}
393
/// In-progress `MarkdownCodeBlock`: created when a markdown fence opens and
/// completed once the matching end of the block is seen.
struct MarkdownCodeBlockBuilder {
    /// Byte offset of the first content byte after the opening fence line
    content_start: usize,
}
398
399#[cfg(test)]
400mod tests {
401    use super::*;
402
403    #[test]
404    fn test_detect_fenced_code_blocks() {
405        // The function detects fenced blocks and inline code spans
406        // Fence markers (``` at line start) are now skipped in inline span detection
407
408        // Basic fenced code block with backticks
409        let content = "Some text\n```\ncode here\n```\nMore text";
410        let blocks = CodeBlockUtils::detect_code_blocks(content);
411        // Should find: 1 fenced block (fences are no longer detected as inline spans)
412        assert_eq!(blocks.len(), 1);
413
414        // Check that we have the fenced block
415        let fenced_block = blocks
416            .iter()
417            .find(|(start, end)| end - start > 10 && content[*start..*end].contains("code here"));
418        assert!(fenced_block.is_some());
419
420        // Fenced code block with tildes (no inline code detection for ~)
421        let content = "Some text\n~~~\ncode here\n~~~\nMore text";
422        let blocks = CodeBlockUtils::detect_code_blocks(content);
423        assert_eq!(blocks.len(), 1);
424        assert_eq!(&content[blocks[0].0..blocks[0].1], "~~~\ncode here\n~~~");
425
426        // Multiple code blocks
427        let content = "Text\n```\ncode1\n```\nMiddle\n~~~\ncode2\n~~~\nEnd";
428        let blocks = CodeBlockUtils::detect_code_blocks(content);
429        // 2 fenced blocks (fence markers no longer detected as inline spans)
430        assert_eq!(blocks.len(), 2);
431    }
432
433    #[test]
434    fn test_detect_code_blocks_with_language() {
435        // Code block with language identifier
436        let content = "Text\n```rust\nfn main() {}\n```\nMore";
437        let blocks = CodeBlockUtils::detect_code_blocks(content);
438        // 1 fenced block (fence markers no longer detected as inline spans)
439        assert_eq!(blocks.len(), 1);
440        // Check we have the full fenced block
441        let fenced = blocks.iter().find(|(s, e)| content[*s..*e].contains("fn main"));
442        assert!(fenced.is_some());
443    }
444
445    #[test]
446    fn test_unclosed_code_block() {
447        // Unclosed code block should extend to end of content
448        let content = "Text\n```\ncode here\nno closing fence";
449        let blocks = CodeBlockUtils::detect_code_blocks(content);
450        assert_eq!(blocks.len(), 1);
451        assert_eq!(blocks[0].1, content.len());
452    }
453
454    #[test]
455    fn test_indented_code_blocks() {
456        // Basic indented code block
457        let content = "Paragraph\n\n    code line 1\n    code line 2\n\nMore text";
458        let blocks = CodeBlockUtils::detect_code_blocks(content);
459        assert_eq!(blocks.len(), 1);
460        assert!(content[blocks[0].0..blocks[0].1].contains("code line 1"));
461        assert!(content[blocks[0].0..blocks[0].1].contains("code line 2"));
462
463        // Indented code with tabs
464        let content = "Paragraph\n\n\tcode with tab\n\tanother line\n\nText";
465        let blocks = CodeBlockUtils::detect_code_blocks(content);
466        assert_eq!(blocks.len(), 1);
467    }
468
469    #[test]
470    fn test_indented_code_requires_blank_line() {
471        // Indented lines without preceding blank line are not code blocks
472        let content = "Paragraph\n    indented but not code\nMore text";
473        let blocks = CodeBlockUtils::detect_code_blocks(content);
474        assert_eq!(blocks.len(), 0);
475
476        // With blank line, it becomes a code block
477        let content = "Paragraph\n\n    now it's code\nMore text";
478        let blocks = CodeBlockUtils::detect_code_blocks(content);
479        assert_eq!(blocks.len(), 1);
480    }
481
482    #[test]
483    fn test_indented_content_with_list_markers_is_code_block() {
484        // Per CommonMark spec: 4-space indented content after blank line IS a code block,
485        // even if the content looks like list markers. The indentation takes precedence.
486        // Verified with: echo 'List:\n\n    - Item 1' | npx commonmark
487        // Output: <pre><code>- Item 1</code></pre>
488        let content = "List:\n\n    - Item 1\n    - Item 2\n    * Item 3\n    + Item 4";
489        let blocks = CodeBlockUtils::detect_code_blocks(content);
490        assert_eq!(blocks.len(), 1); // This IS a code block per spec
491
492        // Same for numbered list markers
493        let content = "List:\n\n    1. First\n    2. Second";
494        let blocks = CodeBlockUtils::detect_code_blocks(content);
495        assert_eq!(blocks.len(), 1); // This IS a code block per spec
496    }
497
498    #[test]
499    fn test_actual_list_items_not_code_blocks() {
500        // Actual list items (no preceding blank line + 4 spaces) are NOT code blocks
501        let content = "- Item 1\n- Item 2\n* Item 3";
502        let blocks = CodeBlockUtils::detect_code_blocks(content);
503        assert_eq!(blocks.len(), 0);
504
505        // Nested list items
506        let content = "- Item 1\n  - Nested item\n- Item 2";
507        let blocks = CodeBlockUtils::detect_code_blocks(content);
508        assert_eq!(blocks.len(), 0);
509    }
510
511    #[test]
512    fn test_inline_code_spans_not_detected() {
513        // Inline code spans should NOT be detected as code blocks
514        let content = "Text with `inline code` here";
515        let blocks = CodeBlockUtils::detect_code_blocks(content);
516        assert_eq!(blocks.len(), 0); // No blocks, only inline spans
517
518        // Multiple backtick code span
519        let content = "Text with ``code with ` backtick`` here";
520        let blocks = CodeBlockUtils::detect_code_blocks(content);
521        assert_eq!(blocks.len(), 0); // No blocks, only inline spans
522
523        // Multiple code spans
524        let content = "Has `code1` and `code2` spans";
525        let blocks = CodeBlockUtils::detect_code_blocks(content);
526        assert_eq!(blocks.len(), 0); // No blocks, only inline spans
527    }
528
529    #[test]
530    fn test_unclosed_code_span() {
531        // Unclosed code span should not be detected
532        let content = "Text with `unclosed code span";
533        let blocks = CodeBlockUtils::detect_code_blocks(content);
534        assert_eq!(blocks.len(), 0);
535
536        // Mismatched backticks
537        let content = "Text with ``one style` different close";
538        let blocks = CodeBlockUtils::detect_code_blocks(content);
539        assert_eq!(blocks.len(), 0);
540    }
541
542    #[test]
543    fn test_mixed_code_blocks_and_spans() {
544        let content = "Has `span1` text\n```\nblock\n```\nand `span2`";
545        let blocks = CodeBlockUtils::detect_code_blocks(content);
546        // Should only detect the fenced block, NOT the inline spans
547        assert_eq!(blocks.len(), 1);
548
549        // Check we have the fenced block only
550        assert!(blocks.iter().any(|(s, e)| content[*s..*e].contains("block")));
551        // Should NOT detect inline spans
552        assert!(!blocks.iter().any(|(s, e)| &content[*s..*e] == "`span1`"));
553        assert!(!blocks.iter().any(|(s, e)| &content[*s..*e] == "`span2`"));
554    }
555
556    #[test]
557    fn test_is_in_code_block_or_span() {
558        let blocks = vec![(10, 20), (30, 40), (50, 60)];
559
560        // Test positions inside blocks
561        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 15));
562        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 35));
563        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 55));
564
565        // Test positions at boundaries
566        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 10)); // Start is inclusive
567        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 20)); // End is exclusive
568
569        // Test positions outside blocks
570        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 5));
571        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 25));
572        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 65));
573    }
574
575    #[test]
576    fn test_empty_content() {
577        let blocks = CodeBlockUtils::detect_code_blocks("");
578        assert_eq!(blocks.len(), 0);
579    }
580
581    #[test]
582    fn test_code_block_at_start() {
583        let content = "```\ncode\n```\nText after";
584        let blocks = CodeBlockUtils::detect_code_blocks(content);
585        // 1 fenced block (fence markers no longer detected as inline spans)
586        assert_eq!(blocks.len(), 1);
587        assert_eq!(blocks[0].0, 0); // Fenced block starts at 0
588    }
589
590    #[test]
591    fn test_code_block_at_end() {
592        let content = "Text before\n```\ncode\n```";
593        let blocks = CodeBlockUtils::detect_code_blocks(content);
594        // 1 fenced block (fence markers no longer detected as inline spans)
595        assert_eq!(blocks.len(), 1);
596        // Check we have the fenced block
597        let fenced = blocks.iter().find(|(s, e)| content[*s..*e].contains("code"));
598        assert!(fenced.is_some());
599    }
600
601    #[test]
602    fn test_nested_fence_markers() {
603        // Code block containing fence markers as content
604        let content = "Text\n````\n```\nnested\n```\n````\nAfter";
605        let blocks = CodeBlockUtils::detect_code_blocks(content);
606        // Should detect: outer block, inner ```, outer ````
607        assert!(!blocks.is_empty());
608        // Check we have the outer block
609        let outer = blocks.iter().find(|(s, e)| content[*s..*e].contains("nested"));
610        assert!(outer.is_some());
611    }
612
613    #[test]
614    fn test_indented_code_with_blank_lines() {
615        // Indented code blocks can contain blank lines
616        let content = "Text\n\n    line1\n\n    line2\n\nAfter";
617        let blocks = CodeBlockUtils::detect_code_blocks(content);
618        // May have multiple blocks due to blank line handling
619        assert!(!blocks.is_empty());
620        // Check that we captured the indented code
621        let all_content: String = blocks
622            .iter()
623            .map(|(s, e)| &content[*s..*e])
624            .collect::<Vec<_>>()
625            .join("");
626        assert!(all_content.contains("line1") || content[blocks[0].0..blocks[0].1].contains("line1"));
627    }
628
629    #[test]
630    fn test_code_span_with_spaces() {
631        // Code spans should NOT be detected as code blocks
632        let content = "Text ` code with spaces ` more";
633        let blocks = CodeBlockUtils::detect_code_blocks(content);
634        assert_eq!(blocks.len(), 0); // No blocks, only inline span
635    }
636
637    #[test]
638    fn test_fenced_block_with_info_string() {
639        // Fenced code blocks with complex info strings
640        let content = "```rust,no_run,should_panic\ncode\n```";
641        let blocks = CodeBlockUtils::detect_code_blocks(content);
642        // 1 fenced block (fence markers no longer detected as inline spans)
643        assert_eq!(blocks.len(), 1);
644        assert_eq!(blocks[0].0, 0);
645    }
646
647    #[test]
648    fn test_indented_fences_not_code_blocks() {
649        // Indented fence markers should still work as fences
650        let content = "Text\n  ```\n  code\n  ```\nAfter";
651        let blocks = CodeBlockUtils::detect_code_blocks(content);
652        // Only 1 fenced block (indented fences still work)
653        assert_eq!(blocks.len(), 1);
654    }
655
656    // Issue #175: Backticks in info string invalidate the fence
657    #[test]
658    fn test_backticks_in_info_string_not_code_block() {
659        // Per CommonMark spec: "If the info string comes after a backtick fence,
660        // it may not contain any backtick characters."
661        // So ```something``` is NOT a valid fence - the backticks are treated as inline code.
662        // Verified with: echo '```something```' | npx commonmark
663        // Output: <p><code>something</code></p>
664        let content = "```something```\n\n```bash\n# comment\n```";
665        let blocks = CodeBlockUtils::detect_code_blocks(content);
666        // Should find only the valid ```bash block, NOT the invalid ```something```
667        assert_eq!(blocks.len(), 1);
668        // The valid block should contain "# comment"
669        assert!(content[blocks[0].0..blocks[0].1].contains("# comment"));
670    }
671
672    #[test]
673    fn test_issue_175_reproduction() {
674        // Full reproduction of issue #175
675        let content = "```something```\n\n```bash\n# Have a parrot\necho \"🦜\"\n```";
676        let blocks = CodeBlockUtils::detect_code_blocks(content);
677        // Only the bash block is a code block
678        assert_eq!(blocks.len(), 1);
679        assert!(content[blocks[0].0..blocks[0].1].contains("Have a parrot"));
680    }
681
682    #[test]
683    fn test_tilde_fence_allows_tildes_in_info_string() {
684        // Tilde fences CAN have tildes in info string (only backtick restriction exists)
685        // ~~~abc~~~ opens an unclosed code block with info string "abc~~~"
686        let content = "~~~abc~~~\ncode content\n~~~";
687        let blocks = CodeBlockUtils::detect_code_blocks(content);
688        // This is a valid tilde fence that opens and closes
689        assert_eq!(blocks.len(), 1);
690    }
691
692    #[test]
693    fn test_nested_longer_fence_contains_shorter() {
694        // Longer fence (````) can contain shorter fence (```) as content
695        let content = "````\n```\nnested content\n```\n````";
696        let blocks = CodeBlockUtils::detect_code_blocks(content);
697        assert_eq!(blocks.len(), 1);
698        assert!(content[blocks[0].0..blocks[0].1].contains("nested content"));
699    }
700
701    #[test]
702    fn test_mixed_fence_types() {
703        // Tilde fence contains backtick markers as content
704        let content = "~~~\n```\nmixed content\n~~~";
705        let blocks = CodeBlockUtils::detect_code_blocks(content);
706        assert_eq!(blocks.len(), 1);
707        assert!(content[blocks[0].0..blocks[0].1].contains("mixed content"));
708    }
709
710    #[test]
711    fn test_indented_code_in_list_issue_276() {
712        // Issue #276: Indented code block inside a list should be detected by pulldown-cmark
713        let content = r#"1. First item
7142. Second item with code:
715
716        # This is a code block in a list
717        print("Hello, world!")
718
7194. Third item"#;
720
721        let blocks = CodeBlockUtils::detect_code_blocks(content);
722        // pulldown-cmark SHOULD detect this indented code block inside the list
723        assert!(!blocks.is_empty(), "Should detect indented code block inside list");
724
725        // Verify the detected block contains our code
726        let all_content: String = blocks
727            .iter()
728            .map(|(s, e)| &content[*s..*e])
729            .collect::<Vec<_>>()
730            .join("");
731        assert!(
732            all_content.contains("code block in a list") || all_content.contains("print"),
733            "Detected block should contain the code content: {all_content:?}"
734        );
735    }
736
737    #[test]
738    fn test_detect_markdown_code_blocks() {
739        let content = r#"# Example
740
741```markdown
742# Heading
743Content here
744```
745
746```md
747Another heading
748More content
749```
750
751```rust
752// Not markdown
753fn main() {}
754```
755"#;
756
757        let blocks = CodeBlockUtils::detect_markdown_code_blocks(content);
758
759        // Should detect 2 blocks (markdown and md, not rust)
760        assert_eq!(
761            blocks.len(),
762            2,
763            "Should detect exactly 2 markdown blocks, got {blocks:?}"
764        );
765
766        // First block should be the ```markdown block
767        let first = &blocks[0];
768        let first_content = &content[first.content_start..first.content_end];
769        assert!(
770            first_content.contains("# Heading"),
771            "First block should contain '# Heading', got: {first_content:?}"
772        );
773
774        // Second block should be the ```md block
775        let second = &blocks[1];
776        let second_content = &content[second.content_start..second.content_end];
777        assert!(
778            second_content.contains("Another heading"),
779            "Second block should contain 'Another heading', got: {second_content:?}"
780        );
781    }
782
783    #[test]
784    fn test_detect_markdown_code_blocks_empty() {
785        let content = "# Just a heading\n\nNo code blocks here\n";
786        let blocks = CodeBlockUtils::detect_markdown_code_blocks(content);
787        assert_eq!(blocks.len(), 0);
788    }
789
790    #[test]
791    fn test_detect_markdown_code_blocks_case_insensitive() {
792        let content = "```MARKDOWN\nContent\n```\n";
793        let blocks = CodeBlockUtils::detect_markdown_code_blocks(content);
794        assert_eq!(blocks.len(), 1);
795    }
796
797    #[test]
798    fn test_detect_markdown_code_blocks_at_eof_no_trailing_newline() {
799        // Block at end of file without trailing newline after closing fence
800        let content = "# Doc\n\n```markdown\nContent\n```";
801        let blocks = CodeBlockUtils::detect_markdown_code_blocks(content);
802        assert_eq!(blocks.len(), 1);
803        // Content should be extractable without panic
804        let block_content = &content[blocks[0].content_start..blocks[0].content_end];
805        assert!(block_content.contains("Content"));
806    }
807
808    #[test]
809    fn test_detect_markdown_code_blocks_single_line_content() {
810        // Single line of content, no extra newlines
811        let content = "```markdown\nX\n```\n";
812        let blocks = CodeBlockUtils::detect_markdown_code_blocks(content);
813        assert_eq!(blocks.len(), 1);
814        let block_content = &content[blocks[0].content_start..blocks[0].content_end];
815        assert_eq!(block_content, "X");
816    }
817
818    #[test]
819    fn test_detect_markdown_code_blocks_empty_content() {
820        // Block with no content between fences
821        let content = "```markdown\n```\n";
822        let blocks = CodeBlockUtils::detect_markdown_code_blocks(content);
823        // Should detect block but with empty range or not at all
824        // Either behavior is acceptable as long as no panic
825        if !blocks.is_empty() {
826            // If detected, content range should be valid
827            assert!(blocks[0].content_start <= blocks[0].content_end);
828        }
829    }
830
831    #[test]
832    fn test_detect_markdown_code_blocks_validates_ranges() {
833        // Ensure no panic on various edge cases
834        let test_cases = [
835            "",                             // Empty content
836            "```markdown",                  // Unclosed block
837            "```markdown\n",                // Unclosed block with newline
838            "```\n```",                     // Non-markdown block
839            "```markdown\n```",             // Empty markdown block
840            "   ```markdown\n   X\n   ```", // Indented block
841        ];
842
843        for content in test_cases {
844            // Should not panic
845            let blocks = CodeBlockUtils::detect_markdown_code_blocks(content);
846            // All detected blocks should have valid ranges
847            for block in &blocks {
848                assert!(
849                    block.content_start <= block.content_end,
850                    "Invalid range in content: {content:?}"
851                );
852                assert!(
853                    block.content_end <= content.len(),
854                    "Range exceeds content length in: {content:?}"
855                );
856            }
857        }
858    }
859
860    // ── is_in_code_block binary search tests ─────────────────────────────
861
862    #[test]
863    fn test_is_in_code_block_empty_blocks() {
864        assert!(!CodeBlockUtils::is_in_code_block(&[], 0));
865        assert!(!CodeBlockUtils::is_in_code_block(&[], 100));
866        assert!(!CodeBlockUtils::is_in_code_block(&[], usize::MAX));
867    }
868
869    #[test]
870    fn test_is_in_code_block_single_range() {
871        let blocks = [(10, 20)];
872        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 0));
873        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 9));
874        assert!(CodeBlockUtils::is_in_code_block(&blocks, 10));
875        assert!(CodeBlockUtils::is_in_code_block(&blocks, 15));
876        assert!(CodeBlockUtils::is_in_code_block(&blocks, 19));
877        // end is exclusive
878        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 20));
879        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 21));
880    }
881
882    #[test]
883    fn test_is_in_code_block_multiple_ranges() {
884        let blocks = [(5, 10), (20, 30), (50, 60)];
885        // Before all
886        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 0));
887        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 4));
888        // In first
889        assert!(CodeBlockUtils::is_in_code_block(&blocks, 5));
890        assert!(CodeBlockUtils::is_in_code_block(&blocks, 9));
891        // Gap between first and second
892        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 10));
893        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 15));
894        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 19));
895        // In second
896        assert!(CodeBlockUtils::is_in_code_block(&blocks, 20));
897        assert!(CodeBlockUtils::is_in_code_block(&blocks, 29));
898        // Gap between second and third
899        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 30));
900        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 49));
901        // In third
902        assert!(CodeBlockUtils::is_in_code_block(&blocks, 50));
903        assert!(CodeBlockUtils::is_in_code_block(&blocks, 59));
904        // After all
905        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 60));
906        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 1000));
907    }
908
909    #[test]
910    fn test_is_in_code_block_adjacent_ranges() {
911        // Ranges that are exactly adjacent (end of one == start of next)
912        let blocks = [(0, 10), (10, 20), (20, 30)];
913        assert!(CodeBlockUtils::is_in_code_block(&blocks, 0));
914        assert!(CodeBlockUtils::is_in_code_block(&blocks, 9));
915        assert!(CodeBlockUtils::is_in_code_block(&blocks, 10));
916        assert!(CodeBlockUtils::is_in_code_block(&blocks, 19));
917        assert!(CodeBlockUtils::is_in_code_block(&blocks, 20));
918        assert!(CodeBlockUtils::is_in_code_block(&blocks, 29));
919        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 30));
920    }
921
922    #[test]
923    fn test_is_in_code_block_single_byte_range() {
924        let blocks = [(5, 6)];
925        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 4));
926        assert!(CodeBlockUtils::is_in_code_block(&blocks, 5));
927        assert!(!CodeBlockUtils::is_in_code_block(&blocks, 6));
928    }
929
930    #[test]
931    fn test_is_in_code_block_matches_linear_scan() {
932        // Verify binary search produces identical results to linear scan
933        // for a realistic document layout
934        let content = "# Heading\n\n```rust\nlet x = 1;\nlet y = 2;\n```\n\nSome text\n\n```\nmore code\n```\n\nEnd\n";
935        let blocks = CodeBlockUtils::detect_code_blocks(content);
936
937        for pos in 0..content.len() {
938            let binary = CodeBlockUtils::is_in_code_block(&blocks, pos);
939            let linear = blocks.iter().any(|&(s, e)| pos >= s && pos < e);
940            assert_eq!(
941                binary, linear,
942                "Mismatch at pos {pos}: binary={binary}, linear={linear}, blocks={blocks:?}"
943            );
944        }
945    }
946
947    #[test]
948    fn test_is_in_code_block_at_range_boundaries() {
949        // Exhaustive boundary testing for every block start/end
950        let blocks = [(100, 200), (300, 400), (500, 600)];
951        for &(start, end) in &blocks {
952            assert!(
953                !CodeBlockUtils::is_in_code_block(&blocks, start - 1),
954                "pos={} should be outside",
955                start - 1
956            );
957            assert!(
958                CodeBlockUtils::is_in_code_block(&blocks, start),
959                "pos={start} should be inside"
960            );
961            assert!(
962                CodeBlockUtils::is_in_code_block(&blocks, end - 1),
963                "pos={} should be inside",
964                end - 1
965            );
966            assert!(
967                !CodeBlockUtils::is_in_code_block(&blocks, end),
968                "pos={end} should be outside"
969            );
970        }
971    }
972}