Skip to main content

rumdl_lib/utils/
code_block_utils.rs

1//!
2//! Utility functions for detecting and handling code blocks and code spans in Markdown for rumdl.
3//!
4//! Code block detection is delegated to pulldown-cmark, which correctly implements the
5//! CommonMark specification. This handles edge cases like:
6//! - Backtick fences with backticks in the info string (invalid per spec)
7//! - Nested fences (longer fence contains shorter fence as content)
8//! - Mixed fence types (tilde fence contains backticks as content)
9//! - Indented code blocks with proper list context handling
10
11use pulldown_cmark::{CodeBlockKind, Event, Parser, Tag, TagEnd};
12
13use super::parser_options::rumdl_parser_options;
14
/// Location and classification of one code block captured during parsing.
///
/// `start` and `end` are byte offsets into the parsed content; `end` is the
/// exclusive end of the parser-reported range.
///
/// Derives `PartialEq`/`Eq` so parse results can be compared directly in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CodeBlockDetail {
    /// Byte offset where this code block starts
    pub start: usize,
    /// Byte offset where this code block ends
    pub end: usize,
    /// `true` for a fenced code block, `false` for an indented one
    pub is_fenced: bool,
    /// Info string of a fenced block (e.g., "rust" from ```rust); empty for indented blocks
    pub info_string: String,
}
27
/// A strong emphasis span captured during parsing.
///
/// Offsets are byte offsets into the parsed content and include the
/// two-character delimiters on both sides.
///
/// Derives `PartialEq`/`Eq` so spans can be compared directly in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct StrongSpanDetail {
    /// Byte offset where the strong span starts (including **)
    pub start: usize,
    /// Byte offset where the strong span ends (including **)
    pub end: usize,
    /// `true` when the span is delimited by `**`, `false` for `__`
    pub is_asterisk: bool,
}
38
/// Ordered-list membership keyed by 1-indexed line number; the value is the ID
/// of the ordered list whose item starts on that line.
pub type LineToListMap = std::collections::HashMap<usize, usize>;
/// Start value of each ordered list, keyed by list ID.
pub type ListStartValues = std::collections::HashMap<usize, u64>;
43
44/// Result of the central pulldown-cmark parse, capturing all data needed by individual rules
45pub struct ParseResult {
46    /// Code block byte ranges (start, end)
47    pub code_blocks: Vec<(usize, usize)>,
48    /// Inline code span byte ranges (start, end)
49    pub code_spans: Vec<(usize, usize)>,
50    /// Detailed code block info (fenced vs indented, info string)
51    pub code_block_details: Vec<CodeBlockDetail>,
52    /// Strong emphasis span details
53    pub strong_spans: Vec<StrongSpanDetail>,
54    /// Ordered list membership: maps line number (1-indexed) to list ID
55    pub line_to_list: LineToListMap,
56    /// Ordered list start values: maps list ID to start value
57    pub list_start_values: ListStartValues,
58}
59
/// How a code block relates to the list content around it.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum CodeBlockContext {
    /// Root-level block set off by blank lines; it separates lists
    Standalone,
    /// Block indented far enough to continue the enclosing list
    Indented,
    /// Block directly next to list content (edge case, treated as non-breaking)
    Adjacent,
}
70
/// Namespace type for the code block detection helpers; it carries no state.
///
/// Derives the standard traits so the public type is printable and trivially constructible.
#[derive(Debug, Clone, Copy, Default)]
pub struct CodeBlockUtils;
73
74impl CodeBlockUtils {
75    /// Detect all code blocks in the content (NOT including inline code spans)
76    ///
77    /// Uses pulldown-cmark for spec-compliant CommonMark parsing. This correctly handles:
78    /// - Fenced code blocks (``` and ~~~)
79    /// - Indented code blocks (4 spaces or tab)
80    /// - Code blocks inside lists, blockquotes, and other containers
81    /// - Edge cases like backticks in info strings (which invalidate the fence)
82    ///
83    /// Returns a sorted vector of (start, end) byte offset tuples.
84    pub fn detect_code_blocks(content: &str) -> Vec<(usize, usize)> {
85        Self::detect_code_blocks_and_spans(content).code_blocks
86    }
87
88    /// Returns code block ranges, inline code span ranges, and detailed code block info
89    /// in a single pulldown-cmark pass.
90    pub fn detect_code_blocks_and_spans(content: &str) -> ParseResult {
91        let mut blocks = Vec::new();
92        let mut spans = Vec::new();
93        let mut details = Vec::new();
94        let mut strong_spans = Vec::new();
95        let mut code_block_start: Option<(usize, bool, String)> = None;
96
97        // List membership tracking for ordered lists
98        let mut line_to_list = LineToListMap::new();
99        let mut list_start_values = ListStartValues::new();
100        let mut list_stack: Vec<(usize, bool, u64)> = Vec::new(); // (list_id, is_ordered, start_value)
101        let mut next_list_id: usize = 0;
102
103        // Pre-compute line start offsets for byte-to-line conversion
104        let line_starts: Vec<usize> = std::iter::once(0)
105            .chain(content.match_indices('\n').map(|(i, _)| i + 1))
106            .collect();
107
108        let byte_to_line = |byte_offset: usize| -> usize { line_starts.partition_point(|&start| start <= byte_offset) };
109
110        let options = rumdl_parser_options();
111        let parser = Parser::new_ext(content, options).into_offset_iter();
112
113        for (event, range) in parser {
114            match event {
115                Event::Start(Tag::CodeBlock(kind)) => {
116                    let (is_fenced, info_string) = match &kind {
117                        CodeBlockKind::Fenced(info) => (true, info.to_string()),
118                        CodeBlockKind::Indented => (false, String::new()),
119                    };
120                    code_block_start = Some((range.start, is_fenced, info_string));
121                }
122                Event::End(TagEnd::CodeBlock) => {
123                    if let Some((start, is_fenced, info_string)) = code_block_start.take() {
124                        blocks.push((start, range.end));
125                        details.push(CodeBlockDetail {
126                            start,
127                            end: range.end,
128                            is_fenced,
129                            info_string,
130                        });
131                    }
132                }
133                Event::Start(Tag::Strong) => {
134                    if range.start + 2 <= content.len() {
135                        let is_asterisk = &content[range.start..range.start + 2] == "**";
136                        strong_spans.push(StrongSpanDetail {
137                            start: range.start,
138                            end: range.end,
139                            is_asterisk,
140                        });
141                    }
142                }
143                Event::Start(Tag::List(start_num)) => {
144                    let is_ordered = start_num.is_some();
145                    let start_value = start_num.unwrap_or(1);
146                    list_stack.push((next_list_id, is_ordered, start_value));
147                    if is_ordered {
148                        list_start_values.insert(next_list_id, start_value);
149                    }
150                    next_list_id += 1;
151                }
152                Event::End(TagEnd::List(_)) => {
153                    list_stack.pop();
154                }
155                Event::Start(Tag::Item) => {
156                    if let Some(&(list_id, is_ordered, _)) = list_stack.last()
157                        && is_ordered
158                    {
159                        let line_num = byte_to_line(range.start);
160                        line_to_list.insert(line_num, list_id);
161                    }
162                }
163                Event::Code(_) => {
164                    spans.push((range.start, range.end));
165                }
166                _ => {}
167            }
168        }
169
170        // Handle edge case: unclosed code block at end of content
171        // pulldown-cmark should handle this, but be defensive
172        if let Some((start, is_fenced, info_string)) = code_block_start {
173            blocks.push((start, content.len()));
174            details.push(CodeBlockDetail {
175                start,
176                end: content.len(),
177                is_fenced,
178                info_string,
179            });
180        }
181
182        // Sort by start position (should already be sorted, but ensure consistency)
183        blocks.sort_by_key(|&(start, _)| start);
184        spans.sort_by_key(|&(start, _)| start);
185        details.sort_by_key(|d| d.start);
186        strong_spans.sort_by_key(|s| s.start);
187        ParseResult {
188            code_blocks: blocks,
189            code_spans: spans,
190            code_block_details: details,
191            strong_spans,
192            line_to_list,
193            list_start_values,
194        }
195    }
196
197    /// Check if a position is within a code block (for compatibility)
198    pub fn is_in_code_block_or_span(blocks: &[(usize, usize)], pos: usize) -> bool {
199        Self::is_in_code_block(blocks, pos)
200    }
201
202    /// Check if a byte position falls within any of the given sorted, non-overlapping ranges.
203    ///
204    /// Uses binary search on the sorted block ranges for O(log n) lookup.
205    /// The blocks slice must be sorted by start position (as returned by
206    /// `detect_code_blocks` and `detect_code_blocks_and_spans`).
207    pub fn is_in_code_block(blocks: &[(usize, usize)], pos: usize) -> bool {
208        // Binary search: find the last block whose start <= pos
209        let idx = blocks.partition_point(|&(start, _)| start <= pos);
210        // partition_point returns the first index where start > pos,
211        // so the candidate is at idx - 1
212        idx > 0 && pos < blocks[idx - 1].1
213    }
214
215    /// Analyze code block context relative to list parsing
216    /// This is the core function implementing Design #3's three-tier classification
217    pub fn analyze_code_block_context(
218        lines: &[crate::lint_context::LineInfo],
219        line_idx: usize,
220        min_continuation_indent: usize,
221    ) -> CodeBlockContext {
222        if let Some(line_info) = lines.get(line_idx) {
223            // Rule 1: Indentation Analysis - Is it sufficiently indented for list continuation?
224            if line_info.indent >= min_continuation_indent {
225                return CodeBlockContext::Indented;
226            }
227
228            // Rule 2: Blank Line Context - Check for structural separation indicators
229            let (prev_blanks, next_blanks) = Self::count_surrounding_blank_lines(lines, line_idx);
230
231            // Rule 3: Standalone Detection - Insufficient indentation + blank line separation
232            // This is the key fix: root-level code blocks with blank lines separate lists
233            if prev_blanks > 0 || next_blanks > 0 {
234                return CodeBlockContext::Standalone;
235            }
236
237            // Rule 4: Default - Adjacent (conservative, non-breaking for edge cases)
238            CodeBlockContext::Adjacent
239        } else {
240            // Fallback for invalid line index
241            CodeBlockContext::Adjacent
242        }
243    }
244
245    /// Count blank lines before and after the given line index
246    fn count_surrounding_blank_lines(lines: &[crate::lint_context::LineInfo], line_idx: usize) -> (usize, usize) {
247        let mut prev_blanks = 0;
248        let mut next_blanks = 0;
249
250        // Count blank lines before (look backwards)
251        for i in (0..line_idx).rev() {
252            if let Some(line) = lines.get(i) {
253                if line.is_blank {
254                    prev_blanks += 1;
255                } else {
256                    break;
257                }
258            } else {
259                break;
260            }
261        }
262
263        // Count blank lines after (look forwards)
264        for i in (line_idx + 1)..lines.len() {
265            if let Some(line) = lines.get(i) {
266                if line.is_blank {
267                    next_blanks += 1;
268                } else {
269                    break;
270                }
271            } else {
272                break;
273            }
274        }
275
276        (prev_blanks, next_blanks)
277    }
278
279    /// Calculate minimum indentation required for code block to continue a list
280    /// Based on the most recent list item's marker width
281    pub fn calculate_min_continuation_indent(
282        content: &str,
283        lines: &[crate::lint_context::LineInfo],
284        current_line_idx: usize,
285    ) -> usize {
286        // Look backwards to find the most recent list item
287        for i in (0..current_line_idx).rev() {
288            if let Some(line_info) = lines.get(i) {
289                if let Some(list_item) = &line_info.list_item {
290                    // Calculate minimum continuation indent for this list item
291                    return if list_item.is_ordered {
292                        list_item.marker_column + list_item.marker.len() + 1 // +1 for space after marker
293                    } else {
294                        list_item.marker_column + 2 // Unordered lists need marker + space (min 2)
295                    };
296                }
297
298                // Stop at structural separators that would break list context
299                if line_info.heading.is_some() || Self::is_structural_separator(line_info.content(content)) {
300                    break;
301                }
302            }
303        }
304
305        0 // No list context found
306    }
307
308    /// Check if content is a structural separator (headings, horizontal rules, etc.)
309    fn is_structural_separator(content: &str) -> bool {
310        let trimmed = content.trim();
311        trimmed.starts_with("---")
312            || trimmed.starts_with("***")
313            || trimmed.starts_with("___")
314            || crate::utils::skip_context::is_table_line(trimmed)
315            || trimmed.starts_with('>') // Blockquotes
316    }
317
318    /// Detect fenced code blocks with markdown/md language tag.
319    ///
320    /// Returns a vector of `MarkdownCodeBlock` containing byte ranges for the
321    /// content between the fences (excluding the fence lines themselves).
322    ///
323    /// Only detects fenced code blocks (``` or ~~~), not indented code blocks,
324    /// since indented blocks don't have a language tag.
325    pub fn detect_markdown_code_blocks(content: &str) -> Vec<MarkdownCodeBlock> {
326        use pulldown_cmark::{CodeBlockKind, Event, Parser, Tag, TagEnd};
327
328        let mut blocks = Vec::new();
329        let mut current_block: Option<MarkdownCodeBlockBuilder> = None;
330
331        let options = rumdl_parser_options();
332        let parser = Parser::new_ext(content, options).into_offset_iter();
333
334        for (event, range) in parser {
335            match event {
336                Event::Start(Tag::CodeBlock(CodeBlockKind::Fenced(info))) => {
337                    // Check if language is markdown or md (first word of info string)
338                    let language = info.split_whitespace().next().unwrap_or("");
339                    if language.eq_ignore_ascii_case("markdown") || language.eq_ignore_ascii_case("md") {
340                        // Find where content starts (after the opening fence line)
341                        let block_start = range.start;
342                        let content_start = content[block_start..]
343                            .find('\n')
344                            .map_or(content.len(), |i| block_start + i + 1);
345
346                        current_block = Some(MarkdownCodeBlockBuilder { content_start });
347                    }
348                }
349                Event::End(TagEnd::CodeBlock) => {
350                    if let Some(builder) = current_block.take() {
351                        // Find where content ends (before the closing fence line)
352                        let block_end = range.end;
353
354                        // Validate range before slicing
355                        if builder.content_start > block_end || builder.content_start > content.len() {
356                            continue;
357                        }
358
359                        let search_range = &content[builder.content_start..block_end.min(content.len())];
360                        let content_end = search_range
361                            .rfind('\n')
362                            .map_or(builder.content_start, |i| builder.content_start + i);
363
364                        // Only add block if it has valid content range
365                        if content_end >= builder.content_start {
366                            blocks.push(MarkdownCodeBlock {
367                                content_start: builder.content_start,
368                                content_end,
369                            });
370                        }
371                    }
372                }
373                _ => {}
374            }
375        }
376
377        blocks
378    }
379}
380
/// Byte range of the content of a markdown-tagged fenced code block, used for recursive formatting.
///
/// Derives `PartialEq`/`Eq` so detected blocks can be compared directly in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MarkdownCodeBlock {
    /// Byte offset where the content starts (after opening fence line)
    pub content_start: usize,
    /// Byte offset where the content ends (before closing fence line)
    pub content_end: usize,
}
389
/// Partial `MarkdownCodeBlock` held between a fence's start and end events.
struct MarkdownCodeBlockBuilder {
    /// Byte offset just past the opening fence line
    content_start: usize,
}
394
395#[cfg(test)]
396mod tests {
397    use super::*;
398
399    #[test]
400    fn test_detect_fenced_code_blocks() {
401        // The function detects fenced blocks and inline code spans
402        // Fence markers (``` at line start) are now skipped in inline span detection
403
404        // Basic fenced code block with backticks
405        let content = "Some text\n```\ncode here\n```\nMore text";
406        let blocks = CodeBlockUtils::detect_code_blocks(content);
407        // Should find: 1 fenced block (fences are no longer detected as inline spans)
408        assert_eq!(blocks.len(), 1);
409
410        // Check that we have the fenced block
411        let fenced_block = blocks
412            .iter()
413            .find(|(start, end)| end - start > 10 && content[*start..*end].contains("code here"));
414        assert!(fenced_block.is_some());
415
416        // Fenced code block with tildes (no inline code detection for ~)
417        let content = "Some text\n~~~\ncode here\n~~~\nMore text";
418        let blocks = CodeBlockUtils::detect_code_blocks(content);
419        assert_eq!(blocks.len(), 1);
420        assert_eq!(&content[blocks[0].0..blocks[0].1], "~~~\ncode here\n~~~");
421
422        // Multiple code blocks
423        let content = "Text\n```\ncode1\n```\nMiddle\n~~~\ncode2\n~~~\nEnd";
424        let blocks = CodeBlockUtils::detect_code_blocks(content);
425        // 2 fenced blocks (fence markers no longer detected as inline spans)
426        assert_eq!(blocks.len(), 2);
427    }
428
429    #[test]
430    fn test_detect_code_blocks_with_language() {
431        // Code block with language identifier
432        let content = "Text\n```rust\nfn main() {}\n```\nMore";
433        let blocks = CodeBlockUtils::detect_code_blocks(content);
434        // 1 fenced block (fence markers no longer detected as inline spans)
435        assert_eq!(blocks.len(), 1);
436        // Check we have the full fenced block
437        let fenced = blocks.iter().find(|(s, e)| content[*s..*e].contains("fn main"));
438        assert!(fenced.is_some());
439    }
440
441    #[test]
442    fn test_unclosed_code_block() {
443        // Unclosed code block should extend to end of content
444        let content = "Text\n```\ncode here\nno closing fence";
445        let blocks = CodeBlockUtils::detect_code_blocks(content);
446        assert_eq!(blocks.len(), 1);
447        assert_eq!(blocks[0].1, content.len());
448    }
449
450    #[test]
451    fn test_indented_code_blocks() {
452        // Basic indented code block
453        let content = "Paragraph\n\n    code line 1\n    code line 2\n\nMore text";
454        let blocks = CodeBlockUtils::detect_code_blocks(content);
455        assert_eq!(blocks.len(), 1);
456        assert!(content[blocks[0].0..blocks[0].1].contains("code line 1"));
457        assert!(content[blocks[0].0..blocks[0].1].contains("code line 2"));
458
459        // Indented code with tabs
460        let content = "Paragraph\n\n\tcode with tab\n\tanother line\n\nText";
461        let blocks = CodeBlockUtils::detect_code_blocks(content);
462        assert_eq!(blocks.len(), 1);
463    }
464
465    #[test]
466    fn test_indented_code_requires_blank_line() {
467        // Indented lines without preceding blank line are not code blocks
468        let content = "Paragraph\n    indented but not code\nMore text";
469        let blocks = CodeBlockUtils::detect_code_blocks(content);
470        assert_eq!(blocks.len(), 0);
471
472        // With blank line, it becomes a code block
473        let content = "Paragraph\n\n    now it's code\nMore text";
474        let blocks = CodeBlockUtils::detect_code_blocks(content);
475        assert_eq!(blocks.len(), 1);
476    }
477
478    #[test]
479    fn test_indented_content_with_list_markers_is_code_block() {
480        // Per CommonMark spec: 4-space indented content after blank line IS a code block,
481        // even if the content looks like list markers. The indentation takes precedence.
482        // Verified with: echo 'List:\n\n    - Item 1' | npx commonmark
483        // Output: <pre><code>- Item 1</code></pre>
484        let content = "List:\n\n    - Item 1\n    - Item 2\n    * Item 3\n    + Item 4";
485        let blocks = CodeBlockUtils::detect_code_blocks(content);
486        assert_eq!(blocks.len(), 1); // This IS a code block per spec
487
488        // Same for numbered list markers
489        let content = "List:\n\n    1. First\n    2. Second";
490        let blocks = CodeBlockUtils::detect_code_blocks(content);
491        assert_eq!(blocks.len(), 1); // This IS a code block per spec
492    }
493
494    #[test]
495    fn test_actual_list_items_not_code_blocks() {
496        // Actual list items (no preceding blank line + 4 spaces) are NOT code blocks
497        let content = "- Item 1\n- Item 2\n* Item 3";
498        let blocks = CodeBlockUtils::detect_code_blocks(content);
499        assert_eq!(blocks.len(), 0);
500
501        // Nested list items
502        let content = "- Item 1\n  - Nested item\n- Item 2";
503        let blocks = CodeBlockUtils::detect_code_blocks(content);
504        assert_eq!(blocks.len(), 0);
505    }
506
507    #[test]
508    fn test_inline_code_spans_not_detected() {
509        // Inline code spans should NOT be detected as code blocks
510        let content = "Text with `inline code` here";
511        let blocks = CodeBlockUtils::detect_code_blocks(content);
512        assert_eq!(blocks.len(), 0); // No blocks, only inline spans
513
514        // Multiple backtick code span
515        let content = "Text with ``code with ` backtick`` here";
516        let blocks = CodeBlockUtils::detect_code_blocks(content);
517        assert_eq!(blocks.len(), 0); // No blocks, only inline spans
518
519        // Multiple code spans
520        let content = "Has `code1` and `code2` spans";
521        let blocks = CodeBlockUtils::detect_code_blocks(content);
522        assert_eq!(blocks.len(), 0); // No blocks, only inline spans
523    }
524
525    #[test]
526    fn test_unclosed_code_span() {
527        // Unclosed code span should not be detected
528        let content = "Text with `unclosed code span";
529        let blocks = CodeBlockUtils::detect_code_blocks(content);
530        assert_eq!(blocks.len(), 0);
531
532        // Mismatched backticks
533        let content = "Text with ``one style` different close";
534        let blocks = CodeBlockUtils::detect_code_blocks(content);
535        assert_eq!(blocks.len(), 0);
536    }
537
538    #[test]
539    fn test_mixed_code_blocks_and_spans() {
540        let content = "Has `span1` text\n```\nblock\n```\nand `span2`";
541        let blocks = CodeBlockUtils::detect_code_blocks(content);
542        // Should only detect the fenced block, NOT the inline spans
543        assert_eq!(blocks.len(), 1);
544
545        // Check we have the fenced block only
546        assert!(blocks.iter().any(|(s, e)| content[*s..*e].contains("block")));
547        // Should NOT detect inline spans
548        assert!(!blocks.iter().any(|(s, e)| &content[*s..*e] == "`span1`"));
549        assert!(!blocks.iter().any(|(s, e)| &content[*s..*e] == "`span2`"));
550    }
551
552    #[test]
553    fn test_is_in_code_block_or_span() {
554        let blocks = vec![(10, 20), (30, 40), (50, 60)];
555
556        // Test positions inside blocks
557        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 15));
558        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 35));
559        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 55));
560
561        // Test positions at boundaries
562        assert!(CodeBlockUtils::is_in_code_block_or_span(&blocks, 10)); // Start is inclusive
563        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 20)); // End is exclusive
564
565        // Test positions outside blocks
566        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 5));
567        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 25));
568        assert!(!CodeBlockUtils::is_in_code_block_or_span(&blocks, 65));
569    }
570
571    #[test]
572    fn test_empty_content() {
573        let blocks = CodeBlockUtils::detect_code_blocks("");
574        assert_eq!(blocks.len(), 0);
575    }
576
577    #[test]
578    fn test_code_block_at_start() {
579        let content = "```\ncode\n```\nText after";
580        let blocks = CodeBlockUtils::detect_code_blocks(content);
581        // 1 fenced block (fence markers no longer detected as inline spans)
582        assert_eq!(blocks.len(), 1);
583        assert_eq!(blocks[0].0, 0); // Fenced block starts at 0
584    }
585
586    #[test]
587    fn test_code_block_at_end() {
588        let content = "Text before\n```\ncode\n```";
589        let blocks = CodeBlockUtils::detect_code_blocks(content);
590        // 1 fenced block (fence markers no longer detected as inline spans)
591        assert_eq!(blocks.len(), 1);
592        // Check we have the fenced block
593        let fenced = blocks.iter().find(|(s, e)| content[*s..*e].contains("code"));
594        assert!(fenced.is_some());
595    }
596
597    #[test]
598    fn test_nested_fence_markers() {
599        // Code block containing fence markers as content
600        let content = "Text\n````\n```\nnested\n```\n````\nAfter";
601        let blocks = CodeBlockUtils::detect_code_blocks(content);
602        // Should detect: outer block, inner ```, outer ````
603        assert!(!blocks.is_empty());
604        // Check we have the outer block
605        let outer = blocks.iter().find(|(s, e)| content[*s..*e].contains("nested"));
606        assert!(outer.is_some());
607    }
608
609    #[test]
610    fn test_indented_code_with_blank_lines() {
611        // Indented code blocks can contain blank lines
612        let content = "Text\n\n    line1\n\n    line2\n\nAfter";
613        let blocks = CodeBlockUtils::detect_code_blocks(content);
614        // May have multiple blocks due to blank line handling
615        assert!(!blocks.is_empty());
616        // Check that we captured the indented code
617        let all_content: String = blocks
618            .iter()
619            .map(|(s, e)| &content[*s..*e])
620            .collect::<Vec<_>>()
621            .join("");
622        assert!(all_content.contains("line1") || content[blocks[0].0..blocks[0].1].contains("line1"));
623    }
624
625    #[test]
626    fn test_code_span_with_spaces() {
627        // Code spans should NOT be detected as code blocks
628        let content = "Text ` code with spaces ` more";
629        let blocks = CodeBlockUtils::detect_code_blocks(content);
630        assert_eq!(blocks.len(), 0); // No blocks, only inline span
631    }
632
633    #[test]
634    fn test_fenced_block_with_info_string() {
635        // Fenced code blocks with complex info strings
636        let content = "```rust,no_run,should_panic\ncode\n```";
637        let blocks = CodeBlockUtils::detect_code_blocks(content);
638        // 1 fenced block (fence markers no longer detected as inline spans)
639        assert_eq!(blocks.len(), 1);
640        assert_eq!(blocks[0].0, 0);
641    }
642
643    #[test]
644    fn test_indented_fences_not_code_blocks() {
645        // Indented fence markers should still work as fences
646        let content = "Text\n  ```\n  code\n  ```\nAfter";
647        let blocks = CodeBlockUtils::detect_code_blocks(content);
648        // Only 1 fenced block (indented fences still work)
649        assert_eq!(blocks.len(), 1);
650    }
651
652    // Issue #175: Backticks in info string invalidate the fence
653    #[test]
654    fn test_backticks_in_info_string_not_code_block() {
655        // Per CommonMark spec: "If the info string comes after a backtick fence,
656        // it may not contain any backtick characters."
657        // So ```something``` is NOT a valid fence - the backticks are treated as inline code.
658        // Verified with: echo '```something```' | npx commonmark
659        // Output: <p><code>something</code></p>
660        let content = "```something```\n\n```bash\n# comment\n```";
661        let blocks = CodeBlockUtils::detect_code_blocks(content);
662        // Should find only the valid ```bash block, NOT the invalid ```something```
663        assert_eq!(blocks.len(), 1);
664        // The valid block should contain "# comment"
665        assert!(content[blocks[0].0..blocks[0].1].contains("# comment"));
666    }
667
668    #[test]
669    fn test_issue_175_reproduction() {
670        // Full reproduction of issue #175
671        let content = "```something```\n\n```bash\n# Have a parrot\necho \"🦜\"\n```";
672        let blocks = CodeBlockUtils::detect_code_blocks(content);
673        // Only the bash block is a code block
674        assert_eq!(blocks.len(), 1);
675        assert!(content[blocks[0].0..blocks[0].1].contains("Have a parrot"));
676    }
677
678    #[test]
679    fn test_tilde_fence_allows_tildes_in_info_string() {
680        // Tilde fences CAN have tildes in info string (only backtick restriction exists)
681        // ~~~abc~~~ opens an unclosed code block with info string "abc~~~"
682        let content = "~~~abc~~~\ncode content\n~~~";
683        let blocks = CodeBlockUtils::detect_code_blocks(content);
684        // This is a valid tilde fence that opens and closes
685        assert_eq!(blocks.len(), 1);
686    }
687
688    #[test]
689    fn test_nested_longer_fence_contains_shorter() {
690        // Longer fence (````) can contain shorter fence (```) as content
691        let content = "````\n```\nnested content\n```\n````";
692        let blocks = CodeBlockUtils::detect_code_blocks(content);
693        assert_eq!(blocks.len(), 1);
694        assert!(content[blocks[0].0..blocks[0].1].contains("nested content"));
695    }
696
697    #[test]
698    fn test_mixed_fence_types() {
699        // Tilde fence contains backtick markers as content
700        let content = "~~~\n```\nmixed content\n~~~";
701        let blocks = CodeBlockUtils::detect_code_blocks(content);
702        assert_eq!(blocks.len(), 1);
703        assert!(content[blocks[0].0..blocks[0].1].contains("mixed content"));
704    }
705
#[test]
fn test_indented_code_in_list_issue_276() {
    // Regression for issue #276: an indented code block nested in a list item
    // must be reported by the pulldown-cmark based detection.
    let content = r#"1. First item
2. Second item with code:

        # This is a code block in a list
        print("Hello, world!")

4. Third item"#;

    let blocks = CodeBlockUtils::detect_code_blocks(content);
    assert!(!blocks.is_empty(), "Should detect indented code block inside list");

    // Concatenate the text of every detected range, then look for the code lines in it.
    let mut all_content = String::new();
    for &(start, end) in &blocks {
        all_content.push_str(&content[start..end]);
    }
    let mentions_code = all_content.contains("code block in a list") || all_content.contains("print");
    assert!(
        mentions_code,
        "Detected block should contain the code content: {all_content:?}"
    );
}
732
#[test]
fn test_detect_markdown_code_blocks() {
    // Three fenced blocks: two tagged as markdown (`markdown`, `md`) and one tagged `rust`.
    let content = r#"# Example

```markdown
# Heading
Content here
```

```md
Another heading
More content
```

```rust
// Not markdown
fn main() {}
```
"#;

    let blocks = CodeBlockUtils::detect_markdown_code_blocks(content);

    // The rust fence must be ignored, leaving exactly the two markdown ones.
    assert_eq!(
        blocks.len(),
        2,
        "Should detect exactly 2 markdown blocks, got {blocks:?}"
    );

    // Blocks are expected in document order: the ```markdown fence first...
    let first_content = {
        let block = &blocks[0];
        &content[block.content_start..block.content_end]
    };
    assert!(
        first_content.contains("# Heading"),
        "First block should contain '# Heading', got: {first_content:?}"
    );

    // ...followed by the ```md fence.
    let second_content = {
        let block = &blocks[1];
        &content[block.content_start..block.content_end]
    };
    assert!(
        second_content.contains("Another heading"),
        "Second block should contain 'Another heading', got: {second_content:?}"
    );
}
778
#[test]
fn test_detect_markdown_code_blocks_empty() {
    // A document without any fence yields no markdown blocks.
    let content = "# Just a heading\n\nNo code blocks here\n";
    let found = CodeBlockUtils::detect_markdown_code_blocks(content).len();
    assert_eq!(found, 0);
}
785
#[test]
fn test_detect_markdown_code_blocks_case_insensitive() {
    // An upper-case `MARKDOWN` info string must still count as a markdown block.
    let content = "```MARKDOWN\nContent\n```\n";
    let found = CodeBlockUtils::detect_markdown_code_blocks(content).len();
    assert_eq!(found, 1);
}
792
#[test]
fn test_detect_markdown_code_blocks_at_eof_no_trailing_newline() {
    // The closing fence is the very last thing in the input: no newline follows it.
    let content = "# Doc\n\n```markdown\nContent\n```";
    let blocks = CodeBlockUtils::detect_markdown_code_blocks(content);
    assert_eq!(blocks.len(), 1);

    // Slicing with the reported range must not panic and must yield the body.
    let only = &blocks[0];
    let block_content = &content[only.content_start..only.content_end];
    assert!(block_content.contains("Content"));
}
803
#[test]
fn test_detect_markdown_code_blocks_single_line_content() {
    // One content line between the fences; the reported range must be exactly that line's text.
    let content = "```markdown\nX\n```\n";
    let blocks = CodeBlockUtils::detect_markdown_code_blocks(content);
    assert_eq!(blocks.len(), 1);

    let only = &blocks[0];
    let block_content = &content[only.content_start..only.content_end];
    assert_eq!(block_content, "X");
}
813
#[test]
fn test_detect_markdown_code_blocks_empty_content() {
    // Nothing sits between the opening and closing fence.
    let content = "```markdown\n```\n";
    let blocks = CodeBlockUtils::detect_markdown_code_blocks(content);

    // Reporting the block with an empty range and omitting it are both acceptable;
    // the only requirements are no panic and, if reported, a well-ordered range.
    if let Some(first) = blocks.first() {
        assert!(first.content_start <= first.content_end);
    }
}
826
#[test]
fn test_detect_markdown_code_blocks_validates_ranges() {
    // Edge-case inputs that must neither panic nor produce out-of-order or out-of-bounds ranges.
    let test_cases = [
        "",                             // Empty content
        "```markdown",                  // Unclosed block
        "```markdown\n",                // Unclosed block with newline
        "```\n```",                     // Non-markdown block
        "```markdown\n```",             // Empty markdown block
        "   ```markdown\n   X\n   ```", // Indented block
    ];

    for &content in &test_cases {
        // Detection itself must not panic on any of the inputs.
        let detected = CodeBlockUtils::detect_markdown_code_blocks(content);

        // Whatever was detected has to describe a valid slice of `content`.
        for block in detected.iter() {
            assert!(
                block.content_start <= block.content_end,
                "Invalid range in content: {content:?}"
            );
            assert!(
                block.content_end <= content.len(),
                "Range exceeds content length in: {content:?}"
            );
        }
    }
}
855
856    // ── is_in_code_block binary search tests ─────────────────────────────
857
#[test]
fn test_is_in_code_block_empty_blocks() {
    // With no ranges at all, every position is outside, including the extremes.
    for pos in [0, 100, usize::MAX] {
        assert!(!CodeBlockUtils::is_in_code_block(&[], pos));
    }
}
864
#[test]
fn test_is_in_code_block_single_range() {
    let blocks = [(10, 20)];

    // Start is inclusive, so 10 through 19 lie inside the range.
    for pos in [10, 15, 19] {
        assert!(CodeBlockUtils::is_in_code_block(&blocks, pos));
    }

    // Everything before the start is outside; the end is exclusive, so 20 is outside too.
    for pos in [0, 9, 20, 21] {
        assert!(!CodeBlockUtils::is_in_code_block(&blocks, pos));
    }
}
877
#[test]
fn test_is_in_code_block_multiple_ranges() {
    let blocks = [(5, 10), (20, 30), (50, 60)];

    // First and last position covered by each of the three ranges.
    for pos in [5, 9, 20, 29, 50, 59] {
        assert!(CodeBlockUtils::is_in_code_block(&blocks, pos));
    }

    // Before all ranges, in both gaps (including each exclusive end), and after all ranges.
    for pos in [0, 4, 10, 15, 19, 30, 49, 60, 1000] {
        assert!(!CodeBlockUtils::is_in_code_block(&blocks, pos));
    }
}
904
#[test]
fn test_is_in_code_block_adjacent_ranges() {
    // Each range ends exactly where the next one starts, so 0 through 29 is covered without gaps.
    let blocks = [(0, 10), (10, 20), (20, 30)];

    for pos in [0, 9, 10, 19, 20, 29] {
        assert!(CodeBlockUtils::is_in_code_block(&blocks, pos));
    }

    // Only the exclusive end of the last range falls outside.
    assert!(!CodeBlockUtils::is_in_code_block(&blocks, 30));
}
917
#[test]
fn test_is_in_code_block_single_byte_range() {
    // A range one byte wide contains its start position and nothing else.
    let blocks = [(5, 6)];

    assert!(CodeBlockUtils::is_in_code_block(&blocks, 5));
    for pos in [4, 6] {
        assert!(!CodeBlockUtils::is_in_code_block(&blocks, pos));
    }
}
925
#[test]
fn test_is_in_code_block_matches_linear_scan() {
    // Cross-check the binary search against a straightforward linear scan over
    // the ranges detected in a realistic document with two fenced blocks.
    let content = "# Heading\n\n```rust\nlet x = 1;\nlet y = 2;\n```\n\nSome text\n\n```\nmore code\n```\n\nEnd\n";
    let blocks = CodeBlockUtils::detect_code_blocks(content);

    // Without this guard the comparison below would pass vacuously if detection
    // returned nothing: both strategies answer `false` for every position.
    assert!(
        !blocks.is_empty(),
        "Expected code blocks to be detected in: {content:?}"
    );

    // Inclusive upper bound so the position one past the last byte is compared as well.
    for pos in 0..=content.len() {
        let binary = CodeBlockUtils::is_in_code_block(&blocks, pos);
        let linear = blocks.iter().any(|&(s, e)| (s..e).contains(&pos));
        assert_eq!(
            binary, linear,
            "Mismatch at pos {pos}: binary={binary}, linear={linear}, blocks={blocks:?}"
        );
    }
}
942
#[test]
fn test_is_in_code_block_at_range_boundaries() {
    // For every range, probe the four positions around its edges:
    // just before the start, the start, the last byte inside, and the exclusive end.
    let blocks = [(100, 200), (300, 400), (500, 600)];
    let in_block = |pos: usize| CodeBlockUtils::is_in_code_block(&blocks, pos);

    for &(start, end) in blocks.iter() {
        assert!(!in_block(start - 1), "pos={} should be outside", start - 1);
        assert!(in_block(start), "pos={start} should be inside");
        assert!(in_block(end - 1), "pos={} should be inside", end - 1);
        assert!(!in_block(end), "pos={end} should be outside");
    }
}
968}