marco_core/parser/blocks/
mod.rs

1//! Block-level parser modules.
2//!
3//! This layer converts block grammar outputs into AST nodes with positions.
4
5/// Shared block parser utilities.
6pub mod shared;
7
8/// CommonMark blockquote parser.
9pub mod cm_blockquote_parser;
10/// CommonMark fenced code block parser.
11pub mod cm_fenced_code_block_parser;
12/// CommonMark heading parsers.
13pub mod cm_heading_parser;
14/// CommonMark HTML block parser.
15pub mod cm_html_blocks_parser;
16/// CommonMark indented code block parser.
17pub mod cm_indented_code_block_parser;
18/// CommonMark link reference definition parser.
19pub mod cm_link_reference_parser;
20/// CommonMark list parser.
21pub mod cm_list_parser;
22/// CommonMark paragraph parser.
23pub mod cm_paragraph_parser;
24/// CommonMark thematic break parser.
25pub mod cm_thematic_break_parser;
26/// GFM alert/admonition post-processing.
27pub mod gfm_admonitions;
28/// GFM footnote definition parser.
29pub mod gfm_footnote_definition_parser;
30/// GFM table parser.
31pub mod gfm_table_parser;
32/// Extended headerless table parser.
33pub mod marco_headerless_table_parser;
34/// Extended slide deck parser.
35pub mod marco_sliders_parser;
36/// Extended tab block parser.
37pub mod marco_tab_blocks_parser;
38
39/// Re-export shared block parser utilities.
40pub use shared::{dedent_list_item_content, to_parser_span, to_parser_span_range, GrammarSpan};
41
42use super::ast::Document;
43use crate::grammar::blocks as grammar;
44use crate::parser::ast::{Node, NodeKind};
45use nom::Input;
46
47// ============================================================================
48// BlockContext: Track open blocks for continuation across blank lines
49// ============================================================================
50
/// The kinds of container block that can stay open across lines.
#[derive(Debug, Clone, PartialEq)]
enum BlockContextKind {
    /// A single item inside a list.
    ///
    /// `content_indent` is the minimum indentation (in columns) a line needs
    /// in order to count as continuation content of the item.
    ListItem { content_indent: usize },
}

/// An open block that may still accept continuation content.
#[derive(Debug, Clone)]
struct BlockContext {
    kind: BlockContextKind,
}

impl BlockContext {
    /// Build the context for a list item whose content starts at
    /// `content_indent` columns.
    pub fn new_list_item(content_indent: usize) -> Self {
        let kind = BlockContextKind::ListItem { content_indent };
        Self { kind }
    }

    /// Report whether a line indented by `indent` columns can continue this block.
    fn can_continue_at(&self, indent: usize) -> bool {
        match &self.kind {
            // A list item continues only when the line reaches its content column.
            BlockContextKind::ListItem { content_indent } => *content_indent <= indent,
        }
    }
}
83
84// ============================================================================
85// ParserState: Stack of open blocks for context-aware parsing
86// ============================================================================
87
88/// Track all currently open block contexts
89struct ParserState {
90    blocks: Vec<BlockContext>,
91    allow_tab_blocks: bool,
92    allow_sliders: bool,
93}
94
95impl ParserState {
96    fn new() -> Self {
97        Self {
98            blocks: Vec::new(),
99            allow_tab_blocks: true,
100            allow_sliders: true,
101        }
102    }
103
104    fn new_with_tab_blocks(allow_tab_blocks: bool) -> Self {
105        Self {
106            blocks: Vec::new(),
107            allow_tab_blocks,
108            allow_sliders: true,
109        }
110    }
111
112    fn new_with_sliders(allow_sliders: bool) -> Self {
113        Self {
114            blocks: Vec::new(),
115            allow_tab_blocks: true,
116            allow_sliders,
117        }
118    }
119
120    /// Add a new block context to the stack
121    pub fn push_block(&mut self, context: BlockContext) {
122        self.blocks.push(context);
123    }
124
125    /// Remove and return the most recent block context
126    fn pop_block(&mut self) -> Option<BlockContext> {
127        self.blocks.pop()
128    }
129
130    /// Check if the current context can continue at the given indent
131    fn can_continue_at(&self, indent: usize) -> bool {
132        if let Some(context) = self.blocks.last() {
133            context.can_continue_at(indent)
134        } else {
135            // No context, can't continue
136            false
137        }
138    }
139
140    /// Close blocks that can't continue at the given indent
141    /// Returns the number of blocks closed
142    fn close_blocks_until_indent(&mut self, indent: usize) -> usize {
143        let mut closed = 0;
144
145        // Close blocks from innermost to outermost
146        while let Some(context) = self.blocks.last() {
147            if context.can_continue_at(indent) {
148                // This block can continue, stop closing
149                break;
150            } else {
151                // This block can't continue, close it
152                self.blocks.pop();
153                closed += 1;
154            }
155        }
156
157        closed
158    }
159}
160
161// ============================================================================
162// Main block parser entry point
163// ============================================================================
164
165/// Parse document into block-level structure, returning a Document
166pub fn parse_blocks(input: &str) -> Result<Document, Box<dyn std::error::Error>> {
167    let mut state = ParserState::new();
168    parse_blocks_internal(input, 0, &mut state)
169}
170
171// Internal parser with recursion depth limit and state tracking
172fn parse_blocks_internal(
173    input: &str,
174    depth: usize,
175    state: &mut ParserState,
176) -> Result<Document, Box<dyn std::error::Error>> {
177    // Prevent infinite recursion
178    const MAX_DEPTH: usize = 100;
179    if depth > MAX_DEPTH {
180        log::warn!("Maximum recursion depth reached in block parser");
181        return Ok(Document::new());
182    }
183
184    log::debug!(
185        "Block parser input: {} bytes at depth {}, state depth: {}",
186        input.len(),
187        depth,
188        state.blocks.len()
189    );
190
191    let mut nodes = Vec::new();
192    let mut document = Document::new(); // Create document early to collect references
193    let mut remaining = GrammarSpan::new(input);
194
195    // Safety: prevent infinite loops.
196    // This must be high enough for real documents; the progress-check below is the
197    // primary safety mechanism.
198    let max_iterations = input.lines().count().saturating_mul(8).max(1_000);
199    let mut iteration_count = 0;
200    let mut last_offset = 0;
201
202    while !remaining.fragment().is_empty() {
203        iteration_count += 1;
204        if iteration_count > max_iterations {
205            log::error!(
206                "Block parser exceeded iteration limit ({}) at depth {}",
207                max_iterations,
208                depth
209            );
210            break;
211        }
212
213        // Safety: ensure we're making progress
214        let current_offset = remaining.location_offset();
215        if current_offset == last_offset && iteration_count > 1 {
216            log::error!(
217                "Block parser not making progress at offset {}, depth {}",
218                current_offset,
219                depth
220            );
221            // Force skip one character, while preserving span offsets.
222            use nom::bytes::complete::take;
223            let skip_len = remaining
224                .fragment()
225                .chars()
226                .next()
227                .map(|c| c.len_utf8())
228                .unwrap_or(1);
229            if let Ok((rest, _)) =
230                take::<_, _, nom::error::Error<GrammarSpan>>(skip_len as u32)(remaining)
231            {
232                remaining = rest;
233                last_offset = remaining.location_offset();
234                continue;
235            }
236            break;
237        }
238        last_offset = current_offset;
239
240        // ========================================================================
241        // BLANK LINE HANDLING WITH CONTEXT AWARENESS (Example 307 fix)
242        // ========================================================================
243        // Extract the first line to check if it's blank
244        let first_line_end = remaining
245            .fragment()
246            .find('\n')
247            .unwrap_or(remaining.fragment().len());
248        let first_line = &remaining.fragment()[..first_line_end];
249
250        // A line is blank per CommonMark spec: only ASCII space (U+0020) and tab (U+0009).
251        // Notably, U+00A0 NO-BREAK SPACE is NOT a blank line — it produces a spacer paragraph.
252        if first_line.chars().all(|c| c == ' ' || c == '\t') {
253            // Peek at the next non-blank line to determine continuation
254            let peek_offset = if first_line_end < remaining.fragment().len() {
255                first_line_end + 1
256            } else {
257                first_line_end
258            };
259
260            // Find the next non-blank line
261            let mut next_nonblank_indent: Option<usize> = None;
262            let rest_of_input = &remaining.fragment()[peek_offset..];
263
264            for peek_line in rest_of_input.lines() {
265                if !peek_line.trim().is_empty() {
266                    // Count leading spaces (expand tabs)
267                    let mut indent = 0;
268                    for ch in peek_line.chars() {
269                        if ch == ' ' {
270                            indent += 1;
271                        } else if ch == '\t' {
272                            indent += 4 - (indent % 4); // Tab to next multiple of 4
273                        } else {
274                            break;
275                        }
276                    }
277                    next_nonblank_indent = Some(indent);
278                    break;
279                }
280            }
281
282            // Determine if we should preserve context or close blocks
283            let should_continue = if let Some(next_indent) = next_nonblank_indent {
284                // Check if the next content can continue the current context
285                state.can_continue_at(next_indent)
286            } else {
287                // No more content, close all contexts
288                false
289            };
290
291            if should_continue {
292                // Blank line continues the current block
293                // Skip the blank but preserve block context
294                log::debug!(
295                    "Blank line: continuing context at indent {:?}",
296                    next_nonblank_indent
297                );
298
299                use nom::bytes::complete::take;
300                let skip_len = if first_line_end < remaining.fragment().len() {
301                    first_line_end + 1 // Include newline
302                } else {
303                    first_line_end
304                };
305
306                if let Ok((new_remaining, _)) =
307                    take::<_, _, nom::error::Error<GrammarSpan>>(skip_len as u32)(remaining)
308                {
309                    remaining = new_remaining;
310                    continue;
311                } else {
312                    break;
313                }
314            } else {
315                // Blank line ends the current context(s)
316                // Close blocks that can't continue at the next indent
317                if let Some(next_indent) = next_nonblank_indent {
318                    let closed = state.close_blocks_until_indent(next_indent);
319                    log::debug!(
320                        "Blank line: closed {} blocks due to indent {}",
321                        closed,
322                        next_indent
323                    );
324                } else {
325                    // No more content, close everything
326                    log::debug!("Blank line: end of input, closing all blocks");
327                    while state.pop_block().is_some() {}
328                }
329
330                // Skip the blank line and continue parsing
331                use nom::bytes::complete::take;
332                let skip_len = if first_line_end < remaining.fragment().len() {
333                    first_line_end + 1
334                } else {
335                    first_line_end
336                };
337
338                if let Ok((new_remaining, _)) =
339                    take::<_, _, nom::error::Error<GrammarSpan>>(skip_len as u32)(remaining)
340                {
341                    remaining = new_remaining;
342                    continue;
343                } else {
344                    break;
345                }
346            }
347        }
348
349        // Try parsing HTML blocks (types 1-7, in order)
350        // Type 1: Special raw content tags (script, pre, style, textarea)
351        if let Ok((rest, content)) = grammar::html_special_tag(remaining) {
352            nodes.push(cm_html_blocks_parser::parse_html_block(content));
353            remaining = rest;
354            continue;
355        }
356
357        // Type 2: HTML comments
358        if let Ok((rest, content)) = grammar::html_comment(remaining) {
359            nodes.push(cm_html_blocks_parser::parse_html_block(content));
360            remaining = rest;
361            continue;
362        }
363
364        // Type 3: Processing instructions
365        if let Ok((rest, content)) = grammar::html_processing_instruction(remaining) {
366            nodes.push(cm_html_blocks_parser::parse_html_block(content));
367            remaining = rest;
368            continue;
369        }
370
371        // Type 4: Declarations
372        if let Ok((rest, content)) = grammar::html_declaration(remaining) {
373            nodes.push(cm_html_blocks_parser::parse_html_block(content));
374            remaining = rest;
375            continue;
376        }
377
378        // Type 5: CDATA sections
379        if let Ok((rest, content)) = grammar::html_cdata(remaining) {
380            nodes.push(cm_html_blocks_parser::parse_html_block(content));
381            remaining = rest;
382            continue;
383        }
384
385        // Type 6: Standard block tags (div, table, etc.)
386        if let Ok((rest, content)) = grammar::html_block_tag(remaining) {
387            nodes.push(cm_html_blocks_parser::parse_html_block(content));
388            remaining = rest;
389            continue;
390        }
391
392        // Type 7: Complete tags (CANNOT interrupt paragraphs)
393        // Try this but it will fail if we're in the middle of paragraph text
394        if let Ok((rest, content)) = grammar::html_complete_tag(remaining) {
395            nodes.push(cm_html_blocks_parser::parse_html_block(content));
396            remaining = rest;
397            continue;
398        } // Try parsing heading
399        if let Ok((rest, (level, content))) = grammar::heading(remaining) {
400            nodes.push(cm_heading_parser::parse_atx_heading(level, content));
401            remaining = rest;
402            continue;
403        }
404
405        // Try parsing fenced code block
406        if let Ok((rest, (language, content))) = grammar::fenced_code_block(remaining) {
407            nodes.push(cm_fenced_code_block_parser::parse_fenced_code_block(
408                language, content,
409            ));
410            remaining = rest;
411            continue;
412        }
413
414        // Try parsing thematic break (---, ***, ___)
415        if let Ok((rest, content)) = grammar::thematic_break(remaining) {
416            nodes.push(cm_thematic_break_parser::parse_thematic_break(content));
417            remaining = rest;
418            continue;
419        }
420
421        // Try parsing block quote (lines starting with >)
422        if let Ok((rest, content)) = grammar::blockquote(remaining) {
423            let node =
424                cm_blockquote_parser::parse_blockquote(content, depth, |cleaned, new_depth| {
425                    parse_blocks_internal(cleaned, new_depth, state)
426                })?;
427
428            nodes.push(node);
429            remaining = rest;
430            continue;
431        }
432
433        // Try parsing indented code block (4 spaces or 1 tab)
434        // NOTE: Must come BEFORE lists to avoid indented code being consumed as list content
435        if let Ok((rest, content)) = grammar::indented_code_block(remaining) {
436            nodes.push(cm_indented_code_block_parser::parse_indented_code_block(
437                content,
438            ));
439            remaining = rest;
440            continue;
441        }
442
443        // Try parsing list
444        // NOTE: Must come BEFORE setext heading to avoid "---" being parsed as underline
445        if let Ok((rest, items)) = grammar::list(remaining) {
446            let node = cm_list_parser::parse_list(
447                items,
448                depth,
449                parse_blocks_internal,
450                |content_indent| {
451                    let mut item_state = ParserState::new();
452                    item_state.push_block(BlockContext::new_list_item(content_indent));
453                    item_state
454                },
455            )?;
456
457            nodes.push(node);
458            remaining = rest;
459            continue;
460        }
461
462        // Try parsing extended slide decks.
463        // Must come BEFORE setext heading. Otherwise, the internal `---` / `--`
464        // separators can be consumed as setext underlines and the deck is lost.
465        if state.allow_sliders {
466            let deck_start = remaining;
467            if let Ok((rest, deck)) = grammar::marco_slide_deck(remaining) {
468                let node = marco_sliders_parser::parse_marco_slide_deck(
469                    deck,
470                    deck_start,
471                    rest,
472                    depth,
473                    |slide_body, new_depth| {
474                        // Slides support arbitrary markdown, but nested
475                        // `@slidestart` decks are disallowed.
476                        let mut slide_state = ParserState::new_with_sliders(false);
477                        parse_blocks_internal(slide_body, new_depth, &mut slide_state)
478                    },
479                )?;
480
481                nodes.push(node);
482                remaining = rest;
483                continue;
484            }
485        }
486
487        // Try parsing Setext heading (underline style: === or ---)
488        // NOTE: Must come AFTER lists to avoid eating list marker patterns like "- foo\n---"
489        let full_start = remaining;
490        if let Ok((rest, (level, content))) = grammar::setext_heading(remaining) {
491            let full_end = rest;
492            nodes.push(cm_heading_parser::parse_setext_heading(
493                level, content, full_start, full_end,
494            ));
495            remaining = rest;
496            continue;
497        }
498
499        // Try parsing link reference definition
500        // Must come BEFORE paragraph to avoid treating definitions as paragraphs
501        if let Some((rest, node)) =
502            gfm_footnote_definition_parser::parse_footnote_definition(remaining)
503        {
504            nodes.push(node);
505            remaining = rest;
506            continue;
507        }
508
509        if let Ok((rest, (label, url, title))) = grammar::link_reference_definition(remaining) {
510            cm_link_reference_parser::parse_link_reference(&mut document, &label, url, title);
511            remaining = rest;
512            continue;
513        }
514
515        // Try parsing GFM pipe table (extension)
516        // Must come BEFORE paragraph so tables aren't consumed as plain text.
517        //
518        // Also try parsing extended "headerless" pipe tables (delimiter-first).
519        // Must come BEFORE paragraph for the same reason.
520        let headerless_table_start = remaining;
521        if let Ok((rest, table)) = grammar::headerless_table(remaining) {
522            nodes.push(marco_headerless_table_parser::parse_marco_headerless_table(
523                table,
524                headerless_table_start,
525                rest,
526            ));
527            remaining = rest;
528            continue;
529        }
530
531        let table_start = remaining;
532        if let Ok((rest, table)) = grammar::gfm_table(remaining) {
533            nodes.push(gfm_table_parser::parse_gfm_table(table, table_start, rest));
534            remaining = rest;
535            continue;
536        }
537
538        // Try parsing extended tab blocks.
539        // Must come BEFORE paragraph so the container isn't consumed as plain text.
540        if state.allow_tab_blocks {
541            let tab_start = remaining;
542            if let Ok((rest, block)) = grammar::marco_tab_block(remaining) {
543                let node = marco_tab_blocks_parser::parse_marco_tab_block(
544                    block,
545                    tab_start,
546                    rest,
547                    depth,
548                    |panel, new_depth| {
549                        // Tabs must support arbitrary markdown in each panel, but nested
550                        // `:::tab` containers are disallowed. We implement that by
551                        // disabling tab parsing while parsing the panel body.
552                        let mut panel_state = ParserState::new_with_tab_blocks(false);
553                        parse_blocks_internal(panel, new_depth, &mut panel_state)
554                    },
555                )?;
556
557                nodes.push(node);
558                remaining = rest;
559                continue;
560            }
561        }
562
563        // Try parsing extended definition lists (Markdown Guide / Markdown Extra-style)
564        // Must come BEFORE paragraph so definition lists aren't consumed as plain text.
565        if let Some((rest, node)) = parse_extended_definition_list(remaining, depth) {
566            nodes.push(node);
567            remaining = rest;
568            continue;
569        }
570
571        // Try parsing paragraph
572        if let Ok((rest, content)) = grammar::paragraph(remaining) {
573            nodes.push(cm_paragraph_parser::parse_paragraph(content));
574            remaining = rest;
575            continue;
576        }
577
578        // If nothing matched, skip one character to avoid infinite loop.
579        // Use `take` so we preserve nom_locate offsets (important for spans/highlights).
580        log::warn!(
581            "Could not parse block at offset {}, skipping character",
582            remaining.location_offset()
583        );
584        use nom::bytes::complete::take;
585        let skip_len = remaining
586            .fragment()
587            .chars()
588            .next()
589            .map(|c| c.len_utf8())
590            .unwrap_or(1);
591        if let Ok((rest, _)) =
592            take::<_, _, nom::error::Error<GrammarSpan>>(skip_len as u32)(remaining)
593        {
594            remaining = rest;
595        } else {
596            break;
597        }
598    }
599
600    log::info!("Parsed {} blocks", nodes.len());
601
602    // Add parsed nodes to document
603    document.children = nodes;
604    Ok(document)
605}
606
/// Attempt to parse a Markdown Guide extended definition list at the current input.
///
/// Syntax (canonical):
/// ```text
/// Term
/// : definition
///
/// Another term
/// : first definition
/// : second definition
/// ```
///
/// Supported extensions:
/// - Multiple `: ...` definition lines per term
/// - Multiple term groups in a single list, with optional blank lines between items
/// - Multi-line definition bodies via indented continuation lines (>= 2 spaces)
/// - Nested blocks inside a definition (via recursive block parsing after dedent)
///
/// Explicit non-goals / disambiguation:
/// - Lines starting with `::` are *not* treated as definition markers.
///
/// Parameters:
/// - `input`: span positioned at the candidate term line.
/// - `depth`: current block-parser recursion depth; definition bodies are parsed
///   at `depth + 1`.
///
/// Returns `Some((rest, node))` where `node` is a `NodeKind::DefinitionList` whose
/// children alternate `DefinitionTerm` / `DefinitionDescription` nodes and `rest`
/// is the input after the list, or `None` when no term + definition pair starts
/// at `input`.
fn parse_extended_definition_list<'a>(
    input: GrammarSpan<'a>,
    depth: usize,
) -> Option<(GrammarSpan<'a>, Node)> {
    // We only match at a non-blank line; blank lines are already handled by the main loop.
    let text = input.fragment();
    if text.is_empty() {
        return None;
    }

    // Minimum indentation (in columns) for a line to continue a definition body.
    const CONTINUATION_INDENT: usize = 2;

    // Locate the line that begins at byte offset `start` of `s`.
    fn line_bounds(s: &str, start: usize) -> (usize, usize, usize) {
        // Returns: (line_start, line_end_no_nl, next_start)
        let rel_end = s[start..].find('\n').map(|i| start + i).unwrap_or(s.len());
        // Step over the newline unless the line ends at end of input.
        let next = if rel_end < s.len() {
            rel_end + 1
        } else {
            rel_end
        };
        (start, rel_end, next)
    }

    fn count_indent_columns(line: &str) -> usize {
        // Count leading indentation, expanding tabs to 4-wide tab stops.
        let mut indent = 0usize;
        for ch in line.chars() {
            if ch == ' ' {
                indent += 1;
            } else if ch == '\t' {
                indent += 4 - (indent % 4);
            } else {
                break;
            }
        }
        indent
    }

    // If `line` is a definition marker line, return the byte offset within the
    // line at which the definition content starts; otherwise `None`.
    fn def_marker_content_start(line: &str) -> Option<usize> {
        // Optional leading spaces (up to 3) are allowed.
        let bytes = line.as_bytes();
        let mut i = 0usize;
        for _ in 0..3 {
            if bytes.get(i) == Some(&b' ') {
                i += 1;
            } else {
                break;
            }
        }

        if bytes.get(i) != Some(&b':') {
            return None;
        }
        // Disallow "::" (reserved for other extensions / lookalikes).
        if bytes.get(i + 1) == Some(&b':') {
            return None;
        }

        // Require at least one whitespace after ':' (Markdown Guide uses ': ')
        match bytes.get(i + 1) {
            Some(b' ') | Some(b'\t') => {
                // Strip exactly one whitespace after the marker; any extra stays as content.
                Some(i + 2)
            }
            _ => None,
        }
    }

    // True when a non-blank line at `start` is immediately followed by a
    // definition marker line, i.e. a new term group can begin there.
    fn can_start_item_at(text: &str, start: usize) -> bool {
        if start >= text.len() {
            return false;
        }
        let (_t0s, t0e, t1s) = line_bounds(text, start);
        let term_line = &text[start..t0e];
        if term_line.trim().is_empty() {
            return false;
        }
        // The term is the last line of input: nothing can define it.
        if t1s >= text.len() {
            return false;
        }
        let (_d0s, d0e, _d1s) = line_bounds(text, t1s);
        let def_line = &text[t1s..d0e];
        def_marker_content_start(def_line).is_some()
    }

    // We build a single <dl> node, potentially containing multiple term groups.
    let mut children: Vec<Node> = Vec::new();
    // Byte offset into `text` up to which input has been consumed by the list.
    let mut cursor = 0usize;
    // Set once at least one definition description has been produced.
    let mut parsed_any = false;

    // Parse one or more items.
    loop {
        if cursor >= text.len() {
            break;
        }

        // Parse term line.
        let (term_start, term_end, after_term) = line_bounds(text, cursor);
        let term_line = &text[term_start..term_end];

        // If we're at a blank line here, it means we consumed optional blanks between items.
        // Stop the list; the main loop will handle blanks.
        if term_line.trim().is_empty() {
            break;
        }

        // Term must be followed immediately by at least one definition marker line.
        if after_term >= text.len() {
            break;
        }

        let (def_line_start, def_line_end, _after_def_line) = line_bounds(text, after_term);
        let first_def_line = &text[def_line_start..def_line_end];
        if def_marker_content_start(first_def_line).is_none() {
            break;
        }

        // Build the <dt> node.
        // Spans are sliced out of `input` so the node span is derived from the
        // original input span rather than from a copied string.
        let term_start_span = input.take_from(term_start);
        let (term_after_span, term_taken_span) = term_start_span.take_split(term_end - term_start);
        let term_children = match crate::parser::inlines::parse_inlines_from_span(term_taken_span) {
            Ok(children) => children,
            Err(e) => {
                // Inline parsing failed: fall back to the raw term text as one Text node.
                log::warn!("Failed to parse inline elements in definition term: {}", e);
                vec![Node {
                    kind: NodeKind::Text(term_taken_span.fragment().to_string()),
                    span: crate::parser::shared::opt_span(term_taken_span),
                    children: Vec::new(),
                }]
            }
        };

        children.push(Node {
            kind: NodeKind::DefinitionTerm,
            span: crate::parser::shared::opt_span_range(term_start_span, term_after_span),
            children: term_children,
        });

        // Parse one or more definitions for this term.
        cursor = after_term;
        while cursor < text.len() {
            let (line_start, line_end, next_line_start) = line_bounds(text, cursor);
            let line = &text[line_start..line_end];

            // A line without a definition marker ends this term's definitions.
            let content_start_in_line = match def_marker_content_start(line) {
                Some(i) => i,
                None => break,
            };

            // Definition block span starts at the marker line.
            let def_block_start = line_start;
            let mut def_block_end = next_line_start;

            // Build raw definition body text: first line after ": ", then indented continuations.
            let mut raw_lines: Vec<&str> = Vec::new();
            raw_lines.push(&line[content_start_in_line..]);

            // Scan forward for lines that belong to this definition body.
            let mut scan = next_line_start;
            while scan < text.len() {
                let (ls, le, ln) = line_bounds(text, scan);
                let l = &text[ls..le];

                // Next definition marker starts a new <dd>.
                if def_marker_content_start(l).is_some() {
                    break;
                }

                if l.trim().is_empty() {
                    // Only treat a blank line as part of this definition if the
                    // next non-blank line is indented enough to continue.
                    let mut look = ln;
                    let mut next_indent: Option<usize> = None;
                    while look < text.len() {
                        let (_pls, ple, pln) = line_bounds(text, look);
                        let pl = &text[look..ple];
                        if !pl.trim().is_empty() {
                            next_indent = Some(count_indent_columns(pl));
                            break;
                        }
                        look = pln;
                    }

                    // With no further non-blank line, the indent defaults to 0
                    // and the blank line is left out of the definition.
                    if next_indent.unwrap_or(0) >= CONTINUATION_INDENT {
                        raw_lines.push("");
                        scan = ln;
                        def_block_end = scan;
                        continue;
                    }

                    break;
                }

                // Non-blank, non-marker line: it continues the body only when
                // indented at least CONTINUATION_INDENT columns.
                let indent = count_indent_columns(l);
                if indent >= CONTINUATION_INDENT {
                    raw_lines.push(l);
                    scan = ln;
                    def_block_end = scan;
                    continue;
                }

                break;
            }

            let raw_body = raw_lines.join("\n");
            // NOTE(review): presumably strips up to CONTINUATION_INDENT columns of
            // indentation from the body lines — confirm against `shared`.
            let dedented = dedent_list_item_content(&raw_body, CONTINUATION_INDENT);

            // Parse the definition body as nested blocks.
            // The body is parsed from the dedented copy with a fresh state seeded
            // with a list-item context at CONTINUATION_INDENT.
            let mut def_state = ParserState::new();
            def_state.push_block(BlockContext::new_list_item(CONTINUATION_INDENT));
            let def_children = match parse_blocks_internal(&dedented, depth + 1, &mut def_state) {
                Ok(doc) => doc.children,
                Err(e) => {
                    // Keep the <dd> but leave it empty when its body fails to parse.
                    log::warn!("Failed to parse definition description blocks: {}", e);
                    Vec::new()
                }
            };

            // The <dd> span runs from the marker line to the end of its last body line.
            let dd_start_span = input.take_from(def_block_start);
            let dd_end_span = input.take_from(def_block_end);
            children.push(Node {
                kind: NodeKind::DefinitionDescription,
                span: crate::parser::shared::opt_span_range(dd_start_span, dd_end_span),
                children: def_children,
            });

            parsed_any = true;
            cursor = def_block_end;
        }

        // Between items, allow blank lines *only if* another valid item follows.
        let mut scan = cursor;
        while scan < text.len() {
            let (_ls, le, ln) = line_bounds(text, scan);
            let l = &text[scan..le];
            if !l.trim().is_empty() {
                break;
            }
            scan = ln;
        }

        // `scan != cursor` means at least one blank line was skipped; a term that
        // follows with no blank line in between does not extend this list.
        if scan != cursor && can_start_item_at(text, scan) {
            cursor = scan;
            continue;
        }

        break;
    }

    if !parsed_any {
        return None;
    }

    // Everything before `cursor` belongs to the list; the remainder is handed back.
    let (rest, _taken) = input.take_split(cursor);
    let span = crate::parser::shared::opt_span_range(input, rest);
    Some((
        rest,
        Node {
            kind: NodeKind::DefinitionList,
            span,
            children,
        },
    ))
}
890
#[cfg(test)]
mod tests {
    use super::parse_blocks;
    use crate::parser::ast::NodeKind;

    /// Regression test: an earlier iteration cap (100) could truncate parsing of
    /// realistic documents, which in turn truncated syntax highlighting.
    #[test]
    fn smoke_test_block_parser_handles_large_documents() {
        let count = 250;
        // One paragraph per entry, each followed by a blank line.
        let input: String = (0..count)
            .map(|i| format!("Paragraph {i}\n\n"))
            .collect();

        let doc = parse_blocks(&input).expect("parse_blocks failed");
        assert_eq!(doc.children.len(), count);

        let last = doc.children.last().unwrap();
        assert!(matches!(last.kind, NodeKind::Paragraph));
    }
}
marco_core/parser/blocks/mod.rs

marco_core/parser/blocks/
mod.rs