marco_core/parser/blocks/
mod.rs

// Block-level parser modules
//
// This module contains individual block parser functions that convert
// grammar output into AST nodes with proper positioning.
//
// Phase 3: Parser module extraction - COMPLETE
7
8// Shared utilities
9pub mod shared;
10
11// Individual block parsers
12pub mod cm_blockquote_parser;
13pub mod cm_fenced_code_block_parser;
14pub mod cm_heading_parser;
15pub mod cm_html_blocks_parser;
16pub mod cm_indented_code_block_parser;
17pub mod cm_link_reference_parser;
18pub mod cm_list_parser;
19pub mod cm_paragraph_parser;
20pub mod cm_thematic_break_parser;
21pub mod gfm_admonitions;
22pub mod gfm_footnote_definition_parser;
23pub mod gfm_table_parser;
24pub mod marco_headerless_table_parser;
25pub mod marco_sliders_parser;
26pub mod marco_tab_blocks_parser;
27
28// Re-export shared utilities
29pub use shared::{dedent_list_item_content, to_parser_span, to_parser_span_range, GrammarSpan};
30
31use super::ast::Document;
32use crate::grammar::blocks as grammar;
33use crate::parser::ast::{Node, NodeKind};
34use nom::Input;
35
// ============================================================================
// BlockContext: Track open blocks for continuation across blank lines
// ============================================================================
39
/// Type of block that's currently open.
///
/// The enum is plain data (a single `usize` payload), so it is `Copy` and `Eq`
/// in addition to the derives it always had.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum BlockContextKind {
    /// Individual list item within a list.
    ///
    /// `content_indent` is the minimum number of indentation columns a line
    /// needs in order to continue the item's content.
    ListItem { content_indent: usize },
}

/// Represents an open block that can accept continuation content.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct BlockContext {
    /// What kind of block is open, together with its continuation rule data.
    kind: BlockContextKind,
}

impl BlockContext {
    /// Create a new list item context with the given content indent.
    pub fn new_list_item(content_indent: usize) -> Self {
        Self {
            kind: BlockContextKind::ListItem { content_indent },
        }
    }

    /// Check if this block can continue at the given indent level.
    ///
    /// Returns `true` when `indent` is at least the block's required indent.
    fn can_continue_at(&self, indent: usize) -> bool {
        // Exhaustive on purpose: a new block kind must define its own rule here.
        match self.kind {
            // List item content must be indented at least to `content_indent`.
            BlockContextKind::ListItem { content_indent } => indent >= content_indent,
        }
    }
}
72
// ============================================================================
// ParserState: Stack of open blocks for context-aware parsing
// ============================================================================
76
77/// Track all currently open block contexts
78struct ParserState {
79    blocks: Vec<BlockContext>,
80    allow_tab_blocks: bool,
81    allow_sliders: bool,
82}
83
84impl ParserState {
85    fn new() -> Self {
86        Self {
87            blocks: Vec::new(),
88            allow_tab_blocks: true,
89            allow_sliders: true,
90        }
91    }
92
93    fn new_with_tab_blocks(allow_tab_blocks: bool) -> Self {
94        Self {
95            blocks: Vec::new(),
96            allow_tab_blocks,
97            allow_sliders: true,
98        }
99    }
100
101    fn new_with_sliders(allow_sliders: bool) -> Self {
102        Self {
103            blocks: Vec::new(),
104            allow_tab_blocks: true,
105            allow_sliders,
106        }
107    }
108
109    /// Add a new block context to the stack
110    pub fn push_block(&mut self, context: BlockContext) {
111        self.blocks.push(context);
112    }
113
114    /// Remove and return the most recent block context
115    fn pop_block(&mut self) -> Option<BlockContext> {
116        self.blocks.pop()
117    }
118
119    /// Check if the current context can continue at the given indent
120    fn can_continue_at(&self, indent: usize) -> bool {
121        if let Some(context) = self.blocks.last() {
122            context.can_continue_at(indent)
123        } else {
124            // No context, can't continue
125            false
126        }
127    }
128
129    /// Close blocks that can't continue at the given indent
130    /// Returns the number of blocks closed
131    fn close_blocks_until_indent(&mut self, indent: usize) -> usize {
132        let mut closed = 0;
133
134        // Close blocks from innermost to outermost
135        while let Some(context) = self.blocks.last() {
136            if context.can_continue_at(indent) {
137                // This block can continue, stop closing
138                break;
139            } else {
140                // This block can't continue, close it
141                self.blocks.pop();
142                closed += 1;
143            }
144        }
145
146        closed
147    }
148}
149
// ============================================================================
// Main block parser entry point
// ============================================================================
153
154/// Parse document into block-level structure, returning a Document
155pub fn parse_blocks(input: &str) -> Result<Document, Box<dyn std::error::Error>> {
156    let mut state = ParserState::new();
157    parse_blocks_internal(input, 0, &mut state)
158}
159
160// Internal parser with recursion depth limit and state tracking
161fn parse_blocks_internal(
162    input: &str,
163    depth: usize,
164    state: &mut ParserState,
165) -> Result<Document, Box<dyn std::error::Error>> {
166    // Prevent infinite recursion
167    const MAX_DEPTH: usize = 100;
168    if depth > MAX_DEPTH {
169        log::warn!("Maximum recursion depth reached in block parser");
170        return Ok(Document::new());
171    }
172
173    log::debug!(
174        "Block parser input: {} bytes at depth {}, state depth: {}",
175        input.len(),
176        depth,
177        state.blocks.len()
178    );
179
180    let mut nodes = Vec::new();
181    let mut document = Document::new(); // Create document early to collect references
182    let mut remaining = GrammarSpan::new(input);
183
184    // Safety: prevent infinite loops.
185    // This must be high enough for real documents; the progress-check below is the
186    // primary safety mechanism.
187    let max_iterations = input.lines().count().saturating_mul(8).max(1_000);
188    let mut iteration_count = 0;
189    let mut last_offset = 0;
190
191    while !remaining.fragment().is_empty() {
192        iteration_count += 1;
193        if iteration_count > max_iterations {
194            log::error!(
195                "Block parser exceeded iteration limit ({}) at depth {}",
196                max_iterations,
197                depth
198            );
199            break;
200        }
201
202        // Safety: ensure we're making progress
203        let current_offset = remaining.location_offset();
204        if current_offset == last_offset && iteration_count > 1 {
205            log::error!(
206                "Block parser not making progress at offset {}, depth {}",
207                current_offset,
208                depth
209            );
210            // Force skip one character, while preserving span offsets.
211            use nom::bytes::complete::take;
212            let skip_len = remaining
213                .fragment()
214                .chars()
215                .next()
216                .map(|c| c.len_utf8())
217                .unwrap_or(1);
218            if let Ok((rest, _)) =
219                take::<_, _, nom::error::Error<GrammarSpan>>(skip_len as u32)(remaining)
220            {
221                remaining = rest;
222                last_offset = remaining.location_offset();
223                continue;
224            }
225            break;
226        }
227        last_offset = current_offset;
228
229        // ========================================================================
230        // BLANK LINE HANDLING WITH CONTEXT AWARENESS (Example 307 fix)
231        // ========================================================================
232        // Extract the first line to check if it's blank
233        let first_line_end = remaining
234            .fragment()
235            .find('\n')
236            .unwrap_or(remaining.fragment().len());
237        let first_line = &remaining.fragment()[..first_line_end];
238
239        // A line is blank per CommonMark spec: only ASCII space (U+0020) and tab (U+0009).
240        // Notably, U+00A0 NO-BREAK SPACE is NOT a blank line — it produces a spacer paragraph.
241        if first_line.chars().all(|c| c == ' ' || c == '\t') {
242            // Peek at the next non-blank line to determine continuation
243            let peek_offset = if first_line_end < remaining.fragment().len() {
244                first_line_end + 1
245            } else {
246                first_line_end
247            };
248
249            // Find the next non-blank line
250            let mut next_nonblank_indent: Option<usize> = None;
251            let rest_of_input = &remaining.fragment()[peek_offset..];
252
253            for peek_line in rest_of_input.lines() {
254                if !peek_line.trim().is_empty() {
255                    // Count leading spaces (expand tabs)
256                    let mut indent = 0;
257                    for ch in peek_line.chars() {
258                        if ch == ' ' {
259                            indent += 1;
260                        } else if ch == '\t' {
261                            indent += 4 - (indent % 4); // Tab to next multiple of 4
262                        } else {
263                            break;
264                        }
265                    }
266                    next_nonblank_indent = Some(indent);
267                    break;
268                }
269            }
270
271            // Determine if we should preserve context or close blocks
272            let should_continue = if let Some(next_indent) = next_nonblank_indent {
273                // Check if the next content can continue the current context
274                state.can_continue_at(next_indent)
275            } else {
276                // No more content, close all contexts
277                false
278            };
279
280            if should_continue {
281                // Blank line continues the current block
282                // Skip the blank but preserve block context
283                log::debug!(
284                    "Blank line: continuing context at indent {:?}",
285                    next_nonblank_indent
286                );
287
288                use nom::bytes::complete::take;
289                let skip_len = if first_line_end < remaining.fragment().len() {
290                    first_line_end + 1 // Include newline
291                } else {
292                    first_line_end
293                };
294
295                if let Ok((new_remaining, _)) =
296                    take::<_, _, nom::error::Error<GrammarSpan>>(skip_len as u32)(remaining)
297                {
298                    remaining = new_remaining;
299                    continue;
300                } else {
301                    break;
302                }
303            } else {
304                // Blank line ends the current context(s)
305                // Close blocks that can't continue at the next indent
306                if let Some(next_indent) = next_nonblank_indent {
307                    let closed = state.close_blocks_until_indent(next_indent);
308                    log::debug!(
309                        "Blank line: closed {} blocks due to indent {}",
310                        closed,
311                        next_indent
312                    );
313                } else {
314                    // No more content, close everything
315                    log::debug!("Blank line: end of input, closing all blocks");
316                    while state.pop_block().is_some() {}
317                }
318
319                // Skip the blank line and continue parsing
320                use nom::bytes::complete::take;
321                let skip_len = if first_line_end < remaining.fragment().len() {
322                    first_line_end + 1
323                } else {
324                    first_line_end
325                };
326
327                if let Ok((new_remaining, _)) =
328                    take::<_, _, nom::error::Error<GrammarSpan>>(skip_len as u32)(remaining)
329                {
330                    remaining = new_remaining;
331                    continue;
332                } else {
333                    break;
334                }
335            }
336        }
337
338        // Try parsing HTML blocks (types 1-7, in order)
339        // Type 1: Special raw content tags (script, pre, style, textarea)
340        if let Ok((rest, content)) = grammar::html_special_tag(remaining) {
341            nodes.push(cm_html_blocks_parser::parse_html_block(content));
342            remaining = rest;
343            continue;
344        }
345
346        // Type 2: HTML comments
347        if let Ok((rest, content)) = grammar::html_comment(remaining) {
348            nodes.push(cm_html_blocks_parser::parse_html_block(content));
349            remaining = rest;
350            continue;
351        }
352
353        // Type 3: Processing instructions
354        if let Ok((rest, content)) = grammar::html_processing_instruction(remaining) {
355            nodes.push(cm_html_blocks_parser::parse_html_block(content));
356            remaining = rest;
357            continue;
358        }
359
360        // Type 4: Declarations
361        if let Ok((rest, content)) = grammar::html_declaration(remaining) {
362            nodes.push(cm_html_blocks_parser::parse_html_block(content));
363            remaining = rest;
364            continue;
365        }
366
367        // Type 5: CDATA sections
368        if let Ok((rest, content)) = grammar::html_cdata(remaining) {
369            nodes.push(cm_html_blocks_parser::parse_html_block(content));
370            remaining = rest;
371            continue;
372        }
373
374        // Type 6: Standard block tags (div, table, etc.)
375        if let Ok((rest, content)) = grammar::html_block_tag(remaining) {
376            nodes.push(cm_html_blocks_parser::parse_html_block(content));
377            remaining = rest;
378            continue;
379        }
380
381        // Type 7: Complete tags (CANNOT interrupt paragraphs)
382        // Try this but it will fail if we're in the middle of paragraph text
383        if let Ok((rest, content)) = grammar::html_complete_tag(remaining) {
384            nodes.push(cm_html_blocks_parser::parse_html_block(content));
385            remaining = rest;
386            continue;
387        } // Try parsing heading
388        if let Ok((rest, (level, content))) = grammar::heading(remaining) {
389            nodes.push(cm_heading_parser::parse_atx_heading(level, content));
390            remaining = rest;
391            continue;
392        }
393
394        // Try parsing fenced code block
395        if let Ok((rest, (language, content))) = grammar::fenced_code_block(remaining) {
396            nodes.push(cm_fenced_code_block_parser::parse_fenced_code_block(
397                language, content,
398            ));
399            remaining = rest;
400            continue;
401        }
402
403        // Try parsing thematic break (---, ***, ___)
404        if let Ok((rest, content)) = grammar::thematic_break(remaining) {
405            nodes.push(cm_thematic_break_parser::parse_thematic_break(content));
406            remaining = rest;
407            continue;
408        }
409
410        // Try parsing block quote (lines starting with >)
411        if let Ok((rest, content)) = grammar::blockquote(remaining) {
412            let node =
413                cm_blockquote_parser::parse_blockquote(content, depth, |cleaned, new_depth| {
414                    parse_blocks_internal(cleaned, new_depth, state)
415                })?;
416
417            nodes.push(node);
418            remaining = rest;
419            continue;
420        }
421
422        // Try parsing indented code block (4 spaces or 1 tab)
423        // NOTE: Must come BEFORE lists to avoid indented code being consumed as list content
424        if let Ok((rest, content)) = grammar::indented_code_block(remaining) {
425            nodes.push(cm_indented_code_block_parser::parse_indented_code_block(
426                content,
427            ));
428            remaining = rest;
429            continue;
430        }
431
432        // Try parsing list
433        // NOTE: Must come BEFORE setext heading to avoid "---" being parsed as underline
434        if let Ok((rest, items)) = grammar::list(remaining) {
435            let node = cm_list_parser::parse_list(
436                items,
437                depth,
438                parse_blocks_internal,
439                |content_indent| {
440                    let mut item_state = ParserState::new();
441                    item_state.push_block(BlockContext::new_list_item(content_indent));
442                    item_state
443                },
444            )?;
445
446            nodes.push(node);
447            remaining = rest;
448            continue;
449        }
450
451        // Try parsing Marco sliders (extension)
452        // Must come BEFORE setext heading. Otherwise, the internal `---` / `--`
453        // separators can be consumed as setext underlines and the deck is lost.
454        if state.allow_sliders {
455            let deck_start = remaining;
456            if let Ok((rest, deck)) = grammar::marco_slide_deck(remaining) {
457                let node = marco_sliders_parser::parse_marco_slide_deck(
458                    deck,
459                    deck_start,
460                    rest,
461                    depth,
462                    |slide_body, new_depth| {
463                        // Slides support arbitrary markdown, but nested
464                        // `@slidestart` decks are disallowed.
465                        let mut slide_state = ParserState::new_with_sliders(false);
466                        parse_blocks_internal(slide_body, new_depth, &mut slide_state)
467                    },
468                )?;
469
470                nodes.push(node);
471                remaining = rest;
472                continue;
473            }
474        }
475
476        // Try parsing Setext heading (underline style: === or ---)
477        // NOTE: Must come AFTER lists to avoid eating list marker patterns like "- foo\n---"
478        let full_start = remaining;
479        if let Ok((rest, (level, content))) = grammar::setext_heading(remaining) {
480            let full_end = rest;
481            nodes.push(cm_heading_parser::parse_setext_heading(
482                level, content, full_start, full_end,
483            ));
484            remaining = rest;
485            continue;
486        }
487
488        // Try parsing link reference definition
489        // Must come BEFORE paragraph to avoid treating definitions as paragraphs
490        if let Some((rest, node)) =
491            gfm_footnote_definition_parser::parse_footnote_definition(remaining)
492        {
493            nodes.push(node);
494            remaining = rest;
495            continue;
496        }
497
498        if let Ok((rest, (label, url, title))) = grammar::link_reference_definition(remaining) {
499            cm_link_reference_parser::parse_link_reference(&mut document, &label, url, title);
500            remaining = rest;
501            continue;
502        }
503
504        // Try parsing GFM pipe table (extension)
505        // Must come BEFORE paragraph so tables aren't consumed as plain text.
506        //
507        // Also try parsing Marco "headerless" pipe tables (delimiter-first).
508        // Must come BEFORE paragraph for the same reason.
509        let headerless_table_start = remaining;
510        if let Ok((rest, table)) = grammar::marco_headerless_table(remaining) {
511            nodes.push(marco_headerless_table_parser::parse_marco_headerless_table(
512                table,
513                headerless_table_start,
514                rest,
515            ));
516            remaining = rest;
517            continue;
518        }
519
520        let table_start = remaining;
521        if let Ok((rest, table)) = grammar::gfm_table(remaining) {
522            nodes.push(gfm_table_parser::parse_gfm_table(table, table_start, rest));
523            remaining = rest;
524            continue;
525        }
526
527        // Try parsing Marco extended tab blocks (extension)
528        // Must come BEFORE paragraph so the container isn't consumed as plain text.
529        if state.allow_tab_blocks {
530            let tab_start = remaining;
531            if let Ok((rest, block)) = grammar::marco_tab_block(remaining) {
532                let node = marco_tab_blocks_parser::parse_marco_tab_block(
533                    block,
534                    tab_start,
535                    rest,
536                    depth,
537                    |panel, new_depth| {
538                        // Tabs must support arbitrary markdown in each panel, but nested
539                        // `:::tab` containers are disallowed. We implement that by
540                        // disabling tab parsing while parsing the panel body.
541                        let mut panel_state = ParserState::new_with_tab_blocks(false);
542                        parse_blocks_internal(panel, new_depth, &mut panel_state)
543                    },
544                )?;
545
546                nodes.push(node);
547                remaining = rest;
548                continue;
549            }
550        }
551
552        // Try parsing extended definition lists (Markdown Guide / Markdown Extra-style)
553        // Must come BEFORE paragraph so definition lists aren't consumed as plain text.
554        if let Some((rest, node)) = parse_extended_definition_list(remaining, depth) {
555            nodes.push(node);
556            remaining = rest;
557            continue;
558        }
559
560        // Try parsing paragraph
561        if let Ok((rest, content)) = grammar::paragraph(remaining) {
562            nodes.push(cm_paragraph_parser::parse_paragraph(content));
563            remaining = rest;
564            continue;
565        }
566
567        // If nothing matched, skip one character to avoid infinite loop.
568        // Use `take` so we preserve nom_locate offsets (important for spans/highlights).
569        log::warn!(
570            "Could not parse block at offset {}, skipping character",
571            remaining.location_offset()
572        );
573        use nom::bytes::complete::take;
574        let skip_len = remaining
575            .fragment()
576            .chars()
577            .next()
578            .map(|c| c.len_utf8())
579            .unwrap_or(1);
580        if let Ok((rest, _)) =
581            take::<_, _, nom::error::Error<GrammarSpan>>(skip_len as u32)(remaining)
582        {
583            remaining = rest;
584        } else {
585            break;
586        }
587    }
588
589    log::info!("Parsed {} blocks", nodes.len());
590
591    // Add parsed nodes to document
592    document.children = nodes;
593    Ok(document)
594}
595
596/// Attempt to parse a Markdown Guide extended definition list at the current input.
597///
598/// Syntax (canonical):
599/// ```text
600/// Term
601/// : definition
602///
603/// Another term
604/// : first definition
605/// : second definition
606/// ```
607///
608/// Supported extensions:
609/// - Multiple `: ...` definition lines per term
610/// - Multiple term groups in a single list, with optional blank lines between items
611/// - Multi-line definition bodies via indented continuation lines (>= 2 spaces)
612/// - Nested blocks inside a definition (via recursive block parsing after dedent)
613///
614/// Explicit non-goals / disambiguation:
615/// - Lines starting with `::` are *not* treated as definition markers.
616fn parse_extended_definition_list<'a>(
617    input: GrammarSpan<'a>,
618    depth: usize,
619) -> Option<(GrammarSpan<'a>, Node)> {
620    // We only match at a non-blank line; blank lines are already handled by the main loop.
621    let text = input.fragment();
622    if text.is_empty() {
623        return None;
624    }
625
626    const CONTINUATION_INDENT: usize = 2;
627
628    fn line_bounds(s: &str, start: usize) -> (usize, usize, usize) {
629        // Returns: (line_start, line_end_no_nl, next_start)
630        let rel_end = s[start..].find('\n').map(|i| start + i).unwrap_or(s.len());
631        let next = if rel_end < s.len() {
632            rel_end + 1
633        } else {
634            rel_end
635        };
636        (start, rel_end, next)
637    }
638
639    fn count_indent_columns(line: &str) -> usize {
640        // Count leading indentation, expanding tabs to 4-wide tab stops.
641        let mut indent = 0usize;
642        for ch in line.chars() {
643            if ch == ' ' {
644                indent += 1;
645            } else if ch == '\t' {
646                indent += 4 - (indent % 4);
647            } else {
648                break;
649            }
650        }
651        indent
652    }
653
654    fn def_marker_content_start(line: &str) -> Option<usize> {
655        // Optional leading spaces (up to 3) are allowed.
656        let bytes = line.as_bytes();
657        let mut i = 0usize;
658        for _ in 0..3 {
659            if bytes.get(i) == Some(&b' ') {
660                i += 1;
661            } else {
662                break;
663            }
664        }
665
666        if bytes.get(i) != Some(&b':') {
667            return None;
668        }
669        // Disallow "::" (reserved for other extensions / lookalikes).
670        if bytes.get(i + 1) == Some(&b':') {
671            return None;
672        }
673
674        // Require at least one whitespace after ':' (Markdown Guide uses ': ')
675        match bytes.get(i + 1) {
676            Some(b' ') | Some(b'\t') => {
677                // Strip exactly one whitespace after the marker; any extra stays as content.
678                Some(i + 2)
679            }
680            _ => None,
681        }
682    }
683
684    fn can_start_item_at(text: &str, start: usize) -> bool {
685        if start >= text.len() {
686            return false;
687        }
688        let (_t0s, t0e, t1s) = line_bounds(text, start);
689        let term_line = &text[start..t0e];
690        if term_line.trim().is_empty() {
691            return false;
692        }
693        if t1s >= text.len() {
694            return false;
695        }
696        let (_d0s, d0e, _d1s) = line_bounds(text, t1s);
697        let def_line = &text[t1s..d0e];
698        def_marker_content_start(def_line).is_some()
699    }
700
701    // We build a single <dl> node, potentially containing multiple term groups.
702    let mut children: Vec<Node> = Vec::new();
703    let mut cursor = 0usize;
704    let mut parsed_any = false;
705
706    // Parse one or more items.
707    loop {
708        if cursor >= text.len() {
709            break;
710        }
711
712        // Parse term line.
713        let (term_start, term_end, after_term) = line_bounds(text, cursor);
714        let term_line = &text[term_start..term_end];
715
716        // If we're at a blank line here, it means we consumed optional blanks between items.
717        // Stop the list; the main loop will handle blanks.
718        if term_line.trim().is_empty() {
719            break;
720        }
721
722        // Term must be followed immediately by at least one definition marker line.
723        if after_term >= text.len() {
724            break;
725        }
726
727        let (def_line_start, def_line_end, _after_def_line) = line_bounds(text, after_term);
728        let first_def_line = &text[def_line_start..def_line_end];
729        if def_marker_content_start(first_def_line).is_none() {
730            break;
731        }
732
733        // Build the <dt> node.
734        let term_start_span = input.take_from(term_start);
735        let (term_after_span, term_taken_span) = term_start_span.take_split(term_end - term_start);
736        let term_children = match crate::parser::inlines::parse_inlines_from_span(term_taken_span) {
737            Ok(children) => children,
738            Err(e) => {
739                log::warn!("Failed to parse inline elements in definition term: {}", e);
740                vec![Node {
741                    kind: NodeKind::Text(term_taken_span.fragment().to_string()),
742                    span: Some(crate::parser::shared::to_parser_span(term_taken_span)),
743                    children: Vec::new(),
744                }]
745            }
746        };
747
748        children.push(Node {
749            kind: NodeKind::DefinitionTerm,
750            span: Some(crate::parser::shared::to_parser_span_range(
751                term_start_span,
752                term_after_span,
753            )),
754            children: term_children,
755        });
756
757        // Parse one or more definitions for this term.
758        cursor = after_term;
759        while cursor < text.len() {
760            let (line_start, line_end, next_line_start) = line_bounds(text, cursor);
761            let line = &text[line_start..line_end];
762
763            let content_start_in_line = match def_marker_content_start(line) {
764                Some(i) => i,
765                None => break,
766            };
767
768            // Definition block span starts at the marker line.
769            let def_block_start = line_start;
770            let mut def_block_end = next_line_start;
771
772            // Build raw definition body text: first line after ": ", then indented continuations.
773            let mut raw_lines: Vec<&str> = Vec::new();
774            raw_lines.push(&line[content_start_in_line..]);
775
776            let mut scan = next_line_start;
777            while scan < text.len() {
778                let (ls, le, ln) = line_bounds(text, scan);
779                let l = &text[ls..le];
780
781                // Next definition marker starts a new <dd>.
782                if def_marker_content_start(l).is_some() {
783                    break;
784                }
785
786                if l.trim().is_empty() {
787                    // Only treat a blank line as part of this definition if the
788                    // next non-blank line is indented enough to continue.
789                    let mut look = ln;
790                    let mut next_indent: Option<usize> = None;
791                    while look < text.len() {
792                        let (_pls, ple, pln) = line_bounds(text, look);
793                        let pl = &text[look..ple];
794                        if !pl.trim().is_empty() {
795                            next_indent = Some(count_indent_columns(pl));
796                            break;
797                        }
798                        look = pln;
799                    }
800
801                    if next_indent.unwrap_or(0) >= CONTINUATION_INDENT {
802                        raw_lines.push("");
803                        scan = ln;
804                        def_block_end = scan;
805                        continue;
806                    }
807
808                    break;
809                }
810
811                let indent = count_indent_columns(l);
812                if indent >= CONTINUATION_INDENT {
813                    raw_lines.push(l);
814                    scan = ln;
815                    def_block_end = scan;
816                    continue;
817                }
818
819                break;
820            }
821
822            let raw_body = raw_lines.join("\n");
823            let dedented = dedent_list_item_content(&raw_body, CONTINUATION_INDENT);
824
825            // Parse the definition body as nested blocks.
826            let mut def_state = ParserState::new();
827            def_state.push_block(BlockContext::new_list_item(CONTINUATION_INDENT));
828            let def_children = match parse_blocks_internal(&dedented, depth + 1, &mut def_state) {
829                Ok(doc) => doc.children,
830                Err(e) => {
831                    log::warn!("Failed to parse definition description blocks: {}", e);
832                    Vec::new()
833                }
834            };
835
836            let dd_start_span = input.take_from(def_block_start);
837            let dd_end_span = input.take_from(def_block_end);
838            children.push(Node {
839                kind: NodeKind::DefinitionDescription,
840                span: Some(crate::parser::shared::to_parser_span_range(
841                    dd_start_span,
842                    dd_end_span,
843                )),
844                children: def_children,
845            });
846
847            parsed_any = true;
848            cursor = def_block_end;
849        }
850
851        // Between items, allow blank lines *only if* another valid item follows.
852        let mut scan = cursor;
853        while scan < text.len() {
854            let (_ls, le, ln) = line_bounds(text, scan);
855            let l = &text[scan..le];
856            if !l.trim().is_empty() {
857                break;
858            }
859            scan = ln;
860        }
861
862        if scan != cursor && can_start_item_at(text, scan) {
863            cursor = scan;
864            continue;
865        }
866
867        break;
868    }
869
870    if !parsed_any {
871        return None;
872    }
873
874    let (rest, _taken) = input.take_split(cursor);
875    let span = crate::parser::shared::to_parser_span_range(input, rest);
876    Some((
877        rest,
878        Node {
879            kind: NodeKind::DefinitionList,
880            span: Some(span),
881            children,
882        },
883    ))
884}
885
#[cfg(test)]
mod tests {
    use super::parse_blocks;
    use crate::parser::ast::NodeKind;

    /// Regression test: an earlier iteration cap of 100 could truncate parsing
    /// of realistic documents, which in turn truncated syntax highlighting.
    #[test]
    fn smoke_test_block_parser_handles_large_documents() {
        const PARAGRAPHS: usize = 250;

        // One paragraph per entry, each followed by a blank line.
        let input: String = (0..PARAGRAPHS)
            .map(|i| format!("Paragraph {i}\n\n"))
            .collect();

        let doc = parse_blocks(&input).expect("parse_blocks failed");
        assert_eq!(doc.children.len(), PARAGRAPHS);

        let last = doc.children.last().unwrap();
        assert!(matches!(last.kind, NodeKind::Paragraph));
    }
}
marco_core/parser/blocks/mod.rs

marco_core/parser/blocks/
mod.rs