panache_parser/parser/utils/
list_item_buffer.rs

1//! Buffer for accumulating list item content before emission.
2//!
3//! This module provides infrastructure for buffering list item content during parsing,
4//! allowing us to determine tight vs loose lists and parse inline elements correctly.
5
6use crate::options::{Dialect, ParserOptions};
7use crate::parser::blocks::container_prefix::{
8    ContainerPrefixLine, ContainerPrefixState, emit_container_prefix_tokens,
9};
10use crate::parser::blocks::headings::{emit_atx_heading, try_parse_atx_heading};
11use crate::parser::blocks::horizontal_rules::{emit_horizontal_rule, try_parse_horizontal_rule};
12use crate::parser::blocks::html_blocks::{
13    HtmlBlockType, count_tag_balance, is_pandoc_matched_pair_tag, try_parse_html_block_start,
14};
15use crate::parser::utils::inline_emission;
16use crate::parser::utils::text_buffer::ParagraphBuffer;
17use crate::syntax::{SyntaxKind, SyntaxNode};
18use rowan::{GreenNodeBuilder, TextSize};
19
/// A segment in the list item buffer - either text content or a
/// structural blockquote marker.
#[derive(Debug, Clone)]
pub(crate) enum ListItemContent {
    /// Text content (includes newlines for losslessness)
    Text(String),
    /// Structural blockquote marker emitted inside buffered list-item text.
    ///
    /// Both fields are forwarded unchanged to `ParagraphBuffer::push_marker`
    /// when the buffer is converted for emission.
    BlockquoteMarker {
        // NOTE(review): presumably the number of spaces preceding the `>`
        // marker - confirm against the code that pushes this segment.
        leading_spaces: usize,
        // NOTE(review): presumably whether a space follows the `>` marker -
        // confirm against the code that pushes this segment.
        has_trailing_space: bool,
    },
}
31
/// Buffer for accumulating list item content before emission.
///
/// Collects text and structural blockquote markers as we parse list item
/// continuation lines. When the list item closes, we can:
/// 1. Determine if it's tight (Plain) or loose (PARAGRAPH)
/// 2. Parse inline elements correctly across continuation lines
/// 3. Emit the complete structure
#[derive(Debug, Default, Clone)]
pub(crate) struct ListItemBuffer {
    /// Segments of content in the order they were pushed
    segments: Vec<ListItemContent>,
}
44
45impl ListItemBuffer {
46    /// Create a new empty list item buffer.
47    pub(crate) fn new() -> Self {
48        Self {
49            segments: Vec::new(),
50        }
51    }
52
53    /// Push text content to the buffer.
54    pub(crate) fn push_text(&mut self, text: impl Into<String>) {
55        let text = text.into();
56        if text.is_empty() {
57            return;
58        }
59        self.segments.push(ListItemContent::Text(text));
60    }
61
62    pub(crate) fn push_blockquote_marker(
63        &mut self,
64        leading_spaces: usize,
65        has_trailing_space: bool,
66    ) {
67        self.segments.push(ListItemContent::BlockquoteMarker {
68            leading_spaces,
69            has_trailing_space,
70        });
71    }
72
73    /// Check if buffer is empty.
74    pub(crate) fn is_empty(&self) -> bool {
75        self.segments.is_empty()
76    }
77
78    /// Get the number of segments in the buffer (for debugging).
79    pub(crate) fn segment_count(&self) -> usize {
80        self.segments.len()
81    }
82
83    /// Return the text of the first segment, if it is a `Text` segment.
84    pub(crate) fn first_text(&self) -> Option<&str> {
85        match self.segments.first()? {
86            ListItemContent::Text(t) => Some(t.as_str()),
87            ListItemContent::BlockquoteMarker { .. } => None,
88        }
89    }
90
91    /// If the buffered text begins with a Pandoc matched-pair HTML open
92    /// tag (e.g. `<div ...>`, `<section>`, `<pre>`, `<video>`) whose
93    /// opens outnumber its closes in the buffered text, return the tag
94    /// name. Used by the block dispatcher to suppress the close-form
95    /// dispatch that would otherwise interrupt the LIST_ITEM buffer at
96    /// `</div>` / `</pre>` / etc. — letting the buffer accumulate the
97    /// full matched-pair text so the emit-time structural lift sees both
98    /// open and close.
99    ///
100    /// Only fires under Pandoc dialect. Under CommonMark, list items
101    /// keep their existing behavior (inline HTML inside Plain).
102    pub(crate) fn unclosed_pandoc_matched_pair_tag(
103        &self,
104        config: &ParserOptions,
105    ) -> Option<String> {
106        if config.dialect != Dialect::Pandoc {
107            return None;
108        }
109        let first = self.first_text()?;
110        let first_line_with_nl = first.split_inclusive('\n').next()?;
111        let first_line_no_nl = first_line_with_nl
112            .strip_suffix("\r\n")
113            .or_else(|| first_line_with_nl.strip_suffix('\n'))
114            .unwrap_or(first_line_with_nl);
115        let HtmlBlockType::BlockTag {
116            tag_name,
117            is_closing: false,
118            ..
119        } = try_parse_html_block_start(first_line_no_nl, false)?
120        else {
121            return None;
122        };
123        if !is_pandoc_matched_pair_tag(&tag_name) {
124            return None;
125        }
126        let mut opens = 0usize;
127        let mut closes = 0usize;
128        for segment in &self.segments {
129            if let ListItemContent::Text(t) = segment {
130                let (o, c) = count_tag_balance(t, &tag_name);
131                opens += o;
132                closes += c;
133            }
134        }
135        if opens > closes { Some(tag_name) } else { None }
136    }
137
138    /// Determine if this list item has blank lines between content.
139    ///
140    /// Used to decide between Plain (tight) and PARAGRAPH (loose).
141    /// Returns true if there's a blank line followed by more content.
142    pub(crate) fn has_blank_lines_between_content(&self) -> bool {
143        log::trace!(
144            "has_blank_lines_between_content: segments={} result=false",
145            self.segments.len()
146        );
147
148        false
149    }
150
151    /// Get concatenated text for inline parsing (excludes blank lines).
152    fn get_text_for_parsing(&self) -> String {
153        let mut result = String::new();
154        for segment in &self.segments {
155            if let ListItemContent::Text(text) = segment {
156                result.push_str(text);
157            }
158        }
159        result
160    }
161
162    fn to_paragraph_buffer(&self) -> ParagraphBuffer {
163        let mut paragraph_buffer = ParagraphBuffer::new();
164        for segment in &self.segments {
165            match segment {
166                ListItemContent::Text(text) => paragraph_buffer.push_text(text),
167                ListItemContent::BlockquoteMarker {
168                    leading_spaces,
169                    has_trailing_space,
170                } => paragraph_buffer.push_marker(*leading_spaces, *has_trailing_space),
171            }
172        }
173        paragraph_buffer
174    }
175
176    /// Emit the buffered content as a Plain or PARAGRAPH block.
177    ///
178    /// If `use_paragraph` is true, wraps in PARAGRAPH (loose list).
179    /// If false, wraps in PLAIN (tight list).
180    ///
181    /// `content_col` is the enclosing list-item's content column (or 0
182    /// outside a list-item). The HTML-block first-line structural lift
183    /// uses it to strip the list-item leading indent from continuation
184    /// lines before reparsing the body, so `<div>` body parses as
185    /// pandoc's `Para` (matched-pair under stripped indent) instead of
186    /// `Plain` (the indented-close demotion), and so verbatim-tag
187    /// content (`<pre>`, `<style>`, etc.) projects without the leading
188    /// indent baked into the RawBlock text. The stripped bytes are
189    /// re-emitted as `WHITESPACE` tokens at line starts during graft
190    /// so the CST stays byte-equal to source.
191    pub(crate) fn emit_as_block(
192        &self,
193        builder: &mut GreenNodeBuilder<'static>,
194        use_paragraph: bool,
195        config: &ParserOptions,
196        content_col: usize,
197        suppress_footnote_refs: bool,
198    ) {
199        if self.is_empty() {
200            return;
201        }
202
203        // Get text and parse inline elements
204        let text = self.get_text_for_parsing();
205
206        if !text.is_empty() {
207            let line_without_newline = text
208                .strip_suffix("\r\n")
209                .or_else(|| text.strip_suffix('\n'));
210            if let Some(line) = line_without_newline
211                && !line.contains('\n')
212                && !line.contains('\r')
213            {
214                if let Some(level) = try_parse_atx_heading(line) {
215                    emit_atx_heading(builder, &text, level, config);
216                    return;
217                }
218                if try_parse_horizontal_rule(line).is_some() {
219                    emit_horizontal_rule(builder, &text);
220                    return;
221                }
222            }
223
224            // Multi-line case: first line is an ATX heading, rest is plain
225            // continuation. Pandoc treats `- # Heading\n  Some text` as a
226            // list item containing Header + Plain, not a single Plain spanning
227            // both lines.
228            if self
229                .segments
230                .iter()
231                .all(|s| matches!(s, ListItemContent::Text(_)))
232                && let Some(first_nl) = text.find('\n')
233            {
234                let first_line = &text[..first_nl];
235                let after_first = &text[first_nl + 1..];
236                if !after_first.is_empty()
237                    && let Some(level) = try_parse_atx_heading(first_line)
238                {
239                    let heading_bytes = &text[..first_nl + 1];
240                    emit_atx_heading(builder, heading_bytes, level, config);
241
242                    let block_kind = if use_paragraph {
243                        SyntaxKind::PARAGRAPH
244                    } else {
245                        SyntaxKind::PLAIN
246                    };
247                    builder.start_node(block_kind.into());
248                    inline_emission::emit_inlines(
249                        builder,
250                        after_first,
251                        config,
252                        suppress_footnote_refs,
253                    );
254                    builder.finish_node();
255                    return;
256                }
257            }
258
259            // Pandoc HTML-block-first-line structural lift: when the buffered
260            // text begins with a matched HTML block (same-line `<div>...</div>`,
261            // single-line comment, `<pre>foo</pre>`, etc.) and the entire
262            // buffer is consumed by that block, reparse and graft the inner
263            // block as a direct LIST_ITEM child. Without this lift, the
264            // dispatcher's inline-HTML path takes over and emits
265            // `Plain[RawInline <tag>, body, RawInline </tag>]` instead of
266            // `Div [...]` or `RawBlock <tag>`.
267            //
268            // Multi-line cases where the close tag lives in a sibling
269            // HTML_BLOCK (because the dispatcher recognizes Pandoc strict-
270            // block close forms as block starts and breaks the buffer) are
271            // not handled here — the gate rejects HTML_BLOCK_DIV with only
272            // one HTML_BLOCK_TAG child. That sub-target stays open.
273            if config.dialect == Dialect::Pandoc
274                && self
275                    .segments
276                    .iter()
277                    .all(|s| matches!(s, ListItemContent::Text(_)))
278                && try_emit_html_block_lift(builder, &text, config, content_col, use_paragraph)
279            {
280                return;
281            }
282
283            // Structural block lift for marker-line tables and fenced divs.
284            // Pandoc recognizes `- | a | b |\n  | - | - |` and `- ::: note\n
285            // ...\n  :::` as nested Table / Div; without lifting, the buffer
286            // would emit them as PLAIN with raw `|` / `:` text. Mirrors the
287            // HTML lift above: strip list-item indent from continuation
288            // lines, reparse via the block dispatcher, accept a single root
289            // node whose kind is in the allowlist and that consumes the
290            // whole buffer.
291            if self
292                .segments
293                .iter()
294                .all(|s| matches!(s, ListItemContent::Text(_)))
295                && try_emit_table_or_div_lift(builder, &text, config, content_col)
296            {
297                return;
298            }
299        }
300
301        let block_kind = if use_paragraph {
302            SyntaxKind::PARAGRAPH
303        } else {
304            SyntaxKind::PLAIN
305        };
306
307        builder.start_node(block_kind.into());
308
309        let paragraph_buffer = self.to_paragraph_buffer();
310        if !paragraph_buffer.is_empty() {
311            paragraph_buffer.emit_with_inlines(builder, config, suppress_footnote_refs);
312        } else if !text.is_empty() {
313            inline_emission::emit_inlines(builder, &text, config, suppress_footnote_refs);
314        }
315
316        builder.finish_node(); // Close PLAIN or PARAGRAPH
317    }
318
319    /// Clear the buffer for reuse.
320    pub(crate) fn clear(&mut self) {
321        self.segments.clear();
322    }
323}
324
325/// Attempt the Pandoc HTML-block-first-line structural lift on the
326/// buffered list-item text. Returns `true` if `text` was emitted as
327/// one or more HTML block CST nodes (no surrounding PLAIN/PARAGRAPH
328/// wrapper). Returns `false` if the lift gate rejected the case;
329/// the caller falls through to its default Plain/Paragraph emission.
330///
331/// The gate is strict: the inner reparse must produce exactly one
332/// top-level HTML_BLOCK or HTML_BLOCK_DIV that consumes every byte
333/// of `text` (modulo list-item indent stripping — see `content_col`).
334/// For HTML_BLOCK_DIV, a matched open+close is required (>= 2
335/// `HTML_BLOCK_TAG` children). This avoids lifting unclosed shapes
336/// (where the close tag would live in a separate sibling HTML_BLOCK),
337/// which would produce a structurally incomplete CST.
338///
339/// When `content_col > 0`, continuation lines have up to `content_col`
340/// leading spaces stripped before the inner reparse, mirroring
341/// pandoc's list-item indent normalization. The stripped bytes are
342/// re-injected as `WHITESPACE` tokens at the start of each continuation
343/// line during graft so the result is byte-equal to the original
344/// buffer text.
345fn try_emit_html_block_lift(
346    builder: &mut GreenNodeBuilder<'static>,
347    text: &str,
348    config: &ParserOptions,
349    content_col: usize,
350    use_paragraph: bool,
351) -> bool {
352    let first_line = text.split_inclusive('\n').next().unwrap_or(text);
353    let first_line_no_nl = first_line
354        .strip_suffix("\r\n")
355        .or_else(|| first_line.strip_suffix('\n'))
356        .unwrap_or(first_line);
357    if try_parse_html_block_start(first_line_no_nl, false).is_none() {
358        return false;
359    }
360
361    let (parse_text, prefixes) = if content_col > 0 {
362        strip_list_item_indent(text, content_col)
363    } else {
364        (text.to_string(), Vec::new())
365    };
366
367    let refdefs = config.refdef_labels.clone().unwrap_or_default();
368    let inner_root = crate::parser::parse_with_refdefs(&parse_text, Some(config.clone()), refdefs);
369
370    let children: Vec<SyntaxNode> = inner_root.children().collect();
371    if children.is_empty() {
372        return false;
373    }
374    let first = &children[0];
375    if !matches!(
376        first.kind(),
377        SyntaxKind::HTML_BLOCK | SyntaxKind::HTML_BLOCK_RAW | SyntaxKind::HTML_BLOCK_DIV
378    ) {
379        return false;
380    }
381    let total_end = children.last().unwrap().text_range().end();
382    if total_end != TextSize::of(parse_text.as_str()) {
383        return false;
384    }
385
386    // Single-child path: existing same-line / fully-contained lift.
387    // Multi-child path: trailing-text split — the inner dispatcher
388    // produced sibling block(s) after the HTML_BLOCK / HTML_BLOCK_DIV.
389    // Sources:
390    //   - `try_parse_comment_pi_with_trailing_split` for `<!--…--> trail`
391    //     and `<?…?> trail` (HTML_BLOCK + PARAGRAPH).
392    //   - Same-line div / non-div strict-block lift's trailing branch
393    //     for `<div>foo</div>bar` (HTML_BLOCK_DIV + PARAGRAPH) and
394    //     `<form>foo</form>bar` (also HTML_BLOCK + PARAGRAPH after the
395    //     existing strict-block matched-pair lift fires).
396    // The trailing PARAGRAPH is retagged to PLAIN for tight list items
397    // so the item shape matches pandoc (`[RawBlock, Plain[trailing]]`
398    // for tight, `[RawBlock, Para[...]]` for loose). N>2 children would
399    // require Para→Plain SoftBreak fusion across HTML-block boundaries
400    // (0390 blocked); leave those shapes to the inline path until that
401    // gap closes.
402    let multi_child_trailing = if children.len() == 1 {
403        false
404    } else if children.len() == 2
405        && matches!(
406            first.kind(),
407            SyntaxKind::HTML_BLOCK | SyntaxKind::HTML_BLOCK_RAW | SyntaxKind::HTML_BLOCK_DIV
408        )
409        && children[1].kind() == SyntaxKind::PARAGRAPH
410    {
411        true
412    } else {
413        return false;
414    };
415
416    if first.kind() == SyntaxKind::HTML_BLOCK_DIV {
417        let html_block_tag_count = first
418            .children()
419            .filter(|c| c.kind() == SyntaxKind::HTML_BLOCK_TAG)
420            .count();
421        if html_block_tag_count < 2 {
422            return false;
423        }
424    }
425
426    let prefix_lines: Vec<ContainerPrefixLine> = prefixes
427        .into_iter()
428        .map(ContainerPrefixLine::list_only)
429        .collect();
430    let mut prefix_state = ContainerPrefixState::new(prefix_lines);
431    if multi_child_trailing {
432        graft_node(builder, first, &mut prefix_state);
433        let trailing_kind = if use_paragraph {
434            SyntaxKind::PARAGRAPH
435        } else {
436            SyntaxKind::PLAIN
437        };
438        graft_node_retag_root(builder, &children[1], &mut prefix_state, trailing_kind);
439    } else {
440        graft_node(builder, first, &mut prefix_state);
441    }
442    true
443}
444
445/// Structural lift for pipe tables, grid tables, and fenced divs whose
446/// opener sits on the list-item marker line (or on the first non-blank
447/// continuation line of a buffered list item). Returns `true` when the
448/// buffered text was emitted as a single LIST_ITEM-child block. The
449/// strict single-root + total-end-coverage gate makes "lift failed"
450/// indistinguishable from "buffer is not actually a table/div" — the
451/// caller falls through to its PLAIN/PARAGRAPH wrapper.
452fn try_emit_table_or_div_lift(
453    builder: &mut GreenNodeBuilder<'static>,
454    text: &str,
455    config: &ParserOptions,
456    content_col: usize,
457) -> bool {
458    let first_line = text.split_inclusive('\n').next().unwrap_or(text);
459    let first_line_no_nl = first_line
460        .strip_suffix("\r\n")
461        .or_else(|| first_line.strip_suffix('\n'))
462        .unwrap_or(first_line);
463    let trimmed = first_line_no_nl.trim_start();
464    let first_byte = trimmed.as_bytes().first().copied();
465    if !matches!(first_byte, Some(b'|') | Some(b'+') | Some(b':')) {
466        return false;
467    }
468
469    let (parse_text, prefixes) = if content_col > 0 {
470        strip_list_item_indent(text, content_col)
471    } else {
472        (text.to_string(), Vec::new())
473    };
474
475    let refdefs = config.refdef_labels.clone().unwrap_or_default();
476    let inner_root = crate::parser::parse_with_refdefs(&parse_text, Some(config.clone()), refdefs);
477
478    let children: Vec<SyntaxNode> = inner_root.children().collect();
479    if children.len() != 1 {
480        return false;
481    }
482    let first = &children[0];
483    if !matches!(
484        first.kind(),
485        SyntaxKind::PIPE_TABLE | SyntaxKind::GRID_TABLE | SyntaxKind::FENCED_DIV
486    ) {
487        return false;
488    }
489    if first.text_range().end() != TextSize::of(parse_text.as_str()) {
490        return false;
491    }
492
493    let prefix_lines: Vec<ContainerPrefixLine> = prefixes
494        .into_iter()
495        .map(ContainerPrefixLine::list_only)
496        .collect();
497    let mut prefix_state = ContainerPrefixState::new(prefix_lines);
498    graft_node(builder, first, &mut prefix_state);
499    true
500}
501
502fn graft_node_retag_root(
503    builder: &mut GreenNodeBuilder<'static>,
504    node: &SyntaxNode,
505    prefix: &mut Option<ContainerPrefixState>,
506    new_kind: SyntaxKind,
507) {
508    builder.start_node(new_kind.into());
509    for child in node.children_with_tokens() {
510        match child {
511            rowan::NodeOrToken::Node(n) => graft_node(builder, &n, prefix),
512            rowan::NodeOrToken::Token(t) => {
513                emit_grafted_token(builder, t.kind(), t.text(), prefix);
514            }
515        }
516    }
517    builder.finish_node();
518}
519
/// Remove list-item indentation from every continuation line of `text`
/// (all lines after the first), consuming leading whitespace up to
/// column `content_col`.
///
/// A space advances one column; a tab advances to the next multiple of
/// four and is only consumed when that does not overshoot `content_col`.
/// Scanning stops at the first other byte. The first line is copied
/// verbatim — its leading columns belong to the list marker and the
/// spaces after it.
///
/// Returns the stripped text together with one prefix per line (empty
/// for the first line) holding exactly the bytes that were removed, so
/// a caller can re-inject them and stay lossless. Empty `text` yields
/// an empty string and an empty prefix vector.
fn strip_list_item_indent(text: &str, content_col: usize) -> (String, Vec<String>) {
    let mut out = String::with_capacity(text.len());
    let mut removed: Vec<String> = Vec::new();
    let mut lines = text.split_inclusive('\n');

    // First line: untouched, with an empty prefix entry to keep indices
    // aligned with line numbers.
    if let Some(head) = lines.next() {
        removed.push(String::new());
        out.push_str(head);
    }

    for line in lines {
        let mut width = 0usize;
        // Number of leading indent bytes that fit within `content_col`.
        let cut = line
            .bytes()
            .take_while(|&byte| {
                if width >= content_col {
                    return false;
                }
                let advanced = match byte {
                    b' ' => width + 1,
                    b'\t' => (width / 4 + 1) * 4,
                    _ => return false,
                };
                if advanced > content_col {
                    // Only a tab can overshoot; leave it in place.
                    return false;
                }
                width = advanced;
                true
            })
            .count();

        // `cut` counts only ASCII bytes, so it is a valid char boundary.
        let (indent, rest) = line.split_at(cut);
        removed.push(indent.to_string());
        out.push_str(rest);
    }

    (out, removed)
}
561
562fn graft_node(
563    builder: &mut GreenNodeBuilder<'static>,
564    node: &SyntaxNode,
565    prefix: &mut Option<ContainerPrefixState>,
566) {
567    builder.start_node(node.kind().into());
568    for child in node.children_with_tokens() {
569        match child {
570            rowan::NodeOrToken::Node(n) => graft_node(builder, &n, prefix),
571            rowan::NodeOrToken::Token(t) => {
572                emit_grafted_token(builder, t.kind(), t.text(), prefix);
573            }
574        }
575    }
576    builder.finish_node();
577}
578
579fn emit_grafted_token(
580    builder: &mut GreenNodeBuilder<'static>,
581    kind: SyntaxKind,
582    text: &str,
583    prefix: &mut Option<ContainerPrefixState>,
584) {
585    if let Some(state) = prefix.as_mut() {
586        if state.at_line_start {
587            if let Some(line_prefix) = state.prefixes.get(state.line_idx) {
588                emit_container_prefix_tokens(builder, line_prefix);
589            }
590            state.at_line_start = false;
591        }
592        builder.token(kind.into(), text);
593        if kind == SyntaxKind::NEWLINE || kind == SyntaxKind::BLANK_LINE {
594            state.line_idx += 1;
595            state.at_line_start = true;
596        }
597    } else {
598        builder.token(kind.into(), text);
599    }
600}
601
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly constructed buffer holds nothing and reports no blank lines.
    #[test]
    fn test_new_buffer_is_empty() {
        let buf = ListItemBuffer::new();
        assert!(buf.is_empty());
        assert!(!buf.has_blank_lines_between_content());
    }

    /// A single pushed string is stored and returned verbatim.
    #[test]
    fn test_push_single_text() {
        let mut buf = ListItemBuffer::new();
        buf.push_text("Hello, world!");

        assert!(!buf.is_empty());
        assert!(!buf.has_blank_lines_between_content());
        assert_eq!(buf.get_text_for_parsing(), "Hello, world!");
    }

    /// Text segments are concatenated in push order, newlines included.
    #[test]
    fn test_push_multiple_text_segments() {
        let mut buf = ListItemBuffer::new();
        for piece in ["Line 1\n", "Line 2\n", "Line 3"] {
            buf.push_text(piece);
        }
        assert_eq!(buf.get_text_for_parsing(), "Line 1\nLine 2\nLine 3");
    }

    /// `clear` drops every segment so the buffer can be reused.
    #[test]
    fn test_clear_buffer() {
        let mut buf = ListItemBuffer::new();
        buf.push_text("Some text");
        assert!(!buf.is_empty());

        buf.clear();

        assert!(buf.is_empty());
        assert_eq!(buf.get_text_for_parsing(), "");
    }

    /// Pushing an empty string must not create a segment.
    #[test]
    fn test_empty_text_ignored() {
        let mut buf = ListItemBuffer::new();
        buf.push_text("");
        assert!(buf.is_empty());
    }
}
panache_parser/parser/utils/list_item_buffer.rs

panache_parser/parser/utils/
list_item_buffer.rs