Skip to main content

panache_parser/parser/utils/
list_item_buffer.rs

1//! Buffer for accumulating list item content before emission.
2//!
3//! This module provides infrastructure for buffering list item content during parsing,
4//! allowing us to determine tight vs loose lists and parse inline elements correctly.
5
6use crate::options::{Dialect, ParserOptions};
7use crate::parser::blocks::container_prefix::{
8    ContainerPrefixLine, ContainerPrefixState, emit_container_prefix_tokens,
9};
10use crate::parser::blocks::headings::{emit_atx_heading, try_parse_atx_heading};
11use crate::parser::blocks::horizontal_rules::{emit_horizontal_rule, try_parse_horizontal_rule};
12use crate::parser::blocks::html_blocks::{
13    HtmlBlockType, count_tag_balance, is_pandoc_matched_pair_tag, try_parse_html_block_start,
14};
15use crate::parser::utils::inline_emission;
16use crate::parser::utils::text_buffer::ParagraphBuffer;
17use crate::syntax::{SyntaxKind, SyntaxNode};
18use rowan::{GreenNodeBuilder, TextSize};
19
/// A segment in the list item buffer: either a run of text or a
/// structural blockquote marker.
///
/// There is no dedicated blank-line variant; everything that is not a
/// blockquote marker is stored as `Text`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) enum ListItemContent {
    /// Text content (includes newlines for losslessness).
    Text(String),
    /// Structural blockquote marker emitted inside buffered list-item text.
    BlockquoteMarker {
        /// Leading-space count recorded for the marker; forwarded unchanged
        /// to `ParagraphBuffer::push_marker` at emission time.
        leading_spaces: usize,
        /// Trailing-space flag recorded for the marker; forwarded unchanged
        /// to `ParagraphBuffer::push_marker` at emission time.
        has_trailing_space: bool,
    },
}
31
32/// Buffer for accumulating list item content before emission.
33///
34/// Collects text, blank lines, and structural elements as we parse list item
35/// continuation lines. When the list item closes, we can:
36/// 1. Determine if it's tight (Plain) or loose (PARAGRAPH)
37/// 2. Parse inline elements correctly across continuation lines
38/// 3. Emit the complete structure
39#[derive(Debug, Default, Clone)]
40pub(crate) struct ListItemBuffer {
41    /// Segments of content in order
42    segments: Vec<ListItemContent>,
43}
44
45impl ListItemBuffer {
46    /// Create a new empty list item buffer.
47    pub(crate) fn new() -> Self {
48        Self {
49            segments: Vec::new(),
50        }
51    }
52
53    /// Push text content to the buffer.
54    pub(crate) fn push_text(&mut self, text: impl Into<String>) {
55        let text = text.into();
56        if text.is_empty() {
57            return;
58        }
59        self.segments.push(ListItemContent::Text(text));
60    }
61
62    pub(crate) fn push_blockquote_marker(
63        &mut self,
64        leading_spaces: usize,
65        has_trailing_space: bool,
66    ) {
67        self.segments.push(ListItemContent::BlockquoteMarker {
68            leading_spaces,
69            has_trailing_space,
70        });
71    }
72
73    /// Check if buffer is empty.
74    pub(crate) fn is_empty(&self) -> bool {
75        self.segments.is_empty()
76    }
77
78    /// Get the number of segments in the buffer (for debugging).
79    pub(crate) fn segment_count(&self) -> usize {
80        self.segments.len()
81    }
82
83    /// Return the text of the first segment, if it is a `Text` segment.
84    pub(crate) fn first_text(&self) -> Option<&str> {
85        match self.segments.first()? {
86            ListItemContent::Text(t) => Some(t.as_str()),
87            ListItemContent::BlockquoteMarker { .. } => None,
88        }
89    }
90
91    /// If the buffered text begins with a Pandoc matched-pair HTML open
92    /// tag (e.g. `<div ...>`, `<section>`, `<pre>`, `<video>`) whose
93    /// opens outnumber its closes in the buffered text, return the tag
94    /// name. Used by the block dispatcher to suppress the close-form
95    /// dispatch that would otherwise interrupt the LIST_ITEM buffer at
96    /// `</div>` / `</pre>` / etc. — letting the buffer accumulate the
97    /// full matched-pair text so the emit-time structural lift sees both
98    /// open and close.
99    ///
100    /// Only fires under Pandoc dialect. Under CommonMark, list items
101    /// keep their existing behavior (inline HTML inside Plain).
102    pub(crate) fn unclosed_pandoc_matched_pair_tag(
103        &self,
104        config: &ParserOptions,
105    ) -> Option<String> {
106        if config.dialect != Dialect::Pandoc {
107            return None;
108        }
109        let first = self.first_text()?;
110        let first_line_with_nl = first.split_inclusive('\n').next()?;
111        let first_line_no_nl = first_line_with_nl
112            .strip_suffix("\r\n")
113            .or_else(|| first_line_with_nl.strip_suffix('\n'))
114            .unwrap_or(first_line_with_nl);
115        let HtmlBlockType::BlockTag {
116            tag_name,
117            is_closing: false,
118            ..
119        } = try_parse_html_block_start(first_line_no_nl, false)?
120        else {
121            return None;
122        };
123        if !is_pandoc_matched_pair_tag(&tag_name) {
124            return None;
125        }
126        let mut opens = 0usize;
127        let mut closes = 0usize;
128        for segment in &self.segments {
129            if let ListItemContent::Text(t) = segment {
130                let (o, c) = count_tag_balance(t, &tag_name);
131                opens += o;
132                closes += c;
133            }
134        }
135        if opens > closes { Some(tag_name) } else { None }
136    }
137
138    /// Determine if this list item has blank lines between content.
139    ///
140    /// Used to decide between Plain (tight) and PARAGRAPH (loose).
141    /// Returns true if there's a blank line followed by more content.
142    pub(crate) fn has_blank_lines_between_content(&self) -> bool {
143        log::trace!(
144            "has_blank_lines_between_content: segments={} result=false",
145            self.segments.len()
146        );
147
148        false
149    }
150
151    /// Get concatenated text for inline parsing (excludes blank lines).
152    fn get_text_for_parsing(&self) -> String {
153        let mut result = String::new();
154        for segment in &self.segments {
155            if let ListItemContent::Text(text) = segment {
156                result.push_str(text);
157            }
158        }
159        result
160    }
161
162    fn to_paragraph_buffer(&self) -> ParagraphBuffer {
163        let mut paragraph_buffer = ParagraphBuffer::new();
164        for segment in &self.segments {
165            match segment {
166                ListItemContent::Text(text) => paragraph_buffer.push_text(text),
167                ListItemContent::BlockquoteMarker {
168                    leading_spaces,
169                    has_trailing_space,
170                } => paragraph_buffer.push_marker(*leading_spaces, *has_trailing_space),
171            }
172        }
173        paragraph_buffer
174    }
175
176    /// Emit the buffered content as a Plain or PARAGRAPH block.
177    ///
178    /// If `use_paragraph` is true, wraps in PARAGRAPH (loose list).
179    /// If false, wraps in PLAIN (tight list).
180    ///
181    /// `content_col` is the enclosing list-item's content column (or 0
182    /// outside a list-item). The HTML-block first-line structural lift
183    /// uses it to strip the list-item leading indent from continuation
184    /// lines before reparsing the body, so `<div>` body parses as
185    /// pandoc's `Para` (matched-pair under stripped indent) instead of
186    /// `Plain` (the indented-close demotion), and so verbatim-tag
187    /// content (`<pre>`, `<style>`, etc.) projects without the leading
188    /// indent baked into the RawBlock text. The stripped bytes are
189    /// re-emitted as `WHITESPACE` tokens at line starts during graft
190    /// so the CST stays byte-equal to source.
191    pub(crate) fn emit_as_block(
192        &self,
193        builder: &mut GreenNodeBuilder<'static>,
194        use_paragraph: bool,
195        config: &ParserOptions,
196        content_col: usize,
197        suppress_footnote_refs: bool,
198    ) {
199        if self.is_empty() {
200            return;
201        }
202
203        // Get text and parse inline elements
204        let text = self.get_text_for_parsing();
205
206        if !text.is_empty() {
207            let line_without_newline = text
208                .strip_suffix("\r\n")
209                .or_else(|| text.strip_suffix('\n'));
210            if let Some(line) = line_without_newline
211                && !line.contains('\n')
212                && !line.contains('\r')
213            {
214                if let Some(level) = try_parse_atx_heading(line) {
215                    emit_atx_heading(builder, &text, level, config);
216                    return;
217                }
218                if try_parse_horizontal_rule(line).is_some() {
219                    emit_horizontal_rule(builder, &text);
220                    return;
221                }
222            }
223
224            // Multi-line case: first line is an ATX heading, rest is plain
225            // continuation. Pandoc treats `- # Heading\n  Some text` as a
226            // list item containing Header + Plain, not a single Plain spanning
227            // both lines.
228            if self
229                .segments
230                .iter()
231                .all(|s| matches!(s, ListItemContent::Text(_)))
232                && let Some(first_nl) = text.find('\n')
233            {
234                let first_line = &text[..first_nl];
235                let after_first = &text[first_nl + 1..];
236                if !after_first.is_empty()
237                    && let Some(level) = try_parse_atx_heading(first_line)
238                {
239                    let heading_bytes = &text[..first_nl + 1];
240                    emit_atx_heading(builder, heading_bytes, level, config);
241
242                    let block_kind = if use_paragraph {
243                        SyntaxKind::PARAGRAPH
244                    } else {
245                        SyntaxKind::PLAIN
246                    };
247                    builder.start_node(block_kind.into());
248                    inline_emission::emit_inlines(
249                        builder,
250                        after_first,
251                        config,
252                        suppress_footnote_refs,
253                    );
254                    builder.finish_node();
255                    return;
256                }
257            }
258
259            // Pandoc HTML-block-first-line structural lift: when the buffered
260            // text begins with a matched HTML block (same-line `<div>...</div>`,
261            // single-line comment, `<pre>foo</pre>`, etc.) and the entire
262            // buffer is consumed by that block, reparse and graft the inner
263            // block as a direct LIST_ITEM child. Without this lift, the
264            // dispatcher's inline-HTML path takes over and emits
265            // `Plain[RawInline <tag>, body, RawInline </tag>]` instead of
266            // `Div [...]` or `RawBlock <tag>`.
267            //
268            // Multi-line cases where the close tag lives in a sibling
269            // HTML_BLOCK (because the dispatcher recognizes Pandoc strict-
270            // block close forms as block starts and breaks the buffer) are
271            // not handled here — the gate rejects HTML_BLOCK_DIV with only
272            // one HTML_BLOCK_TAG child. That sub-target stays open.
273            if config.dialect == Dialect::Pandoc
274                && self
275                    .segments
276                    .iter()
277                    .all(|s| matches!(s, ListItemContent::Text(_)))
278                && try_emit_html_block_lift(builder, &text, config, content_col, use_paragraph)
279            {
280                return;
281            }
282        }
283
284        let block_kind = if use_paragraph {
285            SyntaxKind::PARAGRAPH
286        } else {
287            SyntaxKind::PLAIN
288        };
289
290        builder.start_node(block_kind.into());
291
292        let paragraph_buffer = self.to_paragraph_buffer();
293        if !paragraph_buffer.is_empty() {
294            paragraph_buffer.emit_with_inlines(builder, config, suppress_footnote_refs);
295        } else if !text.is_empty() {
296            inline_emission::emit_inlines(builder, &text, config, suppress_footnote_refs);
297        }
298
299        builder.finish_node(); // Close PLAIN or PARAGRAPH
300    }
301
302    /// Clear the buffer for reuse.
303    pub(crate) fn clear(&mut self) {
304        self.segments.clear();
305    }
306}
307
308/// Attempt the Pandoc HTML-block-first-line structural lift on the
309/// buffered list-item text. Returns `true` if `text` was emitted as
310/// one or more HTML block CST nodes (no surrounding PLAIN/PARAGRAPH
311/// wrapper). Returns `false` if the lift gate rejected the case;
312/// the caller falls through to its default Plain/Paragraph emission.
313///
314/// The gate is strict: the inner reparse must produce exactly one
315/// top-level HTML_BLOCK or HTML_BLOCK_DIV that consumes every byte
316/// of `text` (modulo list-item indent stripping — see `content_col`).
317/// For HTML_BLOCK_DIV, a matched open+close is required (>= 2
318/// `HTML_BLOCK_TAG` children). This avoids lifting unclosed shapes
319/// (where the close tag would live in a separate sibling HTML_BLOCK),
320/// which would produce a structurally incomplete CST.
321///
322/// When `content_col > 0`, continuation lines have up to `content_col`
323/// leading spaces stripped before the inner reparse, mirroring
324/// pandoc's list-item indent normalization. The stripped bytes are
325/// re-injected as `WHITESPACE` tokens at the start of each continuation
326/// line during graft so the result is byte-equal to the original
327/// buffer text.
328fn try_emit_html_block_lift(
329    builder: &mut GreenNodeBuilder<'static>,
330    text: &str,
331    config: &ParserOptions,
332    content_col: usize,
333    use_paragraph: bool,
334) -> bool {
335    let first_line = text.split_inclusive('\n').next().unwrap_or(text);
336    let first_line_no_nl = first_line
337        .strip_suffix("\r\n")
338        .or_else(|| first_line.strip_suffix('\n'))
339        .unwrap_or(first_line);
340    if try_parse_html_block_start(first_line_no_nl, false).is_none() {
341        return false;
342    }
343
344    let (parse_text, prefixes) = if content_col > 0 {
345        strip_list_item_indent(text, content_col)
346    } else {
347        (text.to_string(), Vec::new())
348    };
349
350    let refdefs = config.refdef_labels.clone().unwrap_or_default();
351    let inner_root = crate::parser::parse_with_refdefs(&parse_text, Some(config.clone()), refdefs);
352
353    let children: Vec<SyntaxNode> = inner_root.children().collect();
354    if children.is_empty() {
355        return false;
356    }
357    let first = &children[0];
358    if !matches!(
359        first.kind(),
360        SyntaxKind::HTML_BLOCK | SyntaxKind::HTML_BLOCK_DIV
361    ) {
362        return false;
363    }
364    let total_end = children.last().unwrap().text_range().end();
365    if total_end != TextSize::of(parse_text.as_str()) {
366        return false;
367    }
368
369    // Single-child path: existing same-line / fully-contained lift.
370    // Multi-child path: trailing-text split — the inner dispatcher
371    // produced sibling block(s) after the HTML_BLOCK / HTML_BLOCK_DIV.
372    // Sources:
373    //   - `try_parse_comment_pi_with_trailing_split` for `<!--…--> trail`
374    //     and `<?…?> trail` (HTML_BLOCK + PARAGRAPH).
375    //   - Same-line div / non-div strict-block lift's trailing branch
376    //     for `<div>foo</div>bar` (HTML_BLOCK_DIV + PARAGRAPH) and
377    //     `<form>foo</form>bar` (also HTML_BLOCK + PARAGRAPH after the
378    //     existing strict-block matched-pair lift fires).
379    // The trailing PARAGRAPH is retagged to PLAIN for tight list items
380    // so the item shape matches pandoc (`[RawBlock, Plain[trailing]]`
381    // for tight, `[RawBlock, Para[...]]` for loose). N>2 children would
382    // require Para→Plain SoftBreak fusion across HTML-block boundaries
383    // (0390 blocked); leave those shapes to the inline path until that
384    // gap closes.
385    let multi_child_trailing = if children.len() == 1 {
386        false
387    } else if children.len() == 2
388        && matches!(
389            first.kind(),
390            SyntaxKind::HTML_BLOCK | SyntaxKind::HTML_BLOCK_DIV
391        )
392        && children[1].kind() == SyntaxKind::PARAGRAPH
393    {
394        true
395    } else {
396        return false;
397    };
398
399    if first.kind() == SyntaxKind::HTML_BLOCK_DIV {
400        let html_block_tag_count = first
401            .children()
402            .filter(|c| c.kind() == SyntaxKind::HTML_BLOCK_TAG)
403            .count();
404        if html_block_tag_count < 2 {
405            return false;
406        }
407    }
408
409    let prefix_lines: Vec<ContainerPrefixLine> = prefixes
410        .into_iter()
411        .map(ContainerPrefixLine::list_only)
412        .collect();
413    let mut prefix_state = ContainerPrefixState::new(prefix_lines);
414    if multi_child_trailing {
415        graft_node(builder, first, &mut prefix_state);
416        let trailing_kind = if use_paragraph {
417            SyntaxKind::PARAGRAPH
418        } else {
419            SyntaxKind::PLAIN
420        };
421        graft_node_retag_root(builder, &children[1], &mut prefix_state, trailing_kind);
422    } else {
423        graft_node(builder, first, &mut prefix_state);
424    }
425    true
426}
427
428fn graft_node_retag_root(
429    builder: &mut GreenNodeBuilder<'static>,
430    node: &SyntaxNode,
431    prefix: &mut Option<ContainerPrefixState>,
432    new_kind: SyntaxKind,
433) {
434    builder.start_node(new_kind.into());
435    for child in node.children_with_tokens() {
436        match child {
437            rowan::NodeOrToken::Node(n) => graft_node(builder, &n, prefix),
438            rowan::NodeOrToken::Token(t) => {
439                emit_grafted_token(builder, t.kind(), t.text(), prefix);
440            }
441        }
442    }
443    builder.finish_node();
444}
445
/// Remove up to `content_col` columns of leading indent from every
/// continuation line of `text` (every line after the first).
///
/// The first line is copied through untouched — its leading columns are
/// owned by the list marker and its post-marker spaces. On later lines a
/// space advances one column and a tab advances to the next multiple of
/// four; a tab that would overshoot `content_col` is left in place, and
/// stripping stops at the first byte that is neither.
///
/// Returns the stripped text together with one prefix string per line
/// (empty for the first line) holding exactly the bytes that were removed,
/// so they can be re-injected during graft for losslessness.
fn strip_list_item_indent(text: &str, content_col: usize) -> (String, Vec<String>) {
    let mut body = String::with_capacity(text.len());
    let mut removed: Vec<String> = Vec::new();
    let mut lines = text.split_inclusive('\n');

    if let Some(first) = lines.next() {
        removed.push(String::new());
        body.push_str(first);
    }

    for line in lines {
        let mut column = 0usize;
        let mut cut = 0usize;
        for byte in line.bytes() {
            if column >= content_col {
                break;
            }
            let advanced = match byte {
                b' ' => column + 1,
                b'\t' => (column / 4 + 1) * 4,
                _ => break,
            };
            if advanced > content_col {
                break;
            }
            column = advanced;
            cut += 1;
        }
        // `cut` only ever counts ASCII space/tab bytes, so it is always a
        // valid char boundary.
        let (indent, rest) = line.split_at(cut);
        removed.push(indent.to_string());
        body.push_str(rest);
    }

    (body, removed)
}
487
488fn graft_node(
489    builder: &mut GreenNodeBuilder<'static>,
490    node: &SyntaxNode,
491    prefix: &mut Option<ContainerPrefixState>,
492) {
493    builder.start_node(node.kind().into());
494    for child in node.children_with_tokens() {
495        match child {
496            rowan::NodeOrToken::Node(n) => graft_node(builder, &n, prefix),
497            rowan::NodeOrToken::Token(t) => {
498                emit_grafted_token(builder, t.kind(), t.text(), prefix);
499            }
500        }
501    }
502    builder.finish_node();
503}
504
505fn emit_grafted_token(
506    builder: &mut GreenNodeBuilder<'static>,
507    kind: SyntaxKind,
508    text: &str,
509    prefix: &mut Option<ContainerPrefixState>,
510) {
511    if let Some(state) = prefix.as_mut() {
512        if state.at_line_start {
513            if let Some(line_prefix) = state.prefixes.get(state.line_idx) {
514                emit_container_prefix_tokens(builder, line_prefix);
515            }
516            state.at_line_start = false;
517        }
518        builder.token(kind.into(), text);
519        if kind == SyntaxKind::NEWLINE || kind == SyntaxKind::BLANK_LINE {
520            state.line_idx += 1;
521            state.at_line_start = true;
522        }
523    } else {
524        builder.token(kind.into(), text);
525    }
526}
527
#[cfg(test)]
mod tests {
    //! Unit tests for the segment bookkeeping of `ListItemBuffer` and for
    //! the pure indent-stripping helper. Emission is not covered here
    //! because it needs a full parser configuration.

    use super::*;

    #[test]
    fn test_new_buffer_is_empty() {
        let buffer = ListItemBuffer::new();
        assert!(buffer.is_empty());
        assert_eq!(buffer.segment_count(), 0);
        assert_eq!(buffer.first_text(), None);
        assert!(!buffer.has_blank_lines_between_content());
    }

    #[test]
    fn test_push_single_text() {
        let mut buffer = ListItemBuffer::new();
        buffer.push_text("Hello, world!");
        assert!(!buffer.is_empty());
        assert!(!buffer.has_blank_lines_between_content());
        assert_eq!(buffer.first_text(), Some("Hello, world!"));
        assert_eq!(buffer.get_text_for_parsing(), "Hello, world!");
    }

    #[test]
    fn test_push_multiple_text_segments() {
        let mut buffer = ListItemBuffer::new();
        buffer.push_text("Line 1\n");
        buffer.push_text("Line 2\n");
        buffer.push_text("Line 3");
        assert_eq!(buffer.segment_count(), 3);
        assert_eq!(buffer.get_text_for_parsing(), "Line 1\nLine 2\nLine 3");
    }

    #[test]
    fn test_clear_buffer() {
        let mut buffer = ListItemBuffer::new();
        buffer.push_text("Some text");
        assert!(!buffer.is_empty());

        buffer.clear();
        assert!(buffer.is_empty());
        assert_eq!(buffer.get_text_for_parsing(), "");
    }

    #[test]
    fn test_empty_text_ignored() {
        let mut buffer = ListItemBuffer::new();
        buffer.push_text("");
        assert!(buffer.is_empty());
    }

    #[test]
    fn test_blockquote_marker_is_a_segment_but_not_text() {
        let mut buffer = ListItemBuffer::new();
        buffer.push_text("quoted\n");
        buffer.push_blockquote_marker(0, true);
        buffer.push_text("more\n");

        // The marker counts as a segment but contributes no text.
        assert_eq!(buffer.segment_count(), 3);
        assert_eq!(buffer.get_text_for_parsing(), "quoted\nmore\n");
        assert_eq!(buffer.first_text(), Some("quoted\n"));
    }

    #[test]
    fn test_first_text_is_none_when_marker_leads() {
        let mut buffer = ListItemBuffer::new();
        buffer.push_blockquote_marker(2, false);
        buffer.push_text("text");

        assert!(!buffer.is_empty());
        assert_eq!(buffer.first_text(), None);
        assert_eq!(buffer.get_text_for_parsing(), "text");
    }

    #[test]
    fn test_strip_list_item_indent_strips_continuation_lines_only() {
        let (stripped, prefixes) = strip_list_item_indent("<div>\n  foo\n    bar\n</div>\n", 2);
        assert_eq!(stripped, "<div>\nfoo\n  bar\n</div>\n");
        assert_eq!(prefixes, vec!["", "  ", "  ", ""]);

        // Leading spaces on the first line belong to the list marker.
        let (stripped, prefixes) = strip_list_item_indent("  a\n  b", 2);
        assert_eq!(stripped, "  a\nb");
        assert_eq!(prefixes, vec!["", "  "]);
    }

    #[test]
    fn test_strip_list_item_indent_tabs() {
        // A tab reaching exactly the content column is consumed.
        let (stripped, prefixes) = strip_list_item_indent("x\n\tfoo\n", 4);
        assert_eq!(stripped, "x\nfoo\n");
        assert_eq!(prefixes, vec!["", "\t"]);

        // A tab that would overshoot the content column is left in place.
        let (stripped, prefixes) = strip_list_item_indent("x\n\tfoo\n", 2);
        assert_eq!(stripped, "x\n\tfoo\n");
        assert_eq!(prefixes, vec!["", ""]);
    }

    #[test]
    fn test_strip_list_item_indent_empty_text() {
        let (stripped, prefixes) = strip_list_item_indent("", 3);
        assert_eq!(stripped, "");
        assert!(prefixes.is_empty());
    }
}
574}