panache_parser/parser/utils/
list_item_buffer.rs

1//! Buffer for accumulating list item content before emission.
2//!
3//! This module provides infrastructure for buffering list item content during parsing,
4//! allowing us to determine tight vs loose lists and parse inline elements correctly.
5
6use crate::options::{Dialect, ParserOptions};
7use crate::parser::blocks::headings::{emit_atx_heading, try_parse_atx_heading};
8use crate::parser::blocks::horizontal_rules::{emit_horizontal_rule, try_parse_horizontal_rule};
9use crate::parser::blocks::html_blocks::{
10    HtmlBlockType, count_tag_balance, is_pandoc_matched_pair_tag, try_parse_html_block_start,
11};
12use crate::parser::utils::inline_emission;
13use crate::parser::utils::text_buffer::ParagraphBuffer;
14use crate::syntax::{SyntaxKind, SyntaxNode};
15use rowan::{GreenNodeBuilder, TextSize};
16
/// A segment in the list item buffer - either text content or a structural
/// blockquote marker.
#[derive(Debug, Clone)]
pub(crate) enum ListItemContent {
    /// Text content (includes newlines for losslessness)
    Text(String),
    /// Structural blockquote marker emitted inside buffered list-item text.
    ///
    /// Both fields are forwarded unchanged to `ParagraphBuffer::push_marker`
    /// when the buffer is converted for emission.
    BlockquoteMarker {
        /// Leading-space count recorded for the marker.
        /// NOTE(review): presumably the spaces preceding `>` — confirm against the caller.
        leading_spaces: usize,
        /// Trailing-space flag recorded for the marker.
        /// NOTE(review): presumably whether a space follows `>` — confirm against the caller.
        has_trailing_space: bool,
    },
}
28
/// Buffer for accumulating list item content before emission.
///
/// Collects text and structural blockquote markers as we parse list item
/// continuation lines. When the list item closes, we can:
/// 1. Emit the content as tight (PLAIN) or loose (PARAGRAPH), as chosen by
///    the caller
/// 2. Parse inline elements correctly across continuation lines
/// 3. Emit the complete structure
///
/// Empty text pushes are dropped, so an empty buffer means nothing has been
/// collected since creation or the last `clear`.
#[derive(Debug, Default, Clone)]
pub(crate) struct ListItemBuffer {
    /// Segments of content in the order they were pushed
    segments: Vec<ListItemContent>,
}
41
42impl ListItemBuffer {
43    /// Create a new empty list item buffer.
44    pub(crate) fn new() -> Self {
45        Self {
46            segments: Vec::new(),
47        }
48    }
49
50    /// Push text content to the buffer.
51    pub(crate) fn push_text(&mut self, text: impl Into<String>) {
52        let text = text.into();
53        if text.is_empty() {
54            return;
55        }
56        self.segments.push(ListItemContent::Text(text));
57    }
58
59    pub(crate) fn push_blockquote_marker(
60        &mut self,
61        leading_spaces: usize,
62        has_trailing_space: bool,
63    ) {
64        self.segments.push(ListItemContent::BlockquoteMarker {
65            leading_spaces,
66            has_trailing_space,
67        });
68    }
69
70    /// Check if buffer is empty.
71    pub(crate) fn is_empty(&self) -> bool {
72        self.segments.is_empty()
73    }
74
75    /// Get the number of segments in the buffer (for debugging).
76    pub(crate) fn segment_count(&self) -> usize {
77        self.segments.len()
78    }
79
80    /// Return the text of the first segment, if it is a `Text` segment.
81    pub(crate) fn first_text(&self) -> Option<&str> {
82        match self.segments.first()? {
83            ListItemContent::Text(t) => Some(t.as_str()),
84            ListItemContent::BlockquoteMarker { .. } => None,
85        }
86    }
87
88    /// If the buffered text begins with a Pandoc matched-pair HTML open
89    /// tag (e.g. `<div ...>`, `<section>`, `<pre>`, `<video>`) whose
90    /// opens outnumber its closes in the buffered text, return the tag
91    /// name. Used by the block dispatcher to suppress the close-form
92    /// dispatch that would otherwise interrupt the LIST_ITEM buffer at
93    /// `</div>` / `</pre>` / etc. — letting the buffer accumulate the
94    /// full matched-pair text so the emit-time structural lift sees both
95    /// open and close.
96    ///
97    /// Only fires under Pandoc dialect. Under CommonMark, list items
98    /// keep their existing behavior (inline HTML inside Plain).
99    pub(crate) fn unclosed_pandoc_matched_pair_tag(
100        &self,
101        config: &ParserOptions,
102    ) -> Option<String> {
103        if config.dialect != Dialect::Pandoc {
104            return None;
105        }
106        let first = self.first_text()?;
107        let first_line_with_nl = first.split_inclusive('\n').next()?;
108        let first_line_no_nl = first_line_with_nl
109            .strip_suffix("\r\n")
110            .or_else(|| first_line_with_nl.strip_suffix('\n'))
111            .unwrap_or(first_line_with_nl);
112        let HtmlBlockType::BlockTag {
113            tag_name,
114            is_closing: false,
115            ..
116        } = try_parse_html_block_start(first_line_no_nl, false)?
117        else {
118            return None;
119        };
120        if !is_pandoc_matched_pair_tag(&tag_name) {
121            return None;
122        }
123        let mut opens = 0usize;
124        let mut closes = 0usize;
125        for segment in &self.segments {
126            if let ListItemContent::Text(t) = segment {
127                let (o, c) = count_tag_balance(t, &tag_name);
128                opens += o;
129                closes += c;
130            }
131        }
132        if opens > closes { Some(tag_name) } else { None }
133    }
134
135    /// Determine if this list item has blank lines between content.
136    ///
137    /// Used to decide between Plain (tight) and PARAGRAPH (loose).
138    /// Returns true if there's a blank line followed by more content.
139    pub(crate) fn has_blank_lines_between_content(&self) -> bool {
140        log::trace!(
141            "has_blank_lines_between_content: segments={} result=false",
142            self.segments.len()
143        );
144
145        false
146    }
147
148    /// Get concatenated text for inline parsing (excludes blank lines).
149    fn get_text_for_parsing(&self) -> String {
150        let mut result = String::new();
151        for segment in &self.segments {
152            if let ListItemContent::Text(text) = segment {
153                result.push_str(text);
154            }
155        }
156        result
157    }
158
159    fn to_paragraph_buffer(&self) -> ParagraphBuffer {
160        let mut paragraph_buffer = ParagraphBuffer::new();
161        for segment in &self.segments {
162            match segment {
163                ListItemContent::Text(text) => paragraph_buffer.push_text(text),
164                ListItemContent::BlockquoteMarker {
165                    leading_spaces,
166                    has_trailing_space,
167                } => paragraph_buffer.push_marker(*leading_spaces, *has_trailing_space),
168            }
169        }
170        paragraph_buffer
171    }
172
173    /// Emit the buffered content as a Plain or PARAGRAPH block.
174    ///
175    /// If `use_paragraph` is true, wraps in PARAGRAPH (loose list).
176    /// If false, wraps in PLAIN (tight list).
177    ///
178    /// `content_col` is the enclosing list-item's content column (or 0
179    /// outside a list-item). The HTML-block first-line structural lift
180    /// uses it to strip the list-item leading indent from continuation
181    /// lines before reparsing the body, so `<div>` body parses as
182    /// pandoc's `Para` (matched-pair under stripped indent) instead of
183    /// `Plain` (the indented-close demotion), and so verbatim-tag
184    /// content (`<pre>`, `<style>`, etc.) projects without the leading
185    /// indent baked into the RawBlock text. The stripped bytes are
186    /// re-emitted as `WHITESPACE` tokens at line starts during graft
187    /// so the CST stays byte-equal to source.
188    pub(crate) fn emit_as_block(
189        &self,
190        builder: &mut GreenNodeBuilder<'static>,
191        use_paragraph: bool,
192        config: &ParserOptions,
193        content_col: usize,
194        suppress_footnote_refs: bool,
195    ) {
196        if self.is_empty() {
197            return;
198        }
199
200        // Get text and parse inline elements
201        let text = self.get_text_for_parsing();
202
203        if !text.is_empty() {
204            let line_without_newline = text
205                .strip_suffix("\r\n")
206                .or_else(|| text.strip_suffix('\n'));
207            if let Some(line) = line_without_newline
208                && !line.contains('\n')
209                && !line.contains('\r')
210            {
211                if let Some(level) = try_parse_atx_heading(line) {
212                    emit_atx_heading(builder, &text, level, config);
213                    return;
214                }
215                if try_parse_horizontal_rule(line).is_some() {
216                    emit_horizontal_rule(builder, &text);
217                    return;
218                }
219            }
220
221            // Multi-line case: first line is an ATX heading, rest is plain
222            // continuation. Pandoc treats `- # Heading\n  Some text` as a
223            // list item containing Header + Plain, not a single Plain spanning
224            // both lines.
225            if self
226                .segments
227                .iter()
228                .all(|s| matches!(s, ListItemContent::Text(_)))
229                && let Some(first_nl) = text.find('\n')
230            {
231                let first_line = &text[..first_nl];
232                let after_first = &text[first_nl + 1..];
233                if !after_first.is_empty()
234                    && let Some(level) = try_parse_atx_heading(first_line)
235                {
236                    let heading_bytes = &text[..first_nl + 1];
237                    emit_atx_heading(builder, heading_bytes, level, config);
238
239                    let block_kind = if use_paragraph {
240                        SyntaxKind::PARAGRAPH
241                    } else {
242                        SyntaxKind::PLAIN
243                    };
244                    builder.start_node(block_kind.into());
245                    inline_emission::emit_inlines(
246                        builder,
247                        after_first,
248                        config,
249                        suppress_footnote_refs,
250                    );
251                    builder.finish_node();
252                    return;
253                }
254            }
255
256            // Pandoc HTML-block-first-line structural lift: when the buffered
257            // text begins with a matched HTML block (same-line `<div>...</div>`,
258            // single-line comment, `<pre>foo</pre>`, etc.) and the entire
259            // buffer is consumed by that block, reparse and graft the inner
260            // block as a direct LIST_ITEM child. Without this lift, the
261            // dispatcher's inline-HTML path takes over and emits
262            // `Plain[RawInline <tag>, body, RawInline </tag>]` instead of
263            // `Div [...]` or `RawBlock <tag>`.
264            //
265            // Multi-line cases where the close tag lives in a sibling
266            // HTML_BLOCK (because the dispatcher recognizes Pandoc strict-
267            // block close forms as block starts and breaks the buffer) are
268            // not handled here — the gate rejects HTML_BLOCK_DIV with only
269            // one HTML_BLOCK_TAG child. That sub-target stays open.
270            if config.dialect == Dialect::Pandoc
271                && self
272                    .segments
273                    .iter()
274                    .all(|s| matches!(s, ListItemContent::Text(_)))
275                && try_emit_html_block_lift(builder, &text, config, content_col, use_paragraph)
276            {
277                return;
278            }
279        }
280
281        let block_kind = if use_paragraph {
282            SyntaxKind::PARAGRAPH
283        } else {
284            SyntaxKind::PLAIN
285        };
286
287        builder.start_node(block_kind.into());
288
289        let paragraph_buffer = self.to_paragraph_buffer();
290        if !paragraph_buffer.is_empty() {
291            paragraph_buffer.emit_with_inlines(builder, config, suppress_footnote_refs);
292        } else if !text.is_empty() {
293            inline_emission::emit_inlines(builder, &text, config, suppress_footnote_refs);
294        }
295
296        builder.finish_node(); // Close PLAIN or PARAGRAPH
297    }
298
299    /// Clear the buffer for reuse.
300    pub(crate) fn clear(&mut self) {
301        self.segments.clear();
302    }
303}
304
305/// Attempt the Pandoc HTML-block-first-line structural lift on the
306/// buffered list-item text. Returns `true` if `text` was emitted as
307/// one or more HTML block CST nodes (no surrounding PLAIN/PARAGRAPH
308/// wrapper). Returns `false` if the lift gate rejected the case;
309/// the caller falls through to its default Plain/Paragraph emission.
310///
311/// The gate is strict: the inner reparse must produce exactly one
312/// top-level HTML_BLOCK or HTML_BLOCK_DIV that consumes every byte
313/// of `text` (modulo list-item indent stripping — see `content_col`).
314/// For HTML_BLOCK_DIV, a matched open+close is required (>= 2
315/// `HTML_BLOCK_TAG` children). This avoids lifting unclosed shapes
316/// (where the close tag would live in a separate sibling HTML_BLOCK),
317/// which would produce a structurally incomplete CST.
318///
319/// When `content_col > 0`, continuation lines have up to `content_col`
320/// leading spaces stripped before the inner reparse, mirroring
321/// pandoc's list-item indent normalization. The stripped bytes are
322/// re-injected as `WHITESPACE` tokens at the start of each continuation
323/// line during graft so the result is byte-equal to the original
324/// buffer text.
325fn try_emit_html_block_lift(
326    builder: &mut GreenNodeBuilder<'static>,
327    text: &str,
328    config: &ParserOptions,
329    content_col: usize,
330    use_paragraph: bool,
331) -> bool {
332    let first_line = text.split_inclusive('\n').next().unwrap_or(text);
333    let first_line_no_nl = first_line
334        .strip_suffix("\r\n")
335        .or_else(|| first_line.strip_suffix('\n'))
336        .unwrap_or(first_line);
337    if try_parse_html_block_start(first_line_no_nl, false).is_none() {
338        return false;
339    }
340
341    let (parse_text, prefixes) = if content_col > 0 {
342        strip_list_item_indent(text, content_col)
343    } else {
344        (text.to_string(), Vec::new())
345    };
346
347    let refdefs = config.refdef_labels.clone().unwrap_or_default();
348    let inner_root = crate::parser::parse_with_refdefs(&parse_text, Some(config.clone()), refdefs);
349
350    let children: Vec<SyntaxNode> = inner_root.children().collect();
351    if children.is_empty() {
352        return false;
353    }
354    let first = &children[0];
355    if !matches!(
356        first.kind(),
357        SyntaxKind::HTML_BLOCK | SyntaxKind::HTML_BLOCK_DIV
358    ) {
359        return false;
360    }
361    let total_end = children.last().unwrap().text_range().end();
362    if total_end != TextSize::of(parse_text.as_str()) {
363        return false;
364    }
365
366    // Single-child path: existing same-line / fully-contained lift.
367    // Multi-child path: trailing-text split — the inner dispatcher
368    // produced sibling block(s) after the HTML_BLOCK / HTML_BLOCK_DIV.
369    // Sources:
370    //   - `try_parse_comment_pi_with_trailing_split` for `<!--…--> trail`
371    //     and `<?…?> trail` (HTML_BLOCK + PARAGRAPH).
372    //   - Same-line div / non-div strict-block lift's trailing branch
373    //     for `<div>foo</div>bar` (HTML_BLOCK_DIV + PARAGRAPH) and
374    //     `<form>foo</form>bar` (also HTML_BLOCK + PARAGRAPH after the
375    //     existing strict-block matched-pair lift fires).
376    // The trailing PARAGRAPH is retagged to PLAIN for tight list items
377    // so the item shape matches pandoc (`[RawBlock, Plain[trailing]]`
378    // for tight, `[RawBlock, Para[...]]` for loose). N>2 children would
379    // require Para→Plain SoftBreak fusion across HTML-block boundaries
380    // (0390 blocked); leave those shapes to the inline path until that
381    // gap closes.
382    let multi_child_trailing = if children.len() == 1 {
383        false
384    } else if children.len() == 2
385        && matches!(
386            first.kind(),
387            SyntaxKind::HTML_BLOCK | SyntaxKind::HTML_BLOCK_DIV
388        )
389        && children[1].kind() == SyntaxKind::PARAGRAPH
390    {
391        true
392    } else {
393        return false;
394    };
395
396    if first.kind() == SyntaxKind::HTML_BLOCK_DIV {
397        let html_block_tag_count = first
398            .children()
399            .filter(|c| c.kind() == SyntaxKind::HTML_BLOCK_TAG)
400            .count();
401        if html_block_tag_count < 2 {
402            return false;
403        }
404    }
405
406    let mut prefix_state = if prefixes.is_empty() {
407        None
408    } else {
409        Some(LinePrefixState {
410            prefixes,
411            line_idx: 0,
412            at_line_start: true,
413        })
414    };
415    if multi_child_trailing {
416        graft_node(builder, first, &mut prefix_state);
417        let trailing_kind = if use_paragraph {
418            SyntaxKind::PARAGRAPH
419        } else {
420            SyntaxKind::PLAIN
421        };
422        graft_node_retag_root(builder, &children[1], &mut prefix_state, trailing_kind);
423    } else {
424        graft_node(builder, first, &mut prefix_state);
425    }
426    true
427}
428
429fn graft_node_retag_root(
430    builder: &mut GreenNodeBuilder<'static>,
431    node: &SyntaxNode,
432    prefix: &mut Option<LinePrefixState>,
433    new_kind: SyntaxKind,
434) {
435    builder.start_node(new_kind.into());
436    for child in node.children_with_tokens() {
437        match child {
438            rowan::NodeOrToken::Node(n) => graft_node(builder, &n, prefix),
439            rowan::NodeOrToken::Token(t) => {
440                emit_grafted_token(builder, t.kind(), t.text(), prefix);
441            }
442        }
443    }
444    builder.finish_node();
445}
446
/// Per-line indent-prefix state for the list-item HTML-block lift.
/// `prefixes[i]` is the leading-space bytes stripped from source line
/// `i` of the buffer text before the inner reparse. During graft these
/// are re-emitted as `WHITESPACE` tokens at the start of each line so
/// the CST stays byte-equal to source. Mirrors the `BqPrefixState`
/// pattern in `parser/blocks/html_blocks.rs` (which handles
/// `BLOCK_QUOTE_MARKER` + `WHITESPACE` re-injection for bq-wrapped
/// HTML lifts).
struct LinePrefixState {
    /// Stripped leading bytes, one entry per source line (empty when
    /// nothing was stripped from that line).
    prefixes: Vec<String>,
    /// Index into `prefixes` for the line currently being grafted; advanced
    /// after each `NEWLINE` / `BLANK_LINE` token.
    line_idx: usize,
    /// True when the next grafted token is the first one on its line, i.e.
    /// the current line's prefix has not been emitted yet.
    at_line_start: bool,
}
460
/// Remove list-item indentation from the continuation lines of `text`.
///
/// Every line after the first loses its leading spaces/tabs up to
/// `content_col` columns; the first line is copied verbatim because its
/// leading columns belong to the list marker and the spaces after it.
/// A tab advances to the next multiple of four columns and is only removed
/// when it fits entirely within `content_col`.
///
/// Returns the stripped text together with one prefix per line (empty for
/// the first line) holding exactly the bytes that were removed, so the
/// caller can re-inject them and stay lossless.
fn strip_list_item_indent(text: &str, content_col: usize) -> (String, Vec<String>) {
    let mut out = String::with_capacity(text.len());
    let mut removed: Vec<String> = Vec::new();

    let mut lines = text.split_inclusive('\n');
    if let Some(head) = lines.next() {
        removed.push(String::new());
        out.push_str(head);
    }

    for line in lines {
        // `width` is the visual column reached, `cut` the byte count removed.
        let mut width = 0usize;
        let mut cut = 0usize;
        for byte in line.bytes() {
            if width >= content_col {
                break;
            }
            let advanced = match byte {
                b' ' => width + 1,
                b'\t' => (width / 4 + 1) * 4,
                _ => break,
            };
            if advanced > content_col {
                break;
            }
            width = advanced;
            cut += 1;
        }
        // Only ASCII bytes were counted, so `cut` is a valid char boundary.
        let (indent, rest) = line.split_at(cut);
        removed.push(indent.to_string());
        out.push_str(rest);
    }

    (out, removed)
}
502
503fn graft_node(
504    builder: &mut GreenNodeBuilder<'static>,
505    node: &SyntaxNode,
506    prefix: &mut Option<LinePrefixState>,
507) {
508    builder.start_node(node.kind().into());
509    for child in node.children_with_tokens() {
510        match child {
511            rowan::NodeOrToken::Node(n) => graft_node(builder, &n, prefix),
512            rowan::NodeOrToken::Token(t) => {
513                emit_grafted_token(builder, t.kind(), t.text(), prefix);
514            }
515        }
516    }
517    builder.finish_node();
518}
519
520fn emit_grafted_token(
521    builder: &mut GreenNodeBuilder<'static>,
522    kind: SyntaxKind,
523    text: &str,
524    prefix: &mut Option<LinePrefixState>,
525) {
526    if let Some(state) = prefix.as_mut() {
527        if state.at_line_start {
528            if let Some(p) = state.prefixes.get(state.line_idx)
529                && !p.is_empty()
530            {
531                builder.token(SyntaxKind::WHITESPACE.into(), p);
532            }
533            state.at_line_start = false;
534        }
535        builder.token(kind.into(), text);
536        if kind == SyntaxKind::NEWLINE || kind == SyntaxKind::BLANK_LINE {
537            state.line_idx += 1;
538            state.at_line_start = true;
539        }
540    } else {
541        builder.token(kind.into(), text);
542    }
543}
544
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_new_buffer_is_empty() {
        let buffer = ListItemBuffer::new();
        assert!(buffer.is_empty());
        assert_eq!(buffer.segment_count(), 0);
        assert_eq!(buffer.first_text(), None);
        assert!(!buffer.has_blank_lines_between_content());
    }

    #[test]
    fn test_push_single_text() {
        let mut buffer = ListItemBuffer::new();
        buffer.push_text("Hello, world!");
        assert!(!buffer.is_empty());
        assert_eq!(buffer.segment_count(), 1);
        assert_eq!(buffer.first_text(), Some("Hello, world!"));
        assert!(!buffer.has_blank_lines_between_content());
        assert_eq!(buffer.get_text_for_parsing(), "Hello, world!");
    }

    #[test]
    fn test_push_multiple_text_segments() {
        let mut buffer = ListItemBuffer::new();
        buffer.push_text("Line 1\n");
        buffer.push_text("Line 2\n");
        buffer.push_text("Line 3");
        assert_eq!(buffer.segment_count(), 3);
        assert_eq!(buffer.get_text_for_parsing(), "Line 1\nLine 2\nLine 3");
    }

    #[test]
    fn test_clear_buffer() {
        let mut buffer = ListItemBuffer::new();
        buffer.push_text("Some text");
        assert!(!buffer.is_empty());

        buffer.clear();
        assert!(buffer.is_empty());
        assert_eq!(buffer.get_text_for_parsing(), "");
    }

    #[test]
    fn test_empty_text_ignored() {
        let mut buffer = ListItemBuffer::new();
        buffer.push_text("");
        assert!(buffer.is_empty());
    }

    // Markers count as segments but contribute nothing to the parse text.
    #[test]
    fn test_blockquote_marker_excluded_from_text() {
        let mut buffer = ListItemBuffer::new();
        buffer.push_text("a\n");
        buffer.push_blockquote_marker(0, true);
        buffer.push_text("b\n");
        assert_eq!(buffer.segment_count(), 3);
        assert_eq!(buffer.first_text(), Some("a\n"));
        assert_eq!(buffer.get_text_for_parsing(), "a\nb\n");
    }

    #[test]
    fn test_first_text_none_when_marker_first() {
        let mut buffer = ListItemBuffer::new();
        buffer.push_blockquote_marker(2, false);
        assert!(!buffer.is_empty());
        assert_eq!(buffer.first_text(), None);
        assert_eq!(buffer.get_text_for_parsing(), "");
    }

    // Only continuation lines lose indent, and never more than content_col.
    #[test]
    fn test_strip_list_item_indent_spaces() {
        let (stripped, prefixes) = strip_list_item_indent("<div>\n  foo\n    bar\n", 2);
        assert_eq!(stripped, "<div>\nfoo\n  bar\n");
        assert_eq!(prefixes, vec!["", "  ", "  "]);
    }

    // A tab is removed only when it fits entirely within content_col.
    #[test]
    fn test_strip_list_item_indent_tabs() {
        let (stripped, prefixes) = strip_list_item_indent("a\n\tb\n", 4);
        assert_eq!(stripped, "a\nb\n");
        assert_eq!(prefixes, vec!["", "\t"]);

        let (stripped, prefixes) = strip_list_item_indent("a\n\tb\n", 2);
        assert_eq!(stripped, "a\n\tb\n");
        assert_eq!(prefixes, vec!["", ""]);
    }
}
panache_parser/parser/utils/list_item_buffer.rs

panache_parser/parser/utils/
list_item_buffer.rs