Skip to main content

panache_parser/parser/yaml/
parser.rs

1//! YAML parser core: orchestrator + streaming-token-to-CST builder.
2//!
3//! Two layers live in this module:
4//!
5//! 1. **Orchestrator** — [`parse_yaml_tree`] and [`parse_yaml_report`].
6//!    These drive [`parse_stream`] for a pure-YAML parse rooted at
7//!    `YAML_STREAM`, run the structural
8//!    [`super::validator::validate_yaml`] pass, and surface diagnostics.
9//!    Host envelope wrappers (`DOCUMENT`, `YAML_METADATA_CONTENT`,
10//!    `HASHPIPE_YAML_CONTENT`) are added by the host parser at embedding
11//!    sites and are not concerns of the standalone YAML parse path.
12//!
13//! 2. **Streaming parser** — [`parse_stream`] drives
14//!    [`super::scanner::Scanner`] and emits the rowan green tree. Each
15//!    contiguous run of body content is wrapped in a `YAML_DOCUMENT`
16//!    node (with `---` / `...` markers consumed inside the document
17//!    they delimit); block-context content nests under `YAML_BLOCK_MAP`
18//!    / `YAML_BLOCK_SEQUENCE` containers driven by the scanner's
19//!    synthetic `BlockMappingStart` / `BlockSequenceStart` / `BlockEnd`
20//!    markers; each key-value pair is wrapped in
21//!    `YAML_BLOCK_MAP_ENTRY`, each `-` entry in
22//!    `YAML_BLOCK_SEQUENCE_ITEM`, and each map entry splits into
23//!    `YAML_BLOCK_MAP_KEY` (everything up to and including the `:`) and
24//!    `YAML_BLOCK_MAP_VALUE` (everything after). Flow contexts mirror
25//!    the same shape: `YAML_FLOW_MAP` / `YAML_FLOW_MAP_ENTRY` /
26//!    `YAML_FLOW_MAP_KEY` / `YAML_FLOW_MAP_VALUE` and
27//!    `YAML_FLOW_SEQUENCE` / `YAML_FLOW_SEQUENCE_ITEM`. Source-backed
28//!    `[` / `]` / `{` / `}` / `,` are emitted at the container level
29//!    (siblings of items), with item/entry sub-wrappers closing on `,`
30//!    and the matching closer.
31
32#![allow(dead_code)]
33
34use crate::syntax::{SyntaxKind, SyntaxNode};
35use rowan::GreenNodeBuilder;
36
37use super::model::{YamlDiagnostic, YamlParseReport};
38use super::profile::YamlValidationContext;
39use super::scanner::{Scanner, TokenKind, TriviaKind};
40
/// Remove the per-line `prefix` marker — and at most one space directly
/// after it — from each line of `input`, re-joining the lines with `\n`.
/// Lines that do not start with `prefix` are kept verbatim. The result is
/// the plain-YAML baseline that a prefix-aware parse is validated against
/// (see [`validate_yaml_with_prefix`]).
fn strip_line_prefix(input: &str, prefix: &str) -> String {
    let mut out = String::new();
    for (idx, line) in input.lines().enumerate() {
        // Separator goes *between* lines only, mirroring a `join("\n")`.
        if idx != 0 {
            out.push('\n');
        }
        let payload = line
            .strip_prefix(prefix)
            .map_or(line, |rest| rest.strip_prefix(' ').unwrap_or(rest));
        out.push_str(payload);
    }
    out
}
55
56/// Structural validation for embedded (prefixed) YAML. Strips the
57/// per-line `prefix` to the plain-YAML baseline and runs the standard
58/// [`super::validator::validate_yaml`] pass. The verdict matches the
59/// stripped baseline; diagnostic offsets refer to the stripped text
60/// (host-offset remapping is a later concern). An empty `prefix` is
61/// plain validation.
62pub fn validate_yaml_with_prefix(input: &str, prefix: &str) -> Option<YamlDiagnostic> {
63    if prefix.is_empty() {
64        return super::validator::validate_yaml(input);
65    }
66    super::validator::validate_yaml(&strip_line_prefix(input, prefix))
67}
68
/// Strip the per-line `prefix` exactly as [`strip_line_prefix`] does, and
/// additionally return an offset table: entry `i` is the byte offset within
/// `input` that byte `i` of the stripped text came from, followed by one
/// trailing EOF entry equal to `input.len()`.
///
/// The text and the table are built in the same loop so they cannot drift
/// apart; a diagnostic offset expressed in stripped coordinates can therefore
/// be looked up directly to find its position in the original prefixed
/// region. Callers are expected to pass a non-empty `prefix` (empty-prefix
/// validation never strips).
fn strip_line_prefix_with_offsets(input: &str, prefix: &str) -> (String, Vec<usize>) {
    // The stripped text is never longer than `input`, and the table holds
    // one entry per stripped byte plus the EOF entry.
    let mut stripped = String::with_capacity(input.len());
    let mut offsets = Vec::with_capacity(input.len() + 1);
    // Byte offset of the current line's first byte within `input`.
    let mut line_off = 0usize;
    for raw in input.split_inclusive('\n') {
        // Same line splitting as `str::lines`: drop a trailing `\n`, and a
        // `\r` only when it directly precedes that `\n`.
        let line = match raw.strip_suffix('\n') {
            Some(body) => body.strip_suffix('\r').unwrap_or(body),
            None => raw,
        };
        // Every piece from `split_inclusive` is non-empty, so `line_off > 0`
        // holds exactly for the second and later lines.
        if line_off > 0 {
            // The join `\n` maps to the original `\n` that ended the
            // previous line, i.e. the byte just before this line.
            offsets.push(line_off - 1);
            stripped.push('\n');
        }
        let payload = match line.strip_prefix(prefix) {
            Some(rest) => rest.strip_prefix(' ').unwrap_or(rest),
            None => line,
        };
        // Only a leading marker (+ optional space) is removed, so the
        // payload is a suffix of `line` and starts this far into it.
        let payload_off = line_off + (line.len() - payload.len());
        offsets.extend(payload_off..payload_off + payload.len());
        stripped.push_str(payload);
        line_off += raw.len();
    }
    offsets.push(input.len());
    (stripped, offsets)
}
100
101/// Locate a structural YAML diagnostic in `input` (raw, possibly `prefix`-marked),
102/// returning the diagnostic plus the byte range **within `input`** it covers
103/// (start, end). An empty `prefix` is plain YAML with identity offsets. Returns
104/// `None` when `input` is valid — the verdict matches
105/// [`validate_yaml_with_prefix`]. The host parser adds the region's document
106/// start to emit a host-ranged `SyntaxError` for malformed embedded YAML.
107pub fn locate_yaml_diagnostic(input: &str, prefix: &str) -> Option<(YamlDiagnostic, usize, usize)> {
108    locate_yaml_diagnostic_ctx(input, prefix, YamlValidationContext::substrate())
109}
110
111/// Like [`locate_yaml_diagnostic`], but validates under `ctx` — the
112/// (flavor, location) consumer profile for this YAML region. The host parser
113/// builds `ctx` from the document flavor and whether the region is frontmatter
114/// or a hashpipe `#|` body, so consumer-only rejections (implicit empty keys,
115/// duplicate keys) surface for the parsers that actually read the YAML.
116pub fn locate_yaml_diagnostic_ctx(
117    input: &str,
118    prefix: &str,
119    ctx: YamlValidationContext,
120) -> Option<(YamlDiagnostic, usize, usize)> {
121    if prefix.is_empty() {
122        let diag = super::validator::validate_yaml_with_context(input, ctx)?;
123        let start = diag.byte_start.min(input.len());
124        let end = diag.byte_end.min(input.len()).max(start);
125        return Some((diag, start, end));
126    }
127    // Validate cheaply first (no offset table) — the common, valid path returns
128    // here. Only build the lockstep offset map when there's actually a
129    // diagnostic to locate.
130    let diag =
131        super::validator::validate_yaml_with_context(&strip_line_prefix(input, prefix), ctx)?;
132    let (_stripped, offsets) = strip_line_prefix_with_offsets(input, prefix);
133    let start = offsets.get(diag.byte_start).copied().unwrap_or(input.len());
134    let end = offsets
135        .get(diag.byte_end)
136        .copied()
137        .unwrap_or(input.len())
138        .max(start);
139    Some((diag, start, end))
140}
141
142/// Parse YAML tree structure from input, or `None` if it fails to parse.
143pub fn parse_yaml_tree(input: &str) -> Option<SyntaxNode> {
144    parse_yaml_report(input).tree
145}
146
147/// Parse YAML tree structure and include diagnostics on failure.
148///
149/// Diagnostics flow through the structural
150/// [`super::validator::validate_yaml`] pass, which composes per-cluster
151/// `check_*` functions covering directive ordering, structural shape
152/// (unterminated flow, trailing content, invalid keys, indent
153/// anomalies, block-scalar header, etc.), and lex-level checks like
154/// `LEX_INVALID_DOUBLE_QUOTED_ESCAPE`.
155///
156/// The returned tree, when present, comes from the streaming scanner
157/// and builder.
158pub fn parse_yaml_report(input: &str) -> YamlParseReport {
159    if let Some(err) = super::validator::validate_yaml(input) {
160        return YamlParseReport {
161            tree: None,
162            diagnostics: vec![err],
163        };
164    }
165
166    YamlParseReport {
167        tree: Some(parse_stream(input)),
168        diagnostics: Vec::new(),
169    }
170}
171
172/// Drive the scanner over `input` and build a CST. Always returns a
173/// `SyntaxNode` — the scanner is permissive and the builder preserves
174/// bytes regardless of well-formedness.
175pub fn parse_stream(input: &str) -> SyntaxNode {
176    parse_stream_inner(input, None)
177}
178
179/// Like [`parse_stream`], but treats `prefix` (e.g. hashpipe `"#|"`) as
180/// an embedded-YAML per-line marker: the scanner excludes it from
181/// column/indent accounting and the builder peels it into
182/// `YAML_LINE_PREFIX` leaves, so the resulting CST's token ranges are
183/// host ranges directly (prefix bytes included as trivia, no offset
184/// remapping). An empty `prefix` behaves like [`parse_stream`].
185///
186/// `prefix` may be a *composite* marker — a container prefix prepended to
187/// `#|` (e.g. `"   #|"` for a list-indented cell, `"> #|"` for a blockquoted
188/// one). Within a hashpipe preamble the container prefix is uniform per line,
189/// so matching the whole composite marker via `strip_prefix` parses a nested
190/// cell identically to a top-level one, peeling the entire prefix into one
191/// `YAML_LINE_PREFIX` leaf. The host computes this marker (see
192/// `parse_fenced_code_block`).
193pub fn parse_stream_with_prefix(input: &str, prefix: &str) -> SyntaxNode {
194    parse_stream_inner(input, (!prefix.is_empty()).then_some(prefix))
195}
196
/// Shared implementation behind [`parse_stream`] and
/// [`parse_stream_with_prefix`].
///
/// Opens a `YAML_STREAM` root, pulls tokens from the scanner one at a time
/// and feeds them to a `GreenNodeBuilder`, then closes whatever is still
/// open and returns the finished tree. `line_prefix`, when `Some`, selects
/// `Scanner::with_prefix` and is forwarded to `emit_scalar_node`; `None`
/// uses the plain `Scanner::new`.
///
/// Each loop iteration runs in two phases:
///
/// 1. A structural `match` on the token kind that opens/closes container
///    nodes and their ENTRY/ITEM/KEY/VALUE sub-wrappers. Arms that
///    `continue` have fully handled the token; the rest fall through.
/// 2. An emission `match` that writes the token's source bytes into the
///    currently open node and manages `YAML_DOCUMENT` boundaries.
fn parse_stream_inner(input: &str, line_prefix: Option<&str>) -> SyntaxNode {
    let mut builder = GreenNodeBuilder::new();
    builder.start_node(SyntaxKind::YAML_STREAM.into());
    let mut scanner = match line_prefix {
        Some(prefix) => Scanner::with_prefix(input, prefix),
        None => Scanner::new(input),
    };
    // Whether a YAML_DOCUMENT node is currently open in the builder.
    let mut doc_open = false;
    // True when the open YAML_DOCUMENT has only seen directives + trivia
    // (no body content yet, no `---`). YAML 1.2 says directives belong to
    // the document the following `---` opens, so when DocumentStart
    // arrives in this state the marker stays inside the same document
    // rather than splitting it. Cleared as soon as any non-directive
    // body content lands.
    let mut doc_only_has_directives = false;
    // Stack of currently-open block containers. Each frame tracks
    // whether its current `YAML_BLOCK_MAP_ENTRY` / `YAML_BLOCK_SEQUENCE_ITEM`
    // sub-wrapper is still open and waiting to be closed (by the next
    // `Key` / `BlockEntry` peer or by `BlockEnd`).
    let mut block_stack: Vec<BlockFrame> = Vec::new();
    // Kind of the last non-trivia, non-stream-marker, non-decoration
    // token emitted. An indentless block sequence is only valid when
    // its `-` directly follows the map entry's `:` (the value is
    // otherwise empty), so the `BlockEntry` handler consults this to
    // tell RLU9 (`foo:\n- 42`, value is purely the sequence) apart from
    // G9HC (`seq:\n&anchor\n- a` with the anchor at column 0 — an
    // error the validator must still catch on the unwrapped shape).
    // Anchor / Tag / Alias tokens are *decorations* of the next node
    // and don't fill the empty-value slot; they're skipped here so a
    // value-leading decoration still permits an indentless sequence
    // (SKE5: `seq:\n &anchor\n- a`).
    let mut prev_significant: Option<TokenKind> = None;
    // Smallest column among Anchor/Tag/Alias decorations seen since the
    // last value-filling token. The indentless detector uses this to
    // distinguish SKE5 (decoration indented past parent → wrap) from
    // G9HC (decoration at parent indent → leave unwrapped for the
    // validator). `None` when no decoration is pending.
    let mut decoration_col_floor: Option<usize> = None;
    while let Some(tok) = scanner.next_token() {
        // Snapshot the trackers *before* this token updates them: the
        // `BlockEntry` arm below needs to know what preceded the `-`,
        // not the `-` itself.
        let last_significant = prev_significant;
        let decorations_so_far = decoration_col_floor;
        let is_decoration = matches!(
            tok.kind,
            TokenKind::Anchor | TokenKind::Tag | TokenKind::Alias
        );
        // Update the trackers. Trivia and stream markers leave both
        // untouched; a decoration only lowers the column floor; any other
        // token becomes the new "last significant" and clears the floor.
        if !matches!(
            tok.kind,
            TokenKind::Trivia(_) | TokenKind::StreamStart | TokenKind::StreamEnd
        ) {
            if is_decoration {
                decoration_col_floor = Some(
                    decoration_col_floor.map_or(tok.start.column, |c| c.min(tok.start.column)),
                );
            } else {
                prev_significant = Some(tok.kind);
                decoration_col_floor = None;
            }
        }
        // Phase 1: structural bookkeeping. Arms ending in `continue` have
        // fully handled the token; the others fall through to phase 2,
        // which emits the token's bytes.
        match tok.kind {
            // Stream markers produce no node and no token of their own.
            TokenKind::StreamStart | TokenKind::StreamEnd => continue,
            TokenKind::BlockMappingStart => {
                ensure_doc_open(&mut builder, &mut doc_open);
                doc_only_has_directives = false;
                ensure_flow_seq_item_open(&mut builder, &mut block_stack);
                builder.start_node(SyntaxKind::YAML_BLOCK_MAP.into());
                block_stack.push(BlockFrame::BlockMap {
                    entry_open: false,
                    in_value: false,
                });
                continue;
            }
            TokenKind::BlockSequenceStart => {
                ensure_doc_open(&mut builder, &mut doc_open);
                doc_only_has_directives = false;
                ensure_flow_seq_item_open(&mut builder, &mut block_stack);
                builder.start_node(SyntaxKind::YAML_BLOCK_SEQUENCE.into());
                block_stack.push(BlockFrame::BlockSequence {
                    item_open: false,
                    indentless: false,
                });
                continue;
            }
            TokenKind::BlockEnd => {
                // Indentless sequences have no scanner BlockEnd of their
                // own, so a BlockEnd arriving while one is on top is meant
                // for the real container beneath it. Close the indentless
                // frame(s) first, then consume the BlockEnd normally.
                close_indentless_sequences(&mut builder, &mut block_stack);
                close_open_sub_wrapper(&mut builder, &mut block_stack);
                // Defensive: only close if the scanner gave us an open
                // container. A stray BlockEnd would otherwise pop the
                // YAML_DOCUMENT or YAML_STREAM frame.
                if block_stack.pop().is_some() {
                    builder.finish_node();
                }
                continue;
            }
            TokenKind::FlowSequenceStart => {
                ensure_doc_open(&mut builder, &mut doc_open);
                doc_only_has_directives = false;
                ensure_flow_seq_item_open(&mut builder, &mut block_stack);
                // If nested inside a Map's open KEY/VALUE wrapper, the
                // current open scope is the appropriate parent.
                builder.start_node(SyntaxKind::YAML_FLOW_SEQUENCE.into());
                block_stack.push(BlockFrame::FlowSequence { item_open: false });
                let text = &input[tok.start.index..tok.end.index];
                builder.token(SyntaxKind::YAML_FLOW_INDICATOR.into(), text);
                continue;
            }
            TokenKind::FlowSequenceEnd => {
                close_open_sub_wrapper(&mut builder, &mut block_stack);
                let text = &input[tok.start.index..tok.end.index];
                builder.token(SyntaxKind::YAML_FLOW_INDICATOR.into(), text);
                // The closer pops whichever flow container is on top —
                // sequence or map — and is emitted without popping when
                // the top frame is not a flow container.
                if matches!(
                    block_stack.last(),
                    Some(BlockFrame::FlowSequence { .. } | BlockFrame::FlowMap { .. })
                ) {
                    block_stack.pop();
                    builder.finish_node();
                }
                continue;
            }
            TokenKind::FlowMappingStart => {
                ensure_doc_open(&mut builder, &mut doc_open);
                doc_only_has_directives = false;
                ensure_flow_seq_item_open(&mut builder, &mut block_stack);
                builder.start_node(SyntaxKind::YAML_FLOW_MAP.into());
                block_stack.push(BlockFrame::FlowMap {
                    entry_open: false,
                    in_value: false,
                });
                let text = &input[tok.start.index..tok.end.index];
                builder.token(SyntaxKind::YAML_FLOW_INDICATOR.into(), text);
                continue;
            }
            TokenKind::FlowMappingEnd => {
                close_open_sub_wrapper(&mut builder, &mut block_stack);
                let text = &input[tok.start.index..tok.end.index];
                builder.token(SyntaxKind::YAML_FLOW_INDICATOR.into(), text);
                // Same popping rule as `FlowSequenceEnd`: either kind of
                // flow container on top is closed by this token.
                if matches!(
                    block_stack.last(),
                    Some(BlockFrame::FlowMap { .. } | BlockFrame::FlowSequence { .. })
                ) {
                    block_stack.pop();
                    builder.finish_node();
                }
                continue;
            }
            TokenKind::FlowEntry => {
                // `,` closes the current entry/item and lives at the
                // container level (between peer entries/items).
                close_open_sub_wrapper(&mut builder, &mut block_stack);
                let text = &input[tok.start.index..tok.end.index];
                builder.token(SyntaxKind::YAML_FLOW_INDICATOR.into(), text);
                continue;
            }
            TokenKind::Key => {
                // A `Key` at the parent map's level terminates any
                // open indentless sequence value first, revealing the
                // map frame below.
                close_indentless_sequences(&mut builder, &mut block_stack);
                // Both the synthetic 0-width splice and the source-backed
                // `?` indicator open a new map entry. Close the previous
                // entry first if still open. After this, the current
                // open scope is the new key wrapper.
                if matches!(
                    block_stack.last(),
                    Some(BlockFrame::BlockMap { .. } | BlockFrame::FlowMap { .. })
                ) {
                    open_map_entry_with_key(&mut builder, &mut block_stack);
                }
                if tok.start.index == tok.end.index {
                    // Synthetic Key splice carries no bytes.
                    continue;
                }
                // Source-backed `?`: ensure we have somewhere to put it.
                ensure_flow_seq_item_open(&mut builder, &mut block_stack);
                // Fall through to emit `?` inside the open KEY (or
                // current scope if not in a Map frame).
            }
            TokenKind::Value => {
                // An empty-key `:` at the parent map's level likewise
                // terminates an open indentless sequence value first.
                close_indentless_sequences(&mut builder, &mut block_stack);
                // Copy out the top map frame's state as
                // `(is_flow, entry_open, in_value)`; `None` when the top
                // frame is not a map (or the stack is empty).
                let map_state = match block_stack.last().copied() {
                    Some(BlockFrame::BlockMap {
                        entry_open,
                        in_value,
                    }) => Some((false, entry_open, in_value)),
                    Some(BlockFrame::FlowMap {
                        entry_open,
                        in_value,
                    }) => Some((true, entry_open, in_value)),
                    _ => None,
                };
                if let Some((is_flow, mut entry_open, mut in_value)) = map_state {
                    // A bare `:` arriving while the current block-map
                    // entry is already in its VALUE phase starts a NEW
                    // entry whose key is empty (`: a\n: b`, 2JQS/S3PD) —
                    // not a double-colon inside that value. The scanner's
                    // indent machinery guarantees we only reach here for a
                    // peer at the map's column (a deeper colon rolls a
                    // fresh BlockMappingStart; a shallower one unwinds with
                    // BlockEnd first), so close the current entry and fall
                    // through to open the new one. Flow maps separate
                    // entries with `,`, which already closes the entry, so
                    // their in_value is false here — leave them alone.
                    if !is_flow && entry_open && in_value {
                        close_open_sub_wrapper(&mut builder, &mut block_stack);
                        entry_open = false;
                        in_value = false;
                    }
                    // Empty-key shorthand: `:` arriving without a prior
                    // Key opens an ENTRY+KEY before consuming the colon.
                    if !entry_open {
                        open_map_entry_with_key(&mut builder, &mut block_stack);
                    }
                    if !in_value {
                        // The colon is the last token of KEY. After it
                        // we close KEY and open VALUE.
                        let text = &input[tok.start.index..tok.end.index];
                        if !text.is_empty() {
                            builder.token(SyntaxKind::YAML_COLON.into(), text);
                        }
                        builder.finish_node(); // close KEY
                        let value_kind = if is_flow {
                            SyntaxKind::YAML_FLOW_MAP_VALUE
                        } else {
                            SyntaxKind::YAML_BLOCK_MAP_VALUE
                        };
                        builder.start_node(value_kind.into());
                        // Record the KEY → VALUE transition on the frame
                        // itself; the locals above are only copies.
                        if let Some(
                            BlockFrame::BlockMap { in_value, .. }
                            | BlockFrame::FlowMap { in_value, .. },
                        ) = block_stack.last_mut()
                        {
                            *in_value = true;
                        }
                        continue;
                    }
                    // Already in_value: pathological double-colon. Fall
                    // through and emit at the current scope (inside
                    // VALUE) for losslessness.
                }
                // Not a Map frame: ensure flow-seq ITEM is open, then
                // fall through to emit `:` at current scope.
                ensure_flow_seq_item_open(&mut builder, &mut block_stack);
            }
            TokenKind::BlockEntry => {
                // An indentless sequence opens when a `-` lands directly
                // in a block-map VALUE: the scanner pushed no indent
                // level (the `-` is at the parent key's column), so no
                // `BlockSequenceStart` arrived. Synthesize the
                // `YAML_BLOCK_SEQUENCE` frame inside the open VALUE so the
                // tree matches the indented form (spec 8.2.1). Only when
                // the `:` is the last significant token — i.e. the value
                // is otherwise empty; a `-` after scalar content in the
                // value is a structural error left unwrapped for the
                // validator to reject.
                // Decorations between `:` and `-` are allowed only when
                // they sit inside the value scope — strictly indented
                // past the indentless `-`. Otherwise the anchor is at
                // the parent mapping's level (G9HC) and the sequence
                // shouldn't wrap.
                let decorations_inside_value =
                    decorations_so_far.is_none_or(|c| c > tok.start.column);
                let indentless_value = last_significant == Some(TokenKind::Value)
                    && matches!(
                        block_stack.last(),
                        Some(BlockFrame::BlockMap { in_value: true, .. })
                    )
                    && decorations_inside_value;
                // The mirror case: a `-` landing directly after the `?`
                // explicit-key indicator opens an indentless sequence as
                // the KEY's content (6PBE). The scanner likewise pushes no
                // indent level, so synthesize the `YAML_BLOCK_SEQUENCE`
                // inside the open KEY. `close_indentless_sequences` later
                // pops it when the entry's `:` (`Value`) arrives.
                let indentless_key = last_significant == Some(TokenKind::Key)
                    && matches!(
                        block_stack.last(),
                        Some(BlockFrame::BlockMap {
                            entry_open: true,
                            in_value: false,
                        })
                    )
                    && decorations_inside_value;
                if indentless_value || indentless_key {
                    builder.start_node(SyntaxKind::YAML_BLOCK_SEQUENCE.into());
                    block_stack.push(BlockFrame::BlockSequence {
                        item_open: false,
                        indentless: true,
                    });
                }
                // With a sequence frame on top (scanner-opened or just
                // synthesized), close the previous ITEM if any and open a
                // fresh one for this `-`.
                if matches!(block_stack.last(), Some(BlockFrame::BlockSequence { .. })) {
                    close_open_sub_wrapper(&mut builder, &mut block_stack);
                    builder.start_node(SyntaxKind::YAML_BLOCK_SEQUENCE_ITEM.into());
                    if let Some(BlockFrame::BlockSequence { item_open, .. }) =
                        block_stack.last_mut()
                    {
                        *item_open = true;
                    }
                }
                // Fall through to emit the `-` byte inside the new ITEM
                // (or at current scope if not in a Sequence frame).
            }
            TokenKind::Trivia(_) => {
                // Trivia bypasses item-opening: pre-content trivia in a
                // flow sequence stays at SEQUENCE level.
            }
            _ => {
                // Any other source-backed content (Scalar, Anchor, Tag,
                // Alias, Directive, doc markers): if we're inside a
                // FlowSequence with no open ITEM, open one before
                // emitting. Doc markers are handled below.
                if !matches!(tok.kind, TokenKind::DocumentStart | TokenKind::DocumentEnd) {
                    ensure_flow_seq_item_open(&mut builder, &mut block_stack);
                }
            }
        }
        // Phase 2: emit the token's source bytes. Only tokens that fell
        // through phase 1 reach this point.
        let text = &input[tok.start.index..tok.end.index];
        if text.is_empty() {
            // Defensive: never emit zero-width tokens (rowan rejects).
            continue;
        }
        let kind = map_token_to_syntax_kind(tok.kind);
        match tok.kind {
            TokenKind::DocumentStart => {
                // `---` begins a fresh document. Two cases:
                //  - The currently-open document only has directives so
                //    far: per YAML 1.2 the directives belong to the doc
                //    that this `---` opens. Stay inside, just emit the
                //    marker.
                //  - Otherwise: close the previous doc (and any open
                //    block containers) and open a new YAML_DOCUMENT.
                //    The scanner unwinds the indent stack at column 0,
                //    but a same-indent map at indent==0 leaves them
                //    open, so close them defensively.
                if doc_open && doc_only_has_directives {
                    builder.token(kind.into(), text);
                    doc_only_has_directives = false;
                } else {
                    close_block_containers(&mut builder, &mut block_stack);
                    if doc_open {
                        builder.finish_node();
                    }
                    builder.start_node(SyntaxKind::YAML_DOCUMENT.into());
                    doc_open = true;
                    doc_only_has_directives = false;
                    builder.token(kind.into(), text);
                }
            }
            TokenKind::DocumentEnd => {
                // `...` closes the current document. Close any open
                // block containers first so the marker is a child of
                // the document, not buried in a block container.
                close_block_containers(&mut builder, &mut block_stack);
                // A `...` with no document in progress still gets its own
                // YAML_DOCUMENT wrapper so the marker has a parent.
                if !doc_open {
                    builder.start_node(SyntaxKind::YAML_DOCUMENT.into());
                }
                builder.token(kind.into(), text);
                builder.finish_node();
                doc_open = false;
                doc_only_has_directives = false;
            }
            TokenKind::Trivia(_) => {
                // Trivia goes to whichever level is currently open;
                // pre-document trivia stays at YAML_STREAM, in-document
                // trivia stays inside the YAML_DOCUMENT, the open
                // block container, or the open ENTRY/ITEM sub-wrapper.
                builder.token(kind.into(), text);
            }
            TokenKind::Directive => {
                // Directives belong inside a YAML_DOCUMENT but don't by
                // themselves count as body content — a following `---`
                // should not split into a separate doc.
                let was_open = doc_open;
                ensure_doc_open(&mut builder, &mut doc_open);
                // Only a directive that *opened* the document marks it
                // directives-only; one arriving in an already-open
                // document leaves the flag as it was.
                if !was_open {
                    doc_only_has_directives = true;
                }
                builder.token(kind.into(), text);
            }
            TokenKind::Scalar(_) => {
                // A scalar is emitted as a `YAML_SCALAR` *node* whose
                // leaves are the per-physical-line content fragments
                // (`YAML_SCALAR_TEXT`) interleaved with `NEWLINE` tokens.
                // The byte slice is unchanged, so this is lossless; the
                // node shape lets the formatter/LSP navigate scalar lines
                // (and, later, hashpipe line prefixes) as real structure.
                ensure_doc_open(&mut builder, &mut doc_open);
                doc_only_has_directives = false;
                emit_scalar_node(&mut builder, text, line_prefix);
            }
            _ => {
                // Any other non-trivia content (Anchor, Tag, Alias, ...)
                // opens an implicit document when one isn't already in
                // progress and counts as body content (clears the
                // directives-only flag).
                ensure_doc_open(&mut builder, &mut doc_open);
                doc_only_has_directives = false;
                builder.token(kind.into(), text);
            }
        }
    }
    // Close any open block containers (and their open ENTRY/ITEM
    // sub-wrappers) and the open document. The scanner emits BlockEnd
    // on stream end via `unwind_indent(-1)`, so this is normally a
    // no-op for `block_stack`; kept for safety against truncated
    // inputs and future scanner quirks.
    close_block_containers(&mut builder, &mut block_stack);
    if doc_open {
        builder.finish_node();
    }
    // Close the YAML_STREAM root opened at the top of the function.
    builder.finish_node();
    SyntaxNode::new_root(builder.finish())
}
614
/// One open container on the streaming builder's stack.
///
/// Block and flow containers carry the same shape of bookkeeping, but
/// they are built from different `SyntaxKind` variants and are closed
/// by different tokens: `BlockEnd` / dedent for block containers,
/// `]` / `}` / `,` for flow containers.
#[derive(Debug, Clone, Copy)]
enum BlockFrame {
    /// An open block mapping.
    BlockMap {
        /// `true` while the current entry sub-wrapper is still open.
        entry_open: bool,
        /// Which sub-sub-wrapper of the open entry is active: VALUE
        /// when `true`, KEY when `false`.
        in_value: bool,
    },
    /// An open block sequence.
    BlockSequence {
        /// `true` while the current item sub-wrapper is still open.
        item_open: bool,
        /// Marks an "indentless sequence" (YAML spec 8.2.1): a sequence
        /// opened as a block-map value whose `-` entries sit at the
        /// same column as the parent key. The scanner never pushes an
        /// indent level for such a sequence and therefore emits no
        /// matching `BlockEnd`, so the builder has to close the frame
        /// itself once the parent map's next `Key` / `Value` /
        /// `BlockEnd` arrives.
        indentless: bool,
    },
    /// An open flow mapping (`{ ... }`).
    FlowMap {
        /// `true` while the current entry sub-wrapper is still open.
        entry_open: bool,
        /// Which sub-sub-wrapper of the open entry is active: VALUE
        /// when `true`, KEY when `false`.
        in_value: bool,
    },
    /// An open flow sequence (`[ ... ]`).
    FlowSequence {
        /// `true` while the current item sub-wrapper is still open.
        item_open: bool,
    },
}
648
649fn ensure_doc_open(builder: &mut GreenNodeBuilder<'_>, doc_open: &mut bool) {
650    if !*doc_open {
651        builder.start_node(SyntaxKind::YAML_DOCUMENT.into());
652        *doc_open = true;
653    }
654}
655
656/// In a flow sequence, source-backed content opens a new
657/// `YAML_FLOW_SEQUENCE_ITEM` lazily — there is no `-` token to drive
658/// the boundary the way `BlockEntry` drives block sequences. Trivia
659/// arriving before the first item stays at the container level.
660fn ensure_flow_seq_item_open(builder: &mut GreenNodeBuilder<'_>, stack: &mut [BlockFrame]) {
661    if let Some(BlockFrame::FlowSequence { item_open }) = stack.last_mut()
662        && !*item_open
663    {
664        builder.start_node(SyntaxKind::YAML_FLOW_SEQUENCE_ITEM.into());
665        *item_open = true;
666    }
667}
668
669/// Open `<MAP>_ENTRY` > `<MAP>_KEY` for the next entry, closing any
670/// previously-open entry on the same Map frame. Caller must have
671/// verified the top frame is a Map (Block or Flow).
672fn open_map_entry_with_key(builder: &mut GreenNodeBuilder<'_>, stack: &mut [BlockFrame]) {
673    close_open_sub_wrapper(builder, stack);
674    let (entry_kind, key_kind) = match stack.last() {
675        Some(BlockFrame::BlockMap { .. }) => (
676            SyntaxKind::YAML_BLOCK_MAP_ENTRY,
677            SyntaxKind::YAML_BLOCK_MAP_KEY,
678        ),
679        Some(BlockFrame::FlowMap { .. }) => (
680            SyntaxKind::YAML_FLOW_MAP_ENTRY,
681            SyntaxKind::YAML_FLOW_MAP_KEY,
682        ),
683        _ => return,
684    };
685    builder.start_node(entry_kind.into());
686    builder.start_node(key_kind.into());
687    if let Some(
688        BlockFrame::BlockMap {
689            entry_open,
690            in_value,
691        }
692        | BlockFrame::FlowMap {
693            entry_open,
694            in_value,
695        },
696    ) = stack.last_mut()
697    {
698        *entry_open = true;
699        *in_value = false;
700    }
701}
702
703/// Close any indentless `YAML_BLOCK_SEQUENCE` frames on top of the
704/// stack. These have no matching scanner `BlockEnd`, so they're closed
705/// here when the parent map's next `Key` / `Value` / `BlockEnd` arrives.
706/// Closing the open ITEM, finishing the SEQUENCE node, and popping the
707/// frame reveals the parent map for the incoming token. Loops because
708/// the next token may close several levels, though in practice
709/// indentless frames never stack directly (they're always separated by
710/// a map frame).
711fn close_indentless_sequences(builder: &mut GreenNodeBuilder<'_>, stack: &mut Vec<BlockFrame>) {
712    while let Some(BlockFrame::BlockSequence {
713        indentless: true, ..
714    }) = stack.last()
715    {
716        close_open_sub_wrapper(builder, stack);
717        stack.pop();
718        builder.finish_node(); // close YAML_BLOCK_SEQUENCE
719    }
720}
721
722/// Close the top-of-stack frame's entry/item sub-wrapper if still open
723/// and clear the flag. For maps, this closes the inner KEY/VALUE
724/// node and the surrounding ENTRY. If we're closing while the entry
725/// is still in its KEY phase (i.e. the entry never received a `:`,
726/// e.g. a `?`-only explicit-key entry), an empty VALUE wrapper is
727/// inserted before the ENTRY closes so every ENTRY has the same
728/// `KEY + VALUE` child shape — the projection layer relies on that
729/// invariant. For sequences it closes the ITEM. Caller decides whether
730/// to also pop the frame itself.
731fn close_open_sub_wrapper(builder: &mut GreenNodeBuilder<'_>, stack: &mut [BlockFrame]) {
732    let Some(frame) = stack.last_mut() else {
733        return;
734    };
735    match frame {
736        BlockFrame::BlockMap {
737            entry_open: true,
738            in_value,
739        } => {
740            if *in_value {
741                builder.finish_node(); // close VALUE
742            } else {
743                builder.finish_node(); // close KEY
744                builder.start_node(SyntaxKind::YAML_BLOCK_MAP_VALUE.into());
745                builder.finish_node(); // empty VALUE for shape parity
746            }
747            builder.finish_node(); // close ENTRY
748            *frame = BlockFrame::BlockMap {
749                entry_open: false,
750                in_value: false,
751            };
752        }
753        BlockFrame::FlowMap {
754            entry_open: true,
755            in_value,
756        } => {
757            if *in_value {
758                builder.finish_node();
759            } else {
760                builder.finish_node();
761                builder.start_node(SyntaxKind::YAML_FLOW_MAP_VALUE.into());
762                builder.finish_node();
763            }
764            builder.finish_node();
765            *frame = BlockFrame::FlowMap {
766                entry_open: false,
767                in_value: false,
768            };
769        }
770        BlockFrame::BlockSequence {
771            item_open: true,
772            indentless,
773        } => {
774            let indentless = *indentless;
775            builder.finish_node();
776            *frame = BlockFrame::BlockSequence {
777                item_open: false,
778                indentless,
779            };
780        }
781        BlockFrame::FlowSequence { item_open: true } => {
782            builder.finish_node();
783            *frame = BlockFrame::FlowSequence { item_open: false };
784        }
785        _ => {}
786    }
787}
788
789fn close_block_containers(builder: &mut GreenNodeBuilder<'_>, stack: &mut Vec<BlockFrame>) {
790    while let Some(frame) = stack.pop() {
791        match frame {
792            BlockFrame::BlockMap {
793                entry_open: true,
794                in_value,
795            } => {
796                if in_value {
797                    builder.finish_node(); // close VALUE
798                } else {
799                    builder.finish_node(); // close KEY
800                    builder.start_node(SyntaxKind::YAML_BLOCK_MAP_VALUE.into());
801                    builder.finish_node();
802                }
803                builder.finish_node(); // close ENTRY
804            }
805            BlockFrame::FlowMap {
806                entry_open: true,
807                in_value,
808            } => {
809                if in_value {
810                    builder.finish_node();
811                } else {
812                    builder.finish_node();
813                    builder.start_node(SyntaxKind::YAML_FLOW_MAP_VALUE.into());
814                    builder.finish_node();
815                }
816                builder.finish_node();
817            }
818            BlockFrame::BlockSequence {
819                item_open: true, ..
820            }
821            | BlockFrame::FlowSequence { item_open: true } => {
822                builder.finish_node();
823            }
824            _ => {}
825        }
826        builder.finish_node();
827    }
828}
829
830/// Emit a scalar token's bytes as a `YAML_SCALAR` node whose leaves are
831/// the per-physical-line content fragments (`YAML_SCALAR_TEXT`)
832/// interleaved with `NEWLINE` leaves for the line breaks. Concatenating
833/// the leaves reproduces `text` exactly, so this is byte-lossless and the
834/// node's text range is unchanged. The node wrapper plus per-line
835/// fragmentation is what lets the formatter/LSP treat a scalar as real
836/// structure and is the seam a later step uses to interleave hashpipe
837/// line-prefix leaves (see the yaml-formatter cutover plan, step 2).
838fn emit_scalar_node(
839    builder: &mut GreenNodeBuilder<'static>,
840    text: &str,
841    line_prefix: Option<&str>,
842) {
843    builder.start_node(SyntaxKind::YAML_SCALAR.into());
844    emit_scalar_fragments(builder, text, line_prefix);
845    builder.finish_node();
846}
847
848/// Split a scalar's source `text` into per-physical-line leaves:
849/// `YAML_SCALAR_TEXT` content interleaved with `NEWLINE` line breaks
850/// (`\n`, `\r\n`, and lone `\r` each one `NEWLINE` leaf). When
851/// `line_prefix` is set, an embedded prefix at the start of each
852/// *continuation* line (the marker plus at most one trailing space,
853/// mirroring the scanner) is peeled into a leading `YAML_LINE_PREFIX`
854/// leaf. The first line never carries an embedded prefix — its line-start
855/// prefix was emitted as a separate `Trivia(LinePrefix)` token by the
856/// scanner before the scalar began. Empty content runs are skipped
857/// (rowan rejects zero-width tokens). The concatenation of all leaves
858/// equals `text` exactly, so the node stays byte-lossless.
859fn emit_scalar_fragments(
860    builder: &mut GreenNodeBuilder<'static>,
861    text: &str,
862    line_prefix: Option<&str>,
863) {
864    let bytes = text.as_bytes();
865    let mut i = 0;
866    let mut line_index = 0usize;
867    while i < bytes.len() {
868        // Peel an embedded line prefix on continuation lines only.
869        if line_index > 0
870            && let Some(prefix) = line_prefix
871            && let Some(len) = prefix_match_len(&text[i..], prefix)
872        {
873            builder.token(SyntaxKind::YAML_LINE_PREFIX.into(), &text[i..i + len]);
874            i += len;
875        }
876        // Content up to the next line break.
877        let content_start = i;
878        while i < bytes.len() && !matches!(bytes[i], b'\n' | b'\r') {
879            i += 1;
880        }
881        if content_start < i {
882            builder.token(SyntaxKind::YAML_SCALAR_TEXT.into(), &text[content_start..i]);
883        }
884        // Line break (if any).
885        if i < bytes.len() {
886            let nl_len = if bytes[i] == b'\r' && bytes.get(i + 1) == Some(&b'\n') {
887                2
888            } else {
889                1
890            };
891            builder.token(SyntaxKind::NEWLINE.into(), &text[i..i + nl_len]);
892            i += nl_len;
893            line_index += 1;
894        }
895    }
896}
897
/// Measure an embedded line prefix at the start of `s`.
///
/// The prefix is `marker` followed by at most one space (mirroring
/// `strip_line_prefix` and the scanner's `prefix_byte_len_at`).
/// Returns the number of bytes matched, or `None` when `s` does not
/// start with `marker`.
fn prefix_match_len(s: &str, marker: &str) -> Option<usize> {
    if !s.starts_with(marker) {
        return None;
    }
    let marker_len = marker.len();
    // A space is ASCII, so checking the single byte right after the
    // marker is equivalent to checking the next character.
    let has_space = s.as_bytes().get(marker_len) == Some(&b' ');
    Some(if has_space { marker_len + 1 } else { marker_len })
}
905
906fn map_token_to_syntax_kind(kind: TokenKind) -> SyntaxKind {
907    match kind {
908        TokenKind::Trivia(TriviaKind::Whitespace) => SyntaxKind::WHITESPACE,
909        TokenKind::Trivia(TriviaKind::Newline) => SyntaxKind::NEWLINE,
910        TokenKind::Trivia(TriviaKind::Comment) => SyntaxKind::YAML_COMMENT,
911        TokenKind::Trivia(TriviaKind::LinePrefix) => SyntaxKind::YAML_LINE_PREFIX,
912        TokenKind::DocumentStart => SyntaxKind::YAML_DOCUMENT_START,
913        TokenKind::DocumentEnd => SyntaxKind::YAML_DOCUMENT_END,
914        TokenKind::Directive => SyntaxKind::YAML_DIRECTIVE,
915        TokenKind::BlockEntry => SyntaxKind::YAML_BLOCK_SEQ_ENTRY,
916        TokenKind::FlowEntry => SyntaxKind::YAML_FLOW_INDICATOR,
917        TokenKind::FlowSequenceStart | TokenKind::FlowSequenceEnd => {
918            SyntaxKind::YAML_FLOW_INDICATOR
919        }
920        TokenKind::FlowMappingStart | TokenKind::FlowMappingEnd => SyntaxKind::YAML_FLOW_INDICATOR,
921        TokenKind::Value => SyntaxKind::YAML_COLON,
922        TokenKind::Anchor => SyntaxKind::YAML_ANCHOR,
923        TokenKind::Alias => SyntaxKind::YAML_ALIAS,
924        TokenKind::Tag => SyntaxKind::YAML_TAG,
925        // Scalar tokens are emitted as a `YAML_SCALAR` *node* (split into
926        // per-line `YAML_SCALAR_TEXT` leaves) via `emit_scalar_node`, not
927        // through this token-kind map. This arm is the leaf kind for a
928        // scalar's content fragment, used by that helper.
929        TokenKind::Scalar(_) => SyntaxKind::YAML_SCALAR_TEXT,
930        // Source-backed `Key` (the explicit `?` indicator) — there is
931        // no dedicated SyntaxKind yet, route to YAML_KEY for now.
932        TokenKind::Key => SyntaxKind::YAML_KEY,
933        // Synthetic markers handled before this map; defensive
934        // fallback (never emitted as bytes).
935        TokenKind::StreamStart
936        | TokenKind::StreamEnd
937        | TokenKind::BlockSequenceStart
938        | TokenKind::BlockMappingStart
939        | TokenKind::BlockEnd => SyntaxKind::YAML_FLOW_INDICATOR,
940    }
941}
942
943#[cfg(test)]
944mod tests {
945    use super::*;
946    use crate::syntax::SyntaxKind;
947
948    /// `parse_stream` must reproduce its input byte-for-byte.
949    fn assert_lossless(input: &str) {
950        assert_eq!(
951            parse_stream(input).text().to_string(),
952            input,
953            "input {input:?} not preserved"
954        );
955    }
956
957    #[test]
958    fn strip_with_offsets_matches_strip_line_prefix() {
959        for input in [
960            "#| a: 1\n",
961            "#| a: 1\n#|   b\n",
962            "  #| x: 1\n",
963            "#| a\r\n#| b\r\n",
964            "#| a",
965        ] {
966            let (text, offsets) = strip_line_prefix_with_offsets(input, "#|");
967            assert_eq!(text, strip_line_prefix(input, "#|"), "text for {input:?}");
968            assert_eq!(offsets.len(), text.len() + 1, "offset count for {input:?}");
969            assert!(
970                offsets.iter().all(|&o| o <= input.len()),
971                "offsets in bounds for {input:?}"
972            );
973        }
974    }
975
976    #[test]
977    fn locate_maps_hashpipe_error_to_region_offset() {
978        let input = "#| echo: [\n";
979        let (_diag, start, _end) = locate_yaml_diagnostic(input, "#|").expect("diagnostic");
980        assert_eq!(start, input.find('[').unwrap());
981    }
982
983    #[test]
984    fn locate_maps_composite_marker_error() {
985        // List-indented cell: the marker includes the container indent.
986        let input = "   #| echo: [\n";
987        let (_diag, start, _end) = locate_yaml_diagnostic(input, "   #|").expect("diagnostic");
988        assert_eq!(start, input.find('[').unwrap());
989    }
990
991    #[test]
992    fn locate_maps_crlf_region_error() {
993        let input = "#| ok: 1\r\n#| echo: [\r\n";
994        let (_diag, start, _end) = locate_yaml_diagnostic(input, "#|").expect("diagnostic");
995        assert_eq!(start, input.find('[').unwrap());
996    }
997
998    #[test]
999    fn locate_frontmatter_uses_identity_offsets() {
1000        let input = "title: [\n";
1001        let (diag, start, _end) = locate_yaml_diagnostic(input, "").expect("diagnostic");
1002        assert_eq!(start, diag.byte_start);
1003        assert_eq!(start, input.find('[').unwrap());
1004    }
1005
1006    #[test]
1007    fn locate_returns_none_for_valid_yaml() {
1008        assert!(locate_yaml_diagnostic("#| echo: false\n", "#|").is_none());
1009        assert!(locate_yaml_diagnostic("title: ok\n", "").is_none());
1010    }
1011
1012    #[test]
1013    fn block_scalar_followed_by_option_is_not_swallowed_as_comment() {
1014        // Regression: a prefixed option after a `|` block scalar was scanned as a
1015        // YAML comment (the terminating line's `#|` prefix wasn't peeled), which
1016        // dropped the option. Both keys must survive as structure.
1017        let input = "#| fig-cap: |\n#|   A caption\n#| echo: false\n";
1018        let tree = parse_stream_with_prefix(input, "#|");
1019        assert_eq!(tree.to_string(), input, "byte-lossless");
1020        let entries = tree
1021            .descendants()
1022            .filter(|node| node.kind() == SyntaxKind::YAML_BLOCK_MAP_ENTRY)
1023            .count();
1024        assert_eq!(entries, 2, "expected fig-cap and echo entries");
1025        assert!(
1026            !tree
1027                .descendants_with_tokens()
1028                .any(|element| element.kind() == SyntaxKind::YAML_COMMENT),
1029            "the option line must not be scanned as a comment"
1030        );
1031    }
1032
1033    #[test]
1034    fn returns_byte_lossless_cst_for_empty_input() {
1035        assert_lossless("");
1036    }
1037
1038    #[test]
1039    fn returns_byte_lossless_cst_for_simple_mapping() {
1040        assert_lossless("key: value\n");
1041    }
1042
1043    #[test]
1044    fn returns_byte_lossless_cst_for_block_sequence() {
1045        assert_lossless("- a\n- b\n");
1046    }
1047
1048    #[test]
1049    fn returns_byte_lossless_cst_for_flow_mapping() {
1050        assert_lossless("{a: b, c: d}\n");
1051    }
1052
1053    #[test]
1054    fn returns_byte_lossless_cst_for_block_scalar() {
1055        assert_lossless("key: |\n  hello\n  world\n");
1056    }
1057
1058    #[test]
1059    fn returns_byte_lossless_cst_for_quoted_scalar() {
1060        assert_lossless("\"key\": \"value\"\n");
1061    }
1062
1063    #[test]
1064    fn returns_byte_lossless_cst_for_multi_line_plain_scalar() {
1065        assert_lossless("key: hello\n  world\n");
1066    }
1067
1068    #[test]
1069    fn preserves_explicit_key_indicator_byte_in_flow_context() {
1070        // The `?` explicit-key indicator carries a 1-byte source span
1071        // even in flow context, so the builder must NOT drop it
1072        // (only zero-width `Key` splices from `fetch_value` should be
1073        // dropped). Regression: an earlier draft filtered every Key.
1074        assert_lossless("{ ?foo: bar }\n");
1075    }
1076
1077    #[test]
1078    fn does_not_absorb_terminator_line_break_into_flow_scalar() {
1079        // Regression: in flow context the multi-line plain
1080        // continuation must abort if the next non-blank char is a
1081        // flow terminator (`}`/`]`/`,`). Otherwise the trailing
1082        // newline got swallowed into the scalar (`42\n` instead of
1083        // `42`) and the closer's byte position drifted.
1084        assert_lossless("{a: 42\n}\n");
1085    }
1086
1087    fn document_count(tree: &SyntaxNode) -> usize {
1088        tree.children()
1089            .filter(|n| n.kind() == SyntaxKind::YAML_DOCUMENT)
1090            .count()
1091    }
1092
1093    #[test]
1094    fn implicit_document_wraps_body_with_no_markers() {
1095        // No explicit `---` or `...` — the body still belongs to one
1096        // YAML_DOCUMENT so projection has a node to walk.
1097        let input = "key: value\n";
1098        let tree = parse_stream(input);
1099        assert_eq!(document_count(&tree), 1);
1100        assert_eq!(tree.text().to_string(), input);
1101    }
1102
1103    #[test]
1104    fn explicit_doc_start_opens_document_marker_lives_inside() {
1105        let input = "---\nkey: value\n";
1106        let tree = parse_stream(input);
1107        assert_eq!(document_count(&tree), 1);
1108        let doc = tree
1109            .children()
1110            .find(|n| n.kind() == SyntaxKind::YAML_DOCUMENT)
1111            .expect("document node");
1112        assert!(
1113            doc.children_with_tokens().any(|el| el
1114                .as_token()
1115                .is_some_and(|t| t.kind() == SyntaxKind::YAML_DOCUMENT_START)),
1116            "`---` token should live inside YAML_DOCUMENT"
1117        );
1118        assert_eq!(tree.text().to_string(), input);
1119    }
1120
1121    #[test]
1122    fn explicit_doc_end_closes_document_marker_lives_inside() {
1123        let input = "key: value\n...\n";
1124        let tree = parse_stream(input);
1125        assert_eq!(document_count(&tree), 1);
1126        let doc = tree
1127            .children()
1128            .find(|n| n.kind() == SyntaxKind::YAML_DOCUMENT)
1129            .expect("document node");
1130        assert!(
1131            doc.children_with_tokens().any(|el| el
1132                .as_token()
1133                .is_some_and(|t| t.kind() == SyntaxKind::YAML_DOCUMENT_END)),
1134            "`...` token should live inside YAML_DOCUMENT"
1135        );
1136        assert_eq!(tree.text().to_string(), input);
1137    }
1138
1139    #[test]
1140    fn consecutive_doc_starts_emit_two_documents() {
1141        let input = "---\na\n---\nb\n";
1142        let tree = parse_stream(input);
1143        assert_eq!(document_count(&tree), 2);
1144        assert_eq!(tree.text().to_string(), input);
1145    }
1146
1147    #[test]
1148    fn pre_document_trivia_stays_at_stream_level() {
1149        // A leading newline before the first document content should
1150        // sit under YAML_STREAM, not inside a YAML_DOCUMENT — there is
1151        // no document yet at that point.
1152        let input = "\n---\nkey: value\n";
1153        let tree = parse_stream(input);
1154        let stream_token_kinds: Vec<SyntaxKind> = tree
1155            .children_with_tokens()
1156            .filter_map(|el| el.into_token())
1157            .map(|t| t.kind())
1158            .collect();
1159        assert!(
1160            stream_token_kinds.contains(&SyntaxKind::NEWLINE),
1161            "leading newline should be a direct child of YAML_STREAM, got {stream_token_kinds:?}"
1162        );
1163        assert_eq!(tree.text().to_string(), input);
1164    }
1165
1166    #[test]
1167    fn bare_doc_end_at_stream_start_opens_synthetic_empty_document() {
1168        // Pathological but lossless: a stream that begins with `...`
1169        // wraps the marker in an empty YAML_DOCUMENT so no source
1170        // bytes leak out at YAML_STREAM level uncoupled from a doc.
1171        let input = "...\n";
1172        let tree = parse_stream(input);
1173        assert_eq!(document_count(&tree), 1);
1174        assert_eq!(tree.text().to_string(), input);
1175    }
1176
1177    fn first_document(tree: &SyntaxNode) -> SyntaxNode {
1178        tree.children()
1179            .find(|n| n.kind() == SyntaxKind::YAML_DOCUMENT)
1180            .expect("at least one document")
1181    }
1182
1183    fn block_map_under(parent: &SyntaxNode) -> Option<SyntaxNode> {
1184        parent
1185            .children()
1186            .find(|n| n.kind() == SyntaxKind::YAML_BLOCK_MAP)
1187    }
1188
1189    fn block_seq_under(parent: &SyntaxNode) -> Option<SyntaxNode> {
1190        parent
1191            .children()
1192            .find(|n| n.kind() == SyntaxKind::YAML_BLOCK_SEQUENCE)
1193    }
1194
1195    fn block_map_entries(map: &SyntaxNode) -> Vec<SyntaxNode> {
1196        map.children()
1197            .filter(|n| n.kind() == SyntaxKind::YAML_BLOCK_MAP_ENTRY)
1198            .collect()
1199    }
1200
1201    fn block_seq_items(seq: &SyntaxNode) -> Vec<SyntaxNode> {
1202        seq.children()
1203            .filter(|n| n.kind() == SyntaxKind::YAML_BLOCK_SEQUENCE_ITEM)
1204            .collect()
1205    }
1206
1207    fn entry_key(entry: &SyntaxNode) -> SyntaxNode {
1208        entry
1209            .children()
1210            .find(|n| n.kind() == SyntaxKind::YAML_BLOCK_MAP_KEY)
1211            .expect("entry should have a YAML_BLOCK_MAP_KEY child")
1212    }
1213
1214    fn entry_value(entry: &SyntaxNode) -> SyntaxNode {
1215        entry
1216            .children()
1217            .find(|n| n.kind() == SyntaxKind::YAML_BLOCK_MAP_VALUE)
1218            .expect("entry should have a YAML_BLOCK_MAP_VALUE child")
1219    }
1220
1221    #[test]
1222    fn consecutive_empty_key_colons_open_separate_entries() {
1223        // `: a\n: b` is two block-map entries, each with an empty
1224        // (null) key and a value (2JQS). The scanner emits two bare
1225        // `Value` tokens with no Key/BlockEnd between them, so the
1226        // builder must close the first entry when the second `:`
1227        // arrives at the map's column rather than absorbing it into
1228        // the first value.
1229        let input = ": a\n: b\n";
1230        let tree = parse_stream(input);
1231        let doc = first_document(&tree);
1232        let map = block_map_under(&doc).expect("YAML_BLOCK_MAP child");
1233        let entries = block_map_entries(&map);
1234        assert_eq!(entries.len(), 2, "expected two empty-key ENTRY nodes");
1235        for (entry, scalar) in entries.iter().zip(["a", "b"]) {
1236            let key = entry_key(entry);
1237            // Empty key: the KEY holds only the `:` value indicator.
1238            assert!(
1239                !key.children().any(|n| n.kind() == SyntaxKind::YAML_SCALAR),
1240                "empty key should carry no scalar, got {key:?}",
1241            );
1242            let value = entry_value(entry);
1243            assert!(
1244                value
1245                    .children()
1246                    .any(|n| n.kind() == SyntaxKind::YAML_SCALAR && n.text() == scalar),
1247                "value should be {scalar:?}, got {value:?}",
1248            );
1249        }
1250        assert_eq!(tree.text().to_string(), input);
1251    }
1252
1253    #[test]
1254    fn block_mapping_wraps_key_value_with_key_and_value_sub_wrappers() {
1255        let input = "key: value\n";
1256        let tree = parse_stream(input);
1257        let doc = first_document(&tree);
1258        let map = block_map_under(&doc).expect("YAML_BLOCK_MAP child");
1259        let entries = block_map_entries(&map);
1260        assert_eq!(entries.len(), 1, "expected one ENTRY for `key: value`");
1261        let key = entry_key(&entries[0]);
1262        let value = entry_value(&entries[0]);
1263        // Colon ends the KEY (last token); VALUE has the scalar.
1264        assert!(
1265            key.children_with_tokens().any(|el| el
1266                .as_token()
1267                .is_some_and(|t| t.kind() == SyntaxKind::YAML_COLON)),
1268            "colon should be the trailing token of YAML_BLOCK_MAP_KEY",
1269        );
1270        assert!(
1271            value
1272                .children()
1273                .any(|n| n.kind() == SyntaxKind::YAML_SCALAR),
1274            "scalar `value` should live inside YAML_BLOCK_MAP_VALUE",
1275        );
1276        assert_eq!(tree.text().to_string(), input);
1277    }
1278
1279    #[test]
1280    fn block_sequence_wraps_entries_in_yaml_block_sequence() {
1281        let input = "- a\n- b\n";
1282        let tree = parse_stream(input);
1283        let doc = first_document(&tree);
1284        let seq = block_seq_under(&doc).expect("YAML_BLOCK_SEQUENCE child");
1285        let items = block_seq_items(&seq);
1286        assert_eq!(items.len(), 2, "expected 2 YAML_BLOCK_SEQUENCE_ITEM");
1287        // Each item must own its own `-` entry token.
1288        for item in &items {
1289            let dash_count = item
1290                .children_with_tokens()
1291                .filter(|el| {
1292                    el.as_token()
1293                        .is_some_and(|t| t.kind() == SyntaxKind::YAML_BLOCK_SEQ_ENTRY)
1294                })
1295                .count();
1296            assert_eq!(dash_count, 1, "each item owns exactly one `-` token");
1297        }
1298        assert_eq!(tree.text().to_string(), input);
1299    }
1300
1301    #[test]
1302    fn nested_block_mapping_nests_inner_block_map_inside_outer_value() {
1303        let input = "outer:\n  inner: x\n";
1304        let tree = parse_stream(input);
1305        let doc = first_document(&tree);
1306        let outer = block_map_under(&doc).expect("outer YAML_BLOCK_MAP");
1307        let outer_entries = block_map_entries(&outer);
1308        assert_eq!(outer_entries.len(), 1);
1309        let outer_value = entry_value(&outer_entries[0]);
1310        let inner = outer_value
1311            .children()
1312            .find(|n| n.kind() == SyntaxKind::YAML_BLOCK_MAP)
1313            .expect("inner YAML_BLOCK_MAP nested under outer VALUE");
1314        let inner_entries = block_map_entries(&inner);
1315        assert_eq!(inner_entries.len(), 1);
1316        let inner_key = entry_key(&inner_entries[0]);
1317        assert!(
1318            inner_key.children_with_tokens().any(|el| el
1319                .as_token()
1320                .is_some_and(|t| t.kind() == SyntaxKind::YAML_COLON)),
1321            "inner key should own its colon",
1322        );
1323        assert_eq!(tree.text().to_string(), input);
1324    }
1325
1326    #[test]
1327    fn block_sequence_inside_mapping_nests_under_outer_map_value() {
1328        let input = "items:\n  - a\n  - b\n";
1329        let tree = parse_stream(input);
1330        let doc = first_document(&tree);
1331        let map = block_map_under(&doc).expect("YAML_BLOCK_MAP child");
1332        let entries = block_map_entries(&map);
1333        assert_eq!(entries.len(), 1, "one entry: `items: <seq>`");
1334        let value = entry_value(&entries[0]);
1335        let seq = value
1336            .children()
1337            .find(|n| n.kind() == SyntaxKind::YAML_BLOCK_SEQUENCE)
1338            .expect("YAML_BLOCK_SEQUENCE nested under map VALUE");
1339        let items = block_seq_items(&seq);
1340        assert_eq!(items.len(), 2);
1341        assert_eq!(tree.text().to_string(), input);
1342    }
1343
1344    #[test]
1345    fn dedent_closes_inner_block_map_before_next_outer_key() {
1346        // outer:
1347        //   inner: x
1348        // sibling: y
1349        // The dedent before `sibling` must close the inner map and
1350        // its outer ENTRY so `sibling: y` lands as a sibling ENTRY
1351        // under the outer map.
1352        let input = "outer:\n  inner: x\nsibling: y\n";
1353        let tree = parse_stream(input);
1354        let doc = first_document(&tree);
1355        let outer = block_map_under(&doc).expect("outer YAML_BLOCK_MAP");
1356        let entries = block_map_entries(&outer);
1357        assert_eq!(
1358            entries.len(),
1359            2,
1360            "outer map should have two entries (`outer:` and `sibling:`)",
1361        );
1362        // Only the first entry's VALUE has a nested map; the second is flat.
1363        let first_value = entry_value(&entries[0]);
1364        let nested_in_first = first_value
1365            .children()
1366            .filter(|n| n.kind() == SyntaxKind::YAML_BLOCK_MAP)
1367            .count();
1368        assert_eq!(nested_in_first, 1);
1369        let second_value = entry_value(&entries[1]);
1370        let nested_in_second = second_value
1371            .children()
1372            .filter(|n| n.kind() == SyntaxKind::YAML_BLOCK_MAP)
1373            .count();
1374        assert_eq!(nested_in_second, 0);
1375        assert_eq!(tree.text().to_string(), input);
1376    }
1377
1378    #[test]
1379    fn block_map_with_two_top_level_entries_emits_two_entry_wrappers() {
1380        let input = "a: 1\nb: 2\n";
1381        let tree = parse_stream(input);
1382        let doc = first_document(&tree);
1383        let map = block_map_under(&doc).expect("YAML_BLOCK_MAP child");
1384        assert_eq!(block_map_entries(&map).len(), 2);
1385        assert_eq!(tree.text().to_string(), input);
1386    }
1387
1388    #[test]
1389    fn explicit_key_indicator_question_mark_lives_inside_key() {
1390        // `? a\n: b\n` — the `?` is a source-backed Key token. It
1391        // opens the ENTRY and lives inside the resulting KEY node
1392        // (alongside the scalar `a` and the trailing `:`).
1393        let input = "? a\n: b\n";
1394        let tree = parse_stream(input);
1395        let doc = first_document(&tree);
1396        let map = block_map_under(&doc).expect("YAML_BLOCK_MAP child");
1397        let entries = block_map_entries(&map);
1398        assert_eq!(entries.len(), 1);
1399        let key = entry_key(&entries[0]);
1400        let has_question = key.children_with_tokens().any(|el| {
1401            el.as_token()
1402                .is_some_and(|t| t.kind() == SyntaxKind::YAML_KEY)
1403        });
1404        assert!(has_question, "`?` should live inside YAML_BLOCK_MAP_KEY");
1405        assert_eq!(tree.text().to_string(), input);
1406    }
1407
1408    #[test]
1409    fn explicit_key_indentless_sequence_wraps_inside_key() {
1410        // `?\n- a\n- b\n:\n- c\n- d\n` (6PBE) — the explicit `?` key's
1411        // content is a zero-indented block sequence. As with an indentless
1412        // sequence in a VALUE, the scanner pushes no indent level and emits
1413        // no BlockSequenceStart, so the builder must synthesize a
1414        // YAML_BLOCK_SEQUENCE inside the KEY (mirroring the VALUE side)
1415        // rather than leaving the `- a` / `- b` entries flat.
1416        let input = "?\n- a\n- b\n:\n- c\n- d\n";
1417        let tree = parse_stream(input);
1418        let doc = first_document(&tree);
1419        let map = block_map_under(&doc).expect("YAML_BLOCK_MAP child");
1420        let entries = block_map_entries(&map);
1421        assert_eq!(entries.len(), 1);
1422        let key = entry_key(&entries[0]);
1423        assert!(
1424            key.children()
1425                .any(|n| n.kind() == SyntaxKind::YAML_BLOCK_SEQUENCE),
1426            "explicit-key block sequence should be wrapped in YAML_BLOCK_SEQUENCE inside KEY",
1427        );
1428        let value = entry_value(&entries[0]);
1429        assert!(
1430            value
1431                .children()
1432                .any(|n| n.kind() == SyntaxKind::YAML_BLOCK_SEQUENCE),
1433            "value-side block sequence should remain wrapped",
1434        );
1435        assert_eq!(tree.text().to_string(), input);
1436    }
1437
1438    #[test]
1439    fn empty_key_shorthand_opens_entry_with_empty_key() {
1440        // `: value\n` — bare `:` at column 0 is the empty-implicit-key
1441        // shorthand. The builder must open ENTRY+KEY before the colon
1442        // arrives so the colon ends up as the only KEY child.
1443        let input = ": value\n";
1444        let tree = parse_stream(input);
1445        let doc = first_document(&tree);
1446        let map = block_map_under(&doc).expect("YAML_BLOCK_MAP child");
1447        let entries = block_map_entries(&map);
1448        assert_eq!(entries.len(), 1);
1449        let key = entry_key(&entries[0]);
1450        // KEY has no scalar; only the colon.
1451        assert!(
1452            !key.children().any(|n| n.kind() == SyntaxKind::YAML_SCALAR),
1453            "empty-key shorthand has no scalar in KEY",
1454        );
1455        assert!(
1456            key.children_with_tokens().any(|el| el
1457                .as_token()
1458                .is_some_and(|t| t.kind() == SyntaxKind::YAML_COLON)),
1459            "empty-key KEY still owns the `:` token",
1460        );
1461        let value = entry_value(&entries[0]);
1462        assert!(
1463            value
1464                .children()
1465                .any(|n| n.kind() == SyntaxKind::YAML_SCALAR),
1466            "VALUE owns the `value` scalar",
1467        );
1468        assert_eq!(tree.text().to_string(), input);
1469    }
1470
1471    #[test]
1472    fn document_end_marker_lives_at_document_level_not_inside_block_map() {
1473        // `...` must not be buried inside the block map; it is a
1474        // document-level marker. The builder closes any open block
1475        // containers before consuming `DocumentEnd`.
1476        let input = "key: value\n...\n";
1477        let tree = parse_stream(input);
1478        let doc = first_document(&tree);
1479        let has_doc_end = doc.children_with_tokens().any(|el| {
1480            el.as_token()
1481                .is_some_and(|t| t.kind() == SyntaxKind::YAML_DOCUMENT_END)
1482        });
1483        assert!(
1484            has_doc_end,
1485            "DOCUMENT_END should be a direct child of YAML_DOCUMENT"
1486        );
1487        assert_eq!(tree.text().to_string(), input);
1488    }
1489
1490    fn flow_map_under(parent: &SyntaxNode) -> Option<SyntaxNode> {
1491        parent
1492            .children()
1493            .find(|n| n.kind() == SyntaxKind::YAML_FLOW_MAP)
1494    }
1495
1496    fn flow_seq_under(parent: &SyntaxNode) -> Option<SyntaxNode> {
1497        parent
1498            .children()
1499            .find(|n| n.kind() == SyntaxKind::YAML_FLOW_SEQUENCE)
1500    }
1501
1502    fn flow_map_entries(map: &SyntaxNode) -> Vec<SyntaxNode> {
1503        map.children()
1504            .filter(|n| n.kind() == SyntaxKind::YAML_FLOW_MAP_ENTRY)
1505            .collect()
1506    }
1507
1508    fn flow_seq_items(seq: &SyntaxNode) -> Vec<SyntaxNode> {
1509        seq.children()
1510            .filter(|n| n.kind() == SyntaxKind::YAML_FLOW_SEQUENCE_ITEM)
1511            .collect()
1512    }
1513
1514    #[test]
1515    fn flow_sequence_wraps_each_item_in_flow_sequence_item() {
1516        let input = "[a, b, c]\n";
1517        let tree = parse_stream(input);
1518        let doc = first_document(&tree);
1519        let seq = flow_seq_under(&doc).expect("YAML_FLOW_SEQUENCE child");
1520        let items = flow_seq_items(&seq);
1521        assert_eq!(items.len(), 3);
1522        // The opening `[` and closing `]` live at SEQUENCE level
1523        // (siblings of items).
1524        let bracket_count = seq
1525            .children_with_tokens()
1526            .filter(|el| {
1527                el.as_token().map(|t| t.text()) == Some("[")
1528                    || el.as_token().map(|t| t.text()) == Some("]")
1529            })
1530            .count();
1531        assert_eq!(bracket_count, 2, "`[` and `]` at SEQUENCE level");
1532        assert_eq!(tree.text().to_string(), input);
1533    }
1534
1535    #[test]
1536    fn flow_mapping_wraps_each_entry_with_key_and_value() {
1537        let input = "{a: 1, b: 2}\n";
1538        let tree = parse_stream(input);
1539        let doc = first_document(&tree);
1540        let map = flow_map_under(&doc).expect("YAML_FLOW_MAP child");
1541        let entries = flow_map_entries(&map);
1542        assert_eq!(entries.len(), 2);
1543        for entry in &entries {
1544            let key = entry
1545                .children()
1546                .find(|n| n.kind() == SyntaxKind::YAML_FLOW_MAP_KEY)
1547                .expect("entry has YAML_FLOW_MAP_KEY");
1548            assert!(
1549                key.children_with_tokens().any(|el| el
1550                    .as_token()
1551                    .is_some_and(|t| t.kind() == SyntaxKind::YAML_COLON)),
1552                "flow KEY owns trailing `:`",
1553            );
1554            let value = entry
1555                .children()
1556                .find(|n| n.kind() == SyntaxKind::YAML_FLOW_MAP_VALUE)
1557                .expect("entry has YAML_FLOW_MAP_VALUE");
1558            assert!(
1559                value
1560                    .children()
1561                    .any(|n| n.kind() == SyntaxKind::YAML_SCALAR),
1562                "flow VALUE owns its scalar",
1563            );
1564        }
1565        assert_eq!(tree.text().to_string(), input);
1566    }
1567
1568    #[test]
1569    fn flow_sequence_inside_flow_sequence_nests_under_outer_item() {
1570        let input = "[[1, 2], [3, 4]]\n";
1571        let tree = parse_stream(input);
1572        let doc = first_document(&tree);
1573        let outer = flow_seq_under(&doc).expect("outer YAML_FLOW_SEQUENCE");
1574        let outer_items = flow_seq_items(&outer);
1575        assert_eq!(outer_items.len(), 2);
1576        for item in &outer_items {
1577            assert!(
1578                item.children()
1579                    .any(|n| n.kind() == SyntaxKind::YAML_FLOW_SEQUENCE),
1580                "outer item should contain a nested YAML_FLOW_SEQUENCE",
1581            );
1582        }
1583        assert_eq!(tree.text().to_string(), input);
1584    }
1585
1586    #[test]
1587    fn flow_mapping_inside_flow_sequence_nests_under_item() {
1588        let input = "[{a: 1}, {b: 2}]\n";
1589        let tree = parse_stream(input);
1590        let doc = first_document(&tree);
1591        let seq = flow_seq_under(&doc).expect("YAML_FLOW_SEQUENCE child");
1592        let items = flow_seq_items(&seq);
1593        assert_eq!(items.len(), 2);
1594        for item in &items {
1595            assert!(
1596                item.children()
1597                    .any(|n| n.kind() == SyntaxKind::YAML_FLOW_MAP),
1598                "each item should contain a nested YAML_FLOW_MAP",
1599            );
1600        }
1601        assert_eq!(tree.text().to_string(), input);
1602    }
1603
1604    #[test]
1605    fn flow_mapping_at_block_map_value_nests_under_block_map_value() {
1606        let input = "key: {a: 1, b: 2}\n";
1607        let tree = parse_stream(input);
1608        let doc = first_document(&tree);
1609        let block_map = block_map_under(&doc).expect("YAML_BLOCK_MAP child");
1610        let entries = block_map_entries(&block_map);
1611        assert_eq!(entries.len(), 1);
1612        let value = entry_value(&entries[0]);
1613        assert!(
1614            value
1615                .children()
1616                .any(|n| n.kind() == SyntaxKind::YAML_FLOW_MAP),
1617            "flow map should be nested under outer block map's VALUE",
1618        );
1619        assert_eq!(tree.text().to_string(), input);
1620    }
1621
1622    #[test]
1623    fn directive_prelude_stays_inside_document_opened_by_marker() {
1624        // YAML 1.2 §6.8.1: directives belong to the document the
1625        // following `---` opens. The builder must not split the
1626        // directive line into a separate doc — the entire input is one
1627        // YAML_DOCUMENT.
1628        let input = "%TAG !e! tag:example.com,2000:app/\n---\n!e!foo \"bar\"\n";
1629        let tree = parse_stream(input);
1630        assert_eq!(document_count(&tree), 1);
1631        let doc = first_document(&tree);
1632        let has_doc_start = doc.children_with_tokens().any(|el| {
1633            el.as_token()
1634                .is_some_and(|t| t.kind() == SyntaxKind::YAML_DOCUMENT_START)
1635        });
1636        assert!(has_doc_start, "the `---` should live inside the same doc");
1637        assert_eq!(tree.text().to_string(), input);
1638    }
1639
1640    #[test]
1641    fn explicit_key_without_value_emits_empty_value_for_shape_parity() {
1642        // `? a\n? b\n` — neither entry has a `:`. Each ENTRY must still
1643        // hold both KEY and VALUE children (VALUE empty) so projection
1644        // walkers don't have to special-case missing children.
1645        let input = "? a\n? b\n";
1646        let tree = parse_stream(input);
1647        let doc = first_document(&tree);
1648        let map = block_map_under(&doc).expect("YAML_BLOCK_MAP");
1649        let entries = block_map_entries(&map);
1650        assert_eq!(entries.len(), 2);
1651        for entry in &entries {
1652            assert!(
1653                entry
1654                    .children()
1655                    .any(|n| n.kind() == SyntaxKind::YAML_BLOCK_MAP_KEY),
1656                "ENTRY missing KEY child",
1657            );
1658            assert!(
1659                entry
1660                    .children()
1661                    .any(|n| n.kind() == SyntaxKind::YAML_BLOCK_MAP_VALUE),
1662                "ENTRY missing VALUE child",
1663            );
1664        }
1665    }
1666}