Skip to main content

panache_parser/parser/yaml/
parser.rs

1//! YAML parser core: orchestrator + streaming-token-to-CST builder.
2//!
3//! Two layers live in this module:
4//!
5//! 1. **Orchestrator** — [`parse_yaml_tree`] and [`parse_yaml_report`].
6//!    These drive [`parse_stream`] for a pure-YAML parse rooted at
7//!    `YAML_STREAM`, run the structural
8//!    [`super::validator::validate_yaml`] pass, and surface diagnostics.
9//!    Host envelope wrappers (`DOCUMENT`, `YAML_METADATA_CONTENT`,
10//!    `HASHPIPE_YAML_CONTENT`) are added by the host parser at embedding
11//!    sites and are not concerns of the standalone YAML parse path.
12//!
13//! 2. **Streaming parser** — [`parse_stream`] drives
14//!    [`super::scanner::Scanner`] and emits the rowan green tree. Each
15//!    contiguous run of body content is wrapped in a `YAML_DOCUMENT`
16//!    node (with `---` / `...` markers consumed inside the document
17//!    they delimit); block-context content nests under `YAML_BLOCK_MAP`
18//!    / `YAML_BLOCK_SEQUENCE` containers driven by the scanner's
19//!    synthetic `BlockMappingStart` / `BlockSequenceStart` / `BlockEnd`
20//!    markers; each key-value pair is wrapped in
21//!    `YAML_BLOCK_MAP_ENTRY`, each `-` entry in
22//!    `YAML_BLOCK_SEQUENCE_ITEM`, and each map entry splits into
23//!    `YAML_BLOCK_MAP_KEY` (everything up to and including the `:`) and
24//!    `YAML_BLOCK_MAP_VALUE` (everything after). Flow contexts mirror
25//!    the same shape: `YAML_FLOW_MAP` / `YAML_FLOW_MAP_ENTRY` /
26//!    `YAML_FLOW_MAP_KEY` / `YAML_FLOW_MAP_VALUE` and
27//!    `YAML_FLOW_SEQUENCE` / `YAML_FLOW_SEQUENCE_ITEM`. Source-backed
28//!    `[` / `]` / `{` / `}` / `,` are emitted at the container level
29//!    (siblings of items), with item/entry sub-wrappers closing on `,`
30//!    and the matching closer.
31
32#![allow(dead_code)]
33
34use crate::syntax::{SyntaxKind, SyntaxNode};
35use rowan::GreenNodeBuilder;
36
37use super::model::{YamlDiagnostic, YamlParseReport};
38use super::scanner::{Scanner, TokenKind, TriviaKind};
39
/// Remove the per-line `prefix` marker from `input`.
///
/// For each line that starts with `prefix`, the marker and at most one
/// space directly after it are dropped; lines without the marker are kept
/// verbatim. The resulting lines are re-joined with a single `\n` (no
/// trailing newline is appended). This is the plain-YAML baseline that a
/// prefix-aware parse is validated against (see
/// [`validate_yaml_with_prefix`]).
fn strip_line_prefix(input: &str, prefix: &str) -> String {
    let mut out = String::new();
    for (idx, line) in input.lines().enumerate() {
        if idx != 0 {
            out.push('\n');
        }
        // Marker present: drop it, then drop one optional separator space.
        let payload = line
            .strip_prefix(prefix)
            .map_or(line, |rest| rest.strip_prefix(' ').unwrap_or(rest));
        out.push_str(payload);
    }
    out
}
54
55/// Structural validation for embedded (prefixed) YAML. Strips the
56/// per-line `prefix` to the plain-YAML baseline and runs the standard
57/// [`super::validator::validate_yaml`] pass. The verdict matches the
58/// stripped baseline; diagnostic offsets refer to the stripped text
59/// (host-offset remapping is a later concern). An empty `prefix` is
60/// plain validation.
61pub fn validate_yaml_with_prefix(input: &str, prefix: &str) -> Option<YamlDiagnostic> {
62    if prefix.is_empty() {
63        return super::validator::validate_yaml(input);
64    }
65    super::validator::validate_yaml(&strip_line_prefix(input, prefix))
66}
67
/// Strip the per-line `prefix` the same way [`strip_line_prefix`] does while
/// also producing a byte-offset map back into `input`.
///
/// Returns `(stripped, offsets)` where `offsets[i]` is the byte position in
/// `input` that byte `i` of `stripped` was taken from, followed by one extra
/// trailing entry equal to `input.len()` (the EOF position). The text and
/// the map are grown together, one line at a time, so they stay in sync and
/// a diagnostic offset expressed in stripped coordinates can be translated
/// back into the original prefixed region. Callers are expected to pass a
/// non-empty `prefix`.
fn strip_line_prefix_with_offsets(input: &str, prefix: &str) -> (String, Vec<usize>) {
    let mut text = String::new();
    let mut origin: Vec<usize> = Vec::new();
    // Byte position in `input` where the next raw line begins.
    let mut cursor = 0usize;
    for (n, raw) in input.split_inclusive('\n').enumerate() {
        let line_start = cursor;
        cursor += raw.len();
        // Drop the terminator the way `str::lines` does: a trailing `\n`,
        // and a `\r` only when it directly precedes that `\n`.
        let line = match raw.strip_suffix('\n') {
            Some(body) => body.strip_suffix('\r').unwrap_or(body),
            None => raw,
        };
        if n != 0 {
            // The joining `\n` is attributed to the byte just before this
            // line, i.e. the original line break.
            origin.push(line_start.saturating_sub(1));
            text.push('\n');
        }
        let payload = line
            .strip_prefix(prefix)
            .map_or(line, |rest| rest.strip_prefix(' ').unwrap_or(rest));
        // `payload` is always a suffix of `line`, so the length difference
        // is the number of marker bytes peeled from the front.
        let payload_start = line_start + (line.len() - payload.len());
        origin.extend(payload_start..payload_start + payload.len());
        text.push_str(payload);
    }
    origin.push(input.len());
    (text, origin)
}
99
100/// Locate a structural YAML diagnostic in `input` (raw, possibly `prefix`-marked),
101/// returning the diagnostic plus the byte range **within `input`** it covers
102/// (start, end). An empty `prefix` is plain YAML with identity offsets. Returns
103/// `None` when `input` is valid — the verdict matches
104/// [`validate_yaml_with_prefix`]. The host parser adds the region's document
105/// start to emit a host-ranged `SyntaxError` for malformed embedded YAML.
106pub fn locate_yaml_diagnostic(input: &str, prefix: &str) -> Option<(YamlDiagnostic, usize, usize)> {
107    if prefix.is_empty() {
108        let diag = super::validator::validate_yaml(input)?;
109        let start = diag.byte_start.min(input.len());
110        let end = diag.byte_end.min(input.len()).max(start);
111        return Some((diag, start, end));
112    }
113    // Validate cheaply first (no offset table) — the common, valid path returns
114    // here with the same verdict as `validate_yaml_with_prefix`. Only build the
115    // lockstep offset map when there's actually a diagnostic to locate.
116    let diag = super::validator::validate_yaml(&strip_line_prefix(input, prefix))?;
117    let (_stripped, offsets) = strip_line_prefix_with_offsets(input, prefix);
118    let start = offsets.get(diag.byte_start).copied().unwrap_or(input.len());
119    let end = offsets
120        .get(diag.byte_end)
121        .copied()
122        .unwrap_or(input.len())
123        .max(start);
124    Some((diag, start, end))
125}
126
127/// Parse YAML tree structure from input, or `None` if it fails to parse.
128pub fn parse_yaml_tree(input: &str) -> Option<SyntaxNode> {
129    parse_yaml_report(input).tree
130}
131
132/// Parse YAML tree structure and include diagnostics on failure.
133///
134/// Diagnostics flow through the structural
135/// [`super::validator::validate_yaml`] pass, which composes per-cluster
136/// `check_*` functions covering directive ordering, structural shape
137/// (unterminated flow, trailing content, invalid keys, indent
138/// anomalies, block-scalar header, etc.), and lex-level checks like
139/// `LEX_INVALID_DOUBLE_QUOTED_ESCAPE`.
140///
141/// The returned tree, when present, comes from the streaming scanner
142/// and builder.
143pub fn parse_yaml_report(input: &str) -> YamlParseReport {
144    if let Some(err) = super::validator::validate_yaml(input) {
145        return YamlParseReport {
146            tree: None,
147            diagnostics: vec![err],
148        };
149    }
150
151    YamlParseReport {
152        tree: Some(parse_stream(input)),
153        diagnostics: Vec::new(),
154    }
155}
156
157/// Drive the scanner over `input` and build a CST. Always returns a
158/// `SyntaxNode` — the scanner is permissive and the builder preserves
159/// bytes regardless of well-formedness.
160pub fn parse_stream(input: &str) -> SyntaxNode {
161    parse_stream_inner(input, None)
162}
163
164/// Like [`parse_stream`], but treats `prefix` (e.g. hashpipe `"#|"`) as
165/// an embedded-YAML per-line marker: the scanner excludes it from
166/// column/indent accounting and the builder peels it into
167/// `YAML_LINE_PREFIX` leaves, so the resulting CST's token ranges are
168/// host ranges directly (prefix bytes included as trivia, no offset
169/// remapping). An empty `prefix` behaves like [`parse_stream`].
170///
171/// `prefix` may be a *composite* marker — a container prefix prepended to
172/// `#|` (e.g. `"   #|"` for a list-indented cell, `"> #|"` for a blockquoted
173/// one). Within a hashpipe preamble the container prefix is uniform per line,
174/// so matching the whole composite marker via `strip_prefix` parses a nested
175/// cell identically to a top-level one, peeling the entire prefix into one
176/// `YAML_LINE_PREFIX` leaf. The host computes this marker (see
177/// `parse_fenced_code_block`).
178pub fn parse_stream_with_prefix(input: &str, prefix: &str) -> SyntaxNode {
179    parse_stream_inner(input, (!prefix.is_empty()).then_some(prefix))
180}
181
/// Shared implementation behind [`parse_stream`] and
/// [`parse_stream_with_prefix`]: pull tokens from the scanner one at a time
/// and build the green tree rooted at `YAML_STREAM`.
///
/// `line_prefix` is `Some(prefix)` for embedded YAML — the scanner is then
/// created with `Scanner::with_prefix` and the prefix is forwarded to
/// `emit_scalar_node` — and `None` for plain YAML.
///
/// Every token is processed in two phases:
///
/// 1. The first `match` opens and closes container nodes and their
///    ENTRY/KEY/VALUE/ITEM sub-wrappers. Purely structural tokens are fully
///    handled there and `continue` to the next token; the remaining arms
///    fall through.
/// 2. The second `match` emits the token's source bytes and maintains the
///    `YAML_DOCUMENT` boundaries.
///
/// After the scanner is exhausted, any still-open containers, the open
/// document, and finally the stream node are closed.
182fn parse_stream_inner(input: &str, line_prefix: Option<&str>) -> SyntaxNode {
183    let mut builder = GreenNodeBuilder::new();
184    builder.start_node(SyntaxKind::YAML_STREAM.into());
185    let mut scanner = match line_prefix {
186        Some(prefix) => Scanner::with_prefix(input, prefix),
187        None => Scanner::new(input),
188    };
       // True while a YAML_DOCUMENT node is open in the builder.
189    let mut doc_open = false;
190    // True when the open YAML_DOCUMENT has only seen directives + trivia
191    // (no body content yet, no `---`). YAML 1.2 says directives belong to
192    // the document the following `---` opens, so when DocumentStart
193    // arrives in this state the marker stays inside the same document
194    // rather than splitting it. Cleared as soon as any non-directive
195    // body content lands.
196    let mut doc_only_has_directives = false;
197    // Stack of currently-open block containers. Each frame tracks
198    // whether its current `YAML_BLOCK_MAP_ENTRY` / `YAML_BLOCK_SEQUENCE_ITEM`
199    // sub-wrapper is still open and waiting to be closed (by the next
200    // `Key` / `BlockEntry` peer or by `BlockEnd`).
       // Flow containers (`FlowMap` / `FlowSequence`) are pushed onto this
       // same stack.
201    let mut block_stack: Vec<BlockFrame> = Vec::new();
202    // Kind of the last non-trivia, non-stream-marker, non-decoration
203    // token emitted. An indentless block sequence is only valid when
204    // its `-` directly follows the map entry's `:` (the value is
205    // otherwise empty), so the `BlockEntry` handler consults this to
206    // tell RLU9 (`foo:\n- 42`, value is purely the sequence) apart from
207    // G9HC (`seq:\n&anchor\n- a` with the anchor at column 0 — an
208    // error the validator must still catch on the unwrapped shape).
209    // Anchor / Tag / Alias tokens are *decorations* of the next node
210    // and don't fill the empty-value slot; they're skipped here so a
211    // value-leading decoration still permits an indentless sequence
212    // (SKE5: `seq:\n &anchor\n- a`).
213    let mut prev_significant: Option<TokenKind> = None;
214    // Smallest column among Anchor/Tag/Alias decorations seen since the
215    // last value-filling token. The indentless detector uses this to
216    // distinguish SKE5 (decoration indented past parent → wrap) from
217    // G9HC (decoration at parent indent → leave unwrapped for the
218    // validator). `None` when no decoration is pending.
219    let mut decoration_col_floor: Option<usize> = None;
220    while let Some(tok) = scanner.next_token() {
           // Snapshot the trackers as they were *before* this token is
           // accounted for; the `BlockEntry` handler below needs the
           // state left by the preceding tokens.
221        let last_significant = prev_significant;
222        let decorations_so_far = decoration_col_floor;
223        let is_decoration = matches!(
224            tok.kind,
225            TokenKind::Anchor | TokenKind::Tag | TokenKind::Alias
226        );
227        if !matches!(
228            tok.kind,
229            TokenKind::Trivia(_) | TokenKind::StreamStart | TokenKind::StreamEnd
230        ) {
231            if is_decoration {
232                decoration_col_floor = Some(
233                    decoration_col_floor.map_or(tok.start.column, |c| c.min(tok.start.column)),
234                );
235            } else {
236                prev_significant = Some(tok.kind);
237                decoration_col_floor = None;
238            }
239        }
           // Phase 1: structural bookkeeping. Arms that end in `continue`
           // have fully handled the token; the others fall through to the
           // byte-emitting phase below.
240        match tok.kind {
241            TokenKind::StreamStart | TokenKind::StreamEnd => continue,
242            TokenKind::BlockMappingStart => {
243                ensure_doc_open(&mut builder, &mut doc_open);
244                doc_only_has_directives = false;
245                ensure_flow_seq_item_open(&mut builder, &mut block_stack);
246                builder.start_node(SyntaxKind::YAML_BLOCK_MAP.into());
247                block_stack.push(BlockFrame::BlockMap {
248                    entry_open: false,
249                    in_value: false,
250                });
251                continue;
252            }
253            TokenKind::BlockSequenceStart => {
254                ensure_doc_open(&mut builder, &mut doc_open);
255                doc_only_has_directives = false;
256                ensure_flow_seq_item_open(&mut builder, &mut block_stack);
257                builder.start_node(SyntaxKind::YAML_BLOCK_SEQUENCE.into());
258                block_stack.push(BlockFrame::BlockSequence {
259                    item_open: false,
260                    indentless: false,
261                });
262                continue;
263            }
264            TokenKind::BlockEnd => {
265                // Indentless sequences have no scanner BlockEnd of their
266                // own, so a BlockEnd arriving while one is on top is meant
267                // for the real container beneath it. Close the indentless
268                // frame(s) first, then consume the BlockEnd normally.
269                close_indentless_sequences(&mut builder, &mut block_stack);
270                close_open_sub_wrapper(&mut builder, &mut block_stack);
271                // Defensive: only close if the scanner gave us an open
272                // container. A stray BlockEnd would otherwise pop the
273                // YAML_DOCUMENT or YAML_STREAM frame.
274                if block_stack.pop().is_some() {
275                    builder.finish_node();
276                }
277                continue;
278            }
279            TokenKind::FlowSequenceStart => {
280                ensure_doc_open(&mut builder, &mut doc_open);
281                doc_only_has_directives = false;
282                ensure_flow_seq_item_open(&mut builder, &mut block_stack);
283                // If nested inside a Map's open KEY/VALUE wrapper, the
284                // current open scope is the appropriate parent.
285                builder.start_node(SyntaxKind::YAML_FLOW_SEQUENCE.into());
286                block_stack.push(BlockFrame::FlowSequence { item_open: false });
287                let text = &input[tok.start.index..tok.end.index];
288                builder.token(SyntaxKind::YAML_FLOW_INDICATOR.into(), text);
289                continue;
290            }
291            TokenKind::FlowSequenceEnd => {
292                close_open_sub_wrapper(&mut builder, &mut block_stack);
293                let text = &input[tok.start.index..tok.end.index];
294                builder.token(SyntaxKind::YAML_FLOW_INDICATOR.into(), text);
                   // The closer is emitted unconditionally; the container is
                   // only popped when the top frame is a flow container
                   // (either kind).
295                if matches!(
296                    block_stack.last(),
297                    Some(BlockFrame::FlowSequence { .. } | BlockFrame::FlowMap { .. })
298                ) {
299                    block_stack.pop();
300                    builder.finish_node();
301                }
302                continue;
303            }
304            TokenKind::FlowMappingStart => {
305                ensure_doc_open(&mut builder, &mut doc_open);
306                doc_only_has_directives = false;
307                ensure_flow_seq_item_open(&mut builder, &mut block_stack);
308                builder.start_node(SyntaxKind::YAML_FLOW_MAP.into());
309                block_stack.push(BlockFrame::FlowMap {
310                    entry_open: false,
311                    in_value: false,
312                });
313                let text = &input[tok.start.index..tok.end.index];
314                builder.token(SyntaxKind::YAML_FLOW_INDICATOR.into(), text);
315                continue;
316            }
317            TokenKind::FlowMappingEnd => {
318                close_open_sub_wrapper(&mut builder, &mut block_stack);
319                let text = &input[tok.start.index..tok.end.index];
320                builder.token(SyntaxKind::YAML_FLOW_INDICATOR.into(), text);
                   // Same rule as for the flow sequence closer: pop only
                   // when a flow container is on top of the stack.
321                if matches!(
322                    block_stack.last(),
323                    Some(BlockFrame::FlowMap { .. } | BlockFrame::FlowSequence { .. })
324                ) {
325                    block_stack.pop();
326                    builder.finish_node();
327                }
328                continue;
329            }
330            TokenKind::FlowEntry => {
331                // `,` closes the current entry/item and lives at the
332                // container level (between peer entries/items).
333                close_open_sub_wrapper(&mut builder, &mut block_stack);
334                let text = &input[tok.start.index..tok.end.index];
335                builder.token(SyntaxKind::YAML_FLOW_INDICATOR.into(), text);
336                continue;
337            }
338            TokenKind::Key => {
339                // A `Key` at the parent map's level terminates any
340                // open indentless sequence value first, revealing the
341                // map frame below.
342                close_indentless_sequences(&mut builder, &mut block_stack);
343                // Both the synthetic 0-width splice and the source-backed
344                // `?` indicator open a new map entry. Close the previous
345                // entry first if still open. After this, the current
346                // open scope is the new key wrapper.
347                if matches!(
348                    block_stack.last(),
349                    Some(BlockFrame::BlockMap { .. } | BlockFrame::FlowMap { .. })
350                ) {
351                    open_map_entry_with_key(&mut builder, &mut block_stack);
352                }
353                if tok.start.index == tok.end.index {
354                    // Synthetic Key splice carries no bytes.
355                    continue;
356                }
357                // Source-backed `?`: ensure we have somewhere to put it.
358                ensure_flow_seq_item_open(&mut builder, &mut block_stack);
359                // Fall through to emit `?` inside the open KEY (or
360                // current scope if not in a Map frame).
361            }
362            TokenKind::Value => {
363                // An empty-key `:` at the parent map's level likewise
364                // terminates an open indentless sequence value first.
365                close_indentless_sequences(&mut builder, &mut block_stack);
                   // Copy out `(is_flow, entry_open, in_value)` from the top
                   // frame when it is a map; `None` for any other frame.
366                let map_state = match block_stack.last().copied() {
367                    Some(BlockFrame::BlockMap {
368                        entry_open,
369                        in_value,
370                    }) => Some((false, entry_open, in_value)),
371                    Some(BlockFrame::FlowMap {
372                        entry_open,
373                        in_value,
374                    }) => Some((true, entry_open, in_value)),
375                    _ => None,
376                };
377                if let Some((is_flow, mut entry_open, mut in_value)) = map_state {
378                    // A bare `:` arriving while the current block-map
379                    // entry is already in its VALUE phase starts a NEW
380                    // entry whose key is empty (`: a\n: b`, 2JQS/S3PD) —
381                    // not a double-colon inside that value. The scanner's
382                    // indent machinery guarantees we only reach here for a
383                    // peer at the map's column (a deeper colon rolls a
384                    // fresh BlockMappingStart; a shallower one unwinds with
385                    // BlockEnd first), so close the current entry and fall
386                    // through to open the new one. Flow maps separate
387                    // entries with `,`, which already closes the entry, so
388                    // their in_value is false here — leave them alone.
389                    if !is_flow && entry_open && in_value {
390                        close_open_sub_wrapper(&mut builder, &mut block_stack);
391                        entry_open = false;
392                        in_value = false;
393                    }
394                    // Empty-key shorthand: `:` arriving without a prior
395                    // Key opens an ENTRY+KEY before consuming the colon.
396                    if !entry_open {
397                        open_map_entry_with_key(&mut builder, &mut block_stack);
398                    }
399                    if !in_value {
400                        // The colon is the last token of KEY. After it
401                        // we close KEY and open VALUE.
402                        let text = &input[tok.start.index..tok.end.index];
403                        if !text.is_empty() {
404                            builder.token(SyntaxKind::YAML_COLON.into(), text);
405                        }
406                        builder.finish_node(); // close KEY
407                        let value_kind = if is_flow {
408                            SyntaxKind::YAML_FLOW_MAP_VALUE
409                        } else {
410                            SyntaxKind::YAML_BLOCK_MAP_VALUE
411                        };
412                        builder.start_node(value_kind.into());
413                        if let Some(
414                            BlockFrame::BlockMap { in_value, .. }
415                            | BlockFrame::FlowMap { in_value, .. },
416                        ) = block_stack.last_mut()
417                        {
418                            *in_value = true;
419                        }
420                        continue;
421                    }
422                    // Already in_value: pathological double-colon. Fall
423                    // through and emit at the current scope (inside
424                    // VALUE) for losslessness.
425                }
426                // Not a Map frame: ensure flow-seq ITEM is open, then
427                // fall through to emit `:` at current scope.
428                ensure_flow_seq_item_open(&mut builder, &mut block_stack);
429            }
430            TokenKind::BlockEntry => {
431                // An indentless sequence opens when a `-` lands directly
432                // in a block-map VALUE: the scanner pushed no indent
433                // level (the `-` is at the parent key's column), so no
434                // `BlockSequenceStart` arrived. Synthesize the
435                // `YAML_BLOCK_SEQUENCE` frame inside the open VALUE so the
436                // tree matches the indented form (spec 8.2.1). Only when
437                // the `:` is the last significant token — i.e. the value
438                // is otherwise empty; a `-` after scalar content in the
439                // value is a structural error left unwrapped for the
440                // validator to reject.
441                // Decorations between `:` and `-` are allowed only when
442                // they sit inside the value scope — strictly indented
443                // past the indentless `-`. Otherwise the anchor is at
444                // the parent mapping's level (G9HC) and the sequence
445                // shouldn't wrap.
446                let decorations_inside_value =
447                    decorations_so_far.is_none_or(|c| c > tok.start.column);
448                let indentless_value = last_significant == Some(TokenKind::Value)
449                    && matches!(
450                        block_stack.last(),
451                        Some(BlockFrame::BlockMap { in_value: true, .. })
452                    )
453                    && decorations_inside_value;
454                // The mirror case: a `-` landing directly after the `?`
455                // explicit-key indicator opens an indentless sequence as
456                // the KEY's content (6PBE). The scanner likewise pushes no
457                // indent level, so synthesize the `YAML_BLOCK_SEQUENCE`
458                // inside the open KEY. `close_indentless_sequences` later
459                // pops it when the entry's `:` (`Value`) arrives.
460                let indentless_key = last_significant == Some(TokenKind::Key)
461                    && matches!(
462                        block_stack.last(),
463                        Some(BlockFrame::BlockMap {
464                            entry_open: true,
465                            in_value: false,
466                        })
467                    )
468                    && decorations_inside_value;
469                if indentless_value || indentless_key {
470                    builder.start_node(SyntaxKind::YAML_BLOCK_SEQUENCE.into());
471                    block_stack.push(BlockFrame::BlockSequence {
472                        item_open: false,
473                        indentless: true,
474                    });
475                }
476                if matches!(block_stack.last(), Some(BlockFrame::BlockSequence { .. })) {
477                    close_open_sub_wrapper(&mut builder, &mut block_stack);
478                    builder.start_node(SyntaxKind::YAML_BLOCK_SEQUENCE_ITEM.into());
479                    if let Some(BlockFrame::BlockSequence { item_open, .. }) =
480                        block_stack.last_mut()
481                    {
482                        *item_open = true;
483                    }
484                }
485                // Fall through to emit the `-` byte inside the new ITEM
486                // (or at current scope if not in a Sequence frame).
487            }
488            TokenKind::Trivia(_) => {
489                // Trivia bypasses item-opening: pre-content trivia in a
490                // flow sequence stays at SEQUENCE level.
491            }
492            _ => {
493                // Any other source-backed content (Scalar, Anchor, Tag,
494                // Alias, Directive, doc markers): if we're inside a
495                // FlowSequence with no open ITEM, open one before
496                // emitting. Doc markers are handled below.
497                if !matches!(tok.kind, TokenKind::DocumentStart | TokenKind::DocumentEnd) {
498                    ensure_flow_seq_item_open(&mut builder, &mut block_stack);
499                }
500            }
501        }
           // Phase 2: emit the token's source bytes and maintain the
           // YAML_DOCUMENT boundaries.
502        let text = &input[tok.start.index..tok.end.index];
503        if text.is_empty() {
504            // Defensive: never emit zero-width tokens (rowan rejects).
505            continue;
506        }
507        let kind = map_token_to_syntax_kind(tok.kind);
508        match tok.kind {
509            TokenKind::DocumentStart => {
510                // `---` begins a fresh document. Two cases:
511                //  - The currently-open document only has directives so
512                //    far: per YAML 1.2 the directives belong to the doc
513                //    that this `---` opens. Stay inside, just emit the
514                //    marker.
515                //  - Otherwise: close the previous doc (and any open
516                //    block containers) and open a new YAML_DOCUMENT.
517                //    The scanner unwinds the indent stack at column 0,
518                //    but a same-indent map at indent==0 leaves them
519                //    open, so close them defensively.
520                if doc_open && doc_only_has_directives {
521                    builder.token(kind.into(), text);
522                    doc_only_has_directives = false;
523                } else {
524                    close_block_containers(&mut builder, &mut block_stack);
525                    if doc_open {
526                        builder.finish_node();
527                    }
528                    builder.start_node(SyntaxKind::YAML_DOCUMENT.into());
529                    doc_open = true;
530                    doc_only_has_directives = false;
531                    builder.token(kind.into(), text);
532                }
533            }
534            TokenKind::DocumentEnd => {
535                // `...` closes the current document. Close any open
536                // block containers first so the marker is a child of
537                // the document, not buried in a block container.
538                close_block_containers(&mut builder, &mut block_stack);
                   // A `...` with no document in progress still gets its own
                   // YAML_DOCUMENT node to live in.
539                if !doc_open {
540                    builder.start_node(SyntaxKind::YAML_DOCUMENT.into());
541                }
542                builder.token(kind.into(), text);
543                builder.finish_node();
544                doc_open = false;
545                doc_only_has_directives = false;
546            }
547            TokenKind::Trivia(_) => {
548                // Trivia goes to whichever level is currently open;
549                // pre-document trivia stays at YAML_STREAM, in-document
550                // trivia stays inside the YAML_DOCUMENT, the open
551                // block container, or the open ENTRY/ITEM sub-wrapper.
552                builder.token(kind.into(), text);
553            }
554            TokenKind::Directive => {
555                // Directives belong inside a YAML_DOCUMENT but don't by
556                // themselves count as body content — a following `---`
557                // should not split into a separate doc.
558                let was_open = doc_open;
559                ensure_doc_open(&mut builder, &mut doc_open);
                   // Only a directive that *opened* the document marks it
                   // directives-only; one arriving in an already-open
                   // document leaves the flag as it was.
560                if !was_open {
561                    doc_only_has_directives = true;
562                }
563                builder.token(kind.into(), text);
564            }
565            TokenKind::Scalar(_) => {
566                // A scalar is emitted as a `YAML_SCALAR` *node* whose
567                // leaves are the per-physical-line content fragments
568                // (`YAML_SCALAR_TEXT`) interleaved with `NEWLINE` tokens.
569                // The byte slice is unchanged, so this is lossless; the
570                // node shape lets the formatter/LSP navigate scalar lines
571                // (and, later, hashpipe line prefixes) as real structure.
572                ensure_doc_open(&mut builder, &mut doc_open);
573                doc_only_has_directives = false;
574                emit_scalar_node(&mut builder, text, line_prefix);
575            }
576            _ => {
577                // Any other non-trivia content (Anchor, Tag, Alias, ...)
578                // opens an implicit document when one isn't already in
579                // progress and counts as body content (clears the
580                // directives-only flag).
581                ensure_doc_open(&mut builder, &mut doc_open);
582                doc_only_has_directives = false;
583                builder.token(kind.into(), text);
584            }
585        }
586    }
587    // Close any open block containers (and their open ENTRY/ITEM
588    // sub-wrappers) and the open document. The scanner emits BlockEnd
589    // on stream end via `unwind_indent(-1)`, so this is normally a
590    // no-op for `block_stack`; kept for safety against truncated
591    // inputs and future scanner quirks.
592    close_block_containers(&mut builder, &mut block_stack);
593    if doc_open {
594        builder.finish_node();
595    }
       // Close the root YAML_STREAM node opened at the top.
596    builder.finish_node();
597    SyntaxNode::new_root(builder.finish())
598}
599
/// One open container on the streaming builder's stack.
///
/// Block and flow containers carry the same kind of bookkeeping, but they
/// map to different `SyntaxKind` variants and are closed by different
/// tokens (`BlockEnd` / dedent for block containers, `]` / `}` / `,` for
/// flow containers).
#[derive(Debug, Clone, Copy)]
enum BlockFrame {
    /// A block mapping. `entry_open` is set while the current entry
    /// sub-wrapper is still open; `in_value` tells whether the open
    /// sub-sub-wrapper is the VALUE (`true`) or the KEY (`false`).
    BlockMap {
        entry_open: bool,
        in_value: bool,
    },
    /// A block sequence. `item_open` is set while the current item
    /// sub-wrapper is still open.
    ///
    /// `indentless` marks a sequence opened as a block-map value whose `-`
    /// entries sit at the same column as the parent key (YAML's
    /// "indentless sequence", spec 8.2.1). The scanner never pushes an
    /// indent level for such a sequence and therefore emits no matching
    /// `BlockEnd`; the builder has to close the frame on its own when the
    /// parent map's next `Key` / `Value` / `BlockEnd` arrives.
    BlockSequence {
        item_open: bool,
        indentless: bool,
    },
    /// A flow mapping; the fields mean the same as on [`BlockFrame::BlockMap`].
    FlowMap {
        entry_open: bool,
        in_value: bool,
    },
    /// A flow sequence. `item_open` is set while the current item
    /// sub-wrapper is still open.
    FlowSequence {
        item_open: bool,
    },
}
633
634fn ensure_doc_open(builder: &mut GreenNodeBuilder<'_>, doc_open: &mut bool) {
635    if !*doc_open {
636        builder.start_node(SyntaxKind::YAML_DOCUMENT.into());
637        *doc_open = true;
638    }
639}
640
641/// In a flow sequence, source-backed content opens a new
642/// `YAML_FLOW_SEQUENCE_ITEM` lazily — there is no `-` token to drive
643/// the boundary the way `BlockEntry` drives block sequences. Trivia
644/// arriving before the first item stays at the container level.
645fn ensure_flow_seq_item_open(builder: &mut GreenNodeBuilder<'_>, stack: &mut [BlockFrame]) {
646    if let Some(BlockFrame::FlowSequence { item_open }) = stack.last_mut()
647        && !*item_open
648    {
649        builder.start_node(SyntaxKind::YAML_FLOW_SEQUENCE_ITEM.into());
650        *item_open = true;
651    }
652}
653
654/// Open `<MAP>_ENTRY` > `<MAP>_KEY` for the next entry, closing any
655/// previously-open entry on the same Map frame. Caller must have
656/// verified the top frame is a Map (Block or Flow).
657fn open_map_entry_with_key(builder: &mut GreenNodeBuilder<'_>, stack: &mut [BlockFrame]) {
658    close_open_sub_wrapper(builder, stack);
659    let (entry_kind, key_kind) = match stack.last() {
660        Some(BlockFrame::BlockMap { .. }) => (
661            SyntaxKind::YAML_BLOCK_MAP_ENTRY,
662            SyntaxKind::YAML_BLOCK_MAP_KEY,
663        ),
664        Some(BlockFrame::FlowMap { .. }) => (
665            SyntaxKind::YAML_FLOW_MAP_ENTRY,
666            SyntaxKind::YAML_FLOW_MAP_KEY,
667        ),
668        _ => return,
669    };
670    builder.start_node(entry_kind.into());
671    builder.start_node(key_kind.into());
672    if let Some(
673        BlockFrame::BlockMap {
674            entry_open,
675            in_value,
676        }
677        | BlockFrame::FlowMap {
678            entry_open,
679            in_value,
680        },
681    ) = stack.last_mut()
682    {
683        *entry_open = true;
684        *in_value = false;
685    }
686}
687
688/// Close any indentless `YAML_BLOCK_SEQUENCE` frames on top of the
689/// stack. These have no matching scanner `BlockEnd`, so they're closed
690/// here when the parent map's next `Key` / `Value` / `BlockEnd` arrives.
691/// Closing the open ITEM, finishing the SEQUENCE node, and popping the
692/// frame reveals the parent map for the incoming token. Loops because
693/// the next token may close several levels, though in practice
694/// indentless frames never stack directly (they're always separated by
695/// a map frame).
696fn close_indentless_sequences(builder: &mut GreenNodeBuilder<'_>, stack: &mut Vec<BlockFrame>) {
697    while let Some(BlockFrame::BlockSequence {
698        indentless: true, ..
699    }) = stack.last()
700    {
701        close_open_sub_wrapper(builder, stack);
702        stack.pop();
703        builder.finish_node(); // close YAML_BLOCK_SEQUENCE
704    }
705}
706
707/// Close the top-of-stack frame's entry/item sub-wrapper if still open
708/// and clear the flag. For maps, this closes the inner KEY/VALUE
709/// node and the surrounding ENTRY. If we're closing while the entry
710/// is still in its KEY phase (i.e. the entry never received a `:`,
711/// e.g. a `?`-only explicit-key entry), an empty VALUE wrapper is
712/// inserted before the ENTRY closes so every ENTRY has the same
713/// `KEY + VALUE` child shape — the projection layer relies on that
714/// invariant. For sequences it closes the ITEM. Caller decides whether
715/// to also pop the frame itself.
716fn close_open_sub_wrapper(builder: &mut GreenNodeBuilder<'_>, stack: &mut [BlockFrame]) {
717    let Some(frame) = stack.last_mut() else {
718        return;
719    };
720    match frame {
721        BlockFrame::BlockMap {
722            entry_open: true,
723            in_value,
724        } => {
725            if *in_value {
726                builder.finish_node(); // close VALUE
727            } else {
728                builder.finish_node(); // close KEY
729                builder.start_node(SyntaxKind::YAML_BLOCK_MAP_VALUE.into());
730                builder.finish_node(); // empty VALUE for shape parity
731            }
732            builder.finish_node(); // close ENTRY
733            *frame = BlockFrame::BlockMap {
734                entry_open: false,
735                in_value: false,
736            };
737        }
738        BlockFrame::FlowMap {
739            entry_open: true,
740            in_value,
741        } => {
742            if *in_value {
743                builder.finish_node();
744            } else {
745                builder.finish_node();
746                builder.start_node(SyntaxKind::YAML_FLOW_MAP_VALUE.into());
747                builder.finish_node();
748            }
749            builder.finish_node();
750            *frame = BlockFrame::FlowMap {
751                entry_open: false,
752                in_value: false,
753            };
754        }
755        BlockFrame::BlockSequence {
756            item_open: true,
757            indentless,
758        } => {
759            let indentless = *indentless;
760            builder.finish_node();
761            *frame = BlockFrame::BlockSequence {
762                item_open: false,
763                indentless,
764            };
765        }
766        BlockFrame::FlowSequence { item_open: true } => {
767            builder.finish_node();
768            *frame = BlockFrame::FlowSequence { item_open: false };
769        }
770        _ => {}
771    }
772}
773
774fn close_block_containers(builder: &mut GreenNodeBuilder<'_>, stack: &mut Vec<BlockFrame>) {
775    while let Some(frame) = stack.pop() {
776        match frame {
777            BlockFrame::BlockMap {
778                entry_open: true,
779                in_value,
780            } => {
781                if in_value {
782                    builder.finish_node(); // close VALUE
783                } else {
784                    builder.finish_node(); // close KEY
785                    builder.start_node(SyntaxKind::YAML_BLOCK_MAP_VALUE.into());
786                    builder.finish_node();
787                }
788                builder.finish_node(); // close ENTRY
789            }
790            BlockFrame::FlowMap {
791                entry_open: true,
792                in_value,
793            } => {
794                if in_value {
795                    builder.finish_node();
796                } else {
797                    builder.finish_node();
798                    builder.start_node(SyntaxKind::YAML_FLOW_MAP_VALUE.into());
799                    builder.finish_node();
800                }
801                builder.finish_node();
802            }
803            BlockFrame::BlockSequence {
804                item_open: true, ..
805            }
806            | BlockFrame::FlowSequence { item_open: true } => {
807                builder.finish_node();
808            }
809            _ => {}
810        }
811        builder.finish_node();
812    }
813}
814
815/// Emit a scalar token's bytes as a `YAML_SCALAR` node whose leaves are
816/// the per-physical-line content fragments (`YAML_SCALAR_TEXT`)
817/// interleaved with `NEWLINE` leaves for the line breaks. Concatenating
818/// the leaves reproduces `text` exactly, so this is byte-lossless and the
819/// node's text range is unchanged. The node wrapper plus per-line
820/// fragmentation is what lets the formatter/LSP treat a scalar as real
821/// structure and is the seam a later step uses to interleave hashpipe
822/// line-prefix leaves (see the yaml-formatter cutover plan, step 2).
823fn emit_scalar_node(
824    builder: &mut GreenNodeBuilder<'static>,
825    text: &str,
826    line_prefix: Option<&str>,
827) {
828    builder.start_node(SyntaxKind::YAML_SCALAR.into());
829    emit_scalar_fragments(builder, text, line_prefix);
830    builder.finish_node();
831}
832
833/// Split a scalar's source `text` into per-physical-line leaves:
834/// `YAML_SCALAR_TEXT` content interleaved with `NEWLINE` line breaks
835/// (`\n`, `\r\n`, and lone `\r` each one `NEWLINE` leaf). When
836/// `line_prefix` is set, an embedded prefix at the start of each
837/// *continuation* line (the marker plus at most one trailing space,
838/// mirroring the scanner) is peeled into a leading `YAML_LINE_PREFIX`
839/// leaf. The first line never carries an embedded prefix — its line-start
840/// prefix was emitted as a separate `Trivia(LinePrefix)` token by the
841/// scanner before the scalar began. Empty content runs are skipped
842/// (rowan rejects zero-width tokens). The concatenation of all leaves
843/// equals `text` exactly, so the node stays byte-lossless.
844fn emit_scalar_fragments(
845    builder: &mut GreenNodeBuilder<'static>,
846    text: &str,
847    line_prefix: Option<&str>,
848) {
849    let bytes = text.as_bytes();
850    let mut i = 0;
851    let mut line_index = 0usize;
852    while i < bytes.len() {
853        // Peel an embedded line prefix on continuation lines only.
854        if line_index > 0
855            && let Some(prefix) = line_prefix
856            && let Some(len) = prefix_match_len(&text[i..], prefix)
857        {
858            builder.token(SyntaxKind::YAML_LINE_PREFIX.into(), &text[i..i + len]);
859            i += len;
860        }
861        // Content up to the next line break.
862        let content_start = i;
863        while i < bytes.len() && !matches!(bytes[i], b'\n' | b'\r') {
864            i += 1;
865        }
866        if content_start < i {
867            builder.token(SyntaxKind::YAML_SCALAR_TEXT.into(), &text[content_start..i]);
868        }
869        // Line break (if any).
870        if i < bytes.len() {
871            let nl_len = if bytes[i] == b'\r' && bytes.get(i + 1) == Some(&b'\n') {
872                2
873            } else {
874                1
875            };
876            builder.token(SyntaxKind::NEWLINE.into(), &text[i..i + nl_len]);
877            i += nl_len;
878            line_index += 1;
879        }
880    }
881}
882
/// Measure an embedded line prefix at the start of `s`: `marker`
/// followed by at most one space (mirroring `strip_line_prefix` and the
/// scanner's `prefix_byte_len_at`). Returns the matched byte length, or
/// `None` when `s` does not start with `marker`.
fn prefix_match_len(s: &str, marker: &str) -> Option<usize> {
    s.strip_prefix(marker).map(|rest| {
        // Only a single space directly after the marker is part of the
        // prefix; any further spaces belong to the content.
        let space_len = if rest.starts_with(' ') { 1 } else { 0 };
        marker.len() + space_len
    })
}
890
891fn map_token_to_syntax_kind(kind: TokenKind) -> SyntaxKind {
892    match kind {
893        TokenKind::Trivia(TriviaKind::Whitespace) => SyntaxKind::WHITESPACE,
894        TokenKind::Trivia(TriviaKind::Newline) => SyntaxKind::NEWLINE,
895        TokenKind::Trivia(TriviaKind::Comment) => SyntaxKind::YAML_COMMENT,
896        TokenKind::Trivia(TriviaKind::LinePrefix) => SyntaxKind::YAML_LINE_PREFIX,
897        TokenKind::DocumentStart => SyntaxKind::YAML_DOCUMENT_START,
898        TokenKind::DocumentEnd => SyntaxKind::YAML_DOCUMENT_END,
899        TokenKind::Directive => SyntaxKind::YAML_DIRECTIVE,
900        TokenKind::BlockEntry => SyntaxKind::YAML_BLOCK_SEQ_ENTRY,
901        TokenKind::FlowEntry => SyntaxKind::YAML_FLOW_INDICATOR,
902        TokenKind::FlowSequenceStart | TokenKind::FlowSequenceEnd => {
903            SyntaxKind::YAML_FLOW_INDICATOR
904        }
905        TokenKind::FlowMappingStart | TokenKind::FlowMappingEnd => SyntaxKind::YAML_FLOW_INDICATOR,
906        TokenKind::Value => SyntaxKind::YAML_COLON,
907        TokenKind::Anchor => SyntaxKind::YAML_ANCHOR,
908        TokenKind::Alias => SyntaxKind::YAML_ALIAS,
909        TokenKind::Tag => SyntaxKind::YAML_TAG,
910        // Scalar tokens are emitted as a `YAML_SCALAR` *node* (split into
911        // per-line `YAML_SCALAR_TEXT` leaves) via `emit_scalar_node`, not
912        // through this token-kind map. This arm is the leaf kind for a
913        // scalar's content fragment, used by that helper.
914        TokenKind::Scalar(_) => SyntaxKind::YAML_SCALAR_TEXT,
915        // Source-backed `Key` (the explicit `?` indicator) — there is
916        // no dedicated SyntaxKind yet, route to YAML_KEY for now.
917        TokenKind::Key => SyntaxKind::YAML_KEY,
918        // Synthetic markers handled before this map; defensive
919        // fallback (never emitted as bytes).
920        TokenKind::StreamStart
921        | TokenKind::StreamEnd
922        | TokenKind::BlockSequenceStart
923        | TokenKind::BlockMappingStart
924        | TokenKind::BlockEnd => SyntaxKind::YAML_FLOW_INDICATOR,
925    }
926}
927
928#[cfg(test)]
929mod tests {
930    use super::*;
931    use crate::syntax::SyntaxKind;
932
933    /// `parse_stream` must reproduce its input byte-for-byte.
934    fn assert_lossless(input: &str) {
935        assert_eq!(
936            parse_stream(input).text().to_string(),
937            input,
938            "input {input:?} not preserved"
939        );
940    }
941
942    #[test]
943    fn strip_with_offsets_matches_strip_line_prefix() {
944        for input in [
945            "#| a: 1\n",
946            "#| a: 1\n#|   b\n",
947            "  #| x: 1\n",
948            "#| a\r\n#| b\r\n",
949            "#| a",
950        ] {
951            let (text, offsets) = strip_line_prefix_with_offsets(input, "#|");
952            assert_eq!(text, strip_line_prefix(input, "#|"), "text for {input:?}");
953            assert_eq!(offsets.len(), text.len() + 1, "offset count for {input:?}");
954            assert!(
955                offsets.iter().all(|&o| o <= input.len()),
956                "offsets in bounds for {input:?}"
957            );
958        }
959    }
960
961    #[test]
962    fn locate_maps_hashpipe_error_to_region_offset() {
963        let input = "#| echo: [\n";
964        let (_diag, start, _end) = locate_yaml_diagnostic(input, "#|").expect("diagnostic");
965        assert_eq!(start, input.find('[').unwrap());
966    }
967
968    #[test]
969    fn locate_maps_composite_marker_error() {
970        // List-indented cell: the marker includes the container indent.
971        let input = "   #| echo: [\n";
972        let (_diag, start, _end) = locate_yaml_diagnostic(input, "   #|").expect("diagnostic");
973        assert_eq!(start, input.find('[').unwrap());
974    }
975
976    #[test]
977    fn locate_maps_crlf_region_error() {
978        let input = "#| ok: 1\r\n#| echo: [\r\n";
979        let (_diag, start, _end) = locate_yaml_diagnostic(input, "#|").expect("diagnostic");
980        assert_eq!(start, input.find('[').unwrap());
981    }
982
983    #[test]
984    fn locate_frontmatter_uses_identity_offsets() {
985        let input = "title: [\n";
986        let (diag, start, _end) = locate_yaml_diagnostic(input, "").expect("diagnostic");
987        assert_eq!(start, diag.byte_start);
988        assert_eq!(start, input.find('[').unwrap());
989    }
990
991    #[test]
992    fn locate_returns_none_for_valid_yaml() {
993        assert!(locate_yaml_diagnostic("#| echo: false\n", "#|").is_none());
994        assert!(locate_yaml_diagnostic("title: ok\n", "").is_none());
995    }
996
997    #[test]
998    fn block_scalar_followed_by_option_is_not_swallowed_as_comment() {
999        // Regression: a prefixed option after a `|` block scalar was scanned as a
1000        // YAML comment (the terminating line's `#|` prefix wasn't peeled), which
1001        // dropped the option. Both keys must survive as structure.
1002        let input = "#| fig-cap: |\n#|   A caption\n#| echo: false\n";
1003        let tree = parse_stream_with_prefix(input, "#|");
1004        assert_eq!(tree.to_string(), input, "byte-lossless");
1005        let entries = tree
1006            .descendants()
1007            .filter(|node| node.kind() == SyntaxKind::YAML_BLOCK_MAP_ENTRY)
1008            .count();
1009        assert_eq!(entries, 2, "expected fig-cap and echo entries");
1010        assert!(
1011            !tree
1012                .descendants_with_tokens()
1013                .any(|element| element.kind() == SyntaxKind::YAML_COMMENT),
1014            "the option line must not be scanned as a comment"
1015        );
1016    }
1017
1018    #[test]
1019    fn returns_byte_lossless_cst_for_empty_input() {
1020        assert_lossless("");
1021    }
1022
1023    #[test]
1024    fn returns_byte_lossless_cst_for_simple_mapping() {
1025        assert_lossless("key: value\n");
1026    }
1027
1028    #[test]
1029    fn returns_byte_lossless_cst_for_block_sequence() {
1030        assert_lossless("- a\n- b\n");
1031    }
1032
1033    #[test]
1034    fn returns_byte_lossless_cst_for_flow_mapping() {
1035        assert_lossless("{a: b, c: d}\n");
1036    }
1037
1038    #[test]
1039    fn returns_byte_lossless_cst_for_block_scalar() {
1040        assert_lossless("key: |\n  hello\n  world\n");
1041    }
1042
1043    #[test]
1044    fn returns_byte_lossless_cst_for_quoted_scalar() {
1045        assert_lossless("\"key\": \"value\"\n");
1046    }
1047
1048    #[test]
1049    fn returns_byte_lossless_cst_for_multi_line_plain_scalar() {
1050        assert_lossless("key: hello\n  world\n");
1051    }
1052
1053    #[test]
1054    fn preserves_explicit_key_indicator_byte_in_flow_context() {
1055        // The `?` explicit-key indicator carries a 1-byte source span
1056        // even in flow context, so the builder must NOT drop it
1057        // (only zero-width `Key` splices from `fetch_value` should be
1058        // dropped). Regression: an earlier draft filtered every Key.
1059        assert_lossless("{ ?foo: bar }\n");
1060    }
1061
1062    #[test]
1063    fn does_not_absorb_terminator_line_break_into_flow_scalar() {
1064        // Regression: in flow context the multi-line plain
1065        // continuation must abort if the next non-blank char is a
1066        // flow terminator (`}`/`]`/`,`). Otherwise the trailing
1067        // newline got swallowed into the scalar (`42\n` instead of
1068        // `42`) and the closer's byte position drifted.
1069        assert_lossless("{a: 42\n}\n");
1070    }
1071
1072    fn document_count(tree: &SyntaxNode) -> usize {
1073        tree.children()
1074            .filter(|n| n.kind() == SyntaxKind::YAML_DOCUMENT)
1075            .count()
1076    }
1077
1078    #[test]
1079    fn implicit_document_wraps_body_with_no_markers() {
1080        // No explicit `---` or `...` — the body still belongs to one
1081        // YAML_DOCUMENT so projection has a node to walk.
1082        let input = "key: value\n";
1083        let tree = parse_stream(input);
1084        assert_eq!(document_count(&tree), 1);
1085        assert_eq!(tree.text().to_string(), input);
1086    }
1087
1088    #[test]
1089    fn explicit_doc_start_opens_document_marker_lives_inside() {
1090        let input = "---\nkey: value\n";
1091        let tree = parse_stream(input);
1092        assert_eq!(document_count(&tree), 1);
1093        let doc = tree
1094            .children()
1095            .find(|n| n.kind() == SyntaxKind::YAML_DOCUMENT)
1096            .expect("document node");
1097        assert!(
1098            doc.children_with_tokens().any(|el| el
1099                .as_token()
1100                .is_some_and(|t| t.kind() == SyntaxKind::YAML_DOCUMENT_START)),
1101            "`---` token should live inside YAML_DOCUMENT"
1102        );
1103        assert_eq!(tree.text().to_string(), input);
1104    }
1105
1106    #[test]
1107    fn explicit_doc_end_closes_document_marker_lives_inside() {
1108        let input = "key: value\n...\n";
1109        let tree = parse_stream(input);
1110        assert_eq!(document_count(&tree), 1);
1111        let doc = tree
1112            .children()
1113            .find(|n| n.kind() == SyntaxKind::YAML_DOCUMENT)
1114            .expect("document node");
1115        assert!(
1116            doc.children_with_tokens().any(|el| el
1117                .as_token()
1118                .is_some_and(|t| t.kind() == SyntaxKind::YAML_DOCUMENT_END)),
1119            "`...` token should live inside YAML_DOCUMENT"
1120        );
1121        assert_eq!(tree.text().to_string(), input);
1122    }
1123
1124    #[test]
1125    fn consecutive_doc_starts_emit_two_documents() {
1126        let input = "---\na\n---\nb\n";
1127        let tree = parse_stream(input);
1128        assert_eq!(document_count(&tree), 2);
1129        assert_eq!(tree.text().to_string(), input);
1130    }
1131
1132    #[test]
1133    fn pre_document_trivia_stays_at_stream_level() {
1134        // A leading newline before the first document content should
1135        // sit under YAML_STREAM, not inside a YAML_DOCUMENT — there is
1136        // no document yet at that point.
1137        let input = "\n---\nkey: value\n";
1138        let tree = parse_stream(input);
1139        let stream_token_kinds: Vec<SyntaxKind> = tree
1140            .children_with_tokens()
1141            .filter_map(|el| el.into_token())
1142            .map(|t| t.kind())
1143            .collect();
1144        assert!(
1145            stream_token_kinds.contains(&SyntaxKind::NEWLINE),
1146            "leading newline should be a direct child of YAML_STREAM, got {stream_token_kinds:?}"
1147        );
1148        assert_eq!(tree.text().to_string(), input);
1149    }
1150
1151    #[test]
1152    fn bare_doc_end_at_stream_start_opens_synthetic_empty_document() {
1153        // Pathological but lossless: a stream that begins with `...`
1154        // wraps the marker in an empty YAML_DOCUMENT so no source
1155        // bytes leak out at YAML_STREAM level uncoupled from a doc.
1156        let input = "...\n";
1157        let tree = parse_stream(input);
1158        assert_eq!(document_count(&tree), 1);
1159        assert_eq!(tree.text().to_string(), input);
1160    }
1161
1162    fn first_document(tree: &SyntaxNode) -> SyntaxNode {
1163        tree.children()
1164            .find(|n| n.kind() == SyntaxKind::YAML_DOCUMENT)
1165            .expect("at least one document")
1166    }
1167
1168    fn block_map_under(parent: &SyntaxNode) -> Option<SyntaxNode> {
1169        parent
1170            .children()
1171            .find(|n| n.kind() == SyntaxKind::YAML_BLOCK_MAP)
1172    }
1173
1174    fn block_seq_under(parent: &SyntaxNode) -> Option<SyntaxNode> {
1175        parent
1176            .children()
1177            .find(|n| n.kind() == SyntaxKind::YAML_BLOCK_SEQUENCE)
1178    }
1179
1180    fn block_map_entries(map: &SyntaxNode) -> Vec<SyntaxNode> {
1181        map.children()
1182            .filter(|n| n.kind() == SyntaxKind::YAML_BLOCK_MAP_ENTRY)
1183            .collect()
1184    }
1185
1186    fn block_seq_items(seq: &SyntaxNode) -> Vec<SyntaxNode> {
1187        seq.children()
1188            .filter(|n| n.kind() == SyntaxKind::YAML_BLOCK_SEQUENCE_ITEM)
1189            .collect()
1190    }
1191
1192    fn entry_key(entry: &SyntaxNode) -> SyntaxNode {
1193        entry
1194            .children()
1195            .find(|n| n.kind() == SyntaxKind::YAML_BLOCK_MAP_KEY)
1196            .expect("entry should have a YAML_BLOCK_MAP_KEY child")
1197    }
1198
1199    fn entry_value(entry: &SyntaxNode) -> SyntaxNode {
1200        entry
1201            .children()
1202            .find(|n| n.kind() == SyntaxKind::YAML_BLOCK_MAP_VALUE)
1203            .expect("entry should have a YAML_BLOCK_MAP_VALUE child")
1204    }
1205
1206    #[test]
1207    fn consecutive_empty_key_colons_open_separate_entries() {
1208        // `: a\n: b` is two block-map entries, each with an empty
1209        // (null) key and a value (2JQS). The scanner emits two bare
1210        // `Value` tokens with no Key/BlockEnd between them, so the
1211        // builder must close the first entry when the second `:`
1212        // arrives at the map's column rather than absorbing it into
1213        // the first value.
1214        let input = ": a\n: b\n";
1215        let tree = parse_stream(input);
1216        let doc = first_document(&tree);
1217        let map = block_map_under(&doc).expect("YAML_BLOCK_MAP child");
1218        let entries = block_map_entries(&map);
1219        assert_eq!(entries.len(), 2, "expected two empty-key ENTRY nodes");
1220        for (entry, scalar) in entries.iter().zip(["a", "b"]) {
1221            let key = entry_key(entry);
1222            // Empty key: the KEY holds only the `:` value indicator.
1223            assert!(
1224                !key.children().any(|n| n.kind() == SyntaxKind::YAML_SCALAR),
1225                "empty key should carry no scalar, got {key:?}",
1226            );
1227            let value = entry_value(entry);
1228            assert!(
1229                value
1230                    .children()
1231                    .any(|n| n.kind() == SyntaxKind::YAML_SCALAR && n.text() == scalar),
1232                "value should be {scalar:?}, got {value:?}",
1233            );
1234        }
1235        assert_eq!(tree.text().to_string(), input);
1236    }
1237
1238    #[test]
1239    fn block_mapping_wraps_key_value_with_key_and_value_sub_wrappers() {
1240        let input = "key: value\n";
1241        let tree = parse_stream(input);
1242        let doc = first_document(&tree);
1243        let map = block_map_under(&doc).expect("YAML_BLOCK_MAP child");
1244        let entries = block_map_entries(&map);
1245        assert_eq!(entries.len(), 1, "expected one ENTRY for `key: value`");
1246        let key = entry_key(&entries[0]);
1247        let value = entry_value(&entries[0]);
1248        // Colon ends the KEY (last token); VALUE has the scalar.
1249        assert!(
1250            key.children_with_tokens().any(|el| el
1251                .as_token()
1252                .is_some_and(|t| t.kind() == SyntaxKind::YAML_COLON)),
1253            "colon should be the trailing token of YAML_BLOCK_MAP_KEY",
1254        );
1255        assert!(
1256            value
1257                .children()
1258                .any(|n| n.kind() == SyntaxKind::YAML_SCALAR),
1259            "scalar `value` should live inside YAML_BLOCK_MAP_VALUE",
1260        );
1261        assert_eq!(tree.text().to_string(), input);
1262    }
1263
1264    #[test]
1265    fn block_sequence_wraps_entries_in_yaml_block_sequence() {
1266        let input = "- a\n- b\n";
1267        let tree = parse_stream(input);
1268        let doc = first_document(&tree);
1269        let seq = block_seq_under(&doc).expect("YAML_BLOCK_SEQUENCE child");
1270        let items = block_seq_items(&seq);
1271        assert_eq!(items.len(), 2, "expected 2 YAML_BLOCK_SEQUENCE_ITEM");
1272        // Each item must own its own `-` entry token.
1273        for item in &items {
1274            let dash_count = item
1275                .children_with_tokens()
1276                .filter(|el| {
1277                    el.as_token()
1278                        .is_some_and(|t| t.kind() == SyntaxKind::YAML_BLOCK_SEQ_ENTRY)
1279                })
1280                .count();
1281            assert_eq!(dash_count, 1, "each item owns exactly one `-` token");
1282        }
1283        assert_eq!(tree.text().to_string(), input);
1284    }
1285
1286    #[test]
1287    fn nested_block_mapping_nests_inner_block_map_inside_outer_value() {
1288        let input = "outer:\n  inner: x\n";
1289        let tree = parse_stream(input);
1290        let doc = first_document(&tree);
1291        let outer = block_map_under(&doc).expect("outer YAML_BLOCK_MAP");
1292        let outer_entries = block_map_entries(&outer);
1293        assert_eq!(outer_entries.len(), 1);
1294        let outer_value = entry_value(&outer_entries[0]);
1295        let inner = outer_value
1296            .children()
1297            .find(|n| n.kind() == SyntaxKind::YAML_BLOCK_MAP)
1298            .expect("inner YAML_BLOCK_MAP nested under outer VALUE");
1299        let inner_entries = block_map_entries(&inner);
1300        assert_eq!(inner_entries.len(), 1);
1301        let inner_key = entry_key(&inner_entries[0]);
1302        assert!(
1303            inner_key.children_with_tokens().any(|el| el
1304                .as_token()
1305                .is_some_and(|t| t.kind() == SyntaxKind::YAML_COLON)),
1306            "inner key should own its colon",
1307        );
1308        assert_eq!(tree.text().to_string(), input);
1309    }
1310
1311    #[test]
1312    fn block_sequence_inside_mapping_nests_under_outer_map_value() {
1313        let input = "items:\n  - a\n  - b\n";
1314        let tree = parse_stream(input);
1315        let doc = first_document(&tree);
1316        let map = block_map_under(&doc).expect("YAML_BLOCK_MAP child");
1317        let entries = block_map_entries(&map);
1318        assert_eq!(entries.len(), 1, "one entry: `items: <seq>`");
1319        let value = entry_value(&entries[0]);
1320        let seq = value
1321            .children()
1322            .find(|n| n.kind() == SyntaxKind::YAML_BLOCK_SEQUENCE)
1323            .expect("YAML_BLOCK_SEQUENCE nested under map VALUE");
1324        let items = block_seq_items(&seq);
1325        assert_eq!(items.len(), 2);
1326        assert_eq!(tree.text().to_string(), input);
1327    }
1328
1329    #[test]
1330    fn dedent_closes_inner_block_map_before_next_outer_key() {
1331        // outer:
1332        //   inner: x
1333        // sibling: y
1334        // The dedent before `sibling` must close the inner map and
1335        // its outer ENTRY so `sibling: y` lands as a sibling ENTRY
1336        // under the outer map.
1337        let input = "outer:\n  inner: x\nsibling: y\n";
1338        let tree = parse_stream(input);
1339        let doc = first_document(&tree);
1340        let outer = block_map_under(&doc).expect("outer YAML_BLOCK_MAP");
1341        let entries = block_map_entries(&outer);
1342        assert_eq!(
1343            entries.len(),
1344            2,
1345            "outer map should have two entries (`outer:` and `sibling:`)",
1346        );
1347        // Only the first entry's VALUE has a nested map; the second is flat.
1348        let first_value = entry_value(&entries[0]);
1349        let nested_in_first = first_value
1350            .children()
1351            .filter(|n| n.kind() == SyntaxKind::YAML_BLOCK_MAP)
1352            .count();
1353        assert_eq!(nested_in_first, 1);
1354        let second_value = entry_value(&entries[1]);
1355        let nested_in_second = second_value
1356            .children()
1357            .filter(|n| n.kind() == SyntaxKind::YAML_BLOCK_MAP)
1358            .count();
1359        assert_eq!(nested_in_second, 0);
1360        assert_eq!(tree.text().to_string(), input);
1361    }
1362
1363    #[test]
1364    fn block_map_with_two_top_level_entries_emits_two_entry_wrappers() {
1365        let input = "a: 1\nb: 2\n";
1366        let tree = parse_stream(input);
1367        let doc = first_document(&tree);
1368        let map = block_map_under(&doc).expect("YAML_BLOCK_MAP child");
1369        assert_eq!(block_map_entries(&map).len(), 2);
1370        assert_eq!(tree.text().to_string(), input);
1371    }
1372
1373    #[test]
1374    fn explicit_key_indicator_question_mark_lives_inside_key() {
1375        // `? a\n: b\n` — the `?` is a source-backed Key token. It
1376        // opens the ENTRY and lives inside the resulting KEY node
1377        // (alongside the scalar `a` and the trailing `:`).
1378        let input = "? a\n: b\n";
1379        let tree = parse_stream(input);
1380        let doc = first_document(&tree);
1381        let map = block_map_under(&doc).expect("YAML_BLOCK_MAP child");
1382        let entries = block_map_entries(&map);
1383        assert_eq!(entries.len(), 1);
1384        let key = entry_key(&entries[0]);
1385        let has_question = key.children_with_tokens().any(|el| {
1386            el.as_token()
1387                .is_some_and(|t| t.kind() == SyntaxKind::YAML_KEY)
1388        });
1389        assert!(has_question, "`?` should live inside YAML_BLOCK_MAP_KEY");
1390        assert_eq!(tree.text().to_string(), input);
1391    }
1392
1393    #[test]
1394    fn explicit_key_indentless_sequence_wraps_inside_key() {
1395        // `?\n- a\n- b\n:\n- c\n- d\n` (6PBE) — the explicit `?` key's
1396        // content is a zero-indented block sequence. As with an indentless
1397        // sequence in a VALUE, the scanner pushes no indent level and emits
1398        // no BlockSequenceStart, so the builder must synthesize a
1399        // YAML_BLOCK_SEQUENCE inside the KEY (mirroring the VALUE side)
1400        // rather than leaving the `- a` / `- b` entries flat.
1401        let input = "?\n- a\n- b\n:\n- c\n- d\n";
1402        let tree = parse_stream(input);
1403        let doc = first_document(&tree);
1404        let map = block_map_under(&doc).expect("YAML_BLOCK_MAP child");
1405        let entries = block_map_entries(&map);
1406        assert_eq!(entries.len(), 1);
1407        let key = entry_key(&entries[0]);
1408        assert!(
1409            key.children()
1410                .any(|n| n.kind() == SyntaxKind::YAML_BLOCK_SEQUENCE),
1411            "explicit-key block sequence should be wrapped in YAML_BLOCK_SEQUENCE inside KEY",
1412        );
1413        let value = entry_value(&entries[0]);
1414        assert!(
1415            value
1416                .children()
1417                .any(|n| n.kind() == SyntaxKind::YAML_BLOCK_SEQUENCE),
1418            "value-side block sequence should remain wrapped",
1419        );
1420        assert_eq!(tree.text().to_string(), input);
1421    }
1422
1423    #[test]
1424    fn empty_key_shorthand_opens_entry_with_empty_key() {
1425        // `: value\n` — bare `:` at column 0 is the empty-implicit-key
1426        // shorthand. The builder must open ENTRY+KEY before the colon
1427        // arrives so the colon ends up as the only KEY child.
1428        let input = ": value\n";
1429        let tree = parse_stream(input);
1430        let doc = first_document(&tree);
1431        let map = block_map_under(&doc).expect("YAML_BLOCK_MAP child");
1432        let entries = block_map_entries(&map);
1433        assert_eq!(entries.len(), 1);
1434        let key = entry_key(&entries[0]);
1435        // KEY has no scalar; only the colon.
1436        assert!(
1437            !key.children().any(|n| n.kind() == SyntaxKind::YAML_SCALAR),
1438            "empty-key shorthand has no scalar in KEY",
1439        );
1440        assert!(
1441            key.children_with_tokens().any(|el| el
1442                .as_token()
1443                .is_some_and(|t| t.kind() == SyntaxKind::YAML_COLON)),
1444            "empty-key KEY still owns the `:` token",
1445        );
1446        let value = entry_value(&entries[0]);
1447        assert!(
1448            value
1449                .children()
1450                .any(|n| n.kind() == SyntaxKind::YAML_SCALAR),
1451            "VALUE owns the `value` scalar",
1452        );
1453        assert_eq!(tree.text().to_string(), input);
1454    }
1455
1456    #[test]
1457    fn document_end_marker_lives_at_document_level_not_inside_block_map() {
1458        // `...` must not be buried inside the block map; it is a
1459        // document-level marker. The builder closes any open block
1460        // containers before consuming `DocumentEnd`.
1461        let input = "key: value\n...\n";
1462        let tree = parse_stream(input);
1463        let doc = first_document(&tree);
1464        let has_doc_end = doc.children_with_tokens().any(|el| {
1465            el.as_token()
1466                .is_some_and(|t| t.kind() == SyntaxKind::YAML_DOCUMENT_END)
1467        });
1468        assert!(
1469            has_doc_end,
1470            "DOCUMENT_END should be a direct child of YAML_DOCUMENT"
1471        );
1472        assert_eq!(tree.text().to_string(), input);
1473    }
1474
1475    fn flow_map_under(parent: &SyntaxNode) -> Option<SyntaxNode> {
1476        parent
1477            .children()
1478            .find(|n| n.kind() == SyntaxKind::YAML_FLOW_MAP)
1479    }
1480
1481    fn flow_seq_under(parent: &SyntaxNode) -> Option<SyntaxNode> {
1482        parent
1483            .children()
1484            .find(|n| n.kind() == SyntaxKind::YAML_FLOW_SEQUENCE)
1485    }
1486
1487    fn flow_map_entries(map: &SyntaxNode) -> Vec<SyntaxNode> {
1488        map.children()
1489            .filter(|n| n.kind() == SyntaxKind::YAML_FLOW_MAP_ENTRY)
1490            .collect()
1491    }
1492
1493    fn flow_seq_items(seq: &SyntaxNode) -> Vec<SyntaxNode> {
1494        seq.children()
1495            .filter(|n| n.kind() == SyntaxKind::YAML_FLOW_SEQUENCE_ITEM)
1496            .collect()
1497    }
1498
1499    #[test]
1500    fn flow_sequence_wraps_each_item_in_flow_sequence_item() {
1501        let input = "[a, b, c]\n";
1502        let tree = parse_stream(input);
1503        let doc = first_document(&tree);
1504        let seq = flow_seq_under(&doc).expect("YAML_FLOW_SEQUENCE child");
1505        let items = flow_seq_items(&seq);
1506        assert_eq!(items.len(), 3);
1507        // The opening `[` and closing `]` live at SEQUENCE level
1508        // (siblings of items).
1509        let bracket_count = seq
1510            .children_with_tokens()
1511            .filter(|el| {
1512                el.as_token().map(|t| t.text()) == Some("[")
1513                    || el.as_token().map(|t| t.text()) == Some("]")
1514            })
1515            .count();
1516        assert_eq!(bracket_count, 2, "`[` and `]` at SEQUENCE level");
1517        assert_eq!(tree.text().to_string(), input);
1518    }
1519
1520    #[test]
1521    fn flow_mapping_wraps_each_entry_with_key_and_value() {
1522        let input = "{a: 1, b: 2}\n";
1523        let tree = parse_stream(input);
1524        let doc = first_document(&tree);
1525        let map = flow_map_under(&doc).expect("YAML_FLOW_MAP child");
1526        let entries = flow_map_entries(&map);
1527        assert_eq!(entries.len(), 2);
1528        for entry in &entries {
1529            let key = entry
1530                .children()
1531                .find(|n| n.kind() == SyntaxKind::YAML_FLOW_MAP_KEY)
1532                .expect("entry has YAML_FLOW_MAP_KEY");
1533            assert!(
1534                key.children_with_tokens().any(|el| el
1535                    .as_token()
1536                    .is_some_and(|t| t.kind() == SyntaxKind::YAML_COLON)),
1537                "flow KEY owns trailing `:`",
1538            );
1539            let value = entry
1540                .children()
1541                .find(|n| n.kind() == SyntaxKind::YAML_FLOW_MAP_VALUE)
1542                .expect("entry has YAML_FLOW_MAP_VALUE");
1543            assert!(
1544                value
1545                    .children()
1546                    .any(|n| n.kind() == SyntaxKind::YAML_SCALAR),
1547                "flow VALUE owns its scalar",
1548            );
1549        }
1550        assert_eq!(tree.text().to_string(), input);
1551    }
1552
1553    #[test]
1554    fn flow_sequence_inside_flow_sequence_nests_under_outer_item() {
1555        let input = "[[1, 2], [3, 4]]\n";
1556        let tree = parse_stream(input);
1557        let doc = first_document(&tree);
1558        let outer = flow_seq_under(&doc).expect("outer YAML_FLOW_SEQUENCE");
1559        let outer_items = flow_seq_items(&outer);
1560        assert_eq!(outer_items.len(), 2);
1561        for item in &outer_items {
1562            assert!(
1563                item.children()
1564                    .any(|n| n.kind() == SyntaxKind::YAML_FLOW_SEQUENCE),
1565                "outer item should contain a nested YAML_FLOW_SEQUENCE",
1566            );
1567        }
1568        assert_eq!(tree.text().to_string(), input);
1569    }
1570
1571    #[test]
1572    fn flow_mapping_inside_flow_sequence_nests_under_item() {
1573        let input = "[{a: 1}, {b: 2}]\n";
1574        let tree = parse_stream(input);
1575        let doc = first_document(&tree);
1576        let seq = flow_seq_under(&doc).expect("YAML_FLOW_SEQUENCE child");
1577        let items = flow_seq_items(&seq);
1578        assert_eq!(items.len(), 2);
1579        for item in &items {
1580            assert!(
1581                item.children()
1582                    .any(|n| n.kind() == SyntaxKind::YAML_FLOW_MAP),
1583                "each item should contain a nested YAML_FLOW_MAP",
1584            );
1585        }
1586        assert_eq!(tree.text().to_string(), input);
1587    }
1588
1589    #[test]
1590    fn flow_mapping_at_block_map_value_nests_under_block_map_value() {
1591        let input = "key: {a: 1, b: 2}\n";
1592        let tree = parse_stream(input);
1593        let doc = first_document(&tree);
1594        let block_map = block_map_under(&doc).expect("YAML_BLOCK_MAP child");
1595        let entries = block_map_entries(&block_map);
1596        assert_eq!(entries.len(), 1);
1597        let value = entry_value(&entries[0]);
1598        assert!(
1599            value
1600                .children()
1601                .any(|n| n.kind() == SyntaxKind::YAML_FLOW_MAP),
1602            "flow map should be nested under outer block map's VALUE",
1603        );
1604        assert_eq!(tree.text().to_string(), input);
1605    }
1606
1607    #[test]
1608    fn directive_prelude_stays_inside_document_opened_by_marker() {
1609        // YAML 1.2 §6.8.1: directives belong to the document the
1610        // following `---` opens. The builder must not split the
1611        // directive line into a separate doc — the entire input is one
1612        // YAML_DOCUMENT.
1613        let input = "%TAG !e! tag:example.com,2000:app/\n---\n!e!foo \"bar\"\n";
1614        let tree = parse_stream(input);
1615        assert_eq!(document_count(&tree), 1);
1616        let doc = first_document(&tree);
1617        let has_doc_start = doc.children_with_tokens().any(|el| {
1618            el.as_token()
1619                .is_some_and(|t| t.kind() == SyntaxKind::YAML_DOCUMENT_START)
1620        });
1621        assert!(has_doc_start, "the `---` should live inside the same doc");
1622        assert_eq!(tree.text().to_string(), input);
1623    }
1624
1625    #[test]
1626    fn explicit_key_without_value_emits_empty_value_for_shape_parity() {
1627        // `? a\n? b\n` — neither entry has a `:`. Each ENTRY must still
1628        // hold both KEY and VALUE children (VALUE empty) so projection
1629        // walkers don't have to special-case missing children.
1630        let input = "? a\n? b\n";
1631        let tree = parse_stream(input);
1632        let doc = first_document(&tree);
1633        let map = block_map_under(&doc).expect("YAML_BLOCK_MAP");
1634        let entries = block_map_entries(&map);
1635        assert_eq!(entries.len(), 2);
1636        for entry in &entries {
1637            assert!(
1638                entry
1639                    .children()
1640                    .any(|n| n.kind() == SyntaxKind::YAML_BLOCK_MAP_KEY),
1641                "ENTRY missing KEY child",
1642            );
1643            assert!(
1644                entry
1645                    .children()
1646                    .any(|n| n.kind() == SyntaxKind::YAML_BLOCK_MAP_VALUE),
1647                "ENTRY missing VALUE child",
1648            );
1649        }
1650    }
1651}