Skip to main content

panache_parser/parser/yaml/
parser.rs

1use crate::syntax::{SyntaxKind, SyntaxNode};
2use rowan::GreenNodeBuilder;
3
4use super::lexer::{lex_mapping_tokens_with_diagnostic, split_once_unquoted_key_colon};
5use super::model::{
6    ShadowYamlOptions, ShadowYamlOutcome, ShadowYamlReport, YamlDiagnostic, YamlInputKind,
7    YamlParseReport, YamlToken, YamlTokenSpan, diagnostic_codes,
8};
9
10/// Parse YAML in shadow mode using prototype groundwork only.
11///
12/// This API is intentionally read-only and does not replace production YAML
13/// parsing. By default it is disabled and reports `SkippedDisabled`.
14pub fn parse_shadow(input: &str, options: ShadowYamlOptions) -> ShadowYamlReport {
15    let line_count = input.lines().count().max(1);
16
17    if !options.enabled {
18        return ShadowYamlReport {
19            outcome: ShadowYamlOutcome::SkippedDisabled,
20            shadow_reason: "shadow-disabled",
21            input_kind: options.input_kind,
22            input_len_bytes: input.len(),
23            line_count,
24            normalized_input: None,
25        };
26    }
27
28    let normalized = match options.input_kind {
29        YamlInputKind::Plain => input.to_owned(),
30        YamlInputKind::Hashpipe => normalize_hashpipe_input(input),
31    };
32
33    let parsed = parse_yaml_tree(&normalized).is_some();
34
35    ShadowYamlReport {
36        outcome: if parsed {
37            ShadowYamlOutcome::PrototypeParsed
38        } else {
39            ShadowYamlOutcome::PrototypeRejected
40        },
41        shadow_reason: if parsed {
42            "prototype-basic-mapping-parsed"
43        } else {
44            "prototype-basic-mapping-rejected"
45        },
46        input_kind: options.input_kind,
47        input_len_bytes: input.len(),
48        line_count,
49        normalized_input: Some(normalized),
50    }
51}
52
53fn normalize_hashpipe_input(input: &str) -> String {
54    input
55        .lines()
56        .map(strip_hashpipe_prefix)
57        .collect::<Vec<_>>()
58        .join("\n")
59}
60
/// Remove a leading `#|` marker from `line`, plus at most one space directly
/// after it. Lines that do not start with `#|` are returned unchanged.
fn strip_hashpipe_prefix(line: &str) -> &str {
    match line.strip_prefix("#|") {
        // Only a single separating space is dropped; further indentation is
        // part of the payload.
        Some(after_marker) => after_marker.strip_prefix(' ').unwrap_or(after_marker),
        None => line,
    }
}
67
68fn emit_token_as_yaml(builder: &mut GreenNodeBuilder<'_>, token: &YamlTokenSpan<'_>) {
69    let kind = match token.kind {
70        YamlToken::Whitespace => SyntaxKind::WHITESPACE,
71        YamlToken::Comment => SyntaxKind::YAML_COMMENT,
72        YamlToken::Tag => SyntaxKind::YAML_TAG,
73        YamlToken::Colon => SyntaxKind::YAML_COLON,
74        _ => SyntaxKind::YAML_SCALAR,
75    };
76    builder.token(kind.into(), token.text);
77}
78
79fn diag_at_token(
80    token: &YamlTokenSpan<'_>,
81    code: &'static str,
82    message: &'static str,
83) -> YamlDiagnostic {
84    YamlDiagnostic {
85        code,
86        message,
87        byte_start: token.byte_start,
88        byte_end: token.byte_end,
89    }
90}
91
92fn emit_flow_sequence<'a>(
93    builder: &mut GreenNodeBuilder<'_>,
94    tokens: &[YamlTokenSpan<'a>],
95    i: &mut usize,
96) -> Result<(), YamlDiagnostic> {
97    if *i >= tokens.len() || tokens[*i].kind != YamlToken::FlowSeqStart {
98        return Err(YamlDiagnostic {
99            code: diagnostic_codes::PARSE_EXPECTED_FLOW_SEQUENCE_START,
100            message: "expected flow sequence start token",
101            byte_start: tokens.get(*i).map(|t| t.byte_start).unwrap_or(0),
102            byte_end: tokens.get(*i).map(|t| t.byte_end).unwrap_or(0),
103        });
104    }
105
106    builder.start_node(SyntaxKind::YAML_FLOW_SEQUENCE.into());
107    emit_token_as_yaml(builder, &tokens[*i]); // [
108    *i += 1;
109
110    let mut open_item = false;
111    while *i < tokens.len() {
112        match tokens[*i].kind {
113            YamlToken::FlowSeqEnd => {
114                if open_item {
115                    builder.finish_node(); // YAML_FLOW_SEQUENCE_ITEM
116                }
117                emit_token_as_yaml(builder, &tokens[*i]); // ]
118                *i += 1;
119                if *i < tokens.len() {
120                    match tokens[*i].kind {
121                        YamlToken::Newline | YamlToken::Comment => {}
122                        YamlToken::Whitespace if tokens[*i].text.trim().is_empty() => {}
123                        _ => {
124                            return Err(diag_at_token(
125                                &tokens[*i],
126                                diagnostic_codes::PARSE_TRAILING_CONTENT_AFTER_FLOW_END,
127                                "trailing content after flow sequence end",
128                            ));
129                        }
130                    }
131                }
132                builder.finish_node(); // YAML_FLOW_SEQUENCE
133                return Ok(());
134            }
135            YamlToken::Comma => {
136                if !open_item {
137                    return Err(diag_at_token(
138                        &tokens[*i],
139                        diagnostic_codes::PARSE_INVALID_FLOW_SEQUENCE_COMMA,
140                        "invalid comma position in flow sequence",
141                    ));
142                }
143                builder.finish_node(); // YAML_FLOW_SEQUENCE_ITEM
144                open_item = false;
145                emit_token_as_yaml(builder, &tokens[*i]);
146                *i += 1;
147            }
148            YamlToken::Whitespace | YamlToken::Newline | YamlToken::Indent | YamlToken::Dedent
149                if !open_item =>
150            {
151                emit_token_as_yaml(builder, &tokens[*i]);
152                *i += 1;
153            }
154            YamlToken::Scalar if !open_item && tokens[*i].text.trim().is_empty() => {
155                emit_token_as_yaml(builder, &tokens[*i]);
156                *i += 1;
157            }
158            YamlToken::FlowSeqStart => {
159                if !open_item {
160                    builder.start_node(SyntaxKind::YAML_FLOW_SEQUENCE_ITEM.into());
161                    open_item = true;
162                }
163                emit_flow_sequence(builder, tokens, i)?;
164            }
165            YamlToken::FlowMapStart => {
166                if !open_item {
167                    builder.start_node(SyntaxKind::YAML_FLOW_SEQUENCE_ITEM.into());
168                    open_item = true;
169                }
170                emit_flow_map(builder, tokens, i)?;
171            }
172            _ => {
173                if !open_item {
174                    builder.start_node(SyntaxKind::YAML_FLOW_SEQUENCE_ITEM.into());
175                    open_item = true;
176                }
177                emit_token_as_yaml(builder, &tokens[*i]);
178                *i += 1;
179            }
180        }
181    }
182
183    let (byte_start, byte_end) =
184        if let Some(start) = tokens.iter().find(|t| t.kind == YamlToken::FlowSeqStart) {
185            (
186                start.byte_start,
187                tokens.last().map(|t| t.byte_end).unwrap_or(start.byte_end),
188            )
189        } else {
190            tokens
191                .last()
192                .map(|t| (t.byte_start, t.byte_end))
193                .unwrap_or((0, 0))
194        };
195    Err(YamlDiagnostic {
196        code: diagnostic_codes::PARSE_UNTERMINATED_FLOW_SEQUENCE,
197        message: "unterminated flow sequence",
198        byte_start,
199        byte_end,
200    })
201}
202
203fn emit_flow_map<'a>(
204    builder: &mut GreenNodeBuilder<'_>,
205    tokens: &[YamlTokenSpan<'a>],
206    i: &mut usize,
207) -> Result<(), YamlDiagnostic> {
208    if *i >= tokens.len() || tokens[*i].kind != YamlToken::FlowMapStart {
209        return Err(YamlDiagnostic {
210            code: diagnostic_codes::PARSE_EXPECTED_FLOW_MAP_START,
211            message: "expected flow map start token",
212            byte_start: tokens.get(*i).map(|t| t.byte_start).unwrap_or(0),
213            byte_end: tokens.get(*i).map(|t| t.byte_end).unwrap_or(0),
214        });
215    }
216
217    builder.start_node(SyntaxKind::YAML_FLOW_MAP.into());
218    emit_token_as_yaml(builder, &tokens[*i]); // {
219    *i += 1;
220
221    loop {
222        // Skip inter-entry whitespace and newlines. The flow lexer chunks
223        // text between flow indicators into Scalar tokens, including
224        // whitespace-only chunks like the space in `, }` — treat those as
225        // trivia here so they do not synthesize phantom entries. Indent and
226        // Dedent are emitted on multi-line flow continuations and carry no
227        // semantic weight inside a flow collection.
228        while *i < tokens.len()
229            && (matches!(
230                tokens[*i].kind,
231                YamlToken::Whitespace | YamlToken::Newline | YamlToken::Indent | YamlToken::Dedent
232            ) || (tokens[*i].kind == YamlToken::Scalar && tokens[*i].text.trim().is_empty()))
233        {
234            emit_token_as_yaml(builder, &tokens[*i]);
235            *i += 1;
236        }
237
238        if *i >= tokens.len() {
239            let (byte_start, byte_end) = tokens
240                .last()
241                .map(|t| (t.byte_start, t.byte_end))
242                .unwrap_or((0, 0));
243            return Err(YamlDiagnostic {
244                code: diagnostic_codes::PARSE_UNTERMINATED_FLOW_MAP,
245                message: "unterminated flow map",
246                byte_start,
247                byte_end,
248            });
249        }
250
251        match tokens[*i].kind {
252            YamlToken::FlowMapEnd => {
253                emit_token_as_yaml(builder, &tokens[*i]);
254                *i += 1;
255                if *i < tokens.len() {
256                    match tokens[*i].kind {
257                        YamlToken::Newline
258                        | YamlToken::Comment
259                        | YamlToken::Whitespace
260                        | YamlToken::FlowMapEnd
261                        | YamlToken::FlowSeqEnd
262                        | YamlToken::Comma => {}
263                        _ => {
264                            return Err(diag_at_token(
265                                &tokens[*i],
266                                diagnostic_codes::PARSE_TRAILING_CONTENT_AFTER_FLOW_END,
267                                "trailing content after flow map end",
268                            ));
269                        }
270                    }
271                }
272                builder.finish_node(); // YAML_FLOW_MAP
273                return Ok(());
274            }
275            YamlToken::Comma => {
276                emit_token_as_yaml(builder, &tokens[*i]);
277                *i += 1;
278            }
279            _ => {
280                emit_flow_map_entry(builder, tokens, i)?;
281            }
282        }
283    }
284}
285
286fn emit_flow_map_entry<'a>(
287    builder: &mut GreenNodeBuilder<'_>,
288    tokens: &[YamlTokenSpan<'a>],
289    i: &mut usize,
290) -> Result<(), YamlDiagnostic> {
291    builder.start_node(SyntaxKind::YAML_FLOW_MAP_ENTRY.into());
292    builder.start_node(SyntaxKind::YAML_FLOW_MAP_KEY.into());
293
294    // Emit leading whitespace and zero-width indent markers inside key node.
295    // Indent/Dedent appear on multi-line flow continuations but are not
296    // semantic inside a flow collection.
297    while *i < tokens.len()
298        && matches!(
299            tokens[*i].kind,
300            YamlToken::Whitespace | YamlToken::Indent | YamlToken::Dedent
301        )
302    {
303        emit_token_as_yaml(builder, &tokens[*i]);
304        *i += 1;
305    }
306
307    // Locate the colon that terminates an implicit key, if any. The implicit
308    // key may span several Scalar tokens separated by Newline/Whitespace/
309    // Indent/Dedent when a flow-map entry's key wraps across lines (e.g.
310    // `{ multi\n  line: value}`). Nested flow openers, an explicit
311    // `Key`/`Tag`/`Anchor`/`Alias` indicator, or a structural delimiter end
312    // the search.
313    let colon_at: Option<usize> = {
314        let mut j = *i;
315        let mut found = None;
316        while j < tokens.len() {
317            match tokens[j].kind {
318                YamlToken::Comma
319                | YamlToken::FlowMapEnd
320                | YamlToken::FlowSeqEnd
321                | YamlToken::FlowMapStart
322                | YamlToken::FlowSeqStart
323                | YamlToken::Tag
324                | YamlToken::Key
325                | YamlToken::Anchor
326                | YamlToken::Alias => break,
327                YamlToken::Scalar => {
328                    if split_once_unquoted_key_colon(tokens[j].text).is_some() {
329                        found = Some(j);
330                        break;
331                    }
332                }
333                _ => {}
334            }
335            j += 1;
336        }
337        found
338    };
339
340    let value_prefix: Option<&'a str> = if let Some(target) = colon_at {
341        // Emit any inter-scalar trivia (Newline / Whitespace / Indent / Dedent
342        // / preceding key-half Scalar chunks) into the key node before the
343        // colon split.
344        while *i < target {
345            emit_token_as_yaml(builder, &tokens[*i]);
346            *i += 1;
347        }
348        let scalar = tokens[target];
349        *i += 1;
350        let (key_text, rest_text) = split_once_unquoted_key_colon(scalar.text)
351            .expect("implicit-key scan promised a colon in this scalar");
352        if !key_text.is_empty() {
353            builder.token(SyntaxKind::YAML_KEY.into(), key_text);
354        }
355        builder.token(
356            SyntaxKind::YAML_COLON.into(),
357            &scalar.text[key_text.len()..key_text.len() + 1],
358        );
359        Some(rest_text)
360    } else {
361        match tokens.get(*i).map(|t| t.kind) {
362            Some(YamlToken::Scalar) => {
363                let scalar = tokens[*i];
364                *i += 1;
365                builder.token(SyntaxKind::YAML_SCALAR.into(), scalar.text);
366                None
367            }
368            Some(YamlToken::Key) => {
369                builder.token(SyntaxKind::YAML_KEY.into(), tokens[*i].text);
370                *i += 1;
371                while *i < tokens.len() && tokens[*i].kind == YamlToken::Whitespace {
372                    emit_token_as_yaml(builder, &tokens[*i]);
373                    *i += 1;
374                }
375                if *i < tokens.len() && tokens[*i].kind == YamlToken::Colon {
376                    builder.token(SyntaxKind::YAML_COLON.into(), tokens[*i].text);
377                    *i += 1;
378                }
379                None
380            }
381            Some(YamlToken::Tag) => {
382                emit_token_as_yaml(builder, &tokens[*i]);
383                *i += 1;
384                None
385            }
386            _ => None,
387        }
388    };
389
390    builder.finish_node(); // YAML_FLOW_MAP_KEY
391
392    builder.start_node(SyntaxKind::YAML_FLOW_MAP_VALUE.into());
393    if let Some(prefix) = value_prefix
394        && !prefix.is_empty()
395    {
396        builder.token(SyntaxKind::YAML_SCALAR.into(), prefix);
397    }
398    emit_flow_value_tokens(builder, tokens, i)?;
399    builder.finish_node(); // YAML_FLOW_MAP_VALUE
400
401    builder.finish_node(); // YAML_FLOW_MAP_ENTRY
402    Ok(())
403}
404
405fn emit_flow_value_tokens<'a>(
406    builder: &mut GreenNodeBuilder<'_>,
407    tokens: &[YamlTokenSpan<'a>],
408    i: &mut usize,
409) -> Result<(), YamlDiagnostic> {
410    while *i < tokens.len() {
411        match tokens[*i].kind {
412            YamlToken::Comma | YamlToken::FlowMapEnd | YamlToken::FlowSeqEnd => break,
413            YamlToken::FlowMapStart => emit_flow_map(builder, tokens, i)?,
414            YamlToken::FlowSeqStart => emit_flow_sequence(builder, tokens, i)?,
415            _ => {
416                emit_token_as_yaml(builder, &tokens[*i]);
417                *i += 1;
418            }
419        }
420    }
421    Ok(())
422}
423
424fn emit_scalar_document<'a>(
425    builder: &mut GreenNodeBuilder<'_>,
426    tokens: &[YamlTokenSpan<'a>],
427    i: &mut usize,
428) -> Result<(), YamlDiagnostic> {
429    while *i < tokens.len() {
430        let kind = match tokens[*i].kind {
431            YamlToken::Newline => SyntaxKind::NEWLINE,
432            // Document boundaries close the scalar body; the stream loop will
433            // emit them at the YAML_DOCUMENT level.
434            YamlToken::DocumentStart | YamlToken::DocumentEnd => break,
435            YamlToken::Tag => SyntaxKind::YAML_TAG,
436            YamlToken::Comment => SyntaxKind::YAML_COMMENT,
437            YamlToken::Whitespace => SyntaxKind::WHITESPACE,
438            YamlToken::Colon => SyntaxKind::YAML_COLON,
439            YamlToken::FlowMapStart
440            | YamlToken::FlowMapEnd
441            | YamlToken::FlowSeqStart
442            | YamlToken::FlowSeqEnd
443            | YamlToken::Comma => {
444                return Err(diag_at_token(
445                    &tokens[*i],
446                    diagnostic_codes::PARSE_UNEXPECTED_FLOW_CLOSER,
447                    "unexpected flow indicator in plain scalar document",
448                ));
449            }
450            _ => SyntaxKind::YAML_SCALAR,
451        };
452        builder.token(kind.into(), tokens[*i].text);
453        *i += 1;
454    }
455    Ok(())
456}
457
458fn emit_block_seq<'a>(
459    builder: &mut GreenNodeBuilder<'_>,
460    tokens: &[YamlTokenSpan<'a>],
461    i: &mut usize,
462    stop_on_dedent: bool,
463) -> Result<(), YamlDiagnostic> {
464    // Consume leading document-level node properties (a tag, or a scalar
465    // that is just `&anchor`) that the classifier has already routed into
466    // a BlockSequence body — patterns like `&seq\n- a` and
467    // `--- !!omap\n- a`. They become siblings of the
468    // YAML_BLOCK_SEQUENCE_ITEM nodes; projection picks them up to attach
469    // anchor/tag info onto the `+SEQ` event.
470    let mut header_done = false;
471    while !header_done && *i < tokens.len() {
472        match tokens[*i].kind {
473            YamlToken::Tag => {
474                builder.token(SyntaxKind::YAML_TAG.into(), tokens[*i].text);
475                *i += 1;
476            }
477            YamlToken::Scalar if tokens[*i].text.trim_start().starts_with('&') => {
478                builder.token(SyntaxKind::YAML_SCALAR.into(), tokens[*i].text);
479                *i += 1;
480            }
481            YamlToken::Whitespace => {
482                builder.token(SyntaxKind::WHITESPACE.into(), tokens[*i].text);
483                *i += 1;
484            }
485            YamlToken::Newline => {
486                builder.token(SyntaxKind::NEWLINE.into(), tokens[*i].text);
487                *i += 1;
488            }
489            YamlToken::Comment => {
490                builder.token(SyntaxKind::YAML_COMMENT.into(), tokens[*i].text);
491                *i += 1;
492            }
493            _ => header_done = true,
494        }
495    }
496    while *i < tokens.len() {
497        match tokens[*i].kind {
498            YamlToken::Newline => {
499                builder.token(SyntaxKind::NEWLINE.into(), tokens[*i].text);
500                *i += 1;
501            }
502            // Document boundaries close the body; the stream loop will pick
503            // them up at the YAML_DOCUMENT level.
504            YamlToken::DocumentStart | YamlToken::DocumentEnd => break,
505            YamlToken::Whitespace => {
506                // Between-item indentation in a nested sequence.
507                builder.token(SyntaxKind::WHITESPACE.into(), tokens[*i].text);
508                *i += 1;
509            }
510            // A comment between/before block-seq items is sequence-level
511            // trivia — absorb it so the sequence isn't split into two
512            // documents at the stream level. Only absorb when another
513            // BlockSeqEntry follows; if a document boundary follows the
514            // comment, the sequence has ended and the comment belongs at
515            // the document/stream level (the snapshot for case JHB9 pins
516            // this).
517            YamlToken::Comment => {
518                let mut peek = *i + 1;
519                while peek < tokens.len()
520                    && matches!(
521                        tokens[peek].kind,
522                        YamlToken::Newline | YamlToken::Whitespace | YamlToken::Comment
523                    )
524                {
525                    peek += 1;
526                }
527                if peek < tokens.len() && tokens[peek].kind == YamlToken::BlockSeqEntry {
528                    builder.token(SyntaxKind::YAML_COMMENT.into(), tokens[*i].text);
529                    *i += 1;
530                } else {
531                    break;
532                }
533            }
534            YamlToken::Dedent => {
535                if stop_on_dedent {
536                    *i += 1;
537                    break;
538                }
539                break;
540            }
541            YamlToken::BlockSeqEntry => emit_block_seq_item(builder, tokens, i)?,
542            _ => break,
543        }
544    }
545    Ok(())
546}
547
548fn emit_block_seq_item<'a>(
549    builder: &mut GreenNodeBuilder<'_>,
550    tokens: &[YamlTokenSpan<'a>],
551    i: &mut usize,
552) -> Result<(), YamlDiagnostic> {
553    builder.start_node(SyntaxKind::YAML_BLOCK_SEQUENCE_ITEM.into());
554    builder.token(SyntaxKind::YAML_BLOCK_SEQ_ENTRY.into(), tokens[*i].text);
555    *i += 1;
556    let mut closed_via_nested_seq = false;
557    while *i < tokens.len() && tokens[*i].kind != YamlToken::Newline {
558        match tokens[*i].kind {
559            YamlToken::FlowSeqStart => emit_flow_sequence(builder, tokens, i)?,
560            YamlToken::FlowMapStart => emit_flow_map(builder, tokens, i)?,
561            YamlToken::Indent => {
562                // Nested block sequence triggered by `- - ...`: the lexer
563                // emitted an Indent between the outer `- ` and the inner
564                // `-`. Recurse; the nested emitter consumes through the
565                // matching Dedent (including any intervening Newlines), so
566                // the outer item has no trailing Newline to emit.
567                *i += 1;
568                builder.start_node(SyntaxKind::YAML_BLOCK_SEQUENCE.into());
569                emit_block_seq(builder, tokens, i, true)?;
570                builder.finish_node(); // YAML_BLOCK_SEQUENCE
571                closed_via_nested_seq = true;
572                break;
573            }
574            _ => {
575                emit_token_as_yaml(builder, &tokens[*i]);
576                *i += 1;
577            }
578        }
579    }
580    if !closed_via_nested_seq && *i < tokens.len() && tokens[*i].kind == YamlToken::Newline {
581        builder.token(SyntaxKind::NEWLINE.into(), tokens[*i].text);
582        *i += 1;
583    }
584    // Nested block map following a bare `-\n` entry: lexer has emitted an
585    // Indent after the Newline, terminated by a Dedent.
586    if !closed_via_nested_seq && *i < tokens.len() && tokens[*i].kind == YamlToken::Indent {
587        *i += 1;
588        builder.start_node(SyntaxKind::YAML_BLOCK_MAP.into());
589        emit_block_map(builder, tokens, i, true)?;
590        builder.finish_node(); // YAML_BLOCK_MAP
591    }
592    builder.finish_node(); // YAML_BLOCK_SEQUENCE_ITEM
593    Ok(())
594}
595
596fn emit_block_map<'a>(
597    builder: &mut GreenNodeBuilder<'_>,
598    tokens: &[YamlTokenSpan<'a>],
599    i: &mut usize,
600    stop_on_dedent: bool,
601) -> Result<(), YamlDiagnostic> {
602    let mut closed_by_dedent = false;
603    while *i < tokens.len() {
604        match tokens[*i].kind {
605            YamlToken::Newline => {
606                builder.token(SyntaxKind::NEWLINE.into(), tokens[*i].text);
607                *i += 1;
608            }
609            // Document boundaries close the body; the stream loop picks them
610            // up at the YAML_DOCUMENT level.
611            YamlToken::DocumentStart | YamlToken::DocumentEnd => break,
612            YamlToken::Directive | YamlToken::Comma => {
613                builder.token(SyntaxKind::YAML_SCALAR.into(), tokens[*i].text);
614                *i += 1;
615            }
616            YamlToken::FlowMapEnd | YamlToken::FlowSeqEnd => {
617                return Err(diag_at_token(
618                    &tokens[*i],
619                    diagnostic_codes::PARSE_UNEXPECTED_FLOW_CLOSER,
620                    "unexpected flow closing token",
621                ));
622            }
623            YamlToken::FlowMapStart | YamlToken::FlowSeqStart => {
624                if tokens[*i].kind == YamlToken::FlowMapStart {
625                    emit_flow_map(builder, tokens, i)?;
626                } else {
627                    emit_flow_sequence(builder, tokens, i)?;
628                }
629            }
630            YamlToken::Anchor
631            | YamlToken::Alias
632            | YamlToken::BlockScalarHeader
633            | YamlToken::BlockScalarContent => {
634                builder.token(SyntaxKind::YAML_SCALAR.into(), tokens[*i].text);
635                *i += 1;
636            }
637            YamlToken::Scalar | YamlToken::Comment => {
638                while *i < tokens.len() && tokens[*i].kind != YamlToken::Newline {
639                    if matches!(
640                        tokens[*i].kind,
641                        YamlToken::FlowMapEnd | YamlToken::FlowSeqEnd
642                    ) {
643                        return Err(diag_at_token(
644                            &tokens[*i],
645                            diagnostic_codes::PARSE_UNEXPECTED_FLOW_CLOSER,
646                            "unexpected flow closing token",
647                        ));
648                    }
649                    emit_token_as_yaml(builder, &tokens[*i]);
650                    *i += 1;
651                }
652            }
653            YamlToken::Indent => {
654                return Err(diag_at_token(
655                    &tokens[*i],
656                    diagnostic_codes::PARSE_UNEXPECTED_INDENT,
657                    "unexpected indent token while parsing block map",
658                ));
659            }
660            YamlToken::Dedent => {
661                if stop_on_dedent {
662                    *i += 1;
663                    closed_by_dedent = true;
664                    break;
665                }
666                return Err(diag_at_token(
667                    &tokens[*i],
668                    diagnostic_codes::PARSE_UNEXPECTED_DEDENT,
669                    "unexpected dedent token while parsing block map",
670                ));
671            }
672            _ => emit_block_map_entry(builder, tokens, i)?,
673        }
674    }
675
676    if stop_on_dedent && !closed_by_dedent {
677        let (byte_start, byte_end) = tokens
678            .last()
679            .map(|t| (t.byte_start, t.byte_end))
680            .unwrap_or((0, 0));
681        return Err(YamlDiagnostic {
682            code: diagnostic_codes::PARSE_UNTERMINATED_BLOCK_MAP,
683            message: "unterminated indented block map",
684            byte_start,
685            byte_end,
686        });
687    }
688
689    Ok(())
690}
691
692fn emit_block_map_entry<'a>(
693    builder: &mut GreenNodeBuilder<'_>,
694    tokens: &[YamlTokenSpan<'a>],
695    i: &mut usize,
696) -> Result<(), YamlDiagnostic> {
697    builder.start_node(SyntaxKind::YAML_BLOCK_MAP_ENTRY.into());
698    emit_block_map_key(builder, tokens, i)?;
699    let trailing_newline = emit_block_map_value(builder, tokens, i)?;
700    if let Some(newline) = trailing_newline {
701        builder.token(SyntaxKind::NEWLINE.into(), newline);
702    }
703    builder.finish_node(); // YAML_BLOCK_MAP_ENTRY
704    Ok(())
705}
706
707fn emit_block_map_key<'a>(
708    builder: &mut GreenNodeBuilder<'_>,
709    tokens: &[YamlTokenSpan<'a>],
710    i: &mut usize,
711) -> Result<(), YamlDiagnostic> {
712    builder.start_node(SyntaxKind::YAML_BLOCK_MAP_KEY.into());
713
714    let mut saw_colon = false;
715    while *i < tokens.len() {
716        match tokens[*i].kind {
717            YamlToken::Key => {
718                builder.token(SyntaxKind::YAML_KEY.into(), tokens[*i].text);
719                *i += 1;
720            }
721            YamlToken::Tag => {
722                builder.token(SyntaxKind::YAML_TAG.into(), tokens[*i].text);
723                *i += 1;
724            }
725            YamlToken::Whitespace => {
726                builder.token(SyntaxKind::WHITESPACE.into(), tokens[*i].text);
727                *i += 1;
728            }
729            YamlToken::Colon => {
730                builder.token(SyntaxKind::YAML_COLON.into(), tokens[*i].text);
731                *i += 1;
732                saw_colon = true;
733                break;
734            }
735            _ => {
736                return Err(diag_at_token(
737                    &tokens[*i],
738                    diagnostic_codes::PARSE_INVALID_KEY_TOKEN,
739                    "invalid token while parsing block map key",
740                ));
741            }
742        }
743    }
744    if !saw_colon {
745        return Err(diag_at_token(
746            &tokens[(*i).saturating_sub(1)],
747            diagnostic_codes::PARSE_MISSING_COLON,
748            "missing colon in block map entry",
749        ));
750    }
751    builder.finish_node(); // YAML_BLOCK_MAP_KEY
752    Ok(())
753}
754
755/// Emit `YAML_BLOCK_MAP_VALUE` and return the trailing newline (if any) that
756/// the caller should emit after the value node closes. The newline is held
757/// back so that a nested block map can be wired in after the newline rather
758/// than before, preserving byte order in the CST.
759fn emit_block_map_value<'a>(
760    builder: &mut GreenNodeBuilder<'_>,
761    tokens: &[YamlTokenSpan<'a>],
762    i: &mut usize,
763) -> Result<Option<&'a str>, YamlDiagnostic> {
764    builder.start_node(SyntaxKind::YAML_BLOCK_MAP_VALUE.into());
765    while *i < tokens.len() {
766        match tokens[*i].kind {
767            YamlToken::Scalar => {
768                builder.token(SyntaxKind::YAML_SCALAR.into(), tokens[*i].text);
769                *i += 1;
770            }
771            YamlToken::FlowMapStart => emit_flow_map(builder, tokens, i)?,
772            YamlToken::FlowSeqStart => emit_flow_sequence(builder, tokens, i)?,
773            YamlToken::Anchor | YamlToken::Alias => {
774                builder.token(SyntaxKind::YAML_SCALAR.into(), tokens[*i].text);
775                *i += 1;
776            }
777            YamlToken::BlockScalarHeader => {
778                consume_block_scalar(builder, tokens, i);
779            }
780            YamlToken::BlockScalarContent => {
781                builder.token(SyntaxKind::YAML_SCALAR.into(), tokens[*i].text);
782                *i += 1;
783            }
784            YamlToken::FlowMapEnd | YamlToken::FlowSeqEnd | YamlToken::Comma => break,
785            YamlToken::Tag => {
786                builder.token(SyntaxKind::YAML_TAG.into(), tokens[*i].text);
787                *i += 1;
788            }
789            YamlToken::Comment => {
790                builder.token(SyntaxKind::YAML_COMMENT.into(), tokens[*i].text);
791                *i += 1;
792            }
793            YamlToken::Whitespace => {
794                builder.token(SyntaxKind::WHITESPACE.into(), tokens[*i].text);
795                *i += 1;
796            }
797            _ => break,
798        }
799    }
800
801    let mut trailing_newline: Option<&str> = None;
802    if *i < tokens.len() && tokens[*i].kind == YamlToken::Newline {
803        trailing_newline = Some(tokens[*i].text);
804        *i += 1;
805    }
806
807    if *i < tokens.len() && tokens[*i].kind == YamlToken::Indent {
808        *i += 1;
809        // Emit trailing newline before nested content to preserve byte order.
810        if let Some(newline) = trailing_newline.take() {
811            builder.token(SyntaxKind::NEWLINE.into(), newline);
812        }
813        builder.start_node(SyntaxKind::YAML_BLOCK_MAP.into());
814        emit_block_map(builder, tokens, i, true)?;
815        builder.finish_node(); // YAML_BLOCK_MAP
816    }
817
818    builder.finish_node(); // YAML_BLOCK_MAP_VALUE
819    Ok(trailing_newline)
820}
821
822/// Consume a literal/folded block-scalar header (`|` / `>`) and the
823/// following content lines. Each line is emitted as a `YAML_SCALAR` token
824/// with `NEWLINE` separators. Blank-line newlines that belong to the scalar
825/// body are absorbed so the entire body lives inside the value node.
826fn consume_block_scalar<'a>(
827    builder: &mut GreenNodeBuilder<'_>,
828    tokens: &[YamlTokenSpan<'a>],
829    i: &mut usize,
830) {
831    builder.token(SyntaxKind::YAML_SCALAR.into(), tokens[*i].text);
832    *i += 1;
833    while *i < tokens.len() {
834        match tokens[*i].kind {
835            YamlToken::Newline => {
836                builder.token(SyntaxKind::NEWLINE.into(), tokens[*i].text);
837                *i += 1;
838                if *i < tokens.len()
839                    && matches!(
840                        tokens[*i].kind,
841                        YamlToken::BlockScalarContent | YamlToken::Newline
842                    )
843                {
844                    continue;
845                }
846                break;
847            }
848            YamlToken::BlockScalarContent => {
849                builder.token(SyntaxKind::YAML_SCALAR.into(), tokens[*i].text);
850                *i += 1;
851            }
852            _ => break,
853        }
854    }
855}
856
857/// Parse prototype YAML tree structure from input
858pub fn parse_yaml_tree(input: &str) -> Option<SyntaxNode> {
859    parse_yaml_report(input).tree
860}
861
862/// Parse prototype YAML tree structure and include diagnostics on failure.
863pub fn parse_yaml_report(input: &str) -> YamlParseReport {
864    let tokens = match lex_mapping_tokens_with_diagnostic(input) {
865        Ok(tokens) => tokens,
866        Err(err) => {
867            return YamlParseReport {
868                tree: None,
869                diagnostics: vec![err],
870            };
871        }
872    };
873
874    let mut seen_content = false;
875    for token in &tokens {
876        match token.kind {
877            YamlToken::Directive if seen_content => {
878                return YamlParseReport {
879                    tree: None,
880                    diagnostics: vec![diag_at_token(
881                        token,
882                        diagnostic_codes::PARSE_DIRECTIVE_AFTER_CONTENT,
883                        "directive requires document end before subsequent directives",
884                    )],
885                };
886            }
887            YamlToken::Directive
888            | YamlToken::Newline
889            | YamlToken::Whitespace
890            | YamlToken::Comment => {}
891            YamlToken::DocumentEnd => seen_content = false,
892            _ => seen_content = true,
893        }
894    }
895
896    if let Some(directive) = tokens.iter().find(|t| t.kind == YamlToken::Directive)
897        && !tokens.iter().any(|t| t.kind == YamlToken::DocumentStart)
898    {
899        return YamlParseReport {
900            tree: None,
901            diagnostics: vec![diag_at_token(
902                directive,
903                diagnostic_codes::PARSE_DIRECTIVE_WITHOUT_DOCUMENT_START,
904                "directive requires an explicit document start marker",
905            )],
906        };
907    }
908
909    let mut builder = GreenNodeBuilder::new();
910    builder.start_node(SyntaxKind::DOCUMENT.into());
911    builder.start_node(SyntaxKind::YAML_METADATA_CONTENT.into());
912    builder.start_node(SyntaxKind::YAML_STREAM.into());
913    if let Err(err) = parse_stream(&mut builder, &tokens) {
914        return YamlParseReport {
915            tree: None,
916            diagnostics: vec![err],
917        };
918    }
919    builder.finish_node(); // YAML_STREAM
920    builder.finish_node(); // YAML_METADATA_CONTENT
921    builder.finish_node(); // DOCUMENT
922    YamlParseReport {
923        tree: Some(SyntaxNode::new_root(builder.finish())),
924        diagnostics: Vec::new(),
925    }
926}
927
928/// Outer stream loop. Walks every token and emits zero or more `YAML_DOCUMENT`
929/// nodes interleaved with stream-level trivia (newlines, whitespace, comments,
930/// and bare `...` markers that don't bracket a document body).
931fn parse_stream<'a>(
932    builder: &mut GreenNodeBuilder<'_>,
933    tokens: &[YamlTokenSpan<'a>],
934) -> Result<(), YamlDiagnostic> {
935    let mut i = 0usize;
936    while i < tokens.len() {
937        match tokens[i].kind {
938            YamlToken::Newline => {
939                builder.token(SyntaxKind::NEWLINE.into(), tokens[i].text);
940                i += 1;
941            }
942            YamlToken::Whitespace => {
943                builder.token(SyntaxKind::WHITESPACE.into(), tokens[i].text);
944                i += 1;
945            }
946            YamlToken::Comment => {
947                builder.token(SyntaxKind::YAML_COMMENT.into(), tokens[i].text);
948                i += 1;
949            }
950            // Indent/Dedent are zero-width balance markers from the lexer.
951            // If they leak out of a body emitter (e.g. trailing Dedent at
952            // end of input), absorb them silently — they carry no bytes.
953            YamlToken::Indent | YamlToken::Dedent => {
954                i += 1;
955            }
956            // Bare `...` at stream level — no preceding document body, no
957            // following body before another `...`/EOF — is stream-level
958            // trivia, not its own document.
959            YamlToken::DocumentEnd if !document_follows(tokens, i + 1) => {
960                builder.token(SyntaxKind::YAML_DOCUMENT_END.into(), tokens[i].text);
961                i += 1;
962            }
963            _ => {
964                builder.start_node(SyntaxKind::YAML_DOCUMENT.into());
965                emit_document(builder, tokens, &mut i)?;
966                builder.finish_node(); // YAML_DOCUMENT
967            }
968        }
969    }
970    Ok(())
971}
972
973/// Returns `true` if the tokens at or after `start` contain any
974/// document-defining token (directive, doc-start, body content, doc-end). We
975/// use this to decide whether a bare `...` is "the end of nothing" (stream
976/// trivia) or actually closes a document yet to come (still trivia, just at
977/// stream level).
978fn document_follows(tokens: &[YamlTokenSpan<'_>], start: usize) -> bool {
979    tokens[start..].iter().any(|t| {
980        !matches!(
981            t.kind,
982            YamlToken::Newline
983                | YamlToken::Whitespace
984                | YamlToken::Comment
985                | YamlToken::DocumentEnd
986        )
987    })
988}
989
990/// Emit a single `YAML_DOCUMENT`. Optionally consumes leading directives and a
991/// `---` marker, dispatches to the body emitter, then optionally consumes a
992/// trailing `...` marker. Each phase is forgiving: an absent `---`, absent
993/// `...`, or empty body is fine.
994fn emit_document<'a>(
995    builder: &mut GreenNodeBuilder<'_>,
996    tokens: &[YamlTokenSpan<'a>],
997    i: &mut usize,
998) -> Result<(), YamlDiagnostic> {
999    // Phase 1: optional directives + `---` marker (with intervening trivia).
1000    let mut saw_marker = false;
1001    while *i < tokens.len() {
1002        match tokens[*i].kind {
1003            YamlToken::Directive => {
1004                builder.token(SyntaxKind::YAML_SCALAR.into(), tokens[*i].text);
1005                *i += 1;
1006            }
1007            YamlToken::Newline => {
1008                builder.token(SyntaxKind::NEWLINE.into(), tokens[*i].text);
1009                *i += 1;
1010            }
1011            YamlToken::Whitespace => {
1012                builder.token(SyntaxKind::WHITESPACE.into(), tokens[*i].text);
1013                *i += 1;
1014            }
1015            YamlToken::Comment => {
1016                builder.token(SyntaxKind::YAML_COMMENT.into(), tokens[*i].text);
1017                *i += 1;
1018            }
1019            YamlToken::DocumentStart => {
1020                builder.token(SyntaxKind::YAML_DOCUMENT_START.into(), tokens[*i].text);
1021                *i += 1;
1022                saw_marker = true;
1023                if *i < tokens.len() && tokens[*i].kind == YamlToken::Newline {
1024                    builder.token(SyntaxKind::NEWLINE.into(), tokens[*i].text);
1025                    *i += 1;
1026                }
1027                break;
1028            }
1029            _ => break,
1030        }
1031    }
1032    let _ = saw_marker;
1033
1034    // Phase 2: body.
1035    let next_significant = tokens[*i..].iter().find(|t| {
1036        !matches!(
1037            t.kind,
1038            YamlToken::Newline | YamlToken::Whitespace | YamlToken::Comment
1039        )
1040    });
1041
1042    let body_kind = match next_significant.map(|t| t.kind) {
1043        Some(YamlToken::DocumentStart) | Some(YamlToken::DocumentEnd) | None => DocumentBody::Empty,
1044        Some(YamlToken::BlockSeqEntry) => DocumentBody::BlockSequence,
1045        _ => {
1046            // Body classification scans up to the next document boundary. A
1047            // colon (block-map indicator) or any flow-collection indicator
1048            // routes to the block-map emitter, which already accommodates
1049            // tagless mapping/flow content. A tag with no colon routes to
1050            // the dedicated scalar path (`! a`, `!!str foo`); plain content
1051            // with no colon (e.g. `--- text`, `--- "quoted"`, `--- |` with
1052            // following content) is also a scalar document. Without the
1053            // scalar branch, a bare `--- text` would dispatch to BlockMap
1054            // and fail with INVALID_KEY_TOKEN.
1055            let mut has_colon = false;
1056            let mut has_tag = false;
1057            let mut has_scalar = false;
1058            let mut has_flow = false;
1059            let mut has_block_seq = false;
1060            // Track whether all significant tokens BEFORE the first
1061            // BlockSeqEntry look like document-level node properties (a tag,
1062            // or a scalar starting with `&` — i.e. a bare anchor on its own
1063            // line). This lets `&seq\n- a` and `--- !!omap\n- a` route to a
1064            // BlockSequence body even though their first significant token is
1065            // a Tag/Scalar rather than the BlockSeqEntry itself.
1066            let mut pre_seq_only_properties = true;
1067            let mut seen_block_seq = false;
1068            for tok in &tokens[*i..] {
1069                match tok.kind {
1070                    YamlToken::DocumentStart | YamlToken::DocumentEnd => break,
1071                    YamlToken::Colon => has_colon = true,
1072                    YamlToken::Tag => has_tag = true,
1073                    YamlToken::Scalar
1074                    | YamlToken::BlockScalarHeader
1075                    | YamlToken::BlockScalarContent => {
1076                        has_scalar = true;
1077                        if !seen_block_seq && !tok.text.trim_start().starts_with('&') {
1078                            pre_seq_only_properties = false;
1079                        }
1080                    }
1081                    YamlToken::FlowMapStart
1082                    | YamlToken::FlowMapEnd
1083                    | YamlToken::FlowSeqStart
1084                    | YamlToken::FlowSeqEnd
1085                    | YamlToken::Comma => has_flow = true,
1086                    YamlToken::BlockSeqEntry => {
1087                        has_block_seq = true;
1088                        seen_block_seq = true;
1089                    }
1090                    _ => {}
1091                }
1092            }
1093            if has_colon || has_flow {
1094                DocumentBody::BlockMap
1095            } else if has_block_seq && pre_seq_only_properties {
1096                DocumentBody::BlockSequence
1097            } else if has_tag || has_scalar {
1098                DocumentBody::Scalar
1099            } else {
1100                DocumentBody::BlockMap
1101            }
1102        }
1103    };
1104
1105    match body_kind {
1106        DocumentBody::Empty => {}
1107        DocumentBody::BlockSequence => {
1108            builder.start_node(SyntaxKind::YAML_BLOCK_SEQUENCE.into());
1109            emit_block_seq(builder, tokens, i, false)?;
1110            builder.finish_node(); // YAML_BLOCK_SEQUENCE
1111        }
1112        DocumentBody::Scalar => emit_scalar_document(builder, tokens, i)?,
1113        DocumentBody::BlockMap => {
1114            builder.start_node(SyntaxKind::YAML_BLOCK_MAP.into());
1115            emit_block_map(builder, tokens, i, false)?;
1116            builder.finish_node(); // YAML_BLOCK_MAP
1117        }
1118    }
1119
1120    // Phase 3: optional `...` marker (and its trailing newline). Trivia
1121    // between the body and the marker that we did NOT consume into the body
1122    // belongs to the stream, not this document, so we don't drain it here —
1123    // except when the body was empty: then any comments/blank lines between
1124    // `---` and `...` semantically belong to the empty document, and so does
1125    // the `...` itself even when it lies a few trivia tokens away.
1126    if matches!(body_kind, DocumentBody::Empty) {
1127        let mut peek = *i;
1128        while peek < tokens.len() {
1129            match tokens[peek].kind {
1130                YamlToken::Newline | YamlToken::Whitespace | YamlToken::Comment => peek += 1,
1131                _ => break,
1132            }
1133        }
1134        if peek < tokens.len() && tokens[peek].kind == YamlToken::DocumentEnd {
1135            while *i < peek {
1136                match tokens[*i].kind {
1137                    YamlToken::Newline => {
1138                        builder.token(SyntaxKind::NEWLINE.into(), tokens[*i].text)
1139                    }
1140                    YamlToken::Whitespace => {
1141                        builder.token(SyntaxKind::WHITESPACE.into(), tokens[*i].text)
1142                    }
1143                    YamlToken::Comment => {
1144                        builder.token(SyntaxKind::YAML_COMMENT.into(), tokens[*i].text)
1145                    }
1146                    _ => unreachable!("only trivia in this range"),
1147                }
1148                *i += 1;
1149            }
1150        }
1151    }
1152    if *i < tokens.len() && tokens[*i].kind == YamlToken::DocumentEnd {
1153        builder.token(SyntaxKind::YAML_DOCUMENT_END.into(), tokens[*i].text);
1154        *i += 1;
1155        if *i < tokens.len() && tokens[*i].kind == YamlToken::Newline {
1156            builder.token(SyntaxKind::NEWLINE.into(), tokens[*i].text);
1157            *i += 1;
1158        }
1159    }
1160
1161    Ok(())
1162}
1163
/// How the body of a single YAML document is emitted, as decided by
/// `emit_document` after scanning ahead to the next document boundary.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum DocumentBody {
    /// No body: the next significant token is a document marker, or the
    /// input ends. Nothing is emitted for the body.
    Empty,
    /// Emitted by the block-sequence emitter inside a `YAML_BLOCK_SEQUENCE`
    /// node.
    BlockSequence,
    /// Emitted by the block-map emitter inside a `YAML_BLOCK_MAP` node. Also
    /// the fallback when the scan finds nothing more specific.
    BlockMap,
    /// Emitted by the scalar-document emitter.
    Scalar,
}