Skip to main content

panache_parser/parser/yaml/
parser.rs

1use crate::syntax::{SyntaxKind, SyntaxNode};
2use rowan::GreenNodeBuilder;
3
4use super::lexer::{lex_mapping_tokens_with_diagnostic, split_once_unquoted_key_colon};
5use super::model::{
6    ShadowYamlOptions, ShadowYamlOutcome, ShadowYamlReport, YamlDiagnostic, YamlInputKind,
7    YamlParseReport, YamlToken, YamlTokenSpan, diagnostic_codes,
8};
9
10/// Parse YAML in shadow mode using prototype groundwork only.
11///
12/// This API is intentionally read-only and does not replace production YAML
13/// parsing. By default it is disabled and reports `SkippedDisabled`.
14pub fn parse_shadow(input: &str, options: ShadowYamlOptions) -> ShadowYamlReport {
15    let line_count = input.lines().count().max(1);
16
17    if !options.enabled {
18        return ShadowYamlReport {
19            outcome: ShadowYamlOutcome::SkippedDisabled,
20            shadow_reason: "shadow-disabled",
21            input_kind: options.input_kind,
22            input_len_bytes: input.len(),
23            line_count,
24            normalized_input: None,
25        };
26    }
27
28    let normalized = match options.input_kind {
29        YamlInputKind::Plain => input.to_owned(),
30        YamlInputKind::Hashpipe => normalize_hashpipe_input(input),
31    };
32
33    let parsed = parse_yaml_tree(&normalized).is_some();
34
35    ShadowYamlReport {
36        outcome: if parsed {
37            ShadowYamlOutcome::PrototypeParsed
38        } else {
39            ShadowYamlOutcome::PrototypeRejected
40        },
41        shadow_reason: if parsed {
42            "prototype-basic-mapping-parsed"
43        } else {
44            "prototype-basic-mapping-rejected"
45        },
46        input_kind: options.input_kind,
47        input_len_bytes: input.len(),
48        line_count,
49        normalized_input: Some(normalized),
50    }
51}
52
53fn normalize_hashpipe_input(input: &str) -> String {
54    input
55        .lines()
56        .map(strip_hashpipe_prefix)
57        .collect::<Vec<_>>()
58        .join("\n")
59}
60
/// Remove a leading `#|` marker, plus at most one space directly after it,
/// from `line`.
///
/// Lines that do not start with `#|` are returned unchanged.
fn strip_hashpipe_prefix(line: &str) -> &str {
    match line.strip_prefix("#|") {
        // Only a single separating space is dropped; further indentation is
        // part of the YAML payload.
        Some(after_marker) => after_marker.strip_prefix(' ').unwrap_or(after_marker),
        None => line,
    }
}
67
68fn emit_token_as_yaml(builder: &mut GreenNodeBuilder<'_>, token: &YamlTokenSpan<'_>) {
69    let kind = match token.kind {
70        YamlToken::Whitespace => SyntaxKind::WHITESPACE,
71        YamlToken::Comment => SyntaxKind::YAML_COMMENT,
72        YamlToken::Tag => SyntaxKind::YAML_TAG,
73        YamlToken::Colon => SyntaxKind::YAML_COLON,
74        _ => SyntaxKind::YAML_SCALAR,
75    };
76    builder.token(kind.into(), token.text);
77}
78
79fn diag_at_token(
80    token: &YamlTokenSpan<'_>,
81    code: &'static str,
82    message: &'static str,
83) -> YamlDiagnostic {
84    YamlDiagnostic {
85        code,
86        message,
87        byte_start: token.byte_start,
88        byte_end: token.byte_end,
89    }
90}
91
92fn emit_flow_sequence<'a>(
93    builder: &mut GreenNodeBuilder<'_>,
94    tokens: &[YamlTokenSpan<'a>],
95    i: &mut usize,
96) -> Result<(), YamlDiagnostic> {
97    if *i >= tokens.len() || tokens[*i].kind != YamlToken::FlowSeqStart {
98        return Err(YamlDiagnostic {
99            code: diagnostic_codes::PARSE_EXPECTED_FLOW_SEQUENCE_START,
100            message: "expected flow sequence start token",
101            byte_start: tokens.get(*i).map(|t| t.byte_start).unwrap_or(0),
102            byte_end: tokens.get(*i).map(|t| t.byte_end).unwrap_or(0),
103        });
104    }
105
106    builder.start_node(SyntaxKind::YAML_FLOW_SEQUENCE.into());
107    emit_token_as_yaml(builder, &tokens[*i]); // [
108    *i += 1;
109
110    let mut open_item = false;
111    while *i < tokens.len() {
112        match tokens[*i].kind {
113            YamlToken::FlowSeqEnd => {
114                if open_item {
115                    builder.finish_node(); // YAML_FLOW_SEQUENCE_ITEM
116                }
117                emit_token_as_yaml(builder, &tokens[*i]); // ]
118                *i += 1;
119                if *i < tokens.len() {
120                    match tokens[*i].kind {
121                        YamlToken::Newline | YamlToken::Comment => {}
122                        YamlToken::Whitespace if tokens[*i].text.trim().is_empty() => {}
123                        _ => {
124                            return Err(diag_at_token(
125                                &tokens[*i],
126                                diagnostic_codes::PARSE_TRAILING_CONTENT_AFTER_FLOW_END,
127                                "trailing content after flow sequence end",
128                            ));
129                        }
130                    }
131                }
132                builder.finish_node(); // YAML_FLOW_SEQUENCE
133                return Ok(());
134            }
135            YamlToken::Comma => {
136                if !open_item {
137                    return Err(diag_at_token(
138                        &tokens[*i],
139                        diagnostic_codes::PARSE_INVALID_FLOW_SEQUENCE_COMMA,
140                        "invalid comma position in flow sequence",
141                    ));
142                }
143                builder.finish_node(); // YAML_FLOW_SEQUENCE_ITEM
144                open_item = false;
145                emit_token_as_yaml(builder, &tokens[*i]);
146                *i += 1;
147            }
148            YamlToken::Whitespace | YamlToken::Newline | YamlToken::Indent | YamlToken::Dedent
149                if !open_item =>
150            {
151                emit_token_as_yaml(builder, &tokens[*i]);
152                *i += 1;
153            }
154            YamlToken::Scalar if !open_item && tokens[*i].text.trim().is_empty() => {
155                emit_token_as_yaml(builder, &tokens[*i]);
156                *i += 1;
157            }
158            YamlToken::FlowSeqStart => {
159                if !open_item {
160                    builder.start_node(SyntaxKind::YAML_FLOW_SEQUENCE_ITEM.into());
161                    open_item = true;
162                }
163                emit_flow_sequence(builder, tokens, i)?;
164            }
165            YamlToken::FlowMapStart => {
166                if !open_item {
167                    builder.start_node(SyntaxKind::YAML_FLOW_SEQUENCE_ITEM.into());
168                    open_item = true;
169                }
170                emit_flow_map(builder, tokens, i)?;
171            }
172            _ => {
173                if !open_item {
174                    builder.start_node(SyntaxKind::YAML_FLOW_SEQUENCE_ITEM.into());
175                    open_item = true;
176                }
177                emit_token_as_yaml(builder, &tokens[*i]);
178                *i += 1;
179            }
180        }
181    }
182
183    let (byte_start, byte_end) =
184        if let Some(start) = tokens.iter().find(|t| t.kind == YamlToken::FlowSeqStart) {
185            (
186                start.byte_start,
187                tokens.last().map(|t| t.byte_end).unwrap_or(start.byte_end),
188            )
189        } else {
190            tokens
191                .last()
192                .map(|t| (t.byte_start, t.byte_end))
193                .unwrap_or((0, 0))
194        };
195    Err(YamlDiagnostic {
196        code: diagnostic_codes::PARSE_UNTERMINATED_FLOW_SEQUENCE,
197        message: "unterminated flow sequence",
198        byte_start,
199        byte_end,
200    })
201}
202
203fn emit_flow_map<'a>(
204    builder: &mut GreenNodeBuilder<'_>,
205    tokens: &[YamlTokenSpan<'a>],
206    i: &mut usize,
207) -> Result<(), YamlDiagnostic> {
208    if *i >= tokens.len() || tokens[*i].kind != YamlToken::FlowMapStart {
209        return Err(YamlDiagnostic {
210            code: diagnostic_codes::PARSE_EXPECTED_FLOW_MAP_START,
211            message: "expected flow map start token",
212            byte_start: tokens.get(*i).map(|t| t.byte_start).unwrap_or(0),
213            byte_end: tokens.get(*i).map(|t| t.byte_end).unwrap_or(0),
214        });
215    }
216
217    builder.start_node(SyntaxKind::YAML_FLOW_MAP.into());
218    emit_token_as_yaml(builder, &tokens[*i]); // {
219    *i += 1;
220
221    loop {
222        // Skip inter-entry whitespace and newlines. The flow lexer chunks
223        // text between flow indicators into Scalar tokens, including
224        // whitespace-only chunks like the space in `, }` — treat those as
225        // trivia here so they do not synthesize phantom entries. Indent and
226        // Dedent are emitted on multi-line flow continuations and carry no
227        // semantic weight inside a flow collection.
228        while *i < tokens.len()
229            && (matches!(
230                tokens[*i].kind,
231                YamlToken::Whitespace | YamlToken::Newline | YamlToken::Indent | YamlToken::Dedent
232            ) || (tokens[*i].kind == YamlToken::Scalar && tokens[*i].text.trim().is_empty()))
233        {
234            emit_token_as_yaml(builder, &tokens[*i]);
235            *i += 1;
236        }
237
238        if *i >= tokens.len() {
239            let (byte_start, byte_end) = tokens
240                .last()
241                .map(|t| (t.byte_start, t.byte_end))
242                .unwrap_or((0, 0));
243            return Err(YamlDiagnostic {
244                code: diagnostic_codes::PARSE_UNTERMINATED_FLOW_MAP,
245                message: "unterminated flow map",
246                byte_start,
247                byte_end,
248            });
249        }
250
251        match tokens[*i].kind {
252            YamlToken::FlowMapEnd => {
253                emit_token_as_yaml(builder, &tokens[*i]);
254                *i += 1;
255                if *i < tokens.len() {
256                    match tokens[*i].kind {
257                        YamlToken::Newline
258                        | YamlToken::Comment
259                        | YamlToken::Whitespace
260                        | YamlToken::FlowMapEnd
261                        | YamlToken::FlowSeqEnd
262                        | YamlToken::Comma => {}
263                        _ => {
264                            return Err(diag_at_token(
265                                &tokens[*i],
266                                diagnostic_codes::PARSE_TRAILING_CONTENT_AFTER_FLOW_END,
267                                "trailing content after flow map end",
268                            ));
269                        }
270                    }
271                }
272                builder.finish_node(); // YAML_FLOW_MAP
273                return Ok(());
274            }
275            YamlToken::Comma => {
276                emit_token_as_yaml(builder, &tokens[*i]);
277                *i += 1;
278            }
279            _ => {
280                emit_flow_map_entry(builder, tokens, i)?;
281            }
282        }
283    }
284}
285
286fn emit_flow_map_entry<'a>(
287    builder: &mut GreenNodeBuilder<'_>,
288    tokens: &[YamlTokenSpan<'a>],
289    i: &mut usize,
290) -> Result<(), YamlDiagnostic> {
291    builder.start_node(SyntaxKind::YAML_FLOW_MAP_ENTRY.into());
292    builder.start_node(SyntaxKind::YAML_FLOW_MAP_KEY.into());
293
294    // Emit leading whitespace and zero-width indent markers inside key node.
295    // Indent/Dedent appear on multi-line flow continuations but are not
296    // semantic inside a flow collection.
297    while *i < tokens.len()
298        && matches!(
299            tokens[*i].kind,
300            YamlToken::Whitespace | YamlToken::Indent | YamlToken::Dedent
301        )
302    {
303        emit_token_as_yaml(builder, &tokens[*i]);
304        *i += 1;
305    }
306
307    // Locate the colon that terminates an implicit key, if any. The implicit
308    // key may span several Scalar tokens separated by Newline/Whitespace/
309    // Indent/Dedent when a flow-map entry's key wraps across lines (e.g.
310    // `{ multi\n  line: value}`). Nested flow openers, an explicit
311    // `Key`/`Tag`/`Anchor`/`Alias` indicator, or a structural delimiter end
312    // the search.
313    let colon_at: Option<usize> = {
314        let mut j = *i;
315        let mut found = None;
316        while j < tokens.len() {
317            match tokens[j].kind {
318                YamlToken::Comma
319                | YamlToken::FlowMapEnd
320                | YamlToken::FlowSeqEnd
321                | YamlToken::FlowMapStart
322                | YamlToken::FlowSeqStart
323                | YamlToken::Tag
324                | YamlToken::Key
325                | YamlToken::Anchor
326                | YamlToken::Alias => break,
327                YamlToken::Scalar => {
328                    if split_once_unquoted_key_colon(tokens[j].text).is_some() {
329                        found = Some(j);
330                        break;
331                    }
332                }
333                _ => {}
334            }
335            j += 1;
336        }
337        found
338    };
339
340    let value_prefix: Option<&'a str> = if let Some(target) = colon_at {
341        // Emit any inter-scalar trivia (Newline / Whitespace / Indent / Dedent
342        // / preceding key-half Scalar chunks) into the key node before the
343        // colon split.
344        while *i < target {
345            emit_token_as_yaml(builder, &tokens[*i]);
346            *i += 1;
347        }
348        let scalar = tokens[target];
349        *i += 1;
350        let (key_text, rest_text) = split_once_unquoted_key_colon(scalar.text)
351            .expect("implicit-key scan promised a colon in this scalar");
352        if !key_text.is_empty() {
353            builder.token(SyntaxKind::YAML_KEY.into(), key_text);
354        }
355        builder.token(
356            SyntaxKind::YAML_COLON.into(),
357            &scalar.text[key_text.len()..key_text.len() + 1],
358        );
359        Some(rest_text)
360    } else {
361        match tokens.get(*i).map(|t| t.kind) {
362            Some(YamlToken::Scalar) => {
363                let scalar = tokens[*i];
364                *i += 1;
365                builder.token(SyntaxKind::YAML_SCALAR.into(), scalar.text);
366                None
367            }
368            Some(YamlToken::Key) => {
369                builder.token(SyntaxKind::YAML_KEY.into(), tokens[*i].text);
370                *i += 1;
371                while *i < tokens.len() && tokens[*i].kind == YamlToken::Whitespace {
372                    emit_token_as_yaml(builder, &tokens[*i]);
373                    *i += 1;
374                }
375                if *i < tokens.len() && tokens[*i].kind == YamlToken::Colon {
376                    builder.token(SyntaxKind::YAML_COLON.into(), tokens[*i].text);
377                    *i += 1;
378                }
379                None
380            }
381            Some(YamlToken::Tag) => {
382                emit_token_as_yaml(builder, &tokens[*i]);
383                *i += 1;
384                None
385            }
386            _ => None,
387        }
388    };
389
390    builder.finish_node(); // YAML_FLOW_MAP_KEY
391
392    builder.start_node(SyntaxKind::YAML_FLOW_MAP_VALUE.into());
393    if let Some(prefix) = value_prefix
394        && !prefix.is_empty()
395    {
396        builder.token(SyntaxKind::YAML_SCALAR.into(), prefix);
397    }
398    emit_flow_value_tokens(builder, tokens, i)?;
399    builder.finish_node(); // YAML_FLOW_MAP_VALUE
400
401    builder.finish_node(); // YAML_FLOW_MAP_ENTRY
402    Ok(())
403}
404
405fn emit_flow_value_tokens<'a>(
406    builder: &mut GreenNodeBuilder<'_>,
407    tokens: &[YamlTokenSpan<'a>],
408    i: &mut usize,
409) -> Result<(), YamlDiagnostic> {
410    while *i < tokens.len() {
411        match tokens[*i].kind {
412            YamlToken::Comma | YamlToken::FlowMapEnd | YamlToken::FlowSeqEnd => break,
413            YamlToken::FlowMapStart => emit_flow_map(builder, tokens, i)?,
414            YamlToken::FlowSeqStart => emit_flow_sequence(builder, tokens, i)?,
415            _ => {
416                emit_token_as_yaml(builder, &tokens[*i]);
417                *i += 1;
418            }
419        }
420    }
421    Ok(())
422}
423
424fn emit_scalar_document<'a>(
425    builder: &mut GreenNodeBuilder<'_>,
426    tokens: &[YamlTokenSpan<'a>],
427    i: &mut usize,
428) -> Result<(), YamlDiagnostic> {
429    while *i < tokens.len() {
430        let kind = match tokens[*i].kind {
431            YamlToken::Newline => SyntaxKind::NEWLINE,
432            // Document boundaries close the scalar body; the stream loop will
433            // emit them at the YAML_DOCUMENT level.
434            YamlToken::DocumentStart | YamlToken::DocumentEnd => break,
435            YamlToken::Tag => SyntaxKind::YAML_TAG,
436            YamlToken::Comment => SyntaxKind::YAML_COMMENT,
437            YamlToken::Whitespace => SyntaxKind::WHITESPACE,
438            YamlToken::Colon => SyntaxKind::YAML_COLON,
439            YamlToken::FlowMapStart
440            | YamlToken::FlowMapEnd
441            | YamlToken::FlowSeqStart
442            | YamlToken::FlowSeqEnd
443            | YamlToken::Comma => {
444                return Err(diag_at_token(
445                    &tokens[*i],
446                    diagnostic_codes::PARSE_UNEXPECTED_FLOW_CLOSER,
447                    "unexpected flow indicator in plain scalar document",
448                ));
449            }
450            _ => SyntaxKind::YAML_SCALAR,
451        };
452        builder.token(kind.into(), tokens[*i].text);
453        *i += 1;
454    }
455    Ok(())
456}
457
458fn emit_block_seq<'a>(
459    builder: &mut GreenNodeBuilder<'_>,
460    tokens: &[YamlTokenSpan<'a>],
461    i: &mut usize,
462    stop_on_dedent: bool,
463) -> Result<(), YamlDiagnostic> {
464    while *i < tokens.len() {
465        match tokens[*i].kind {
466            YamlToken::Newline => {
467                builder.token(SyntaxKind::NEWLINE.into(), tokens[*i].text);
468                *i += 1;
469            }
470            // Document boundaries close the body; the stream loop will pick
471            // them up at the YAML_DOCUMENT level.
472            YamlToken::DocumentStart | YamlToken::DocumentEnd => break,
473            YamlToken::Whitespace => {
474                // Between-item indentation in a nested sequence.
475                builder.token(SyntaxKind::WHITESPACE.into(), tokens[*i].text);
476                *i += 1;
477            }
478            YamlToken::Dedent => {
479                if stop_on_dedent {
480                    *i += 1;
481                    break;
482                }
483                break;
484            }
485            YamlToken::BlockSeqEntry => emit_block_seq_item(builder, tokens, i)?,
486            _ => break,
487        }
488    }
489    Ok(())
490}
491
492fn emit_block_seq_item<'a>(
493    builder: &mut GreenNodeBuilder<'_>,
494    tokens: &[YamlTokenSpan<'a>],
495    i: &mut usize,
496) -> Result<(), YamlDiagnostic> {
497    builder.start_node(SyntaxKind::YAML_BLOCK_SEQUENCE_ITEM.into());
498    builder.token(SyntaxKind::YAML_BLOCK_SEQ_ENTRY.into(), tokens[*i].text);
499    *i += 1;
500    let mut closed_via_nested_seq = false;
501    while *i < tokens.len() && tokens[*i].kind != YamlToken::Newline {
502        match tokens[*i].kind {
503            YamlToken::FlowSeqStart => emit_flow_sequence(builder, tokens, i)?,
504            YamlToken::FlowMapStart => emit_flow_map(builder, tokens, i)?,
505            YamlToken::Indent => {
506                // Nested block sequence triggered by `- - ...`: the lexer
507                // emitted an Indent between the outer `- ` and the inner
508                // `-`. Recurse; the nested emitter consumes through the
509                // matching Dedent (including any intervening Newlines), so
510                // the outer item has no trailing Newline to emit.
511                *i += 1;
512                builder.start_node(SyntaxKind::YAML_BLOCK_SEQUENCE.into());
513                emit_block_seq(builder, tokens, i, true)?;
514                builder.finish_node(); // YAML_BLOCK_SEQUENCE
515                closed_via_nested_seq = true;
516                break;
517            }
518            _ => {
519                emit_token_as_yaml(builder, &tokens[*i]);
520                *i += 1;
521            }
522        }
523    }
524    if !closed_via_nested_seq && *i < tokens.len() && tokens[*i].kind == YamlToken::Newline {
525        builder.token(SyntaxKind::NEWLINE.into(), tokens[*i].text);
526        *i += 1;
527    }
528    // Nested block map following a bare `-\n` entry: lexer has emitted an
529    // Indent after the Newline, terminated by a Dedent.
530    if !closed_via_nested_seq && *i < tokens.len() && tokens[*i].kind == YamlToken::Indent {
531        *i += 1;
532        builder.start_node(SyntaxKind::YAML_BLOCK_MAP.into());
533        emit_block_map(builder, tokens, i, true)?;
534        builder.finish_node(); // YAML_BLOCK_MAP
535    }
536    builder.finish_node(); // YAML_BLOCK_SEQUENCE_ITEM
537    Ok(())
538}
539
540fn emit_block_map<'a>(
541    builder: &mut GreenNodeBuilder<'_>,
542    tokens: &[YamlTokenSpan<'a>],
543    i: &mut usize,
544    stop_on_dedent: bool,
545) -> Result<(), YamlDiagnostic> {
546    let mut closed_by_dedent = false;
547    while *i < tokens.len() {
548        match tokens[*i].kind {
549            YamlToken::Newline => {
550                builder.token(SyntaxKind::NEWLINE.into(), tokens[*i].text);
551                *i += 1;
552            }
553            // Document boundaries close the body; the stream loop picks them
554            // up at the YAML_DOCUMENT level.
555            YamlToken::DocumentStart | YamlToken::DocumentEnd => break,
556            YamlToken::Directive | YamlToken::Comma => {
557                builder.token(SyntaxKind::YAML_SCALAR.into(), tokens[*i].text);
558                *i += 1;
559            }
560            YamlToken::FlowMapEnd | YamlToken::FlowSeqEnd => {
561                return Err(diag_at_token(
562                    &tokens[*i],
563                    diagnostic_codes::PARSE_UNEXPECTED_FLOW_CLOSER,
564                    "unexpected flow closing token",
565                ));
566            }
567            YamlToken::FlowMapStart | YamlToken::FlowSeqStart => {
568                if tokens[*i].kind == YamlToken::FlowMapStart {
569                    emit_flow_map(builder, tokens, i)?;
570                } else {
571                    emit_flow_sequence(builder, tokens, i)?;
572                }
573            }
574            YamlToken::Anchor
575            | YamlToken::Alias
576            | YamlToken::BlockScalarHeader
577            | YamlToken::BlockScalarContent => {
578                builder.token(SyntaxKind::YAML_SCALAR.into(), tokens[*i].text);
579                *i += 1;
580            }
581            YamlToken::Scalar | YamlToken::Comment => {
582                while *i < tokens.len() && tokens[*i].kind != YamlToken::Newline {
583                    if matches!(
584                        tokens[*i].kind,
585                        YamlToken::FlowMapEnd | YamlToken::FlowSeqEnd
586                    ) {
587                        return Err(diag_at_token(
588                            &tokens[*i],
589                            diagnostic_codes::PARSE_UNEXPECTED_FLOW_CLOSER,
590                            "unexpected flow closing token",
591                        ));
592                    }
593                    emit_token_as_yaml(builder, &tokens[*i]);
594                    *i += 1;
595                }
596            }
597            YamlToken::Indent => {
598                return Err(diag_at_token(
599                    &tokens[*i],
600                    diagnostic_codes::PARSE_UNEXPECTED_INDENT,
601                    "unexpected indent token while parsing block map",
602                ));
603            }
604            YamlToken::Dedent => {
605                if stop_on_dedent {
606                    *i += 1;
607                    closed_by_dedent = true;
608                    break;
609                }
610                return Err(diag_at_token(
611                    &tokens[*i],
612                    diagnostic_codes::PARSE_UNEXPECTED_DEDENT,
613                    "unexpected dedent token while parsing block map",
614                ));
615            }
616            _ => emit_block_map_entry(builder, tokens, i)?,
617        }
618    }
619
620    if stop_on_dedent && !closed_by_dedent {
621        let (byte_start, byte_end) = tokens
622            .last()
623            .map(|t| (t.byte_start, t.byte_end))
624            .unwrap_or((0, 0));
625        return Err(YamlDiagnostic {
626            code: diagnostic_codes::PARSE_UNTERMINATED_BLOCK_MAP,
627            message: "unterminated indented block map",
628            byte_start,
629            byte_end,
630        });
631    }
632
633    Ok(())
634}
635
636fn emit_block_map_entry<'a>(
637    builder: &mut GreenNodeBuilder<'_>,
638    tokens: &[YamlTokenSpan<'a>],
639    i: &mut usize,
640) -> Result<(), YamlDiagnostic> {
641    builder.start_node(SyntaxKind::YAML_BLOCK_MAP_ENTRY.into());
642    emit_block_map_key(builder, tokens, i)?;
643    let trailing_newline = emit_block_map_value(builder, tokens, i)?;
644    if let Some(newline) = trailing_newline {
645        builder.token(SyntaxKind::NEWLINE.into(), newline);
646    }
647    builder.finish_node(); // YAML_BLOCK_MAP_ENTRY
648    Ok(())
649}
650
651fn emit_block_map_key<'a>(
652    builder: &mut GreenNodeBuilder<'_>,
653    tokens: &[YamlTokenSpan<'a>],
654    i: &mut usize,
655) -> Result<(), YamlDiagnostic> {
656    builder.start_node(SyntaxKind::YAML_BLOCK_MAP_KEY.into());
657
658    let mut saw_colon = false;
659    while *i < tokens.len() {
660        match tokens[*i].kind {
661            YamlToken::Key => {
662                builder.token(SyntaxKind::YAML_KEY.into(), tokens[*i].text);
663                *i += 1;
664            }
665            YamlToken::Tag => {
666                builder.token(SyntaxKind::YAML_TAG.into(), tokens[*i].text);
667                *i += 1;
668            }
669            YamlToken::Whitespace => {
670                builder.token(SyntaxKind::WHITESPACE.into(), tokens[*i].text);
671                *i += 1;
672            }
673            YamlToken::Colon => {
674                builder.token(SyntaxKind::YAML_COLON.into(), tokens[*i].text);
675                *i += 1;
676                saw_colon = true;
677                break;
678            }
679            _ => {
680                return Err(diag_at_token(
681                    &tokens[*i],
682                    diagnostic_codes::PARSE_INVALID_KEY_TOKEN,
683                    "invalid token while parsing block map key",
684                ));
685            }
686        }
687    }
688    if !saw_colon {
689        return Err(diag_at_token(
690            &tokens[(*i).saturating_sub(1)],
691            diagnostic_codes::PARSE_MISSING_COLON,
692            "missing colon in block map entry",
693        ));
694    }
695    builder.finish_node(); // YAML_BLOCK_MAP_KEY
696    Ok(())
697}
698
699/// Emit `YAML_BLOCK_MAP_VALUE` and return the trailing newline (if any) that
700/// the caller should emit after the value node closes. The newline is held
701/// back so that a nested block map can be wired in after the newline rather
702/// than before, preserving byte order in the CST.
703fn emit_block_map_value<'a>(
704    builder: &mut GreenNodeBuilder<'_>,
705    tokens: &[YamlTokenSpan<'a>],
706    i: &mut usize,
707) -> Result<Option<&'a str>, YamlDiagnostic> {
708    builder.start_node(SyntaxKind::YAML_BLOCK_MAP_VALUE.into());
709    while *i < tokens.len() {
710        match tokens[*i].kind {
711            YamlToken::Scalar => {
712                builder.token(SyntaxKind::YAML_SCALAR.into(), tokens[*i].text);
713                *i += 1;
714            }
715            YamlToken::FlowMapStart => emit_flow_map(builder, tokens, i)?,
716            YamlToken::FlowSeqStart => emit_flow_sequence(builder, tokens, i)?,
717            YamlToken::Anchor | YamlToken::Alias => {
718                builder.token(SyntaxKind::YAML_SCALAR.into(), tokens[*i].text);
719                *i += 1;
720            }
721            YamlToken::BlockScalarHeader => {
722                consume_block_scalar(builder, tokens, i);
723            }
724            YamlToken::BlockScalarContent => {
725                builder.token(SyntaxKind::YAML_SCALAR.into(), tokens[*i].text);
726                *i += 1;
727            }
728            YamlToken::FlowMapEnd | YamlToken::FlowSeqEnd | YamlToken::Comma => break,
729            YamlToken::Tag => {
730                builder.token(SyntaxKind::YAML_TAG.into(), tokens[*i].text);
731                *i += 1;
732            }
733            YamlToken::Comment => {
734                builder.token(SyntaxKind::YAML_COMMENT.into(), tokens[*i].text);
735                *i += 1;
736            }
737            YamlToken::Whitespace => {
738                builder.token(SyntaxKind::WHITESPACE.into(), tokens[*i].text);
739                *i += 1;
740            }
741            _ => break,
742        }
743    }
744
745    let mut trailing_newline: Option<&str> = None;
746    if *i < tokens.len() && tokens[*i].kind == YamlToken::Newline {
747        trailing_newline = Some(tokens[*i].text);
748        *i += 1;
749    }
750
751    if *i < tokens.len() && tokens[*i].kind == YamlToken::Indent {
752        *i += 1;
753        // Emit trailing newline before nested content to preserve byte order.
754        if let Some(newline) = trailing_newline.take() {
755            builder.token(SyntaxKind::NEWLINE.into(), newline);
756        }
757        builder.start_node(SyntaxKind::YAML_BLOCK_MAP.into());
758        emit_block_map(builder, tokens, i, true)?;
759        builder.finish_node(); // YAML_BLOCK_MAP
760    }
761
762    builder.finish_node(); // YAML_BLOCK_MAP_VALUE
763    Ok(trailing_newline)
764}
765
766/// Consume a literal/folded block-scalar header (`|` / `>`) and the
767/// following content lines. Each line is emitted as a `YAML_SCALAR` token
768/// with `NEWLINE` separators. Blank-line newlines that belong to the scalar
769/// body are absorbed so the entire body lives inside the value node.
770fn consume_block_scalar<'a>(
771    builder: &mut GreenNodeBuilder<'_>,
772    tokens: &[YamlTokenSpan<'a>],
773    i: &mut usize,
774) {
775    builder.token(SyntaxKind::YAML_SCALAR.into(), tokens[*i].text);
776    *i += 1;
777    while *i < tokens.len() {
778        match tokens[*i].kind {
779            YamlToken::Newline => {
780                builder.token(SyntaxKind::NEWLINE.into(), tokens[*i].text);
781                *i += 1;
782                if *i < tokens.len()
783                    && matches!(
784                        tokens[*i].kind,
785                        YamlToken::BlockScalarContent | YamlToken::Newline
786                    )
787                {
788                    continue;
789                }
790                break;
791            }
792            YamlToken::BlockScalarContent => {
793                builder.token(SyntaxKind::YAML_SCALAR.into(), tokens[*i].text);
794                *i += 1;
795            }
796            _ => break,
797        }
798    }
799}
800
801/// Parse prototype YAML tree structure from input
802pub fn parse_yaml_tree(input: &str) -> Option<SyntaxNode> {
803    parse_yaml_report(input).tree
804}
805
806/// Parse prototype YAML tree structure and include diagnostics on failure.
807pub fn parse_yaml_report(input: &str) -> YamlParseReport {
808    let tokens = match lex_mapping_tokens_with_diagnostic(input) {
809        Ok(tokens) => tokens,
810        Err(err) => {
811            return YamlParseReport {
812                tree: None,
813                diagnostics: vec![err],
814            };
815        }
816    };
817
818    let mut seen_content = false;
819    for token in &tokens {
820        match token.kind {
821            YamlToken::Directive if seen_content => {
822                return YamlParseReport {
823                    tree: None,
824                    diagnostics: vec![diag_at_token(
825                        token,
826                        diagnostic_codes::PARSE_DIRECTIVE_AFTER_CONTENT,
827                        "directive requires document end before subsequent directives",
828                    )],
829                };
830            }
831            YamlToken::Directive
832            | YamlToken::Newline
833            | YamlToken::Whitespace
834            | YamlToken::Comment => {}
835            YamlToken::DocumentEnd => seen_content = false,
836            _ => seen_content = true,
837        }
838    }
839
840    if let Some(directive) = tokens.iter().find(|t| t.kind == YamlToken::Directive)
841        && !tokens.iter().any(|t| t.kind == YamlToken::DocumentStart)
842    {
843        return YamlParseReport {
844            tree: None,
845            diagnostics: vec![diag_at_token(
846                directive,
847                diagnostic_codes::PARSE_DIRECTIVE_WITHOUT_DOCUMENT_START,
848                "directive requires an explicit document start marker",
849            )],
850        };
851    }
852
853    let mut builder = GreenNodeBuilder::new();
854    builder.start_node(SyntaxKind::DOCUMENT.into());
855    builder.start_node(SyntaxKind::YAML_METADATA_CONTENT.into());
856    builder.start_node(SyntaxKind::YAML_STREAM.into());
857    if let Err(err) = parse_stream(&mut builder, &tokens) {
858        return YamlParseReport {
859            tree: None,
860            diagnostics: vec![err],
861        };
862    }
863    builder.finish_node(); // YAML_STREAM
864    builder.finish_node(); // YAML_METADATA_CONTENT
865    builder.finish_node(); // DOCUMENT
866    YamlParseReport {
867        tree: Some(SyntaxNode::new_root(builder.finish())),
868        diagnostics: Vec::new(),
869    }
870}
871
872/// Outer stream loop. Walks every token and emits zero or more `YAML_DOCUMENT`
873/// nodes interleaved with stream-level trivia (newlines, whitespace, comments,
874/// and bare `...` markers that don't bracket a document body).
875fn parse_stream<'a>(
876    builder: &mut GreenNodeBuilder<'_>,
877    tokens: &[YamlTokenSpan<'a>],
878) -> Result<(), YamlDiagnostic> {
879    let mut i = 0usize;
880    while i < tokens.len() {
881        match tokens[i].kind {
882            YamlToken::Newline => {
883                builder.token(SyntaxKind::NEWLINE.into(), tokens[i].text);
884                i += 1;
885            }
886            YamlToken::Whitespace => {
887                builder.token(SyntaxKind::WHITESPACE.into(), tokens[i].text);
888                i += 1;
889            }
890            YamlToken::Comment => {
891                builder.token(SyntaxKind::YAML_COMMENT.into(), tokens[i].text);
892                i += 1;
893            }
894            // Indent/Dedent are zero-width balance markers from the lexer.
895            // If they leak out of a body emitter (e.g. trailing Dedent at
896            // end of input), absorb them silently — they carry no bytes.
897            YamlToken::Indent | YamlToken::Dedent => {
898                i += 1;
899            }
900            // Bare `...` at stream level — no preceding document body, no
901            // following body before another `...`/EOF — is stream-level
902            // trivia, not its own document.
903            YamlToken::DocumentEnd if !document_follows(tokens, i + 1) => {
904                builder.token(SyntaxKind::YAML_DOCUMENT_END.into(), tokens[i].text);
905                i += 1;
906            }
907            _ => {
908                builder.start_node(SyntaxKind::YAML_DOCUMENT.into());
909                emit_document(builder, tokens, &mut i)?;
910                builder.finish_node(); // YAML_DOCUMENT
911            }
912        }
913    }
914    Ok(())
915}
916
917/// Returns `true` if the tokens at or after `start` contain any
918/// document-defining token (directive, doc-start, body content, doc-end). We
919/// use this to decide whether a bare `...` is "the end of nothing" (stream
920/// trivia) or actually closes a document yet to come (still trivia, just at
921/// stream level).
922fn document_follows(tokens: &[YamlTokenSpan<'_>], start: usize) -> bool {
923    tokens[start..].iter().any(|t| {
924        !matches!(
925            t.kind,
926            YamlToken::Newline
927                | YamlToken::Whitespace
928                | YamlToken::Comment
929                | YamlToken::DocumentEnd
930        )
931    })
932}
933
934/// Emit a single `YAML_DOCUMENT`. Optionally consumes leading directives and a
935/// `---` marker, dispatches to the body emitter, then optionally consumes a
936/// trailing `...` marker. Each phase is forgiving: an absent `---`, absent
937/// `...`, or empty body is fine.
938fn emit_document<'a>(
939    builder: &mut GreenNodeBuilder<'_>,
940    tokens: &[YamlTokenSpan<'a>],
941    i: &mut usize,
942) -> Result<(), YamlDiagnostic> {
943    // Phase 1: optional directives + `---` marker (with intervening trivia).
944    let mut saw_marker = false;
945    while *i < tokens.len() {
946        match tokens[*i].kind {
947            YamlToken::Directive => {
948                builder.token(SyntaxKind::YAML_SCALAR.into(), tokens[*i].text);
949                *i += 1;
950            }
951            YamlToken::Newline => {
952                builder.token(SyntaxKind::NEWLINE.into(), tokens[*i].text);
953                *i += 1;
954            }
955            YamlToken::Whitespace => {
956                builder.token(SyntaxKind::WHITESPACE.into(), tokens[*i].text);
957                *i += 1;
958            }
959            YamlToken::Comment => {
960                builder.token(SyntaxKind::YAML_COMMENT.into(), tokens[*i].text);
961                *i += 1;
962            }
963            YamlToken::DocumentStart => {
964                builder.token(SyntaxKind::YAML_DOCUMENT_START.into(), tokens[*i].text);
965                *i += 1;
966                saw_marker = true;
967                if *i < tokens.len() && tokens[*i].kind == YamlToken::Newline {
968                    builder.token(SyntaxKind::NEWLINE.into(), tokens[*i].text);
969                    *i += 1;
970                }
971                break;
972            }
973            _ => break,
974        }
975    }
976    let _ = saw_marker;
977
978    // Phase 2: body.
979    let next_significant = tokens[*i..].iter().find(|t| {
980        !matches!(
981            t.kind,
982            YamlToken::Newline | YamlToken::Whitespace | YamlToken::Comment
983        )
984    });
985
986    let body_kind = match next_significant.map(|t| t.kind) {
987        Some(YamlToken::DocumentStart) | Some(YamlToken::DocumentEnd) | None => DocumentBody::Empty,
988        Some(YamlToken::BlockSeqEntry) => DocumentBody::BlockSequence,
989        _ => {
990            // Tagless scalar documents continue to dispatch to the block-map
991            // emitter for byte-level CST stability. Tagged scalar documents
992            // (e.g. `! a`, `!!str foo`) take the dedicated path because they
993            // lack a colon and would trip the key/colon expectation.
994            let mut has_colon = false;
995            let mut has_tag = false;
996            for tok in &tokens[*i..] {
997                match tok.kind {
998                    YamlToken::DocumentStart | YamlToken::DocumentEnd => break,
999                    YamlToken::Colon => has_colon = true,
1000                    YamlToken::Tag => has_tag = true,
1001                    _ => {}
1002                }
1003            }
1004            if !has_colon && has_tag {
1005                DocumentBody::Scalar
1006            } else {
1007                DocumentBody::BlockMap
1008            }
1009        }
1010    };
1011
1012    match body_kind {
1013        DocumentBody::Empty => {}
1014        DocumentBody::BlockSequence => {
1015            builder.start_node(SyntaxKind::YAML_BLOCK_SEQUENCE.into());
1016            emit_block_seq(builder, tokens, i, false)?;
1017            builder.finish_node(); // YAML_BLOCK_SEQUENCE
1018        }
1019        DocumentBody::Scalar => emit_scalar_document(builder, tokens, i)?,
1020        DocumentBody::BlockMap => {
1021            builder.start_node(SyntaxKind::YAML_BLOCK_MAP.into());
1022            emit_block_map(builder, tokens, i, false)?;
1023            builder.finish_node(); // YAML_BLOCK_MAP
1024        }
1025    }
1026
1027    // Phase 3: optional `...` marker (and its trailing newline). Trivia
1028    // between the body and the marker that we did NOT consume into the body
1029    // belongs to the stream, not this document, so we don't drain it here —
1030    // except when the body was empty: then any comments/blank lines between
1031    // `---` and `...` semantically belong to the empty document, and so does
1032    // the `...` itself even when it lies a few trivia tokens away.
1033    if matches!(body_kind, DocumentBody::Empty) {
1034        let mut peek = *i;
1035        while peek < tokens.len() {
1036            match tokens[peek].kind {
1037                YamlToken::Newline | YamlToken::Whitespace | YamlToken::Comment => peek += 1,
1038                _ => break,
1039            }
1040        }
1041        if peek < tokens.len() && tokens[peek].kind == YamlToken::DocumentEnd {
1042            while *i < peek {
1043                match tokens[*i].kind {
1044                    YamlToken::Newline => {
1045                        builder.token(SyntaxKind::NEWLINE.into(), tokens[*i].text)
1046                    }
1047                    YamlToken::Whitespace => {
1048                        builder.token(SyntaxKind::WHITESPACE.into(), tokens[*i].text)
1049                    }
1050                    YamlToken::Comment => {
1051                        builder.token(SyntaxKind::YAML_COMMENT.into(), tokens[*i].text)
1052                    }
1053                    _ => unreachable!("only trivia in this range"),
1054                }
1055                *i += 1;
1056            }
1057        }
1058    }
1059    if *i < tokens.len() && tokens[*i].kind == YamlToken::DocumentEnd {
1060        builder.token(SyntaxKind::YAML_DOCUMENT_END.into(), tokens[*i].text);
1061        *i += 1;
1062        if *i < tokens.len() && tokens[*i].kind == YamlToken::Newline {
1063            builder.token(SyntaxKind::NEWLINE.into(), tokens[*i].text);
1064            *i += 1;
1065        }
1066    }
1067
1068    Ok(())
1069}
1070
/// Which body emitter `emit_document` dispatches to, decided from the first
/// non-trivia token after the document prefix.
#[derive(Clone, Copy)]
enum DocumentBody {
    /// No body: the next significant token is `---`, `...`, or there is none.
    Empty,
    /// The next significant token is a block-sequence entry.
    BlockSequence,
    /// Default for any other content, including tagless scalar documents.
    BlockMap,
    /// The document contains a tag but no colon before the next document
    /// marker (e.g. `!!str foo`).
    Scalar,
}