Skip to main content

panache_parser/parser/yaml/
parser.rs

1use crate::syntax::{SyntaxKind, SyntaxNode};
2use rowan::GreenNodeBuilder;
3
4use super::lexer::{lex_mapping_tokens_with_diagnostic, split_once_unquoted};
5use super::model::{
6    ShadowYamlOptions, ShadowYamlOutcome, ShadowYamlReport, YamlDiagnostic, YamlInputKind,
7    YamlParseReport, YamlToken, YamlTokenSpan, diagnostic_codes,
8};
9
10/// Parse YAML in shadow mode using prototype groundwork only.
11///
12/// This API is intentionally read-only and does not replace production YAML
13/// parsing. By default it is disabled and reports `SkippedDisabled`.
14pub fn parse_shadow(input: &str, options: ShadowYamlOptions) -> ShadowYamlReport {
15    let line_count = input.lines().count().max(1);
16
17    if !options.enabled {
18        return ShadowYamlReport {
19            outcome: ShadowYamlOutcome::SkippedDisabled,
20            shadow_reason: "shadow-disabled",
21            input_kind: options.input_kind,
22            input_len_bytes: input.len(),
23            line_count,
24            normalized_input: None,
25        };
26    }
27
28    let normalized = match options.input_kind {
29        YamlInputKind::Plain => input.to_owned(),
30        YamlInputKind::Hashpipe => normalize_hashpipe_input(input),
31    };
32
33    let parsed = parse_yaml_tree(&normalized).is_some();
34
35    ShadowYamlReport {
36        outcome: if parsed {
37            ShadowYamlOutcome::PrototypeParsed
38        } else {
39            ShadowYamlOutcome::PrototypeRejected
40        },
41        shadow_reason: if parsed {
42            "prototype-basic-mapping-parsed"
43        } else {
44            "prototype-basic-mapping-rejected"
45        },
46        input_kind: options.input_kind,
47        input_len_bytes: input.len(),
48        line_count,
49        normalized_input: Some(normalized),
50    }
51}
52
53fn normalize_hashpipe_input(input: &str) -> String {
54    input
55        .lines()
56        .map(strip_hashpipe_prefix)
57        .collect::<Vec<_>>()
58        .join("\n")
59}
60
/// Remove a leading `#|` marker, plus at most one space directly after it.
///
/// Lines that do not start with `#|` (including lines with leading
/// whitespace before the marker) are returned unchanged.
fn strip_hashpipe_prefix(line: &str) -> &str {
    match line.strip_prefix("#|") {
        // Only a single separating space is dropped; further indentation is kept.
        Some(after_marker) => after_marker.strip_prefix(' ').unwrap_or(after_marker),
        None => line,
    }
}
67
68fn emit_token_as_yaml(builder: &mut GreenNodeBuilder<'_>, token: &YamlTokenSpan<'_>) {
69    let kind = match token.kind {
70        YamlToken::Whitespace => SyntaxKind::WHITESPACE,
71        YamlToken::Comment => SyntaxKind::YAML_COMMENT,
72        YamlToken::Tag => SyntaxKind::YAML_TAG,
73        YamlToken::Colon => SyntaxKind::YAML_COLON,
74        _ => SyntaxKind::YAML_SCALAR,
75    };
76    builder.token(kind.into(), token.text);
77}
78
79fn diag_at_token(
80    token: &YamlTokenSpan<'_>,
81    code: &'static str,
82    message: &'static str,
83) -> YamlDiagnostic {
84    YamlDiagnostic {
85        code,
86        message,
87        byte_start: token.byte_start,
88        byte_end: token.byte_end,
89    }
90}
91
92fn emit_flow_sequence<'a>(
93    builder: &mut GreenNodeBuilder<'_>,
94    tokens: &[YamlTokenSpan<'a>],
95    i: &mut usize,
96) -> Result<(), YamlDiagnostic> {
97    if *i >= tokens.len() || tokens[*i].kind != YamlToken::FlowSeqStart {
98        return Err(YamlDiagnostic {
99            code: diagnostic_codes::PARSE_EXPECTED_FLOW_SEQUENCE_START,
100            message: "expected flow sequence start token",
101            byte_start: tokens.get(*i).map(|t| t.byte_start).unwrap_or(0),
102            byte_end: tokens.get(*i).map(|t| t.byte_end).unwrap_or(0),
103        });
104    }
105
106    builder.start_node(SyntaxKind::YAML_FLOW_SEQUENCE.into());
107    emit_token_as_yaml(builder, &tokens[*i]); // [
108    *i += 1;
109
110    let mut open_item = false;
111    while *i < tokens.len() {
112        match tokens[*i].kind {
113            YamlToken::FlowSeqEnd => {
114                if open_item {
115                    builder.finish_node(); // YAML_FLOW_SEQUENCE_ITEM
116                }
117                emit_token_as_yaml(builder, &tokens[*i]); // ]
118                *i += 1;
119                if *i < tokens.len() {
120                    match tokens[*i].kind {
121                        YamlToken::Newline | YamlToken::Comment => {}
122                        YamlToken::Whitespace if tokens[*i].text.trim().is_empty() => {}
123                        _ => {
124                            return Err(diag_at_token(
125                                &tokens[*i],
126                                diagnostic_codes::PARSE_TRAILING_CONTENT_AFTER_FLOW_END,
127                                "trailing content after flow sequence end",
128                            ));
129                        }
130                    }
131                }
132                builder.finish_node(); // YAML_FLOW_SEQUENCE
133                return Ok(());
134            }
135            YamlToken::Comma => {
136                if !open_item {
137                    return Err(diag_at_token(
138                        &tokens[*i],
139                        diagnostic_codes::PARSE_INVALID_FLOW_SEQUENCE_COMMA,
140                        "invalid comma position in flow sequence",
141                    ));
142                }
143                builder.finish_node(); // YAML_FLOW_SEQUENCE_ITEM
144                open_item = false;
145                emit_token_as_yaml(builder, &tokens[*i]);
146                *i += 1;
147            }
148            YamlToken::Whitespace if !open_item => {
149                emit_token_as_yaml(builder, &tokens[*i]);
150                *i += 1;
151            }
152            YamlToken::Scalar if !open_item && tokens[*i].text.trim().is_empty() => {
153                emit_token_as_yaml(builder, &tokens[*i]);
154                *i += 1;
155            }
156            YamlToken::FlowSeqStart => {
157                if !open_item {
158                    builder.start_node(SyntaxKind::YAML_FLOW_SEQUENCE_ITEM.into());
159                    open_item = true;
160                }
161                emit_flow_sequence(builder, tokens, i)?;
162            }
163            YamlToken::FlowMapStart => {
164                if !open_item {
165                    builder.start_node(SyntaxKind::YAML_FLOW_SEQUENCE_ITEM.into());
166                    open_item = true;
167                }
168                emit_flow_map(builder, tokens, i)?;
169            }
170            _ => {
171                if !open_item {
172                    builder.start_node(SyntaxKind::YAML_FLOW_SEQUENCE_ITEM.into());
173                    open_item = true;
174                }
175                emit_token_as_yaml(builder, &tokens[*i]);
176                *i += 1;
177            }
178        }
179    }
180
181    let (byte_start, byte_end) =
182        if let Some(start) = tokens.iter().find(|t| t.kind == YamlToken::FlowSeqStart) {
183            (
184                start.byte_start,
185                tokens.last().map(|t| t.byte_end).unwrap_or(start.byte_end),
186            )
187        } else {
188            tokens
189                .last()
190                .map(|t| (t.byte_start, t.byte_end))
191                .unwrap_or((0, 0))
192        };
193    Err(YamlDiagnostic {
194        code: diagnostic_codes::PARSE_UNTERMINATED_FLOW_SEQUENCE,
195        message: "unterminated flow sequence",
196        byte_start,
197        byte_end,
198    })
199}
200
201fn emit_flow_map<'a>(
202    builder: &mut GreenNodeBuilder<'_>,
203    tokens: &[YamlTokenSpan<'a>],
204    i: &mut usize,
205) -> Result<(), YamlDiagnostic> {
206    if *i >= tokens.len() || tokens[*i].kind != YamlToken::FlowMapStart {
207        return Err(YamlDiagnostic {
208            code: diagnostic_codes::PARSE_EXPECTED_FLOW_MAP_START,
209            message: "expected flow map start token",
210            byte_start: tokens.get(*i).map(|t| t.byte_start).unwrap_or(0),
211            byte_end: tokens.get(*i).map(|t| t.byte_end).unwrap_or(0),
212        });
213    }
214
215    builder.start_node(SyntaxKind::YAML_FLOW_MAP.into());
216    emit_token_as_yaml(builder, &tokens[*i]); // {
217    *i += 1;
218
219    loop {
220        // Skip inter-entry whitespace and newlines. The flow lexer chunks
221        // text between flow indicators into Scalar tokens, including
222        // whitespace-only chunks like the space in `, }` — treat those as
223        // trivia here so they do not synthesize phantom entries.
224        while *i < tokens.len()
225            && (matches!(tokens[*i].kind, YamlToken::Whitespace | YamlToken::Newline)
226                || (tokens[*i].kind == YamlToken::Scalar && tokens[*i].text.trim().is_empty()))
227        {
228            emit_token_as_yaml(builder, &tokens[*i]);
229            *i += 1;
230        }
231
232        if *i >= tokens.len() {
233            let (byte_start, byte_end) = tokens
234                .last()
235                .map(|t| (t.byte_start, t.byte_end))
236                .unwrap_or((0, 0));
237            return Err(YamlDiagnostic {
238                code: diagnostic_codes::PARSE_UNTERMINATED_FLOW_MAP,
239                message: "unterminated flow map",
240                byte_start,
241                byte_end,
242            });
243        }
244
245        match tokens[*i].kind {
246            YamlToken::FlowMapEnd => {
247                emit_token_as_yaml(builder, &tokens[*i]);
248                *i += 1;
249                if *i < tokens.len() {
250                    match tokens[*i].kind {
251                        YamlToken::Newline
252                        | YamlToken::Comment
253                        | YamlToken::Whitespace
254                        | YamlToken::FlowMapEnd
255                        | YamlToken::FlowSeqEnd
256                        | YamlToken::Comma => {}
257                        _ => {
258                            return Err(diag_at_token(
259                                &tokens[*i],
260                                diagnostic_codes::PARSE_TRAILING_CONTENT_AFTER_FLOW_END,
261                                "trailing content after flow map end",
262                            ));
263                        }
264                    }
265                }
266                builder.finish_node(); // YAML_FLOW_MAP
267                return Ok(());
268            }
269            YamlToken::Comma => {
270                emit_token_as_yaml(builder, &tokens[*i]);
271                *i += 1;
272            }
273            _ => {
274                emit_flow_map_entry(builder, tokens, i)?;
275            }
276        }
277    }
278}
279
280fn emit_flow_map_entry<'a>(
281    builder: &mut GreenNodeBuilder<'_>,
282    tokens: &[YamlTokenSpan<'a>],
283    i: &mut usize,
284) -> Result<(), YamlDiagnostic> {
285    builder.start_node(SyntaxKind::YAML_FLOW_MAP_ENTRY.into());
286    builder.start_node(SyntaxKind::YAML_FLOW_MAP_KEY.into());
287
288    // Emit leading whitespace inside key node
289    while *i < tokens.len() && tokens[*i].kind == YamlToken::Whitespace {
290        emit_token_as_yaml(builder, &tokens[*i]);
291        *i += 1;
292    }
293
294    // Determine key/value split point
295    let value_prefix: Option<&'a str> = match tokens.get(*i).map(|t| t.kind) {
296        Some(YamlToken::Scalar) => {
297            let scalar = tokens[*i];
298            *i += 1;
299            if let Some((key_text, rest_text)) = split_once_unquoted(scalar.text, ':') {
300                builder.token(SyntaxKind::YAML_KEY.into(), key_text);
301                builder.token(
302                    SyntaxKind::YAML_COLON.into(),
303                    &scalar.text[key_text.len()..key_text.len() + 1],
304                );
305                Some(rest_text)
306            } else {
307                // No colon — standalone scalar (implicit key or value-only)
308                builder.token(SyntaxKind::YAML_SCALAR.into(), scalar.text);
309                None
310            }
311        }
312        Some(YamlToken::Key) => {
313            // Already-tokenized key (from multi-line block lexing inside a flow map)
314            builder.token(SyntaxKind::YAML_KEY.into(), tokens[*i].text);
315            *i += 1;
316            while *i < tokens.len() && tokens[*i].kind == YamlToken::Whitespace {
317                emit_token_as_yaml(builder, &tokens[*i]);
318                *i += 1;
319            }
320            if *i < tokens.len() && tokens[*i].kind == YamlToken::Colon {
321                builder.token(SyntaxKind::YAML_COLON.into(), tokens[*i].text);
322                *i += 1;
323            }
324            None
325        }
326        Some(YamlToken::Tag) => {
327            emit_token_as_yaml(builder, &tokens[*i]);
328            *i += 1;
329            None
330        }
331        _ => None,
332    };
333
334    builder.finish_node(); // YAML_FLOW_MAP_KEY
335
336    builder.start_node(SyntaxKind::YAML_FLOW_MAP_VALUE.into());
337    if let Some(prefix) = value_prefix
338        && !prefix.is_empty()
339    {
340        builder.token(SyntaxKind::YAML_SCALAR.into(), prefix);
341    }
342    emit_flow_value_tokens(builder, tokens, i)?;
343    builder.finish_node(); // YAML_FLOW_MAP_VALUE
344
345    builder.finish_node(); // YAML_FLOW_MAP_ENTRY
346    Ok(())
347}
348
349fn emit_flow_value_tokens<'a>(
350    builder: &mut GreenNodeBuilder<'_>,
351    tokens: &[YamlTokenSpan<'a>],
352    i: &mut usize,
353) -> Result<(), YamlDiagnostic> {
354    while *i < tokens.len() {
355        match tokens[*i].kind {
356            YamlToken::Comma | YamlToken::FlowMapEnd | YamlToken::FlowSeqEnd => break,
357            YamlToken::FlowMapStart => emit_flow_map(builder, tokens, i)?,
358            YamlToken::FlowSeqStart => emit_flow_sequence(builder, tokens, i)?,
359            _ => {
360                emit_token_as_yaml(builder, &tokens[*i]);
361                *i += 1;
362            }
363        }
364    }
365    Ok(())
366}
367
368fn emit_scalar_document<'a>(
369    builder: &mut GreenNodeBuilder<'_>,
370    tokens: &[YamlTokenSpan<'a>],
371    i: &mut usize,
372) -> Result<(), YamlDiagnostic> {
373    while *i < tokens.len() {
374        let kind = match tokens[*i].kind {
375            YamlToken::Newline => SyntaxKind::NEWLINE,
376            // Document boundaries close the scalar body; the stream loop will
377            // emit them at the YAML_DOCUMENT level.
378            YamlToken::DocumentStart | YamlToken::DocumentEnd => break,
379            YamlToken::Tag => SyntaxKind::YAML_TAG,
380            YamlToken::Comment => SyntaxKind::YAML_COMMENT,
381            YamlToken::Whitespace => SyntaxKind::WHITESPACE,
382            YamlToken::Colon => SyntaxKind::YAML_COLON,
383            YamlToken::FlowMapStart
384            | YamlToken::FlowMapEnd
385            | YamlToken::FlowSeqStart
386            | YamlToken::FlowSeqEnd
387            | YamlToken::Comma => {
388                return Err(diag_at_token(
389                    &tokens[*i],
390                    diagnostic_codes::PARSE_UNEXPECTED_FLOW_CLOSER,
391                    "unexpected flow indicator in plain scalar document",
392                ));
393            }
394            _ => SyntaxKind::YAML_SCALAR,
395        };
396        builder.token(kind.into(), tokens[*i].text);
397        *i += 1;
398    }
399    Ok(())
400}
401
402fn emit_block_seq<'a>(
403    builder: &mut GreenNodeBuilder<'_>,
404    tokens: &[YamlTokenSpan<'a>],
405    i: &mut usize,
406    stop_on_dedent: bool,
407) -> Result<(), YamlDiagnostic> {
408    while *i < tokens.len() {
409        match tokens[*i].kind {
410            YamlToken::Newline => {
411                builder.token(SyntaxKind::NEWLINE.into(), tokens[*i].text);
412                *i += 1;
413            }
414            // Document boundaries close the body; the stream loop will pick
415            // them up at the YAML_DOCUMENT level.
416            YamlToken::DocumentStart | YamlToken::DocumentEnd => break,
417            YamlToken::Whitespace => {
418                // Between-item indentation in a nested sequence.
419                builder.token(SyntaxKind::WHITESPACE.into(), tokens[*i].text);
420                *i += 1;
421            }
422            YamlToken::Dedent => {
423                if stop_on_dedent {
424                    *i += 1;
425                    break;
426                }
427                break;
428            }
429            YamlToken::BlockSeqEntry => emit_block_seq_item(builder, tokens, i)?,
430            _ => break,
431        }
432    }
433    Ok(())
434}
435
/// Emit one block-sequence item as a `YAML_BLOCK_SEQUENCE_ITEM` node.
///
/// The token at `*i` is emitted as the `YAML_BLOCK_SEQ_ENTRY` marker without
/// checking its kind; `emit_block_seq` only dispatches here for a
/// `BlockSeqEntry` token. The rest of the line is then emitted inside the
/// item, with three special cases:
///
/// * flow sequences / flow maps are emitted recursively;
/// * an `Indent` on the same line opens a nested `YAML_BLOCK_SEQUENCE`;
/// * an `Indent` right after the line's newline opens a nested
///   `YAML_BLOCK_MAP`.
///
/// # Errors
///
/// Propagates errors from the nested flow or block emitters.
fn emit_block_seq_item<'a>(
    builder: &mut GreenNodeBuilder<'_>,
    tokens: &[YamlTokenSpan<'a>],
    i: &mut usize,
) -> Result<(), YamlDiagnostic> {
    builder.start_node(SyntaxKind::YAML_BLOCK_SEQUENCE_ITEM.into());
    builder.token(SyntaxKind::YAML_BLOCK_SEQ_ENTRY.into(), tokens[*i].text);
    *i += 1;
    // Set when the item body was a nested sequence; the newline and
    // nested-map handling below is then skipped.
    let mut closed_via_nested_seq = false;
    while *i < tokens.len() && tokens[*i].kind != YamlToken::Newline {
        match tokens[*i].kind {
            YamlToken::FlowSeqStart => emit_flow_sequence(builder, tokens, i)?,
            YamlToken::FlowMapStart => emit_flow_map(builder, tokens, i)?,
            YamlToken::Indent => {
                // Nested block sequence triggered by `- - ...`: the lexer
                // emitted an Indent between the outer `- ` and the inner
                // `-`. Recurse; the nested emitter consumes through the
                // matching Dedent (including any intervening Newlines), so
                // the outer item has no trailing Newline to emit.
                *i += 1;
                builder.start_node(SyntaxKind::YAML_BLOCK_SEQUENCE.into());
                emit_block_seq(builder, tokens, i, true)?;
                builder.finish_node(); // YAML_BLOCK_SEQUENCE
                closed_via_nested_seq = true;
                break;
            }
            _ => {
                // Any other token on the item's line goes through the generic mapping.
                emit_token_as_yaml(builder, &tokens[*i]);
                *i += 1;
            }
        }
    }
    // The line's terminating newline belongs to the item.
    if !closed_via_nested_seq && *i < tokens.len() && tokens[*i].kind == YamlToken::Newline {
        builder.token(SyntaxKind::NEWLINE.into(), tokens[*i].text);
        *i += 1;
    }
    // Nested block map following a bare `-\n` entry: lexer has emitted an
    // Indent after the Newline, terminated by a Dedent.
    if !closed_via_nested_seq && *i < tokens.len() && tokens[*i].kind == YamlToken::Indent {
        *i += 1;
        builder.start_node(SyntaxKind::YAML_BLOCK_MAP.into());
        // `true`: the nested map must be closed by its own Dedent, otherwise
        // emit_block_map reports an unterminated block map.
        emit_block_map(builder, tokens, i, true)?;
        builder.finish_node(); // YAML_BLOCK_MAP
    }
    builder.finish_node(); // YAML_BLOCK_SEQUENCE_ITEM
    Ok(())
}
483
484fn emit_block_map<'a>(
485    builder: &mut GreenNodeBuilder<'_>,
486    tokens: &[YamlTokenSpan<'a>],
487    i: &mut usize,
488    stop_on_dedent: bool,
489) -> Result<(), YamlDiagnostic> {
490    let mut closed_by_dedent = false;
491    while *i < tokens.len() {
492        match tokens[*i].kind {
493            YamlToken::Newline => {
494                builder.token(SyntaxKind::NEWLINE.into(), tokens[*i].text);
495                *i += 1;
496            }
497            // Document boundaries close the body; the stream loop picks them
498            // up at the YAML_DOCUMENT level.
499            YamlToken::DocumentStart | YamlToken::DocumentEnd => break,
500            YamlToken::Directive | YamlToken::Comma => {
501                builder.token(SyntaxKind::YAML_SCALAR.into(), tokens[*i].text);
502                *i += 1;
503            }
504            YamlToken::FlowMapEnd | YamlToken::FlowSeqEnd => {
505                return Err(diag_at_token(
506                    &tokens[*i],
507                    diagnostic_codes::PARSE_UNEXPECTED_FLOW_CLOSER,
508                    "unexpected flow closing token",
509                ));
510            }
511            YamlToken::FlowMapStart | YamlToken::FlowSeqStart => {
512                if tokens[*i].kind == YamlToken::FlowMapStart {
513                    emit_flow_map(builder, tokens, i)?;
514                } else {
515                    emit_flow_sequence(builder, tokens, i)?;
516                }
517            }
518            YamlToken::Anchor
519            | YamlToken::Alias
520            | YamlToken::BlockScalarHeader
521            | YamlToken::BlockScalarContent => {
522                builder.token(SyntaxKind::YAML_SCALAR.into(), tokens[*i].text);
523                *i += 1;
524            }
525            YamlToken::Scalar | YamlToken::Comment => {
526                while *i < tokens.len() && tokens[*i].kind != YamlToken::Newline {
527                    if matches!(
528                        tokens[*i].kind,
529                        YamlToken::FlowMapEnd | YamlToken::FlowSeqEnd
530                    ) {
531                        return Err(diag_at_token(
532                            &tokens[*i],
533                            diagnostic_codes::PARSE_UNEXPECTED_FLOW_CLOSER,
534                            "unexpected flow closing token",
535                        ));
536                    }
537                    emit_token_as_yaml(builder, &tokens[*i]);
538                    *i += 1;
539                }
540            }
541            YamlToken::Indent => {
542                return Err(diag_at_token(
543                    &tokens[*i],
544                    diagnostic_codes::PARSE_UNEXPECTED_INDENT,
545                    "unexpected indent token while parsing block map",
546                ));
547            }
548            YamlToken::Dedent => {
549                if stop_on_dedent {
550                    *i += 1;
551                    closed_by_dedent = true;
552                    break;
553                }
554                return Err(diag_at_token(
555                    &tokens[*i],
556                    diagnostic_codes::PARSE_UNEXPECTED_DEDENT,
557                    "unexpected dedent token while parsing block map",
558                ));
559            }
560            _ => emit_block_map_entry(builder, tokens, i)?,
561        }
562    }
563
564    if stop_on_dedent && !closed_by_dedent {
565        let (byte_start, byte_end) = tokens
566            .last()
567            .map(|t| (t.byte_start, t.byte_end))
568            .unwrap_or((0, 0));
569        return Err(YamlDiagnostic {
570            code: diagnostic_codes::PARSE_UNTERMINATED_BLOCK_MAP,
571            message: "unterminated indented block map",
572            byte_start,
573            byte_end,
574        });
575    }
576
577    Ok(())
578}
579
580fn emit_block_map_entry<'a>(
581    builder: &mut GreenNodeBuilder<'_>,
582    tokens: &[YamlTokenSpan<'a>],
583    i: &mut usize,
584) -> Result<(), YamlDiagnostic> {
585    builder.start_node(SyntaxKind::YAML_BLOCK_MAP_ENTRY.into());
586    emit_block_map_key(builder, tokens, i)?;
587    let trailing_newline = emit_block_map_value(builder, tokens, i)?;
588    if let Some(newline) = trailing_newline {
589        builder.token(SyntaxKind::NEWLINE.into(), newline);
590    }
591    builder.finish_node(); // YAML_BLOCK_MAP_ENTRY
592    Ok(())
593}
594
595fn emit_block_map_key<'a>(
596    builder: &mut GreenNodeBuilder<'_>,
597    tokens: &[YamlTokenSpan<'a>],
598    i: &mut usize,
599) -> Result<(), YamlDiagnostic> {
600    builder.start_node(SyntaxKind::YAML_BLOCK_MAP_KEY.into());
601
602    let mut saw_colon = false;
603    while *i < tokens.len() {
604        match tokens[*i].kind {
605            YamlToken::Key => {
606                builder.token(SyntaxKind::YAML_KEY.into(), tokens[*i].text);
607                *i += 1;
608            }
609            YamlToken::Tag => {
610                builder.token(SyntaxKind::YAML_TAG.into(), tokens[*i].text);
611                *i += 1;
612            }
613            YamlToken::Whitespace => {
614                builder.token(SyntaxKind::WHITESPACE.into(), tokens[*i].text);
615                *i += 1;
616            }
617            YamlToken::Colon => {
618                builder.token(SyntaxKind::YAML_COLON.into(), tokens[*i].text);
619                *i += 1;
620                saw_colon = true;
621                break;
622            }
623            _ => {
624                return Err(diag_at_token(
625                    &tokens[*i],
626                    diagnostic_codes::PARSE_INVALID_KEY_TOKEN,
627                    "invalid token while parsing block map key",
628                ));
629            }
630        }
631    }
632    if !saw_colon {
633        return Err(diag_at_token(
634            &tokens[(*i).saturating_sub(1)],
635            diagnostic_codes::PARSE_MISSING_COLON,
636            "missing colon in block map entry",
637        ));
638    }
639    builder.finish_node(); // YAML_BLOCK_MAP_KEY
640    Ok(())
641}
642
/// Emit `YAML_BLOCK_MAP_VALUE` and return the trailing newline (if any) that
/// the caller should emit after the value node closes.
///
/// Inline value tokens (scalars, flow collections, anchors/aliases, block
/// scalars, tags, comments, whitespace) are consumed first. A following
/// `Newline` is then held back rather than emitted:
///
/// * if an `Indent` comes next, the newline is emitted inside the value node
///   followed by a nested `YAML_BLOCK_MAP`, and `None` is returned;
/// * otherwise the newline text is returned as `Some(..)` so the caller can
///   emit it after this node.
///
/// Either way tokens are emitted in source byte order.
///
/// # Errors
///
/// Propagates errors from nested flow collections and from the nested block
/// map.
fn emit_block_map_value<'a>(
    builder: &mut GreenNodeBuilder<'_>,
    tokens: &[YamlTokenSpan<'a>],
    i: &mut usize,
) -> Result<Option<&'a str>, YamlDiagnostic> {
    builder.start_node(SyntaxKind::YAML_BLOCK_MAP_VALUE.into());
    while *i < tokens.len() {
        match tokens[*i].kind {
            YamlToken::Scalar => {
                builder.token(SyntaxKind::YAML_SCALAR.into(), tokens[*i].text);
                *i += 1;
            }
            YamlToken::FlowMapStart => emit_flow_map(builder, tokens, i)?,
            YamlToken::FlowSeqStart => emit_flow_sequence(builder, tokens, i)?,
            YamlToken::Anchor | YamlToken::Alias => {
                builder.token(SyntaxKind::YAML_SCALAR.into(), tokens[*i].text);
                *i += 1;
            }
            YamlToken::BlockScalarHeader => {
                // Header plus all following content lines are consumed here.
                consume_block_scalar(builder, tokens, i);
            }
            YamlToken::BlockScalarContent => {
                builder.token(SyntaxKind::YAML_SCALAR.into(), tokens[*i].text);
                *i += 1;
            }
            // Left unconsumed for the caller to deal with.
            YamlToken::FlowMapEnd | YamlToken::FlowSeqEnd | YamlToken::Comma => break,
            YamlToken::Tag => {
                builder.token(SyntaxKind::YAML_TAG.into(), tokens[*i].text);
                *i += 1;
            }
            YamlToken::Comment => {
                builder.token(SyntaxKind::YAML_COMMENT.into(), tokens[*i].text);
                *i += 1;
            }
            YamlToken::Whitespace => {
                builder.token(SyntaxKind::WHITESPACE.into(), tokens[*i].text);
                *i += 1;
            }
            // Newline, Indent, Dedent, document markers, etc. end the inline value.
            _ => break,
        }
    }

    // Consume the line's newline but do not emit it yet; where it goes
    // depends on whether a nested map follows.
    let mut trailing_newline: Option<&str> = None;
    if *i < tokens.len() && tokens[*i].kind == YamlToken::Newline {
        trailing_newline = Some(tokens[*i].text);
        *i += 1;
    }

    if *i < tokens.len() && tokens[*i].kind == YamlToken::Indent {
        *i += 1;
        // Emit trailing newline before nested content to preserve byte order.
        if let Some(newline) = trailing_newline.take() {
            builder.token(SyntaxKind::NEWLINE.into(), newline);
        }
        builder.start_node(SyntaxKind::YAML_BLOCK_MAP.into());
        // `true`: the nested map must be closed by its own Dedent.
        emit_block_map(builder, tokens, i, true)?;
        builder.finish_node(); // YAML_BLOCK_MAP
    }

    builder.finish_node(); // YAML_BLOCK_MAP_VALUE
    Ok(trailing_newline)
}
709
710/// Consume a literal/folded block-scalar header (`|` / `>`) and the
711/// following content lines. Each line is emitted as a `YAML_SCALAR` token
712/// with `NEWLINE` separators. Blank-line newlines that belong to the scalar
713/// body are absorbed so the entire body lives inside the value node.
714fn consume_block_scalar<'a>(
715    builder: &mut GreenNodeBuilder<'_>,
716    tokens: &[YamlTokenSpan<'a>],
717    i: &mut usize,
718) {
719    builder.token(SyntaxKind::YAML_SCALAR.into(), tokens[*i].text);
720    *i += 1;
721    while *i < tokens.len() {
722        match tokens[*i].kind {
723            YamlToken::Newline => {
724                builder.token(SyntaxKind::NEWLINE.into(), tokens[*i].text);
725                *i += 1;
726                if *i < tokens.len()
727                    && matches!(
728                        tokens[*i].kind,
729                        YamlToken::BlockScalarContent | YamlToken::Newline
730                    )
731                {
732                    continue;
733                }
734                break;
735            }
736            YamlToken::BlockScalarContent => {
737                builder.token(SyntaxKind::YAML_SCALAR.into(), tokens[*i].text);
738                *i += 1;
739            }
740            _ => break,
741        }
742    }
743}
744
745/// Parse prototype YAML tree structure from input
746pub fn parse_yaml_tree(input: &str) -> Option<SyntaxNode> {
747    parse_yaml_report(input).tree
748}
749
750/// Parse prototype YAML tree structure and include diagnostics on failure.
751pub fn parse_yaml_report(input: &str) -> YamlParseReport {
752    let tokens = match lex_mapping_tokens_with_diagnostic(input) {
753        Ok(tokens) => tokens,
754        Err(err) => {
755            return YamlParseReport {
756                tree: None,
757                diagnostics: vec![err],
758            };
759        }
760    };
761
762    let mut seen_content = false;
763    for token in &tokens {
764        match token.kind {
765            YamlToken::Directive if seen_content => {
766                return YamlParseReport {
767                    tree: None,
768                    diagnostics: vec![diag_at_token(
769                        token,
770                        diagnostic_codes::PARSE_DIRECTIVE_AFTER_CONTENT,
771                        "directive requires document end before subsequent directives",
772                    )],
773                };
774            }
775            YamlToken::Directive
776            | YamlToken::Newline
777            | YamlToken::Whitespace
778            | YamlToken::Comment => {}
779            YamlToken::DocumentEnd => seen_content = false,
780            _ => seen_content = true,
781        }
782    }
783
784    if let Some(directive) = tokens.iter().find(|t| t.kind == YamlToken::Directive)
785        && !tokens.iter().any(|t| t.kind == YamlToken::DocumentStart)
786    {
787        return YamlParseReport {
788            tree: None,
789            diagnostics: vec![diag_at_token(
790                directive,
791                diagnostic_codes::PARSE_DIRECTIVE_WITHOUT_DOCUMENT_START,
792                "directive requires an explicit document start marker",
793            )],
794        };
795    }
796
797    let mut builder = GreenNodeBuilder::new();
798    builder.start_node(SyntaxKind::DOCUMENT.into());
799    builder.start_node(SyntaxKind::YAML_METADATA_CONTENT.into());
800    builder.start_node(SyntaxKind::YAML_STREAM.into());
801    if let Err(err) = parse_stream(&mut builder, &tokens) {
802        return YamlParseReport {
803            tree: None,
804            diagnostics: vec![err],
805        };
806    }
807    builder.finish_node(); // YAML_STREAM
808    builder.finish_node(); // YAML_METADATA_CONTENT
809    builder.finish_node(); // DOCUMENT
810    YamlParseReport {
811        tree: Some(SyntaxNode::new_root(builder.finish())),
812        diagnostics: Vec::new(),
813    }
814}
815
816/// Outer stream loop. Walks every token and emits zero or more `YAML_DOCUMENT`
817/// nodes interleaved with stream-level trivia (newlines, whitespace, comments,
818/// and bare `...` markers that don't bracket a document body).
819fn parse_stream<'a>(
820    builder: &mut GreenNodeBuilder<'_>,
821    tokens: &[YamlTokenSpan<'a>],
822) -> Result<(), YamlDiagnostic> {
823    let mut i = 0usize;
824    while i < tokens.len() {
825        match tokens[i].kind {
826            YamlToken::Newline => {
827                builder.token(SyntaxKind::NEWLINE.into(), tokens[i].text);
828                i += 1;
829            }
830            YamlToken::Whitespace => {
831                builder.token(SyntaxKind::WHITESPACE.into(), tokens[i].text);
832                i += 1;
833            }
834            YamlToken::Comment => {
835                builder.token(SyntaxKind::YAML_COMMENT.into(), tokens[i].text);
836                i += 1;
837            }
838            // Indent/Dedent are zero-width balance markers from the lexer.
839            // If they leak out of a body emitter (e.g. trailing Dedent at
840            // end of input), absorb them silently — they carry no bytes.
841            YamlToken::Indent | YamlToken::Dedent => {
842                i += 1;
843            }
844            // Bare `...` at stream level — no preceding document body, no
845            // following body before another `...`/EOF — is stream-level
846            // trivia, not its own document.
847            YamlToken::DocumentEnd if !document_follows(tokens, i + 1) => {
848                builder.token(SyntaxKind::YAML_DOCUMENT_END.into(), tokens[i].text);
849                i += 1;
850            }
851            _ => {
852                builder.start_node(SyntaxKind::YAML_DOCUMENT.into());
853                emit_document(builder, tokens, &mut i)?;
854                builder.finish_node(); // YAML_DOCUMENT
855            }
856        }
857    }
858    Ok(())
859}
860
861/// Returns `true` if the tokens at or after `start` contain any
862/// document-defining token (directive, doc-start, body content, doc-end). We
863/// use this to decide whether a bare `...` is "the end of nothing" (stream
864/// trivia) or actually closes a document yet to come (still trivia, just at
865/// stream level).
866fn document_follows(tokens: &[YamlTokenSpan<'_>], start: usize) -> bool {
867    tokens[start..].iter().any(|t| {
868        !matches!(
869            t.kind,
870            YamlToken::Newline
871                | YamlToken::Whitespace
872                | YamlToken::Comment
873                | YamlToken::DocumentEnd
874        )
875    })
876}
877
878/// Emit a single `YAML_DOCUMENT`. Optionally consumes leading directives and a
879/// `---` marker, dispatches to the body emitter, then optionally consumes a
880/// trailing `...` marker. Each phase is forgiving: an absent `---`, absent
881/// `...`, or empty body is fine.
882fn emit_document<'a>(
883    builder: &mut GreenNodeBuilder<'_>,
884    tokens: &[YamlTokenSpan<'a>],
885    i: &mut usize,
886) -> Result<(), YamlDiagnostic> {
887    // Phase 1: optional directives + `---` marker (with intervening trivia).
888    let mut saw_marker = false;
889    while *i < tokens.len() {
890        match tokens[*i].kind {
891            YamlToken::Directive => {
892                builder.token(SyntaxKind::YAML_SCALAR.into(), tokens[*i].text);
893                *i += 1;
894            }
895            YamlToken::Newline => {
896                builder.token(SyntaxKind::NEWLINE.into(), tokens[*i].text);
897                *i += 1;
898            }
899            YamlToken::Whitespace => {
900                builder.token(SyntaxKind::WHITESPACE.into(), tokens[*i].text);
901                *i += 1;
902            }
903            YamlToken::Comment => {
904                builder.token(SyntaxKind::YAML_COMMENT.into(), tokens[*i].text);
905                *i += 1;
906            }
907            YamlToken::DocumentStart => {
908                builder.token(SyntaxKind::YAML_DOCUMENT_START.into(), tokens[*i].text);
909                *i += 1;
910                saw_marker = true;
911                if *i < tokens.len() && tokens[*i].kind == YamlToken::Newline {
912                    builder.token(SyntaxKind::NEWLINE.into(), tokens[*i].text);
913                    *i += 1;
914                }
915                break;
916            }
917            _ => break,
918        }
919    }
920    let _ = saw_marker;
921
922    // Phase 2: body.
923    let next_significant = tokens[*i..].iter().find(|t| {
924        !matches!(
925            t.kind,
926            YamlToken::Newline | YamlToken::Whitespace | YamlToken::Comment
927        )
928    });
929
930    let body_kind = match next_significant.map(|t| t.kind) {
931        Some(YamlToken::DocumentStart) | Some(YamlToken::DocumentEnd) | None => DocumentBody::Empty,
932        Some(YamlToken::BlockSeqEntry) => DocumentBody::BlockSequence,
933        _ => {
934            // Tagless scalar documents continue to dispatch to the block-map
935            // emitter for byte-level CST stability. Tagged scalar documents
936            // (e.g. `! a`, `!!str foo`) take the dedicated path because they
937            // lack a colon and would trip the key/colon expectation.
938            let mut has_colon = false;
939            let mut has_tag = false;
940            for tok in &tokens[*i..] {
941                match tok.kind {
942                    YamlToken::DocumentStart | YamlToken::DocumentEnd => break,
943                    YamlToken::Colon => has_colon = true,
944                    YamlToken::Tag => has_tag = true,
945                    _ => {}
946                }
947            }
948            if !has_colon && has_tag {
949                DocumentBody::Scalar
950            } else {
951                DocumentBody::BlockMap
952            }
953        }
954    };
955
956    match body_kind {
957        DocumentBody::Empty => {}
958        DocumentBody::BlockSequence => {
959            builder.start_node(SyntaxKind::YAML_BLOCK_SEQUENCE.into());
960            emit_block_seq(builder, tokens, i, false)?;
961            builder.finish_node(); // YAML_BLOCK_SEQUENCE
962        }
963        DocumentBody::Scalar => emit_scalar_document(builder, tokens, i)?,
964        DocumentBody::BlockMap => {
965            builder.start_node(SyntaxKind::YAML_BLOCK_MAP.into());
966            emit_block_map(builder, tokens, i, false)?;
967            builder.finish_node(); // YAML_BLOCK_MAP
968        }
969    }
970
971    // Phase 3: optional `...` marker (and its trailing newline). Trivia
972    // between the body and the marker that we did NOT consume into the body
973    // belongs to the stream, not this document, so we don't drain it here.
974    if *i < tokens.len() && tokens[*i].kind == YamlToken::DocumentEnd {
975        builder.token(SyntaxKind::YAML_DOCUMENT_END.into(), tokens[*i].text);
976        *i += 1;
977        if *i < tokens.len() && tokens[*i].kind == YamlToken::Newline {
978            builder.token(SyntaxKind::NEWLINE.into(), tokens[*i].text);
979            *i += 1;
980        }
981    }
982
983    Ok(())
984}
985
/// Shape of a document's body, chosen by `emit_document` before it dispatches
/// to a body emitter.
#[derive(Copy, Clone)]
enum DocumentBody {
    /// No body tokens before the next document marker or end of input.
    Empty,
    /// The first significant token is a block sequence entry.
    BlockSequence,
    /// Default dispatch: everything that is neither empty, a block sequence,
    /// nor a tagged colon-less scalar.
    BlockMap,
    /// The body carries a tag and no colon.
    Scalar,
}