Skip to main content

panache_parser/parser/yaml/
events.rs

1//! YAML event projection: walk a shadow-parser CST and produce a
2//! yaml-test-suite style event stream (`+STR`, `+DOC`, `+MAP`, `=VAL :foo`,
3//! ...).
4//!
5//! This module is parser-crate scoped and used only by the test harness in
6//! `crates/panache-parser/tests/yaml.rs` for fixture parity. It reads the
7//! green tree built by [`crate::parser::yaml::parse_yaml_tree`] and re-derives
8//! event-stream semantics (tag resolution, anchor stripping, flow-seq
9//! splitting). The intent is to keep the projection adjacent to the parser so
10//! CST shape is the single source of truth for events.
11
12use std::collections::HashMap;
13
14use crate::syntax::{SyntaxKind, SyntaxNode, SyntaxToken};
15
16use super::parser::parse_yaml_tree;
17
18/// Per-document tag handle map: handle (`!!`, `!yaml!`, `!e!`) → URI prefix.
19/// The secondary handle `!!` always defaults to `tag:yaml.org,2002:` per the
20/// YAML 1.2 spec. Per-document `%TAG` directives override and add to this map.
21type TagHandles = HashMap<String, String>;
22
23fn default_tag_handles() -> TagHandles {
24    let mut handles = HashMap::new();
25    handles.insert("!!".to_string(), "tag:yaml.org,2002:".to_string());
26    handles
27}
28
29/// Scan a `YAML_DOCUMENT` for `%TAG` directive lines and merge them into
30/// the default handle map.
31fn collect_tag_handles(doc: &SyntaxNode) -> TagHandles {
32    let mut handles = default_tag_handles();
33    for tok in doc
34        .descendants_with_tokens()
35        .filter_map(|el| el.into_token())
36    {
37        if tok.kind() != SyntaxKind::YAML_SCALAR {
38            continue;
39        }
40        let line = tok.text().trim_start();
41        let Some(rest) = line.strip_prefix("%TAG") else {
42            continue;
43        };
44        let mut parts = rest.split_whitespace();
45        let Some(handle) = parts.next() else { continue };
46        let Some(prefix) = parts.next() else { continue };
47        handles.insert(handle.to_string(), prefix.to_string());
48    }
49    handles
50}
51
52/// Resolve a tag shorthand (e.g. `!!str`, `!yaml!str`, `!e!foo`, `!local`) to
53/// the long-form `<tag:...>` event token, consulting the per-document handle
54/// map. Handles are checked first (so a `%TAG !` directive can override the
55/// primary handle); we fall back to the built-in handling for unknown handles.
56fn resolve_long_tag(tag: &str, handles: &TagHandles) -> Option<String> {
57    let mut best: Option<(&str, &String)> = None;
58    for (h, p) in handles {
59        if tag.starts_with(h)
60            && best.is_none_or(|(b_handle, _): (&str, _)| h.len() > b_handle.len())
61        {
62            best = Some((h.as_str(), p));
63        }
64    }
65    if let Some((handle, prefix)) = best {
66        let suffix = &tag[handle.len()..];
67        let resolved = format!("{prefix}{suffix}");
68        return Some(format!("<{}>", percent_decode_tag(&resolved)));
69    }
70    long_tag_builtin(tag)
71}
72
73/// Decode percent-encoded bytes (`%xx`) in a resolved tag URI. YAML 1.2 allows
74/// percent-encoding in tag suffixes so callers can embed otherwise-special
75/// characters (`!`, `:`, etc.); event-stream parity expects the decoded form.
76fn percent_decode_tag(tag: &str) -> String {
77    let bytes = tag.as_bytes();
78    let mut out = Vec::with_capacity(bytes.len());
79    let mut i = 0;
80    while i < bytes.len() {
81        if bytes[i] == b'%'
82            && i + 2 < bytes.len()
83            && let (Some(hi), Some(lo)) =
84                (hex_digit_value(bytes[i + 1]), hex_digit_value(bytes[i + 2]))
85        {
86            out.push(hi * 16 + lo);
87            i += 3;
88            continue;
89        }
90        out.push(bytes[i]);
91        i += 1;
92    }
93    String::from_utf8(out).unwrap_or_else(|_| tag.to_string())
94}
95
/// Numeric value (0–15) of an ASCII hex digit byte, accepting both letter
/// cases; `None` for any other byte.
fn hex_digit_value(byte: u8) -> Option<u8> {
    // `to_digit(16)` yields at most 15, so the narrowing cast is lossless.
    char::from(byte).to_digit(16).map(|digit| digit as u8)
}
104
105/// Walk the shadow CST for `input` and return the projected yaml-test-suite
106/// event stream. Returns an empty vector if the input fails to parse.
107pub fn project_events(input: &str) -> Vec<String> {
108    let Some(tree) = parse_yaml_tree(input) else {
109        return Vec::new();
110    };
111
112    let mut events = vec!["+STR".to_string()];
113    let stream = tree
114        .descendants()
115        .find(|n| n.kind() == SyntaxKind::YAML_STREAM);
116    if let Some(stream) = stream {
117        for doc in stream
118            .children()
119            .filter(|n| n.kind() == SyntaxKind::YAML_DOCUMENT)
120        {
121            project_document(&doc, &mut events);
122        }
123    }
124    events.push("-STR".to_string());
125    events
126}
127
128fn project_document(doc: &SyntaxNode, out: &mut Vec<String>) {
129    let has_doc_start = doc
130        .children_with_tokens()
131        .filter_map(|el| el.into_token())
132        .any(|tok| tok.kind() == SyntaxKind::YAML_DOCUMENT_START);
133    let has_doc_end = doc
134        .children_with_tokens()
135        .filter_map(|el| el.into_token())
136        .any(|tok| tok.kind() == SyntaxKind::YAML_DOCUMENT_END);
137    out.push(if has_doc_start {
138        "+DOC ---".to_string()
139    } else {
140        "+DOC".to_string()
141    });
142    let handles = collect_tag_handles(doc);
143
144    if let Some(seq_node) = doc
145        .descendants()
146        .find(|n| n.kind() == SyntaxKind::YAML_BLOCK_SEQUENCE)
147    {
148        out.push(seq_open_event(&seq_node, &handles));
149        project_block_sequence_items(&seq_node, &handles, out);
150        out.push("-SEQ".to_string());
151    } else if let Some(root_map) = doc
152        .descendants()
153        .find(|n| n.kind() == SyntaxKind::YAML_BLOCK_MAP)
154    {
155        let mut values = Vec::new();
156        project_block_map_entries(&root_map, &handles, &mut values);
157        if !values.is_empty() {
158            out.push("+MAP".to_string());
159            out.append(&mut values);
160            out.push("-MAP".to_string());
161        } else if let Some(flow_map) = doc
162            .descendants()
163            .find(|n| n.kind() == SyntaxKind::YAML_FLOW_MAP)
164        {
165            let mut flow_values = Vec::new();
166            project_flow_map_entries(&flow_map, &handles, &mut flow_values);
167            out.push("+MAP {}".to_string());
168            out.append(&mut flow_values);
169            out.push("-MAP".to_string());
170        } else if let Some(flow_seq) = doc
171            .descendants()
172            .find(|n| n.kind() == SyntaxKind::YAML_FLOW_SEQUENCE)
173            && let Some(items) = simple_flow_sequence_items(&flow_seq.text().to_string())
174        {
175            out.push("+SEQ []".to_string());
176            for item in items {
177                project_flow_seq_item(&item, &handles, out);
178            }
179            out.push("-SEQ".to_string());
180        } else if let Some(scalar) = scalar_document_value(doc, &handles) {
181            out.push(scalar);
182        } else {
183            out.push("=VAL :".to_string());
184        }
185    } else if let Some(flow_map) = doc
186        .descendants()
187        .find(|n| n.kind() == SyntaxKind::YAML_FLOW_MAP)
188    {
189        out.push("+MAP {}".to_string());
190        project_flow_map_entries(&flow_map, &handles, out);
191        out.push("-MAP".to_string());
192    } else if let Some(flow_seq) = doc
193        .descendants()
194        .find(|n| n.kind() == SyntaxKind::YAML_FLOW_SEQUENCE)
195        && let Some(items) = simple_flow_sequence_items(&flow_seq.text().to_string())
196    {
197        out.push("+SEQ []".to_string());
198        for item in items {
199            project_flow_seq_item(&item, &handles, out);
200        }
201        out.push("-SEQ".to_string());
202    } else if let Some(scalar) = scalar_document_value(doc, &handles) {
203        out.push(scalar);
204    } else {
205        out.push("=VAL :".to_string());
206    }
207
208    out.push(if has_doc_end {
209        "-DOC ...".to_string()
210    } else {
211        "-DOC".to_string()
212    });
213}
214
215fn scalar_document_value(doc: &SyntaxNode, handles: &TagHandles) -> Option<String> {
216    // `--- |` / `--- >` packs a block-scalar header onto the directive-end
217    // marker line. Detect that pattern first so the folded body (with proper
218    // chomping) is emitted instead of a single-line plain scalar.
219    if let Some((indicator, body)) = extract_scalar_doc_block_body(doc) {
220        let escaped = escape_block_scalar_text(&body);
221        return Some(format!("=VAL {indicator}{escaped}"));
222    }
223    // Bare top-level block scalar (no `---` marker) — e.g. a doc that begins
224    // with `>\n …` or `|\n …`. Reuse the same folder; the only difference vs
225    // the directive-end-packed form is the absence of a `YAML_DOCUMENT_START`
226    // sentinel separating the header from the body.
227    if let Some((indicator, body)) = extract_top_level_block_body(doc) {
228        let escaped = escape_block_scalar_text(&body);
229        return Some(format!("=VAL {indicator}{escaped}"));
230    }
231    // Skip `%TAG`/`%YAML` directive lines: those are document-level metadata,
232    // not part of the scalar body.
233    let text = doc
234        .descendants_with_tokens()
235        .filter_map(|el| el.into_token())
236        .filter(|tok| tok.kind() == SyntaxKind::YAML_SCALAR)
237        .filter(|tok| !tok.text().trim_start().starts_with('%'))
238        .map(|tok| tok.text().to_string())
239        .collect::<Vec<_>>()
240        .join("");
241    let trimmed_text = text.trim();
242    if trimmed_text.is_empty() {
243        // Tagged-but-empty scalar document still emits a `=VAL <tag> :` event.
244        let tag_only = doc
245            .descendants_with_tokens()
246            .filter_map(|el| el.into_token())
247            .find(|tok| tok.kind() == SyntaxKind::YAML_TAG)
248            .map(|tok| tok.text().to_string());
249        if let Some(tag) = tag_only
250            && let Some(long) = resolve_long_tag(&tag, handles)
251        {
252            return Some(format!("=VAL {long} :"));
253        }
254        return None;
255    }
256    let tag_text = doc
257        .descendants_with_tokens()
258        .filter_map(|el| el.into_token())
259        .find(|tok| tok.kind() == SyntaxKind::YAML_TAG)
260        .map(|tok| tok.text().to_string());
261    let multi_line_text = collect_doc_scalar_text_with_newlines(doc);
262    let is_multi_line_quoted = multi_line_text.contains('\n')
263        && (trimmed_text.starts_with('"') || trimmed_text.starts_with('\''));
264    let event = if let Some(tag) = tag_text
265        && let Some(long) = resolve_long_tag(&tag, handles)
266    {
267        if trimmed_text.starts_with('"') || trimmed_text.starts_with('\'') {
268            let quoted = if is_multi_line_quoted {
269                quoted_val_event_multi_line(&multi_line_text)
270            } else {
271                quoted_val_event(trimmed_text)
272            };
273            // quoted_val_event returns `=VAL "body` — splice the tag in.
274            quoted.replacen("=VAL ", &format!("=VAL {long} "), 1)
275        } else {
276            format!("=VAL {long} :{trimmed_text}")
277        }
278    } else if is_multi_line_quoted {
279        quoted_val_event_multi_line(&multi_line_text)
280    } else if trimmed_text.starts_with('"') || trimmed_text.starts_with('\'') {
281        quoted_val_event(&text)
282    } else {
283        let folded = fold_plain_document_lines(doc);
284        // Plain top-level scalars may carry node properties (`&anchor`,
285        // `!tag`) before the actual scalar body; decompose so events project
286        // them in canonical `&anchor <tag> :body` order.
287        let (anchor, body_tag, body) = decompose_scalar(folded.trim_start(), handles);
288        if anchor.is_some() || body_tag.is_some() {
289            scalar_event(anchor, body_tag.as_deref(), &escape_block_scalar_text(body))
290        } else {
291            format!("=VAL :{}", escape_block_scalar_text(&folded))
292        }
293    };
294    Some(event)
295}
296
297/// Reconstruct the doc's scalar text with line breaks intact: walk
298/// `YAML_SCALAR` + `NEWLINE` tokens in order (skipping directive lines).
299/// Required for multi-line quoted folding because `YAML_SCALAR`-only joins
300/// throw away the line structure that drives YAML 1.2 §7.3.2/§7.3.3 folding.
301fn collect_doc_scalar_text_with_newlines(doc: &SyntaxNode) -> String {
302    doc.descendants_with_tokens()
303        .filter_map(|el| el.into_token())
304        .filter(|tok| matches!(tok.kind(), SyntaxKind::YAML_SCALAR | SyntaxKind::NEWLINE))
305        .filter(|tok| !tok.text().trim_start().starts_with('%'))
306        .map(|tok| tok.text().to_string())
307        .collect()
308}
309
/// Format `text` as a plain-scalar event (`=VAL :text`), doubling every
/// backslash. No other character is escaped.
fn plain_val_event(text: &str) -> String {
    let mut event = String::from("=VAL :");
    for ch in text.chars() {
        if ch == '\\' {
            event.push_str("\\\\");
        } else {
            event.push(ch);
        }
    }
    event
}
313
314/// Fold the YAML-1.2 plain-scalar body of a top-level scalar `YAML_DOCUMENT`
315/// into its canonical value: walk `YAML_SCALAR` and `NEWLINE` tokens in order
316/// (skipping directive lines), then apply plain-scalar folding —
317/// non-empty-line breaks fold to a single space, runs of `n` empty lines fold
318/// to `n` line feeds. Leading/trailing empty lines are stripped.
319fn fold_plain_document_lines(doc: &SyntaxNode) -> String {
320    let raw: String = doc
321        .descendants_with_tokens()
322        .filter_map(|el| el.into_token())
323        .filter(|tok| matches!(tok.kind(), SyntaxKind::YAML_SCALAR | SyntaxKind::NEWLINE))
324        .filter(|tok| !tok.text().trim_start().starts_with('%'))
325        .map(|tok| tok.text().to_string())
326        .collect();
327
328    let mut out = String::with_capacity(raw.len());
329    let mut empty_run: usize = 0;
330    let mut have_content = false;
331    for line in raw.split('\n') {
332        let trimmed = line.trim();
333        if trimmed.is_empty() {
334            if have_content {
335                empty_run += 1;
336            }
337            continue;
338        }
339        if !have_content {
340            out.push_str(trimmed);
341            have_content = true;
342        } else if empty_run == 0 {
343            out.push(' ');
344            out.push_str(trimmed);
345        } else {
346            for _ in 0..empty_run {
347                out.push('\n');
348            }
349            out.push_str(trimmed);
350        }
351        empty_run = 0;
352    }
353    out
354}
355
356/// Project a flow-collection scalar token, preserving quoted-scalar
357/// classification when the source uses `"..."` or `'...'`. Plain scalars are
358/// folded just like outside flow context. A leading tag shorthand (`!!str`,
359/// `!handle!suffix`, `!local`) is resolved through `handles`.
360fn flow_scalar_event(text: &str, handles: &TagHandles) -> String {
361    let trimmed = text.trim();
362    if trimmed.starts_with('"') || trimmed.starts_with('\'') {
363        return quoted_val_event(trimmed);
364    }
365    let (anchor, long_tag, body) = decompose_scalar(trimmed, handles);
366    if anchor.is_some() || long_tag.is_some() {
367        return scalar_event(anchor, long_tag.as_deref(), body);
368    }
369    plain_val_event(&fold_plain_scalar(text))
370}
371
/// Split a leading tag shorthand (`!`, `!local`, `!!suffix`,
/// `!handle!suffix`) off the front of `text`, returning `(tag, remainder)`.
///
/// The tag runs up to, but does not include, the first space, tab, line
/// feed, `,`, `}` or `]`, or to the end of `text` if none occurs. Returns
/// `None` when `text` does not start with `!`, or when more than one further
/// `!` appears before the terminator.
fn split_leading_tag(text: &str) -> Option<(&str, &str)> {
    let rest = text.strip_prefix('!')?;
    let mut seen_second_bang = false;
    // Byte offset within `rest` where the tag ends; defaults to "all of it".
    let mut tag_end = rest.len();
    for (idx, ch) in rest.char_indices() {
        match ch {
            '!' if seen_second_bang => return None,
            '!' => seen_second_bang = true,
            ' ' | '\t' | '\n' | ',' | '}' | ']' => {
                tag_end = idx;
                break;
            }
            _ => {}
        }
    }
    // `+ 1` re-includes the leading `!` stripped above.
    Some(text.split_at(1 + tag_end))
}
398
/// Find the key/value `:` indicator inside a flow-sequence item.
///
/// A `:` qualifies only when it sits outside quotes and is followed by a
/// space, tab, line feed, carriage return, or the end of `item`; any other
/// `:` is treated as part of a plain scalar (e.g. `http://foo.com`). Returns
/// `(colon_offset, offset_after_colon)` for the first qualifying `:`, or
/// `None` if there is none.
fn flow_kv_split(item: &str) -> Option<(usize, usize)> {
    #[derive(Clone, Copy)]
    enum Scan {
        Outside,
        Single,
        Double,
        // Inside a double-quoted region, right after a backslash.
        DoubleEscaped,
    }

    let bytes = item.as_bytes();
    let mut state = Scan::Outside;
    for (idx, ch) in item.char_indices() {
        state = match (state, ch) {
            (Scan::DoubleEscaped, _) => Scan::Double,
            (Scan::Double, '\\') => Scan::DoubleEscaped,
            (Scan::Double, '"') => Scan::Outside,
            (Scan::Double, _) => Scan::Double,
            (Scan::Single, '\'') => Scan::Outside,
            (Scan::Single, _) => Scan::Single,
            (Scan::Outside, '\'') => Scan::Single,
            (Scan::Outside, '"') => Scan::Double,
            (Scan::Outside, ':') => {
                let after = idx + 1;
                let terminated = bytes
                    .get(after)
                    .map_or(true, |&b| matches!(b, b' ' | b'\t' | b'\n' | b'\r'));
                if terminated {
                    return Some((idx, after));
                }
                Scan::Outside
            }
            (Scan::Outside, _) => Scan::Outside,
        };
    }
    None
}
443
444/// Emit events for a single flow-sequence item: either `+MAP {} key val -MAP`
445/// when the item is a flow-map entry (`key: value`, possibly with empty key
446/// or value), or a single `=VAL` for a bare scalar.
447fn project_flow_seq_item(item: &str, handles: &TagHandles, out: &mut Vec<String>) {
448    if let Some((colon, after)) = flow_kv_split(item) {
449        let raw_key_full = item[..colon].trim();
450        // Strip the explicit-key `?` indicator (followed by whitespace or
451        // end-of-key) when present.
452        let raw_key = strip_explicit_key_indicator(raw_key_full);
453        let raw_value = item[after..].trim();
454        out.push("+MAP {}".to_string());
455        if raw_key.is_empty() {
456            out.push("=VAL :".to_string());
457        } else {
458            out.push(flow_scalar_event(raw_key, handles));
459        }
460        if raw_value.is_empty() {
461            out.push("=VAL :".to_string());
462        } else {
463            out.push(flow_scalar_event(raw_value, handles));
464        }
465        out.push("-MAP".to_string());
466    } else if item.trim_start().starts_with('"') || item.trim_start().starts_with('\'') {
467        out.push(quoted_val_event(item.trim()));
468    } else {
469        out.push(plain_val_event(&fold_plain_scalar(item)));
470    }
471}
472
/// Remove a leading explicit-key indicator from `key`.
///
/// If `key` (ignoring leading whitespace) starts with `?` and that `?` is
/// followed by a space, tab, line feed, or nothing, the text after it is
/// returned with its leading whitespace trimmed. In every other case `key`
/// is returned unchanged, leading whitespace included.
fn strip_explicit_key_indicator(key: &str) -> &str {
    match key.trim_start().strip_prefix('?') {
        Some(rest)
            if rest.is_empty() || rest.starts_with(|c: char| matches!(c, ' ' | '\t' | '\n')) =>
        {
            rest.trim_start()
        }
        _ => key,
    }
}
482
483fn quoted_val_event(text: &str) -> String {
484    if text.starts_with('\'') {
485        let inner = decode_single_quoted(text);
486        format!("=VAL '{}", escape_for_event(&inner))
487    } else {
488        let inner = decode_double_quoted(text);
489        format!("=VAL \"{}", escape_for_event(&inner))
490    }
491}
492
493/// Multi-line quoted scalar projection: applies YAML 1.2 §7.3.2 / §7.3.3 line
494/// folding (single line break → space, blank-line run of `n` blanks → `n`
495/// `\n`s) before escape decoding. Required when a top-level quoted document
496/// spans more than one source line — the single-line `quoted_val_event`
497/// concatenates `YAML_SCALAR` tokens directly and would lose all line
498/// structure.
499fn quoted_val_event_multi_line(raw: &str) -> String {
500    let trimmed = raw.trim_start_matches([' ', '\t', '\n']);
501    if trimmed.starts_with('\'') {
502        let inner_with_breaks = strip_quoted_wrapper(trimmed, '\'');
503        let folded = fold_quoted_inner(&inner_with_breaks);
504        let decoded = folded.replace("''", "'");
505        format!("=VAL '{}", escape_for_event(&decoded))
506    } else {
507        let inner_with_breaks = strip_quoted_wrapper(trimmed, '"');
508        let folded = fold_quoted_inner(&inner_with_breaks);
509        let decoded = decode_double_quoted_inner(&folded);
510        format!("=VAL \"{}", escape_for_event(&decoded))
511    }
512}
513
/// Return the raw text between the quotes of a quoted scalar, escapes left
/// intact.
///
/// One leading `quote` is removed if present, then characters are copied up
/// to the closing quote. With `quote == '"'` a backslash and the character
/// after it are copied as a pair, so `\"` does not end the scalar. With any
/// other `quote`, a doubled `''` is copied as-is and a lone `'` ends the
/// scalar. Everything after the closing quote is discarded; unterminated
/// input is copied to its end.
fn strip_quoted_wrapper(text: &str, quote: char) -> String {
    let body = text.strip_prefix(quote).unwrap_or(text);
    let is_double = quote == '"';
    let mut inner = String::with_capacity(body.len());
    let mut rest = body.chars().peekable();
    while let Some(ch) = rest.next() {
        match ch {
            '\\' if is_double => {
                inner.push('\\');
                // Copy the escaped character too, if there is one.
                inner.extend(rest.next());
            }
            '"' if is_double => break,
            '\'' if !is_double => {
                if rest.next_if_eq(&'\'').is_none() {
                    break;
                }
                inner.push_str("''");
            }
            other => inner.push(other),
        }
    }
    inner
}
546
/// Fold the inner body of a multi-line quoted scalar (quotes already
/// removed) into a single logical value.
///
/// - The first line is copied verbatim, leading whitespace included.
/// - Continuation lines lose their leading spaces and tabs.
/// - Before a non-blank continuation line is appended, trailing spaces and
///   tabs are dropped from the output so far.
/// - A line break directly between two non-blank lines becomes one space.
/// - A run of `n` blank lines before a non-blank line becomes `n` `\n`s.
/// - Blank lines after the last non-blank line contribute nothing, and
///   trailing spaces and tabs of the final result are stripped.
fn fold_quoted_inner(inner: &str) -> String {
    let mut lines = inner.split('\n');
    // `split` always yields at least one item, so the first line exists.
    let mut out = String::from(lines.next().unwrap_or(""));
    let mut blanks = 0usize;
    for line in lines {
        let stripped = line.trim_start_matches([' ', '\t']);
        if stripped.is_empty() {
            blanks += 1;
            continue;
        }
        let kept = out.trim_end_matches([' ', '\t']).len();
        out.truncate(kept);
        if blanks == 0 {
            out.push(' ');
        } else {
            for _ in 0..blanks {
                out.push('\n');
            }
        }
        out.push_str(stripped);
        blanks = 0;
    }
    let kept = out.trim_end_matches([' ', '\t']).len();
    out.truncate(kept);
    out
}
590
591/// Inner-only variant of [`decode_double_quoted`]: the input has no
592/// surrounding quote characters and is consumed in full. Shares escape
593/// decoding semantics with the wrapped form.
594fn decode_double_quoted_inner(body: &str) -> String {
595    let mut out = String::with_capacity(body.len());
596    let mut chars = body.chars();
597    while let Some(ch) = chars.next() {
598        if ch != '\\' {
599            out.push(ch);
600            continue;
601        }
602        let Some(next) = chars.next() else {
603            out.push('\\');
604            break;
605        };
606        match next {
607            '0' => out.push('\0'),
608            'a' => out.push('\u{07}'),
609            'b' => out.push('\u{08}'),
610            't' | '\t' => out.push('\t'),
611            'n' => out.push('\n'),
612            'v' => out.push('\u{0B}'),
613            'f' => out.push('\u{0C}'),
614            'r' => out.push('\r'),
615            'e' => out.push('\u{1B}'),
616            ' ' => out.push(' '),
617            '"' => out.push('"'),
618            '/' => out.push('/'),
619            '\\' => out.push('\\'),
620            'N' => out.push('\u{85}'),
621            '_' => out.push('\u{A0}'),
622            'L' => out.push('\u{2028}'),
623            'P' => out.push('\u{2029}'),
624            'x' => {
625                if let Some(c) = take_hex_char(&mut chars, 2) {
626                    out.push(c);
627                }
628            }
629            'u' => {
630                if let Some(c) = take_hex_char(&mut chars, 4) {
631                    out.push(c);
632                }
633            }
634            'U' => {
635                if let Some(c) = take_hex_char(&mut chars, 8) {
636                    out.push(c);
637                }
638            }
639            other => {
640                out.push('\\');
641                out.push(other);
642            }
643        }
644    }
645    out
646}
647
/// Decode a single-quoted scalar: drop one leading and one trailing `'`
/// (each only if present), then collapse every `''` pair into a single `'`.
fn decode_single_quoted(text: &str) -> String {
    let without_open = text.strip_prefix('\'').unwrap_or(text);
    let inner = without_open.strip_suffix('\'').unwrap_or(without_open);
    inner.split("''").collect::<Vec<_>>().join("'")
}
653
654/// Decode YAML double-quoted scalar escape sequences into actual characters
655/// per YAML 1.2 §5.7. Unknown escapes are kept verbatim so the harness can
656/// surface them as bare backslash-prefixed text.
657fn decode_double_quoted(text: &str) -> String {
658    let body = text.strip_prefix('"').unwrap_or(text);
659    let mut out = String::with_capacity(body.len());
660    let mut chars = body.chars();
661    while let Some(ch) = chars.next() {
662        if ch == '"' {
663            break;
664        }
665        if ch != '\\' {
666            out.push(ch);
667            continue;
668        }
669        let Some(next) = chars.next() else {
670            out.push('\\');
671            break;
672        };
673        match next {
674            '0' => out.push('\0'),
675            'a' => out.push('\u{07}'),
676            'b' => out.push('\u{08}'),
677            't' | '\t' => out.push('\t'),
678            'n' => out.push('\n'),
679            'v' => out.push('\u{0B}'),
680            'f' => out.push('\u{0C}'),
681            'r' => out.push('\r'),
682            'e' => out.push('\u{1B}'),
683            ' ' => out.push(' '),
684            '"' => out.push('"'),
685            '/' => out.push('/'),
686            '\\' => out.push('\\'),
687            'N' => out.push('\u{85}'),
688            '_' => out.push('\u{A0}'),
689            'L' => out.push('\u{2028}'),
690            'P' => out.push('\u{2029}'),
691            'x' => {
692                if let Some(c) = take_hex_char(&mut chars, 2) {
693                    out.push(c);
694                }
695            }
696            'u' => {
697                if let Some(c) = take_hex_char(&mut chars, 4) {
698                    out.push(c);
699                }
700            }
701            'U' => {
702                if let Some(c) = take_hex_char(&mut chars, 8) {
703                    out.push(c);
704                }
705            }
706            other => {
707                out.push('\\');
708                out.push(other);
709            }
710        }
711    }
712    out
713}
714
/// Consume up to `n` characters from `chars` and interpret them as the hex
/// code point of a character.
///
/// Returns `None` when fewer than `n` characters remain, when any consumed
/// character is not an ASCII hex digit, or when the value is not a valid
/// Unicode scalar value. The characters are consumed from `chars` in every
/// case.
fn take_hex_char(chars: &mut std::str::Chars<'_>, n: usize) -> Option<char> {
    let hex: String = chars.by_ref().take(n).collect();
    // `from_str_radix` tolerates a leading `+`, which is not a hex digit in
    // an escape sequence, so validate the digits explicitly. The byte-length
    // check also rejects short input and multi-byte characters.
    if hex.len() != n || !hex.bytes().all(|b| b.is_ascii_hexdigit()) {
        return None;
    }
    u32::from_str_radix(&hex, 16).ok().and_then(char::from_u32)
}
722
/// Render decoded scalar text in event notation: backslash, line feed, tab,
/// carriage return, bell, backspace, vertical tab, form feed, escape and NUL
/// are written as two-character backslash escapes (`\\`, `\n`, `\t`, `\r`,
/// `\a`, `\b`, `\v`, `\f`, `\e`, `\0`); every other character is copied
/// through.
fn escape_for_event(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        let replacement = match ch {
            '\\' => "\\\\",
            '\n' => "\\n",
            '\t' => "\\t",
            '\r' => "\\r",
            '\u{07}' => "\\a",
            '\u{08}' => "\\b",
            '\u{0B}' => "\\v",
            '\u{0C}' => "\\f",
            '\u{1B}' => "\\e",
            '\0' => "\\0",
            _ => {
                escaped.push(ch);
                continue;
            }
        };
        escaped.push_str(replacement);
    }
    escaped
}
745
/// Fallback tag expansion used when no registered handle matches.
///
/// A tag consisting of one leading `!` and no further `!` (including the
/// bare non-specific tag `!`) is wrapped as `<!suffix>`. Anything else —
/// no leading `!`, or a second `!` anywhere — yields `None`.
fn long_tag_builtin(tag: &str) -> Option<String> {
    tag.strip_prefix('!')
        .filter(|suffix| !suffix.contains('!'))
        .map(|suffix| format!("<!{suffix}>"))
}
758
/// Split the source text of a flow sequence (`[a, b, c]`) into its trimmed
/// item strings.
///
/// The text must, after trimming, start with `[` and end with `]`; otherwise
/// `None` is returned. Items are separated at commas that lie outside single-
/// and double-quoted regions (a backslash inside double quotes escapes the
/// next character). An empty item before a comma makes the whole split fail
/// with `None`, while an empty final item (trailing comma) is ignored. An
/// empty sequence yields an empty vector.
fn simple_flow_sequence_items(text: &str) -> Option<Vec<String>> {
    #[derive(Clone, Copy)]
    enum Scan {
        Outside,
        Single,
        Double,
        // Inside a double-quoted region, right after a backslash.
        DoubleEscaped,
    }

    let inner = text.trim().strip_prefix('[')?.strip_suffix(']')?.trim();
    if inner.is_empty() {
        return Some(Vec::new());
    }

    let mut items = Vec::new();
    let mut item_start = 0usize;
    let mut state = Scan::Outside;
    for (idx, ch) in inner.char_indices() {
        state = match (state, ch) {
            (Scan::DoubleEscaped, _) => Scan::Double,
            (Scan::Double, '\\') => Scan::DoubleEscaped,
            (Scan::Double, '"') => Scan::Outside,
            (Scan::Double, _) => Scan::Double,
            (Scan::Single, '\'') => Scan::Outside,
            (Scan::Single, _) => Scan::Single,
            (Scan::Outside, '\'') => Scan::Single,
            (Scan::Outside, '"') => Scan::Double,
            (Scan::Outside, ',') => {
                let item = inner[item_start..idx].trim();
                if item.is_empty() {
                    return None;
                }
                items.push(item.to_string());
                item_start = idx + 1;
                Scan::Outside
            }
            (Scan::Outside, _) => Scan::Outside,
        };
    }

    let last = inner[item_start..].trim();
    if !last.is_empty() {
        items.push(last.to_string());
    }
    Some(items)
}
815
/// Escape scalar text for event output, rewriting only backslash, line
/// feed, tab and carriage return as `\\`, `\n`, `\t` and `\r`. All other
/// characters, including other control characters, are copied through.
fn escape_block_scalar_text(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        let replacement = match ch {
            '\\' => "\\\\",
            '\n' => "\\n",
            '\t' => "\\t",
            '\r' => "\\r",
            _ => {
                escaped.push(ch);
                continue;
            }
        };
        escaped.push_str(replacement);
    }
    escaped
}
829
830/// If `value_node` encodes a literal (`|`) or folded (`>`) block scalar,
831/// return the folded scalar body. Headers with explicit chomping (`-` strip,
832/// `+` keep) or indent indicators are recognized; chomping is applied to the
833/// final body. Default chomping is "clip" (single trailing newline).
834fn extract_block_scalar_body(value_node: &SyntaxNode) -> Option<(char, String)> {
835    let tokens: Vec<_> = value_node
836        .descendants_with_tokens()
837        .filter_map(|el| el.into_token())
838        .filter(|tok| {
839            matches!(
840                tok.kind(),
841                SyntaxKind::YAML_SCALAR
842                    | SyntaxKind::NEWLINE
843                    | SyntaxKind::WHITESPACE
844                    | SyntaxKind::YAML_COMMENT,
845            )
846        })
847        .collect();
848    fold_block_scalar_tokens(&tokens)
849}
850
851/// Variant of [`extract_block_scalar_body`] that walks a full `YAML_DOCUMENT`
852/// node and applies block-scalar folding to the tokens *after* a
853/// `YAML_DOCUMENT_START` marker. Used for the directive-end-with-payload
854/// pattern (`--- |\n  ab\n  cd\n`) where the block-scalar header is packed
855/// onto the marker line itself rather than being a block-map value.
856fn extract_scalar_doc_block_body(doc: &SyntaxNode) -> Option<(char, String)> {
857    let mut started = false;
858    let mut tokens = Vec::new();
859    for el in doc.descendants_with_tokens() {
860        let Some(tok) = el.into_token() else { continue };
861        if !started {
862            if tok.kind() == SyntaxKind::YAML_DOCUMENT_START {
863                started = true;
864            }
865            continue;
866        }
867        match tok.kind() {
868            SyntaxKind::YAML_DOCUMENT_END => break,
869            SyntaxKind::YAML_SCALAR
870            | SyntaxKind::NEWLINE
871            | SyntaxKind::WHITESPACE
872            | SyntaxKind::YAML_COMMENT => tokens.push(tok),
873            _ => {}
874        }
875    }
876    fold_block_scalar_tokens(&tokens)
877}
878
879/// Detect a top-level (no `YAML_DOCUMENT_START` marker) block-scalar document
880/// of the form `>\n …` or `|\n …`. Walks the document's content tokens and
881/// applies block-scalar folding when the first scalar token is a bare
882/// block-scalar header. Returns `None` otherwise so plain / quoted scalar
883/// handling can proceed.
884fn extract_top_level_block_body(doc: &SyntaxNode) -> Option<(char, String)> {
885    if doc
886        .descendants_with_tokens()
887        .filter_map(|el| el.into_token())
888        .any(|tok| tok.kind() == SyntaxKind::YAML_DOCUMENT_START)
889    {
890        return None;
891    }
892    let tokens: Vec<_> = doc
893        .descendants_with_tokens()
894        .filter_map(|el| el.into_token())
895        .filter(|tok| {
896            matches!(
897                tok.kind(),
898                SyntaxKind::YAML_SCALAR
899                    | SyntaxKind::NEWLINE
900                    | SyntaxKind::WHITESPACE
901                    | SyntaxKind::YAML_COMMENT,
902            )
903        })
904        .collect();
905    let first = tokens.iter().find(|tok| {
906        tok.kind() == SyntaxKind::YAML_SCALAR && parse_block_scalar_indicator(tok.text()).is_some()
907    })?;
908    let _ = first;
909    fold_block_scalar_tokens(&tokens)
910}
911
912fn fold_block_scalar_tokens(tokens: &[SyntaxToken]) -> Option<(char, String)> {
913    let header_idx = tokens.iter().position(|t| {
914        t.kind() == SyntaxKind::YAML_SCALAR && parse_block_scalar_indicator(t.text()).is_some()
915    })?;
916    let (indicator, chomp) = parse_block_scalar_indicator(tokens[header_idx].text())?;
917
918    // Reconstruct the body source by stitching every token AFTER the header
919    // and its trailing newline. Including `WHITESPACE` and `YAML_COMMENT`
920    // tokens preserves the indentation needed for content-indent calculation
921    // and lets a `# ...` line at column 0 (DK3J) land inside the body, while
922    // a less-indented `# Comment` after a fully-indented body region (7T8X)
923    // gets recognized as a body terminator.
924    let mut raw = String::new();
925    let mut skipped_header_newline = false;
926    for tok in &tokens[header_idx + 1..] {
927        if !skipped_header_newline && tok.kind() == SyntaxKind::NEWLINE {
928            skipped_header_newline = true;
929            continue;
930        }
931        raw.push_str(tok.text());
932    }
933
934    let raw_trailing_newlines = raw.chars().rev().take_while(|c| *c == '\n').count();
935
936    let lines: Vec<&str> = raw.split('\n').collect();
937
938    // Per YAML 1.2 §8.1.1.1, the content indentation level is set by the
939    // first non-empty line of the contents.
940    let content_indent = lines
941        .iter()
942        .find(|l| !l.trim().is_empty())
943        .map(|l| l.chars().take_while(|c| *c == ' ').count())
944        .unwrap_or(0);
945
946    // Truncate at the first non-empty line whose indentation drops below the
947    // content indent — that's where the block scalar's body ends per spec.
948    // Trailing blanks (and the final empty split-tail) are kept; chomping
949    // re-derives the right number of trailing newlines below.
950    let mut body_lines: Vec<&str> = Vec::new();
951    let mut seen_content = false;
952    for line in lines.iter() {
953        let is_blank = line.trim().is_empty();
954        let indent = line.chars().take_while(|c| *c == ' ').count();
955        if !is_blank && seen_content && indent < content_indent {
956            break;
957        }
958        body_lines.push(line);
959        if !is_blank {
960            seen_content = true;
961        }
962    }
963    if body_lines.last().is_some_and(|s| s.is_empty()) {
964        body_lines.pop();
965    }
966
967    let stripped: Vec<BlockBodyLine> = body_lines
968        .iter()
969        .map(|l| {
970            let is_blank = l.trim().is_empty();
971            let indent = l.chars().take_while(|c| *c == ' ').count();
972            // Always strip up to `content_indent` columns; for `|` style this
973            // preserves trailing spaces past the content indent (T26H).
974            let text = if l.len() >= content_indent {
975                l[content_indent..].to_string()
976            } else {
977                String::new()
978            };
979            // More-indented lines (per §8.1.3) keep literal line breaks in
980            // folded scalars. Blank lines are not flagged MI here; the folder
981            // counts them and applies the surrounding-line rule.
982            let is_mi = !is_blank && indent > content_indent;
983            BlockBodyLine {
984                text,
985                is_blank,
986                is_mi,
987            }
988        })
989        .collect();
990
991    let folded = match indicator {
992        '|' => stripped
993            .iter()
994            .map(|l| l.text.as_str())
995            .collect::<Vec<_>>()
996            .join("\n"),
997        '>' => fold_greater_lines(&stripped),
998        _ => unreachable!(),
999    };
1000
1001    let trimmed = folded.trim_end_matches('\n');
1002    let body = match chomp {
1003        BlockScalarChomp::Strip => trimmed.to_string(),
1004        BlockScalarChomp::Clip => {
1005            if trimmed.is_empty() {
1006                String::new()
1007            } else {
1008                format!("{trimmed}\n")
1009            }
1010        }
1011        BlockScalarChomp::Keep => {
1012            format!("{trimmed}{}", "\n".repeat(raw_trailing_newlines))
1013        }
1014    };
1015    Some((indicator, body))
1016}
1017
/// One line of a block-scalar body after content-indent stripping, as
/// consumed by the literal join and by [`fold_greater_lines`].
struct BlockBodyLine {
    /// Line text with the content indentation removed.
    text: String,
    /// `true` when the source line holds nothing but whitespace.
    is_blank: bool,
    /// `true` for a non-blank "more-indented" line, i.e. one indented deeper
    /// than the content indent.
    is_mi: bool,
}
1023
1024/// Apply the YAML 1.2 §8.1.3 folded-scalar rules to a sequence of
1025/// content-indent-stripped body lines:
1026/// - Each leading blank line contributes a single `\n` to the output.
1027/// - Between two adjacent non-MI content lines, a single line break folds to
1028///   ` `; a run of `n` blank lines folds to `n` `\n` chars.
1029/// - When either side of the boundary is more-indented, *all* line breaks
1030///   between the two content lines are preserved literally.
1031fn fold_greater_lines(lines: &[BlockBodyLine]) -> String {
1032    let mut out = String::new();
1033    let mut idx = 0usize;
1034
1035    while idx < lines.len() && lines[idx].is_blank {
1036        out.push('\n');
1037        idx += 1;
1038    }
1039    if idx >= lines.len() {
1040        return out;
1041    }
1042
1043    out.push_str(&lines[idx].text);
1044    let mut prev_is_mi = lines[idx].is_mi;
1045    idx += 1;
1046
1047    while idx < lines.len() {
1048        let mut empty_count = 0usize;
1049        while idx < lines.len() && lines[idx].is_blank {
1050            empty_count += 1;
1051            idx += 1;
1052        }
1053        if idx >= lines.len() {
1054            break;
1055        }
1056        let line = &lines[idx];
1057        let mi_involved = prev_is_mi || line.is_mi;
1058        if mi_involved {
1059            for _ in 0..(empty_count + 1) {
1060                out.push('\n');
1061            }
1062        } else if empty_count == 0 {
1063            out.push(' ');
1064        } else {
1065            for _ in 0..empty_count {
1066                out.push('\n');
1067            }
1068        }
1069        out.push_str(&line.text);
1070        prev_is_mi = line.is_mi;
1071        idx += 1;
1072    }
1073    out
1074}
1075
/// Chomping mode selected by a block-scalar header.
#[derive(Clone, Copy)]
enum BlockScalarChomp {
    /// Default (no indicator): a non-empty body ends in exactly one newline.
    Clip,
    /// `-` indicator: all trailing newlines are removed.
    Strip,
    /// `+` indicator: the trailing newlines of the source are retained.
    Keep,
}
1082
1083fn parse_block_scalar_indicator(text: &str) -> Option<(char, BlockScalarChomp)> {
1084    let mut chars = text.chars();
1085    let indicator = match chars.next()? {
1086        '|' => '|',
1087        '>' => '>',
1088        _ => return None,
1089    };
1090    let mut chomp = BlockScalarChomp::Clip;
1091    let mut seen_chomp = false;
1092    let mut seen_indent = false;
1093    for ch in chars {
1094        match ch {
1095            '+' if !seen_chomp => {
1096                chomp = BlockScalarChomp::Keep;
1097                seen_chomp = true;
1098            }
1099            '-' if !seen_chomp => {
1100                chomp = BlockScalarChomp::Strip;
1101                seen_chomp = true;
1102            }
1103            '1'..='9' if !seen_indent => seen_indent = true,
1104            _ => return None,
1105        }
1106    }
1107    Some((indicator, chomp))
1108}
1109
/// Fold a multi-line plain scalar into one line.
///
/// Each `\n`-separated line is trimmed; empty lines are dropped, and so are
/// lines whose first non-blank character is `#` (comment lines that the
/// lexer currently leaves embedded in scalar token text inside multi-line
/// flow continuations). The surviving pieces are joined with single spaces.
fn fold_plain_scalar(text: &str) -> String {
    text.split('\n')
        .map(str::trim)
        .filter(|piece| !piece.is_empty() && !piece.starts_with('#'))
        .collect::<Vec<_>>()
        .join(" ")
}
1127
1128fn project_flow_map_entries(flow_map: &SyntaxNode, handles: &TagHandles, out: &mut Vec<String>) {
1129    for entry in flow_map
1130        .children()
1131        .filter(|n| n.kind() == SyntaxKind::YAML_FLOW_MAP_ENTRY)
1132    {
1133        let key_node = entry
1134            .children()
1135            .find(|n| n.kind() == SyntaxKind::YAML_FLOW_MAP_KEY)
1136            .expect("flow map key");
1137        let value_node = entry
1138            .children()
1139            .find(|n| n.kind() == SyntaxKind::YAML_FLOW_MAP_VALUE)
1140            .expect("flow map value");
1141
1142        let has_explicit_colon = key_node
1143            .children_with_tokens()
1144            .filter_map(|el| el.into_token())
1145            .any(|tok| tok.kind() == SyntaxKind::YAML_COLON);
1146
1147        let raw_key = key_node
1148            .descendants_with_tokens()
1149            .filter_map(|el| el.into_token())
1150            .filter(|tok| matches!(tok.kind(), SyntaxKind::YAML_SCALAR | SyntaxKind::YAML_KEY))
1151            .map(|tok| tok.text().to_string())
1152            .collect::<Vec<_>>()
1153            .join("");
1154
1155        if has_explicit_colon {
1156            // Strip the explicit-key `?` indicator (`{ ? foo : v }`) from
1157            // the projected key text. A bare `? :` entry (key reduces to
1158            // empty after stripping) projects to an empty `=VAL :`.
1159            let stripped_key = strip_explicit_key_indicator(raw_key.trim());
1160            if stripped_key.is_empty() {
1161                out.push("=VAL :".to_string());
1162            } else {
1163                out.push(flow_scalar_event(stripped_key, handles));
1164            }
1165            project_flow_map_value(&value_node, handles, out);
1166        } else {
1167            let raw_value = value_node
1168                .descendants_with_tokens()
1169                .filter_map(|el| el.into_token())
1170                .filter(|tok| tok.kind() == SyntaxKind::YAML_SCALAR)
1171                .map(|tok| tok.text().to_string())
1172                .collect::<Vec<_>>()
1173                .join("");
1174            let combined = format!("{raw_key}{raw_value}");
1175            let folded = fold_plain_scalar(&combined);
1176            let stripped = strip_explicit_key_indicator(&folded);
1177            if stripped.is_empty() {
1178                out.push("=VAL :".to_string());
1179            } else {
1180                out.push(plain_val_event(stripped));
1181            }
1182            out.push("=VAL :".to_string());
1183        }
1184    }
1185}
1186
1187/// Project a `YAML_FLOW_MAP_VALUE` node, recursing into nested flow
1188/// collections (`+SEQ [] ... -SEQ`, `+MAP {} ... -MAP`) when present so that
1189/// multi-line nested flow values like `{ a: [ b, c, { d: [e, f] } ] }`
1190/// produce structured event streams instead of one slurped scalar.
1191fn project_flow_map_value(value_node: &SyntaxNode, handles: &TagHandles, out: &mut Vec<String>) {
1192    if let Some(flow_seq) = value_node
1193        .children()
1194        .find(|n| n.kind() == SyntaxKind::YAML_FLOW_SEQUENCE)
1195    {
1196        out.push("+SEQ []".to_string());
1197        project_flow_sequence_items_cst(&flow_seq, handles, out);
1198        out.push("-SEQ".to_string());
1199        return;
1200    }
1201    if let Some(nested_map) = value_node
1202        .children()
1203        .find(|n| n.kind() == SyntaxKind::YAML_FLOW_MAP)
1204    {
1205        out.push("+MAP {}".to_string());
1206        project_flow_map_entries(&nested_map, handles, out);
1207        out.push("-MAP".to_string());
1208        return;
1209    }
1210
1211    let raw_value = value_node
1212        .descendants_with_tokens()
1213        .filter_map(|el| el.into_token())
1214        .filter(|tok| tok.kind() == SyntaxKind::YAML_SCALAR)
1215        .map(|tok| tok.text().to_string())
1216        .collect::<Vec<_>>()
1217        .join("");
1218    out.push(flow_scalar_event(&raw_value, handles));
1219}
1220
1221/// CST-walking variant of flow-sequence projection. Each
1222/// `YAML_FLOW_SEQUENCE_ITEM` may contain a nested `YAML_FLOW_SEQUENCE` /
1223/// `YAML_FLOW_MAP`; if neither is present we fall back to the text-based
1224/// `project_flow_seq_item` for plain/quoted scalar items.
1225fn project_flow_sequence_items_cst(
1226    flow_seq: &SyntaxNode,
1227    handles: &TagHandles,
1228    out: &mut Vec<String>,
1229) {
1230    for item in flow_seq
1231        .children()
1232        .filter(|n| n.kind() == SyntaxKind::YAML_FLOW_SEQUENCE_ITEM)
1233    {
1234        if let Some(nested_seq) = item
1235            .children()
1236            .find(|n| n.kind() == SyntaxKind::YAML_FLOW_SEQUENCE)
1237        {
1238            out.push("+SEQ []".to_string());
1239            project_flow_sequence_items_cst(&nested_seq, handles, out);
1240            out.push("-SEQ".to_string());
1241            continue;
1242        }
1243        if let Some(nested_map) = item
1244            .children()
1245            .find(|n| n.kind() == SyntaxKind::YAML_FLOW_MAP)
1246        {
1247            out.push("+MAP {}".to_string());
1248            project_flow_map_entries(&nested_map, handles, out);
1249            out.push("-MAP".to_string());
1250            continue;
1251        }
1252        // Build the item text from scalar/key tokens only so embedded
1253        // `YAML_COMMENT` tokens (e.g. `[ word1\n# comment\n, word2]`) do not
1254        // leak into the projected scalar value.
1255        let item_text: String = item
1256            .descendants_with_tokens()
1257            .filter_map(|el| el.into_token())
1258            .filter(|tok| matches!(tok.kind(), SyntaxKind::YAML_SCALAR | SyntaxKind::YAML_KEY))
1259            .map(|tok| tok.text().to_string())
1260            .collect();
1261        project_flow_seq_item(&item_text, handles, out);
1262    }
1263}
1264
/// Find the byte offset of the `:` that splits a block-context scalar into
/// key and value, if there is one.
///
/// After optional leading spaces/tabs, a leading double-quoted body (with
/// `\` escapes) or single-quoted body (with `''` escapes) is skipped as a
/// unit, so `"key": value` and `'key': value` split after the quotes. From
/// there on no quote toggling happens: per YAML 1.2 §7.4.3.1, `"` / `'`
/// embedded in plain scalars are literal.
///
/// A `:` only counts when it is followed by a space, tab, line break or
/// end-of-input. Flow terminators (`,`, `}`, `]`) do not qualify in block
/// context — `- :,` is the single scalar `:,` — and neither does the `:` of
/// a URI such as `http://…`.
fn find_block_scalar_kv_split(text: &str) -> Option<usize> {
    let bytes = text.as_bytes();
    let lead = bytes
        .iter()
        .take_while(|b| matches!(b, b' ' | b'\t'))
        .count();

    // Where the colon scan starts: just past a leading quoted body, or at
    // the first non-blank byte. An unterminated quote swallows everything.
    let scan_from = match bytes.get(lead) {
        Some(b'"') => {
            let mut escaped = false;
            let closing = bytes[lead + 1..].iter().position(|&b| {
                if escaped {
                    escaped = false;
                    false
                } else if b == b'\\' {
                    escaped = true;
                    false
                } else {
                    b == b'"'
                }
            });
            closing.map_or(bytes.len(), |offset| lead + offset + 2)
        }
        Some(b'\'') => {
            let mut cursor = lead + 1;
            loop {
                let Some(offset) = bytes[cursor..].iter().position(|&b| b == b'\'') else {
                    break bytes.len();
                };
                let after_quote = cursor + offset + 1;
                if bytes.get(after_quote) == Some(&b'\'') {
                    // `''` is an escaped quote; keep scanning behind it.
                    cursor = after_quote + 1;
                } else {
                    break after_quote;
                }
            }
        }
        _ => lead,
    };

    (scan_from..bytes.len()).find(|&pos| {
        bytes[pos] == b':'
            && matches!(
                bytes.get(pos + 1),
                None | Some(b' ' | b'\t' | b'\n' | b'\r')
            )
    })
}
1332
1333/// Project a single scalar (without surrounding `+MAP`/`-MAP`) for an inline
1334/// map key or value position. Anchors/tags are decomposed in canonical order;
1335/// alias references (`*name`) emit `=ALI`. An empty body emits `=VAL :`.
1336fn project_inline_scalar(text: &str, handles: &TagHandles, out: &mut Vec<String>) {
1337    let trimmed = text.trim();
1338    if trimmed.is_empty() {
1339        out.push("=VAL :".to_string());
1340        return;
1341    }
1342    if trimmed.starts_with('*') {
1343        out.push(format!("=ALI {trimmed}"));
1344        return;
1345    }
1346    let (anchor, body_tag, body) = decompose_scalar(trimmed, handles);
1347    out.push(scalar_event(anchor, body_tag.as_deref(), body));
1348}
1349
1350fn project_block_sequence_items(
1351    seq_node: &SyntaxNode,
1352    handles: &TagHandles,
1353    out: &mut Vec<String>,
1354) {
1355    for item in seq_node
1356        .children()
1357        .filter(|n| n.kind() == SyntaxKind::YAML_BLOCK_SEQUENCE_ITEM)
1358    {
1359        if let Some(nested_seq) = item
1360            .children()
1361            .find(|n| n.kind() == SyntaxKind::YAML_BLOCK_SEQUENCE)
1362        {
1363            out.push("+SEQ".to_string());
1364            project_block_sequence_items(&nested_seq, handles, out);
1365            out.push("-SEQ".to_string());
1366            continue;
1367        }
1368        // Inline-map sequence item: `- key: value` (with optional continuation
1369        // lines that the parser captures as a nested YAML_BLOCK_MAP). The
1370        // direct YAML_SCALAR/YAML_TAG/whitespace token chain encodes the first
1371        // entry; subsequent entries live in the nested map node. Including
1372        // YAML_TAG keeps tagged empty keys/values (`- !!str : !!null`) intact
1373        // so `decompose_scalar` can recover the tag.
1374        let direct_scalar: String = item
1375            .children_with_tokens()
1376            .filter_map(|el| el.into_token())
1377            .filter(|tok| {
1378                matches!(
1379                    tok.kind(),
1380                    SyntaxKind::YAML_SCALAR
1381                        | SyntaxKind::YAML_TAG
1382                        | SyntaxKind::YAML_KEY
1383                        | SyntaxKind::YAML_COLON
1384                        | SyntaxKind::WHITESPACE,
1385                )
1386            })
1387            .map(|tok| tok.text().to_string())
1388            .collect();
1389        if let Some(colon_idx) = find_block_scalar_kv_split(&direct_scalar) {
1390            let nested_map = item
1391                .children()
1392                .find(|n| n.kind() == SyntaxKind::YAML_BLOCK_MAP);
1393            out.push("+MAP".to_string());
1394            project_inline_scalar(&direct_scalar[..colon_idx], handles, out);
1395            project_inline_scalar(&direct_scalar[colon_idx + 1..], handles, out);
1396            if let Some(nm) = nested_map {
1397                project_block_map_entries(&nm, handles, out);
1398            }
1399            out.push("-MAP".to_string());
1400            continue;
1401        }
1402        if let Some(nested_map) = item
1403            .children()
1404            .find(|n| n.kind() == SyntaxKind::YAML_BLOCK_MAP)
1405        {
1406            out.push("+MAP".to_string());
1407            project_block_map_entries(&nested_map, handles, out);
1408            out.push("-MAP".to_string());
1409            continue;
1410        }
1411        if let Some(flow_seq) = item
1412            .children()
1413            .find(|n| n.kind() == SyntaxKind::YAML_FLOW_SEQUENCE)
1414        {
1415            let flow_text = flow_seq.text().to_string();
1416            if let Some(flow_items) = simple_flow_sequence_items(&flow_text) {
1417                out.push("+SEQ []".to_string());
1418                for value in flow_items {
1419                    project_flow_seq_item(&value, handles, out);
1420                }
1421                out.push("-SEQ".to_string());
1422                continue;
1423            }
1424        }
1425        if let Some(flow_map) = item
1426            .children()
1427            .find(|n| n.kind() == SyntaxKind::YAML_FLOW_MAP)
1428        {
1429            out.push("+MAP {}".to_string());
1430            project_flow_map_entries(&flow_map, handles, out);
1431            out.push("-MAP".to_string());
1432            continue;
1433        }
1434        let item_tag = item
1435            .descendants_with_tokens()
1436            .filter_map(|el| el.into_token())
1437            .find(|tok| tok.kind() == SyntaxKind::YAML_TAG)
1438            .map(|tok| tok.text().to_string());
1439        let scalar_text = item
1440            .descendants_with_tokens()
1441            .filter_map(|el| el.into_token())
1442            .filter(|tok| tok.kind() == SyntaxKind::YAML_SCALAR)
1443            .map(|tok| tok.text().to_string())
1444            .collect::<Vec<_>>()
1445            .join("");
1446        let scalar_trimmed = scalar_text.trim();
1447        let event = if scalar_trimmed.starts_with('*') {
1448            format!("=ALI {scalar_trimmed}")
1449        } else {
1450            // Combine the optional `YAML_TAG` token (already separated from
1451            // the scalar text by the parser) with anchors/tags found in the
1452            // scalar body, and render the YAML event in canonical
1453            // `&anchor <tag> :body` order.
1454            let item_long_tag = item_tag
1455                .as_deref()
1456                .and_then(|t| resolve_long_tag(t, handles));
1457            let (anchor, body_tag, body) = decompose_scalar(scalar_trimmed, handles);
1458            let long_tag = item_long_tag.or(body_tag);
1459            scalar_event(anchor, long_tag.as_deref(), body)
1460        };
1461        out.push(event);
1462    }
1463}
1464
1465/// Decompose a node-property + scalar string into `(anchor, long_tag, body)`,
1466/// peeling off any leading `&anchor` and tag shorthand in either order
1467/// (`&a !!str foo` or `!!str &a foo`). Returns the raw body trimmed.
1468/// Build the `+SEQ` open event for a YAML_BLOCK_SEQUENCE, attaching any
1469/// document-level node properties (a tag, or a `&anchor` carried by the
1470/// block-sequence header line) that precede the first sequence item. The
1471/// parser stores those properties as YAML_TAG / YAML_SCALAR siblings of
1472/// the YAML_BLOCK_SEQUENCE_ITEM children, in source order.
1473fn seq_open_event(seq_node: &SyntaxNode, handles: &TagHandles) -> String {
1474    let mut anchor: Option<String> = None;
1475    let mut long_tag: Option<String> = None;
1476    for child in seq_node.children_with_tokens() {
1477        if let Some(node) = child.as_node()
1478            && node.kind() == SyntaxKind::YAML_BLOCK_SEQUENCE_ITEM
1479        {
1480            break;
1481        }
1482        let Some(tok) = child.as_token() else {
1483            continue;
1484        };
1485        match tok.kind() {
1486            SyntaxKind::YAML_TAG => {
1487                if long_tag.is_none()
1488                    && let Some(long) = resolve_long_tag(tok.text(), handles)
1489                {
1490                    long_tag = Some(long);
1491                }
1492            }
1493            SyntaxKind::YAML_SCALAR => {
1494                let trimmed = tok.text().trim();
1495                if anchor.is_none()
1496                    && let Some(name) = trimmed.strip_prefix('&')
1497                {
1498                    anchor = Some(name.to_string());
1499                }
1500            }
1501            _ => {}
1502        }
1503    }
1504    let mut event = String::from("+SEQ");
1505    if let Some(t) = long_tag {
1506        event.push(' ');
1507        event.push_str(&t);
1508    }
1509    if let Some(a) = anchor {
1510        event.push_str(" &");
1511        event.push_str(&a);
1512    }
1513    event
1514}
1515
1516fn decompose_scalar<'a>(
1517    text: &'a str,
1518    handles: &TagHandles,
1519) -> (Option<&'a str>, Option<String>, &'a str) {
1520    let mut anchor: Option<&str> = None;
1521    let mut long_tag: Option<String> = None;
1522    let mut rest = text.trim();
1523    loop {
1524        if anchor.is_none()
1525            && let Some(after) = rest.strip_prefix('&')
1526        {
1527            let end = after
1528                .find(|c: char| c.is_whitespace() || matches!(c, ',' | '}' | ']'))
1529                .unwrap_or(after.len());
1530            let (name, tail) = after.split_at(end);
1531            anchor = Some(name);
1532            rest = tail.trim_start();
1533            continue;
1534        }
1535        if long_tag.is_none()
1536            && let Some((tag, tail)) = split_leading_tag(rest)
1537            && let Some(long) = resolve_long_tag(tag, handles)
1538        {
1539            long_tag = Some(long);
1540            rest = tail.trim_start();
1541            continue;
1542        }
1543        break;
1544    }
1545    (anchor, long_tag, rest)
1546}
1547
1548/// Render a scalar event from its decomposed parts: optional anchor,
1549/// optional long-form tag (already in `<...>` form), and the scalar body.
1550/// Handles plain, double-quoted, and single-quoted bodies; quoted bodies
1551/// share the same escape normalization as [`quoted_val_event`].
1552fn scalar_event(anchor: Option<&str>, long_tag: Option<&str>, body: &str) -> String {
1553    let mut prefix = String::new();
1554    if let Some(a) = anchor {
1555        prefix.push_str(&format!("&{a} "));
1556    }
1557    if let Some(t) = long_tag {
1558        prefix.push_str(t);
1559        prefix.push(' ');
1560    }
1561    let body = body.trim();
1562    if body.is_empty() {
1563        return format!("=VAL {prefix}:");
1564    }
1565    if body.starts_with('"') || body.starts_with('\'') {
1566        // Reuse the shared escape/normalization rules; splice the prefix in
1567        // place of the leading `=VAL ` token.
1568        let quoted = quoted_val_event(body);
1569        return quoted.replacen("=VAL ", &format!("=VAL {prefix}"), 1);
1570    }
1571    format!("=VAL {prefix}:{body}")
1572}
1573
1574fn project_block_map_entries(map_node: &SyntaxNode, handles: &TagHandles, out: &mut Vec<String>) {
1575    for child in map_node.children_with_tokens() {
1576        match child {
1577            rowan::NodeOrToken::Token(tok)
1578                if tok.kind() == SyntaxKind::YAML_SCALAR
1579                    && tok.text().trim_start().starts_with("? ") =>
1580            {
1581                let body = tok.text().trim_start().trim_start_matches("? ").trim();
1582                if body.is_empty() {
1583                    out.push("=VAL :".to_string());
1584                } else {
1585                    let (anchor, body_tag, rest) = decompose_scalar(body, handles);
1586                    out.push(scalar_event(anchor, body_tag.as_deref(), rest));
1587                }
1588                out.push("=VAL :".to_string());
1589            }
1590            rowan::NodeOrToken::Node(entry) if entry.kind() == SyntaxKind::YAML_BLOCK_MAP_ENTRY => {
1591                project_block_map_entry(&entry, handles, out);
1592            }
1593            _ => {}
1594        }
1595    }
1596}
1597
1598fn project_block_map_entry(entry: &SyntaxNode, handles: &TagHandles, out: &mut Vec<String>) {
1599    let key_node = entry
1600        .children()
1601        .find(|n| n.kind() == SyntaxKind::YAML_BLOCK_MAP_KEY)
1602        .expect("key node");
1603    let value_node = entry
1604        .children()
1605        .find(|n| n.kind() == SyntaxKind::YAML_BLOCK_MAP_VALUE)
1606        .expect("value node");
1607
1608    let key_tag = key_node
1609        .children_with_tokens()
1610        .filter_map(|el| el.into_token())
1611        .find(|tok| tok.kind() == SyntaxKind::YAML_TAG)
1612        .map(|tok| tok.text().to_string());
1613    let key_text = key_node
1614        .children_with_tokens()
1615        .filter_map(|el| el.into_token())
1616        .find(|tok| tok.kind() == SyntaxKind::YAML_KEY)
1617        .map(|tok| tok.text().trim_end().to_string())
1618        .expect("key token");
1619
1620    let key_trimmed = key_text.trim();
1621    if key_trimmed.starts_with('[')
1622        && key_trimmed.ends_with(']')
1623        && let Some(items) = simple_flow_sequence_items(key_trimmed)
1624    {
1625        out.push("+SEQ []".to_string());
1626        for item in items {
1627            project_flow_seq_item(&item, handles, out);
1628        }
1629        out.push("-SEQ".to_string());
1630    } else if key_trimmed.starts_with('*') {
1631        out.push(format!("=ALI {key_trimmed}"));
1632    } else {
1633        let key_long_tag = key_tag
1634            .as_deref()
1635            .and_then(|t| resolve_long_tag(t, handles));
1636        let (anchor, body_tag, body) = decompose_scalar(key_trimmed, handles);
1637        let long_tag = key_long_tag.or(body_tag);
1638        out.push(scalar_event(anchor, long_tag.as_deref(), body));
1639    }
1640
1641    if let Some(nested_map) = value_node
1642        .children()
1643        .find(|n| n.kind() == SyntaxKind::YAML_BLOCK_MAP)
1644    {
1645        out.push("+MAP".to_string());
1646        project_block_map_entries(&nested_map, handles, out);
1647        out.push("-MAP".to_string());
1648        return;
1649    }
1650
1651    if let Some(flow_map) = value_node
1652        .children()
1653        .find(|n| n.kind() == SyntaxKind::YAML_FLOW_MAP)
1654    {
1655        out.push("+MAP {}".to_string());
1656        project_flow_map_entries(&flow_map, handles, out);
1657        out.push("-MAP".to_string());
1658        return;
1659    }
1660
1661    if let Some((indicator, body)) = extract_block_scalar_body(&value_node) {
1662        let escaped = escape_block_scalar_text(&body);
1663        out.push(format!("=VAL {indicator}{escaped}"));
1664        return;
1665    }
1666
1667    let value_tag = value_node
1668        .children_with_tokens()
1669        .filter_map(|el| el.into_token())
1670        .find(|tok| tok.kind() == SyntaxKind::YAML_TAG)
1671        .map(|tok| tok.text().to_string());
1672    let value_text = value_node
1673        .descendants_with_tokens()
1674        .filter_map(|el| el.into_token())
1675        .filter(|tok| tok.kind() == SyntaxKind::YAML_SCALAR)
1676        .map(|tok| tok.text().to_string())
1677        .collect::<Vec<_>>()
1678        .join("");
1679
1680    if value_tag.is_none()
1681        && let Some(items) = simple_flow_sequence_items(&value_text)
1682    {
1683        out.push("+SEQ []".to_string());
1684        for item in items {
1685            project_flow_seq_item(&item, handles, out);
1686        }
1687        out.push("-SEQ".to_string());
1688    } else if value_text.trim().is_empty() {
1689        if let Some(tag) = value_tag
1690            && let Some(long) = resolve_long_tag(&tag, handles)
1691        {
1692            out.push(format!("=VAL {long} :"));
1693        } else {
1694            out.push("=VAL :".to_string());
1695        }
1696    } else if value_text.trim_start().starts_with('*') {
1697        out.push(format!("=ALI {}", value_text.trim()));
1698    } else {
1699        let value_long_tag = value_tag
1700            .as_deref()
1701            .and_then(|t| resolve_long_tag(t, handles));
1702        let (anchor, body_tag, body) = decompose_scalar(value_text.trim(), handles);
1703        let long_tag = value_long_tag.or(body_tag);
1704        out.push(scalar_event(anchor, long_tag.as_deref(), body));
1705    }
1706}