rlsp_yaml_parser/lib.rs
1// SPDX-License-Identifier: MIT
2#![deny(clippy::panic)]
3
4mod chars;
5pub mod encoding;
6mod error;
7mod event;
8mod lexer;
9mod lines;
10pub mod loader;
11pub mod node;
12mod pos;
13
14pub use error::Error;
15pub use event::{Chomp, CollectionStyle, Event, ScalarStyle};
16pub use lines::{BreakType, Line, LineBuffer};
17pub use loader::{LoadError, LoadMode, Loader, LoaderBuilder, LoaderOptions, load};
18pub use node::{Document, Node};
19pub use pos::{Pos, Span};
20
21use std::collections::{HashMap, VecDeque};
22
23use lexer::Lexer;
24
25/// Parse a YAML string into a lazy event stream.
26///
27/// The iterator yields <code>Result<([Event], [Span]), [Error]></code> items.
28/// The first event is always [`Event::StreamStart`] and the last is always
29/// [`Event::StreamEnd`].
30///
31/// # Example
32///
33/// ```
34/// use rlsp_yaml_parser::{parse_events, Event};
35///
36/// let events: Vec<_> = parse_events("").collect();
37/// assert!(matches!(events.first(), Some(Ok((Event::StreamStart, _)))));
38/// assert!(matches!(events.last(), Some(Ok((Event::StreamEnd, _)))));
39/// ```
40pub fn parse_events(input: &str) -> impl Iterator<Item = Result<(Event<'_>, Span), Error>> + '_ {
41 EventIter::new(input)
42}
43
44// ---------------------------------------------------------------------------
45// Depth limit (security: DoS via deeply nested collections)
46// ---------------------------------------------------------------------------
47
/// Maximum combined block-collection nesting depth accepted from untrusted
/// input.
///
/// This limit covers all open [`Event::SequenceStart`] and
/// [`Event::MappingStart`] events combined. Using a unified limit prevents
/// an attacker from nesting 512 sequences inside 512 mappings (total depth
/// 1024) by exploiting separate per-type limits.
///
/// 512 is generous for all real-world YAML (Kubernetes / Helm documents are
/// typically under 20 levels deep) and small enough that the explicit-stack
/// overhead stays within a few KB.
pub const MAX_COLLECTION_DEPTH: usize = 512;

/// Maximum byte length of an anchor or alias name accepted from untrusted
/// input.
///
/// The YAML spec places no upper limit on anchor names, but scanning a name
/// consisting of millions of valid `ns-anchor-char` bytes would exhaust CPU
/// time without any heap allocation. This limit caps anchor and alias name
/// scanning at 1 KiB — generous for all real-world YAML (Kubernetes names are
/// typically under 64 bytes) while preventing degenerate-input stalls.
///
/// The limit is enforced by [`parse_events`] for both `&name` (anchors) and
/// `*name` (aliases). Exceeding it returns an [`Error`], not a panic.
pub const MAX_ANCHOR_NAME_BYTES: usize = 1024;

/// Maximum byte length of a tag accepted from untrusted input.
///
/// The YAML spec places no upper limit on tag length, but scanning a tag
/// consisting of millions of valid bytes would exhaust CPU time without any
/// heap allocation. This limit caps tag scanning at 4 KiB — generous for all
/// real-world YAML (standard tags like `tag:yaml.org,2002:str` are under 30
/// bytes; custom namespace URIs are rarely over 200 bytes) while preventing
/// degenerate-input stalls.
///
/// The limit applies to the raw scanned portion: the URI content between `<`
/// and `>` for verbatim tags, or the suffix portion for shorthand tags.
/// Exceeding it returns an [`Error`], not a panic.
pub const MAX_TAG_LEN: usize = 4096;

/// Maximum byte length of a comment body accepted from untrusted input.
///
/// The YAML spec places no upper limit on comment length. With zero-copy
/// `&'input str` slices, comment scanning itself allocates nothing, but
/// character-by-character iteration over a very long comment line still burns
/// CPU proportional to the line length. This limit has the same value as
/// [`MAX_TAG_LEN`] — comment-only files produce one `Comment` event per line
/// (O(input size), acceptable) as long as individual lines are bounded.
///
/// Exceeding this limit returns an [`Error`], not a panic or truncation.
pub const MAX_COMMENT_LEN: usize = 4096;

/// Maximum number of directives (`%YAML` + `%TAG` combined) per document.
///
/// Without this cap, an attacker could supply thousands of distinct `%TAG`
/// directives, each allocating a `HashMap` entry, to exhaust heap memory.
/// 64 is generous for all real-world YAML (the typical document has 0–2
/// directives) while bounding per-document directive overhead.
///
/// Exceeding this limit returns an [`Error`], not a panic.
pub const MAX_DIRECTIVES_PER_DOC: usize = 64;

/// Maximum byte length of a `%TAG` handle (e.g. `!foo!`) accepted from
/// untrusted input.
///
/// Tag handles are short by design; a 256-byte cap is generous while
/// preventing `DoS` via scanning very long handle strings.
///
/// Exceeding this limit returns an [`Error`], not a panic.
pub const MAX_TAG_HANDLE_BYTES: usize = 256;

/// Maximum byte length of the fully-resolved tag string after prefix expansion.
///
/// When a shorthand tag `!foo!bar` is resolved against its `%TAG` prefix, the
/// result is `prefix + suffix`. This cap prevents the resolved string from
/// exceeding a safe bound even when the prefix and suffix are both at their
/// individual limits. Reuses [`MAX_TAG_LEN`] so the bound is consistent with
/// verbatim tag limits.
///
/// Exceeding this limit returns an [`Error`], not a panic.
pub const MAX_RESOLVED_TAG_LEN: usize = MAX_TAG_LEN;
131
132// ---------------------------------------------------------------------------
133// Directive scope
134// ---------------------------------------------------------------------------
135
136/// Per-document directive state accumulated from `%YAML` and `%TAG` directives.
137///
138/// Cleared at the start of each new document (on `---` in `BetweenDocs`, on
139/// `...`, or at EOF). The default handles (`!!` and `!`) are **not** stored
140/// here — they are resolved directly in [`DirectiveScope::resolve_tag`].
141#[derive(Debug, Default)]
142struct DirectiveScope {
143 /// Version from `%YAML`, if any.
144 version: Option<(u8, u8)>,
145 /// Custom tag handles declared via `%TAG` directives.
146 ///
147 /// Key: handle (e.g. `"!foo!"`). Value: prefix (e.g. `"tag:example.com:"`).
148 tag_handles: HashMap<String, String>,
149 /// Total directive count (YAML + TAG combined) for the `DoS` limit check.
150 directive_count: usize,
151}
152
153impl DirectiveScope {
154 /// Resolve a raw tag slice (as stored in `pending_tag`) to its final form.
155 ///
156 /// Resolution rules:
157 /// - Verbatim tag (no leading `!`, i.e. already a bare URI from `!<URI>` scanning) → returned as-is.
158 /// - `!!suffix` → look up `"!!"` in custom handles; fall back to default `tag:yaml.org,2002:`.
159 /// - `!suffix` (no inner `!`) → returned as-is (local tag, no expansion).
160 /// - `!handle!suffix` → look up `"!handle!"` in custom handles; error if not found.
161 /// - `!` (bare) → returned as-is.
162 ///
163 /// Returns `Ok(Cow::Borrowed(raw))` when no allocation is needed, or
164 /// `Ok(Cow::Owned(resolved))` after prefix expansion. Returns `Err` when
165 /// a named handle has no registered prefix.
166 fn resolve_tag<'a>(
167 &self,
168 raw: &'a str,
169 indicator_pos: Pos,
170 ) -> Result<std::borrow::Cow<'a, str>, Error> {
171 use std::borrow::Cow;
172
173 // Verbatim tags arrive as bare URIs (scan_tag strips the `!<` / `>` wrappers).
174 // They do not start with `!`, so no resolution is needed.
175 if !raw.starts_with('!') {
176 return Ok(Cow::Borrowed(raw));
177 }
178
179 let after_first_bang = &raw[1..];
180
181 // `!!suffix` — primary handle.
182 if let Some(suffix) = after_first_bang.strip_prefix('!') {
183 let prefix = self
184 .tag_handles
185 .get("!!")
186 .map_or("tag:yaml.org,2002:", String::as_str);
187 let resolved = format!("{prefix}{suffix}");
188 if resolved.len() > MAX_RESOLVED_TAG_LEN {
189 return Err(Error {
190 pos: indicator_pos,
191 message: format!(
192 "resolved tag exceeds maximum length of {MAX_RESOLVED_TAG_LEN} bytes"
193 ),
194 });
195 }
196 return Ok(Cow::Owned(resolved));
197 }
198
199 // `!handle!suffix` — named handle.
200 if let Some(inner_bang) = after_first_bang.find('!') {
201 let handle = &raw[..inner_bang + 2]; // `!handle!`
202 let suffix = &after_first_bang[inner_bang + 1..];
203 if let Some(prefix) = self.tag_handles.get(handle) {
204 let resolved = format!("{prefix}{suffix}");
205 if resolved.len() > MAX_RESOLVED_TAG_LEN {
206 return Err(Error {
207 pos: indicator_pos,
208 message: format!(
209 "resolved tag exceeds maximum length of {MAX_RESOLVED_TAG_LEN} bytes"
210 ),
211 });
212 }
213 return Ok(Cow::Owned(resolved));
214 }
215 return Err(Error {
216 pos: indicator_pos,
217 message: format!("undefined tag handle: {handle}"),
218 });
219 }
220
221 // `!suffix` (local tag) or bare `!` — no expansion.
222 Ok(Cow::Borrowed(raw))
223 }
224
225 /// Collect the tag handle/prefix pairs for inclusion in `DocumentStart`.
226 fn tag_directives(&self) -> Vec<(String, String)> {
227 let mut pairs: Vec<(String, String)> = self
228 .tag_handles
229 .iter()
230 .map(|(h, p)| (h.clone(), p.clone()))
231 .collect();
232 // Sort for deterministic ordering in tests and events.
233 pairs.sort_unstable_by(|a, b| a.0.cmp(&b.0));
234 pairs
235 }
236}
237
238// ---------------------------------------------------------------------------
239// Iterator implementation
240// ---------------------------------------------------------------------------
241
/// Outcome of one state-machine step inside [`EventIter::next`].
///
/// The `'input` lifetime is that of the events carried by `Yield`.
enum StepResult<'input> {
    /// The step pushed to `queue` or changed state; loop again to drain.
    Continue,
    /// The step produced an event or error to return immediately.
    Yield(Result<(Event<'input>, Span), Error>),
}
249
/// State of the top-level event iterator.
///
/// A freshly constructed iterator starts in `BeforeStream`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum IterState {
    /// About to emit `StreamStart`.
    BeforeStream,
    /// Between documents: skip blanks/comments/directives, detect next document.
    BetweenDocs,
    /// Inside a document: consume lines until a boundary marker or EOF.
    InDocument,
    /// `StreamEnd` emitted; done.
    Done,
}
262
/// What the state machine expects next for an open mapping entry.
///
/// Stored per open block mapping in [`CollectionEntry::Mapping`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum MappingPhase {
    /// The next node is a key (first half of a pair).
    Key,
    /// The next node is a value (second half of a pair).
    Value,
}
271
/// An entry on the collection stack, tracking open block sequences and mappings.
///
/// Flow collections are fully parsed by [`EventIter::handle_flow_collection`]
/// before returning; they never leave an entry on this stack. The combined
/// depth limit (block + flow) is enforced inside `handle_flow_collection` by
/// summing `coll_stack.len()` with the local flow-frame count.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CollectionEntry {
    /// An open block sequence: `Sequence(column, has_had_item)`.
    ///
    /// Holds the column of its `-` indicator and whether at least one
    /// complete item has been delivered. `has_had_item` is `false` for a
    /// freshly opened sequence and becomes `true` once a complete item
    /// (scalar or sub-collection) has been emitted. Used by
    /// `handle_sequence_entry` to detect a `-` at the wrong indentation level.
    Sequence(usize, bool),
    /// An open block mapping: `Mapping(column, phase, has_had_value)`.
    ///
    /// Holds the column of its first key, the current phase (expecting key
    /// or value), and whether the mapping has had at least one key advanced
    /// to the value phase (`has_had_value`). `has_had_value` is `false` for
    /// a freshly opened mapping and becomes `true` the first time
    /// `advance_mapping_to_value` is called on it. The wrong-indentation
    /// check in `handle_mapping_entry` uses this flag to avoid false
    /// positives on explicit-key content nodes (e.g. V9D5).
    Mapping(usize, MappingPhase, bool),
}
295
/// Whether the next expected token in a flow mapping is a key or value.
///
/// Flow-mapping counterpart of [`MappingPhase`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum FlowMappingPhase {
    /// Expecting the next key (or the closing `}`).
    Key,
    /// Expecting the value after a key has been consumed.
    Value,
}
304
305impl CollectionEntry {
306 /// The indentation column of this collection's indicator/key.
307 const fn indent(self) -> usize {
308 match self {
309 Self::Sequence(col, _) | Self::Mapping(col, _, _) => col,
310 }
311 }
312}
313
/// Lazy iterator that yields events by walking a [`Lexer`].
///
/// All parser state lives here: the lexer, the top-level state, a queue of
/// already-produced events, the stack of open block collections, and the
/// pending node properties (anchor / tag) waiting for their node.
// The boolean fields below are independent parser flags, so the lint is
// silenced rather than folding them into an enum.
#[allow(clippy::struct_excessive_bools)]
struct EventIter<'input> {
    /// Line-level scanner over the input text.
    lexer: Lexer<'input>,
    /// Current top-level state of the iterator.
    state: IterState,
    /// Queued events to emit before resuming normal state dispatch.
    ///
    /// Used when a single parse step must produce multiple consecutive events —
    /// e.g. `SequenceStart` before the first item, or multiple close events
    /// when a dedent closes several nested collections at once.
    queue: VecDeque<(Event<'input>, Span)>,
    /// Stack of open block collections (sequences and mappings).
    ///
    /// Each entry records whether the open collection is a sequence or a
    /// mapping, its indentation column, and (for mappings) whether the next
    /// expected node is a key or a value. The combined length of this stack
    /// is bounded by [`MAX_COLLECTION_DEPTH`].
    coll_stack: Vec<CollectionEntry>,
    /// Set to `true` after an `Err` is yielded.
    ///
    /// Once set, `next()` immediately returns `None` to prevent infinite
    /// error loops (e.g. depth-limit firing on the same prepended synthetic
    /// line).
    failed: bool,
    /// A pending anchor name (`&name`) that has been scanned but not yet
    /// attached to a node event.
    ///
    /// Anchors in YAML precede the node they annotate. After scanning
    /// `&name`, the parser stores the name here and attaches it to the next
    /// `Scalar`, `SequenceStart`, or `MappingStart` event.
    ///
    /// `pending_anchor_for_collection` distinguishes two cases:
    /// - `true`: anchor was on its own line (`&name\n- item`) — the anchor
    ///   annotates the next node regardless of type (collection or scalar).
    /// - `false`: anchor was inline with key content
    ///   (`&name key: value`) — the anchor annotates the key scalar, not
    ///   the enclosing mapping.
    pending_anchor: Option<&'input str>,
    /// True when `pending_anchor` was set from a standalone anchor line (no
    /// inline content after the name). False when set from an inline anchor
    /// that precedes a key or scalar on the same line.
    pending_anchor_for_collection: bool,
    /// A pending tag that has been scanned but not yet attached to a node event.
    ///
    /// Tags in YAML precede the node they annotate (YAML 1.2 §6.8.1). After
    /// scanning `!tag`, `!!tag`, `!<uri>`, or `!`, the parser stores the tag
    /// here and attaches it to the next `Scalar`, `SequenceStart`, or
    /// `MappingStart` event.
    ///
    /// Tags are resolved against the current directive scope at scan time:
    /// - `!<URI>` → stored as `Cow::Borrowed("URI")` (verbatim, no change)
    /// - `!!suffix` → resolved via `!!` handle (default: `tag:yaml.org,2002:suffix`)
    /// - `!suffix` → stored as `Cow::Borrowed("!suffix")` (local tag, no expansion)
    /// - `!` → stored as `Cow::Borrowed("!")`
    /// - `!handle!suffix` → resolved via `%TAG !handle! prefix` directive
    pending_tag: Option<std::borrow::Cow<'input, str>>,
    /// True when `pending_tag` was set from a standalone tag line (no inline
    /// content after the tag). False when set inline.
    pending_tag_for_collection: bool,
    /// Directive scope for the current document.
    ///
    /// Accumulated from `%YAML` and `%TAG` directives seen in `BetweenDocs`
    /// state. Reset at document boundaries.
    directive_scope: DirectiveScope,
    /// Set to `true` once the root node of the current document has been
    /// fully emitted (a scalar at the top level, or a collection after its
    /// closing event empties `coll_stack`).
    ///
    /// Used to detect invalid extra content after the document root, such as
    /// `foo:\n bar\ninvalid` where `invalid` appears after the root mapping
    /// closes. Reset to `false` at each document boundary.
    root_node_emitted: bool,
    /// Set to `true` after consuming a `? ` explicit key indicator whose key
    /// content will appear on the NEXT line (i.e., `had_key_inline = false`).
    /// Cleared when the key content is processed.
    ///
    /// Used to allow a block sequence indicator on a line following `? ` to be
    /// treated as the explicit key's content rather than triggering the
    /// "invalid block sequence entry" guard.
    explicit_key_pending: bool,
    /// When a tag or anchor appears inline on a physical line (e.g. `!!str &a key:`),
    /// the key content is prepended as a synthetic line with the key's column as its
    /// indent. This field records the indent of the ORIGINAL physical line so that
    /// `handle_mapping_entry` can open the mapping at the correct (original) indent
    /// rather than the synthetic line's offset.
    property_origin_indent: Option<usize>,
}
401
402impl<'input> EventIter<'input> {
403 fn new(input: &'input str) -> Self {
404 Self {
405 lexer: Lexer::new(input),
406 state: IterState::BeforeStream,
407 queue: VecDeque::new(),
408 coll_stack: Vec::new(),
409 failed: false,
410 pending_anchor: None,
411 pending_anchor_for_collection: false,
412 pending_tag: None,
413 pending_tag_for_collection: false,
414 directive_scope: DirectiveScope::default(),
415 root_node_emitted: false,
416 explicit_key_pending: false,
417 property_origin_indent: None,
418 }
419 }
420
421 /// Current combined collection depth (sequences + mappings).
422 const fn collection_depth(&self) -> usize {
423 self.coll_stack.len()
424 }
425
426 /// Push close events for all collections whose indent is `>= threshold`,
427 /// from innermost to outermost.
428 ///
429 /// After each close, if the new top of the stack is a mapping in Value
430 /// phase, flips it to Key phase — the closed collection was that
431 /// mapping's value.
432 fn close_collections_at_or_above(&mut self, threshold: usize, pos: Pos) {
433 while let Some(&top) = self.coll_stack.last() {
434 if top.indent() >= threshold {
435 self.coll_stack.pop();
436 let ev = match top {
437 CollectionEntry::Sequence(_, _) => Event::SequenceEnd,
438 CollectionEntry::Mapping(_, _, _) => Event::MappingEnd,
439 };
440 self.queue.push_back((ev, zero_span(pos)));
441 // After closing a collection, the parent mapping (if any)
442 // transitions from Value phase to Key phase. The parent
443 // sequence (if any) marks its current item as completed.
444 match self.coll_stack.last_mut() {
445 Some(CollectionEntry::Mapping(_, phase, _)) => {
446 if *phase == MappingPhase::Value {
447 *phase = MappingPhase::Key;
448 }
449 }
450 Some(CollectionEntry::Sequence(_, has_had_item)) => {
451 *has_had_item = true;
452 }
453 None => {}
454 }
455 } else {
456 break;
457 }
458 }
459 }
460
461 /// Push close events for all open collections (document-end).
462 ///
463 /// If a mapping is in Value phase when it closes, an empty plain scalar is
464 /// emitted first to satisfy the pending key that had no inline value —
465 /// **unless** the previous closed item was a collection (sequence or
466 /// mapping), which was itself the value. After each closed collection,
467 /// the parent mapping (if any) is advanced from Value to Key phase.
468 fn close_all_collections(&mut self, pos: Pos) {
469 while let Some(top) = self.coll_stack.pop() {
470 let ev = match top {
471 CollectionEntry::Sequence(_, _) => Event::SequenceEnd,
472 CollectionEntry::Mapping(_, MappingPhase::Value, _) => {
473 // Mapping closed while waiting for a value — emit empty value.
474 // Consume any pending anchor so `&anchor\n` at end of doc
475 // is properly attached to the empty value.
476 self.queue.push_back((
477 Event::Scalar {
478 value: std::borrow::Cow::Borrowed(""),
479 style: ScalarStyle::Plain,
480 anchor: self.pending_anchor.take(),
481 tag: None,
482 },
483 zero_span(pos),
484 ));
485 Event::MappingEnd
486 }
487 CollectionEntry::Mapping(_, MappingPhase::Key, _) => Event::MappingEnd,
488 };
489 self.queue.push_back((ev, zero_span(pos)));
490 // After closing any collection, advance the parent mapping (if in
491 // Value phase) to Key phase — the just-closed collection was its value.
492 if let Some(CollectionEntry::Mapping(_, phase, _)) = self.coll_stack.last_mut() {
493 if *phase == MappingPhase::Value {
494 *phase = MappingPhase::Key;
495 }
496 }
497 }
498 }
499
500 /// Check whether the next available line is a block-sequence entry
501 /// indicator (`-` followed by space, tab, or end-of-content).
502 ///
503 /// Returns `(dash_indent, dash_pos)` where:
504 /// - `dash_indent` is the effective document column of the `-`.
505 /// - `dash_pos` is the absolute [`Pos`] of the `-` character.
506 fn peek_sequence_entry(&self) -> Option<(usize, Pos)> {
507 let line = self.lexer.peek_next_line()?;
508 let dash_indent = line.indent;
509 let trimmed = line.content.trim_start_matches(' ');
510
511 if !trimmed.starts_with('-') {
512 return None;
513 }
514 let after_dash = &trimmed[1..];
515 let is_entry =
516 after_dash.is_empty() || after_dash.starts_with(' ') || after_dash.starts_with('\t');
517 if !is_entry {
518 return None;
519 }
520
521 let leading_spaces = line.content.len() - trimmed.len();
522 let dash_pos = Pos {
523 byte_offset: line.pos.byte_offset + leading_spaces,
524 char_offset: line.pos.char_offset + leading_spaces,
525 line: line.pos.line,
526 column: line.pos.column + leading_spaces,
527 };
528 Some((dash_indent, dash_pos))
529 }
530
531 /// Check whether the next available line looks like an implicit mapping
532 /// key: a non-empty line whose plain-scalar content is followed by `: `
533 /// (colon + space) or `:\n` (colon at end-of-line) or `:\t`.
534 ///
535 /// Also recognises the explicit key indicator `? ` at the start of a line.
536 ///
537 /// Returns `(key_indent, key_pos)` on success, where `key_indent` is the
538 /// document column of the first character of the key (or `?` indicator),
539 /// and `key_pos` is its absolute [`Pos`].
540 fn peek_mapping_entry(&self) -> Option<(usize, Pos)> {
541 let line = self.lexer.peek_next_line()?;
542 let key_indent = line.indent;
543
544 let leading_spaces = line.content.len() - line.content.trim_start_matches(' ').len();
545 let trimmed = &line.content[leading_spaces..];
546
547 if trimmed.is_empty() {
548 return None;
549 }
550
551 let key_pos = Pos {
552 byte_offset: line.pos.byte_offset + leading_spaces,
553 char_offset: line.pos.char_offset + leading_spaces,
554 line: line.pos.line,
555 column: line.pos.column + leading_spaces,
556 };
557
558 // Explicit key indicator: `? ` or `?` at EOL.
559 if let Some(after_q) = trimmed.strip_prefix('?') {
560 if after_q.is_empty()
561 || after_q.starts_with(' ')
562 || after_q.starts_with('\t')
563 || after_q.starts_with('\n')
564 || after_q.starts_with('\r')
565 {
566 return Some((key_indent, key_pos));
567 }
568 }
569
570 // Implicit key: line contains `: ` or ends with `:`.
571 // We scan the plain-scalar portion of the line for the value indicator.
572 if is_implicit_mapping_line(trimmed) {
573 return Some((key_indent, key_pos));
574 }
575
576 None
577 }
578
579 /// Try to consume a scalar from the current lexer position.
580 ///
581 /// `plain_parent_indent` — the indent of the current line; plain scalar
582 /// continuation stops when the next line is less-indented than this.
583 ///
584 /// `block_parent_indent` — the indent of the enclosing block context;
585 /// block scalars collect content that is more indented than this value.
586 ///
587 /// Consumes `self.pending_anchor` and attaches it to the emitted scalar.
588 fn try_consume_scalar(
589 &mut self,
590 plain_parent_indent: usize,
591 block_parent_indent: usize,
592 ) -> Result<Option<(Event<'input>, Span)>, Error> {
593 if let Some(result) = self
594 .lexer
595 .try_consume_literal_block_scalar(block_parent_indent)
596 {
597 let (value, chomp, span) = result?;
598 return Ok(Some((
599 Event::Scalar {
600 value,
601 style: ScalarStyle::Literal(chomp),
602 anchor: self.pending_anchor.take(),
603 tag: self.pending_tag.take(),
604 },
605 span,
606 )));
607 }
608 if let Some(result) = self
609 .lexer
610 .try_consume_folded_block_scalar(block_parent_indent)
611 {
612 let (value, chomp, span) = result?;
613 return Ok(Some((
614 Event::Scalar {
615 value,
616 style: ScalarStyle::Folded(chomp),
617 anchor: self.pending_anchor.take(),
618 tag: self.pending_tag.take(),
619 },
620 span,
621 )));
622 }
623 if let Some((value, span)) = self.lexer.try_consume_single_quoted(plain_parent_indent)? {
624 return Ok(Some((
625 Event::Scalar {
626 value,
627 style: ScalarStyle::SingleQuoted,
628 anchor: self.pending_anchor.take(),
629 tag: self.pending_tag.take(),
630 },
631 span,
632 )));
633 }
634 // Pass Some(parent_indent) when inside a block collection so
635 // collect_double_quoted_continuations can validate continuation-line
636 // indentation (YAML 1.2 §7.3.1). At document root (coll_stack empty)
637 // there is no enclosing block, so no indent constraint: pass None.
638 let dq_block_indent = if self.coll_stack.is_empty() {
639 None
640 } else {
641 Some(plain_parent_indent)
642 };
643 if let Some((value, span)) = self.lexer.try_consume_double_quoted(dq_block_indent)? {
644 // In block context, after a double-quoted scalar closes, the only
645 // valid trailing content is optional whitespace followed by an
646 // optional comment (with mandatory preceding whitespace before `#`).
647 // Non-comment, non-whitespace content is an error.
648 if let Some((tail, tail_pos)) = self.lexer.pending_multiline_tail.take() {
649 let first_non_ws = tail.trim_start_matches([' ', '\t']);
650 if !first_non_ws.is_empty() {
651 let ws_len = tail.len() - first_non_ws.len();
652 if first_non_ws.starts_with('#') && ws_len == 0 {
653 // `#` immediately after closing quote — not a comment.
654 self.failed = true;
655 return Err(Error {
656 pos: tail_pos,
657 message: "comment requires at least one space before '#'".into(),
658 });
659 } else if !first_non_ws.starts_with('#') {
660 // Non-comment content after quoted scalar.
661 self.failed = true;
662 return Err(Error {
663 pos: tail_pos,
664 message: "unexpected content after quoted scalar".into(),
665 });
666 }
667 // Valid comment: discard (the comment event is not emitted
668 // in block context here; it will be picked up by drain_trailing_comment
669 // in the normal flow).
670 }
671 }
672 return Ok(Some((
673 Event::Scalar {
674 value,
675 style: ScalarStyle::DoubleQuoted,
676 anchor: self.pending_anchor.take(),
677 tag: self.pending_tag.take(),
678 },
679 span,
680 )));
681 }
682 if let Some((value, span)) = self.lexer.try_consume_plain_scalar(plain_parent_indent) {
683 // Check for invalid content in the suffix (e.g. NUL or mid-stream
684 // BOM that stopped the scanner but is not valid at this position).
685 if let Some(e) = self.lexer.plain_scalar_suffix_error.take() {
686 return Err(e);
687 }
688 return Ok(Some((
689 Event::Scalar {
690 value,
691 style: ScalarStyle::Plain,
692 anchor: self.pending_anchor.take(),
693 tag: self.pending_tag.take(),
694 },
695 span,
696 )));
697 }
698 Ok(None)
699 }
700
701 /// Consume the leading `-` indicator from the current line and (if
702 /// present) prepend a synthetic line for the inline content.
703 ///
704 /// Returns `true` if inline content was found and prepended.
705 fn consume_sequence_dash(&mut self, dash_indent: usize) -> bool {
706 // SAFETY: caller verified via peek_sequence_entry — the line exists.
707 let Some(line) = self.lexer.peek_next_line() else {
708 unreachable!("consume_sequence_dash called without a pending line")
709 };
710
711 let content = line.content;
712 let after_spaces = content.trim_start_matches(' ');
713 debug_assert!(
714 after_spaces.starts_with('-'),
715 "sequence dash not at expected position"
716 );
717 let rest_of_line = &after_spaces[1..];
718 let inline = rest_of_line.trim_start_matches([' ', '\t']);
719 let had_inline = !inline.is_empty();
720
721 if had_inline {
722 let leading_spaces = content.len() - after_spaces.len();
723 let spaces_after_dash = rest_of_line.len() - inline.len();
724 let offset_from_dash = 1 + spaces_after_dash;
725 let total_offset = leading_spaces + offset_from_dash;
726 let inline_col = dash_indent + offset_from_dash;
727 let inline_pos = Pos {
728 byte_offset: line.pos.byte_offset + total_offset,
729 char_offset: line.pos.char_offset + total_offset,
730 line: line.pos.line,
731 column: line.pos.column + total_offset,
732 };
733 let synthetic = Line {
734 content: inline,
735 offset: inline_pos.byte_offset,
736 indent: inline_col,
737 break_type: line.break_type,
738 pos: inline_pos,
739 };
740 self.lexer.consume_line();
741 self.lexer.prepend_inline_line(synthetic);
742 } else {
743 self.lexer.consume_line();
744 }
745
746 had_inline
747 }
748
749 /// Consume the current mapping-entry line.
750 ///
751 /// Handles both forms:
752 /// - **Explicit key** (`? key`): consume the `?` indicator line, extract
753 /// any inline key content and prepend a synthetic line for it.
754 /// - **Implicit key** (`key: value`): split the line at the `: ` / `:\n`
755 /// boundary. Return the key as a pre-extracted slice so the caller can
756 /// emit it as a `Scalar` event directly (bypassing the plain-scalar
757 /// continuation logic). Prepend the value portion (if non-empty) as a
758 /// synthetic line.
759 ///
760 /// Returns a `ConsumedMapping` describing what was found.
761 #[allow(clippy::too_many_lines)]
762 fn consume_mapping_entry(&mut self, key_indent: usize) -> ConsumedMapping<'input> {
763 // SAFETY: caller verified via peek_mapping_entry — the line exists.
764 let Some(line) = self.lexer.peek_next_line() else {
765 unreachable!("consume_mapping_entry called without a pending line")
766 };
767
768 // Extract all data from the borrowed line before any mutable lexer calls.
769 // `content` is `'input`-lived (borrows the original input string, not
770 // the lexer's internal buffer), so it remains valid after consume_line().
771 let content: &'input str = line.content;
772 let line_pos = line.pos;
773 let line_break_type = line.break_type;
774
775 let leading_spaces = content.len() - content.trim_start_matches(' ').len();
776 let trimmed = &content[leading_spaces..];
777
778 // --- Explicit key: `? ` or `?` at EOL ---
779 //
780 // The explicit key indicator is `?` followed by whitespace or end of
781 // line (YAML 1.2 §8.2.2). A `?` followed by a non-whitespace character
782 // (e.g. `?foo: val`) is NOT an explicit key — `?foo` is an implicit key
783 // that starts with `?`, just like `?foo: val` being a mapping entry where
784 // the key is the plain scalar `?foo`. This check must mirror the
785 // condition in peek_mapping_entry to keep consume and peek consistent.
786 if let Some(after_q) = trimmed.strip_prefix('?') {
787 let is_explicit_key = after_q.is_empty()
788 || after_q.starts_with(' ')
789 || after_q.starts_with('\t')
790 || after_q.starts_with('\n')
791 || after_q.starts_with('\r');
792 if is_explicit_key {
793 let inline = after_q.trim_start_matches([' ', '\t']);
794 // A trailing comment (`# ...`) is not key content — treat as
795 // if nothing followed the `?` indicator.
796 let had_key_inline = !inline.is_empty() && !inline.starts_with('#');
797
798 if had_key_inline {
799 // Offset from line start to inline key content.
800 let spaces_after_q = after_q.len() - inline.len();
801 let total_offset = leading_spaces + 1 + spaces_after_q;
802 let inline_col = key_indent + 1 + spaces_after_q;
803 let inline_pos = Pos {
804 byte_offset: line_pos.byte_offset + total_offset,
805 char_offset: line_pos.char_offset + total_offset,
806 line: line_pos.line,
807 column: line_pos.column + total_offset,
808 };
809 let synthetic = Line {
810 content: inline,
811 offset: inline_pos.byte_offset,
812 indent: inline_col,
813 break_type: line_break_type,
814 pos: inline_pos,
815 };
816 self.lexer.consume_line();
817 self.lexer.prepend_inline_line(synthetic);
818 } else {
819 self.lexer.consume_line();
820 }
821 return ConsumedMapping::ExplicitKey { had_key_inline };
822 }
823 }
824
825 // --- Implicit key: `key: value` or `key:` ---
826 // Find the `: ` (or `:\t` or `:\n` or `:` at EOL) boundary.
827 // SAFETY: peek_mapping_entry already confirmed this line is a mapping
828 // entry, so find_value_indicator_offset will return Some.
829 let Some(colon_offset) = find_value_indicator_offset(trimmed) else {
830 unreachable!("consume_mapping_entry: implicit key line has no value indicator")
831 };
832
833 let key_content = trimmed[..colon_offset].trim_end_matches([' ', '\t']);
834 let after_colon = &trimmed[colon_offset + 1..]; // skip ':'
835 let value_content = after_colon.trim_start_matches([' ', '\t']);
836
837 // Key span: starts at the first non-space character.
838 let key_start_pos = Pos {
839 byte_offset: line_pos.byte_offset + leading_spaces,
840 char_offset: line_pos.char_offset + leading_spaces,
841 line: line_pos.line,
842 column: line_pos.column + leading_spaces,
843 };
844 let key_end_pos = {
845 let mut p = key_start_pos;
846 for ch in key_content.chars() {
847 p = p.advance(ch);
848 }
849 p
850 };
851 let key_span = Span {
852 start: key_start_pos,
853 end: key_end_pos,
854 };
855
856 // Compute position of value content (after `: ` / `:\t`).
857 let spaces_after_colon = after_colon.len() - value_content.len();
858 let value_offset_in_trimmed = colon_offset + 1 + spaces_after_colon;
859 let value_col = key_indent + value_offset_in_trimmed;
860 let value_pos = Pos {
861 byte_offset: line_pos.byte_offset + leading_spaces + value_offset_in_trimmed,
862 char_offset: line_pos.char_offset + leading_spaces + value_offset_in_trimmed,
863 line: line_pos.line,
864 column: line_pos.column + leading_spaces + value_offset_in_trimmed,
865 };
866
867 // Detect whether the key is a quoted scalar. `key_content` already
868 // has its outer whitespace stripped; if it starts with `'` or `"` the
869 // key is quoted and must be decoded rather than emitted as Plain.
870 let key_is_quoted = matches!(key_content.as_bytes().first(), Some(b'"' | b'\''));
871
872 // Consume the physical line, then (if inline value content exists)
873 // prepend one synthetic line for the value. The key is returned
874 // directly in the ConsumedMapping variant — not via a synthetic line —
875 // so that the caller can push a Scalar event without routing through
876 // try_consume_plain_scalar (which would incorrectly treat the value
877 // synthetic line as a plain-scalar continuation).
878 self.lexer.consume_line();
879
880 // If the key is quoted, decode it now using the lexer's existing
881 // quoted-scalar methods. We prepend a synthetic line containing only
882 // the key text (including the surrounding quote characters) so the
883 // method can parse it normally, then discard the synthetic line.
884 //
885 // libfyaml (fy-parse.c, fy_attach_comments_if_any / token scanner):
886 // all scalar tokens — quoted or plain — flow through the same token
887 // queue; the *scanner* decodes the scalar at the token level before
888 // the parser ever sees it. We replicate that by decoding quoted keys
889 // here, at the point where we know the key is quoted.
890 let (decoded_key, key_style) = if key_is_quoted {
891 let key_synthetic = Line {
892 content: key_content,
893 offset: key_start_pos.byte_offset,
894 indent: leading_spaces,
895 break_type: line_break_type,
896 pos: key_start_pos,
897 };
898 self.lexer.prepend_inline_line(key_synthetic);
899
900 if key_content.starts_with('\'') {
901 match self.lexer.try_consume_single_quoted(0) {
902 Ok(Some((value, _))) => (value, ScalarStyle::SingleQuoted),
903 Ok(None) => {
904 return ConsumedMapping::QuotedKeyError {
905 pos: key_start_pos,
906 message: "single-quoted key could not be parsed".into(),
907 };
908 }
909 Err(e) => {
910 return ConsumedMapping::QuotedKeyError {
911 pos: e.pos,
912 message: e.message,
913 };
914 }
915 }
916 } else {
917 match self.lexer.try_consume_double_quoted(None) {
918 Ok(Some((value, _))) => (value, ScalarStyle::DoubleQuoted),
919 Ok(None) => {
920 return ConsumedMapping::QuotedKeyError {
921 pos: key_start_pos,
922 message: "double-quoted key could not be parsed".into(),
923 };
924 }
925 Err(e) => {
926 return ConsumedMapping::QuotedKeyError {
927 pos: e.pos,
928 message: e.message,
929 };
930 }
931 }
932 }
933 } else {
934 (std::borrow::Cow::Borrowed(key_content), ScalarStyle::Plain)
935 };
936
937 if !value_content.is_empty() {
938 // Detect illegal inline implicit mapping: if the inline value itself
939 // contains a value indicator (`:` followed by space/EOL), this is an
940 // attempt to start a block mapping inline (e.g. `a: b: c: d` or
941 // `a: 'b': c`). Block mappings cannot appear inline — their entries
942 // must start on new lines. Return an error before prepending the value.
943 if find_value_indicator_offset(value_content).is_some() {
944 return ConsumedMapping::InlineImplicitMappingError { pos: value_pos };
945 }
946
947 // Detect illegal inline block sequence: `key: - item` is invalid
948 // because a block sequence indicator (`-`) cannot appear as an
949 // inline value of a block mapping entry — the sequence must start
950 // on a new line. Only `- `, `-\t`, or bare `-` (at EOL) qualify
951 // as sequence indicators.
952 {
953 let after_dash = value_content.strip_prefix('-');
954 let is_seq_indicator = after_dash.is_some_and(|rest| {
955 rest.is_empty() || rest.starts_with(' ') || rest.starts_with('\t')
956 });
957 if is_seq_indicator {
958 return ConsumedMapping::InlineImplicitMappingError { pos: value_pos };
959 }
960 }
961
962 let value_synthetic = Line {
963 content: value_content,
964 offset: value_pos.byte_offset,
965 indent: value_col,
966 break_type: line_break_type,
967 pos: value_pos,
968 };
969 self.lexer.prepend_inline_line(value_synthetic);
970 }
971
972 ConsumedMapping::ImplicitKey {
973 key_value: decoded_key,
974 key_style,
975 key_span,
976 }
977 }
978
979 /// After emitting a key scalar, flip the innermost mapping to `Value` phase.
980 ///
981 /// **Call-site invariant:** the top of `coll_stack` must be a
982 /// `CollectionEntry::Mapping`. This function is only called from
983 /// mapping-emission paths (`handle_mapping_entry`, explicit-key handling)
984 /// where the caller has already verified that a mapping is the active
985 /// collection. Do **not** call this after emitting a scalar that may be a
986 /// sequence item — use `tick_mapping_phase_after_scalar` instead, which
987 /// stops at a Sequence entry and handles the ambiguity correctly.
988 fn advance_mapping_to_value(&mut self) {
989 debug_assert!(
990 matches!(self.coll_stack.last(), Some(CollectionEntry::Mapping(..))),
991 "advance_mapping_to_value called but top of coll_stack is not a Mapping"
992 );
993 // The explicit key's content has been processed; clear the pending flag.
994 self.explicit_key_pending = false;
995 for entry in self.coll_stack.iter_mut().rev() {
996 if let CollectionEntry::Mapping(_, phase, has_had_value) = entry {
997 *phase = MappingPhase::Value;
998 *has_had_value = true;
999 return;
1000 }
1001 }
1002 }
1003
1004 /// Drain any pending trailing comment from the lexer into the event queue.
1005 ///
1006 /// Called after emitting a scalar event. If a trailing comment was
1007 /// detected on the scalar's line (e.g. `foo # comment`), it is pushed to
1008 /// `self.queue` as `Event::Comment`.
1009 ///
1010 /// Trailing comments are bounded by the physical line length, which is
1011 /// itself bounded by the total input size. No separate length limit is
1012 /// applied here; the security constraint (`MAX_COMMENT_LEN`) applies to
1013 /// standalone comment lines (scanned in [`Self::skip_and_collect_comments_in_doc`]
1014 /// and [`Self::skip_and_collect_comments_between_docs`]).
1015 fn drain_trailing_comment(&mut self) {
1016 if let Some((text, span)) = self.lexer.trailing_comment.take() {
1017 self.queue.push_back((Event::Comment { text }, span));
1018 }
1019 }
1020
1021 /// After emitting a value scalar/collection, flip the innermost mapping
1022 /// back to `Key` phase.
1023 ///
1024 /// **Call-site invariant:** the top of `coll_stack` must be a
1025 /// `CollectionEntry::Mapping`. This function is only called from
1026 /// mapping-emission paths where the caller has already verified that a
1027 /// mapping is the active collection. Do **not** call this after emitting a
1028 /// scalar that may be a sequence item — use `tick_mapping_phase_after_scalar`
1029 /// instead.
1030 fn advance_mapping_to_key(&mut self) {
1031 debug_assert!(
1032 matches!(self.coll_stack.last(), Some(CollectionEntry::Mapping(..))),
1033 "advance_mapping_to_key called but top of coll_stack is not a Mapping"
1034 );
1035 for entry in self.coll_stack.iter_mut().rev() {
1036 if let CollectionEntry::Mapping(_, phase, _) = entry {
1037 *phase = MappingPhase::Key;
1038 return;
1039 }
1040 }
1041 }
1042
1043 /// Returns the minimum column at which a standalone block-node property
1044 /// (anchor or tag on its own line) is valid in the current context.
1045 ///
1046 /// - Mapping in Value phase at indent `n`: the value node must be at col > n.
1047 /// - Sequence at indent `n`: item content must be at col > n.
1048 /// - Mapping in Key phase at indent `n`: a key at col `n` is valid.
1049 /// - Root (empty stack): any column is valid.
1050 fn min_standalone_property_indent(&self) -> usize {
1051 match self.coll_stack.last() {
1052 Some(
1053 CollectionEntry::Mapping(n, MappingPhase::Value, _)
1054 | CollectionEntry::Sequence(n, _),
1055 ) => n + 1,
1056 Some(CollectionEntry::Mapping(n, MappingPhase::Key, _)) => *n,
1057 None => 0,
1058 }
1059 }
1060}
1061
/// Result of consuming a mapping-entry line.
///
/// Produced by `consume_mapping_entry`; the two error variants carry the
/// information the caller needs to report the failure.
enum ConsumedMapping<'input> {
    /// Explicit key (`? key`).
    ExplicitKey {
        /// Whether there was key content on the same line as `?`.
        /// A trailing comment after `?` does not count as key content.
        had_key_inline: bool,
    },
    /// Implicit key (`key: value`).
    ///
    /// The key content and span are pre-extracted so the caller can push the
    /// key `Scalar` event directly without routing it through
    /// `try_consume_plain_scalar` — which would treat the adjacent value
    /// synthetic line as a plain-scalar continuation.
    ImplicitKey {
        /// The decoded key value (may be owned if escapes were resolved).
        key_value: std::borrow::Cow<'input, str>,
        /// The scalar style of the key (`Plain`, `SingleQuoted`, or `DoubleQuoted`).
        key_style: ScalarStyle,
        /// Span covering the key text (including quotes if quoted).
        key_span: Span,
    },
    /// The inline value of an implicit key itself contained a value indicator,
    /// making it an illegal inline block mapping (e.g. `a: b: c` or `a: 'b': c`).
    /// The same variant is returned when the inline value starts with a block
    /// sequence indicator (e.g. `key: - item`).
    /// The error position points to the start of the inline value content.
    InlineImplicitMappingError { pos: Pos },
    /// A quoted implicit key could not be decoded (e.g. bad escape sequence).
    ///
    /// `pos` is the error position reported by the lexer, or the start of the
    /// key when the lexer returned no scalar; `message` describes the failure.
    QuotedKeyError { pos: Pos, message: String },
}
1090
1091/// True when `trimmed` (content after stripping leading spaces) represents
1092/// an implicit mapping key: it contains `: `, `:\t`, or ends with `:`.
1093fn is_implicit_mapping_line(trimmed: &str) -> bool {
1094 find_value_indicator_offset(trimmed).is_some()
1095}
1096
1097/// Returns `true` when `s` is a block structure indicator that cannot appear
1098/// at tab-based indentation: a block sequence entry (`-` followed by
1099/// whitespace or EOL), an explicit key marker (`?` followed by whitespace or
1100/// EOL), or an implicit mapping key (contains a `:` value indicator).
1101///
1102/// Used to detect tab-as-block-indentation violations (YAML 1.2 §6.1).
1103fn is_tab_indented_block_indicator(s: &str) -> bool {
1104 s.strip_prefix(['-', '?']).map_or_else(
1105 || is_implicit_mapping_line(s),
1106 |after| after.is_empty() || after.starts_with([' ', '\t']),
1107 )
1108}
1109
1110/// Like `find_value_indicator_offset`, but skips any leading anchor (`&name`)
1111/// and/or tag (`!tag`) tokens before checking for a mapping key indicator.
1112///
1113/// This handles cases like `&anchor key: value` or `!!str &a key: value`
1114/// where the actual key content starts after the properties.
1115fn inline_contains_mapping_key(inline: &str) -> bool {
1116 if find_value_indicator_offset(inline).is_some() {
1117 return true;
1118 }
1119 // Skip leading anchor/tag tokens and retry
1120 let mut s = inline;
1121 loop {
1122 let trimmed = s.trim_start_matches([' ', '\t']);
1123 if let Some(after_amp) = trimmed.strip_prefix('&') {
1124 // skip anchor name (non-space chars)
1125 let name_end = after_amp.find([' ', '\t']).unwrap_or(after_amp.len());
1126 s = &after_amp[name_end..];
1127 } else if trimmed.starts_with('!') {
1128 // skip tag token (non-space chars)
1129 let tag_end = trimmed.find([' ', '\t']).unwrap_or(trimmed.len());
1130 s = &trimmed[tag_end..];
1131 } else {
1132 break;
1133 }
1134 if find_value_indicator_offset(s.trim_start_matches([' ', '\t'])).is_some() {
1135 return true;
1136 }
1137 }
1138 false
1139}
1140
/// Locate the `:` value indicator in `trimmed` and return its byte offset,
/// or `None` when the line is not a mapping entry.
///
/// A `:` only counts as a value indicator when it is followed by a space,
/// tab, newline/CR, or the end of the string (YAML 1.2 §7.4); a `:` directly
/// followed by any other character belongs to a plain scalar.
///
/// Rules applied while scanning:
///
/// - A line whose first byte is a tab or one of `%`, `@`, `` ` ``, `,`, `[`,
///   `]`, `{`, `}`, `#`, `&`, `*`, `!`, `|`, `>` is rejected outright: such a
///   line cannot begin a plain scalar and so is not an implicit key.
/// - A `"` or `'` at offset 0 opens a quoted key; the quoted span (honouring
///   `\x` escapes for double quotes and `''` for single quotes) is skipped,
///   so a `:` inside it is ignored. Quotes anywhere else are ordinary
///   plain-scalar characters.
/// - An unquoted `#` preceded by a space or tab starts a comment (YAML 1.2
///   §6.6); nothing after it can be a value indicator.
fn find_value_indicator_offset(trimmed: &str) -> Option<usize> {
    // First bytes that can never start an implicit key. The tab is included
    // because tabs are not valid indentation.
    const NON_KEY_STARTS: &[u8] = b"\t%@`,[]{}#&*!|>";

    let bytes = trimmed.as_bytes();
    let first = bytes.first().copied();
    if first.is_some_and(|b| NON_KEY_STARTS.contains(&b)) {
        return None;
    }

    // Quoted spans are only recognised at the very start of the key, so they
    // are skipped up front; the scan below then begins after the closing quote.
    let mut cursor = 0;
    match first {
        Some(b'"') => {
            cursor = 1;
            loop {
                match bytes.get(cursor) {
                    None => break,
                    // Backslash escape: skip it together with the next byte.
                    Some(b'\\') => cursor += 2,
                    Some(b'"') => {
                        cursor += 1;
                        break;
                    }
                    Some(_) => cursor += 1,
                }
            }
        }
        Some(b'\'') => {
            cursor = 1;
            loop {
                match (bytes.get(cursor), bytes.get(cursor + 1)) {
                    (None, _) => break,
                    // `''` is an escaped quote inside the span.
                    (Some(b'\''), Some(b'\'')) => cursor += 2,
                    // A lone `'` closes the span.
                    (Some(b'\''), _) => {
                        cursor += 1;
                        break;
                    }
                    (Some(_), _) => cursor += 1,
                }
            }
        }
        _ => {}
    }

    // A closing quote is not whitespace, so the flag starts out false.
    let mut after_blank = false;
    while let Some(&byte) = bytes.get(cursor) {
        match byte {
            // Comment start: any later `:` is inside the comment.
            b'#' if cursor == 0 || after_blank => return None,
            b':' if matches!(
                bytes.get(cursor + 1),
                None | Some(b' ' | b'\t' | b'\n' | b'\r')
            ) =>
            {
                return Some(cursor);
            }
            _ => {}
        }

        after_blank = matches!(byte, b' ' | b'\t');

        // Step over the whole character, using the UTF-8 lead byte to get
        // its encoded length.
        cursor += match byte {
            0x00..=0x7F => 1,
            lead if lead & 0xE0 == 0xC0 => 2,
            lead if lead & 0xF0 == 0xE0 => 3,
            _ => 4,
        };
    }
    None
}
1261
1262/// Scan an anchor name from `content`, returning the name slice.
1263///
1264/// `content` must begin immediately after the `&` or `*` indicator — the first
1265/// character is the first character of the name. The name continues until
1266/// a character that is not `ns-anchor-char` (i.e., whitespace, flow indicator,
1267/// or end of content).
1268///
1269/// Returns `Ok(name)` where `name` is a non-empty borrowed slice of `content`.
1270/// Returns `Err` if:
1271/// - The name would be empty (first character is not `ns-anchor-char`).
1272/// - The name exceeds [`MAX_ANCHOR_NAME_BYTES`] bytes.
1273///
1274/// The caller is responsible for providing the correct [`Pos`] for error
1275/// reporting.
1276fn scan_anchor_name(content: &str, indicator_pos: Pos) -> Result<&str, Error> {
1277 use crate::chars::is_ns_anchor_char;
1278 let end = content
1279 .char_indices()
1280 .take_while(|&(_, ch)| is_ns_anchor_char(ch))
1281 .last()
1282 .map_or(0, |(i, ch)| i + ch.len_utf8());
1283 if end == 0 {
1284 return Err(Error {
1285 pos: indicator_pos,
1286 message: "anchor name must not be empty".into(),
1287 });
1288 }
1289 if end > MAX_ANCHOR_NAME_BYTES {
1290 return Err(Error {
1291 pos: indicator_pos,
1292 message: format!("anchor name exceeds maximum length of {MAX_ANCHOR_NAME_BYTES} bytes"),
1293 });
1294 }
1295 Ok(&content[..end])
1296}
1297
1298/// Scan a tag from `content`, returning the tag slice and its byte length in `content`.
1299///
1300/// `content` must begin immediately after the `!` indicator. The function
1301/// handles all four YAML 1.2 §6.8.1 tag forms:
1302///
1303/// - **Verbatim** `!<URI>` → `content` starts with `<`; returns the URI
1304/// (between the angle brackets) and its length including the `<` and `>`.
1305/// - **Primary shorthand** `!!suffix` → `content` starts with `!`; returns
1306/// the full `!!suffix` slice (including the leading `!` that is part of
1307/// `content`).
1308/// - **Named-handle shorthand** `!handle!suffix` → returns the full slice
1309/// `!handle!suffix` (the leading `!` of `handle` is in `content`).
1310/// - **Secondary shorthand** `!suffix` → `content` starts with a tag-char;
1311/// returns `!suffix` via a slice that includes one byte before `content`
1312/// (the caller provides `full_tag_start` for this).
1313/// - **Non-specific** `!` alone → `content` is empty or starts with a
1314/// separator; returns `"!"` as a one-byte slice of the `!` indicator.
1315///
1316/// # Parameters
1317///
1318/// - `content`: the input slice immediately after the `!` indicator character.
1319/// - `tag_start`: the input slice starting at the `!` (one byte before `content`).
1320/// - `indicator_pos`: the [`Pos`] of the `!` indicator (for error reporting).
1321///
1322/// # Returns
1323///
1324/// `Ok((tag_slice, advance_past_exclamation))` where:
1325/// - `tag_slice` is the borrowed slice to store in `pending_tag`.
1326/// - `advance_past_exclamation` is the number of bytes to advance past the
1327/// `!` indicator (i.e. the advance for the entire tag token, not counting
1328/// the `!` itself).
1329///
1330/// Returns `Err` on invalid verbatim tags (unmatched `<`, empty URI, control
1331/// character in URI) or when the tag length exceeds [`MAX_TAG_LEN`].
1332fn scan_tag<'i>(
1333 content: &'i str,
1334 tag_start: &'i str,
1335 indicator_pos: Pos,
1336) -> Result<(&'i str, usize), Error> {
1337 // ---- Verbatim tag: `!<URI>` ----
1338 if let Some(after_open) = content.strip_prefix('<') {
1339 // Find the closing `>`.
1340 let close = after_open.find('>').ok_or_else(|| Error {
1341 pos: indicator_pos,
1342 message: "verbatim tag missing closing '>'".into(),
1343 })?;
1344 let uri = &after_open[..close];
1345 if uri.is_empty() {
1346 return Err(Error {
1347 pos: indicator_pos,
1348 message: "verbatim tag URI must not be empty".into(),
1349 });
1350 }
1351 if uri.len() > MAX_TAG_LEN {
1352 return Err(Error {
1353 pos: indicator_pos,
1354 message: format!("verbatim tag URI exceeds maximum length of {MAX_TAG_LEN} bytes"),
1355 });
1356 }
1357 // Reject control characters in the URI.
1358 for ch in uri.chars() {
1359 if ch < '\x20' || ch == '\x7F' {
1360 return Err(Error {
1361 pos: indicator_pos,
1362 message: format!("verbatim tag URI contains invalid character {ch:?}"),
1363 });
1364 }
1365 }
1366 // advance = 1 (for '<') + uri.len() + 1 (for '>') bytes past the `!`
1367 let advance = 1 + uri.len() + 1;
1368 return Ok((uri, advance));
1369 }
1370
1371 // ---- Primary handle: `!!suffix` ----
1372 if let Some(suffix) = content.strip_prefix('!') {
1373 // suffix starts after the second `!`
1374 let suffix_bytes = scan_tag_suffix(suffix);
1375 // `!!` alone with no suffix is valid (empty suffix shorthand).
1376 if suffix_bytes > MAX_TAG_LEN {
1377 return Err(Error {
1378 pos: indicator_pos,
1379 message: format!("tag exceeds maximum length of {MAX_TAG_LEN} bytes"),
1380 });
1381 }
1382 // tag_slice = `!!suffix` — one byte back for the first `!` (in `tag_start`)
1383 // plus `!` in content plus suffix.
1384 let tag_slice = &tag_start[..2 + suffix_bytes]; // `!` + `!` + suffix
1385 let advance = 1 + suffix_bytes; // past the `!` in content and suffix
1386 return Ok((tag_slice, advance));
1387 }
1388
1389 // ---- Non-specific tag: bare `!` (content is empty or starts with non-tag-char) ----
1390 // A `%` alone (without two following hex digits) also falls here via scan_tag_suffix.
1391 if scan_tag_suffix(content) == 0 {
1392 // The tag is just `!` — a one-byte slice from `tag_start`.
1393 let tag_slice = &tag_start[..1];
1394 return Ok((tag_slice, 0)); // 0 bytes advance past `!` (nothing follows the `!`)
1395 }
1396
1397 // ---- Named handle `!handle!suffix` or secondary handle `!suffix` ----
1398 // Scan tag chars until we hit a `!` (named handle delimiter) or non-tag-char.
1399 let mut end = 0;
1400 let mut found_inner_bang = false;
1401 for (i, ch) in content.char_indices() {
1402 if ch == '!' {
1403 // Named handle: `!handle!suffix` — scan the suffix after the inner `!`.
1404 found_inner_bang = true;
1405 end = i + 1; // include the `!`
1406 // Scan suffix chars (and %HH sequences) after the inner `!`.
1407 end += scan_tag_suffix(&content[i + 1..]);
1408 break;
1409 } else if is_tag_char(ch) {
1410 end = i + ch.len_utf8();
1411 } else if ch == '%' {
1412 // Percent-encoded sequence: %HH.
1413 let pct_len = scan_tag_suffix(&content[i..]);
1414 if pct_len == 0 {
1415 break; // bare `%` without two hex digits — stop
1416 }
1417 end = i + pct_len;
1418 } else {
1419 break;
1420 }
1421 }
1422
1423 if end == 0 && !found_inner_bang {
1424 // No tag chars at all (covered by non-specific check above, but defensive).
1425 let tag_slice = &tag_start[..1];
1426 return Ok((tag_slice, 0));
1427 }
1428
1429 if end > MAX_TAG_LEN {
1430 return Err(Error {
1431 pos: indicator_pos,
1432 message: format!("tag exceeds maximum length of {MAX_TAG_LEN} bytes"),
1433 });
1434 }
1435
1436 // tag_slice = `!` + content[..end] — includes the leading `!` from tag_start.
1437 let tag_slice = &tag_start[..=end];
1438 Ok((tag_slice, end))
1439}
1440
/// Tests whether `ch` is a single-character YAML 1.2 `ns-tag-char`.
///
/// The accepted set is closed: ASCII letters and digits plus
/// `- _ . ~ * ' ( ) # ; / ? : @ & = + $`. That is `ns-uri-char` without `!`
/// and without the flow indicators. `%` is deliberately absent — a
/// percent-encoded sequence (`%HH`) spans three characters and is recognised
/// by [`scan_tag_suffix`] instead.
const fn is_tag_char(ch: char) -> bool {
    matches!(
        ch,
        'a'..='z'
            | 'A'..='Z'
            | '0'..='9'
            | '-'
            | '_'
            | '.'
            | '~'
            | '*'
            | '\''
            | '('
            | ')'
            | '#'
            | ';'
            | '/'
            | '?'
            | ':'
            | '@'
            | '&'
            | '='
            | '+'
            | '$'
    )
}
1469
1470/// Returns the byte length of the valid tag suffix starting at `s`.
1471///
1472/// A tag suffix is a sequence of `ns-tag-char` characters and percent-encoded
1473/// `%HH` sequences (YAML 1.2 §6.8.1). Scanning stops at the first character
1474/// that does not satisfy either condition.
1475fn scan_tag_suffix(s: &str) -> usize {
1476 let bytes = s.as_bytes();
1477 let mut pos = 0;
1478 while pos < bytes.len() {
1479 // Percent-encoded sequence: `%` followed by exactly two hex digits.
1480 if bytes.get(pos) == Some(&b'%') {
1481 let h1 = bytes
1482 .get(pos + 1)
1483 .copied()
1484 .is_some_and(|b| b.is_ascii_hexdigit());
1485 let h2 = bytes
1486 .get(pos + 2)
1487 .copied()
1488 .is_some_and(|b| b.is_ascii_hexdigit());
1489 if h1 && h2 {
1490 pos += 3;
1491 continue;
1492 }
1493 break;
1494 }
1495 // Safe to decode the next char: all is_tag_char matches are ASCII,
1496 // so multi-byte UTF-8 chars will fail is_tag_char and stop the scan.
1497 let Some(ch) = s[pos..].chars().next() else {
1498 break;
1499 };
1500 if is_tag_char(ch) {
1501 pos += ch.len_utf8();
1502 } else {
1503 break;
1504 }
1505 }
1506 pos
1507}
1508
1509/// Build an empty plain scalar event.
1510const fn empty_scalar_event<'input>() -> Event<'input> {
1511 Event::Scalar {
1512 value: std::borrow::Cow::Borrowed(""),
1513 style: ScalarStyle::Plain,
1514 anchor: None,
1515 tag: None,
1516 }
1517}
1518
1519/// Build a span that covers exactly the 3-byte document marker at `marker_pos`.
1520const fn marker_span(marker_pos: Pos) -> Span {
1521 Span {
1522 start: marker_pos,
1523 end: Pos {
1524 byte_offset: marker_pos.byte_offset + 3,
1525 char_offset: marker_pos.char_offset + 3,
1526 line: marker_pos.line,
1527 column: marker_pos.column + 3,
1528 },
1529 }
1530}
1531
1532/// Build a zero-width span at `pos`.
1533const fn zero_span(pos: Pos) -> Span {
1534 Span {
1535 start: pos,
1536 end: pos,
1537 }
1538}
1539
/// Checks that `handle` has the shape of a YAML tag handle.
///
/// Accepted forms (YAML 1.2 productions [89]–[92]):
/// - `!` — primary tag handle
/// - `!!` — secondary tag handle
/// - `!<word-chars>!` — named tag handle; the part between the two `!` must
///   be non-empty and consist only of `[a-zA-Z0-9_-]`
fn is_valid_tag_handle(handle: &str) -> bool {
    if matches!(handle, "!" | "!!") {
        return true;
    }

    // Named handle: peel off the surrounding `!` pair and inspect the word.
    let Some(word) = handle
        .strip_prefix('!')
        .and_then(|rest| rest.strip_suffix('!'))
    else {
        return false;
    };
    // All accepted characters are ASCII, so a byte-wise check is sufficient.
    !word.is_empty()
        && word
            .bytes()
            .all(|b| b.is_ascii_alphanumeric() || matches!(b, b'-' | b'_'))
}
1561
1562impl<'input> EventIter<'input> {
1563 /// Consume blank lines, comment lines, and directive lines in `BetweenDocs`
1564 /// context.
1565 ///
1566 /// - Blank lines: silently consumed.
1567 /// - Comment lines: emitted as `Event::Comment` items into `self.queue`.
1568 /// - Directive lines (`%`-prefixed): parsed and accumulated into
1569 /// `self.directive_scope`.
1570 ///
1571 /// Returns `Err` on malformed directives, exceeded limits, or comment
1572 /// bodies exceeding `MAX_COMMENT_LEN`. Stops at the first non-blank,
1573 /// non-comment, non-directive line (i.e. `---`, `...`, or content).
1574 ///
1575 /// The caller is responsible for resetting `self.directive_scope` before
1576 /// entering the `BetweenDocs` state (at each document boundary transition).
1577 /// This function does NOT reset it — `step_between_docs` re-enters it on
1578 /// every comment yield, so resetting here would clobber directives parsed
1579 /// on earlier re-entries for the same document.
1580 fn consume_preamble_between_docs(&mut self) -> Result<(), Error> {
1581 loop {
1582 // Skip blank lines first.
1583 self.lexer.skip_blank_lines_between_docs();
1584
1585 // Collect comment lines.
1586 while self.lexer.is_comment_line() {
1587 match self.lexer.try_consume_comment(MAX_COMMENT_LEN) {
1588 Ok(Some((text, span))) => {
1589 self.queue.push_back((Event::Comment { text }, span));
1590 }
1591 Ok(None) => break,
1592 Err(e) => return Err(e),
1593 }
1594 self.lexer.skip_blank_lines_between_docs();
1595 }
1596
1597 // Parse directive lines.
1598 while self.lexer.is_directive_line() {
1599 let Some((content, dir_pos)) = self.lexer.try_consume_directive_line() else {
1600 break;
1601 };
1602 self.parse_directive(content, dir_pos)?;
1603 self.lexer.skip_blank_lines_between_docs();
1604 }
1605
1606 // After parsing directives, there may be more blank lines or comments.
1607 if !self.lexer.is_comment_line() && !self.lexer.is_directive_line() {
1608 return Ok(());
1609 }
1610 }
1611 }
1612
1613 /// Parse a single directive line and update `self.directive_scope`.
1614 ///
1615 /// `content` is the full line content starting with `%` (e.g. `"%YAML 1.2"`).
1616 /// `dir_pos` is the position of the `%` character.
1617 fn parse_directive(&mut self, content: &'input str, dir_pos: Pos) -> Result<(), Error> {
1618 // Enforce per-document directive count limit.
1619 if self.directive_scope.directive_count >= MAX_DIRECTIVES_PER_DOC {
1620 return Err(Error {
1621 pos: dir_pos,
1622 message: format!(
1623 "directive count exceeds maximum of {MAX_DIRECTIVES_PER_DOC} per document"
1624 ),
1625 });
1626 }
1627
1628 // `content` starts with `%`; the rest is `NAME[ params...]`.
1629 let after_percent = &content[1..];
1630
1631 // Determine directive name (up to first whitespace).
1632 let name_end = after_percent
1633 .find([' ', '\t'])
1634 .unwrap_or(after_percent.len());
1635 let name = &after_percent[..name_end];
1636 let rest = after_percent[name_end..].trim_start_matches([' ', '\t']);
1637
1638 match name {
1639 "YAML" => self.parse_yaml_directive(rest, dir_pos),
1640 "TAG" => self.parse_tag_directive(rest, dir_pos),
1641 _ => {
1642 // Reserved directive — silently ignore per YAML 1.2 spec.
1643 self.directive_scope.directive_count += 1;
1644 Ok(())
1645 }
1646 }
1647 }
1648
1649 /// Parse `%YAML major.minor` and store in directive scope.
1650 fn parse_yaml_directive(&mut self, params: &str, dir_pos: Pos) -> Result<(), Error> {
1651 if self.directive_scope.version.is_some() {
1652 return Err(Error {
1653 pos: dir_pos,
1654 message: "duplicate %YAML directive in the same document".into(),
1655 });
1656 }
1657
1658 // Parse `major.minor`.
1659 let dot = params.find('.').ok_or_else(|| Error {
1660 pos: dir_pos,
1661 message: format!("malformed %YAML directive: expected 'major.minor', got {params:?}"),
1662 })?;
1663 let major_str = ¶ms[..dot];
1664 let after_dot = ¶ms[dot + 1..];
1665 // Minor version ends at first whitespace or end of string.
1666 let minor_end = after_dot.find([' ', '\t']).unwrap_or(after_dot.len());
1667 let minor_str = &after_dot[..minor_end];
1668 // Anything after the minor version must be empty or a comment (# ...).
1669 let trailing = after_dot[minor_end..].trim_start_matches([' ', '\t']);
1670 if !trailing.is_empty() && !trailing.starts_with('#') {
1671 return Err(Error {
1672 pos: dir_pos,
1673 message: format!(
1674 "malformed %YAML directive: unexpected trailing content {trailing:?}"
1675 ),
1676 });
1677 }
1678
1679 let major = major_str.parse::<u8>().map_err(|_| Error {
1680 pos: dir_pos,
1681 message: format!("malformed %YAML major version: {major_str:?}"),
1682 })?;
1683 let minor = minor_str.parse::<u8>().map_err(|_| Error {
1684 pos: dir_pos,
1685 message: format!("malformed %YAML minor version: {minor_str:?}"),
1686 })?;
1687
1688 // Only major version 1 is accepted; 2+ is a hard error.
1689 if major != 1 {
1690 return Err(Error {
1691 pos: dir_pos,
1692 message: format!("unsupported YAML version {major}.{minor}: only 1.x is supported"),
1693 });
1694 }
1695
1696 self.directive_scope.version = Some((major, minor));
1697 self.directive_scope.directive_count += 1;
1698 Ok(())
1699 }
1700
1701 /// Parse `%TAG !handle! prefix` and store in directive scope.
1702 fn parse_tag_directive(&mut self, params: &'input str, dir_pos: Pos) -> Result<(), Error> {
1703 // Split on whitespace to get handle and prefix.
1704 let handle_end = params.find([' ', '\t']).ok_or_else(|| Error {
1705 pos: dir_pos,
1706 message: format!("malformed %TAG directive: expected 'handle prefix', got {params:?}"),
1707 })?;
1708 let handle = ¶ms[..handle_end];
1709 let prefix = params[handle_end..].trim_start_matches([' ', '\t']);
1710
1711 if prefix.is_empty() {
1712 return Err(Error {
1713 pos: dir_pos,
1714 message: "malformed %TAG directive: missing prefix".into(),
1715 });
1716 }
1717
1718 // Validate handle shape: must be `!`, `!!`, or `!<word-chars>!`
1719 // where word chars are ASCII alphanumeric, `-`, or `_`
1720 // (YAML 1.2 §6.8.1 productions [89]–[92]).
1721 if !is_valid_tag_handle(handle) {
1722 return Err(Error {
1723 pos: dir_pos,
1724 message: format!("malformed %TAG handle: {handle:?} is not a valid tag handle"),
1725 });
1726 }
1727
1728 // Validate handle length.
1729 if handle.len() > MAX_TAG_HANDLE_BYTES {
1730 return Err(Error {
1731 pos: dir_pos,
1732 message: format!(
1733 "tag handle exceeds maximum length of {MAX_TAG_HANDLE_BYTES} bytes"
1734 ),
1735 });
1736 }
1737
1738 // Validate prefix length.
1739 if prefix.len() > MAX_TAG_LEN {
1740 return Err(Error {
1741 pos: dir_pos,
1742 message: format!("tag prefix exceeds maximum length of {MAX_TAG_LEN} bytes"),
1743 });
1744 }
1745
1746 // Reject control characters in prefix.
1747 for ch in prefix.chars() {
1748 if (ch as u32) < 0x20 || ch == '\x7F' {
1749 return Err(Error {
1750 pos: dir_pos,
1751 message: format!("tag prefix contains invalid control character {ch:?}"),
1752 });
1753 }
1754 }
1755
1756 // Duplicate handle check.
1757 if self.directive_scope.tag_handles.contains_key(handle) {
1758 return Err(Error {
1759 pos: dir_pos,
1760 message: format!("duplicate %TAG directive for handle {handle:?}"),
1761 });
1762 }
1763
1764 self.directive_scope
1765 .tag_handles
1766 .insert(handle.to_owned(), prefix.to_owned());
1767 self.directive_scope.directive_count += 1;
1768 Ok(())
1769 }
1770
1771 /// Skip blank lines while collecting any comment lines encountered as
1772 /// `Event::Comment` items pushed to `self.queue`.
1773 ///
1774 /// Used in `InDocument` context.
1775 /// Returns `Err` if a comment body exceeds `MAX_COMMENT_LEN`.
1776 fn skip_and_collect_comments_in_doc(&mut self) -> Result<(), Error> {
1777 loop {
1778 // Skip truly blank lines (not comments).
1779 self.lexer.skip_empty_lines();
1780 // Collect any comment lines.
1781 if !self.lexer.is_comment_line() {
1782 return Ok(());
1783 }
1784 while self.lexer.is_comment_line() {
1785 match self.lexer.try_consume_comment(MAX_COMMENT_LEN) {
1786 Ok(Some((text, span))) => {
1787 self.queue.push_back((Event::Comment { text }, span));
1788 }
1789 Ok(None) => break,
1790 Err(e) => return Err(e),
1791 }
1792 }
1793 // Loop to skip any blank lines that follow the comments.
1794 }
1795 }
1796
1797 /// Handle one iteration step in the `BetweenDocs` state.
1798 fn step_between_docs(&mut self) -> StepResult<'input> {
1799 match self.consume_preamble_between_docs() {
1800 Ok(()) => {}
1801 Err(e) => {
1802 self.failed = true;
1803 return StepResult::Yield(Err(e));
1804 }
1805 }
1806 // If comments were queued, drain them before checking document state.
1807 if !self.queue.is_empty() {
1808 return StepResult::Continue;
1809 }
1810
1811 if self.lexer.at_eof() {
1812 // Per YAML 1.2 §9.2, directives require a `---` marker.
1813 // A directive followed by EOF (no `---`) is a spec violation.
1814 if self.directive_scope.directive_count > 0 {
1815 let pos = self.lexer.current_pos();
1816 self.failed = true;
1817 return StepResult::Yield(Err(Error {
1818 pos,
1819 message: "directives must be followed by a '---' document-start marker".into(),
1820 }));
1821 }
1822 let end = self.lexer.current_pos();
1823 self.state = IterState::Done;
1824 return StepResult::Yield(Ok((Event::StreamEnd, zero_span(end))));
1825 }
1826 if self.lexer.is_directives_end() {
1827 let (marker_pos, _) = self.lexer.consume_marker_line(false);
1828 if let Some(e) = self.lexer.marker_inline_error.take() {
1829 self.failed = true;
1830 return StepResult::Yield(Err(e));
1831 }
1832 self.state = IterState::InDocument;
1833 self.root_node_emitted = false;
1834 // Take the accumulated directives — scope stays active for document body tag resolution.
1835 let version = self.directive_scope.version;
1836 let tag_directives = self.directive_scope.tag_directives();
1837 self.queue.push_back((
1838 Event::DocumentStart {
1839 explicit: true,
1840 version,
1841 tag_directives,
1842 },
1843 marker_span(marker_pos),
1844 ));
1845 self.drain_trailing_comment();
1846 return StepResult::Continue;
1847 }
1848 if self.lexer.is_document_end() {
1849 // Orphan `...` — if directives were parsed without a `---` marker,
1850 // that is a spec violation (YAML 1.2 §9.2: directives require `---`).
1851 if self.directive_scope.directive_count > 0 {
1852 let pos = self.lexer.current_pos();
1853 self.failed = true;
1854 return StepResult::Yield(Err(Error {
1855 pos,
1856 message: "directives must be followed by a '---' document-start marker".into(),
1857 }));
1858 }
1859 self.lexer.consume_marker_line(true);
1860 if let Some(e) = self.lexer.marker_inline_error.take() {
1861 self.failed = true;
1862 return StepResult::Yield(Err(e));
1863 }
1864 return StepResult::Continue; // orphan `...`, no event
1865 }
1866 // Per YAML 1.2 §9.2, directives require a `---` marker. If the next
1867 // line is not `---` and we have already parsed directives, that is a
1868 // spec violation — reject before emitting an implicit DocumentStart.
1869 if self.directive_scope.directive_count > 0 {
1870 let pos = self.lexer.current_pos();
1871 self.failed = true;
1872 return StepResult::Yield(Err(Error {
1873 pos,
1874 message: "directives must be followed by a '---' document-start marker".into(),
1875 }));
1876 }
1877 debug_assert!(
1878 self.lexer.has_content(),
1879 "expected content after consuming blank/comment/directive lines"
1880 );
1881 let content_pos = self.lexer.current_pos();
1882 self.state = IterState::InDocument;
1883 self.root_node_emitted = false;
1884 // Take the accumulated directives — scope stays active for document body tag resolution.
1885 let version = self.directive_scope.version;
1886 let tag_directives = self.directive_scope.tag_directives();
1887 StepResult::Yield(Ok((
1888 Event::DocumentStart {
1889 explicit: false,
1890 version,
1891 tag_directives,
1892 },
1893 zero_span(content_pos),
1894 )))
1895 }
1896
1897 /// Handle one iteration step in the `InDocument` state.
1898 #[allow(clippy::too_many_lines)]
1899 fn step_in_document(&mut self) -> StepResult<'input> {
1900 match self.skip_and_collect_comments_in_doc() {
1901 Ok(()) => {}
1902 Err(e) => {
1903 self.failed = true;
1904 return StepResult::Yield(Err(e));
1905 }
1906 }
1907 // If comments were queued, drain them before checking document state.
1908 if !self.queue.is_empty() {
1909 return StepResult::Continue;
1910 }
1911
1912 // ---- Tab indentation check ----
1913 //
1914 // YAML 1.2 §6.1: tabs cannot be used for indentation in block context.
1915 // Only lines whose VERY FIRST character is `\t` (no leading spaces) are
1916 // using a tab as the indentation character and must be rejected.
1917 //
1918 // Exceptions: `\t[`, `\t{`, `\t]`, `\t}` are allowed because flow
1919 // collection delimiters can follow tabs (YAML test suite 6CA3, Q5MG).
1920 // Lines like ` \tx` have SPACES as indentation; the tab is content.
1921 if let Some(line) = self.lexer.peek_next_line() {
1922 if line.content.starts_with('\t') {
1923 // First char is a tab — check what the first non-tab character
1924 // is. Flow collection delimiters are allowed after leading tabs.
1925 let first_non_tab = line.content.trim_start_matches('\t').chars().next();
1926 if !matches!(first_non_tab, Some('[' | '{' | ']' | '}')) {
1927 let err_pos = line.pos;
1928 self.failed = true;
1929 self.lexer.consume_line();
1930 return StepResult::Yield(Err(Error {
1931 pos: err_pos,
1932 message: "tabs are not allowed as indentation (YAML 1.2 §6.1)".into(),
1933 }));
1934 }
1935 }
1936 }
1937
1938 // ---- Document / stream boundaries ----
1939
1940 if self.lexer.at_eof() && !self.lexer.has_inline_scalar() {
1941 let end = self.lexer.drain_to_end();
1942 self.close_all_collections(end);
1943 self.queue
1944 .push_back((Event::DocumentEnd { explicit: false }, zero_span(end)));
1945 self.queue.push_back((Event::StreamEnd, zero_span(end)));
1946 self.state = IterState::Done;
1947 return StepResult::Continue;
1948 }
1949 if self.lexer.is_document_end() {
1950 let pos = self.lexer.current_pos();
1951 self.close_all_collections(pos);
1952 let (marker_pos, _) = self.lexer.consume_marker_line(true);
1953 if let Some(e) = self.lexer.marker_inline_error.take() {
1954 self.failed = true;
1955 return StepResult::Yield(Err(e));
1956 }
1957 // Reset directive scope at the document boundary so directives from
1958 // this document do not leak into the next one.
1959 self.directive_scope = DirectiveScope::default();
1960 self.state = IterState::BetweenDocs;
1961 self.queue.push_back((
1962 Event::DocumentEnd { explicit: true },
1963 marker_span(marker_pos),
1964 ));
1965 self.drain_trailing_comment();
1966 return StepResult::Continue;
1967 }
1968 if self.lexer.is_directives_end() {
1969 let pos = self.lexer.current_pos();
1970 self.close_all_collections(pos);
1971 let (marker_pos, _) = self.lexer.consume_marker_line(false);
1972 if let Some(e) = self.lexer.marker_inline_error.take() {
1973 self.failed = true;
1974 return StepResult::Yield(Err(e));
1975 }
1976 // A bare `---` inside a document implicitly ends the current document
1977 // and starts a new one without a preamble. Reset the directive scope
1978 // here since consume_preamble_between_docs will not be called for this
1979 // transition.
1980 self.directive_scope = DirectiveScope::default();
1981 // Validate any inline tag on this `---` line against the new
1982 // document's (empty) directive scope. Tags defined in the previous
1983 // document do not carry over (YAML §9.2), so an undefined handle
1984 // must fail immediately.
1985 if let Some((tag_val, tag_pos)) = self.lexer.peek_inline_scalar() {
1986 if tag_val.starts_with('!') {
1987 if let Err(e) = self.directive_scope.resolve_tag(tag_val, tag_pos) {
1988 self.lexer.drain_inline_scalar();
1989 self.failed = true;
1990 return StepResult::Yield(Err(e));
1991 }
1992 }
1993 }
1994 self.state = IterState::InDocument;
1995 self.root_node_emitted = false;
1996 self.queue.push_back((
1997 Event::DocumentEnd { explicit: false },
1998 zero_span(marker_pos),
1999 ));
2000 self.queue.push_back((
2001 Event::DocumentStart {
2002 explicit: true,
2003 version: None,
2004 tag_directives: Vec::new(),
2005 },
2006 marker_span(marker_pos),
2007 ));
2008 self.drain_trailing_comment();
2009 return StepResult::Continue;
2010 }
2011
2012 // ---- Directive lines (`%YAML`/`%TAG`) inside document body ----
2013 //
2014 // YAML 1.2 §9.2: directives can only appear in the preamble (before
2015 // `---`). A `%YAML` or `%TAG` line inside a document body, followed
2016 // by `---`, indicates the author forgot to close the previous document
2017 // with `...` before writing the next document's preamble.
2018 //
2019 // We only fire the error when:
2020 // 1. The current line starts with `%YAML ` or `%TAG ` (a genuine
2021 // YAML directive keyword, not arbitrary content like `%!PS-Adobe`).
2022 // 2. The following line is a `---` document-start marker.
2023 //
2024 // This avoids false positives when `%` appears as content in plain
2025 // scalars (XLQ9) or inside block scalar bodies (M7A3, W4TN).
2026 if let Some(line) = self.lexer.peek_next_line() {
2027 let is_yaml_directive =
2028 line.content.starts_with("%YAML ") || line.content.starts_with("%TAG ");
2029 if is_yaml_directive {
2030 let next_is_doc_start = self.lexer.peek_second_line().is_some_and(|l| {
2031 l.content == "---"
2032 || l.content.starts_with("--- ")
2033 || l.content.starts_with("---\t")
2034 });
2035 if next_is_doc_start {
2036 let err_pos = line.pos;
2037 self.failed = true;
2038 self.lexer.consume_line();
2039 return StepResult::Yield(Err(Error {
2040 pos: err_pos,
2041 message:
2042 "directive '%' is only valid before the document-start marker '---'"
2043 .into(),
2044 }));
2045 }
2046 }
2047 }
2048
2049 // ---- Root-node guard ----
2050 //
2051 // A YAML document contains exactly one root node. Once the root has
2052 // been fully emitted (`root_node_emitted = true`) and the collection
2053 // stack is empty, any further non-comment, non-blank content is invalid.
2054 if self.root_node_emitted && self.coll_stack.is_empty() && !self.lexer.has_inline_scalar() {
2055 if let Some(line) = self.lexer.peek_next_line() {
2056 let err_pos = line.pos;
2057 self.failed = true;
2058 self.lexer.consume_line();
2059 return StepResult::Yield(Err(Error {
2060 pos: err_pos,
2061 message: "unexpected content after document root node".into(),
2062 }));
2063 }
2064 }
2065
2066 // ---- Alias node: `*name` is a complete node ----
2067
2068 if let Some(peek) = self.lexer.peek_next_line() {
2069 let content: &'input str = peek.content;
2070 let line_pos = peek.pos;
2071 let line_break_type = peek.break_type;
2072 let line_char_offset = line_pos.char_offset;
2073 let trimmed = content.trim_start_matches(' ');
2074 if let Some(after_star) = trimmed.strip_prefix('*') {
2075 let leading = content.len() - trimmed.len();
2076 let star_pos = Pos {
2077 byte_offset: line_pos.byte_offset + leading,
2078 char_offset: line_char_offset + leading,
2079 line: line_pos.line,
2080 column: line_pos.column + leading,
2081 };
2082 // YAML 1.2 §7.1: alias nodes cannot have properties (anchor or tag).
2083 if self.pending_tag.is_some() {
2084 self.failed = true;
2085 return StepResult::Yield(Err(Error {
2086 pos: star_pos,
2087 message: "alias node cannot have a tag property".into(),
2088 }));
2089 }
2090 // An anchor is only a property of the alias if it's item-level
2091 // (pending_anchor_for_collection=false). A collection-level anchor
2092 // (pending_anchor_for_collection=true) belongs to the surrounding
2093 // collection, not the alias node.
2094 if self.pending_anchor.is_some() && !self.pending_anchor_for_collection {
2095 self.failed = true;
2096 return StepResult::Yield(Err(Error {
2097 pos: star_pos,
2098 message: "alias node cannot have an anchor property".into(),
2099 }));
2100 }
2101 match scan_anchor_name(after_star, star_pos) {
2102 Err(e) => {
2103 self.failed = true;
2104 return StepResult::Yield(Err(e));
2105 }
2106 Ok(name) => {
2107 let name_char_count = name.chars().count();
2108 // Build alias span: from `*` through end of name.
2109 let alias_end = Pos {
2110 byte_offset: star_pos.byte_offset + 1 + name.len(),
2111 char_offset: star_pos.char_offset + 1 + name_char_count,
2112 line: star_pos.line,
2113 column: star_pos.column + 1 + name_char_count,
2114 };
2115 let alias_span = Span {
2116 start: star_pos,
2117 end: alias_end,
2118 };
2119 // Compute remaining content after the alias name, before
2120 // consuming the line (which would invalidate the borrow).
2121 let after_name = &after_star[name.len()..];
2122 let remaining: &'input str = after_name.trim_start_matches([' ', '\t']);
2123 let spaces = after_name.len() - remaining.len();
2124 let had_remaining = !remaining.is_empty();
2125 let rem_byte_offset = star_pos.byte_offset + 1 + name.len() + spaces;
2126 let rem_char_offset = line_char_offset + leading + 1 + name.len() + spaces;
2127 let rem_col = star_pos.column + 1 + name_char_count + spaces;
2128 self.lexer.consume_line();
2129 if had_remaining {
2130 let rem_pos = Pos {
2131 byte_offset: rem_byte_offset,
2132 char_offset: rem_char_offset,
2133 line: star_pos.line,
2134 column: rem_col,
2135 };
2136 let synthetic = crate::lines::Line {
2137 content: remaining,
2138 offset: rem_byte_offset,
2139 indent: rem_col,
2140 break_type: line_break_type,
2141 pos: rem_pos,
2142 };
2143 self.lexer.prepend_inline_line(synthetic);
2144 }
2145 self.tick_mapping_phase_after_scalar();
2146 return StepResult::Yield(Ok((Event::Alias { name }, alias_span)));
2147 }
2148 }
2149 }
2150 }
2151
2152 // ---- Tag: `!tag`, `!!tag`, `!<uri>`, or `!` — attach to next node ----
2153
2154 if let Some(peek) = self.lexer.peek_next_line() {
2155 let content: &'input str = peek.content;
2156 let line_pos = peek.pos;
2157 let line_indent = peek.indent;
2158 let line_break_type = peek.break_type;
2159 let trimmed = content.trim_start_matches(' ');
2160 if trimmed.starts_with('!') {
2161 let leading = content.len() - trimmed.len();
2162 let bang_pos = Pos {
2163 byte_offset: line_pos.byte_offset + leading,
2164 char_offset: line_pos.char_offset + leading,
2165 line: line_pos.line,
2166 column: line_pos.column + leading,
2167 };
2168 // `tag_start` starts at the `!`; `after_bang` is everything after it.
2169 let tag_start: &'input str = &content[leading..];
2170 let after_bang: &'input str = &content[leading + 1..];
2171 match scan_tag(after_bang, tag_start, bang_pos) {
2172 Err(e) => {
2173 self.failed = true;
2174 return StepResult::Yield(Err(e));
2175 }
2176 Ok((tag_slice, advance_past_bang)) => {
2177 // Total bytes consumed for the tag token: 1 (`!`) + advance.
2178 let tag_token_bytes = 1 + advance_past_bang;
2179 let after_tag = &trimmed[tag_token_bytes..];
2180 let inline: &'input str = after_tag.trim_start_matches([' ', '\t']);
2181 let spaces = after_tag.len() - inline.len();
2182 let had_inline = !inline.is_empty();
2183 // YAML 1.2 §6.8.1: a tag property must be separated from
2184 // the following node content by `s-separate` when the first
2185 // character after the tag could be confused with a tag
2186 // continuation or creates structural ambiguity:
2187 // - `!` starts another tag property
2188 // - flow indicators (`,`, `[`, `]`, `{`, `}`) cause
2189 // structural confusion (e.g. `!!str,`)
2190 // - `%` may be a valid percent-encoded continuation that
2191 // should have been part of the tag, or an invalid
2192 // percent-sequence that makes the input unparseable
2193 // When the tag scanner stopped at a plain non-tag char like
2194 // `<`, the tag ended naturally and the content is the value
2195 // (e.g. `!foo<bar val` → tag=`!foo`, scalar=`<bar val`).
2196 if had_inline && spaces == 0 {
2197 let first = inline.chars().next().unwrap_or('\0');
2198 if first == '!'
2199 || first == '%'
2200 || matches!(first, ',' | '[' | ']' | '{' | '}')
2201 {
2202 self.failed = true;
2203 return StepResult::Yield(Err(Error {
2204 pos: bang_pos,
2205 message:
2206 "tag must be separated from node content by whitespace"
2207 .into(),
2208 }));
2209 }
2210 }
2211 let inline_offset =
2212 line_pos.byte_offset + leading + tag_token_bytes + spaces;
2213 let inline_char_offset =
2214 line_pos.char_offset + leading + tag_token_bytes + spaces;
2215 let inline_col = line_pos.column + leading + tag_token_bytes + spaces;
2216 // Duplicate tags on the same node are an error.
2217 // Exception: if the existing tag is collection-level
2218 // (pending_tag_for_collection=true) and the new tag has
2219 // inline content that is (or contains) a mapping key line,
2220 // they apply to different nodes (collection vs. key scalar).
2221 if self.pending_tag.is_some() {
2222 let is_different_node = self.pending_tag_for_collection
2223 && had_inline
2224 && inline_contains_mapping_key(inline);
2225 if !is_different_node {
2226 self.failed = true;
2227 return StepResult::Yield(Err(Error {
2228 pos: bang_pos,
2229 message: "a node may not have more than one tag".into(),
2230 }));
2231 }
2232 }
2233 // Resolve tag handle against directive scope at scan time.
2234 let resolved_tag =
2235 match self.directive_scope.resolve_tag(tag_slice, bang_pos) {
2236 Ok(t) => t,
2237 Err(e) => {
2238 self.failed = true;
2239 return StepResult::Yield(Err(e));
2240 }
2241 };
2242 self.pending_tag = Some(resolved_tag);
2243 self.lexer.consume_line();
2244 if had_inline {
2245 self.pending_tag_for_collection = false;
2246 // Record the original physical line's indent so that
2247 // handle_mapping_entry can open the mapping at the correct
2248 // indent when the key is on a synthetic (offset) line.
2249 // Only set when the inline content is (or leads to) a
2250 // mapping key — if it's a plain value, there is no
2251 // handle_mapping_entry call to consume this, and leaving
2252 // it set would corrupt the next unrelated mapping entry.
2253 if self.property_origin_indent.is_none()
2254 && inline_contains_mapping_key(inline)
2255 {
2256 self.property_origin_indent = Some(line_indent);
2257 }
2258 let inline_pos = Pos {
2259 byte_offset: inline_offset,
2260 char_offset: inline_char_offset,
2261 line: line_pos.line,
2262 column: inline_col,
2263 };
2264 let synthetic = crate::lines::Line {
2265 content: inline,
2266 offset: inline_offset,
2267 indent: inline_col,
2268 break_type: line_break_type,
2269 pos: inline_pos,
2270 };
2271 self.lexer.prepend_inline_line(synthetic);
2272 } else {
2273 // Standalone tag line — applies to whatever node comes next.
2274 // Validate: the tag must be indented enough for this context.
2275 let min = self.min_standalone_property_indent();
2276 if line_indent < min {
2277 self.pending_tag = None;
2278 self.failed = true;
2279 return StepResult::Yield(Err(Error {
2280 pos: bang_pos,
2281 message:
2282 "node property is not indented enough for this context"
2283 .into(),
2284 }));
2285 }
2286 self.pending_tag_for_collection = true;
2287 }
2288 return StepResult::Continue;
2289 }
2290 }
2291 }
2292 }
2293
2294 // ---- Anchor: `&name` — attach to the next node ----
2295
2296 if let Some(peek) = self.lexer.peek_next_line() {
2297 let content: &'input str = peek.content;
2298 let line_pos = peek.pos;
2299 let line_indent = peek.indent;
2300 let line_break_type = peek.break_type;
2301 let trimmed = content.trim_start_matches(' ');
2302 if let Some(after_amp) = trimmed.strip_prefix('&') {
2303 // We only look for `&` at the start of the trimmed line.
2304 // Tags (`!`) before `&` are handled in Task 17.
2305 //
2306 // IMPORTANT for Task 17: when implementing tag-skip, the skip
2307 // logic must consume the *full* tag token (all `ns-anchor-char`
2308 // bytes after `!`), not just the `!` character alone. The `!`
2309 // character is itself a valid `ns-anchor-char`, so skipping
2310 // only `!` and then re-entering anchor detection would silently
2311 // include the tag body in the anchor name. Example: `!tag &a`
2312 // — skip must advance past `tag` before looking for `&a`.
2313 let leading = content.len() - trimmed.len();
2314 let amp_pos = Pos {
2315 byte_offset: line_pos.byte_offset + leading,
2316 char_offset: line_pos.char_offset + leading,
2317 line: line_pos.line,
2318 column: line_pos.column + leading,
2319 };
2320 match scan_anchor_name(after_amp, amp_pos) {
2321 Err(e) => {
2322 self.failed = true;
2323 return StepResult::Yield(Err(e));
2324 }
2325 Ok(name) => {
2326 // Determine what follows the anchor name on this line,
2327 // before consuming the line (borrow ends here).
2328 let after_name = &after_amp[name.len()..];
2329 let inline: &'input str = after_name.trim_start_matches([' ', '\t']);
2330 let spaces = after_name.len() - inline.len();
2331 let had_inline = !inline.is_empty();
2332 let inline_offset =
2333 line_pos.byte_offset + leading + 1 + name.len() + spaces;
2334 let inline_char_offset =
2335 line_pos.char_offset + leading + 1 + name.len() + spaces;
2336 let inline_col = line_pos.column + leading + 1 + name.len() + spaces;
2337 // Duplicate anchors on the same node are an error.
2338 //
2339 // Case 1: existing anchor is item-level (pending_anchor_for_collection=false)
2340 // and no collection tag is pending — both this and the existing anchor
2341 // are for the same item-level node.
2342 //
2343 // Case 2: existing anchor is collection-level (pending_anchor_for_collection=true)
2344 // and the new anchor has inline content that is NOT a collection opener
2345 // ([, {) or property (!, &) — both anchors apply to the same scalar node.
2346 let amp_pos2 = amp_pos;
2347 let is_duplicate = if self.pending_anchor.is_some()
2348 && !self.pending_anchor_for_collection
2349 && !self.pending_tag_for_collection
2350 {
2351 true
2352 } else if self.pending_anchor.is_some()
2353 && self.pending_anchor_for_collection
2354 && had_inline
2355 && !self.pending_tag_for_collection
2356 {
2357 // The existing anchor is collection-level, but the new anchor
2358 // has inline content. If that content is a mapping key line
2359 // (contains `: ` etc.), the new anchor is for the key and the
2360 // existing anchor is for the mapping — different nodes, no error.
2361 // If the inline is a plain scalar (no key indicator), both
2362 // anchors apply to the same scalar node — error.
2363 let first_ch = inline.chars().next();
2364 // If inline starts with a collection/property opener, treat as
2365 // different node — no error.
2366 let starts_with_opener =
2367 matches!(first_ch, Some('[' | '{' | '!' | '&' | '*' | '|' | '>'));
2368 // If inline contains a mapping key indicator (`: `), the new
2369 // anchor is for a key — different node from the collection.
2370 let is_mapping_key = find_value_indicator_offset(inline).is_some();
2371 !starts_with_opener && !is_mapping_key
2372 } else {
2373 false
2374 };
2375 if is_duplicate {
2376 self.failed = true;
2377 return StepResult::Yield(Err(Error {
2378 pos: amp_pos2,
2379 message: "a node may not have more than one anchor".into(),
2380 }));
2381 }
2382 self.pending_anchor = Some(name);
2383 self.lexer.consume_line();
2384 if had_inline {
2385 // Detect illegal inline block sequence: `&anchor - item`
2386 // is invalid — a block sequence indicator cannot appear
2387 // inline after an anchor property in block context.
2388 let is_seq = inline.strip_prefix('-').is_some_and(|rest| {
2389 rest.is_empty() || rest.starts_with(' ') || rest.starts_with('\t')
2390 });
2391 if is_seq {
2392 self.pending_anchor = None;
2393 self.failed = true;
2394 let seq_pos = Pos {
2395 byte_offset: inline_offset,
2396 char_offset: inline_char_offset,
2397 line: line_pos.line,
2398 column: inline_col,
2399 };
2400 return StepResult::Yield(Err(Error {
2401 pos: seq_pos,
2402 message:
2403 "block sequence indicator cannot appear inline after a node property"
2404 .into(),
2405 }));
2406 }
2407 // Inline content after anchor — anchor applies to the
2408 // inline node (scalar or key), not to any enclosing
2409 // collection opened on this same line.
2410 self.pending_anchor_for_collection = false;
2411 // Record the original physical line's indent so that
2412 // handle_mapping_entry can open the mapping at the correct
2413 // indent when the key is on a synthetic (offset) line.
2414 // Only set when the inline content leads to a mapping key;
2415 // value-context anchors must not corrupt the next entry.
2416 if self.property_origin_indent.is_none()
2417 && inline_contains_mapping_key(inline)
2418 {
2419 self.property_origin_indent = Some(line_indent);
2420 }
2421 let inline_pos = Pos {
2422 byte_offset: inline_offset,
2423 char_offset: inline_char_offset,
2424 line: line_pos.line,
2425 column: inline_col,
2426 };
2427 let synthetic = crate::lines::Line {
2428 content: inline,
2429 offset: inline_offset,
2430 indent: inline_col,
2431 break_type: line_break_type,
2432 pos: inline_pos,
2433 };
2434 self.lexer.prepend_inline_line(synthetic);
2435 } else {
2436 // Standalone anchor line — anchor applies to whatever
2437 // node comes next (collection or scalar).
2438 // Validate: the anchor must be indented enough for this context.
2439 let min = self.min_standalone_property_indent();
2440 if line_indent < min {
2441 self.pending_anchor = None;
2442 self.failed = true;
2443 let err_pos = amp_pos;
2444 return StepResult::Yield(Err(Error {
2445 pos: err_pos,
2446 message:
2447 "node property is not indented enough for this context"
2448 .into(),
2449 }));
2450 }
2451 self.pending_anchor_for_collection = true;
2452 }
2453 // Let the next iteration handle whatever follows.
2454 return StepResult::Continue;
2455 }
2456 }
2457 }
2458 }
2459
2460 // ---- Flow collection detection: `[` or `{` starts a flow collection ----
2461 // Stray closing flow indicators (`]`, `}`) in block context are errors.
2462
2463 if let Some(line) = self.lexer.peek_next_line() {
2464 let trimmed = line.content.trim_start_matches(' ');
2465 if trimmed.starts_with('[') || trimmed.starts_with('{') {
2466 return self.handle_flow_collection();
2467 }
2468 if trimmed.starts_with(']') || trimmed.starts_with('}') {
2469 let err_pos = line.pos;
2470 let ch = trimmed.chars().next().unwrap_or(']');
2471 self.failed = true;
2472 self.lexer.consume_line();
2473 return StepResult::Yield(Err(Error {
2474 pos: err_pos,
2475 message: format!("unexpected '{ch}' outside flow collection"),
2476 }));
2477 }
2478 }
2479
2480 // ---- Block sequence / mapping entry detection ----
2481
2482 if let Some((dash_indent, dash_pos)) = self.peek_sequence_entry() {
2483 return self.handle_sequence_entry(dash_indent, dash_pos);
2484 }
2485 if let Some((key_indent, key_pos)) = self.peek_mapping_entry() {
2486 return self.handle_mapping_entry(key_indent, key_pos);
2487 }
2488
2489 // ---- Dedent: close collections more deeply nested than the current line ----
2490
2491 if let Some(line) = self.lexer.peek_next_line() {
2492 let line_indent = line.indent;
2493 let close_pos = self.lexer.current_pos();
2494 // Record the minimum indent across all open collections before
2495 // closing. A root collection has indent 0. If the minimum indent
2496 // before closure was 0 and the stack empties, the root node is
2497 // complete. When a tag-inline mapping opens at a column > 0 (a
2498 // pre-existing indent-tracking limitation), closing it must not
2499 // prematurely mark the root as emitted.
2500 let min_indent_before = self.coll_stack.iter().map(|e| e.indent()).min();
2501 self.close_collections_at_or_above(line_indent.saturating_add(1), close_pos);
2502 // If closing collections emptied the stack, the root node is
2503 // complete — but only if the outermost collection was at indent 0
2504 // (a true root collection, not a spuriously-indented inline tag).
2505 if self.coll_stack.is_empty() && !self.queue.is_empty() && min_indent_before == Some(0)
2506 {
2507 self.root_node_emitted = true;
2508 }
2509 if !self.queue.is_empty() {
2510 return StepResult::Continue;
2511 }
2512 }
2513
2514 // ---- Block structure validity checks ----
2515 //
2516 // After closing deeper collections and before consuming a scalar,
2517 // validate that the current line's indentation is consistent with
2518 // the innermost open block collection.
2519 //
2520 // For block sequences: the only valid content at the sequence's own
2521 // indent level is `- ` (handled by peek_sequence_entry above).
2522 // Any other content at that indent level is invalid YAML.
2523 //
2524 // For block mappings in Key phase: the only valid content at the
2525 // mapping's indent level is a mapping entry (handled by
2526 // peek_mapping_entry above). A plain scalar without `: ` is not
2527 // a valid implicit mapping key.
2528 if let Some(line) = self.lexer.peek_next_line() {
2529 let line_indent = line.indent;
2530 match self.coll_stack.last() {
2531 Some(&CollectionEntry::Sequence(seq_indent, _)) if line_indent == seq_indent => {
2532 // Content at the sequence indent level that is NOT `- ` is
2533 // invalid. peek_sequence_entry already returned None, so this
2534 // line is not a sequence entry.
2535 let err_pos = line.pos;
2536 self.failed = true;
2537 self.lexer.consume_line();
2538 return StepResult::Yield(Err(Error {
2539 pos: err_pos,
2540 message: "invalid content at block sequence indent level: expected '- '"
2541 .into(),
2542 }));
2543 }
2544 Some(&CollectionEntry::Mapping(map_indent, MappingPhase::Key, _))
2545 if line_indent == map_indent =>
2546 {
2547 let err_pos = line.pos;
2548 self.failed = true;
2549 self.lexer.consume_line();
2550 return StepResult::Yield(Err(Error {
2551 pos: err_pos,
2552 message:
2553 "invalid content at block mapping indent level: expected mapping key"
2554 .into(),
2555 }));
2556 }
2557 // Content more deeply indented than the mapping key level is only
2558 // valid as an explicit-key continuation (explicit_key_pending=true)
2559 // or as the very first key (has_had_value=false — the first key may
2560 // be at any indent >= map_indent). After at least one key-value pair
2561 // has been processed (has_had_value=true) with no explicit-key pending,
2562 // deeper content that is not a valid mapping key is an error.
2563 Some(&CollectionEntry::Mapping(map_indent, MappingPhase::Key, true))
2564 if line_indent > map_indent
2565 && !self.explicit_key_pending
2566 && !self.lexer.is_next_line_synthetic() =>
2567 {
2568 let err_pos = line.pos;
2569 self.failed = true;
2570 self.lexer.consume_line();
2571 return StepResult::Yield(Err(Error {
2572 pos: err_pos,
2573 message: "unexpected indented content after mapping value".into(),
2574 }));
2575 }
2576 _ => {}
2577 }
2578 }
2579
2580 // ---- Scalars ----
2581
2582 // `block_parent_indent` — the indent of the enclosing block context;
2583 // block scalars (`|`, `>`) must have content lines more indented than
2584 // this value. For a block scalar embedded as inline content after `? `
2585 // or `- `, the enclosing block's indent is the *collection's* indent,
2586 // not the column of the inline `|`/`>` token.
2587 //
2588 // `plain_parent_indent` — the enclosing block's indent level.
2589 // Plain scalar continuation lines must be indented strictly more than
2590 // `plain_parent_indent` (YAML 1.2), with a special exception for
2591 // tab-indented lines when `plain_parent_indent == 0` (the tab provides
2592 // the s-separate-in-line separator required by s-flow-folded(0)).
2593 // Use usize::MAX as a sentinel for "root level" — the root node has no
2594 // parent collection, so block scalar body lines may start at column 0
2595 // (equivalent to a parent indent of -1 in the YAML spec).
2596 let block_parent_indent = self.coll_stack.last().map_or(usize::MAX, |e| e.indent());
2597 let plain_parent_indent = self.coll_stack.last().map_or(0, |e| e.indent());
2598 // Capture whether an inline scalar (from `--- text`) was pending before
2599 // the scalar dispatch call. If it was, the emitted plain scalar came
2600 // from the `---` marker line and is NOT necessarily the complete root
2601 // node — the lexer emits `--- >` / `--- |` / `--- "text` inline content
2602 // as a plain scalar, but the actual node body follows on subsequent
2603 // lines. Marking root_node_emitted in those cases would incorrectly
2604 // reject the body lines as "content after root node".
2605 let had_inline_scalar = self.lexer.has_inline_scalar();
2606 match self.try_consume_scalar(plain_parent_indent, block_parent_indent) {
2607 Ok(Some(event)) => {
2608 self.tick_mapping_phase_after_scalar();
2609 // Drain any trailing comment detected on the scalar's line.
2610 self.drain_trailing_comment();
2611 // A scalar emitted at the document root (no open collection)
2612 // is the complete root node — unless it came from inline
2613 // content after `---` (had_inline_scalar), in which case the
2614 // body on subsequent lines is part of the same node.
2615 if self.coll_stack.is_empty() && !had_inline_scalar {
2616 self.root_node_emitted = true;
2617 }
2618 return StepResult::Yield(Ok(event));
2619 }
2620 Err(e) => {
2621 self.failed = true;
2622 return StepResult::Yield(Err(e));
2623 }
2624 Ok(None) => {}
2625 }
2626
2627 // Check for invalid characters at the start of an unrecognised line.
2628 // A line that starts with a character that is neither whitespace nor a
2629 // valid YAML ns-char (e.g. NUL U+0000 or mid-stream BOM U+FEFF) is a
2630 // parse error.
2631 if let Some(line) = self.lexer.peek_next_line() {
2632 let first_ch = line.content.chars().next();
2633 if let Some(ch) = first_ch {
2634 if ch != ' ' && ch != '\t' && !crate::lexer::is_ns_char(ch) {
2635 let err_pos = line.pos;
2636 self.failed = true;
2637 self.lexer.consume_line();
2638 return StepResult::Yield(Err(Error {
2639 pos: err_pos,
2640 message: format!("invalid character U+{:04X} in document", ch as u32),
2641 }));
2642 }
2643 }
2644 }
2645
2646 // Fallback: unrecognised content line — consume and loop.
2647 self.lexer.consume_line();
2648 StepResult::Continue
2649 }
2650
2651 /// Handle a block-sequence dash entry (`-`).
2652 #[allow(clippy::too_many_lines)]
2653 fn handle_sequence_entry(&mut self, dash_indent: usize, dash_pos: Pos) -> StepResult<'input> {
2654 let cur_pos = self.lexer.current_pos();
2655 self.close_collections_at_or_above(dash_indent.saturating_add(1), cur_pos);
2656 if !self.queue.is_empty() {
2657 return StepResult::Continue;
2658 }
2659 // YAML §8.2.1 seq-spaces rule: a block sequence used as a mapping
2660 // value in `block-out` context may start at the same column as its
2661 // parent key (seq-spaces(n, block-out) = n, not n+1). We therefore
2662 // open a new sequence when:
2663 // - the stack is empty, OR
2664 // - dash_indent is greater than the current top's indent (normal
2665 // case: sequence is nested deeper than its parent), OR
2666 // - the top is a Mapping in Value phase at the same indent (the
2667 // seq-spaces case: the sequence is the value of the current key).
2668 let opens_new = match self.coll_stack.last() {
2669 None => true,
2670 Some(
2671 &(CollectionEntry::Sequence(col, _)
2672 | CollectionEntry::Mapping(col, MappingPhase::Key, _)),
2673 ) => dash_indent > col,
2674 Some(&CollectionEntry::Mapping(col, MappingPhase::Value, _)) => dash_indent >= col,
2675 };
2676 if opens_new {
2677 // A block sequence cannot be an implicit mapping key — only flow nodes
2678 // may appear as implicit keys. If the parent is a mapping in Key phase
2679 // and we are about to open a new sequence, this is a block sequence
2680 // where a mapping key is expected: an error.
2681 // Exception: when explicit_key_pending is set, the sequence IS the
2682 // content of an explicit key (`? \n- seq_key`), which is valid.
2683 if matches!(
2684 self.coll_stack.last(),
2685 Some(&CollectionEntry::Mapping(_, MappingPhase::Key, true))
2686 ) && !self.explicit_key_pending
2687 {
2688 self.failed = true;
2689 return StepResult::Yield(Err(Error {
2690 pos: dash_pos,
2691 message: "block sequence cannot appear as an implicit mapping key".into(),
2692 }));
2693 }
2694 // A block sequence item at a wrong indent level is invalid. When the
2695 // parent is a sequence that has already completed at least one item
2696 // (`has_had_item = true`) and the new dash is NOT at the parent
2697 // sequence's column (not a new sibling item), this is a wrong-indent
2698 // sequence entry.
2699 if let Some(&CollectionEntry::Sequence(parent_col, true)) = self.coll_stack.last() {
2700 if dash_indent != parent_col {
2701 self.failed = true;
2702 return StepResult::Yield(Err(Error {
2703 pos: dash_pos,
2704 message: "block sequence entry at wrong indentation level".into(),
2705 }));
2706 }
2707 }
2708 if self.collection_depth() >= MAX_COLLECTION_DEPTH {
2709 self.failed = true;
2710 return StepResult::Yield(Err(Error {
2711 pos: dash_pos,
2712 message: "collection nesting depth exceeds limit".into(),
2713 }));
2714 }
2715 // Sequence opening consumes any pending explicit-key context.
2716 self.explicit_key_pending = false;
2717 // Mark the parent sequence (if any) as having started an item.
2718 if let Some(CollectionEntry::Sequence(_, current_item_started)) =
2719 self.coll_stack.last_mut()
2720 {
2721 *current_item_started = true;
2722 }
2723 self.coll_stack
2724 .push(CollectionEntry::Sequence(dash_indent, false));
2725 self.queue.push_back((
2726 Event::SequenceStart {
2727 anchor: self.pending_anchor.take(),
2728 tag: self.pending_tag.take(),
2729 style: CollectionStyle::Block,
2730 },
2731 zero_span(dash_pos),
2732 ));
2733 }
2734 // When continuing an existing sequence (opens_new = false), reset
2735 // `current_item_started` so that the new item can receive content.
2736 if !opens_new {
2737 if let Some(CollectionEntry::Sequence(_, current_item_started)) =
2738 self.coll_stack.last_mut()
2739 {
2740 *current_item_started = false;
2741 }
2742 }
2743 // When continuing an existing sequence (opens_new = false) and there is
2744 // a pending tag/anchor from the previous item's content (e.g. `- !!str`
2745 // whose inline extraction left a standalone tag line), that tag/anchor
2746 // applies to an empty scalar for the previous item. Emit it now before
2747 // processing the current `-`.
2748 if !opens_new
2749 && (self.pending_tag_for_collection || self.pending_anchor_for_collection)
2750 && (self.pending_tag.is_some() || self.pending_anchor.is_some())
2751 {
2752 let item_pos = self.lexer.current_pos();
2753 self.queue.push_back((
2754 Event::Scalar {
2755 value: std::borrow::Cow::Borrowed(""),
2756 style: ScalarStyle::Plain,
2757 anchor: self.pending_anchor.take(),
2758 tag: self.pending_tag.take(),
2759 },
2760 zero_span(item_pos),
2761 ));
2762 self.pending_tag_for_collection = false;
2763 self.pending_anchor_for_collection = false;
2764 }
2765 // Check for tab-indented block structure before consuming the dash.
2766 // In YAML, tabs cannot be used for block-level indentation. When the
2767 // separator between the dash and the inline content is (or contains) a
2768 // tab, and the inline content is a block structure indicator, the tab
2769 // is acting as indentation for a block node — which is invalid
2770 // (YAML 1.2 §6.1).
2771 if let Some(line) = self.lexer.peek_next_line() {
2772 let after_spaces = line.content.trim_start_matches(' ');
2773 if let Some(rest) = after_spaces.strip_prefix('-') {
2774 let inline = rest.trim_start_matches([' ', '\t']);
2775 let separator = &rest[..rest.len() - inline.len()];
2776 if separator.contains('\t') && is_tab_indented_block_indicator(inline) {
2777 let err_pos = line.pos;
2778 self.failed = true;
2779 self.lexer.consume_line();
2780 return StepResult::Yield(Err(Error {
2781 pos: err_pos,
2782 message: "tab character is not valid block indentation".into(),
2783 }));
2784 }
2785 }
2786 }
2787 let had_inline = self.consume_sequence_dash(dash_indent);
2788 if !had_inline {
2789 // Only emit an empty scalar for a bare `-` when there is no
2790 // following indented content that could be the item's value.
2791 // If the next line is at an indent strictly greater than
2792 // `dash_indent`, it belongs to this sequence item — let the
2793 // main loop handle it. Otherwise the item is truly empty.
2794 let next_indent = self.lexer.peek_next_line().map_or(0, |l| l.indent);
2795 if next_indent <= dash_indent {
2796 let item_pos = self.lexer.current_pos();
2797 self.queue.push_back((
2798 Event::Scalar {
2799 value: std::borrow::Cow::Borrowed(""),
2800 style: ScalarStyle::Plain,
2801 anchor: self.pending_anchor.take(),
2802 tag: None,
2803 },
2804 zero_span(item_pos),
2805 ));
2806 }
2807 }
2808 StepResult::Continue
2809 }
2810
2811 /// Handle a block-mapping key entry.
2812 #[allow(clippy::too_many_lines)]
2813 fn handle_mapping_entry(&mut self, key_indent: usize, key_pos: Pos) -> StepResult<'input> {
2814 let cur_pos = self.lexer.current_pos();
2815
2816 // When an anchor or tag appeared inline on the physical line before
2817 // the key content (e.g. `&anchor key: value`), the key is prepended
2818 // as a synthetic line at the property's column (e.g. column 8).
2819 // All indent-relative decisions below must use the PHYSICAL line's
2820 // indent (column 0 in that example), not the synthetic line's column.
2821 let effective_key_indent = self.property_origin_indent.unwrap_or(key_indent);
2822
2823 self.close_collections_at_or_above(effective_key_indent.saturating_add(1), cur_pos);
2824 if !self.queue.is_empty() {
2825 return StepResult::Continue;
2826 }
2827
2828 // YAML §8.2.1 seq-spaces close: a block sequence opened as a mapping
2829 // value in `block-out` context may reside at the *same* column as its
2830 // parent key (seq-spaces(n, block-out) = n). When a new mapping key
2831 // appears at column `n`, such a same-indent sequence must be closed —
2832 // the standard `close_collections_at_or_above(n+1)` above does not
2833 // reach it because its indent is exactly `n`, not `>= n+1`.
2834 //
2835 // Close the sequence only when the collection immediately beneath it
2836 // (the next item down the stack) is a Mapping at the same indent in
2837 // Value phase — that confirms it was opened by the seq-spaces rule,
2838 // not as an independent sequence at column 0.
2839 if let Some(&CollectionEntry::Sequence(seq_col, _)) = self.coll_stack.last() {
2840 if seq_col == effective_key_indent {
2841 let parent_is_seq_spaces_mapping = self.coll_stack.iter().rev().nth(1).is_some_and(
2842 |e| matches!(e, CollectionEntry::Mapping(col, _, _) if *col == effective_key_indent),
2843 );
2844 if parent_is_seq_spaces_mapping {
2845 self.coll_stack.pop();
2846 self.queue
2847 .push_back((Event::SequenceEnd, zero_span(cur_pos)));
2848 // Advance parent mapping from Value to Key phase — the
2849 // sequence was its value and is now fully closed.
2850 if let Some(CollectionEntry::Mapping(_, phase, _)) = self.coll_stack.last_mut()
2851 {
2852 *phase = MappingPhase::Key;
2853 }
2854 return StepResult::Continue;
2855 }
2856 }
2857 }
2858
2859 let is_in_mapping_at_this_indent = self.coll_stack.last().is_some_and(
2860 |top| matches!(top, CollectionEntry::Mapping(col, _, _) if *col == effective_key_indent),
2861 );
2862
2863 if !is_in_mapping_at_this_indent {
2864 // A mapping entry at `effective_key_indent` cannot be opened when:
2865 //
2866 // 1. The top of the stack is a block sequence at the same indent —
2867 // this would nest a mapping inside the sequence without a `- `
2868 // prefix (BD7L pattern).
2869 //
2870 // 2. The top of the stack is a block mapping in Key phase at a
2871 // lesser indent that has already had at least one entry — this
2872 // would open a nested mapping when no current key exists for it
2873 // to be the value of (EW3V, DMG6, N4JP, U44R patterns: wrong
2874 // indentation). The `has_had_value` flag suppresses this check
2875 // for fresh mappings whose first key node is nested deeper than
2876 // the mapping indicator (e.g. V9D5 explicit-key content).
2877 // Also skip when a value-indicator line (`: value`) is next
2878 // because it is the value portion of an alias/anchor mapping key
2879 // split across tokens (e.g. `*alias : scalar` in 26DV), or when
2880 // a pending tag or anchor is present (tags prepend synthetic
2881 // inlines at their column — 74H7).
2882 match self.coll_stack.last() {
2883 Some(&CollectionEntry::Sequence(seq_col, _)) if seq_col == effective_key_indent => {
2884 self.failed = true;
2885 return StepResult::Yield(Err(Error {
2886 pos: key_pos,
2887 message:
2888 "invalid mapping entry at block sequence indent level: expected '- '"
2889 .into(),
2890 }));
2891 }
2892 Some(&CollectionEntry::Mapping(map_col, MappingPhase::Key, true))
2893 if map_col < effective_key_indent
2894 && self.pending_tag.is_none()
2895 && self.pending_anchor.is_none()
2896 && !self.is_value_indicator_line() =>
2897 {
2898 self.failed = true;
2899 return StepResult::Yield(Err(Error {
2900 pos: key_pos,
2901 message: "wrong indentation: mapping key is more indented than the enclosing mapping".into(),
2902 }));
2903 }
2904 _ => {}
2905 }
2906 if self.collection_depth() >= MAX_COLLECTION_DEPTH {
2907 self.failed = true;
2908 return StepResult::Yield(Err(Error {
2909 pos: key_pos,
2910 message: "collection nesting depth exceeds limit".into(),
2911 }));
2912 }
2913 // Mark the parent sequence (if any) as having started an item.
2914 if let Some(CollectionEntry::Sequence(_, current_item_started)) =
2915 self.coll_stack.last_mut()
2916 {
2917 *current_item_started = true;
2918 }
2919 // Note: property_origin_indent is NOT consumed here. It remains set
2920 // so the next call (which processes the synthetic key line at the
2921 // synthetic column) can again compute effective_key_indent = origin
2922 // indent and recognize the already-open mapping. It will be cleared
2923 // in the "continuing existing mapping" branch below.
2924 self.coll_stack.push(CollectionEntry::Mapping(
2925 effective_key_indent,
2926 MappingPhase::Key,
2927 false,
2928 ));
2929 // Consume pending anchor/tag for the mapping only for standalone
2930 // properties (e.g. `&a\nkey: v`) where `pending_*_for_collection`
2931 // is true.
2932 //
2933 // Inline properties (e.g. `&a key: v`) leave `pending_*_for_collection`
2934 // false — they annotate the key scalar, not the mapping (YAML test
2935 // suite 9KAX: inline property → key scalar). The pending anchor/tag
2936 // is left on `self.pending_anchor`/`self.pending_tag` and will be
2937 // consumed by `consume_mapping_entry` when it emits the key scalar.
2938 let mapping_anchor = if self.pending_anchor_for_collection {
2939 self.pending_anchor.take()
2940 } else {
2941 None
2942 };
2943 let mapping_tag = if self.pending_tag_for_collection {
2944 self.pending_tag.take()
2945 } else {
2946 None
2947 };
2948 self.queue.push_back((
2949 Event::MappingStart {
2950 anchor: mapping_anchor,
2951 tag: mapping_tag,
2952 style: CollectionStyle::Block,
2953 },
2954 zero_span(key_pos),
2955 ));
2956 return StepResult::Continue;
2957 }
2958
2959 // Continuing an existing mapping.
2960 if self.is_value_indicator_line() {
2961 // If there is a pending tag/anchor that is not designated for the
2962 // mapping collection itself (i.e. it came from an inline `!!tag`
2963 // or `&anchor` before the `:` value indicator), it applies to the
2964 // empty implicit key scalar. Emit that key scalar first so the
2965 // pending properties are not lost and the mapping phase advances
2966 // correctly before the value indicator is consumed.
2967 let in_key_phase = self.coll_stack.last().is_some_and(|top| {
2968 matches!(top, CollectionEntry::Mapping(col, MappingPhase::Key, _) if *col == effective_key_indent)
2969 });
2970 if in_key_phase
2971 && !self.pending_tag_for_collection
2972 && !self.pending_anchor_for_collection
2973 && (self.pending_tag.is_some() || self.pending_anchor.is_some())
2974 {
2975 let pos = self.lexer.current_pos();
2976 self.queue.push_back((
2977 Event::Scalar {
2978 value: std::borrow::Cow::Borrowed(""),
2979 style: ScalarStyle::Plain,
2980 anchor: self.pending_anchor.take(),
2981 tag: self.pending_tag.take(),
2982 },
2983 zero_span(pos),
2984 ));
2985 self.advance_mapping_to_value();
2986 return StepResult::Continue;
2987 }
2988 // Check for tab-indented block structure after explicit value marker.
2989 // `: TAB -`, `: TAB ?`, or `: TAB key:` are invalid because the tab
2990 // makes the following block-structure-forming content block-indented
2991 // via a tab, which is forbidden (YAML 1.2 §6.1).
2992 if let Some(line) = self.lexer.peek_next_line() {
2993 let after_spaces = line.content.trim_start_matches(' ');
2994 if let Some(after_colon) = after_spaces.strip_prefix(':') {
2995 if !after_colon.is_empty() {
2996 let value = after_colon.trim_start_matches([' ', '\t']);
2997 let separator = &after_colon[..after_colon.len() - value.len()];
2998 if separator.contains('\t') && is_tab_indented_block_indicator(value) {
2999 let err_pos = line.pos;
3000 self.failed = true;
3001 self.lexer.consume_line();
3002 return StepResult::Yield(Err(Error {
3003 pos: err_pos,
3004 message: "tab character is not valid block indentation".into(),
3005 }));
3006 }
3007 }
3008 }
3009 }
3010 self.consume_explicit_value_line(key_indent);
3011 return StepResult::Continue;
3012 }
3013
3014 // If the mapping is in Value phase and the next line is another key
3015 // (not a `: value` line), the previous key had no value — emit empty.
3016 if self.coll_stack.last().is_some_and(|top| {
3017 matches!(top, CollectionEntry::Mapping(col, MappingPhase::Value, _) if *col == effective_key_indent)
3018 }) {
3019 let pos = self.lexer.current_pos();
3020 self.queue.push_back((
3021 Event::Scalar {
3022 value: std::borrow::Cow::Borrowed(""),
3023 style: ScalarStyle::Plain,
3024 anchor: self.pending_anchor.take(),
3025 tag: None,
3026 },
3027 zero_span(pos),
3028 ));
3029 self.advance_mapping_to_key();
3030 return StepResult::Continue;
3031 }
3032
3033 // Check for tab-indented block structure after explicit key marker.
3034 // `? TAB -`, `? TAB ?`, or `? TAB key:` are invalid because the tab
3035 // makes the following block-structure-forming content block-indented
3036 // via a tab, which is forbidden (YAML 1.2 §6.1).
3037 if let Some(line) = self.lexer.peek_next_line() {
3038 let after_spaces = line.content.trim_start_matches(' ');
3039 if let Some(after_q) = after_spaces.strip_prefix('?') {
3040 if !after_q.is_empty() {
3041 let inline = after_q.trim_start_matches([' ', '\t']);
3042 let separator = &after_q[..after_q.len() - inline.len()];
3043 if separator.contains('\t') && is_tab_indented_block_indicator(inline) {
3044 let err_pos = line.pos;
3045 self.failed = true;
3046 self.lexer.consume_line();
3047 return StepResult::Yield(Err(Error {
3048 pos: err_pos,
3049 message: "tab character is not valid block indentation".into(),
3050 }));
3051 }
3052 }
3053 }
3054 }
3055 // Normal key line: consume and emit key scalar.
3056 // property_origin_indent has served its purpose (selecting effective
3057 // indent for the mapping-open and for subsequent continues). Clear it
3058 // so it does not affect unrelated subsequent entries.
3059 self.property_origin_indent = None;
3060 let consumed = self.consume_mapping_entry(key_indent);
3061 match consumed {
3062 ConsumedMapping::ExplicitKey { had_key_inline } => {
3063 if had_key_inline {
3064 // The key content will appear inline (already prepended).
3065 // No explicit-key-pending needed since the key content is
3066 // already in the buffer.
3067 } else {
3068 let pos = self.lexer.current_pos();
3069 self.queue.push_back((
3070 Event::Scalar {
3071 value: std::borrow::Cow::Borrowed(""),
3072 style: ScalarStyle::Plain,
3073 anchor: self.pending_anchor.take(),
3074 tag: self.pending_tag.take(),
3075 },
3076 zero_span(pos),
3077 ));
3078 self.advance_mapping_to_value();
3079 // The key content is on the NEXT line — mark that an explicit
3080 // key is pending so block sequence entries are allowed
3081 // (e.g. `?\n- seq_key`).
3082 self.explicit_key_pending = true;
3083 }
3084 }
3085 ConsumedMapping::ImplicitKey {
3086 key_value,
3087 key_style,
3088 key_span,
3089 } => {
3090 self.queue.push_back((
3091 Event::Scalar {
3092 value: key_value,
3093 style: key_style,
3094 anchor: self.pending_anchor.take(),
3095 tag: self.pending_tag.take(),
3096 },
3097 key_span,
3098 ));
3099 self.advance_mapping_to_value();
3100 }
3101 ConsumedMapping::QuotedKeyError { pos, message } => {
3102 self.failed = true;
3103 return StepResult::Yield(Err(Error { pos, message }));
3104 }
3105 ConsumedMapping::InlineImplicitMappingError { pos } => {
3106 // The inline value is a block node (mapping or sequence indicator)
3107 // which cannot appear inline as a mapping value — block nodes must
3108 // start on a new line.
3109 self.failed = true;
3110 return StepResult::Yield(Err(Error {
3111 pos,
3112 message:
3113 "block node cannot appear as inline value; use a new line or a flow node"
3114 .into(),
3115 }));
3116 }
3117 }
3118 StepResult::Continue
3119 }
3120
3121 /// True when the next line is a bare value indicator (`: ` or `:`
3122 /// followed by space/EOL), used for the explicit-key form.
3123 fn is_value_indicator_line(&self) -> bool {
3124 let Some(line) = self.lexer.peek_next_line() else {
3125 return false;
3126 };
3127 let trimmed = line.content.trim_start_matches(' ');
3128 if !trimmed.starts_with(':') {
3129 return false;
3130 }
3131 let after_colon = &trimmed[1..];
3132 after_colon.is_empty()
3133 || after_colon.starts_with(' ')
3134 || after_colon.starts_with('\t')
3135 || after_colon.starts_with('\n')
3136 || after_colon.starts_with('\r')
3137 }
3138
3139 /// Consume a `: value` line (explicit value indicator).
3140 ///
3141 /// If there is inline content after `: `, prepend a synthetic line for it
3142 /// so the next iteration emits it as the value scalar.
3143 fn consume_explicit_value_line(&mut self, key_indent: usize) {
3144 // SAFETY: caller checked is_value_indicator_line() — the line exists.
3145 let Some(line) = self.lexer.peek_next_line() else {
3146 unreachable!("consume_explicit_value_line called without a pending line")
3147 };
3148
3149 // Extract all data from the borrowed line before any mutable lexer calls.
3150 let content: &'input str = line.content;
3151 let line_pos = line.pos;
3152 let line_break_type = line.break_type;
3153
3154 let leading_spaces = content.len() - content.trim_start_matches(' ').len();
3155 let trimmed = &content[leading_spaces..];
3156
3157 // Advance past `:` and any whitespace.
3158 let after_colon = &trimmed[1..]; // skip ':'
3159 let value_content = after_colon.trim_start_matches([' ', '\t']);
3160 // A comment-only value (e.g. `: # lala`) is not a real inline value.
3161 let had_value_inline = !value_content.is_empty() && !value_content.starts_with('#');
3162
3163 if had_value_inline {
3164 let spaces_after_colon = after_colon.len() - value_content.len();
3165 let total_offset = leading_spaces + 1 + spaces_after_colon;
3166 let value_col = key_indent + 1 + spaces_after_colon;
3167 let value_pos = Pos {
3168 byte_offset: line_pos.byte_offset + total_offset,
3169 char_offset: line_pos.char_offset + total_offset,
3170 line: line_pos.line,
3171 column: line_pos.column + total_offset,
3172 };
3173 let synthetic = Line {
3174 content: value_content,
3175 offset: value_pos.byte_offset,
3176 indent: value_col,
3177 break_type: line_break_type,
3178 pos: value_pos,
3179 };
3180 self.lexer.consume_line();
3181 self.lexer.prepend_inline_line(synthetic);
3182 } else {
3183 // `:` with no real value content (either bare or comment-only).
3184 // Consume the indicator line and advance to Value phase — the next
3185 // line may be a block node (the actual value), or if the next line
3186 // is another key at the same indent, the main loop emits an empty
3187 // scalar at that point (see the Value-phase empty-scalar guard).
3188 self.lexer.consume_line();
3189 self.advance_mapping_to_value();
3190 }
3191 }
3192
3193 /// Handle a flow collection (`[...]` or `{...}`) starting on the current line.
3194 ///
3195 /// This method reads the complete flow collection — potentially spanning
3196 /// multiple physical lines — and pushes all events (SequenceStart/End,
3197 /// MappingStart/End, Scalar) to `self.queue`. It returns when the
3198 /// outermost closing delimiter (`]` or `}`) is consumed.
3199 ///
3200 /// ## Security invariants
3201 ///
3202 /// - **No recursion:** the parser uses an explicit `Vec<FlowFrame>` stack
3203 /// rather than recursive function calls, preventing stack overflow on
3204 /// deeply nested input.
3205 /// - **Unified depth limit:** each new nested collection checks against
3206 /// `MAX_COLLECTION_DEPTH` using the same `coll_stack.len()` counter as
3207 /// block collections, so flow and block nesting depths are additive.
3208 /// - **Incremental parsing:** content is processed line-by-line; no
3209 /// `String` buffer holds the entire flow body.
3210 /// - **Unterminated collection:** reaching EOF without the matching closing
3211 /// delimiter returns `Err`.
3212 #[allow(clippy::too_many_lines)]
3213 fn handle_flow_collection(&mut self) -> StepResult<'input> {
3214 use crate::lexer::scan_plain_line_flow;
3215 use std::borrow::Cow;
3216
3217 // -----------------------------------------------------------------------
3218 // Local types for the explicit flow-parser stack.
3219 // -----------------------------------------------------------------------
3220
3221 /// One frame on the explicit flow-parser stack.
3222 #[derive(Clone, Copy)]
3223 enum FlowFrame {
3224 /// An open `[...]` sequence.
3225 ///
3226 /// `has_value` is `false` immediately after opening and immediately
3227 /// after each comma; it becomes `true` when a scalar or nested
3228 /// collection is emitted. A comma arriving when `has_value` is
3229 /// `false` is a leading comma error.
3230 ///
3231 /// `after_colon` is `true` when we have just consumed a `:` value
3232 /// separator in a single-pair implicit mapping context. In this
3233 /// state a new scalar or collection is the value of the single-pair
3234 /// mapping — not a new entry — so the missing-comma check must not
3235 /// fire.
3236 ///
3237 /// `last_was_plain` is `true` when the most recent emitted item was
3238 /// a plain scalar. Plain scalars may span multiple lines in flow
3239 /// context, so the missing-comma check must not fire after a plain
3240 /// scalar (the next line's content may be a continuation).
3241 Sequence {
3242 has_value: bool,
3243 after_colon: bool,
3244 last_was_plain: bool,
3245 },
3246 /// An open `{...}` mapping.
3247 ///
3248 /// `has_value` tracks the same invariant as in `Sequence` but for
3249 /// the mapping as a whole (not per key/value pair).
3250 ///
3251 /// `last_was_plain` mirrors the same concept as in `Sequence`: when
3252 /// the most recent emitted item was a plain scalar, the next line
3253 /// may be a multi-line continuation, so indicator-start validation
3254 /// must be deferred until we know whether it is a continuation.
3255 Mapping {
3256 phase: FlowMappingPhase,
3257 has_value: bool,
3258 last_was_plain: bool,
3259 },
3260 }
3261
3262 // Design note — phase-advance pattern
3263 //
3264 // Four sites below repeat the same `if let Some(frame) = flow_stack.last_mut()
3265 // { match frame { Sequence { has_value } => ... Mapping { phase, has_value } =>
3266 // ... } }` shape. Extracting a helper function would require moving `FlowFrame`
3267 // and `FlowMappingPhase` to module scope — adding module-level types whose sole
3268 // purpose is to enable this refactor adds more complexity than the duplication
3269 // costs. Each site is 6–8 lines and clearly labelled by its comment; the
3270 // repetition is intentional and stable.
3271
3272 // -----------------------------------------------------------------------
3273 // Buffer-management invariant
3274 // -----------------------------------------------------------------------
3275 //
3276 // The line buffer always holds the current line un-consumed. We peek to
3277 // read content and only consume the line when we need to advance past it
3278 // (end-of-line or quoted-scalar delegation).
3279 //
3280 // `cur_content` / `cur_base_pos` always mirror what `peek_next_line()`
3281 // returns. After any call that changes the buffer (consume_line /
3282 // prepend_inline_line), we immediately re-sync via peek.
3283 //
3284 // Helper: advance `pos` over `content[..byte_len]`, one char at a time.
3285
3286 let abs_pos = |base: Pos, content: &str, i: usize| -> Pos {
3287 let mut p = base;
3288 for ch in content[..i].chars() {
3289 p = p.advance(ch);
3290 }
3291 p
3292 };
3293
3294 // -----------------------------------------------------------------------
3295 // Initialise: read the current line, locate the opening delimiter.
3296 // -----------------------------------------------------------------------
3297
3298 // SAFETY: caller verified via peek in step_in_document.
3299 let Some(first_line) = self.lexer.peek_next_line() else {
3300 unreachable!("handle_flow_collection called without a pending line")
3301 };
3302
3303 let leading = first_line.content.len() - first_line.content.trim_start_matches(' ').len();
3304 // The physical line number where the outermost flow collection opened.
3305 // Used to detect multi-line flow keys (C2SP).
3306 let start_line = first_line.pos.line;
3307 // The physical line number of the most recent emitted value (scalar or
3308 // inner-collection close). Used to detect multi-line implicit keys (DK4H):
3309 // a `:` value separator on a different line than the preceding key is invalid.
3310 let mut last_token_line = first_line.pos.line;
3311 // Set when a `?` explicit-key indicator is consumed inside a flow sequence.
3312 // Suppresses the DK4H single-line check for the corresponding `:` separator —
3313 // explicit keys in flow sequences may span multiple lines (YAML 1.2 §7.4.2).
3314 let mut explicit_key_in_seq = false;
3315
3316 // Stack for tracking open flow collections (nested via explicit iteration,
3317 // not recursion — security requirement).
3318 let mut flow_stack: Vec<FlowFrame> = Vec::new();
3319 // All events assembled during this call (pushed to self.queue at end).
3320 let mut events: Vec<(Event<'input>, Span)> = Vec::new();
3321 // Current byte offset within `cur_content`.
3322 let mut pos_in_line: usize = leading;
3323 // Pending anchor for the next node in this flow collection.
3324 // Seeded from any block-context anchor that was pending when this flow
3325 // collection was entered (e.g. `&seq [a, b]` sets pending_anchor before
3326 // the `[` is dispatched to handle_flow_collection).
3327 let mut pending_flow_anchor: Option<&'input str> = self.pending_anchor.take();
3328 // Pending tag for the next node in this flow collection.
3329 // Seeded from any block-context tag that was pending when this flow
3330 // collection was entered (e.g. `!!seq [a, b]` sets pending_tag before
3331 // the `[` is dispatched to handle_flow_collection).
3332 let mut pending_flow_tag: Option<std::borrow::Cow<'input, str>> = self.pending_tag.take();
3333
// Re-sync helper: evaluates to the `(content, pos)` pair of the line at the
// head of the buffer, ready to be assigned to `cur_content` / `cur_base_pos`.
// When the buffer is empty (EOF mid-flow) it evaluates to an empty string
// paired with the lexer's current position; call sites check for EOF
// themselves.
// INVARIANT: invoked every time after consuming or prepending a line.
macro_rules! resync {
    () => {{
        if let Some(head) = self.lexer.peek_next_line() {
            // Hand back the peeked line's content slice and start position.
            (head.content, head.pos)
        } else {
            // EOF: nothing is buffered.
            ("", self.lexer.current_pos())
        }
    }};
}
3352
3353 let (mut cur_content, mut cur_base_pos) = resync!();
3354
3355 // The minimum indent for continuation lines in this flow collection.
3356 // When the flow collection is inside an enclosing block collection,
3357 // continuation lines must be indented more than the enclosing block's
3358 // indent level (YAML 1.2: flow context lines must not regress to or
3359 // below the enclosing block indent level).
3360 // At document root (coll_stack empty), there is no enclosing block, so
3361 // no constraint — represented as None.
3362 let flow_min_indent: Option<usize> = self.coll_stack.last().map(|e| e.indent());
3363
3364 // -----------------------------------------------------------------------
3365 // Main parse loop — iterates over characters in the current (and
3366 // subsequent) lines until the outermost closing delimiter is found.
3367 // -----------------------------------------------------------------------
3368
3369 'outer: loop {
3370 // Document markers (`---` and `...`) are only valid at the document
3371 // level — they are illegal inside flow collections (YAML 1.2 §8.1).
3372 // A document marker must appear at the very beginning of a line
3373 // (column 0) and be followed by whitespace or end-of-line.
3374 if pos_in_line == 0
3375 && (cur_content.starts_with("---") || cur_content.starts_with("..."))
3376 {
3377 let rest = &cur_content[3..];
3378 if rest.is_empty() || rest.starts_with(' ') || rest.starts_with('\t') {
3379 let err_pos = abs_pos(cur_base_pos, cur_content, 0);
3380 self.failed = true;
3381 return StepResult::Yield(Err(Error {
3382 pos: err_pos,
3383 message: "document marker is not allowed inside a flow collection".into(),
3384 }));
3385 }
3386 }
3387
3388 // Tabs as indentation on a new line in flow context are invalid
3389 // (YAML 1.2 §6.2 — indentation uses spaces only). A tab at the
3390 // start of a continuation line (before the first non-whitespace
3391 // character) is a tab used as indentation. Blank lines (tab only,
3392 // no content) are exempt — they are treated as empty separator lines.
3393 if pos_in_line == 0 {
3394 let has_tab_indent =
3395 cur_content.starts_with('\t') && !cur_content.trim().is_empty();
3396 if has_tab_indent {
3397 let err_pos = abs_pos(cur_base_pos, cur_content, 0);
3398 self.failed = true;
3399 return StepResult::Yield(Err(Error {
3400 pos: err_pos,
3401 message: "tab character is not allowed as indentation in flow context"
3402 .into(),
3403 }));
3404 }
3405 }
3406
3407 // Skip leading spaces/tabs and comments.
3408 // `#` is a comment start only when preceded by whitespace (or at
3409 // start of line, i.e. pos_in_line == 0 with all prior chars being
3410 // whitespace). A `#` immediately after a token (e.g. `,#`) is not
3411 // a comment — it is an error character that will be caught below.
3412 let prev_was_ws_at_loop_entry = pos_in_line == 0
3413 || cur_content[..pos_in_line]
3414 .chars()
3415 .next_back()
3416 .is_some_and(|c| c == ' ' || c == '\t');
3417 let mut prev_was_ws = prev_was_ws_at_loop_entry;
3418 while pos_in_line < cur_content.len() {
3419 let Some(ch) = cur_content[pos_in_line..].chars().next() else {
3420 break;
3421 };
3422 if ch == ' ' || ch == '\t' {
3423 prev_was_ws = true;
3424 pos_in_line += 1;
3425 } else if ch == '#' && prev_was_ws {
3426 // Emit a Comment event for this `# comment` to end of line.
3427 // No MAX_COMMENT_LEN check here — this comment is bounded by the
3428 // physical line length (itself bounded by total input size), the
3429 // same reason drain_trailing_comment does not apply the limit.
3430 let hash_pos = abs_pos(cur_base_pos, cur_content, pos_in_line);
3431 // Comment text: everything after `#` (byte at pos_in_line is `#`,
3432 // ASCII 1 byte, so text starts at pos_in_line + 1).
3433 let text_start = pos_in_line + 1;
3434 // SAFETY: text_start <= cur_content.len() because we found
3435 // `#` at pos_in_line which is < cur_content.len().
3436 let comment_text: &'input str = cur_content.get(text_start..).unwrap_or("");
3437 let mut comment_end = hash_pos.advance('#');
3438 for c in comment_text.chars() {
3439 comment_end = comment_end.advance(c);
3440 }
3441 let comment_span = Span {
3442 start: hash_pos,
3443 end: comment_end,
3444 };
3445 events.push((Event::Comment { text: comment_text }, comment_span));
3446 pos_in_line = cur_content.len();
3447 } else {
3448 break;
3449 }
3450 }
3451
3452 // ----------------------------------------------------------------
3453 // End of line — consume and advance.
3454 // ----------------------------------------------------------------
3455 if pos_in_line >= cur_content.len() {
3456 self.lexer.consume_line();
3457
3458 if flow_stack.is_empty() {
3459 // Outermost collection closed; done.
3460 break 'outer;
3461 }
3462
3463 (cur_content, cur_base_pos) = resync!();
3464 if cur_content.is_empty() && self.lexer.at_eof() {
3465 let err_pos = self.lexer.current_pos();
3466 self.failed = true;
3467 return StepResult::Yield(Err(Error {
3468 pos: err_pos,
3469 message: "unterminated flow collection: unexpected end of input".into(),
3470 }));
3471 }
3472
3473 // Flow continuation lines must be indented more than the
3474 // enclosing block context (YAML 1.2: flow lines must not
3475 // regress to the block indent level). Blank/whitespace-only
3476 // lines are exempt — they act as line separators.
3477 // At document root (no enclosing block), there is no
3478 // indentation constraint.
3479 if let Some(min_indent) = flow_min_indent {
3480 if let Some(next_line) = self.lexer.peek_next_line() {
3481 let trimmed = next_line.content.trim();
3482 if !trimmed.is_empty() && next_line.indent <= min_indent {
3483 let err_pos = next_line.pos;
3484 self.failed = true;
3485 return StepResult::Yield(Err(Error {
3486 pos: err_pos,
3487 message: "flow collection continuation line is not indented enough"
3488 .into(),
3489 }));
3490 }
3491 }
3492 }
3493
3494 pos_in_line = 0;
3495 continue 'outer;
3496 }
3497
3498 let Some(ch) = cur_content[pos_in_line..].chars().next() else {
3499 continue 'outer;
3500 };
3501
3502 // ----------------------------------------------------------------
3503 // Opening delimiters `[` and `{`
3504 // ----------------------------------------------------------------
3505 if ch == '[' || ch == '{' {
3506 // Check unified depth limit (flow + block combined).
3507 let total_depth = self.coll_stack.len() + flow_stack.len();
3508 if total_depth >= MAX_COLLECTION_DEPTH {
3509 let err_pos = abs_pos(cur_base_pos, cur_content, pos_in_line);
3510 self.failed = true;
3511 return StepResult::Yield(Err(Error {
3512 pos: err_pos,
3513 message: "collection nesting depth exceeds limit".into(),
3514 }));
3515 }
3516
3517 let open_pos = abs_pos(cur_base_pos, cur_content, pos_in_line);
3518 let open_span = zero_span(open_pos);
3519 pos_in_line += 1;
3520
3521 if ch == '[' {
3522 flow_stack.push(FlowFrame::Sequence {
3523 has_value: false,
3524 after_colon: false,
3525 last_was_plain: false,
3526 });
3527 events.push((
3528 Event::SequenceStart {
3529 anchor: pending_flow_anchor.take(),
3530 tag: pending_flow_tag.take(),
3531 style: CollectionStyle::Flow,
3532 },
3533 open_span,
3534 ));
3535 } else {
3536 flow_stack.push(FlowFrame::Mapping {
3537 phase: FlowMappingPhase::Key,
3538 has_value: false,
3539 last_was_plain: false,
3540 });
3541 events.push((
3542 Event::MappingStart {
3543 anchor: pending_flow_anchor.take(),
3544 tag: pending_flow_tag.take(),
3545 style: CollectionStyle::Flow,
3546 },
3547 open_span,
3548 ));
3549 }
3550 continue 'outer;
3551 }
3552
3553 // ----------------------------------------------------------------
3554 // Closing delimiters `]` and `}`
3555 // ----------------------------------------------------------------
3556 if ch == ']' || ch == '}' {
3557 let close_pos = abs_pos(cur_base_pos, cur_content, pos_in_line);
3558 let close_span = zero_span(close_pos);
3559 pos_in_line += 1;
3560
3561 let Some(top) = flow_stack.pop() else {
3562 // Closing delimiter with empty stack — mismatched.
3563 self.failed = true;
3564 return StepResult::Yield(Err(Error {
3565 pos: close_pos,
3566 message: format!("unexpected '{ch}' in flow context"),
3567 }));
3568 };
3569
3570 match (ch, top) {
3571 (']', FlowFrame::Sequence { .. }) => {
3572 events.push((Event::SequenceEnd, close_span));
3573 }
3574 ('}', FlowFrame::Mapping { phase, .. }) => {
3575 // If mapping is in Value phase (key emitted, no value yet),
3576 // emit empty value before closing.
3577 if phase == FlowMappingPhase::Value {
3578 events.push((empty_scalar_event(), close_span));
3579 }
3580 events.push((Event::MappingEnd, close_span));
3581 }
3582 (']', FlowFrame::Mapping { .. }) => {
3583 self.failed = true;
3584 return StepResult::Yield(Err(Error {
3585 pos: close_pos,
3586 message: "expected '}' to close flow mapping, found ']'".into(),
3587 }));
3588 }
3589 ('}', FlowFrame::Sequence { .. }) => {
3590 self.failed = true;
3591 return StepResult::Yield(Err(Error {
3592 pos: close_pos,
3593 message: "expected ']' to close flow sequence, found '}'".into(),
3594 }));
3595 }
3596 _ => unreachable!("all (ch, top) combinations covered above"),
3597 }
3598
3599 // After a nested collection closes inside a parent frame,
3600 // mark the parent as having a value (the nested collection was it),
3601 // and if it's a mapping in Value phase, advance to Key phase.
3602 if let Some(parent) = flow_stack.last_mut() {
3603 // Update the last-token-line tracker so the multi-line implicit
3604 // key check (DK4H) knows where the key (inner collection) ended.
3605 last_token_line = cur_base_pos.line;
3606 match parent {
3607 FlowFrame::Sequence {
3608 has_value,
3609 after_colon,
3610 last_was_plain,
3611 } => {
3612 *has_value = true;
3613 *after_colon = false;
3614 *last_was_plain = false;
3615 }
3616 FlowFrame::Mapping {
3617 phase,
3618 has_value,
3619 last_was_plain,
3620 } => {
3621 *has_value = true;
3622 *last_was_plain = false;
3623 if *phase == FlowMappingPhase::Value {
3624 *phase = FlowMappingPhase::Key;
3625 }
3626 }
3627 }
3628 }
3629
3630 if flow_stack.is_empty() {
3631 // Outermost collection closed.
3632 // Consume the current line; prepend any non-empty tail so the
3633 // block state machine can process content after the `]`/`}`.
3634 let tail_content = &cur_content[pos_in_line..];
3635 let tail_trimmed = tail_content.trim_start_matches([' ', '\t']);
3636 // `#` is a comment only when preceded by whitespace. If the
3637 // closing bracket is immediately followed by `#` (no space),
3638 // that is not a valid comment — it is a syntax error.
3639 if tail_trimmed.starts_with('#') {
3640 let prev_was_ws = pos_in_line == 0
3641 || cur_content[..pos_in_line]
3642 .chars()
3643 .next_back()
3644 .is_some_and(|c| c == ' ' || c == '\t');
3645 if !prev_was_ws {
3646 let err_pos = abs_pos(cur_base_pos, cur_content, pos_in_line);
3647 self.failed = true;
3648 return StepResult::Yield(Err(Error {
3649 pos: err_pos,
3650 message: "comment requires at least one space before '#'".into(),
3651 }));
3652 }
3653 }
3654 // A flow collection used as an implicit mapping key must
3655 // fit on a single line (YAML 1.2 §7.4.2). If the tail
3656 // begins with `:` (making this collection a mapping key) and
3657 // the closing delimiter is on a different line than the
3658 // opening delimiter, reject as a multi-line flow key.
3659 if tail_trimmed.starts_with(':') && cur_base_pos.line != start_line {
3660 let err_pos = abs_pos(cur_base_pos, cur_content, pos_in_line);
3661 self.failed = true;
3662 return StepResult::Yield(Err(Error {
3663 pos: err_pos,
3664 message: "multi-line flow collection cannot be used as an implicit mapping key".into(),
3665 }));
3666 }
3667 // If the block collection stack is empty AND the tail does not
3668 // start with `:` (which would indicate this flow collection is a
3669 // mapping key), the flow collection is the document root node.
3670 // Mark it so subsequent content on the NEXT LINE triggers the
3671 // root-node guard in `step_in_document`.
3672 if self.coll_stack.is_empty() && !tail_trimmed.starts_with(':') {
3673 self.root_node_emitted = true;
3674 }
3675 self.lexer.consume_line();
3676 if !tail_trimmed.is_empty() {
3677 let tail_pos = abs_pos(cur_base_pos, cur_content, pos_in_line);
3678 let synthetic = crate::lines::Line {
3679 content: tail_content,
3680 offset: tail_pos.byte_offset,
3681 indent: tail_pos.column,
3682 break_type: crate::lines::BreakType::Eof,
3683 pos: tail_pos,
3684 };
3685 self.lexer.prepend_inline_line(synthetic);
3686 }
3687 break 'outer;
3688 }
3689 continue 'outer;
3690 }
3691
3692 // ----------------------------------------------------------------
3693 // Comma separator
3694 // ----------------------------------------------------------------
3695 if ch == ',' {
3696 // Leading-comma check: if the current frame has not yet produced
3697 // any value since it was opened (or since the last comma), this
3698 // comma is invalid — e.g. `[,]` or `{,}`.
3699 let leading = match flow_stack.last() {
3700 Some(
3701 FlowFrame::Sequence { has_value, .. }
3702 | FlowFrame::Mapping { has_value, .. },
3703 ) => !has_value,
3704 None => false,
3705 };
3706 if leading {
3707 let err_pos = abs_pos(cur_base_pos, cur_content, pos_in_line);
3708 self.failed = true;
3709 return StepResult::Yield(Err(Error {
3710 pos: err_pos,
3711 message: "invalid leading comma in flow collection".into(),
3712 }));
3713 }
3714
3715 pos_in_line += 1;
3716
3717 // Skip whitespace after comma.
3718 while pos_in_line < cur_content.len() {
3719 match cur_content[pos_in_line..].chars().next() {
3720 Some(c) if c == ' ' || c == '\t' => pos_in_line += 1,
3721 _ => break,
3722 }
3723 }
3724
3725 // Double-comma check: next char must not be another comma.
3726 if cur_content[pos_in_line..].starts_with(',') {
3727 let err_pos = abs_pos(cur_base_pos, cur_content, pos_in_line);
3728 self.failed = true;
3729 return StepResult::Yield(Err(Error {
3730 pos: err_pos,
3731 message: "invalid empty entry: consecutive commas in flow collection"
3732 .into(),
3733 }));
3734 }
3735
3736 // If a tag or anchor is pending but no scalar was emitted yet,
3737 // the comma terminates an implicit empty-scalar node. Emit it
3738 // so the pending properties are attached to the correct node
3739 // rather than carried forward to the next entry.
3740 if pending_flow_tag.is_some() || pending_flow_anchor.is_some() {
3741 let empty_pos = abs_pos(cur_base_pos, cur_content, pos_in_line);
3742 events.push((
3743 Event::Scalar {
3744 value: Cow::Borrowed(""),
3745 style: ScalarStyle::Plain,
3746 anchor: pending_flow_anchor.take(),
3747 tag: pending_flow_tag.take(),
3748 },
3749 zero_span(empty_pos),
3750 ));
3751 // Advance phase: this scalar acts as a value (or key).
3752 if let Some(frame) = flow_stack.last_mut() {
3753 match frame {
3754 FlowFrame::Sequence {
3755 has_value,
3756 after_colon,
3757 last_was_plain,
3758 } => {
3759 *has_value = true;
3760 *after_colon = false;
3761 *last_was_plain = false;
3762 }
3763 FlowFrame::Mapping {
3764 phase,
3765 has_value,
3766 last_was_plain,
3767 } => {
3768 *has_value = true;
3769 *last_was_plain = false;
3770 *phase = match *phase {
3771 FlowMappingPhase::Key => FlowMappingPhase::Value,
3772 FlowMappingPhase::Value => FlowMappingPhase::Key,
3773 };
3774 }
3775 }
3776 }
3777 }
3778
3779 // Reset has_value and (for mappings) go back to Key phase.
3780 if let Some(frame) = flow_stack.last_mut() {
3781 match frame {
3782 FlowFrame::Sequence {
3783 has_value,
3784 after_colon,
3785 last_was_plain,
3786 } => {
3787 *has_value = false;
3788 *after_colon = false;
3789 *last_was_plain = false;
3790 }
3791 FlowFrame::Mapping {
3792 phase,
3793 has_value,
3794 last_was_plain,
3795 } => {
3796 *has_value = false;
3797 *last_was_plain = false;
3798 if *phase == FlowMappingPhase::Value {
3799 *phase = FlowMappingPhase::Key;
3800 }
3801 }
3802 }
3803 }
3804 // Reset last_token_line after a comma — the next key can start
3805 // on the same line as the comma (or any subsequent line) without
3806 // triggering the multi-line implicit key error.
3807 last_token_line = cur_base_pos.line;
3808
3809 continue 'outer;
3810 }
3811
3812 // ----------------------------------------------------------------
3813 // Block scalar indicators forbidden in flow context
3814 // ----------------------------------------------------------------
3815 if ch == '|' || ch == '>' {
3816 let err_pos = abs_pos(cur_base_pos, cur_content, pos_in_line);
3817 self.failed = true;
3818 return StepResult::Yield(Err(Error {
3819 pos: err_pos,
3820 message: format!(
3821 "block scalar indicator '{ch}' is not allowed inside a flow collection"
3822 ),
3823 }));
3824 }
3825
3826 // ----------------------------------------------------------------
3827 // Block sequence entry indicator `-` forbidden in flow context.
3828 //
3829 // Per YAML 1.2 §7.4, block collections cannot appear inside flow
3830 // context. A `-` followed by space, tab, or end-of-content is
3831 // the block-sequence entry indicator; a `-` followed by any other
3832 // non-separator character is a valid plain-scalar start (e.g. `-x`
3833 // or `-1` are legal plain scalars in flow context).
3834 // ----------------------------------------------------------------
3835 if ch == '-' {
3836 let after = &cur_content[pos_in_line + 1..];
3837 let next_c = after.chars().next();
3838 if next_c.is_none_or(|c| matches!(c, ' ' | '\t')) {
3839 let err_pos = abs_pos(cur_base_pos, cur_content, pos_in_line);
3840 self.failed = true;
3841 return StepResult::Yield(Err(Error {
3842 pos: err_pos,
3843 message: "block sequence entry '-' is not allowed inside a flow collection"
3844 .into(),
3845 }));
3846 }
3847 }
3848
3849 // ----------------------------------------------------------------
3850 // Quoted scalars — delegate to existing lexer methods.
3851 //
3852 // Strategy: consume the current line, prepend a synthetic line
3853 // starting exactly at the quote character, call the method, then
3854 // re-sync `cur_content` / `cur_base_pos` from the buffer.
3855 // ----------------------------------------------------------------
3856 if ch == '\'' || ch == '"' {
3857 // `remaining` borrows from `cur_content` which borrows from `'input`.
3858 // We capture it before touching the lexer buffer.
3859 let remaining: &'input str = &cur_content[pos_in_line..];
3860 let cur_abs_pos = abs_pos(cur_base_pos, cur_content, pos_in_line);
3861
3862 // Consume the current line from the buffer and replace it with
3863 // a synthetic line that starts at the quote character. The
3864 // quoted-scalar method will consume this synthetic line entirely,
3865 // including any content after the closing quote — so we must
3866 // reconstruct the tail from `remaining` and `span` below.
3867 self.lexer.consume_line();
3868 let synthetic = crate::lines::Line {
3869 content: remaining,
3870 offset: cur_abs_pos.byte_offset,
3871 indent: cur_abs_pos.column,
3872 break_type: crate::lines::BreakType::Eof,
3873 pos: cur_abs_pos,
3874 };
3875 self.lexer.prepend_inline_line(synthetic);
3876
3877 // Call the appropriate quoted-scalar method.
3878 let result = if ch == '\'' {
3879 self.lexer.try_consume_single_quoted(0)
3880 } else {
3881 // Flow context: no block-indentation constraint on
3882 // continuation lines of double-quoted scalars.
3883 self.lexer.try_consume_double_quoted(None)
3884 };
3885
3886 let (value, span) = match result {
3887 Ok(Some(vs)) => vs,
3888 Ok(None) => {
3889 self.failed = true;
3890 return StepResult::Yield(Err(Error {
3891 pos: cur_abs_pos,
3892 message: "expected quoted scalar".into(),
3893 }));
3894 }
3895 Err(e) => {
3896 self.failed = true;
3897 return StepResult::Yield(Err(e));
3898 }
3899 };
3900
3901 let style = if ch == '\'' {
3902 ScalarStyle::SingleQuoted
3903 } else {
3904 ScalarStyle::DoubleQuoted
3905 };
3906 events.push((
3907 Event::Scalar {
3908 value,
3909 style,
3910 anchor: pending_flow_anchor.take(),
3911 tag: pending_flow_tag.take(),
3912 },
3913 span,
3914 ));
3915
3916 // Reconstruct the tail after the closing quote so the flow
3917 // parser can continue with `,`, `]`, `}`, etc.
3918 //
3919 // For single-line scalars, the tail is in `remaining` at byte
3920 // offset `span.end.byte_offset - cur_abs_pos.byte_offset`.
3921 //
3922 // For multiline scalars, the lexer's continuation loop consumed
3923 // additional input lines; the tail on the closing-quote line is
3924 // stored in `self.lexer.pending_multiline_tail`. Drain it here.
3925 if let Some((tail, tail_pos)) = self.lexer.pending_multiline_tail.take() {
3926 if !tail.is_empty() {
3927 let tail_syn = crate::lines::Line {
3928 content: tail,
3929 offset: tail_pos.byte_offset,
3930 indent: tail_pos.column,
3931 break_type: crate::lines::BreakType::Eof,
3932 pos: tail_pos,
3933 };
3934 self.lexer.prepend_inline_line(tail_syn);
3935 }
3936 } else {
3937 // Single-line scalar: derive tail from `remaining`.
3938 let consumed_bytes = span.end.byte_offset - cur_abs_pos.byte_offset;
3939 let tail_in_remaining = remaining.get(consumed_bytes..).unwrap_or("");
3940 if !tail_in_remaining.is_empty() {
3941 let tail_syn = crate::lines::Line {
3942 content: tail_in_remaining,
3943 offset: span.end.byte_offset,
3944 indent: span.end.column,
3945 break_type: crate::lines::BreakType::Eof,
3946 pos: span.end,
3947 };
3948 self.lexer.prepend_inline_line(tail_syn);
3949 }
3950 }
3951
3952 // Re-sync from the buffer.
3953 (cur_content, cur_base_pos) = resync!();
3954 pos_in_line = 0;
3955 // Track where this quoted scalar (potential key) ended.
3956 last_token_line = cur_base_pos.line;
3957
3958 if cur_content.is_empty() && self.lexer.at_eof() && !flow_stack.is_empty() {
3959 let err_pos = self.lexer.current_pos();
3960 self.failed = true;
3961 return StepResult::Yield(Err(Error {
3962 pos: err_pos,
3963 message: "unterminated flow collection: unexpected end of input".into(),
3964 }));
3965 }
3966
3967 // Advance mapping phase for the emitted scalar; mark frame as having a value.
3968 if let Some(frame) = flow_stack.last_mut() {
3969 match frame {
3970 FlowFrame::Sequence {
3971 has_value,
3972 after_colon,
3973 last_was_plain,
3974 } => {
3975 *has_value = true;
3976 *after_colon = false;
3977 *last_was_plain = false;
3978 }
3979 FlowFrame::Mapping {
3980 phase,
3981 has_value,
3982 last_was_plain,
3983 } => {
3984 *has_value = true;
3985 *last_was_plain = false;
3986 *phase = match *phase {
3987 FlowMappingPhase::Key => FlowMappingPhase::Value,
3988 FlowMappingPhase::Value => FlowMappingPhase::Key,
3989 };
3990 }
3991 }
3992 }
3993
3994 continue 'outer;
3995 }
3996
3997 // ----------------------------------------------------------------
3998 // Explicit key indicator `?` in flow mappings and sequences
3999 // ----------------------------------------------------------------
4000 if ch == '?' {
4001 let next_ch = cur_content[pos_in_line + 1..].chars().next();
4002 if next_ch.is_none_or(|c| matches!(c, ' ' | '\t' | '\n' | '\r')) {
4003 // `?` followed by whitespace/EOL: explicit key indicator.
4004 // In a flow sequence, remember this so the DK4H single-line
4005 // check is suppressed for the corresponding `:` separator.
4006 if matches!(flow_stack.last(), Some(FlowFrame::Sequence { .. })) {
4007 explicit_key_in_seq = true;
4008 }
4009 pos_in_line += 1;
4010 continue 'outer;
4011 }
4012 // `?` not followed by whitespace — treat as plain scalar start.
4013 }
4014
4015 // ----------------------------------------------------------------
4016 // `:` value separator in flow mappings
4017 // ----------------------------------------------------------------
4018 if ch == ':' {
4019 let next_ch = cur_content[pos_in_line + 1..].chars().next();
4020 // `:` is a value separator when followed by whitespace/delimiter
4021 // (standard case) OR when in a flow sequence with a synthetic
4022 // current line (adjacent `:` from JSON-like key — YAML 1.2
4023 // §7.4.2). A synthetic line means the `:` is on the same
4024 // physical line as the preceding quoted scalar / collection.
4025 let is_standard_sep =
4026 next_ch.is_none_or(|c| matches!(c, ' ' | '\t' | ',' | ']' | '}' | '\n' | '\r'));
4027 let is_adjacent_json_sep = !is_standard_sep
4028 && matches!(
4029 flow_stack.last(),
4030 Some(FlowFrame::Sequence {
4031 has_value: true,
4032 ..
4033 })
4034 )
4035 && self.lexer.is_next_line_synthetic();
4036 let is_value_sep = is_standard_sep || is_adjacent_json_sep;
4037 if is_value_sep {
4038 // Multi-line implicit single-pair mapping key check (YAML 1.2 §7.4.1):
4039 // inside a flow sequence `[...]`, a single-pair mapping entry's key must
4040 // be on the same line as the `:` separator. (Flow mappings `{...}` allow
4041 // multi-line implicit keys — see YAML 1.2 §7.4.2.)
4042 // Exception: when a `?` explicit-key indicator was seen in this sequence
4043 // (`explicit_key_in_seq`), the key may span multiple lines.
4044 let in_sequence = matches!(flow_stack.last(), Some(FlowFrame::Sequence { .. }));
4045 if in_sequence && cur_base_pos.line != last_token_line && !explicit_key_in_seq {
4046 let colon_pos = abs_pos(cur_base_pos, cur_content, pos_in_line);
4047 self.failed = true;
4048 return StepResult::Yield(Err(Error {
4049 pos: colon_pos,
4050 message: "implicit flow mapping key must be on a single line".into(),
4051 }));
4052 }
4053 explicit_key_in_seq = false;
4054 if let Some(frame) = flow_stack.last_mut() {
4055 match frame {
4056 FlowFrame::Mapping {
4057 phase,
4058 has_value,
4059 last_was_plain,
4060 } => {
4061 *last_was_plain = false;
4062 if *phase == FlowMappingPhase::Key {
4063 // If a tag or anchor is pending but no key scalar was
4064 // emitted yet, the `:` terminates an implicit empty key.
4065 // Emit the empty key scalar now so the pending properties
4066 // are attached to the key, not carried to the value.
4067 if pending_flow_tag.is_some() || pending_flow_anchor.is_some() {
4068 let key_pos =
4069 abs_pos(cur_base_pos, cur_content, pos_in_line);
4070 events.push((
4071 Event::Scalar {
4072 value: Cow::Borrowed(""),
4073 style: ScalarStyle::Plain,
4074 anchor: pending_flow_anchor.take(),
4075 tag: pending_flow_tag.take(),
4076 },
4077 zero_span(key_pos),
4078 ));
4079 *has_value = true;
4080 }
4081 *phase = FlowMappingPhase::Value;
4082 }
4083 }
4084 FlowFrame::Sequence {
4085 after_colon,
4086 last_was_plain,
4087 ..
4088 } => {
4089 // `:` as value separator in a sequence means we are
4090 // entering the value part of a single-pair implicit
4091 // mapping. Mark `after_colon` so the next scalar or
4092 // collection is not rejected for missing a comma.
4093 *after_colon = true;
4094 // Reset last_was_plain so the value scalar on the next
4095 // line is not appended to the key via multi-line
4096 // plain-scalar continuation logic.
4097 *last_was_plain = false;
4098 }
4099 }
4100 }
4101 pos_in_line += 1;
4102 continue 'outer;
4103 }
4104 // `:` not followed by separator — treat as plain scalar char.
4105 }
4106
4107 // ----------------------------------------------------------------
4108 // Tag `!tag`, `!!tag`, `!<uri>`, or `!` in flow context
4109 // ----------------------------------------------------------------
4110 if ch == '!' {
4111 let bang_pos = abs_pos(cur_base_pos, cur_content, pos_in_line);
4112 let after_bang = &cur_content[pos_in_line + 1..];
4113 let tag_start = &cur_content[pos_in_line..];
4114 match scan_tag(after_bang, tag_start, bang_pos) {
4115 Err(e) => {
4116 self.failed = true;
4117 return StepResult::Yield(Err(e));
4118 }
4119 Ok((tag_slice, advance_past_bang)) => {
4120 // Total bytes: 1 (`!`) + advance_past_bang.
4121 // `!<URI>`: advance_past_bang = 1 + uri.len() + 1
4122 // `!!suffix`: advance_past_bang = 1 + suffix.len()
4123 // `!suffix`: advance_past_bang = suffix.len()
4124 // `!` alone: advance_past_bang = 0
4125 if pending_flow_tag.is_some() {
4126 self.failed = true;
4127 return StepResult::Yield(Err(Error {
4128 pos: bang_pos,
4129 message: "a node may not have more than one tag".into(),
4130 }));
4131 }
4132 // Resolve tag handle against directive scope at scan time.
4133 let resolved_flow_tag =
4134 match self.directive_scope.resolve_tag(tag_slice, bang_pos) {
4135 Ok(t) => t,
4136 Err(e) => {
4137 self.failed = true;
4138 return StepResult::Yield(Err(e));
4139 }
4140 };
4141 pending_flow_tag = Some(resolved_flow_tag);
4142 pos_in_line += 1 + advance_past_bang;
4143 // Skip any whitespace after the tag.
4144 while pos_in_line < cur_content.len() {
4145 match cur_content[pos_in_line..].chars().next() {
4146 Some(c) if c == ' ' || c == '\t' => pos_in_line += 1,
4147 _ => break,
4148 }
4149 }
4150 continue 'outer;
4151 }
4152 }
4153 }
4154
4155 // ----------------------------------------------------------------
4156 // Anchor `&name` in flow context
4157 // ----------------------------------------------------------------
4158 if ch == '&' {
4159 let after_amp = &cur_content[pos_in_line + 1..];
4160 let amp_pos = abs_pos(cur_base_pos, cur_content, pos_in_line);
4161 match scan_anchor_name(after_amp, amp_pos) {
4162 Err(e) => {
4163 self.failed = true;
4164 return StepResult::Yield(Err(e));
4165 }
4166 Ok(name) => {
4167 // Two anchors on the same flow node are an error.
4168 if pending_flow_anchor.is_some() {
4169 let amp_pos2 = abs_pos(cur_base_pos, cur_content, pos_in_line);
4170 self.failed = true;
4171 return StepResult::Yield(Err(Error {
4172 pos: amp_pos2,
4173 message: "a node may not have more than one anchor".into(),
4174 }));
4175 }
4176 pending_flow_anchor = Some(name);
4177 pos_in_line += 1 + name.len();
4178 // Skip any whitespace after the anchor name.
4179 while pos_in_line < cur_content.len() {
4180 match cur_content[pos_in_line..].chars().next() {
4181 Some(c) if c == ' ' || c == '\t' => pos_in_line += 1,
4182 _ => break,
4183 }
4184 }
4185 continue 'outer;
4186 }
4187 }
4188 }
4189
4190 // ----------------------------------------------------------------
4191 // Alias `*name` in flow context
4192 // ----------------------------------------------------------------
4193 if ch == '*' {
4194 let after_star = &cur_content[pos_in_line + 1..];
4195 let star_pos = abs_pos(cur_base_pos, cur_content, pos_in_line);
4196 // YAML 1.2 §7.1: alias nodes cannot have properties (anchor or tag).
4197 if pending_flow_tag.is_some() {
4198 self.failed = true;
4199 return StepResult::Yield(Err(Error {
4200 pos: star_pos,
4201 message: "alias node cannot have a tag property".into(),
4202 }));
4203 }
4204 if pending_flow_anchor.is_some() {
4205 self.failed = true;
4206 return StepResult::Yield(Err(Error {
4207 pos: star_pos,
4208 message: "alias node cannot have an anchor property".into(),
4209 }));
4210 }
4211 match scan_anchor_name(after_star, star_pos) {
4212 Err(e) => {
4213 self.failed = true;
4214 return StepResult::Yield(Err(e));
4215 }
4216 Ok(name) => {
4217 let alias_end = Pos {
4218 byte_offset: star_pos.byte_offset + 1 + name.len(),
4219 char_offset: star_pos.char_offset + 1 + name.chars().count(),
4220 line: star_pos.line,
4221 column: star_pos.column + 1 + name.chars().count(),
4222 };
4223 let alias_span = Span {
4224 start: star_pos,
4225 end: alias_end,
4226 };
4227 events.push((Event::Alias { name }, alias_span));
4228 pos_in_line += 1 + name.len();
4229 // Advance mapping phase; mark frame as having a value.
4230 if let Some(frame) = flow_stack.last_mut() {
4231 match frame {
4232 FlowFrame::Sequence {
4233 has_value,
4234 after_colon,
4235 last_was_plain,
4236 } => {
4237 *has_value = true;
4238 *after_colon = false;
4239 *last_was_plain = false;
4240 }
4241 FlowFrame::Mapping {
4242 phase,
4243 has_value,
4244 last_was_plain,
4245 } => {
4246 *has_value = true;
4247 *last_was_plain = false;
4248 *phase = match *phase {
4249 FlowMappingPhase::Key => FlowMappingPhase::Value,
4250 FlowMappingPhase::Value => FlowMappingPhase::Key,
4251 };
4252 }
4253 }
4254 }
4255 continue 'outer;
4256 }
4257 }
4258 }
4259
4260 // ----------------------------------------------------------------
4261 // Multi-line plain scalar continuation in flow context
4262 //
4263 // A plain scalar may span multiple lines (YAML §7.3.3). When the
4264 // previous emitted token was a plain scalar (`last_was_plain`) and
4265 // the current character is a valid `ns-plain-char` (i.e. it can
4266 // appear within a plain scalar body, even if it cannot *start* one),
4267 // extend the in-progress scalar rather than treating the character
4268 // as the start of a new token.
4269 //
4270 // `ns-plain-char` in flow context: any `ns-char` that is not `:` or
4271 // `#`, plus `:` followed by ns-plain-safe, plus `#` not preceded by
4272 // whitespace. At the start of a continuation line all leading
4273 // whitespace has been consumed, so `#` at position 0 here would be
4274 // `#` after whitespace — a comment start, not a continuation char.
4275 // ----------------------------------------------------------------
4276 {
4277 // For flow MAPPINGS: a plain scalar may continue a key only when
4278 // the phase is currently Value — meaning the previous scalar was
4279 // a KEY (Key→Value phase advance was done when emitting it). A
4280 // VALUE scalar (phase Value→Key) must NOT continue: the next line
4281 // is a new key that requires a preceding comma.
4282 // For flow SEQUENCES: `last_was_plain` alone is enough (single-pair
4283 // implicit mapping keys can span lines, and regular sequence items
4284 // can also continue, though commas terminate them).
4285 let frame_last_was_plain = matches!(
4286 flow_stack.last(),
4287 Some(
4288 FlowFrame::Mapping {
4289 last_was_plain: true,
4290 phase: FlowMappingPhase::Value,
4291 ..
4292 } | FlowFrame::Sequence {
4293 last_was_plain: true,
4294 ..
4295 }
4296 )
4297 );
4298 // `ns-plain-char` check: ch must not be a flow terminator, `:` (alone),
4299 // or `#` (comment start after whitespace, which is the only `#` we can
4300 // see here since whitespace was consumed).
4301 let is_ns_plain_char_continuation = frame_last_was_plain
4302 && !matches!(ch, ',' | '[' | ']' | '{' | '}' | '#')
4303 && (ch != ':' || {
4304 let after = &cur_content[pos_in_line + 1..];
4305 let next_c = after.chars().next();
4306 // `:` is a valid continuation char only when NOT followed by
4307 // a separator (space, tab, flow indicator, or end-of-line).
4308 next_c.is_some_and(|nc| {
4309 !matches!(nc, ' ' | '\t' | ',' | '[' | ']' | '{' | '}')
4310 })
4311 });
4312
4313 if is_ns_plain_char_continuation {
4314 let slice = &cur_content[pos_in_line..];
4315 let scanned = scan_plain_line_flow(slice);
4316 if !scanned.is_empty() {
4317 // Extend the most-recently-emitted scalar event with a
4318 // line-fold (space) and the continuation content.
4319 if let Some((
4320 Event::Scalar {
4321 value,
4322 style: ScalarStyle::Plain,
4323 ..
4324 },
4325 _,
4326 )) = events.last_mut()
4327 {
4328 let extended = format!("{value} {scanned}");
4329 *value = Cow::Owned(extended);
4330 }
4331 pos_in_line += scanned.len();
4332 // Update last_token_line to this line so the DK4H
4333 // multi-line implicit-key check remains anchored to the
4334 // last real token (the continuation content).
4335 last_token_line = cur_base_pos.line;
4336 // The continuation may itself end at EOL, leaving the scalar
4337 // still incomplete. Keep `last_was_plain` true and, for
4338 // mappings, revert the phase back to Key so that the `: `
4339 // separator is still recognised.
4340 if let Some(frame) = flow_stack.last_mut() {
4341 match frame {
4342 FlowFrame::Mapping {
4343 phase,
4344 last_was_plain,
4345 ..
4346 } => {
4347 // Undo the premature Key→Value advance: the key is not
4348 // yet complete until `: ` is seen.
4349 *phase = FlowMappingPhase::Key;
4350 *last_was_plain = true;
4351 }
4352 FlowFrame::Sequence { last_was_plain, .. } => {
4353 *last_was_plain = true;
4354 }
4355 }
4356 }
4357 continue 'outer;
4358 }
4359 }
4360 }
4361
4362 // ----------------------------------------------------------------
4363 // Plain scalar in flow context
4364 // ----------------------------------------------------------------
4365 {
4366 // Indicator characters that cannot start a plain scalar in flow.
4367 let is_plain_first = if matches!(
4368 ch,
4369 ',' | '['
4370 | ']'
4371 | '{'
4372 | '}'
4373 | '#'
4374 | '&'
4375 | '*'
4376 | '!'
4377 | '|'
4378 | '>'
4379 | '\''
4380 | '"'
4381 | '%'
4382 | '@'
4383 | '`'
4384 ) {
4385 false
4386 } else if matches!(ch, '?' | ':' | '-') {
4387 // These start a plain scalar only if followed by a safe char.
4388 let after = &cur_content[pos_in_line + ch.len_utf8()..];
4389 let next_c = after.chars().next();
4390 next_c.is_some_and(|nc| !matches!(nc, ' ' | '\t' | ',' | '[' | ']' | '{' | '}'))
4391 } else {
4392 true
4393 };
4394
4395 if is_plain_first {
4396 // Missing-comma check: in a flow collection with has_value=true,
4397 // a new plain scalar is starting without a preceding comma —
4398 // YAML 1.2 §7.4 requires commas between entries.
4399 match flow_stack.last() {
4400 Some(FlowFrame::Mapping {
4401 phase: FlowMappingPhase::Key,
4402 has_value: true,
4403 ..
4404 }) => {
4405 let err_pos = abs_pos(cur_base_pos, cur_content, pos_in_line);
4406 self.failed = true;
4407 return StepResult::Yield(Err(Error {
4408 pos: err_pos,
4409 message: "missing comma between flow mapping entries".into(),
4410 }));
4411 }
4412 Some(FlowFrame::Sequence {
4413 has_value: true,
4414 after_colon: false,
4415 last_was_plain: false,
4416 }) => {
4417 let err_pos = abs_pos(cur_base_pos, cur_content, pos_in_line);
4418 self.failed = true;
4419 return StepResult::Yield(Err(Error {
4420 pos: err_pos,
4421 message: "missing comma between flow sequence entries".into(),
4422 }));
4423 }
4424 _ => {}
4425 }
4426 let slice = &cur_content[pos_in_line..];
4427 let scanned = scan_plain_line_flow(slice);
4428 if !scanned.is_empty() {
4429 let scalar_start = abs_pos(cur_base_pos, cur_content, pos_in_line);
4430 let scalar_end =
4431 abs_pos(cur_base_pos, cur_content, pos_in_line + scanned.len());
4432 let scalar_span = Span {
4433 start: scalar_start,
4434 end: scalar_end,
4435 };
4436
4437 events.push((
4438 Event::Scalar {
4439 value: Cow::Borrowed(scanned),
4440 style: ScalarStyle::Plain,
4441 anchor: pending_flow_anchor.take(),
4442 tag: pending_flow_tag.take(),
4443 },
4444 scalar_span,
4445 ));
4446 pos_in_line += scanned.len();
4447 // Track where this scalar (potential key) ended for the
4448 // multi-line implicit key check (DK4H).
4449 last_token_line = cur_base_pos.line;
4450
4451 // Advance mapping phase; mark frame as having a value.
4452 if let Some(frame) = flow_stack.last_mut() {
4453 match frame {
4454 FlowFrame::Sequence {
4455 has_value,
4456 after_colon,
4457 last_was_plain,
4458 } => {
4459 *has_value = true;
4460 *after_colon = false;
4461 *last_was_plain = true; // plain scalars may continue
4462 }
4463 FlowFrame::Mapping {
4464 phase,
4465 has_value,
4466 last_was_plain,
4467 } => {
4468 *has_value = true;
4469 *last_was_plain = true; // plain scalars may continue on next line
4470 *phase = match *phase {
4471 FlowMappingPhase::Key => FlowMappingPhase::Value,
4472 FlowMappingPhase::Value => FlowMappingPhase::Key,
4473 };
4474 }
4475 }
4476 }
4477 continue 'outer;
4478 }
4479 }
4480
4481 // Reserved indicators — task 19 will handle directives.
4482 // `!` (tags), `&`/`*` (anchors/aliases) are handled above.
4483 // Silently skipping remaining reserved indicators would mangle
4484 // YAML structure, so we error early here.
4485 if matches!(ch, '%' | '@' | '`') {
4486 let err_pos = abs_pos(cur_base_pos, cur_content, pos_in_line);
4487 self.failed = true;
4488 return StepResult::Yield(Err(Error {
4489 pos: err_pos,
4490 message: format!(
4491 "indicator '{ch}' inside flow collection is not yet supported"
4492 ),
4493 }));
4494 }
4495
4496 // Any other character that is not a plain-scalar start and is
4497 // not an indicator handled above (e.g. C0 control characters,
4498 // DEL, C1 controls, surrogates) is invalid here. Error rather
4499 // than panicking — this is user-supplied input.
4500 let err_pos = abs_pos(cur_base_pos, cur_content, pos_in_line);
4501 self.failed = true;
4502 return StepResult::Yield(Err(Error {
4503 pos: err_pos,
4504 message: format!("invalid character {ch:?} inside flow collection"),
4505 }));
4506 }
4507 }
4508
4509 // Tick the parent block mapping phase (if any) after completing a flow
4510 // collection that was a key or value in a block mapping.
4511 self.tick_mapping_phase_after_scalar();
4512
4513 // Push all accumulated events to the queue.
4514 self.queue.extend(events);
4515 StepResult::Continue
4516 }
4517
4518 /// Tick the key/value phase of the innermost open mapping after emitting a
4519 /// scalar event.
4520 ///
4521 /// - If the mapping was in `Key` phase, it flips to `Value`.
4522 /// - If the mapping was in `Value` phase (or there is no open mapping), it
4523 /// flips back to `Key`.
4524 fn tick_mapping_phase_after_scalar(&mut self) {
4525 // A scalar was consumed — clear any pending explicit-key context.
4526 self.explicit_key_pending = false;
4527 // Find the innermost mapping entry on the stack.
4528 for entry in self.coll_stack.iter_mut().rev() {
4529 if let CollectionEntry::Mapping(_, phase, has_had_value) = entry {
4530 *phase = match *phase {
4531 MappingPhase::Key => {
4532 *has_had_value = true;
4533 MappingPhase::Value
4534 }
4535 MappingPhase::Value => MappingPhase::Key,
4536 };
4537 return;
4538 }
4539 // Sequences between this mapping and the top don't count.
4540 if let CollectionEntry::Sequence(_, has_had_item) = entry {
4541 // A scalar here is an item in a sequence, not a mapping value.
4542 // Mark the sequence as having a completed item.
4543 *has_had_item = true;
4544 return;
4545 }
4546 }
4547 }
4548}
4549
4550impl<'input> Iterator for EventIter<'input> {
4551 type Item = Result<(Event<'input>, Span), Error>;
4552
4553 fn next(&mut self) -> Option<Self::Item> {
4554 // After an error, stop immediately — prevent infinite loops on the
4555 // same problematic input (e.g. depth-limit on a prepended synthetic line).
4556 if self.failed {
4557 return None;
4558 }
4559
4560 // Iterative dispatch — avoids unbounded recursion on large bare docs.
4561 loop {
4562 // Drain the event queue first.
4563 if let Some(event) = self.queue.pop_front() {
4564 return Some(Ok(event));
4565 }
4566
4567 let step = match self.state {
4568 IterState::BeforeStream => {
4569 self.state = IterState::BetweenDocs;
4570 return Some(Ok((Event::StreamStart, zero_span(Pos::ORIGIN))));
4571 }
4572 IterState::BetweenDocs => self.step_between_docs(),
4573 IterState::InDocument => self.step_in_document(),
4574 IterState::Done => return None,
4575 };
4576
4577 match step {
4578 StepResult::Continue => {}
4579 StepResult::Yield(result) => return Some(result),
4580 }
4581 }
4582 }
4583}
4584
4585// ---------------------------------------------------------------------------
4586// Unit tests for private helpers (Gap 2: peek/consume divergence guard)
4587// ---------------------------------------------------------------------------
4588
#[cfg(test)]
mod tests {
    use super::{find_value_indicator_offset, is_implicit_mapping_line};

    /// Guard against the "peek" and "consume" helpers drifting apart.
    ///
    /// Any line accepted by `is_implicit_mapping_line` must also yield `Some`
    /// from `find_value_indicator_offset`, and any line it rejects must yield
    /// `None`. Per the original author's note, the `consume_mapping_entry`
    /// call site relies on this agreement via an `unreachable!`, so a
    /// divergence would surface as a runtime panic.
    ///
    /// The accepted table covers a trailing colon, colon-space, colon-tab,
    /// double- and single-quoted keys, keys containing spaces or dashes, and
    /// a multi-byte character before the colon. The rejected table covers
    /// plain text, a URL-style colon, a comment line, and the empty string.
    #[test]
    fn find_value_indicator_agrees_with_is_implicit_mapping_line() {
        // Lines both helpers must treat as implicit mapping entries.
        const ACCEPTED: &[&str] = &[
            "key:",
            "key: value",
            "key:\t",
            "key:   multiple spaces",
            "\"quoted key\": val",
            "'single quoted': val",
            "key with spaces: val",
            "k:",
            "longer-key-with-dashes: v",
            "unicode_\u{00e9}: v",
        ];
        // Lines neither helper may treat as a mapping entry.
        const REJECTED: &[&str] = &[
            "plain scalar",
            "http://example.com",
            "no colon here",
            "# comment: not a key",
            "",
        ];

        for &line in ACCEPTED {
            let peeked = is_implicit_mapping_line(line);
            assert!(
                peeked,
                "expected is_implicit_mapping_line to accept: {line:?}"
            );
            let offset = find_value_indicator_offset(line);
            assert!(
                offset.is_some(),
                "find_value_indicator_offset must return Some for accepted line: {line:?}"
            );
        }

        for &line in REJECTED {
            let peeked = is_implicit_mapping_line(line);
            assert!(
                !peeked,
                "expected is_implicit_mapping_line to reject: {line:?}"
            );
            let offset = find_value_indicator_offset(line);
            assert!(
                offset.is_none(),
                "find_value_indicator_offset must return None for rejected line: {line:?}"
            );
        }
    }
}