granit_parser/
scanner.rs

1//! Home to the YAML Scanner.
2//!
3//! The scanner is the lowest-level parsing utility. It is the lexer / tokenizer, reading input a
4//! character at a time and emitting tokens that can later be interpreted by the [`crate::parser`]
5//! to check for more context and validity.
6//!
//! Due to the grammar of YAML, the scanner has to keep some context of its own, and scanning can
//! itself fail: some invalid inputs are already rejected at this level.
8
9#![allow(clippy::cast_possible_wrap)]
10#![allow(clippy::cast_sign_loss)]
11
12use alloc::{
13    borrow::{Cow, ToOwned},
14    collections::VecDeque,
15    string::String,
16    vec::Vec,
17};
18use core::{char, fmt};
19
20use crate::{
21    char_traits::{
22        as_hex, is_anchor_char, is_blank_or_breakz, is_bom, is_break, is_breakz, is_flow, is_hex,
23        is_tag_char, is_uri_char,
24    },
25    input::{BorrowedInput, SkipTabs},
26};
27
/// Maximum number of characters the scanner may look ahead while disambiguating a simple key.
///
/// NOTE(review): the code enforcing this bound is outside this part of the file; confirm at the
/// use site what happens once the limit is exceeded.
const SIMPLE_KEY_MAX_LOOKAHEAD: usize = 1024;
30
/// The character encoding of the input.
///
/// UTF-8 is the only encoding supported at the moment, hence the single variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TEncoding {
    /// The input is UTF-8 encoded.
    Utf8,
}
37
/// How a YAML scalar was written in the source.
///
/// The derived ordering follows the declaration order of the variants.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum ScalarStyle {
    /// A plain (unquoted) scalar.
    Plain,
    /// A scalar enclosed in single quotes.
    SingleQuoted,
    /// A scalar enclosed in double quotes.
    DoubleQuoted,

    /// A literal block scalar (introduced by `|`).
    ///
    /// See [8.1.2](https://yaml.org/spec/1.2.2/#812-literal-style).
    /// Every indented character of a literal block is content, white space included.
    /// Characters cannot be escaped and long lines cannot be broken.
    Literal,
    /// A folded block scalar (introduced by `>`).
    ///
    /// See [8.1.3](https://yaml.org/spec/1.2.2/#813-folded-style).
    /// Every indented character of a folded block is content, white space included.
    /// Characters cannot be escaped. The content is subject to line folding, which makes it
    /// possible to break long lines.
    Folded,
}
62
/// The offsets stored inside a [`Marker`].
///
/// A YAML input is either a complete `&str` (stable backing storage) or a streaming character
/// source. With a stable input, both a character index and a byte offset can be tracked. With a
/// streaming input, a byte offset is generally not useful (it may not map to any meaningful
/// underlying file/source), which is why it is optional.
#[derive(Debug, Default, Clone, Copy)]
pub struct MarkerOffsets {
    /// Index into the source, counted in characters.
    chars: usize,
    /// Offset into the source, counted in bytes, when the input can provide one.
    bytes: Option<usize>,
}

impl PartialEq for MarkerOffsets {
    /// Compare two offsets by character index only.
    fn eq(&self, rhs: &Self) -> bool {
        // The byte offset is an optional diagnostic extra that can legitimately differ between
        // input backends (e.g., `&str` vs streaming), so it is deliberately left out of the
        // comparison.
        let (lhs_index, rhs_index) = (self.chars, rhs.chars);
        lhs_index == rhs_index
    }
}

impl Eq for MarkerOffsets {}
87
88/// A location in a YAML document.
89#[derive(Clone, Copy, PartialEq, Debug, Eq, Default)]
90pub struct Marker {
91    /// Offsets in the source.
92    offsets: MarkerOffsets,
93    /// The line (1-indexed).
94    line: usize,
95    /// The column (0-indexed).
96    col: usize,
97}
98
99impl Marker {
100    /// Create a new [`Marker`] at the given position.
101    #[must_use]
102    pub fn new(index: usize, line: usize, col: usize) -> Marker {
103        Marker {
104            offsets: MarkerOffsets {
105                chars: index,
106                bytes: None,
107            },
108            line,
109            col,
110        }
111    }
112
113    /// Return a copy of the marker with the given optional byte offset.
114    #[must_use]
115    pub fn with_byte_offset(mut self, byte_offset: Option<usize>) -> Marker {
116        self.offsets.bytes = byte_offset;
117        self
118    }
119
120    /// Return the index (in characters) of the marker in the source.
121    #[must_use]
122    pub fn index(&self) -> usize {
123        self.offsets.chars
124    }
125
126    /// Return the byte offset of the marker in the source, if available.
127    #[must_use]
128    pub fn byte_offset(&self) -> Option<usize> {
129        self.offsets.bytes
130    }
131
132    /// Return the line of the marker in the source.
133    #[must_use]
134    pub fn line(&self) -> usize {
135        self.line
136    }
137
138    /// Return the column of the marker in the source.
139    #[must_use]
140    pub fn col(&self) -> usize {
141        self.col
142    }
143}
144
145/// A range of locations in a YAML document.
146#[derive(Clone, Copy, PartialEq, Debug, Eq, Default)]
147pub struct Span {
148    /// The start (inclusive) of the range.
149    pub start: Marker,
150    /// The end (exclusive) of the range.
151    pub end: Marker,
152
153    /// Optional indentation hint associated with this span.
154    ///
155    /// This is only meaningful for certain parser-emitted events (notably: block mapping keys).
156    /// When indentation is not meaningful or cannot be provided, it must be `None`.
157    pub indent: Option<usize>,
158
159    /// Optional source marker for the explicit tag token attached to this node.
160    ///
161    /// This is only meaningful for parser-emitted node events that carry a resolved tag, such as
162    /// [`Event::Scalar`](crate::Event::Scalar),
163    /// [`Event::SequenceStart`](crate::Event::SequenceStart), or
164    /// [`Event::MappingStart`](crate::Event::MappingStart). The normal [`Span::start`] and
165    /// [`Span::end`] continue to cover the node value or collection; `tag_start` points to the
166    /// tag token when that token appears at a different source location.
167    pub tag_start: Option<Marker>,
168}
169
170impl Span {
171    /// Create a new [`Span`] for the given range.
172    #[must_use]
173    pub fn new(start: Marker, end: Marker) -> Span {
174        Span {
175            start,
176            end,
177            indent: None,
178            tag_start: None,
179        }
180    }
181
182    /// Create an empty [`Span`] at a given location.
183    ///
184    /// An empty span doesn't contain any characters, but its position may still be meaningful.
185    /// For example, for an indented sequence [`SequenceEnd`] has a location but an empty span.
186    ///
187    /// [`SequenceEnd`]: crate::Event::SequenceEnd
188    #[must_use]
189    pub fn empty(mark: Marker) -> Span {
190        Span {
191            start: mark,
192            end: mark,
193            indent: None,
194            tag_start: None,
195        }
196    }
197
198    /// Return a copy of this [`Span`] with the given indentation hint.
199    #[must_use]
200    pub fn with_indent(mut self, indent: Option<usize>) -> Span {
201        self.indent = indent;
202        self
203    }
204
205    /// Return a copy of this [`Span`] with the given explicit tag-token start marker.
206    #[must_use]
207    pub fn with_tag_start(mut self, tag_start: Option<Marker>) -> Span {
208        self.tag_start = tag_start;
209        self
210    }
211
212    /// Return the source marker of the explicit tag token attached to this node, if any.
213    ///
214    /// The regular span still covers the node value or collection. This accessor is useful for
215    /// diagnostics that should point at the tag itself, especially when a tagged block collection
216    /// begins on a later line than the tag token.
217    #[must_use]
218    pub fn tag_start(&self) -> Option<Marker> {
219        self.tag_start
220    }
221
222    /// Return the length of the span (in characters).
223    #[must_use]
224    pub fn len(&self) -> usize {
225        self.end.index() - self.start.index()
226    }
227
228    /// Return whether the [`Span`] has a length of zero.
229    #[must_use]
230    pub fn is_empty(&self) -> bool {
231        self.len() == 0
232    }
233
234    /// Return the byte range of the span, if available.
235    #[must_use]
236    pub fn byte_range(&self) -> Option<core::ops::Range<usize>> {
237        let start = self.start.byte_offset()?;
238        let end = self.end.byte_offset()?;
239        Some(start..end)
240    }
241
242    /// Return the source text covered by this span, if byte offsets are available
243    /// and the range is valid for the provided input.
244    #[must_use]
245    pub fn slice<'source>(&self, source: &'source str) -> Option<&'source str> {
246        source.get(self.byte_range()?)
247    }
248}
249
/// Where a YAML source comment sits relative to the surrounding content.
///
/// These are the placements the parser currently recognizes:
///
/// ```yaml
/// # Above
/// key: value # Right
///
/// # Free
///
/// next: value
///
/// # Last
/// ```
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum Placement {
    /// A comment on its own line, directly before another YAML token.
    ///
    /// Such a comment usually describes the node that follows it. A run of own-line comments
    /// with no blank line between them is `Above` as a whole, so that a comment block can be
    /// attached to the next YAML element as a group.
    Above,
    /// A comment on the same line as, and after, YAML content or syntax. Examples include
    /// `key: value # Right` and `- # Right` for an empty sequence entry.
    Right,
    /// A comment on its own line that is detached from the YAML tokens around it.
    ///
    /// This is the fallback: it is used when a comment is not a same-line comment, is not
    /// immediately above a following token, and is not the final comment in the stream.
    /// Consumers should treat `Free` as having no obvious neighboring node.
    #[default]
    Free,
    /// A comment on its own line at the end of the input stream.
    ///
    /// Blank lines may follow a `Last` comment, but no further YAML token appears before
    /// `StreamEnd`.
    Last,
}
288
289/// A YAML comment captured from the source.
290///
291/// Comments are presentation metadata, not YAML data. This type carries the raw comment payload,
292/// source span, and a best-effort [`Placement`] hint for callers that want to correlate comments
293/// with nearby YAML presentation.
294#[derive(Clone, PartialEq, Debug, Eq)]
295pub struct Comment<'input> {
296    /// Span covering the whole source comment, including `#` and excluding the line break.
297    pub span: Span,
298    /// Raw comment payload exactly after `#`, excluding only the line break.
299    ///
300    /// Leading spaces are preserved, including a single space immediately after `#` when present.
301    pub text: Cow<'input, str>,
302    /// Best-effort placement of this comment relative to nearby YAML content.
303    pub placement: Placement,
304}
305
306impl<'input> Comment<'input> {
307    /// Create a captured YAML comment from a source span and raw payload.
308    ///
309    /// The placement defaults to [`Placement::Free`]. Use [`Comment::with_placement`] when the
310    /// caller already knows a more specific placement.
311    #[must_use]
312    pub fn new(span: Span, text: impl Into<Cow<'input, str>>) -> Self {
313        Self {
314            span,
315            text: text.into(),
316            placement: Placement::Free,
317        }
318    }
319
320    /// Return this comment with the given placement.
321    #[must_use]
322    pub fn with_placement(mut self, placement: Placement) -> Self {
323        self.placement = placement;
324        self
325    }
326
327    /// Return the comment payload with surrounding whitespace removed.
328    ///
329    /// This helper is ergonomic only. The raw [`Self::text`] payload remains unchanged.
330    #[must_use]
331    pub fn trimmed_text(&self) -> &str {
332        self.text.trim()
333    }
334}
335
336impl AsRef<str> for Comment<'_> {
337    fn as_ref(&self) -> &str {
338        self.text.as_ref()
339    }
340}
341
342/// An error that occurred while scanning.
343#[derive(Clone, PartialEq, Debug, Eq)]
344pub struct ScanError {
345    /// The position at which the error happened in the source.
346    mark: Marker,
347    /// Human-readable details about the error.
348    info: String,
349}
350
351impl ScanError {
352    /// Create a new error from a location and an error string.
353    #[must_use]
354    #[cold]
355    pub fn new(loc: Marker, info: String) -> ScanError {
356        ScanError { mark: loc, info }
357    }
358
359    /// Convenience alias for string slices.
360    #[must_use]
361    #[cold]
362    pub fn new_str(loc: Marker, info: &str) -> ScanError {
363        ScanError {
364            mark: loc,
365            info: info.to_owned(),
366        }
367    }
368
369    #[cold]
370    pub(crate) fn into_result<T>(self) -> Result<T, ScanError> {
371        Err(self)
372    }
373
374    /// Return the marker pointing to the error in the source.
375    #[must_use]
376    pub fn marker(&self) -> &Marker {
377        &self.mark
378    }
379
380    /// Return the information string describing the error that happened.
381    #[must_use]
382    pub fn info(&self) -> &str {
383        self.info.as_ref()
384    }
385}
386
387impl fmt::Display for ScanError {
388    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
389        write!(
390            f,
391            "{} at char {} line {} column {}",
392            self.info,
393            self.mark.index(),
394            self.mark.line(),
395            self.mark.col() + 1
396        )
397    }
398}
399
400impl core::error::Error for ScanError {}
401
402/// The contents of a scanner token.
403#[derive(Clone, PartialEq, Debug, Eq)]
404pub enum TokenType<'input> {
405    /// The start of the stream. Sent first, before even [`TokenType::DocumentStart`].
406    StreamStart(TEncoding),
407    /// The end of the stream, EOF.
408    StreamEnd,
409    /// A YAML version directive.
410    VersionDirective(
411        /// Major version number.
412        u32,
413        /// Minor version number.
414        u32,
415    ),
416    /// A YAML tag directive (e.g.: `!!str`, `!foo!bar`, ...).
417    TagDirective(
418        /// Tag directive handle, such as `!` or `!app!`.
419        Cow<'input, str>,
420        /// Tag URI prefix associated with the handle.
421        Cow<'input, str>,
422    ),
423    /// The start of a YAML document (`---`).
424    DocumentStart,
425    /// The end of a YAML document (`...`).
426    DocumentEnd,
427    /// The start of a sequence block.
428    ///
429    /// Sequence blocks are arrays starting with a `-`.
430    BlockSequenceStart,
431    /// The start of a block mapping.
432    ///
433    /// Block mappings are key-value collections written with `key: value` entries.
434    BlockMappingStart,
435    /// End of the corresponding `BlockSequenceStart` or `BlockMappingStart`.
436    BlockEnd,
437    /// Start of an inline sequence (`[ a, b ]`).
438    FlowSequenceStart,
439    /// End of an inline sequence.
440    FlowSequenceEnd,
441    /// Start of an inline mapping (`{ a: b, c: d }`).
442    FlowMappingStart,
443    /// End of an inline mapping.
444    FlowMappingEnd,
445    /// An entry in a block sequence (see [`TokenType::BlockSequenceStart`]).
446    BlockEntry,
447    /// An entry in a flow sequence (see [`TokenType::FlowSequenceStart`]).
448    FlowEntry,
449    /// A key in a mapping.
450    Key,
451    /// A value in a mapping.
452    Value,
453    /// A reference to a previously defined anchor.
454    Alias(Cow<'input, str>),
455    /// A YAML anchor definition introduced by `&`.
456    Anchor(Cow<'input, str>),
457    /// A YAML tag (starting with bangs `!`).
458    Tag(
459        /// The handle of the tag.
460        Cow<'input, str>,
461        /// The suffix of the tag.
462        Cow<'input, str>,
463    ),
464    /// A regular YAML scalar.
465    Scalar(ScalarStyle, Cow<'input, str>),
466    /// A YAML source comment.
467    ///
468    /// The token payload carries the raw text exactly after `#`, the source span, and an initial
469    /// [`Placement`] hint. The token's companion [`Span`] is the same as [`Comment::span`].
470    Comment(
471        /// Captured comment metadata.
472        Comment<'input>,
473    ),
474    /// A reserved YAML directive.
475    ReservedDirective(
476        /// Directive name.
477        String,
478        /// Directive parameters, split on YAML whitespace.
479        Vec<String>,
480    ),
481}
482
483/// A scanner token.
484#[derive(Clone, PartialEq, Debug, Eq)]
485pub struct Token<'input>(
486    /// Source span covered by this token.
487    pub Span,
488    /// Token payload emitted by the scanner.
489    pub TokenType<'input>,
490);
491
492/// Compact comment metadata used only inside the scanner queue.
493///
494/// The queued token already stores the source span, so storing a full public [`Comment`] there
495/// duplicates a large [`Span`] and inflates every queued token.
496#[derive(Clone, PartialEq, Debug, Eq)]
497pub(crate) struct QueuedComment<'input> {
498    pub(crate) text: Cow<'input, str>,
499    pub(crate) placement: Placement,
500}
501
502impl<'input> QueuedComment<'input> {
503    fn into_public(self, span: Span) -> Comment<'input> {
504        Comment::new(span, self.text).with_placement(self.placement)
505    }
506}
507
508impl<'input> From<Comment<'input>> for QueuedComment<'input> {
509    fn from(comment: Comment<'input>) -> Self {
510        Self {
511            text: comment.text,
512            placement: comment.placement,
513        }
514    }
515}
516
517/// Token payload used in the scanner's internal queue.
518///
519/// This mirrors [`TokenType`] but stores comments without their span. Public [`Token`] values are
520/// reconstructed when the scanner emits them.
521#[derive(Clone, PartialEq, Debug, Eq)]
522pub(crate) enum QueuedTokenType<'input> {
523    StreamStart(TEncoding),
524    StreamEnd,
525    VersionDirective(u32, u32),
526    TagDirective(Cow<'input, str>, Cow<'input, str>),
527    DocumentStart,
528    DocumentEnd,
529    BlockSequenceStart,
530    BlockMappingStart,
531    BlockEnd,
532    FlowSequenceStart,
533    FlowSequenceEnd,
534    FlowMappingStart,
535    FlowMappingEnd,
536    BlockEntry,
537    FlowEntry,
538    Key,
539    Value,
540    Alias(Cow<'input, str>),
541    Anchor(Cow<'input, str>),
542    Tag(Cow<'input, str>, Cow<'input, str>),
543    Scalar(ScalarStyle, Cow<'input, str>),
544    Comment(QueuedComment<'input>),
545    ReservedDirective(String, Vec<String>),
546}
547
548impl<'input> QueuedTokenType<'input> {
549    fn into_public(self, span: Span) -> TokenType<'input> {
550        match self {
551            Self::StreamStart(encoding) => TokenType::StreamStart(encoding),
552            Self::StreamEnd => TokenType::StreamEnd,
553            Self::VersionDirective(major, minor) => TokenType::VersionDirective(major, minor),
554            Self::TagDirective(handle, prefix) => TokenType::TagDirective(handle, prefix),
555            Self::DocumentStart => TokenType::DocumentStart,
556            Self::DocumentEnd => TokenType::DocumentEnd,
557            Self::BlockSequenceStart => TokenType::BlockSequenceStart,
558            Self::BlockMappingStart => TokenType::BlockMappingStart,
559            Self::BlockEnd => TokenType::BlockEnd,
560            Self::FlowSequenceStart => TokenType::FlowSequenceStart,
561            Self::FlowSequenceEnd => TokenType::FlowSequenceEnd,
562            Self::FlowMappingStart => TokenType::FlowMappingStart,
563            Self::FlowMappingEnd => TokenType::FlowMappingEnd,
564            Self::BlockEntry => TokenType::BlockEntry,
565            Self::FlowEntry => TokenType::FlowEntry,
566            Self::Key => TokenType::Key,
567            Self::Value => TokenType::Value,
568            Self::Alias(name) => TokenType::Alias(name),
569            Self::Anchor(name) => TokenType::Anchor(name),
570            Self::Tag(handle, suffix) => TokenType::Tag(handle, suffix),
571            Self::Scalar(style, value) => TokenType::Scalar(style, value),
572            Self::Comment(comment) => TokenType::Comment(comment.into_public(span)),
573            Self::ReservedDirective(name, params) => TokenType::ReservedDirective(name, params),
574        }
575    }
576}
577
578impl<'input> From<TokenType<'input>> for QueuedTokenType<'input> {
579    fn from(token: TokenType<'input>) -> Self {
580        match token {
581            TokenType::StreamStart(encoding) => Self::StreamStart(encoding),
582            TokenType::StreamEnd => Self::StreamEnd,
583            TokenType::VersionDirective(major, minor) => Self::VersionDirective(major, minor),
584            TokenType::TagDirective(handle, prefix) => Self::TagDirective(handle, prefix),
585            TokenType::DocumentStart => Self::DocumentStart,
586            TokenType::DocumentEnd => Self::DocumentEnd,
587            TokenType::BlockSequenceStart => Self::BlockSequenceStart,
588            TokenType::BlockMappingStart => Self::BlockMappingStart,
589            TokenType::BlockEnd => Self::BlockEnd,
590            TokenType::FlowSequenceStart => Self::FlowSequenceStart,
591            TokenType::FlowSequenceEnd => Self::FlowSequenceEnd,
592            TokenType::FlowMappingStart => Self::FlowMappingStart,
593            TokenType::FlowMappingEnd => Self::FlowMappingEnd,
594            TokenType::BlockEntry => Self::BlockEntry,
595            TokenType::FlowEntry => Self::FlowEntry,
596            TokenType::Key => Self::Key,
597            TokenType::Value => Self::Value,
598            TokenType::Alias(name) => Self::Alias(name),
599            TokenType::Anchor(name) => Self::Anchor(name),
600            TokenType::Tag(handle, suffix) => Self::Tag(handle, suffix),
601            TokenType::Scalar(style, value) => Self::Scalar(style, value),
602            TokenType::Comment(comment) => Self::Comment(comment.into()),
603            TokenType::ReservedDirective(name, params) => Self::ReservedDirective(name, params),
604        }
605    }
606}
607
608/// A compact token stored by the scanner before it is emitted publicly.
609#[derive(Clone, PartialEq, Debug, Eq)]
610pub(crate) struct QueuedToken<'input>(pub(crate) Span, pub(crate) QueuedTokenType<'input>);
611
612impl<'input> QueuedToken<'input> {
613    fn into_public(self) -> Token<'input> {
614        Token(self.0, self.1.into_public(self.0))
615    }
616}
617
618impl<'input> From<Token<'input>> for QueuedToken<'input> {
619    fn from(token: Token<'input>) -> Self {
620        Self(token.0, token.1.into())
621    }
622}
623
624/// A scalar that was parsed and may correspond to a simple key.
625///
626/// Upon scanning the following YAML:
627/// ```yaml
628/// a: b
629/// ```
630/// We do not know that `a` is a key for a map until we have reached the following `:`. For this
631/// YAML, we would store `a` as a scalar token in the [`Scanner`], but not emit it yet. It would be
632/// kept inside the scanner until more context is fetched and we are able to know whether it is a
633/// plain scalar or a key.
634///
635/// For example, see the following two YAML documents:
636/// ```yaml
637/// ---
638/// a: b # Here, `a` is a key.
639/// ...
640/// ---
641/// a # Here, `a` is a plain scalar.
642/// ...
643/// ```
644/// An instance of [`SimpleKey`] is created in the [`Scanner`] when such ambiguity occurs.
645///
646/// In both documents, scanning `a` would lead to the creation of a [`SimpleKey`] with
647/// [`Self::possible`] set to `true`. The token for `a` would be pushed in the [`Scanner`] but not
648/// yet emitted. Instead, more context would be fetched (through [`Scanner::fetch_more_tokens`]).
649///
650/// In the first document, upon reaching the `:`, the [`SimpleKey`] would be inspected and our
651/// scalar `a` since it is a possible key, would be "turned" into a key. This is done by prepending
652/// a [`TokenType::Key`] to our scalar token in the [`Scanner`]. This way, the
653/// [`crate::parser::Parser`] would read the [`TokenType::Key`] token before the
654/// [`TokenType::Scalar`] token.
655///
656/// In the second document however, reaching EOF would mark the [`SimpleKey`] as no longer possible,
657/// and no [`TokenType::Key`] would be emitted by the scanner.
658#[derive(Clone, PartialEq, Debug, Eq)]
659struct SimpleKey {
660    /// Whether the token this [`SimpleKey`] refers to may still be a key.
661    ///
662    /// Sometimes, when we have more context, we notice that what we thought could be a key no
663    /// longer can be. In that case, [`Self::possible`] is set to `false`.
664    ///
665    /// For instance, let us consider the following invalid YAML:
666    /// ```yaml
667    /// key
668    ///   : value
669    /// ```
670    /// Upon reading the `\n` after `key`, the [`SimpleKey`] that was created for `key` is no longer
671    /// possible and [`Self::possible`] is set to `false`.
672    possible: bool,
673    /// Whether the token this [`SimpleKey`] refers to is required to be a key.
674    ///
675    /// With more context, we may know for sure that the token must be a key. If later input makes
676    /// that impossible, the scanner must report an error instead of silently treating the token as a
677    /// plain scalar.
678    ///
679    /// This happens for simple keys at the current block indentation where the surrounding
680    /// collection requires the next token to be a mapping key.
681    required: bool,
682    /// The index of the token referred to by the [`SimpleKey`].
683    ///
684    /// This is the index in the scanner, which takes into account both the tokens that have been
685    /// emitted and those about to be emitted. See [`Scanner::tokens_parsed`] and
686    /// [`Scanner::tokens`] for more details.
687    token_number: usize,
688    /// The position at which the token the [`SimpleKey`] refers to is.
689    mark: Marker,
690}
691
692impl SimpleKey {
693    /// Create a new [`SimpleKey`] at the given `Marker` and with the given flow level.
694    fn new(mark: Marker) -> SimpleKey {
695        SimpleKey {
696            possible: false,
697            required: false,
698            token_number: 0,
699            mark,
700        }
701    }
702}
703
/// One entry of the indentation stack.
#[derive(Debug, Default, Clone)]
struct Indent {
    /// The former indentation level.
    indent: isize,
    /// Whether closing this indentation level generates a `BlockEnd` token.
    ///
    /// Not every indentation level starts a block. For example:
    /// ```yaml
    /// -
    ///   foo # ok
    /// -
    /// bar # ko, bar needs to be indented further than the `-`.
    /// - [
    ///  baz, # ok
    /// quux # ko, quux needs to be indented further than the '-'.
    /// ] # ko, the closing bracket needs to be indented further than the `-`.
    /// ```
    ///
    /// The `-` creates an indentation level that belongs to a single entry of the sequence. If a
    /// `BlockEnd` were emitted each time such a level ends, the sequence would get one
    /// `BlockEnd` per entry, whereas exactly one is needed to end it.
    needs_block_end: bool,
}
728
/// What is known so far about an implicit mapping.
///
/// An implicit mapping is a mapping inside a flow sequence whose opening `{` has been left out:
/// ```yaml
/// [ a: b, c: d ]
/// # Equivalent to
/// [ { a: b }, { c: d } ]
/// # Equivalent to
/// - a: b
/// - c: d
/// ```
///
/// This state has to be tracked carefully for every nested flow sequence. In the example above,
/// a [`FlowMappingStart`] event must be emitted upon encountering `a` and `c`, although no
/// character announces it. Likewise, a [`FlowMappingEnd`] event must be emitted upon reaching
/// the `,` or the `]`. Without proper tracking, these events may be left out or emitted
/// out-of-order.
///
/// [`FlowMappingStart`]: TokenType::FlowMappingStart
/// [`FlowMappingEnd`]: TokenType::FlowMappingEnd
#[derive(PartialEq, Debug)]
enum ImplicitMappingState {
    /// An implicit mapping may follow.
    ///
    /// This is the state right after the opening `[` has been encountered. More context is
    /// needed to tell whether an implicit mapping follows.
    Possible,
    /// An implicit mapping is being scanned.
    ///
    /// This state is not entered immediately: the `:` must have been encountered first.
    ///
    /// NOTE(review): the meaning of the `u8` payload is not documented in this part of the
    /// file; confirm against the code that constructs this variant.
    Inside(u8),
}
762
763/// The YAML scanner.
764///
765/// This corresponds to the low-level interface when reading YAML. The scanner emits tokens as they
766/// are read (akin to a lexer), but it also holds sufficient context to be able to disambiguate
767/// some of the constructs. It has understanding of indentation and whitespace and is able to
768/// generate error messages for some invalid YAML constructs.
769///
770/// It is however not a full parser and needs [`crate::parser::Parser`] to fully detect invalid
771/// YAML documents.
772#[derive(Debug)]
773#[allow(clippy::struct_excessive_bools)]
774pub struct Scanner<'input, T> {
775    /// The input source.
776    ///
777    /// This must implement [`Input`].
778    input: T,
779    /// The position of the cursor within the reader.
780    mark: Marker,
781    /// Buffer for tokens to be returned.
782    ///
783    /// This buffer can hold some temporary tokens that are not yet ready to be returned. For
784    /// instance, if we just read a scalar, it can be a value or a key if an implicit mapping
785    /// follows. In this case, the token stays in the `VecDeque` but cannot be returned from
786    /// [`Self::next`] until we have more context.
787    tokens: VecDeque<QueuedToken<'input>>,
788    /// The last error that happened.
789    error: Option<ScanError>,
790    /// Error found after one or more already-scanned comment tokens.
791    deferred_error: Option<ScanError>,
792    /// Whether the input may contain `#` comment indicators.
793    comments_possible: bool,
794
795    /// Whether we have already emitted the `StreamStart` token.
796    stream_start_produced: bool,
797    /// Whether we have already emitted the `StreamEnd` token.
798    stream_end_produced: bool,
799    /// Whether the scanner is still in the prefix of the next document.
800    ///
801    /// A BOM may appear in a document prefix, before directives/comments/content. Once a document
802    /// start marker or any content token is scanned, another BOM is document content and must be
803    /// rejected unless it appears inside a quoted scalar.
804    document_prefix_allowed: bool,
805    /// In some flow contexts, the value of a mapping is allowed to be adjacent to the `:`. When it
806    /// is, the index at which the `:` may be must be stored in `adjacent_value_allowed_at`.
807    adjacent_value_allowed_at: usize,
808    /// Whether a simple key could potentially start at the current position.
809    ///
810    /// Simple keys are the opposite of complex keys which are keys starting with `?`.
811    simple_key_allowed: bool,
812    /// A stack of potential simple keys.
813    ///
814    /// Refer to the documentation of [`SimpleKey`] for a more in-depth explanation of what they
815    /// are.
816    simple_keys: smallvec::SmallVec<[SimpleKey; 8]>,
817    /// The current indentation level.
818    indent: isize,
819    /// List of all block indentation levels we are in (except the current one).
820    indents: smallvec::SmallVec<[Indent; 8]>,
821    /// Level of nesting of flow sequences.
822    flow_level: u8,
823    /// The number of tokens that have been returned from the scanner.
824    ///
825    /// This excludes the tokens from [`Self::tokens`].
826    tokens_parsed: usize,
827    /// Whether a token is ready to be taken from [`Self::tokens`].
828    token_available: bool,
829    /// Whether all characters encountered since the last newline were whitespace.
830    leading_whitespace: bool,
831    /// Whether we started a flow mapping at each flow nesting level.
832    ///
833    /// This is used to detect implicit flow mapping starts such as:
834    /// ```yaml
835    /// [ : foo ] # { null: "foo" }
836    /// ```
837    flow_mapping_started: smallvec::SmallVec<[bool; 8]>,
838    /// An array of states, representing whether flow sequences have implicit mappings.
839    ///
840    /// When a flow mapping is possible (when encountering the first `[` or a `,` in a sequence),
841    /// the state is set to [`Possible`].
842    /// When we encounter the `:`, we know we are in an implicit mapping and can set the state to
843    /// [`Inside`].
844    ///
845    /// There is one entry in this [`Vec`] for each nested flow sequence that we are in.
846    /// The entries are created with the opening `[` and popped with the closing `]`.
847    ///
848    /// [`Possible`]: ImplicitMappingState::Possible
849    /// [`Inside`]: ImplicitMappingState::Inside
850    implicit_flow_mapping_states: smallvec::SmallVec<[ImplicitMappingState; 8]>,
851    /// If a plain scalar was terminated by a `#` comment on its line, we set this
852    /// to detect an illegal multiline continuation on the following line.
853    interrupted_plain_by_comment: Option<Marker>,
854    /// Whether the scanner is still validating whitespace after an explicit `?` key indicator.
855    ///
856    /// This stays set across streamed comment tokens so a tab after the comment run is rejected the
857    /// same way it was when that whitespace was scanned in one pass.
858    explicit_key_tab_check_pending: bool,
859    /// A stack of markers for opening brackets `[` and `{`.
860    flow_markers: smallvec::SmallVec<[(Marker, char); 8]>,
861    buf_leading_break: String,
862    buf_trailing_breaks: String,
863    buf_whitespaces: String,
864}
865
866impl<'input, T: BorrowedInput<'input>> Iterator for Scanner<'input, T> {
867    type Item = Token<'input>;
868
869    fn next(&mut self) -> Option<Self::Item> {
870        if self.error.is_some() {
871            return None;
872        }
873        match self.next_token() {
874            Ok(Some(tok)) => {
875                debug_print!(
876                    "    \x1B[;32m\u{21B3} {:?} \x1B[;36m{:?}\x1B[;m",
877                    tok.1,
878                    tok.0
879                );
880                Some(tok)
881            }
882            Ok(tok) => tok,
883            Err(e) => self.stop_after_error(e),
884        }
885    }
886}
887
/// A convenience alias for scanner functions that may fail without returning a value.
///
/// Expands to `Result<(), ScanError>`: success carries no payload and failure carries the
/// `ScanError` describing what went wrong.
pub type ScanResult = Result<(), ScanError>;
890
/// Accumulator for the content of a flow scalar.
///
/// The buffer is either a set of byte offsets into the input (so the scalar can later be
/// borrowed) or an owned `String`.
#[derive(Debug)]
enum FlowScalarBuf {
    /// Candidate for `Cow::Borrowed`.
    ///
    /// `start..end` is the committed verbatim range.
    /// `pending_ws_start..pending_ws_end` is a run of blanks that were seen but not yet
    /// committed (they must be dropped if followed by a line break).
    Borrowed {
        start: usize,
        end: usize,
        pending_ws_start: Option<usize>,
        pending_ws_end: usize,
    },
    /// Content that has been copied into an owned string.
    Owned(String),
}

impl FlowScalarBuf {
    /// Create an empty borrowed buffer whose committed range is `start..start`.
    #[inline]
    fn new_borrowed(start: usize) -> Self {
        let offset = start;
        Self::Borrowed {
            start: offset,
            end: offset,
            pending_ws_start: None,
            pending_ws_end: offset,
        }
    }

    /// Create an empty owned buffer.
    #[inline]
    fn new_owned() -> Self {
        Self::Owned(String::new())
    }

    /// Return the owned string for mutation, or `None` while the buffer is still borrowed.
    #[inline]
    fn as_owned_mut(&mut self) -> Option<&mut String> {
        if let Self::Owned(text) = self {
            Some(text)
        } else {
            None
        }
    }

    /// Extend the committed range over the pending blank run, if there is one.
    ///
    /// Does nothing for owned buffers or when no blanks are pending.
    #[inline]
    fn commit_pending_ws(&mut self) {
        let Self::Borrowed {
            end,
            pending_ws_start,
            pending_ws_end,
            ..
        } = self
        else {
            return;
        };

        // `take` clears the pending marker and tells us whether there was one.
        if pending_ws_start.take().is_some() {
            *end = *pending_ws_end;
        }
    }

    /// Record a run of blanks `ws_start..ws_end` that has been seen but not committed.
    ///
    /// The start of the first uncommitted run is kept; the end always moves to `ws_end`.
    /// Does nothing for owned buffers.
    #[inline]
    fn note_pending_ws(&mut self, ws_start: usize, ws_end: usize) {
        let Self::Borrowed {
            pending_ws_start,
            pending_ws_end,
            ..
        } = self
        else {
            return;
        };

        *pending_ws_start = pending_ws_start.or(Some(ws_start));
        *pending_ws_end = ws_end;
    }

    /// Forget any pending blank run, resetting the pending end to the committed end.
    ///
    /// Does nothing for owned buffers.
    #[inline]
    fn discard_pending_ws(&mut self) {
        if let Self::Borrowed {
            end,
            pending_ws_start,
            pending_ws_end,
            ..
        } = self
        {
            *pending_ws_end = *end;
            *pending_ws_start = None;
        }
    }
}
976
977impl<'input, T: BorrowedInput<'input>> Scanner<'input, T> {
978    #[inline]
979    fn promote_flow_scalar_buf_to_owned(
980        &self,
981        start_mark: &Marker,
982        buf: &mut FlowScalarBuf,
983    ) -> Result<(), ScanError> {
984        let FlowScalarBuf::Borrowed {
985            start,
986            end,
987            pending_ws_start: _,
988            pending_ws_end: _,
989        } = *buf
990        else {
991            return Ok(());
992        };
993
994        let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
995            ScanError::new_str(
996                *start_mark,
997                "internal error: input advertised offsets but did not provide a slice",
998            )
999        })?;
1000        *buf = FlowScalarBuf::Owned(slice.to_owned());
1001        Ok(())
1002    }
    /// Try to borrow a slice from the underlying input.
    ///
    /// This method uses the [`BorrowedInput`] trait to safely obtain a slice with the `'input`
    /// lifetime. For inputs that support zero-copy slicing (like `StrInput`), this returns
    /// `Some(&'input str)`. For streaming inputs, this returns `None`.
    ///
    /// `start` and `end` are forwarded unchanged to `slice_borrowed`; callers in this file pass
    /// values obtained from `byte_offset()`.
    #[inline]
    fn try_borrow_slice(&self, start: usize, end: usize) -> Option<&'input str> {
        self.input.slice_borrowed(start, end)
    }
1012
1013    /// Scan a tag handle for a `%TAG` directive as a `Cow<str>`.
1014    ///
1015    /// For `StrInput`, this will borrow from the input when possible. For other inputs, or if
1016    /// borrowing is not possible, it falls back to allocating.
1017    fn scan_tag_handle_directive_cow(
1018        &mut self,
1019        mark: &Marker,
1020    ) -> Result<Cow<'input, str>, ScanError> {
1021        let Some(start) = self.input.byte_offset() else {
1022            return Ok(Cow::Owned(self.scan_tag_handle(true, mark)?));
1023        };
1024
1025        if self.input.look_ch() != '!' {
1026            return Err(ScanError::new_str(
1027                *mark,
1028                "while scanning a tag, did not find expected '!'",
1029            ));
1030        }
1031
1032        // Consume the leading '!'.
1033        self.skip_non_blank();
1034
1035        // Consume ns-word-char (ASCII alphanumeric, '_' or '-') characters.
1036        // This mirrors `StrInput::fetch_while_is_alpha` but avoids allocation.
1037        self.input.lookahead(1);
1038        while self.input.next_is_alpha() {
1039            self.skip_non_blank();
1040            self.input.lookahead(1);
1041        }
1042
1043        // Optional trailing '!'.
1044        if self.input.peek() == '!' {
1045            self.skip_non_blank();
1046        }
1047
1048        let Some(end) = self.input.byte_offset() else {
1049            // Should be impossible if `byte_offset()` was `Some` above, but keep safe fallback.
1050            return Ok(Cow::Owned(self.scan_tag_handle(true, mark)?));
1051        };
1052
1053        let Some(slice) = self.try_borrow_slice(start, end) else {
1054            // Fall back to allocating if zero-copy borrow is not available.
1055            let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
1056                ScanError::new_str(
1057                    *mark,
1058                    "internal error: input advertised slicing but did not provide a slice",
1059                )
1060            })?;
1061            if !slice.ends_with('!') && slice != "!" {
1062                return Err(ScanError::new_str(
1063                    *mark,
1064                    "while parsing a tag directive, did not find expected '!'",
1065                ));
1066            }
1067            return Ok(Cow::Owned(slice.to_owned()));
1068        };
1069
1070        if !slice.ends_with('!') && slice != "!" {
1071            return Err(ScanError::new_str(
1072                *mark,
1073                "while parsing a tag directive, did not find expected '!'",
1074            ));
1075        }
1076
1077        Ok(Cow::Borrowed(slice))
1078    }
1079
1080    /// Scan a tag prefix for a `%TAG` directive as a `Cow<str>`.
1081    ///
1082    /// This borrows from `StrInput` only when no URI escape sequences are encountered. If a `%`
1083    /// escape is present, the prefix must be decoded and therefore allocated.
1084    fn scan_tag_prefix_directive_cow(
1085        &mut self,
1086        start_mark: &Marker,
1087    ) -> Result<Cow<'input, str>, ScanError> {
1088        let Some(start) = self.input.byte_offset() else {
1089            return Ok(Cow::Owned(self.scan_tag_prefix(start_mark)?));
1090        };
1091
1092        // The prefix must start with either '!' (local) or a valid global tag char.
1093        if self.input.look_ch() == '!' {
1094            self.skip_non_blank();
1095        } else if !is_tag_char(self.input.peek()) {
1096            return Err(ScanError::new_str(
1097                *start_mark,
1098                "invalid global tag character",
1099            ));
1100        } else if self.input.peek() == '%' {
1101            // Needs decoding. Fall back to allocating path below.
1102        } else {
1103            self.skip_non_blank();
1104        }
1105
1106        // Consume URI chars while we can stay in the borrowed path.
1107        while is_uri_char(self.input.look_ch()) {
1108            if self.input.peek() == '%' {
1109                break;
1110            }
1111            self.skip_non_blank();
1112        }
1113
1114        // If we encountered an escape sequence, we must decode, therefore allocate.
1115        if self.input.peek() == '%' {
1116            let current = self
1117                .input
1118                .byte_offset()
1119                .expect("byte_offset() must remain available once enabled");
1120            let mut out = if let Some(slice) = self.input.slice_bytes(start, current) {
1121                slice.to_owned()
1122            } else {
1123                String::new()
1124            };
1125
1126            while is_uri_char(self.input.look_ch()) {
1127                if self.input.peek() == '%' {
1128                    out.push(self.scan_uri_escapes(start_mark)?);
1129                } else {
1130                    out.push(self.input.peek());
1131                    self.skip_non_blank();
1132                }
1133            }
1134            return Ok(Cow::Owned(out));
1135        }
1136
1137        let Some(end) = self.input.byte_offset() else {
1138            return Ok(Cow::Owned(self.scan_tag_prefix(start_mark)?));
1139        };
1140
1141        let Some(slice) = self.try_borrow_slice(start, end) else {
1142            // Fall back to allocating if zero-copy borrow is not available.
1143            let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
1144                ScanError::new_str(
1145                    *start_mark,
1146                    "internal error: input advertised slicing but did not provide a slice",
1147                )
1148            })?;
1149            return Ok(Cow::Owned(slice.to_owned()));
1150        };
1151
1152        Ok(Cow::Borrowed(slice))
1153    }
1154    /// Create a scanner over the given input source.
1155    pub fn new(input: T) -> Self {
1156        let initial_byte_offset = input.byte_offset();
1157        let comments_possible = input.may_contain_comments();
1158        Scanner {
1159            input,
1160            mark: Marker::new(0, 1, 0).with_byte_offset(initial_byte_offset),
1161            tokens: VecDeque::with_capacity(64),
1162            error: None,
1163            deferred_error: None,
1164            comments_possible,
1165
1166            stream_start_produced: false,
1167            stream_end_produced: false,
1168            document_prefix_allowed: true,
1169            adjacent_value_allowed_at: 0,
1170            simple_key_allowed: true,
1171            simple_keys: smallvec::SmallVec::new(),
1172            indent: -1,
1173            indents: smallvec::SmallVec::new(),
1174            flow_level: 0,
1175            tokens_parsed: 0,
1176            token_available: false,
1177            leading_whitespace: true,
1178            flow_mapping_started: smallvec::SmallVec::new(),
1179            implicit_flow_mapping_states: smallvec::SmallVec::new(),
1180            flow_markers: smallvec::SmallVec::new(),
1181            interrupted_plain_by_comment: None,
1182            explicit_key_tab_check_pending: false,
1183
1184            buf_leading_break: String::with_capacity(128),
1185            buf_trailing_breaks: String::with_capacity(128),
1186            buf_whitespaces: String::with_capacity(128),
1187        }
1188    }
1189
1190    /// Return a copy of the last error that was encountered, if any.
1191    ///
1192    /// This does not clear the error state and further calls to [`Self::get_error`] will return (a
1193    /// clone of) the same error.
1194    #[inline]
1195    pub fn get_error(&self) -> Option<ScanError> {
1196        self.error.clone().or_else(|| self.deferred_error.clone())
1197    }
1198
    /// Record `error` as the scanner's last error and signal the end of iteration.
    ///
    /// Always returns `None`, which lets `Iterator::next` return the result of this call
    /// directly. Marked `#[cold]` because it is only reached on the error path.
    #[cold]
    fn stop_after_error(&mut self, error: ScanError) -> Option<Token<'input>> {
        self.error = Some(error);
        None
    }
1204
    /// Build the `simple key expected ':'` error positioned at `mark`.
    ///
    /// Marked `#[cold]` because it is only reached on the error path.
    #[cold]
    fn simple_key_expected(mark: Marker) -> ScanError {
        ScanError::new_str(mark, "simple key expected ':'")
    }
1209
    /// Build the `unclosed bracket` error for `bracket`, positioned at `mark`.
    ///
    /// The bracket character is embedded in the message, so this allocates a formatted string.
    /// Marked `#[cold]` because it is only reached on the error path.
    #[cold]
    fn unclosed_bracket(mark: Marker, bracket: char) -> ScanError {
        ScanError::new(mark, format!("unclosed bracket '{bracket}'"))
    }
1214
1215    /// Consume the next character. It is assumed the next character is a blank.
1216    #[inline]
1217    fn skip_blank(&mut self) {
1218        self.input.skip();
1219
1220        self.mark.offsets.chars += 1;
1221        self.mark.col += 1;
1222        self.mark.offsets.bytes = self.input.byte_offset();
1223    }
1224
1225    /// Consume the next character. It is assumed the next character is not a blank.
1226    #[inline]
1227    fn skip_non_blank(&mut self) {
1228        self.input.skip();
1229
1230        self.mark.offsets.chars += 1;
1231        self.mark.col += 1;
1232        self.mark.offsets.bytes = self.input.byte_offset();
1233        self.leading_whitespace = false;
1234    }
1235
1236    /// Consume a byte order mark from a document prefix.
1237    ///
1238    /// The source index advances, but the logical column remains unchanged so directives and
1239    /// document markers immediately following the BOM are still recognized as line-start tokens.
1240    #[inline]
1241    fn skip_bom(&mut self) {
1242        self.input.skip();
1243
1244        self.mark.offsets.chars += 1;
1245        self.mark.offsets.bytes = self.input.byte_offset();
1246    }
1247
1248    /// Consume one character that belongs to a comment.
1249    ///
1250    /// Unlike [`Self::skip_non_blank`], this deliberately does not change
1251    /// `leading_whitespace`. Comments are presentation content, so consuming one for either
1252    /// tokenization or skipping should only advance position bookkeeping.
1253    #[inline]
1254    fn skip_comment_char(&mut self) {
1255        self.input.skip();
1256
1257        self.mark.offsets.chars += 1;
1258        self.mark.col += 1;
1259        self.mark.offsets.bytes = self.input.byte_offset();
1260    }
1261
1262    /// Consume the next characters. It is assumed none of the next characters are blanks.
1263    #[inline]
1264    fn skip_n_non_blank(&mut self, count: usize) {
1265        for _ in 0..count {
1266            self.input.skip();
1267            self.mark.offsets.chars += 1;
1268            self.mark.col += 1;
1269        }
1270        self.mark.offsets.bytes = self.input.byte_offset();
1271        self.leading_whitespace = false;
1272    }
1273
1274    /// Consume the next character. It is assumed the next character is a newline.
1275    #[inline]
1276    fn skip_nl(&mut self) {
1277        self.input.skip();
1278
1279        self.mark.offsets.chars += 1;
1280        self.mark.col = 0;
1281        self.mark.line += 1;
1282        self.mark.offsets.bytes = self.input.byte_offset();
1283        self.leading_whitespace = true;
1284    }
1285
1286    /// Consume a line break (either CR, LF, or CRLF), if any. Do nothing if there is none.
1287    #[inline]
1288    fn skip_linebreak(&mut self) {
1289        if self.input.next_2_are('\r', '\n') {
1290            // While technically not a blank, this does not matter as `self.leading_whitespace`
1291            // will be reset by `skip_nl`.
1292            self.skip_blank();
1293            self.skip_nl();
1294        } else if self.input.next_is_break() {
1295            self.skip_nl();
1296        }
1297    }
1298
1299    #[cfg(test)]
1300    fn scan_comment_token(&mut self) -> Result<Token<'input>, ScanError> {
1301        Ok(self.scan_comment_queued_token()?.into_public())
1302    }
1303
1304    fn scan_comment_queued_token(&mut self) -> Result<QueuedToken<'input>, ScanError> {
1305        let start_mark = self.mark;
1306        debug_assert_eq!(self.input.peek(), '#');
1307        let placement = if self.leading_whitespace {
1308            Placement::Free
1309        } else {
1310            Placement::Right
1311        };
1312
1313        self.skip_comment_char();
1314
1315        let text = if let Some(start) = self.input.byte_offset() {
1316            // Stable byte offsets are available; slice the payload once at the end.
1317            let n = self.input.skip_while_non_breakz();
1318            self.mark.offsets.chars += n;
1319            self.mark.col += n;
1320            let byte_offset = self.input.byte_offset();
1321            self.mark.offsets.bytes = byte_offset;
1322            let end = byte_offset.expect("byte_offset must remain available once enabled");
1323
1324            if let Some(slice) = self.try_borrow_slice(start, end) {
1325                Cow::Borrowed(slice)
1326            } else if let Some(slice) = self.input.slice_bytes(start, end) {
1327                // Defensive fallback for third-party inputs that expose offsets but cannot borrow.
1328                Cow::Owned(slice.to_owned())
1329            } else {
1330                return Err(ScanError::new_str(
1331                    start_mark,
1332                    "internal error: input advertised offsets but did not provide a slice",
1333                ));
1334            }
1335        } else {
1336            // Streaming input without stable offsets; collect into an owned string.
1337            let mut owned = String::new();
1338            while !is_breakz(self.input.look_ch()) {
1339                owned.push(self.input.peek());
1340                self.skip_comment_char();
1341            }
1342            Cow::Owned(owned)
1343        };
1344
1345        let end_mark = self.mark;
1346        let span = Span::new(start_mark, end_mark);
1347        Ok(QueuedToken(
1348            span,
1349            QueuedTokenType::Comment(QueuedComment { text, placement }),
1350        ))
1351    }
1352
1353    fn push_comment_token(&mut self) -> ScanResult {
1354        let token = self.scan_comment_queued_token()?;
1355        self.tokens.push_back(token);
1356        Ok(())
1357    }
1358
1359    fn skip_comment(&mut self) {
1360        debug_assert_eq!(self.input.peek(), '#');
1361
1362        self.skip_comment_char();
1363        let n = self.input.skip_while_non_breakz();
1364        self.mark.offsets.chars += n;
1365        self.mark.col += n;
1366        self.mark.offsets.bytes = self.input.byte_offset();
1367    }
1368
    /// Return whether the [`TokenType::StreamStart`] event has been emitted.
    ///
    /// This reports the scanner's `stream_start_produced` flag.
    #[inline]
    pub fn stream_started(&self) -> bool {
        self.stream_start_produced
    }
1374
    /// Return whether the [`TokenType::StreamEnd`] event has been emitted.
    ///
    /// This reports the scanner's `stream_end_produced` flag.
    #[inline]
    pub fn stream_ended(&self) -> bool {
        self.stream_end_produced
    }
1380
    /// Return the current position in the input stream.
    ///
    /// The marker is returned by value, so it does not follow the scanner as scanning
    /// continues.
    #[inline]
    pub fn mark(&self) -> Marker {
        self.mark
    }
1386
    /// Return whether this scanner may emit comment tokens.
    ///
    /// This reports the scanner's `comments_possible` flag.
    #[inline]
    pub(crate) fn comments_possible(&self) -> bool {
        self.comments_possible
    }
1392
    /// Read and consume a line break (either `\r`, `\n` or `\r\n`).
    ///
    /// A `\n` is pushed into `s`, whichever form the line break had in the input.
    ///
    /// # Panics (in debug)
    /// If the next characters do not correspond to a line break.
    #[inline]
    fn read_break(&mut self, s: &mut String) {
        self.skip_break();
        s.push('\n');
    }
1404
1405    // Read and consume a line break (either `\r`, `\n` or `\r\n`).
1406    //
1407    // # Panics (in debug)
1408    // If the next characters do not correspond to a line break.
1409    #[inline]
1410    fn skip_break(&mut self) {
1411        let c = self.input.peek();
1412        let nc = self.input.peek_nth(1);
1413        debug_assert!(is_break(c));
1414        if c == '\r' && nc == '\n' {
1415            self.skip_blank();
1416        }
1417        self.skip_nl();
1418    }
1419
1420    /// Insert a token at the given position.
1421    fn insert_token(&mut self, pos: usize, tok: Token<'input>) {
1422        let old_len = self.tokens.len();
1423        assert!(pos <= old_len);
1424        self.tokens.insert(pos, tok.into());
1425    }
1426
1427    fn simple_key_token_index(&self, sk: &SimpleKey, mark: Marker) -> Result<usize, ScanError> {
1428        let Some(index) = sk.token_number.checked_sub(self.tokens_parsed) else {
1429            return Err(ScanError::new_str(mark, "simple key is no longer valid"));
1430        };
1431        if index > self.tokens.len() {
1432            return Err(ScanError::new_str(mark, "simple key is no longer valid"));
1433        }
1434        Ok(index)
1435    }
1436
    /// Record that a simple key may start at the current position.
    #[inline]
    fn allow_simple_key(&mut self) {
        self.simple_key_allowed = true;
    }
1441
    /// Record that a simple key may not start at the current position.
    #[inline]
    fn disallow_simple_key(&mut self) {
        self.simple_key_allowed = false;
    }
1446
    /// Scan enough input to append one next token to the internal token queue.
    ///
    /// The first call only produces the stream-start token. Later calls skip to the next token
    /// (returning early when `skip_to_next_token` reports `true`), stale outdated simple keys,
    /// unroll indentation to the current column, and then dispatch on the next one or two
    /// characters to the matching `fetch_*` routine.
    ///
    /// # Errors
    /// Returns `ScanError` when the scanner does not find the next expected token.
    pub fn fetch_next_token(&mut self) -> ScanResult {
        self.input.lookahead(1);

        if !self.stream_start_produced {
            self.fetch_stream_start();
            return Ok(());
        }
        // `true` means the helper already did this call's work (presumably it queued a comment
        // token, since it is asked to stop after one).
        if self.skip_to_next_token(true)? {
            return Ok(());
        }

        debug_print!(
            "  \x1B[38;5;244m\u{2192} fetch_next_token after whitespace {:?} {:?}\x1B[m",
            self.mark,
            self.input.peek()
        );

        self.stale_simple_keys()?;

        let mark = self.mark;
        self.unroll_indent(mark.col as isize);

        // Document markers and the checks below look at up to four characters.
        self.input.lookahead(4);

        if self.input.next_is_z() {
            self.fetch_stream_end()?;
            return Ok(());
        }

        // Directives and document markers are only recognized at the start of a line.
        if self.mark.col == 0 {
            if self.input.next_char_is('%') {
                return self.fetch_directive();
            } else if self.input.next_is_document_start() {
                return self.fetch_document_indicator(TokenType::DocumentStart);
            } else if self.input.next_is_document_end() {
                self.fetch_document_indicator(TokenType::DocumentEnd)?;
                // Only whitespace may follow the marker on its line.
                self.skip_ws_to_eol(SkipTabs::Yes)?;
                if !self.input.next_is_breakz() {
                    return Err(ScanError::new_str(
                        self.mark,
                        "invalid content after document end marker",
                    ));
                }
                return Ok(());
            }
        }

        // Anything reaching this point ends the document prefix.
        if self.document_prefix_allowed {
            self.document_prefix_allowed = false;
        }

        // Content left of the current indentation is only tolerated for the flow indicators
        // `]`, `}` and `,` while inside a flow collection.
        if (self.mark.col as isize) < self.indent {
            self.input.lookahead(1);
            let c = self.input.peek();
            if self.flow_level == 0 || !matches!(c, ']' | '}' | ',') {
                return Err(ScanError::new_str(self.mark, "invalid indentation"));
            }
        }

        let c = self.input.peek();
        let nc = self.input.peek_nth(1);
        // Arm order matters: guarded indicator arms come before the plain-scalar fallbacks for
        // the same character.
        match c {
            '[' => self.fetch_flow_collection_start(TokenType::FlowSequenceStart),
            '{' => self.fetch_flow_collection_start(TokenType::FlowMappingStart),
            ']' => self.fetch_flow_collection_end(TokenType::FlowSequenceEnd),
            '}' => self.fetch_flow_collection_end(TokenType::FlowMappingEnd),
            ',' => self.fetch_flow_entry(),
            '-' if is_blank_or_breakz(nc) => self.fetch_block_entry(),
            '?' if is_blank_or_breakz(nc) => self.fetch_key(),
            ':' if is_blank_or_breakz(nc) => self.fetch_value(),
            // In flow context a `:` is also a value indicator when followed by a flow
            // indicator or when it sits at the recorded adjacent-value position.
            ':' if self.flow_level > 0
                && (is_flow(nc) || self.mark.index() == self.adjacent_value_allowed_at) =>
            {
                self.fetch_flow_value()
            }
            // Is it an alias?
            '*' => self.fetch_anchor(true),
            // Is it an anchor?
            '&' => self.fetch_anchor(false),
            '!' => self.fetch_tag(),
            // Is it a literal scalar?
            '|' if self.flow_level == 0 => self.fetch_block_scalar(true),
            // Is it a folded scalar?
            '>' if self.flow_level == 0 => self.fetch_block_scalar(false),
            '\'' => self.fetch_flow_scalar(true),
            '"' => self.fetch_flow_scalar(false),
            // plain scalar
            '-' if !is_blank_or_breakz(nc) => self.fetch_plain_scalar(),
            ':' | '?' if !is_blank_or_breakz(nc) && self.flow_level == 0 => {
                self.fetch_plain_scalar()
            }
            // The document prefix has ended by now, so a BOM here is rejected.
            c if is_bom(c) => Err(ScanError::new_str(
                self.mark,
                "a BOM must not appear inside a document",
            )),
            '%' | '@' | '`' => Err(ScanError::new(
                self.mark,
                format!("unexpected character: `{c}'"),
            )),
            _ => self.fetch_plain_scalar(),
        }
    }
1553
1554    /// Return the next compact queued token, scanning more input when needed.
1555    ///
1556    /// # Errors
1557    /// Returns `ScanError` when scanning fails to find an expected next token.
1558    pub(crate) fn next_queued_token(&mut self) -> Result<Option<QueuedToken<'input>>, ScanError> {
1559        if self.deferred_error.is_some() {
1560            if !matches!(
1561                self.tokens.front().map(|token| &token.1),
1562                Some(QueuedTokenType::Comment(_))
1563            ) {
1564                if let Some(error) = self.deferred_error.take() {
1565                    return error.into_result();
1566                }
1567            }
1568            self.token_available = true;
1569        }
1570
1571        if self.stream_end_produced {
1572            return Ok(None);
1573        }
1574
1575        if !self.token_available {
1576            if let Err(error) = self.fetch_more_tokens() {
1577                if matches!(
1578                    self.tokens.front().map(|token| &token.1),
1579                    Some(QueuedTokenType::Comment(_))
1580                ) {
1581                    self.deferred_error = Some(error);
1582                } else {
1583                    return Err(error);
1584                }
1585            }
1586        }
1587        let Some(t) = self.tokens.pop_front() else {
1588            return Err(ScanError::new_str(
1589                self.mark,
1590                "did not find expected next token",
1591            ));
1592        };
1593        self.token_available = false;
1594        self.tokens_parsed += 1;
1595
1596        let is_stream_end = matches!(t.1, QueuedTokenType::StreamEnd);
1597        if is_stream_end {
1598            self.stream_end_produced = true;
1599        }
1600        Ok(Some(t))
1601    }
1602
1603    /// Return the next queued token, scanning more input when needed.
1604    ///
1605    /// # Errors
1606    /// Returns `ScanError` when scanning fails to find an expected next token.
1607    pub fn next_token(&mut self) -> Result<Option<Token<'input>>, ScanError> {
1608        Ok(self.next_queued_token()?.map(QueuedToken::into_public))
1609    }
1610
1611    /// Scan more input until a token is ready to be returned.
1612    ///
1613    /// # Errors
1614    /// Returns `ScanError` when scanning fails.
1615    pub fn fetch_more_tokens(&mut self) -> ScanResult {
1616        let mut need_more;
1617        loop {
1618            if self.tokens.is_empty() {
1619                need_more = true;
1620            } else {
1621                need_more = false;
1622                // Stale potential keys that we know won't be keys.
1623                self.stale_simple_keys()?;
1624                if !matches!(
1625                    self.tokens.front().map(|token| &token.1),
1626                    Some(QueuedTokenType::Comment(_))
1627                ) {
1628                    // If our next token to be emitted may be a key, fetch more context.
1629                    for sk in &self.simple_keys {
1630                        if sk.possible && sk.token_number == self.tokens_parsed {
1631                            need_more = true;
1632                            break;
1633                        }
1634                    }
1635                }
1636            }
1637
1638            // Stop fetching immediately after document end/start markers
1639            // to allow the parser to emit the event before reading more content.
1640            if let Some(token) = self.tokens.back() {
1641                if matches!(
1642                    token.1,
1643                    QueuedTokenType::DocumentEnd | QueuedTokenType::DocumentStart
1644                ) {
1645                    break;
1646                }
1647            }
1648
1649            if !need_more {
1650                break;
1651            }
1652            self.fetch_next_token()?;
1653        }
1654        self.token_available = true;
1655
1656        Ok(())
1657    }
1658
1659    /// Mark simple keys that can no longer be keys as such.
1660    ///
1661    /// This function sets `possible` to `false` to each key that, now we have more context, we
1662    /// know will not be keys.
1663    ///
1664    /// # Errors
1665    /// This function returns an error if one of the keys becoming impossible was required to be a
1666    /// key.
1667    fn stale_simple_keys(&mut self) -> ScanResult {
1668        for sk in &mut self.simple_keys {
1669            let is_line_stale = self.flow_level == 0 && sk.mark.line < self.mark.line;
1670            // The length cap applies in flow contexts too; otherwise token buffering can grow
1671            // without bound while the scanner waits to see whether a later ':' resolves the key.
1672            let is_length_stale =
1673                self.mark.index().saturating_sub(sk.mark.index()) > SIMPLE_KEY_MAX_LOOKAHEAD;
1674
1675            if sk.possible && (is_line_stale || is_length_stale) {
1676                if sk.required {
1677                    return Err(Self::simple_key_expected(sk.mark));
1678                }
1679                sk.possible = false;
1680            }
1681        }
1682        Ok(())
1683    }
1684
    /// Skip over whitespace (`\t`, ` `, `\n`, `\r`), an allowed byte-order mark and comments
    /// until the next non-comment token.
    ///
    /// Comments encountered while skipping are queued as [`TokenType::Comment`] tokens so the
    /// parser can emit them as presentation events. If `stop_after_comment` is true, the function
    /// returns after queuing one comment so callers can emit it before scanning later comments.
    ///
    /// Every consumed line break re-allows simple keys when in block context (`flow_level == 0`).
    ///
    /// # Returns
    /// `Ok(true)` when the function returned early after queuing a comment (only possible when
    /// `stop_after_comment` is set), `Ok(false)` once a character that is not skippable has been
    /// reached.
    ///
    /// # Errors
    /// This function returns an error if a tab is encountered where there should not be
    /// one (while an explicit-key tab check is pending, or as block indentation in front of
    /// content), or if a comment interrupted a plain scalar that the immediately following line
    /// could continue.
    fn skip_to_next_token(&mut self, stop_after_comment: bool) -> Result<bool, ScanError> {
        // Hot-path helper: consume a single logical line break and apply simple-key rules.
        // (Kept local to ensure the compiler can inline it easily.)
        let consume_linebreak = |this: &mut Self| {
            // Two characters of lookahead are requested before skipping the break
            // (presumably so a `\r\n` pair can be consumed as one break).
            this.input.lookahead(2);
            this.skip_linebreak();
            // Only block context re-enables simple keys after a line break.
            if this.flow_level == 0 {
                this.allow_simple_key();
            }
        };

        loop {
            let ch = self.input.look_ch();
            // While the explicit-key tab check is pending, a tab is rejected. Spaces, line
            // breaks and comment starts keep the check armed; any other character disarms it.
            if self.explicit_key_tab_check_pending {
                match ch {
                    '\t' => {
                        return Err(ScanError::new_str(
                            self.mark(),
                            "tabs disallowed in this context",
                        ));
                    }
                    ' ' | '\n' | '\r' | '#' => {}
                    _ => self.explicit_key_tab_check_pending = false,
                }
            }

            match ch {
                // Tabs may not be used as indentation (block context only).
                '\t' => {
                    // The tab counts as indentation only when we are within a block, still in
                    // the line's leading whitespace, and left of the current indent column.
                    if self.is_within_block()
                        && self.leading_whitespace
                        && (self.mark.col as isize) < self.indent
                    {
                        self.skip_ws_to_eol(SkipTabs::Yes)?;

                        // If we have content on that line with a tab, return an error.
                        if !self.input.next_is_breakz() {
                            return Err(ScanError::new_str(
                                self.mark,
                                "tabs disallowed within this context (block indentation)",
                            ));
                        }

                        // Micro-opt: if we stopped on a line break, consume it now (avoids another loop trip).
                        if matches!(self.input.look_ch(), '\n' | '\r') {
                            consume_linebreak(self);
                        }
                    } else {
                        // Non-indentation tab behaves like blank.
                        self.skip_blank();
                    }
                }

                ' ' => self.skip_blank(),

                '\n' | '\r' => consume_linebreak(self),

                // A byte-order mark is skipped only at column 0, outside flow collections, and
                // while a document prefix is still allowed.
                c if is_bom(c)
                    && self.document_prefix_allowed
                    && self.flow_level == 0
                    && self.mark.col == 0 =>
                {
                    self.skip_bom();
                }

                '#' => {
                    self.push_comment_token()?;

                    // Micro-opt: comment-only lines are common; consume the following line break here.
                    if matches!(self.input.look_ch(), '\n' | '\r') {
                        consume_linebreak(self);
                    }
                    // Hand control back so the caller can emit this comment before any later one.
                    if stop_after_comment {
                        return Ok(true);
                    }
                }

                // Anything else starts a token: stop skipping.
                _ => break,
            }
        }

        // If a plain scalar was interrupted by a comment, and the next line could
        // continue the scalar in block context, this is invalid.
        // `take()` clears the pending marker, so the check runs at most once per interruption.
        if let Some(err_mark) = self.interrupted_plain_by_comment.take() {
            // BS4K should only trigger when the continuation would start on the immediate next
            // line (no intervening empty/comment-only lines). A blank line resets the folding
            // opportunity and thus should not error.
            let is_immediate_next_line = self.mark.line == err_mark.line + 1;

            // Optimization: do the cheap checks first; only then request extra lookahead / do deeper checks.
            if self.flow_level == 0
                && is_immediate_next_line
                && (self.mark.col as isize) > self.indent
            {
                // Ensure enough lookahead for:
                // - the checks below (peek/peek_nth)
                // - document indicator detection which needs 4 chars.
                self.input.lookahead(4);

                // The error is reported at the recorded interruption mark, not at the
                // current position.
                if !self.input.next_is_z()
                    && !self.input.next_is_document_indicator()
                    && self.input.next_can_be_plain_scalar(false)
                {
                    return Err(ScanError::new_str(
                        err_mark,
                        "comment intercepting the multiline text",
                    ));
                }
            }
        }

        Ok(false)
    }
1807
1808    /// Skip over YAML whitespace (` `, `\n`, `\r`).
1809    ///
1810    /// If `stop_after_comment` is true, the function returns after queuing one comment so callers
1811    /// can emit it before scanning later comments.
1812    ///
1813    /// # Errors
1814    /// This function returns an error if no whitespace was found.
1815    fn skip_yaml_whitespace(&mut self, stop_after_comment: bool) -> Result<bool, ScanError> {
1816        let mut need_whitespace = true;
1817        loop {
1818            match self.input.look_ch() {
1819                ' ' => {
1820                    self.skip_blank();
1821
1822                    need_whitespace = false;
1823                }
1824                '\n' | '\r' => {
1825                    self.input.lookahead(2);
1826                    self.skip_linebreak();
1827                    if self.flow_level == 0 {
1828                        self.allow_simple_key();
1829                    }
1830                    need_whitespace = false;
1831                }
1832                '#' => {
1833                    if need_whitespace {
1834                        self.skip_comment();
1835                    } else {
1836                        self.push_comment_token()?;
1837                        if stop_after_comment {
1838                            return Ok(true);
1839                        }
1840                    }
1841                }
1842                _ => break,
1843            }
1844        }
1845
1846        if need_whitespace {
1847            Err(ScanError::new_str(self.mark(), "expected whitespace"))
1848        } else {
1849            Ok(false)
1850        }
1851    }
1852
1853    fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> Result<SkipTabs, ScanError> {
1854        debug_assert!(!matches!(skip_tabs, SkipTabs::Result(..)));
1855
1856        if !self.comments_possible {
1857            let (chars_consumed, result) = self.input.skip_ws_to_eol(skip_tabs);
1858            self.mark.col += chars_consumed;
1859            self.mark.offsets.chars += chars_consumed;
1860            self.mark.offsets.bytes = self.input.byte_offset();
1861            return result.map_err(|msg| ScanError::new_str(self.mark, msg));
1862        }
1863
1864        let (chars_consumed, whitespace) = self.input.skip_ws_to_eol_blanks(skip_tabs);
1865        self.mark.col += chars_consumed;
1866        self.mark.offsets.chars += chars_consumed;
1867        self.mark.offsets.bytes = self.input.byte_offset();
1868
1869        if self.input.look_ch() != '#' {
1870            return Ok(whitespace);
1871        }
1872
1873        if !whitespace.found_tabs() && !whitespace.has_valid_yaml_ws() {
1874            return Err(ScanError::new_str(
1875                self.mark,
1876                "comments must be separated from other tokens by whitespace",
1877            ));
1878        }
1879
1880        self.push_comment_token()?;
1881        Ok(whitespace)
1882    }
1883
1884    fn fetch_stream_start(&mut self) {
1885        let mark = self.mark;
1886        self.indent = -1;
1887        self.stream_start_produced = true;
1888        self.allow_simple_key();
1889        self.tokens
1890            .push_back(Token(Span::empty(mark), TokenType::StreamStart(TEncoding::Utf8)).into());
1891        self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0)));
1892    }
1893
1894    fn fetch_stream_end(&mut self) -> ScanResult {
1895        // force new line
1896        if self.mark.col != 0 {
1897            self.mark.col = 0;
1898            self.mark.line += 1;
1899        }
1900
1901        if let Some((mark, bracket)) = self.flow_markers.pop() {
1902            return Err(Self::unclosed_bracket(mark, bracket));
1903        }
1904
1905        // If the stream ended, we won't have more context. We can stall all the simple keys we
1906        // had. If one was required, however, that was an error and we must propagate it.
1907        for sk in &mut self.simple_keys {
1908            if sk.required && sk.possible {
1909                return Err(Self::simple_key_expected(sk.mark));
1910            }
1911            sk.possible = false;
1912        }
1913
1914        self.unroll_indent(-1);
1915        self.remove_simple_key()?;
1916        self.disallow_simple_key();
1917
1918        self.tokens
1919            .push_back(Token(Span::empty(self.mark), TokenType::StreamEnd).into());
1920        Ok(())
1921    }
1922
1923    fn fetch_directive(&mut self) -> ScanResult {
1924        self.unroll_indent(-1);
1925        self.remove_simple_key()?;
1926
1927        self.disallow_simple_key();
1928
1929        let token_index = self.tokens.len();
1930        let tok = self.scan_directive()?;
1931        self.insert_token(token_index, tok);
1932
1933        Ok(())
1934    }
1935
1936    fn scan_directive(&mut self) -> Result<Token<'input>, ScanError> {
1937        let start_mark = self.mark;
1938        self.skip_non_blank();
1939
1940        let name = self.scan_directive_name()?;
1941        let tok = match name.as_ref() {
1942            "YAML" => self.scan_version_directive_value(&start_mark)?,
1943            "TAG" => self.scan_tag_directive_value(&start_mark)?,
1944            _ => {
1945                let mut params = Vec::new();
1946                while self.input.next_is_blank() {
1947                    let n_blanks = self.input.skip_while_blank();
1948                    self.mark.offsets.chars += n_blanks;
1949                    self.mark.col += n_blanks;
1950                    self.mark.offsets.bytes = self.input.byte_offset();
1951
1952                    if !is_blank_or_breakz(self.input.peek()) {
1953                        let mut param = String::new();
1954                        let n_chars = self.input.fetch_while_is_yaml_non_space(&mut param);
1955                        self.mark.offsets.chars += n_chars;
1956                        self.mark.col += n_chars;
1957                        self.mark.offsets.bytes = self.input.byte_offset();
1958                        params.push(param);
1959                    }
1960                }
1961
1962                Token(
1963                    Span::new(start_mark, self.mark),
1964                    TokenType::ReservedDirective(name, params),
1965                )
1966            }
1967        };
1968
1969        self.skip_ws_to_eol(SkipTabs::Yes)?;
1970
1971        if self.input.next_is_breakz() {
1972            self.input.lookahead(2);
1973            self.skip_linebreak();
1974            Ok(tok)
1975        } else {
1976            Err(ScanError::new_str(
1977                start_mark,
1978                "while scanning a directive, did not find expected comment or line break",
1979            ))
1980        }
1981    }
1982
1983    fn scan_version_directive_value(&mut self, mark: &Marker) -> Result<Token<'input>, ScanError> {
1984        let n_blanks = self.input.skip_while_blank();
1985        self.mark.offsets.chars += n_blanks;
1986        self.mark.col += n_blanks;
1987        self.mark.offsets.bytes = self.input.byte_offset();
1988
1989        let major = self.scan_version_directive_number(mark)?;
1990
1991        if self.input.peek() != '.' {
1992            return Err(ScanError::new_str(
1993                *mark,
1994                "while scanning a YAML directive, did not find expected digit or '.' character",
1995            ));
1996        }
1997        self.skip_non_blank();
1998
1999        let minor = self.scan_version_directive_number(mark)?;
2000
2001        Ok(Token(
2002            Span::new(*mark, self.mark),
2003            TokenType::VersionDirective(major, minor),
2004        ))
2005    }
2006
2007    fn scan_directive_name(&mut self) -> Result<String, ScanError> {
2008        let start_mark = self.mark;
2009        let mut string = String::new();
2010
2011        let n_chars = self.input.fetch_while_is_yaml_non_space(&mut string);
2012        self.mark.offsets.chars += n_chars;
2013        self.mark.col += n_chars;
2014        self.mark.offsets.bytes = self.input.byte_offset();
2015
2016        if string.is_empty() {
2017            return Err(ScanError::new_str(
2018                start_mark,
2019                "while scanning a directive, could not find expected directive name",
2020            ));
2021        }
2022
2023        if !is_blank_or_breakz(self.input.peek()) {
2024            return Err(ScanError::new_str(
2025                start_mark,
2026                "while scanning a directive, found unexpected non-alphabetical character",
2027            ));
2028        }
2029
2030        Ok(string)
2031    }
2032
2033    fn scan_version_directive_number(&mut self, mark: &Marker) -> Result<u32, ScanError> {
2034        let mut val = 0u32;
2035        let mut length = 0usize;
2036        while let Some(digit) = self.input.look_ch().to_digit(10) {
2037            if length + 1 > 9 {
2038                return Err(ScanError::new_str(
2039                    *mark,
2040                    "while scanning a YAML directive, found extremely long version number",
2041                ));
2042            }
2043            length += 1;
2044            val = val * 10 + digit;
2045            self.skip_non_blank();
2046        }
2047
2048        if length == 0 {
2049            return Err(ScanError::new_str(
2050                *mark,
2051                "while scanning a YAML directive, did not find expected version number",
2052            ));
2053        }
2054
2055        Ok(val)
2056    }
2057
2058    fn scan_tag_directive_value(&mut self, mark: &Marker) -> Result<Token<'input>, ScanError> {
2059        let n_blanks = self.input.skip_while_blank();
2060        self.mark.offsets.chars += n_blanks;
2061        self.mark.col += n_blanks;
2062        self.mark.offsets.bytes = self.input.byte_offset();
2063
2064        let handle = self.scan_tag_handle_directive_cow(mark)?;
2065
2066        let n_blanks = self.input.skip_while_blank();
2067        self.mark.offsets.chars += n_blanks;
2068        self.mark.col += n_blanks;
2069        self.mark.offsets.bytes = self.input.byte_offset();
2070
2071        let prefix = self.scan_tag_prefix_directive_cow(mark)?;
2072
2073        self.input.lookahead(1);
2074
2075        if self.input.next_is_blank_or_breakz() {
2076            Ok(Token(
2077                Span::new(*mark, self.mark),
2078                TokenType::TagDirective(handle, prefix),
2079            ))
2080        } else {
2081            Err(ScanError::new_str(
2082                *mark,
2083                "while scanning TAG, did not find expected whitespace or line break",
2084            ))
2085        }
2086    }
2087
2088    fn fetch_tag(&mut self) -> ScanResult {
2089        self.save_simple_key();
2090        self.disallow_simple_key();
2091
2092        let tok = self.scan_tag()?;
2093        self.tokens.push_back(tok.into());
2094        Ok(())
2095    }
2096
2097    fn scan_tag(&mut self) -> Result<Token<'input>, ScanError> {
2098        let start_mark = self.mark;
2099
2100        // Check if the tag is in the canonical form (verbatim).
2101        self.input.lookahead(2);
2102
2103        // If byte_offset is not available, use the original owned-only path.
2104        if self.input.byte_offset().is_none() {
2105            return self.scan_tag_owned(&start_mark);
2106        }
2107
2108        let (handle, suffix): (Cow<'input, str>, Cow<'input, str>) =
2109            if self.input.nth_char_is(1, '<') {
2110                // Verbatim tags always need owned strings (URI escapes).
2111                let suffix = self.scan_verbatim_tag(&start_mark)?;
2112                (Cow::Owned(String::new()), Cow::Owned(suffix))
2113            } else {
2114                // The tag has either the '!suffix' or the '!handle!suffix'
2115                let handle = self.scan_tag_handle_cow(&start_mark)?;
2116                // Check if it is, indeed, handle.
2117                if handle.len() >= 2 && handle.starts_with('!') && handle.ends_with('!') {
2118                    // A tag handle starting with "!!" is a secondary tag handle.
2119                    let suffix = self.scan_tag_shorthand_suffix_cow(&start_mark, true)?;
2120                    (handle, suffix)
2121                } else {
2122                    // Not a real handle, it's part of the suffix.
2123                    // E.g., "!foo" -> handle="!", suffix="foo"
2124                    // The "handle" we scanned is actually "!" + suffix_part1.
2125                    // We need to also scan any remaining suffix characters.
2126                    let remaining_suffix =
2127                        self.scan_tag_shorthand_suffix_cow(&start_mark, false)?;
2128
2129                    // Extract suffix from handle (skip leading '!') and combine with remaining.
2130                    let suffix = if handle.len() > 1 {
2131                        if remaining_suffix.is_empty() {
2132                            // The suffix is just what's in handle after '!'
2133                            match handle {
2134                                Cow::Borrowed(s) => Cow::Borrowed(&s[1..]),
2135                                Cow::Owned(s) => Cow::Owned(s[1..].to_owned()),
2136                            }
2137                        } else {
2138                            // Combine handle (minus leading '!') with remaining suffix.
2139                            let mut combined = handle[1..].to_owned();
2140                            combined.push_str(&remaining_suffix);
2141                            Cow::Owned(combined)
2142                        }
2143                    } else {
2144                        // handle is just "!", suffix is whatever we scanned after
2145                        remaining_suffix
2146                    };
2147
2148                    // A special case: the '!' tag.  Set the handle to '' and the
2149                    // suffix to '!'.
2150                    if suffix.is_empty() {
2151                        (Cow::Borrowed(""), Cow::Borrowed("!"))
2152                    } else {
2153                        (Cow::Borrowed("!"), suffix)
2154                    }
2155                }
2156            };
2157
2158        if is_blank_or_breakz(self.input.look_ch())
2159            || (self.flow_level > 0 && matches!(self.input.peek(), ',' | ']' | '}'))
2160        {
2161            // YAML example 7.2 allows a tag to annotate an empty scalar when a separator or flow
2162            // delimiter follows.
2163            Ok(Token(
2164                Span::new(start_mark, self.mark),
2165                TokenType::Tag(handle, suffix),
2166            ))
2167        } else {
2168            Err(ScanError::new_str(
2169                start_mark,
2170                "while scanning a tag, did not find expected whitespace or line break",
2171            ))
2172        }
2173    }
2174
2175    /// Original owned-only tag scanning path for inputs without `byte_offset` support.
2176    fn scan_tag_owned(&mut self, start_mark: &Marker) -> Result<Token<'input>, ScanError> {
2177        let mut handle = String::new();
2178        let mut suffix;
2179
2180        if self.input.nth_char_is(1, '<') {
2181            suffix = self.scan_verbatim_tag(start_mark)?;
2182        } else {
2183            // The tag has either the '!suffix' or the '!handle!suffix'
2184            handle = self.scan_tag_handle(false, start_mark)?;
2185            // Check if it is, indeed, handle.
2186            if handle.len() >= 2 && handle.starts_with('!') && handle.ends_with('!') {
2187                // A tag handle starting with "!!" is a secondary tag handle.
2188                let is_secondary_handle = handle == "!!";
2189                suffix =
2190                    self.scan_tag_shorthand_suffix(false, is_secondary_handle, "", start_mark)?;
2191            } else {
2192                suffix = self.scan_tag_shorthand_suffix(false, false, &handle, start_mark)?;
2193                "!".clone_into(&mut handle);
2194                // A special case: the '!' tag.  Set the handle to '' and the
2195                // suffix to '!'.
2196                if suffix.is_empty() {
2197                    handle.clear();
2198                    "!".clone_into(&mut suffix);
2199                }
2200            }
2201        }
2202
2203        if is_blank_or_breakz(self.input.look_ch())
2204            || (self.flow_level > 0 && matches!(self.input.peek(), ',' | ']' | '}'))
2205        {
2206            // YAML example 7.2 allows a tag to annotate an empty scalar when a separator or flow
2207            // delimiter follows.
2208            Ok(Token(
2209                Span::new(*start_mark, self.mark),
2210                TokenType::Tag(handle.into(), suffix.into()),
2211            ))
2212        } else {
2213            Err(ScanError::new_str(
2214                *start_mark,
2215                "while scanning a tag, did not find expected whitespace or line break",
2216            ))
2217        }
2218    }
2219
2220    /// Scan a tag handle as a `Cow<str>`, borrowing when possible.
2221    ///
2222    /// Tag handles are of the form `!`, `!!`, or `!name!` where name is ASCII alphanumeric.
2223    /// Since they contain no escape sequences, they can always be borrowed from `StrInput`.
2224    fn scan_tag_handle_cow(&mut self, mark: &Marker) -> Result<Cow<'input, str>, ScanError> {
2225        let Some(start) = self.input.byte_offset() else {
2226            return Ok(Cow::Owned(self.scan_tag_handle(false, mark)?));
2227        };
2228
2229        if self.input.look_ch() != '!' {
2230            return Err(ScanError::new_str(
2231                *mark,
2232                "while scanning a tag, did not find expected '!'",
2233            ));
2234        }
2235
2236        // Consume the leading '!'.
2237        self.skip_non_blank();
2238
2239        // Consume ns-word-char (ASCII alphanumeric, '_' or '-') characters.
2240        self.input.lookahead(1);
2241        while self.input.next_is_alpha() {
2242            self.skip_non_blank();
2243            self.input.lookahead(1);
2244        }
2245
2246        // Optional trailing '!'.
2247        if self.input.peek() == '!' {
2248            self.skip_non_blank();
2249        }
2250
2251        let Some(end) = self.input.byte_offset() else {
2252            return Ok(Cow::Owned(self.scan_tag_handle(false, mark)?));
2253        };
2254
2255        if let Some(slice) = self.try_borrow_slice(start, end) {
2256            Ok(Cow::Borrowed(slice))
2257        } else {
2258            let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
2259                ScanError::new_str(
2260                    *mark,
2261                    "internal error: input advertised slicing but did not provide a slice",
2262                )
2263            })?;
2264            Ok(Cow::Owned(slice.to_owned()))
2265        }
2266    }
2267
2268    /// Scan a tag shorthand suffix as a `Cow<str>`, borrowing when possible.
2269    ///
2270    /// The suffix can be borrowed only if no `%` URI escape sequences are present.
2271    fn scan_tag_shorthand_suffix_cow(
2272        &mut self,
2273        mark: &Marker,
2274        require_non_empty: bool,
2275    ) -> Result<Cow<'input, str>, ScanError> {
2276        let Some(start) = self.input.byte_offset() else {
2277            return Ok(Cow::Owned(
2278                self.scan_tag_shorthand_suffix(false, false, "", mark)?,
2279            ));
2280        };
2281
2282        // Scan tag characters, checking for URI escapes.
2283        while is_tag_char(self.input.look_ch()) {
2284            if self.input.peek() == '%' {
2285                // URI escape found - must decode, so fall back to owned path.
2286                let current = self
2287                    .input
2288                    .byte_offset()
2289                    .expect("byte_offset() must remain available once enabled");
2290                let mut out = if let Some(slice) = self.input.slice_bytes(start, current) {
2291                    slice.to_owned()
2292                } else {
2293                    String::new()
2294                };
2295
2296                // Continue scanning with owned buffer.
2297                while is_tag_char(self.input.look_ch()) {
2298                    if self.input.peek() == '%' {
2299                        out.push(self.scan_uri_escapes(mark)?);
2300                    } else {
2301                        out.push(self.input.peek());
2302                        self.skip_non_blank();
2303                    }
2304                }
2305                return Ok(Cow::Owned(out));
2306            }
2307            self.skip_non_blank();
2308        }
2309
2310        let Some(end) = self.input.byte_offset() else {
2311            return Ok(Cow::Owned(
2312                self.scan_tag_shorthand_suffix(false, false, "", mark)?,
2313            ));
2314        };
2315
2316        if require_non_empty && start == end {
2317            return Err(ScanError::new_str(
2318                *mark,
2319                "while parsing a tag, did not find expected tag URI",
2320            ));
2321        }
2322
2323        if let Some(slice) = self.try_borrow_slice(start, end) {
2324            Ok(Cow::Borrowed(slice))
2325        } else {
2326            let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
2327                ScanError::new_str(
2328                    *mark,
2329                    "internal error: input advertised slicing but did not provide a slice",
2330                )
2331            })?;
2332            Ok(Cow::Owned(slice.to_owned()))
2333        }
2334    }
2335
2336    fn scan_tag_handle(&mut self, directive: bool, mark: &Marker) -> Result<String, ScanError> {
2337        let mut string = String::new();
2338        if self.input.look_ch() != '!' {
2339            return Err(ScanError::new_str(
2340                *mark,
2341                "while scanning a tag, did not find expected '!'",
2342            ));
2343        }
2344
2345        string.push(self.input.peek());
2346        self.skip_non_blank();
2347
2348        let n_chars = self.input.fetch_while_is_alpha(&mut string);
2349        self.mark.offsets.chars += n_chars;
2350        self.mark.col += n_chars;
2351        self.mark.offsets.bytes = self.input.byte_offset();
2352
2353        // Check if the trailing character is '!' and copy it.
2354        if self.input.peek() == '!' {
2355            string.push(self.input.peek());
2356            self.skip_non_blank();
2357        } else if directive && string != "!" {
2358            // It's either the '!' tag or not really a tag handle.  If it's a %TAG
2359            // directive, it's an error.  If it's a tag token, it must be a part of
2360            // URI.
2361            return Err(ScanError::new_str(
2362                *mark,
2363                "while parsing a tag directive, did not find expected '!'",
2364            ));
2365        }
2366        Ok(string)
2367    }
2368
2369    /// Scan for a tag prefix (6.8.2.2).
2370    ///
2371    /// There are 2 kinds of tag prefixes:
2372    ///   - Local: Starts with a `!`, contains only URI chars (`!foo`)
2373    ///   - Global: Starts with a tag char, contains then URI chars (`!foo,2000:app/`)
2374    fn scan_tag_prefix(&mut self, start_mark: &Marker) -> Result<String, ScanError> {
2375        let mut string = String::new();
2376
2377        if self.input.look_ch() == '!' {
2378            // If we have a local tag, insert and skip `!`.
2379            string.push(self.input.peek());
2380            self.skip_non_blank();
2381        } else if !is_tag_char(self.input.peek()) {
2382            // Otherwise, check if the first global tag character is valid.
2383            return Err(ScanError::new_str(
2384                *start_mark,
2385                "invalid global tag character",
2386            ));
2387        } else if self.input.peek() == '%' {
2388            // If it is valid and an escape sequence, escape it.
2389            string.push(self.scan_uri_escapes(start_mark)?);
2390        } else {
2391            // Otherwise, push the first character.
2392            string.push(self.input.peek());
2393            self.skip_non_blank();
2394        }
2395
2396        while is_uri_char(self.input.look_ch()) {
2397            if self.input.peek() == '%' {
2398                string.push(self.scan_uri_escapes(start_mark)?);
2399            } else {
2400                string.push(self.input.peek());
2401                self.skip_non_blank();
2402            }
2403        }
2404
2405        Ok(string)
2406    }
2407
2408    /// Scan for a verbatim tag.
2409    ///
2410    /// The prefixing `!<` must _not_ have been skipped.
2411    fn scan_verbatim_tag(&mut self, start_mark: &Marker) -> Result<String, ScanError> {
2412        // Eat `!<`
2413        self.skip_non_blank();
2414        self.skip_non_blank();
2415
2416        let mut string = String::new();
2417        while is_uri_char(self.input.look_ch()) {
2418            if self.input.peek() == '%' {
2419                string.push(self.scan_uri_escapes(start_mark)?);
2420            } else {
2421                string.push(self.input.peek());
2422                self.skip_non_blank();
2423            }
2424        }
2425
2426        if string.is_empty() {
2427            return Err(ScanError::new_str(
2428                *start_mark,
2429                "while parsing a tag, did not find expected tag URI",
2430            ));
2431        }
2432
2433        if self.input.peek() != '>' {
2434            return Err(ScanError::new_str(
2435                *start_mark,
2436                "while scanning a verbatim tag, did not find the expected '>'",
2437            ));
2438        }
2439        self.skip_non_blank();
2440
2441        Ok(string)
2442    }
2443
2444    fn scan_tag_shorthand_suffix(
2445        &mut self,
2446        _directive: bool,
2447        _is_secondary: bool,
2448        head: &str,
2449        mark: &Marker,
2450    ) -> Result<String, ScanError> {
2451        let mut length = head.len();
2452        let mut string = String::new();
2453
2454        // Copy the head if needed.
2455        // Note that we don't copy the leading '!' character.
2456        if length > 1 {
2457            string.extend(head.chars().skip(1));
2458        }
2459
2460        while is_tag_char(self.input.look_ch()) {
2461            // Check if it is a URI-escape sequence.
2462            if self.input.peek() == '%' {
2463                string.push(self.scan_uri_escapes(mark)?);
2464            } else {
2465                string.push(self.input.peek());
2466                self.skip_non_blank();
2467            }
2468
2469            length += 1;
2470        }
2471
2472        if length == 0 {
2473            return Err(ScanError::new_str(
2474                *mark,
2475                "while parsing a tag, did not find expected tag URI",
2476            ));
2477        }
2478
2479        Ok(string)
2480    }
2481
2482    fn scan_uri_escapes(&mut self, mark: &Marker) -> Result<char, ScanError> {
2483        let mut width = 0usize;
2484        let mut bytes = [0u8; 4];
2485        let mut bytes_len = 0usize;
2486        loop {
2487            self.input.lookahead(3);
2488
2489            let c = self.input.peek_nth(1);
2490            let nc = self.input.peek_nth(2);
2491
2492            if !(self.input.peek() == '%' && is_hex(c) && is_hex(nc)) {
2493                return Err(ScanError::new_str(
2494                    *mark,
2495                    "while parsing a tag, found an invalid escape sequence",
2496                ));
2497            }
2498
2499            let byte = u8::try_from((as_hex(c) << 4) + as_hex(nc))
2500                .expect("two hex nibbles always fit in a byte");
2501            if width == 0 {
2502                width = match byte {
2503                    _ if byte & 0x80 == 0x00 => 1,
2504                    _ if byte & 0xE0 == 0xC0 => 2,
2505                    _ if byte & 0xF0 == 0xE0 => 3,
2506                    _ if byte & 0xF8 == 0xF0 => 4,
2507                    _ => {
2508                        return Err(ScanError::new_str(
2509                            *mark,
2510                            "while parsing a tag, found an incorrect leading UTF-8 byte",
2511                        ));
2512                    }
2513                };
2514            } else if byte & 0xc0 != 0x80 {
2515                return Err(ScanError::new_str(
2516                    *mark,
2517                    "while parsing a tag, found an incorrect trailing UTF-8 byte",
2518                ));
2519            }
2520
2521            bytes[bytes_len] = byte;
2522            bytes_len += 1;
2523
2524            self.skip_n_non_blank(3);
2525
2526            width -= 1;
2527            if width == 0 {
2528                break;
2529            }
2530        }
2531
2532        let s = core::str::from_utf8(&bytes[..bytes_len]).map_err(|_| {
2533            ScanError::new_str(
2534                *mark,
2535                "while parsing a tag, found an invalid UTF-8 codepoint",
2536            )
2537        })?;
2538
2539        let mut chars = s.chars();
2540        match (chars.next(), chars.next()) {
2541            (Some(ch), None) => Ok(ch),
2542            _ => Err(ScanError::new_str(
2543                *mark,
2544                "while parsing a tag, found an invalid UTF-8 codepoint",
2545            )),
2546        }
2547    }
2548
2549    fn fetch_anchor(&mut self, alias: bool) -> ScanResult {
2550        self.save_simple_key();
2551        self.disallow_simple_key();
2552
2553        let tok = self.scan_anchor(alias)?;
2554
2555        self.tokens.push_back(tok.into());
2556
2557        Ok(())
2558    }
2559
2560    fn scan_anchor(&mut self, alias: bool) -> Result<Token<'input>, ScanError> {
2561        let start_mark = self.mark;
2562
2563        // Skip `&` / `*`.
2564        self.skip_non_blank();
2565
2566        // Borrow from input when possible.
2567        if let Some(start) = self.input.byte_offset() {
2568            while is_anchor_char(self.input.look_ch()) {
2569                self.skip_non_blank();
2570            }
2571
2572            let end = self
2573                .input
2574                .byte_offset()
2575                .expect("byte_offset() must remain available once enabled");
2576
2577            if start == end {
2578                return Err(ScanError::new_str(start_mark, "while scanning an anchor or alias, did not find expected alphabetic or numeric character"));
2579            }
2580
2581            let cow = if let Some(slice) = self.try_borrow_slice(start, end) {
2582                Cow::Borrowed(slice)
2583            } else if let Some(slice) = self.input.slice_bytes(start, end) {
2584                Cow::Owned(slice.to_owned())
2585            } else {
2586                return Err(ScanError::new_str(
2587                    start_mark,
2588                    "internal error: input advertised slicing but did not provide a slice",
2589                ));
2590            };
2591
2592            let tok = if alias {
2593                TokenType::Alias(cow)
2594            } else {
2595                TokenType::Anchor(cow)
2596            };
2597            return Ok(Token(Span::new(start_mark, self.mark), tok));
2598        }
2599
2600        let mut string = String::new();
2601        while is_anchor_char(self.input.look_ch()) {
2602            string.push(self.input.peek());
2603            self.skip_non_blank();
2604        }
2605
2606        if string.is_empty() {
2607            return Err(ScanError::new_str(start_mark, "while scanning an anchor or alias, did not find expected alphabetic or numeric character"));
2608        }
2609
2610        let tok = if alias {
2611            TokenType::Alias(string.into())
2612        } else {
2613            TokenType::Anchor(string.into())
2614        };
2615        Ok(Token(Span::new(start_mark, self.mark), tok))
2616    }
2617
2618    fn fetch_flow_collection_start(&mut self, tok: TokenType<'input>) -> ScanResult {
2619        // The indicators '[' and '{' may start a simple key.
2620        self.save_simple_key();
2621
2622        let start_mark = self.mark;
2623        let indicator = self.input.peek();
2624        self.flow_markers.push((start_mark, indicator));
2625
2626        self.roll_one_col_indent();
2627        self.increase_flow_level()?;
2628
2629        self.allow_simple_key();
2630
2631        self.skip_non_blank();
2632        let end_mark = self.mark;
2633
2634        if tok == TokenType::FlowMappingStart {
2635            self.flow_mapping_started.push(true);
2636        } else {
2637            self.flow_mapping_started.push(false);
2638            self.implicit_flow_mapping_states
2639                .push(ImplicitMappingState::Possible);
2640        }
2641
2642        let token_index = self.tokens.len();
2643        self.skip_ws_to_eol(SkipTabs::Yes)?;
2644
2645        self.insert_token(token_index, Token(Span::new(start_mark, end_mark), tok));
2646        Ok(())
2647    }
2648
2649    fn fetch_flow_collection_end(&mut self, tok: TokenType<'input>) -> ScanResult {
2650        // A closing bracket without a corresponding opening is invalid YAML.
2651        if self.flow_level == 0 {
2652            return Err(ScanError::new_str(self.mark, "misplaced bracket"));
2653        }
2654
2655        let Some((open_mark, open_ch)) = self.flow_markers.pop() else {
2656            return Err(ScanError::new_str(self.mark, "misplaced bracket"));
2657        };
2658
2659        let (expected_open, actual_close) = match tok {
2660            TokenType::FlowSequenceEnd => ('[', ']'),
2661            TokenType::FlowMappingEnd => ('{', '}'),
2662            _ => unreachable!("flow collection end called with non-closing token"),
2663        };
2664
2665        if open_ch != expected_open {
2666            return Err(ScanError::new(
2667                open_mark,
2668                format!("mismatched bracket '{open_ch}' closed by '{actual_close}'"),
2669            ));
2670        }
2671
2672        let flow_level = self.flow_level;
2673
2674        self.remove_simple_key()?;
2675
2676        if matches!(tok, TokenType::FlowSequenceEnd) {
2677            self.end_implicit_mapping(self.mark, flow_level);
2678            // We are out exiting the flow sequence, nesting goes down 1 level.
2679            self.implicit_flow_mapping_states.pop();
2680        }
2681        self.flow_mapping_started.pop();
2682
2683        self.decrease_flow_level();
2684
2685        self.disallow_simple_key();
2686
2687        let start_mark = self.mark;
2688        self.skip_non_blank();
2689        let end_mark = self.mark;
2690        let token_index = self.tokens.len();
2691        self.skip_ws_to_eol(SkipTabs::Yes)?;
2692
2693        // A flow collection within a flow mapping can be a key. In that case, the value may be
2694        // adjacent to the `:`.
2695        // ```yaml
2696        // - [ {a: b}:value ]
2697        // ```
2698        if self.flow_level > 0 {
2699            self.adjacent_value_allowed_at = self.mark.index();
2700        }
2701
2702        self.insert_token(token_index, Token(Span::new(start_mark, end_mark), tok));
2703        Ok(())
2704    }
2705
2706    /// Push the `FlowEntry` token and skip over the `,`.
2707    fn fetch_flow_entry(&mut self) -> ScanResult {
2708        self.remove_simple_key()?;
2709        self.allow_simple_key();
2710
2711        self.end_implicit_mapping(self.mark, self.flow_level);
2712        if self.current_flow_collection_is_sequence() {
2713            self.set_current_flow_mapping_started(false);
2714        }
2715
2716        let start_mark = self.mark;
2717        self.skip_non_blank();
2718        let end_mark = self.mark;
2719        let token_index = self.tokens.len();
2720        self.skip_ws_to_eol(SkipTabs::Yes)?;
2721
2722        self.insert_token(
2723            token_index,
2724            Token(Span::new(start_mark, end_mark), TokenType::FlowEntry),
2725        );
2726        Ok(())
2727    }
2728
2729    fn increase_flow_level(&mut self) -> ScanResult {
2730        self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0)));
2731        self.flow_level = self
2732            .flow_level
2733            .checked_add(1)
2734            .ok_or_else(|| ScanError::new_str(self.mark, "recursion limit exceeded"))?;
2735        Ok(())
2736    }
2737
2738    fn decrease_flow_level(&mut self) {
2739        if self.flow_level > 0 {
2740            self.flow_level -= 1;
2741            self.simple_keys.pop().unwrap();
2742        }
2743    }
2744
2745    /// Push the `Block*` token(s) and skip over the `-`.
2746    ///
2747    /// Add an indentation level and push a `BlockSequenceStart` token if needed, then push a
2748    /// `BlockEntry` token.
2749    /// This function only skips over the `-` and does not fetch the entry value.
2750    fn fetch_block_entry(&mut self) -> ScanResult {
2751        if self.flow_level > 0 {
2752            // - * only allowed in block
2753            return Err(ScanError::new_str(
2754                self.mark,
2755                r#""-" is only valid inside a block"#,
2756            ));
2757        }
2758        // Check if we are allowed to start a new entry.
2759        if !self.simple_key_allowed {
2760            return Err(ScanError::new_str(
2761                self.mark,
2762                "block sequence entries are not allowed in this context",
2763            ));
2764        }
2765
2766        // Skip over the `-`.
2767        let mark = self.mark;
2768        self.skip_non_blank();
2769
2770        // generate BLOCK-SEQUENCE-START if indented
2771        self.roll_indent(mark.col, None, TokenType::BlockSequenceStart, mark);
2772        let token_index = self.tokens.len();
2773        let found_tabs = self.skip_ws_to_eol(SkipTabs::Yes)?.found_tabs();
2774        self.input.lookahead(2);
2775        if found_tabs && self.input.next_char_is('-') && is_blank_or_breakz(self.input.peek_nth(1))
2776        {
2777            return Err(ScanError::new_str(
2778                self.mark,
2779                "'-' must be followed by a valid YAML whitespace",
2780            ));
2781        }
2782
2783        self.skip_ws_to_eol(SkipTabs::No)?;
2784        self.input.lookahead(1);
2785        if self.input.next_is_break() || self.input.next_is_flow() {
2786            self.roll_one_col_indent();
2787        }
2788
2789        self.remove_simple_key()?;
2790        self.allow_simple_key();
2791
2792        self.insert_token(
2793            token_index,
2794            Token(Span::empty(self.mark), TokenType::BlockEntry),
2795        );
2796
2797        Ok(())
2798    }
2799
2800    fn fetch_document_indicator(&mut self, t: TokenType<'input>) -> ScanResult {
2801        if let Some((mark, bracket)) = self.flow_markers.pop() {
2802            return Err(ScanError::new(
2803                mark,
2804                format!("unclosed bracket '{bracket}'"),
2805            ));
2806        }
2807
2808        self.unroll_indent(-1);
2809        self.remove_simple_key()?;
2810        self.disallow_simple_key();
2811
2812        let mark = self.mark;
2813
2814        self.skip_n_non_blank(3);
2815
2816        self.document_prefix_allowed = matches!(t, TokenType::DocumentEnd);
2817        self.tokens
2818            .push_back(Token(Span::new(mark, self.mark), t).into());
2819        Ok(())
2820    }
2821
2822    fn fetch_block_scalar(&mut self, literal: bool) -> ScanResult {
2823        self.save_simple_key();
2824        self.allow_simple_key();
2825        let tok = self.scan_block_scalar(literal)?;
2826
2827        self.tokens.push_back(tok.into());
2828        Ok(())
2829    }
2830
    /// Scan a block scalar (`|` literal when `literal` is true, `>` folded otherwise) and return
    /// its `Scalar` token.
    ///
    /// The scanner must be positioned on the `|` / `>` indicator. The function proceeds in
    /// these steps:
    ///
    /// 1. Parse the header: an optional chomping indicator (`+` keep, `-` strip, clip by
    ///    default) and an optional indentation indicator (a digit other than `0`), in either
    ///    order.
    /// 2. Require the rest of the header line to end with a line break or the end of input.
    /// 3. Determine the content indentation, either from the explicit indicator or from the
    ///    leading lines of the content.
    /// 4. Read the content lines, folding line breaks for the folded style.
    /// 5. Apply chomping to the trailing line breaks.
    ///
    /// The span of the returned token carries the content indentation only when the scalar
    /// contains something other than whitespace.
    ///
    /// # Errors
    /// Returns a `ScanError` if the indentation indicator is `0`, if the header is followed by
    /// anything other than what `skip_ws_to_eol` accepts before a line break, if the content
    /// starts with a tab, if a content line is wrongly indented, or if `skip_ws_to_eol` fails.
    #[allow(clippy::too_many_lines)]
    fn scan_block_scalar(&mut self, literal: bool) -> Result<Token<'input>, ScanError> {
        // Position of the `|` / `>` indicator; used for header errors.
        let start_mark = self.mark;
        let mut chomping = Chomping::Clip;
        // Explicit indentation indicator from the header; 0 means "none given".
        let mut increment: usize = 0;
        // Column at which content lines start; 0 means "not determined yet".
        let mut indent: usize = 0;
        // Whether the line currently being read starts with a blank.
        let mut trailing_blank: bool;
        // Whether the previously read line started with a blank.
        let mut leading_blank: bool = false;
        let style = if literal {
            ScalarStyle::Literal
        } else {
            ScalarStyle::Folded
        };

        let mut string = String::new();
        // The break that ended the last content line read.
        let mut leading_break = String::new();
        // Breaks of the empty lines that followed `leading_break`.
        let mut trailing_breaks = String::new();
        // The break that ended the header line.
        let mut chomping_break = String::new();

        // skip '|' or '>'
        self.skip_non_blank();
        self.unroll_non_block_indents();

        // Header form 1: chomping indicator first, optionally followed by an indentation digit.
        if self.input.look_ch() == '+' || self.input.peek() == '-' {
            if self.input.peek() == '+' {
                chomping = Chomping::Keep;
            } else {
                chomping = Chomping::Strip;
            }
            self.skip_non_blank();
            self.input.lookahead(1);
            if self.input.next_is_digit() {
                if self.input.peek() == '0' {
                    return Err(ScanError::new_str(
                        start_mark,
                        "while scanning a block scalar, found an indentation indicator equal to 0",
                    ));
                }
                increment = (self.input.peek() as usize) - ('0' as usize);
                self.skip_non_blank();
            }
        // Header form 2: indentation digit first, optionally followed by a chomping indicator.
        } else if self.input.next_is_digit() {
            if self.input.peek() == '0' {
                return Err(ScanError::new_str(
                    start_mark,
                    "while scanning a block scalar, found an indentation indicator equal to 0",
                ));
            }

            increment = (self.input.peek() as usize) - ('0' as usize);
            self.skip_non_blank();
            self.input.lookahead(1);
            if self.input.peek() == '+' || self.input.peek() == '-' {
                if self.input.peek() == '+' {
                    chomping = Chomping::Keep;
                } else {
                    chomping = Chomping::Strip;
                }
                self.skip_non_blank();
            }
        }

        self.skip_ws_to_eol(SkipTabs::Yes)?;

        // Check if we are at the end of the line.
        self.input.lookahead(1);
        if !self.input.next_is_breakz() {
            return Err(ScanError::new_str(
                start_mark,
                "while scanning a block scalar, did not find expected comment or line break",
            ));
        }

        // Consume the break that ends the header line, keeping it aside for the EOF case below.
        if self.input.next_is_break() {
            self.input.lookahead(2);
            self.read_break(&mut chomping_break);
        }

        if self.input.look_ch() == '\t' {
            return Err(ScanError::new_str(
                start_mark,
                "a block scalar content cannot start with a tab",
            ));
        }

        // An explicit indentation indicator is relative to the current block indentation, or
        // used as-is when that indentation is negative.
        if increment > 0 {
            indent = if self.indent >= 0 {
                (self.indent + increment as isize) as usize
            } else {
                increment
            }
        }

        // Scan the leading line breaks and determine the indentation level if needed.
        if indent == 0 {
            self.skip_block_scalar_first_line_indent(&mut indent, &mut trailing_breaks);
        } else {
            self.skip_block_scalar_indent(indent, &mut trailing_breaks);
        }

        // We have an end-of-stream with no content, e.g.:
        // ```yaml
        // - |+
        // ```
        if self.input.next_is_z() {
            let contents = match chomping {
                // We strip trailing line breaks. Nothing remains.
                Chomping::Strip => String::new(),
                // There was no newline after the chomping indicator.
                _ if self.mark.line == start_mark.line() => String::new(),
                // With no content lines, the header break is not scalar content.
                Chomping::Clip => String::new(),
                // An indented whitespace-only line at EOF is an empty content line.
                Chomping::Keep if trailing_breaks.is_empty() && self.mark.col > 0 => chomping_break,
                // Keep actual empty content lines, if any, but not the header break.
                Chomping::Keep => trailing_breaks,
            };

            // Only attach the indentation to the span when there is non-whitespace content.
            let span = if contents.trim().is_empty() {
                Span::new(start_mark, self.mark)
            } else {
                Span::new(start_mark, self.mark).with_indent(Some(indent))
            };

            return Ok(Token(span, TokenType::Scalar(style, contents.into())));
        }

        // The next line is less indented than the scalar content but more indented than the
        // enclosing block: it belongs to neither, except in the root-level case handled below.
        if self.mark.col < indent && (self.mark.col as isize) > self.indent {
            if self.indent < 0 && self.mark.col == 0 {
                self.input.lookahead(4);
                if self.input.next_is_document_start()
                    || self.input.next_is_document_end()
                    || self.input.peek() == '#'
                {
                    // At the root level, an explicit indentation indicator can still yield an
                    // empty scalar when the next line is a document marker or comment.
                    // In this case, the scalar is terminated rather than under-indented.
                } else {
                    return Err(ScanError::new_str(
                        self.mark,
                        "wrongly indented line in block scalar",
                    ));
                }
            } else {
                return Err(ScanError::new_str(
                    self.mark,
                    "wrongly indented line in block scalar",
                ));
            }
        }

        let mut line_buffer = String::with_capacity(100);
        // `start_mark` is rebound here: from now on the token span starts at the current
        // position (after the header and the leading indentation), not at the indicator.
        let start_mark = self.mark;
        while self.mark.col == indent && !self.input.next_is_z() {
            // With a zero indentation, a document end marker at column 0 ends the scalar.
            if indent == 0 {
                self.input.lookahead(4);
                if self.input.next_is_document_end() {
                    break;
                }
            }

            // We are at the first content character of a content line.
            trailing_blank = self.input.next_is_blank();
            // Folded style only: when neither the previous nor the current line starts with a
            // blank, the break between them is dropped. It is replaced by the breaks of the
            // empty lines in between, or by a single space when there were none.
            if !literal && !leading_break.is_empty() && !leading_blank && !trailing_blank {
                string.push_str(&trailing_breaks);
                if trailing_breaks.is_empty() {
                    string.push(' ');
                }
            } else {
                // Otherwise every break is kept as-is.
                string.push_str(&leading_break);
                string.push_str(&trailing_breaks);
            }

            leading_break.clear();
            trailing_breaks.clear();

            leading_blank = self.input.next_is_blank();

            self.scan_block_scalar_content_line(&mut string, &mut line_buffer);

            // break on EOF
            self.input.lookahead(2);
            if self.input.next_is_z() {
                break;
            }

            self.read_break(&mut leading_break);

            // Eat the following indentation spaces and line breaks.
            self.skip_block_scalar_indent(indent, &mut trailing_breaks);
        }

        // Chomp the tail.
        if chomping != Chomping::Strip {
            string.push_str(&leading_break);
            // If we had reached an eof but the last character wasn't an end-of-line, check if the
            // last line was indented at least as the rest of the scalar, then we need to consider
            // there is a newline.
            if self.input.next_is_z() && self.mark.col >= indent.max(1) {
                string.push('\n');
            }
        }

        // Only the "keep" chomping mode preserves the trailing empty lines.
        if chomping == Chomping::Keep {
            string.push_str(&trailing_breaks);
        }

        // Only attach the indentation to the span when there is non-whitespace content.
        let span = if string.trim().is_empty() {
            Span::new(start_mark, self.mark)
        } else {
            Span::new(start_mark, self.mark).with_indent(Some(indent))
        };

        Ok(Token(span, TokenType::Scalar(style, string.into())))
    }
3046
3047    /// Retrieve the contents of the line, parsing it as a block scalar.
3048    ///
3049    /// The contents will be appended to `string`. `line_buffer` is used as a temporary buffer to
3050    /// store bytes before pushing them to `string` and thus avoiding reallocating more than
3051    /// necessary. `line_buffer` is assumed to be empty upon calling this function. It will be
3052    /// `clear`ed before the end of the function.
3053    ///
3054    /// This function assumes the first character to read is the first content character in the
3055    /// line. This function does not consume the line break character(s) after the line.
3056    fn scan_block_scalar_content_line(&mut self, string: &mut String, line_buffer: &mut String) {
3057        // Start by evaluating characters in the buffer.
3058        while !self.input.buf_is_empty() && !self.input.next_is_breakz() {
3059            string.push(self.input.peek());
3060            // We may technically skip non-blank characters. However, the only distinction is
3061            // to determine what is leading whitespace and what is not. Here, we read the
3062            // contents of the line until either EOF or a line break. We know we will not read
3063            // `self.leading_whitespace` until the end of the line, where it will be reset.
3064            // This allows us to call a slightly less expensive function.
3065            self.skip_blank();
3066        }
3067
3068        // All characters that were in the buffer were consumed. We need to check if more
3069        // follow.
3070        if self.input.buf_is_empty() {
3071            // We will read all consecutive non-breakz characters. We push them into a
3072            // temporary buffer. The main difference with going through `self.buffer` is that
3073            // characters are appended here as their real size (1B for ASCII, or up to 4 bytes for
3074            // UTF-8). We can then use the internal `line_buffer` `Vec` to push data into `string`
3075            // (using `String::push_str`).
3076
3077            // line_buffer is empty at this point so we can compute n_chars here as well
3078            let mut n_chars = 0;
3079            debug_assert!(line_buffer.is_empty());
3080            while let Some(c) = self.input.raw_read_non_breakz_ch() {
3081                line_buffer.push(c);
3082                n_chars += 1;
3083            }
3084
3085            // We need to manually update our position; we haven't called a `skip` function.
3086            self.mark.col += n_chars;
3087            self.mark.offsets.chars += n_chars;
3088            self.mark.offsets.bytes = self.input.byte_offset();
3089
3090            // We can now append our bytes to our `string`.
3091            string.reserve(line_buffer.len());
3092            string.push_str(line_buffer);
3093            // This clears the _contents_ without touching the _capacity_.
3094            line_buffer.clear();
3095        }
3096    }
3097
3098    /// Skip the block scalar indentation and empty lines.
3099    fn skip_block_scalar_indent(&mut self, indent: usize, breaks: &mut String) {
3100        loop {
3101            // Consume all spaces. Tabs cannot be used as indentation.
3102            if indent < self.input.bufmaxlen().saturating_sub(2) {
3103                self.input.lookahead(self.input.bufmaxlen());
3104                while self.mark.col < indent && self.input.peek() == ' ' {
3105                    self.skip_blank();
3106                }
3107            } else {
3108                loop {
3109                    self.input.lookahead(self.input.bufmaxlen());
3110                    while !self.input.buf_is_empty()
3111                        && self.mark.col < indent
3112                        && self.input.peek() == ' '
3113                    {
3114                        self.skip_blank();
3115                    }
3116                    // If we reached our indent, we can break. We must also break if we have
3117                    // reached content or EOF; that is, the buffer is not empty and the next
3118                    // character is not a space.
3119                    if self.mark.col == indent
3120                        || (!self.input.buf_is_empty() && self.input.peek() != ' ')
3121                    {
3122                        break;
3123                    }
3124                }
3125                self.input.lookahead(2);
3126            }
3127
3128            // If our current line is empty, skip over the break and continue looping.
3129            if self.input.next_is_break() {
3130                self.read_break(breaks);
3131            } else {
3132                // Otherwise, we have a content line. Return control.
3133                break;
3134            }
3135        }
3136    }
3137
3138    /// Determine the indentation level for a block scalar from the first line of its contents.
3139    ///
3140    /// The function skips over whitespace-only lines and sets `indent` to the longest
3141    /// whitespace line that was encountered.
3142    fn skip_block_scalar_first_line_indent(&mut self, indent: &mut usize, breaks: &mut String) {
3143        let mut max_indent = 0;
3144        loop {
3145            // Consume all spaces. Tabs cannot be used as indentation.
3146            while self.input.look_ch() == ' ' {
3147                self.skip_blank();
3148            }
3149
3150            if self.mark.col > max_indent {
3151                max_indent = self.mark.col;
3152            }
3153
3154            if self.input.next_is_break() {
3155                // If our current line is empty, skip over the break and continue looping.
3156                self.input.lookahead(2);
3157                self.read_break(breaks);
3158            } else {
3159                // Otherwise, we have a content line. Return control.
3160                break;
3161            }
3162        }
3163
3164        // In case a YAML document looks like:
3165        // ```yaml
3166        // |
3167        // foo
3168        // bar
3169        // ```
3170        // We need to set the indent to 0 and not 1. In all other cases, the indent must be at
3171        // least 1. When in the above example, `self.indent` will be set to -1.
3172        *indent = max_indent.max((self.indent + 1) as usize);
3173    }
3174
3175    fn fetch_flow_scalar(&mut self, single: bool) -> ScanResult {
3176        self.save_simple_key();
3177        self.disallow_simple_key();
3178
3179        let token_index = self.tokens.len();
3180        let tok = self.scan_flow_scalar(single)?;
3181
3182        // From spec: To ensure JSON compatibility, if a key inside a flow mapping is JSON-like,
3183        // YAML allows the following value to be specified adjacent to the “:”.
3184        if self.skip_to_next_token(true)? {
3185            self.adjacent_value_allowed_at = usize::MAX;
3186        } else {
3187            self.adjacent_value_allowed_at = self.mark.index();
3188        }
3189
3190        self.insert_token(token_index, tok);
3191        Ok(())
3192    }
3193
    /// Scan a single- or double-quoted scalar whose opening quote is the current character.
    ///
    /// `single` selects the delimiter: `'` when `true`, `"` when `false`.
    ///
    /// The contents are accumulated in a `FlowScalarBuf`. When the input reports byte offsets
    /// the buffer starts in borrowed mode (a byte range of the input) and is promoted to owned
    /// storage through `promote_flow_scalar_buf_to_owned` as soon as the output has to differ
    /// from the raw input (blanks dropped before a line break, folded or preserved line breaks).
    /// Otherwise it is owned from the start.
    ///
    /// The returned token spans from the opening quote to just past the closing quote.
    ///
    /// # Errors
    /// Return an error if:
    ///   - a document indicator is found at column 0 inside the scalar,
    ///   - the input ends before the closing quote,
    ///   - a tab is used as indentation on a continuation line,
    ///   - a continuation line is not indented deeper than the current block indent,
    ///   - invalid content follows the closing quote,
    ///   - the input advertised byte offsets but cannot provide the matching slice.
    ///
    /// Errors from the helpers called here (escape resolution, promotion, whitespace skipping)
    /// are propagated.
    #[allow(clippy::too_many_lines)]
    fn scan_flow_scalar(&mut self, single: bool) -> Result<Token<'input>, ScanError> {
        let start_mark = self.mark;

        // Output scalar contents. In borrowed mode the range starts right after the opening
        // quote, hence the current offset plus the byte length of the current character.
        let mut buf = match self.input.byte_offset() {
            Some(off) => FlowScalarBuf::new_borrowed(off + self.input.peek().len_utf8()),
            None => FlowScalarBuf::new_owned(),
        };

        // Scratch used to consume the *first* line break in a break run without emitting it.
        // (The first break folds to ' ' or to nothing depending on escaping rules.)
        let mut break_scratch = String::new();

        // Eat the left quote.
        self.skip_non_blank();

        loop {
            // Check for a document indicator.
            self.input.lookahead(4);

            if self.mark.col == 0 && self.input.next_is_document_indicator() {
                return Err(ScanError::new_str(
                    start_mark,
                    "while scanning a quoted scalar, found unexpected document indicator",
                ));
            }

            // Reaching the end of the input here means the closing quote is missing.
            if self.input.next_is_z() {
                return Err(ScanError::new_str(start_mark, "unclosed quote"));
            }

            // Do not enforce block indentation inside quoted (flow) scalars.
            // YAML allows line breaks within quoted scalars.
            //
            // `leading_blanks` is set by the helper below when it consumes an escaped line
            // break, and by the whitespace loop once a first line break has been consumed.
            let mut leading_blanks = false;
            self.consume_flow_scalar_non_whitespace_chars(
                single,
                &mut buf,
                &mut leading_blanks,
                &start_mark,
            )?;

            // Stop at the closing quote; it is consumed after the loop.
            match self.input.look_ch() {
                '\'' if single => break,
                '"' if !single => break,
                _ => {}
            }

            // --- Faster whitespace / line break handling (no temporary Strings) ---
            //
            // Instead of:
            //   - collecting blanks into `whitespaces` and then copying them
            //   - collecting breaks into `leading_break` / `trailing_breaks` and then copying
            //
            // We do:
            //   - append trailing blanks directly to `string`, remember where they started,
            //     and truncate them if a line break follows.
            //   - for line breaks: consume the first break into a scratch (discarded),
            //     append subsequent breaks directly to `string`.
            //
            // These flags replace temporary-string emptiness checks:
            //   has_leading_break  <=> !leading_break.is_empty()
            //   has_trailing_breaks <=> !trailing_breaks.is_empty()
            //
            // For the owned path: length of the output before the current run of blanks.
            let mut trailing_ws_start: Option<usize> = None;
            let mut has_leading_break = false;
            let mut has_trailing_breaks = false;

            // For the borrowed path: track the (byte) start of a pending whitespace run.
            let mut pending_ws_start: Option<usize> = None;

            // Consume blank characters.
            while self.input.next_is_blank() || self.input.next_is_break() {
                if self.input.next_is_blank() {
                    // Consume a space or a tab character.
                    if leading_blanks {
                        // Blanks at the start of a continuation line are never content, but a
                        // tab left of the current indent is rejected.
                        if self.input.peek() == '\t' && (self.mark.col as isize) < self.indent {
                            return Err(ScanError::new_str(
                                self.mark,
                                "tab cannot be used as indentation",
                            ));
                        }
                        self.skip_blank();
                    } else {
                        // Append to output immediately; if a break appears next, we'll truncate.
                        match buf {
                            FlowScalarBuf::Owned(ref mut string) => {
                                if trailing_ws_start.is_none() {
                                    trailing_ws_start = Some(string.len());
                                }
                                string.push(self.input.peek());
                            }
                            FlowScalarBuf::Borrowed { .. } => {
                                if pending_ws_start.is_none() {
                                    pending_ws_start = self.input.byte_offset();
                                }
                            }
                        }
                        self.skip_blank();

                        // Borrowed path: record the byte range of the run seen so far; it is
                        // committed or discarded later depending on what follows.
                        if let (FlowScalarBuf::Borrowed { .. }, Some(ws_start), Some(ws_end)) =
                            (&mut buf, pending_ws_start, self.input.byte_offset())
                        {
                            buf.note_pending_ws(ws_start, ws_end);
                        }
                    }
                } else {
                    self.input.lookahead(2);

                    // Check if it is a first line break.
                    if leading_blanks {
                        // Second+ line break in a run: preserve it.
                        match buf {
                            FlowScalarBuf::Owned(ref mut string) => self.read_break(string),
                            FlowScalarBuf::Borrowed { .. } => {
                                self.promote_flow_scalar_buf_to_owned(&start_mark, &mut buf)?;
                                let Some(string) = buf.as_owned_mut() else {
                                    unreachable!()
                                };
                                self.read_break(string);
                            }
                        }
                        has_trailing_breaks = true;
                    } else {
                        // First break: drop any trailing blanks we appended, then consume the break.
                        if let Some(pos) = trailing_ws_start.take() {
                            if let FlowScalarBuf::Owned(ref mut string) = buf {
                                string.truncate(pos);
                            }
                        }

                        if pending_ws_start.take().is_some() {
                            // Trailing blanks before a break are discarded => transformation.
                            if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
                                self.promote_flow_scalar_buf_to_owned(&start_mark, &mut buf)?;
                            }
                            buf.discard_pending_ws();
                        } else {
                            buf.commit_pending_ws();
                        }

                        break_scratch.clear();
                        self.read_break(&mut break_scratch);
                        // Keep `break_scratch` content (ignored) until next clear; no need to clear twice.

                        has_leading_break = true;
                        leading_blanks = true;
                    }
                }

                self.input.lookahead(1);
            }

            // If we had a line break inside a quoted (flow) scalar, validate indentation
            // of the continuation line in block context.
            if leading_blanks && has_leading_break && self.flow_level == 0 {
                let next_ch = self.input.peek();
                // A line holding only the closing quote is exempt from the check.
                let is_closing_quote = (single && next_ch == '\'') || (!single && next_ch == '"');
                if !is_closing_quote && (self.mark.col as isize) <= self.indent {
                    return Err(ScanError::new_str(
                        self.mark,
                        "invalid indentation in multiline quoted scalar",
                    ));
                }
            }

            // Join the whitespace or fold line breaks.
            if leading_blanks {
                // Folding rule:
                //   if there was no leading break, preserve the pending whitespace already emitted
                //   if there was a leading break but no trailing breaks, fold to one space
                //   otherwise, preserve the trailing breaks already emitted
                if has_leading_break && !has_trailing_breaks {
                    match buf {
                        FlowScalarBuf::Owned(ref mut string) => string.push(' '),
                        FlowScalarBuf::Borrowed { .. } => {
                            // The folded space is not in the input at this position, so the
                            // contents can no longer be a plain slice of it.
                            self.promote_flow_scalar_buf_to_owned(&start_mark, &mut buf)?;
                            let Some(string) = buf.as_owned_mut() else {
                                unreachable!()
                            };
                            string.push(' ');
                        }
                    }
                }
            }
            // else: trailing blanks are already appended to `string`
        } // loop

        // Eat the right quote.
        self.skip_non_blank();
        let end_mark = self.mark;

        // Ensure there is no invalid trailing content.
        self.skip_ws_to_eol(SkipTabs::Yes)?;
        match self.input.peek() {
            // These can be encountered in flow sequences or mappings.
            ',' | '}' | ']' if self.flow_level > 0 => {}
            // An end-of-line / end-of-stream is fine. No trailing content.
            c if is_breakz(c) => {}
            // ':' can be encountered if our scalar is a key.
            // Outside of flow contexts, keys cannot span multiple lines
            ':' if self.flow_level == 0 && start_mark.line == self.mark.line => {}
            // Inside a flow context, this is allowed.
            ':' if self.flow_level > 0 => {}
            _ => {
                let message = if single {
                    "invalid trailing content after single-quoted scalar"
                } else {
                    "invalid trailing content after double-quoted scalar"
                };
                return Err(ScanError::new_str(self.mark, message));
            }
        }

        let style = if single {
            ScalarStyle::SingleQuoted
        } else {
            ScalarStyle::DoubleQuoted
        };

        // Turn the buffer into the token contents: owned text as is, a borrowed range as a
        // slice of the input when possible and as a copy of that range otherwise.
        let contents = match buf {
            FlowScalarBuf::Owned(string) => Cow::Owned(string),
            FlowScalarBuf::Borrowed {
                start,
                mut end,
                pending_ws_start,
                pending_ws_end,
            } => {
                // If we ended after a whitespace run, it is part of the output (no break followed).
                if pending_ws_start.is_some() {
                    end = pending_ws_end;
                }
                if let Some(slice) = self.try_borrow_slice(start, end) {
                    Cow::Borrowed(slice)
                } else {
                    let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
                        ScanError::new_str(
                            start_mark,
                            "internal error: input advertised offsets but did not provide a slice",
                        )
                    })?;
                    Cow::Owned(slice.to_owned())
                }
            }
        };

        Ok(Token(
            Span::new(start_mark, end_mark),
            TokenType::Scalar(style, contents),
        ))
    }
3444
3445    /// Consume successive non-whitespace characters from a flow scalar.
3446    ///
3447    /// This function resolves escape sequences and stops upon encountering a whitespace, the end
3448    /// of the stream or the closing character for the scalar (`'` for single quoted scalars, `"`
3449    /// for double quoted scalars).
3450    ///
3451    /// # Errors
3452    /// Return an error if an invalid escape sequence is found.
3453    fn consume_flow_scalar_non_whitespace_chars(
3454        &mut self,
3455        single: bool,
3456        buf: &mut FlowScalarBuf,
3457        leading_blanks: &mut bool,
3458        start_mark: &Marker,
3459    ) -> Result<(), ScanError> {
3460        self.input.lookahead(2);
3461        while !is_blank_or_breakz(self.input.peek()) {
3462            match self.input.peek() {
3463                // Check for an escaped single quote.
3464                '\'' if self.input.peek_nth(1) == '\'' && single => {
3465                    if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
3466                        buf.commit_pending_ws();
3467                        self.promote_flow_scalar_buf_to_owned(start_mark, buf)?;
3468                    }
3469                    let Some(string) = buf.as_owned_mut() else {
3470                        unreachable!()
3471                    };
3472                    string.push('\'');
3473                    self.skip_n_non_blank(2);
3474                }
3475                // Check for the right quote.
3476                '\'' if single => break,
3477                '"' if !single => break,
3478                // Check for an escaped line break.
3479                '\\' if !single && is_break(self.input.peek_nth(1)) => {
3480                    self.input.lookahead(3);
3481                    if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
3482                        buf.commit_pending_ws();
3483                        self.promote_flow_scalar_buf_to_owned(start_mark, buf)?;
3484                    }
3485                    self.skip_non_blank();
3486                    self.skip_linebreak();
3487                    *leading_blanks = true;
3488                    break;
3489                }
3490                // Check for an escape sequence.
3491                '\\' if !single => {
3492                    if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
3493                        buf.commit_pending_ws();
3494                        self.promote_flow_scalar_buf_to_owned(start_mark, buf)?;
3495                    }
3496                    let Some(string) = buf.as_owned_mut() else {
3497                        unreachable!()
3498                    };
3499                    string.push(self.resolve_flow_scalar_escape_sequence(start_mark)?);
3500                }
3501                c => {
3502                    match buf {
3503                        FlowScalarBuf::Owned(ref mut string) => {
3504                            string.push(c);
3505                        }
3506                        FlowScalarBuf::Borrowed { .. } => {
3507                            buf.commit_pending_ws();
3508                        }
3509                    }
3510                    self.skip_non_blank();
3511
3512                    if let Some(new_end) = self.input.byte_offset() {
3513                        if let FlowScalarBuf::Borrowed { end, .. } = buf {
3514                            *end = new_end;
3515                        }
3516                    }
3517                }
3518            }
3519            self.input.lookahead(2);
3520        }
3521        Ok(())
3522    }
3523
3524    /// Escape the sequence we encounter in a flow scalar.
3525    ///
3526    /// `self.input.peek()` must point to the `\` starting the escape sequence.
3527    ///
3528    /// # Errors
3529    /// Return an error if an invalid escape sequence is found.
3530    fn resolve_flow_scalar_escape_sequence(
3531        &mut self,
3532        start_mark: &Marker,
3533    ) -> Result<char, ScanError> {
3534        let mut code_length = 0usize;
3535        let mut ret = '\0';
3536
3537        match self.input.peek_nth(1) {
3538            '0' => ret = '\0',
3539            'a' => ret = '\x07',
3540            'b' => ret = '\x08',
3541            't' | '\t' => ret = '\t',
3542            'n' => ret = '\n',
3543            'v' => ret = '\x0b',
3544            'f' => ret = '\x0c',
3545            'r' => ret = '\x0d',
3546            'e' => ret = '\x1b',
3547            ' ' => ret = '\x20',
3548            '"' => ret = '"',
3549            '/' => ret = '/',
3550            '\\' => ret = '\\',
3551            // Unicode next line (#x85)
3552            'N' => ret = char::from_u32(0x85).unwrap(),
3553            // Unicode non-breaking space (#xA0)
3554            '_' => ret = char::from_u32(0xA0).unwrap(),
3555            // Unicode line separator (#x2028)
3556            'L' => ret = char::from_u32(0x2028).unwrap(),
3557            // Unicode paragraph separator (#x2029)
3558            'P' => ret = char::from_u32(0x2029).unwrap(),
3559            'x' => code_length = 2,
3560            'u' => code_length = 4,
3561            'U' => code_length = 8,
3562            _ => {
3563                return Err(ScanError::new_str(
3564                    *start_mark,
3565                    "while parsing a quoted scalar, found unknown escape character",
3566                ))
3567            }
3568        }
3569        self.skip_n_non_blank(2);
3570
3571        // Consume an arbitrary escape code.
3572        if code_length > 0 {
3573            self.input.lookahead(code_length);
3574            let mut value = 0u32;
3575            for i in 0..code_length {
3576                let c = self.input.peek_nth(i);
3577                if !is_hex(c) {
3578                    return Err(ScanError::new_str(
3579                        *start_mark,
3580                        "while parsing a quoted scalar, did not find expected hexadecimal number",
3581                    ));
3582                }
3583                value = (value << 4) + as_hex(c);
3584            }
3585
3586            self.skip_n_non_blank(code_length);
3587
3588            // Handle JSON surrogate pairs: high surrogate followed by low surrogate
3589            if code_length == 4 && (0xD800..=0xDBFF).contains(&value) {
3590                self.input.lookahead(2);
3591                if self.input.peek() == '\\' && self.input.peek_nth(1) == 'u' {
3592                    self.skip_n_non_blank(2);
3593                    self.input.lookahead(4);
3594                    let mut low_value = 0u32;
3595                    for i in 0..4 {
3596                        let c = self.input.peek_nth(i);
3597                        if !is_hex(c) {
3598                            return Err(ScanError::new_str(
3599                                *start_mark,
3600                                "while parsing a quoted scalar, did not find expected hexadecimal number for low surrogate",
3601                            ));
3602                        }
3603                        low_value = (low_value << 4) + as_hex(c);
3604                    }
3605                    if (0xDC00..=0xDFFF).contains(&low_value) {
3606                        value = 0x10000 + (((value - 0xD800) << 10) | (low_value - 0xDC00));
3607                        self.skip_n_non_blank(4);
3608                    } else {
3609                        return Err(ScanError::new_str(
3610                            *start_mark,
3611                            "while parsing a quoted scalar, found invalid low surrogate",
3612                        ));
3613                    }
3614                } else {
3615                    return Err(ScanError::new_str(
3616                        *start_mark,
3617                        "while parsing a quoted scalar, found high surrogate without following low surrogate",
3618                    ));
3619                }
3620            } else if code_length == 4 && (0xDC00..=0xDFFF).contains(&value) {
3621                return Err(ScanError::new_str(
3622                    *start_mark,
3623                    "while parsing a quoted scalar, found unpaired low surrogate",
3624                ));
3625            }
3626
3627            let Some(ch) = char::from_u32(value) else {
3628                return Err(ScanError::new_str(
3629                    *start_mark,
3630                    "while parsing a quoted scalar, found invalid Unicode character escape code",
3631                ));
3632            };
3633            ret = ch;
3634        }
3635        Ok(ret)
3636    }
3637
3638    fn fetch_plain_scalar(&mut self) -> ScanResult {
3639        self.save_simple_key();
3640        self.disallow_simple_key();
3641
3642        let token_index = self.tokens.len();
3643        let tok = self.scan_plain_scalar()?;
3644
3645        self.insert_token(token_index, tok);
3646        Ok(())
3647    }
3648
    /// Scan for a plain scalar.
    ///
    /// Plain scalars are the most readable but restricted style. They may span multiple lines in
    /// some contexts.
    ///
    /// The scalar text is built in a local `String`. Blanks and line breaks found between
    /// chunks of content are parked in `buf_whitespaces`, `buf_leading_break` and
    /// `buf_trailing_breaks`, and are only written out (as is, or folded) once more content
    /// follows. The token borrows from the input when the scanned text is identical to the
    /// input slice between the start and end marks, and owns its text otherwise.
    ///
    /// # Errors
    /// Return an error if:
    ///   - the scalar starts left of the required indentation inside a flow construct,
    ///   - inside a flow construct, a `-` is directly followed by a flow indicator,
    ///   - a tab is found in the indentation of a continuation line that is not empty,
    ///   - no character could be consumed at all.
    ///
    /// Errors from `skip_ws_to_eol` are propagated.
    #[allow(clippy::too_many_lines)]
    fn scan_plain_scalar(&mut self) -> Result<Token<'input>, ScanError> {
        self.unroll_non_block_indents();
        // Minimum column for this scalar's lines: one past the current indent.
        let indent = self.indent + 1;
        let start_mark = self.mark;

        if self.flow_level > 0 && (start_mark.col as isize) < indent {
            return Err(ScanError::new_str(
                start_mark,
                "invalid indentation in flow construct",
            ));
        }

        let mut string = String::with_capacity(32);
        self.buf_whitespaces.clear();
        self.buf_leading_break.clear();
        self.buf_trailing_breaks.clear();
        // Position right after the last content character; trailing blanks and breaks are
        // not part of the token span.
        let mut end_mark = self.mark;

        loop {
            self.input.lookahead(4);
            // A document indicator at column 0 or a `#` ends the scalar.
            if (self.mark.col == 0 && self.input.next_is_document_indicator())
                || self.input.peek() == '#'
            {
                // BS4K: If a `#` starts a comment after some separation spaces following content
                // of a plain scalar in block context, and there is potential continuation on the
                // next line, this is invalid. We cannot decide yet if there will be continuation,
                // so record that a comment interrupted a plain scalar.
                if self.input.peek() == '#'
                    && !string.is_empty()
                    && !self.buf_whitespaces.is_empty()
                    && self.flow_level == 0
                {
                    self.interrupted_plain_by_comment = Some(self.mark);
                }
                break;
            }

            if self.flow_level > 0 && self.input.peek() == '-' && is_flow(self.input.peek_nth(1)) {
                return Err(ScanError::new_str(
                    self.mark,
                    "plain scalar cannot start with '-' followed by ,[]{}",
                ));
            }

            if !self.input.next_is_blank_or_breakz()
                && self.input.next_can_be_plain_scalar(self.flow_level > 0)
            {
                // More content follows: first write out the separation that was parked.
                if self.leading_whitespace {
                    if self.buf_leading_break.is_empty() {
                        // The leading-break buffer is empty here, so the first push appends
                        // nothing; only the trailing breaks are written.
                        string.push_str(&self.buf_leading_break);
                        string.push_str(&self.buf_trailing_breaks);
                        self.buf_trailing_breaks.clear();
                        self.buf_leading_break.clear();
                    } else {
                        // A single line break folds to one space; additional breaks are
                        // written in its place.
                        if self.buf_trailing_breaks.is_empty() {
                            string.push(' ');
                        } else {
                            string.push_str(&self.buf_trailing_breaks);
                            self.buf_trailing_breaks.clear();
                        }
                        self.buf_leading_break.clear();
                    }
                    self.leading_whitespace = false;
                } else if !self.buf_whitespaces.is_empty() {
                    // Blanks between two words on the same line are kept verbatim.
                    string.push_str(&self.buf_whitespaces);
                    self.buf_whitespaces.clear();
                }

                // We can unroll the first iteration of the loop.
                string.push(self.input.peek());
                self.skip_non_blank();
                string.reserve(self.input.bufmaxlen());

                // Add content non-blank characters to the scalar.
                let mut end = false;
                while !end {
                    // Fill the buffer once and process all characters in the buffer until the next
                    // fetch. `next_can_be_plain_scalar` needs 2 lookahead characters, so keep one
                    // spare slot for normal inputs while still forcing progress for very small
                    // custom buffer lengths.
                    self.input.lookahead(self.input.bufmaxlen());
                    let chunk_len = self.input.bufmaxlen().saturating_sub(1).max(1);
                    let (stop, chars_consumed) = self.input.fetch_plain_scalar_chunk(
                        &mut string,
                        chunk_len,
                        self.flow_level > 0,
                    );
                    end = stop;
                    // The chunk was consumed by the input directly, so the mark is advanced
                    // by hand here; only the column and offsets are touched.
                    self.mark.offsets.chars += chars_consumed;
                    self.mark.col += chars_consumed;
                    self.mark.offsets.bytes = self.input.byte_offset();
                }
                end_mark = self.mark;
            }

            // We may reach the end of a plain scalar if:
            //  - We reach eof
            //  - We reach ": "
            //  - We find a flow character in a flow context
            if !(self.input.next_is_blank() || self.input.next_is_break()) {
                break;
            }

            // Process blank characters.
            self.input.lookahead(2);
            while self.input.next_is_blank_or_break() {
                if self.input.next_is_blank() {
                    if !self.leading_whitespace {
                        // Blank after content on the same line: park it until we know
                        // whether more content follows.
                        self.buf_whitespaces.push(self.input.peek());
                        self.skip_blank();
                    } else if (self.mark.col as isize) < indent && self.input.peek() == '\t' {
                        // Tabs in an indentation columns are allowed if and only if the line is
                        // empty. Skip to the end of the line.
                        self.skip_ws_to_eol(SkipTabs::Yes)?;
                        if !self.input.next_is_breakz() {
                            return Err(ScanError::new_str(
                                start_mark,
                                "while scanning a plain scalar, found a tab",
                            ));
                        }
                    } else {
                        // Leading blank of a continuation line: never part of the content.
                        self.skip_blank();
                    }
                } else {
                    // Check if it is a first line break
                    if self.leading_whitespace {
                        self.skip_break();
                        self.buf_trailing_breaks.push('\n');
                    } else {
                        // First break: blanks that preceded it on this line are dropped.
                        self.buf_whitespaces.clear();
                        self.skip_break();
                        self.buf_leading_break.push('\n');
                        self.leading_whitespace = true;
                    }
                }
                self.input.lookahead(2);
            }

            // check indentation level
            if self.flow_level == 0 && (self.mark.col as isize) < indent {
                break;
            }
        }

        // The scan stopped after a line break, so a simple key may start at this position.
        if self.leading_whitespace {
            self.allow_simple_key();
        }

        if string.is_empty() {
            // `fetch_plain_scalar` must absolutely consume at least one byte. Otherwise,
            // `fetch_next_token` will never stop calling it. An empty plain scalar may happen with
            // erroneous inputs such as "{...".
            Err(ScanError::new_str(
                start_mark,
                "unexpected end of plain scalar",
            ))
        } else {
            // Borrow from the input only when the scanned text matches the raw slice exactly;
            // any folding makes the two differ and forces an owned string.
            let contents = if let (Some(start), Some(end)) =
                (start_mark.byte_offset(), end_mark.byte_offset())
            {
                match self.try_borrow_slice(start, end) {
                    Some(slice) if slice == string => Cow::Borrowed(slice),
                    _ => Cow::Owned(string),
                }
            } else {
                Cow::Owned(string)
            };

            Ok(Token(
                Span::new(start_mark, end_mark),
                TokenType::Scalar(ScalarStyle::Plain, contents),
            ))
        }
    }
3828
3829    fn fetch_key(&mut self) -> ScanResult {
3830        let start_mark = self.mark;
3831        if self.flow_level == 0 {
3832            // Check if we are allowed to start a new key (not necessarily simple).
3833            if !self.simple_key_allowed {
3834                return Err(ScanError::new_str(
3835                    self.mark,
3836                    "mapping keys are not allowed in this context",
3837                ));
3838            }
3839            self.roll_indent(
3840                start_mark.col,
3841                None,
3842                TokenType::BlockMappingStart,
3843                start_mark,
3844            );
3845        } else {
3846            // The scanner, upon emitting a `Key`, will prepend a `MappingStart` event.
3847            self.set_current_flow_mapping_started(true);
3848        }
3849
3850        self.remove_simple_key()?;
3851
3852        if self.flow_level == 0 {
3853            self.allow_simple_key();
3854        } else {
3855            self.disallow_simple_key();
3856        }
3857
3858        self.skip_non_blank();
3859        let end_mark = self.mark;
3860        let token_index = self.tokens.len();
3861        self.explicit_key_tab_check_pending = false;
3862        let stopped_after_comment = self.skip_yaml_whitespace(true)?;
3863        if self.input.peek() == '\t' {
3864            return Err(ScanError::new_str(
3865                self.mark(),
3866                "tabs disallowed in this context",
3867            ));
3868        }
3869        self.explicit_key_tab_check_pending = stopped_after_comment;
3870        self.insert_token(
3871            token_index,
3872            Token(Span::new(start_mark, end_mark), TokenType::Key),
3873        );
3874        Ok(())
3875    }
3876
3877    /// Fetch a value in a mapping inside of a flow collection.
3878    ///
3879    /// This must not be called if [`self.flow_level`] is 0. This ensures the rules surrounding
3880    /// values in flow collections are respected prior to calling [`fetch_value`].
3881    ///
3882    /// [`self.flow_level`]: Self::flow_level
3883    /// [`fetch_value`]: Self::fetch_value
3884    fn fetch_flow_value(&mut self) -> ScanResult {
3885        let nc = self.input.peek_nth(1);
3886
3887        // If we encounter a ':' inside a flow collection and it is not immediately
3888        // followed by a blank or breakz:
3889        //   - We must check whether an adjacent value is allowed
3890        //     `["a":[]]` is valid. If the key is double-quoted, no need for a space. This
3891        //     is needed for JSON compatibility.
3892        //   - If not, we must ensure there is a space after the ':' and before its value.
3893        //     `[a: []]` is valid while `[a:[]]` isn't. `[a:b]` is treated as `["a:b"]`.
3894        //   - But if the value is empty (null), then it's okay.
3895        // The last line is for YAMLs like `[a:]`. The ':' is followed by a ']' (which is a
3896        // flow character), but the ']' is not the value. The value is an invisible empty
3897        // space which is represented as null ('~').
3898        if self.mark.index() != self.adjacent_value_allowed_at && (nc == '[' || nc == '{') {
3899            return Err(ScanError::new_str(
3900                self.mark,
3901                "':' may not precede any of `[{` in flow mapping",
3902            ));
3903        }
3904
3905        self.fetch_value()
3906    }
3907
    /// Fetch a value from a mapping (after a `:`).
    ///
    /// Consumes the `:` indicator and queues a [`TokenType::Value`] token, together with the
    /// tokens that must precede it:
    ///   - If the last entry of `simple_keys` is still possible, a [`TokenType::Key`] token is
    ///     inserted retroactively at the position recorded for that key, preceded by a
    ///     [`TokenType::FlowMappingStart`] when an implicit flow mapping starts here, and
    ///     [`Self::roll_indent`] is asked to open a block mapping at the key's column.
    ///   - Otherwise the `:` is treated as following a complex key and the mapping start tokens
    ///     are anchored at the `:` itself.
    ///
    /// Tokens queued while skipping tab-led whitespace after the `:` are detached from the
    /// queue and re-appended after the `Value` token.
    ///
    /// # Errors
    /// Returns an error if:
    ///   - the `:` is followed by tab(s) without valid YAML whitespace and then `-` or an
    ///     alphabetic character,
    ///   - an implicit flow mapping key starts on an earlier line than the `:`,
    ///   - no simple key is possible, we are outside any flow collection and simple keys are
    ///     not allowed at this point.
    ///
    /// Errors from `skip_ws_to_eol` and `simple_key_token_index` are propagated.
    ///
    /// # Panics
    /// Panics if `simple_keys` is empty.
    fn fetch_value(&mut self) -> ScanResult {
        // Work on a copy of the last simple key; the stack entry is only updated further down.
        let sk = self.simple_keys.last().unwrap().clone();
        // Position of the ':' indicator, recorded before it is consumed.
        let start_mark = self.mark;
        // A ':' inside a flow sequence whose flow mapping has not been started yet opens an
        // implicit single-pair mapping (as long as a state entry exists to record it in).
        let is_implicit_flow_mapping = self.current_flow_collection_is_sequence()
            && !self.current_flow_mapping_started()
            && !self.implicit_flow_mapping_states.is_empty();
        if is_implicit_flow_mapping {
            *self.implicit_flow_mapping_states.last_mut().unwrap() =
                ImplicitMappingState::Inside(self.flow_level);
        }

        // Skip over ':'.
        self.skip_non_blank();
        // Error detection: if ':' is followed by tab(s) without any space, and then what looks
        // like a value, emit a helpful error. The check for '-' or alphanumeric is an intentional
        // heuristic that catches common cases (e.g., `key:\tvalue`, `key:\t-item`) without
        // rejecting valid YAML like `key:\t|` (block scalar) or `key:\t"quoted"`.
        // Note: This heuristic won't catch Unicode value starters like `key:\täöü`, but such
        // cases will still fail to parse correctly (just with a less specific error message).
        let mut trailing_tokens = VecDeque::new();
        if self.input.look_ch() == '\t' {
            // Remember where the queue ended so that anything `skip_ws_to_eol` queues can be
            // detached and re-queued after the `Value` token.
            let trailing_token_index = self.tokens.len();
            let whitespace = self.skip_ws_to_eol(SkipTabs::Yes)?;
            trailing_tokens = self.tokens.split_off(trailing_token_index);

            if !whitespace.has_valid_yaml_ws()
                && (self.input.peek() == '-' || self.input.next_is_alpha())
            {
                return Err(ScanError::new_str(
                    self.mark,
                    "':' must be followed by a valid YAML whitespace",
                ));
            }
        }

        if sk.possible {
            // The ':' confirms the pending simple key: emit its tokens retroactively.
            let token_index = self.simple_key_token_index(&sk, start_mark)?;
            // insert simple key
            let tok = Token(Span::empty(sk.mark), TokenType::Key);
            self.insert_token(token_index, tok);
            if is_implicit_flow_mapping {
                // The key of an implicit flow mapping may not start on an earlier line than
                // its ':'.
                if sk.mark.line < start_mark.line {
                    return Err(ScanError::new_str(
                        start_mark,
                        "illegal placement of ':' indicator",
                    ));
                }
                // Inserted at the same index as the `Key` token, so it ends up before it.
                self.insert_token(
                    token_index,
                    Token(Span::empty(sk.mark), TokenType::FlowMappingStart),
                );
            }

            // Add the BLOCK-MAPPING-START token if needed.
            self.roll_indent(
                sk.mark.col,
                Some(sk.token_number),
                TokenType::BlockMappingStart,
                sk.mark,
            );
            self.roll_one_col_indent();

            // The candidate has been consumed; another simple key may not follow directly.
            self.simple_keys.last_mut().unwrap().possible = false;
            self.disallow_simple_key();
        } else {
            if is_implicit_flow_mapping {
                self.tokens
                    .push_back(Token(Span::empty(start_mark), TokenType::FlowMappingStart).into());
            }
            // The ':' indicator follows a complex key.
            if self.flow_level == 0 {
                if !self.simple_key_allowed {
                    return Err(ScanError::new_str(
                        start_mark,
                        "mapping values are not allowed in this context",
                    ));
                }

                // Without a key token to anchor to, the block mapping starts at the ':' itself.
                self.roll_indent(
                    start_mark.col,
                    None,
                    TokenType::BlockMappingStart,
                    start_mark,
                );
            }
            self.roll_one_col_indent();

            // After the ':' a simple key is allowed in block context but not in flow context.
            if self.flow_level == 0 {
                self.allow_simple_key();
            } else {
                self.disallow_simple_key();
            }
        }
        self.tokens
            .push_back(Token(Span::empty(start_mark), TokenType::Value).into());
        // Re-queue whatever was scanned after the ':' so that it follows the `Value` token.
        self.tokens.append(&mut trailing_tokens);

        Ok(())
    }
4008
4009    /// Add an indentation level to the stack with the given block token, if needed.
4010    ///
4011    /// An indentation level is added only if:
4012    ///   - We are not in a flow-style construct (which don't have indentation per-se).
4013    ///   - The current column is further indented than the last indent we have registered.
4014    fn roll_indent(
4015        &mut self,
4016        col: usize,
4017        number: Option<usize>,
4018        tok: TokenType<'input>,
4019        mark: Marker,
4020    ) {
4021        if self.flow_level > 0 {
4022            return;
4023        }
4024
4025        // If the last indent was a non-block indent, remove it.
4026        // This means that we prepared an indent that we thought we wouldn't use, but realized just
4027        // now that it is a block indent.
4028        if self.indent <= col as isize {
4029            if let Some(indent) = self.indents.last() {
4030                if !indent.needs_block_end {
4031                    self.indent = indent.indent;
4032                    self.indents.pop();
4033                }
4034            }
4035        }
4036
4037        if self.indent < col as isize {
4038            self.indents.push(Indent {
4039                indent: self.indent,
4040                needs_block_end: true,
4041            });
4042            self.indent = col as isize;
4043            let tokens_parsed = self.tokens_parsed;
4044            match number {
4045                Some(n) => self.insert_token(n - tokens_parsed, Token(Span::empty(mark), tok)),
4046                None => self.tokens.push_back(Token(Span::empty(mark), tok).into()),
4047            }
4048        }
4049    }
4050
4051    /// Pop indentation levels from the stack as much as needed.
4052    ///
4053    /// Indentation levels are popped from the stack while they are further indented than `col`.
4054    /// If we are in a flow-style construct (which don't have indentation per-se), this function
4055    /// does nothing.
4056    fn unroll_indent(&mut self, col: isize) {
4057        if self.flow_level > 0 {
4058            return;
4059        }
4060        while self.indent > col {
4061            let indent = self.indents.pop().unwrap();
4062            self.indent = indent.indent;
4063            if indent.needs_block_end {
4064                self.tokens
4065                    .push_back(Token(Span::empty(self.mark), TokenType::BlockEnd).into());
4066            }
4067        }
4068    }
4069
4070    /// Add an indentation level of 1 column that does not start a block.
4071    ///
4072    /// See the documentation of [`Indent::needs_block_end`] for more details.
4073    /// An indentation is not added if we are inside a flow level or if the last indent is already
4074    /// a non-block indent.
4075    fn roll_one_col_indent(&mut self) {
4076        if self.flow_level == 0 && self.indents.last().is_some_and(|x| x.needs_block_end) {
4077            self.indents.push(Indent {
4078                indent: self.indent,
4079                needs_block_end: false,
4080            });
4081            self.indent += 1;
4082        }
4083    }
4084
4085    /// Unroll all last indents created with [`Self::roll_one_col_indent`].
4086    fn unroll_non_block_indents(&mut self) {
4087        while let Some(indent) = self.indents.last() {
4088            if indent.needs_block_end {
4089                break;
4090            }
4091            self.indent = indent.indent;
4092            self.indents.pop();
4093        }
4094    }
4095
4096    /// Mark the next token to be inserted as a potential simple key.
4097    fn save_simple_key(&mut self) {
4098        if self.simple_key_allowed {
4099            let required = self.flow_level == 0
4100                && self.indent == (self.mark.col as isize)
4101                && self.indents.last().unwrap().needs_block_end;
4102
4103            if let Some(last) = self.simple_keys.last_mut() {
4104                *last = SimpleKey {
4105                    mark: self.mark,
4106                    possible: true,
4107                    required,
4108                    token_number: self.tokens_parsed + self.tokens.len(),
4109                };
4110            }
4111        }
4112    }
4113
4114    fn remove_simple_key(&mut self) -> ScanResult {
4115        let last = self.simple_keys.last_mut().unwrap();
4116        if last.possible && last.required {
4117            return Err(Self::simple_key_expected(last.mark));
4118        }
4119
4120        last.possible = false;
4121        Ok(())
4122    }
4123
4124    /// Return whether the scanner is inside a block but outside of a flow sequence.
4125    fn is_within_block(&self) -> bool {
4126        !self.indents.is_empty()
4127    }
4128
4129    /// If an implicit mapping had started, end it.
4130    ///
4131    /// This function does not pop the state in [`implicit_flow_mapping_states`].
4132    ///
4133    /// [`implicit_flow_mapping_states`]: Self::implicit_flow_mapping_states
4134    fn end_implicit_mapping(&mut self, mark: Marker, flow_level: u8) {
4135        if self
4136            .implicit_flow_mapping_states
4137            .last()
4138            .is_some_and(|state| *state == ImplicitMappingState::Inside(flow_level))
4139        {
4140            *self.implicit_flow_mapping_states.last_mut().unwrap() = ImplicitMappingState::Possible;
4141            self.set_current_flow_mapping_started(false);
4142            self.tokens
4143                .push_back(Token(Span::empty(mark), TokenType::FlowMappingEnd).into());
4144        }
4145    }
4146
4147    fn current_flow_collection_is_sequence(&self) -> bool {
4148        self.flow_markers
4149            .last()
4150            .is_some_and(|(_, bracket)| *bracket == '[')
4151    }
4152
4153    fn current_flow_mapping_started(&self) -> bool {
4154        self.flow_mapping_started.last().copied().unwrap_or(false)
4155    }
4156
4157    fn set_current_flow_mapping_started(&mut self, started: bool) {
4158        if let Some(current) = self.flow_mapping_started.last_mut() {
4159            *current = started;
4160        }
4161    }
4162}
4163
/// Chomping, how final line breaks and trailing empty lines are interpreted.
///
/// See YAML spec
/// [8.1.1.2](https://yaml.org/spec/1.2.2/#8112-block-chomping-indicator).
///
/// Only equality comparison is derived for this type.
#[derive(PartialEq, Eq)]
pub enum Chomping {
    /// The final line break and any trailing empty lines are excluded.
    Strip,
    /// The final line break is preserved, but trailing empty lines are excluded.
    Clip,
    /// The final line break and trailing empty lines are included.
    Keep,
}
4176
4177#[cfg(test)]
4178mod test {
4179    use alloc::{
4180        borrow::{Cow, ToOwned},
4181        rc::Rc,
4182        string::String,
4183        vec::Vec,
4184    };
4185    use core::cell::Cell;
4186
4187    use crate::{
4188        input::{str::StrInput, BorrowedInput, BufferedInput, Input},
4189        scanner::{
4190            Comment, Marker, Placement, QueuedToken, QueuedTokenType, ScalarStyle, ScanError,
4191            Scanner, Span, TEncoding, Token, TokenType,
4192        },
4193    };
4194
    /// Test-only character source that counts how many characters have been pulled from it.
    struct CountingChars {
        /// The characters that remain to be yielded.
        chars: alloc::vec::IntoIter<char>,
        /// Shared counter of yielded characters, observable by the test through its own `Rc`.
        read: Rc<Cell<usize>>,
    }
4199
4200    impl Iterator for CountingChars {
4201        type Item = char;
4202
4203        fn next(&mut self) -> Option<Self::Item> {
4204            let next = self.chars.next();
4205            if next.is_some() {
4206                self.read.set(self.read.get() + 1);
4207            }
4208            next
4209        }
4210    }
4211
    /// Test-only input wrapping a [`StrInput`] that never lends borrowed slices and can be
    /// configured to refuse `slice_bytes` as well.
    struct SlicingOnlyInput<'input> {
        /// The wrapped input that does the actual work.
        inner: StrInput<'input>,
        /// Whether `slice_bytes` forwards to `inner` (`true`) or always returns `None`.
        expose_slice: bool,
    }
4216
4217    impl<'input> SlicingOnlyInput<'input> {
4218        fn new(source: &'input str, expose_slice: bool) -> Self {
4219            Self {
4220                inner: StrInput::new(source),
4221                expose_slice,
4222            }
4223        }
4224    }
4225
4226    impl Input for SlicingOnlyInput<'_> {
4227        fn lookahead(&mut self, count: usize) {
4228            self.inner.lookahead(count);
4229        }
4230
4231        fn buflen(&self) -> usize {
4232            self.inner.buflen()
4233        }
4234
4235        fn bufmaxlen(&self) -> usize {
4236            self.inner.bufmaxlen()
4237        }
4238
4239        fn raw_read_ch(&mut self) -> char {
4240            self.inner.raw_read_ch()
4241        }
4242
4243        fn raw_read_non_breakz_ch(&mut self) -> Option<char> {
4244            self.inner.raw_read_non_breakz_ch()
4245        }
4246
4247        fn skip(&mut self) {
4248            self.inner.skip();
4249        }
4250
4251        fn skip_n(&mut self, count: usize) {
4252            self.inner.skip_n(count);
4253        }
4254
4255        fn peek(&self) -> char {
4256            self.inner.peek()
4257        }
4258
4259        fn peek_nth(&self, n: usize) -> char {
4260            self.inner.peek_nth(n)
4261        }
4262
4263        fn byte_offset(&self) -> Option<usize> {
4264            self.inner.byte_offset()
4265        }
4266
4267        fn slice_bytes(&self, start: usize, end: usize) -> Option<&str> {
4268            if self.expose_slice {
4269                self.inner.slice_bytes(start, end)
4270            } else {
4271                None
4272            }
4273        }
4274    }
4275
    impl<'input> BorrowedInput<'input> for SlicingOnlyInput<'input> {
        /// Never lends a borrowed slice, regardless of the requested range.
        fn slice_borrowed(&self, _start: usize, _end: usize) -> Option<&'input str> {
            None
        }
    }
4281
    /// Test-only input wrapping a [`StrInput`] that reports a caller-chosen `bufmaxlen`.
    struct SmallReportedBufferInput<'input> {
        /// The wrapped input that does the actual work.
        inner: StrInput<'input>,
        /// The value returned from `bufmaxlen` instead of the wrapped input's own.
        reported_bufmaxlen: usize,
    }
4286
4287    impl<'input> SmallReportedBufferInput<'input> {
4288        fn new(source: &'input str, reported_bufmaxlen: usize) -> Self {
4289            Self {
4290                inner: StrInput::new(source),
4291                reported_bufmaxlen,
4292            }
4293        }
4294    }
4295
    // Every method forwards to the wrapped `StrInput`, except `bufmaxlen`, which returns the
    // value chosen at construction.
    impl Input for SmallReportedBufferInput<'_> {
        fn lookahead(&mut self, count: usize) {
            self.inner.lookahead(count);
        }

        fn buflen(&self) -> usize {
            self.inner.buflen()
        }

        // The one overridden answer: the configured value, not the wrapped input's.
        fn bufmaxlen(&self) -> usize {
            self.reported_bufmaxlen
        }

        fn raw_read_ch(&mut self) -> char {
            self.inner.raw_read_ch()
        }

        fn raw_read_non_breakz_ch(&mut self) -> Option<char> {
            self.inner.raw_read_non_breakz_ch()
        }

        fn skip(&mut self) {
            self.inner.skip();
        }

        fn skip_n(&mut self, count: usize) {
            self.inner.skip_n(count);
        }

        fn peek(&self) -> char {
            self.inner.peek()
        }

        fn peek_nth(&self, n: usize) -> char {
            self.inner.peek_nth(n)
        }
    }
4333
    impl<'input> BorrowedInput<'input> for SmallReportedBufferInput<'input> {
        /// Forward borrowed slicing to the wrapped `StrInput` unchanged.
        fn slice_borrowed(&self, start: usize, end: usize) -> Option<&'input str> {
            self.inner.slice_borrowed(start, end)
        }
    }
4339
4340    #[test]
4341    fn anchor_character_set_allows_colon_and_rejects_flow_indicators() {
4342        use super::is_anchor_char;
4343
4344        assert!(is_anchor_char('x'));
4345        assert!(is_anchor_char('-'));
4346        assert!(is_anchor_char('_'));
4347        assert!(is_anchor_char(':'));
4348        assert!(is_anchor_char('#'));
4349        assert!(is_anchor_char('/'));
4350        assert!(is_anchor_char('?'));
4351
4352        for c in [',', '[', ']', '{', '}', ' ', '\t', '\n', '\r', '\0'] {
4353            assert!(
4354                !is_anchor_char(c),
4355                "character {c:?} must not be accepted in anchor/alias names"
4356            );
4357        }
4358    }
4359
4360    #[test]
4361    fn flow_simple_key_length_limit_bounds_buffering() {
4362        let mut yaml = String::from("[\n\"start\"\n");
4363        for _ in 0..600 {
4364            yaml.push_str("\"x\"\n");
4365        }
4366        let total_chars = yaml.chars().count();
4367        let read = Rc::new(Cell::new(0));
4368        let chars = yaml.chars().collect::<Vec<_>>().into_iter();
4369        let mut scanner = Scanner::new(BufferedInput::new(CountingChars {
4370            chars,
4371            read: Rc::clone(&read),
4372        }));
4373
4374        assert!(matches!(
4375            scanner.next_token().unwrap().unwrap().1,
4376            TokenType::StreamStart(_)
4377        ));
4378
4379        let token = scanner.next_token().unwrap().unwrap();
4380        assert!(matches!(token.1, TokenType::FlowSequenceStart));
4381
4382        let token = scanner.next_token().unwrap().unwrap();
4383        assert!(matches!(
4384            token.1,
4385            TokenType::Scalar(_, ref value) if value == "start"
4386        ));
4387        assert!(
4388            read.get() < total_chars,
4389            "scanner consumed all {total_chars} chars before yielding the first flow scalar"
4390        );
4391        assert!(
4392            read.get() <= super::SIMPLE_KEY_MAX_LOOKAHEAD + 128,
4393            "scanner read {} chars before yielding the first flow scalar",
4394            read.get()
4395        );
4396    }
4397
4398    #[test]
4399    fn block_scalar_indent_tolerates_small_reported_bufmaxlen() {
4400        let mut scanner = Scanner::new(SmallReportedBufferInput::new("|\n  value\n", 0));
4401
4402        let scalar = scanner
4403            .find_map(|token| match token {
4404                Token(_, TokenType::Scalar(ScalarStyle::Literal, value)) => {
4405                    Some(value.into_owned())
4406                }
4407                _ => None,
4408            })
4409            .expect("expected block scalar token");
4410
4411        assert_eq!(scalar, "value\n");
4412    }
4413
4414    #[test]
4415    fn plain_scalar_chunk_tolerates_small_reported_bufmaxlen() {
4416        let mut scanner = Scanner::new(SmallReportedBufferInput::new("plain\n", 0));
4417
4418        let scalar = scanner
4419            .find_map(|token| match token {
4420                Token(_, TokenType::Scalar(ScalarStyle::Plain, value)) => Some(value.into_owned()),
4421                _ => None,
4422            })
4423            .expect("expected plain scalar token");
4424
4425        assert_eq!(scalar, "plain");
4426    }
4427
4428    fn first_token_slice(
4429        yaml: &str,
4430        matches_token: impl Fn(&TokenType<'_>) -> bool,
4431    ) -> Option<String> {
4432        let mut scanner = Scanner::new(StrInput::new(yaml));
4433
4434        loop {
4435            let token = scanner
4436                .next_token()
4437                .expect("scanner should accept the test YAML")?;
4438            if matches_token(&token.1) {
4439                return token.0.slice(yaml).map(ToOwned::to_owned);
4440            }
4441        }
4442    }
4443
4444    #[test]
4445    fn flow_indicator_token_spans_cover_only_the_indicator() {
4446        assert_eq!(
4447            first_token_slice("[ # c\n  a]\n", |token| matches!(
4448                token,
4449                TokenType::FlowSequenceStart
4450            ))
4451            .as_deref(),
4452            Some("[")
4453        );
4454        assert_eq!(
4455            first_token_slice("{ # c\n  a: b}\n", |token| matches!(
4456                token,
4457                TokenType::FlowMappingStart
4458            ))
4459            .as_deref(),
4460            Some("{")
4461        );
4462        assert_eq!(
4463            first_token_slice("[a] # c\n", |token| matches!(
4464                token,
4465                TokenType::FlowSequenceEnd
4466            ))
4467            .as_deref(),
4468            Some("]")
4469        );
4470        assert_eq!(
4471            first_token_slice("{a: b} # c\n", |token| matches!(
4472                token,
4473                TokenType::FlowMappingEnd
4474            ))
4475            .as_deref(),
4476            Some("}")
4477        );
4478        assert_eq!(
4479            first_token_slice("[a, # c\nb]\n", |token| matches!(
4480                token,
4481                TokenType::FlowEntry
4482            ))
4483            .as_deref(),
4484            Some(",")
4485        );
4486    }
4487
4488    #[test]
4489    fn explicit_key_token_span_covers_only_the_indicator() {
4490        assert_eq!(
4491            first_token_slice("? # c\n: value\n", |token| matches!(token, TokenType::Key))
4492                .as_deref(),
4493            Some("?")
4494        );
4495    }
4496
4497    #[test]
4498    fn comment_capture_does_not_change_leading_whitespace() {
4499        let mut scanner = Scanner::new(StrInput::new("# comment\n"));
4500
4501        let token = scanner.scan_comment_token().unwrap();
4502
4503        assert!(scanner.leading_whitespace);
4504        assert!(matches!(token.1, TokenType::Comment(ref comment) if comment.text == " comment"));
4505
4506        let mut scanner = Scanner::new(BufferedInput::new("# streaming\n".chars()));
4507        scanner.input.lookahead(1);
4508
4509        let token = scanner.scan_comment_token().unwrap();
4510
4511        assert!(scanner.leading_whitespace);
4512        assert!(matches!(token.1, TokenType::Comment(ref comment) if comment.text == " streaming"));
4513    }
4514
4515    #[test]
4516    fn comment_capture_falls_back_to_owned_slice_when_borrow_unavailable() {
4517        let mut scanner = Scanner::new(SlicingOnlyInput::new("# sliced\n", true));
4518        scanner.input.lookahead(2);
4519        assert_eq!(scanner.input.peek_nth(1), ' ');
4520
4521        let token = scanner.scan_comment_token().unwrap();
4522
4523        assert!(matches!(token.1, TokenType::Comment(ref comment)
4524            if matches!(comment.text, Cow::Owned(ref text) if text == " sliced")));
4525    }
4526
4527    #[test]
4528    fn comment_capture_errors_when_offsets_have_no_slice() {
4529        let mut scanner = Scanner::new(SlicingOnlyInput::new("# broken\n", false));
4530
4531        let error = scanner.scan_comment_token().unwrap_err();
4532
4533        assert_eq!(
4534            error.info(),
4535            "internal error: input advertised offsets but did not provide a slice"
4536        );
4537    }
4538
4539    #[test]
4540    fn queued_token_roundtrips_public_token_variants() {
4541        let span = Span::new(Marker::new(0, 1, 0), Marker::new(7, 1, 7));
4542        let tokens = [
4543            Token(span, TokenType::StreamStart(TEncoding::Utf8)),
4544            Token(span, TokenType::StreamEnd),
4545            Token(span, TokenType::VersionDirective(1, 2)),
4546            Token(
4547                span,
4548                TokenType::TagDirective(Cow::Borrowed("!app!"), Cow::Borrowed("tag:app.example,")),
4549            ),
4550            Token(span, TokenType::DocumentStart),
4551            Token(span, TokenType::DocumentEnd),
4552            Token(span, TokenType::BlockSequenceStart),
4553            Token(span, TokenType::BlockMappingStart),
4554            Token(span, TokenType::BlockEnd),
4555            Token(span, TokenType::FlowSequenceStart),
4556            Token(span, TokenType::FlowSequenceEnd),
4557            Token(span, TokenType::FlowMappingStart),
4558            Token(span, TokenType::FlowMappingEnd),
4559            Token(span, TokenType::BlockEntry),
4560            Token(span, TokenType::FlowEntry),
4561            Token(span, TokenType::Key),
4562            Token(span, TokenType::Value),
4563            Token(span, TokenType::Alias(Cow::Borrowed("alias"))),
4564            Token(span, TokenType::Anchor(Cow::Borrowed("anchor"))),
4565            Token(
4566                span,
4567                TokenType::Tag(Cow::Borrowed("!"), Cow::Borrowed("tag")),
4568            ),
4569            Token(
4570                span,
4571                TokenType::Scalar(ScalarStyle::Literal, Cow::Borrowed("scalar")),
4572            ),
4573            Token(
4574                span,
4575                TokenType::Comment(
4576                    Comment::new(span, Cow::Borrowed(" comment")).with_placement(Placement::Right),
4577                ),
4578            ),
4579            Token(
4580                span,
4581                TokenType::ReservedDirective(
4582                    "reserved".to_owned(),
4583                    vec!["one".to_owned(), "two".to_owned()],
4584                ),
4585            ),
4586        ];
4587
4588        for token in tokens {
4589            let queued: QueuedToken = token.clone().into();
4590
4591            assert_eq!(queued.into_public(), token);
4592        }
4593    }
4594
4595    #[test]
4596    fn comment_skipping_path_consumes_comment_without_tokenizing_it() {
4597        let mut scanner = Scanner::new(StrInput::new("# skipped\nnext: value\n"));
4598
4599        scanner.skip_yaml_whitespace(false).unwrap();
4600
4601        assert!(scanner.tokens.is_empty());
4602        assert_eq!(scanner.mark.line(), 2);
4603        assert_eq!(scanner.mark.col(), 0);
4604    }
4605
4606    #[test]
4607    fn yaml_whitespace_can_stop_after_queued_comment() {
4608        let mut scanner = Scanner::new(StrInput::new(" # queued\n# later\n"));
4609
4610        assert!(scanner.skip_yaml_whitespace(true).unwrap());
4611
4612        assert_eq!(scanner.tokens.len(), 1);
4613        assert!(matches!(
4614            scanner.tokens.front().unwrap().1,
4615            QueuedTokenType::Comment(ref comment) if comment.text == " queued"
4616        ));
4617        assert_eq!(scanner.mark.line(), 1);
4618        assert_eq!(scanner.mark.col(), 9);
4619    }
4620
4621    #[test]
4622    fn token_skip_can_stop_after_queued_comment() {
4623        let mut scanner = Scanner::new(StrInput::new("# first\n# second\n"));
4624
4625        assert!(scanner.skip_to_next_token(true).unwrap());
4626
4627        assert_eq!(scanner.tokens.len(), 1);
4628        assert!(matches!(
4629            scanner.tokens.front().unwrap().1,
4630            QueuedTokenType::Comment(ref comment) if comment.text == " first"
4631        ));
4632        assert_eq!(scanner.mark.line(), 2);
4633        assert_eq!(scanner.mark.col(), 0);
4634    }
4635
4636    #[test]
4637    fn scanner_emits_first_leading_comment_before_scanning_next_comment() {
4638        let mut scanner = Scanner::new(StrInput::new("# first\n# second\nkey: value\n"));
4639
4640        assert!(matches!(
4641            scanner.next_token().unwrap().unwrap().1,
4642            TokenType::StreamStart(_)
4643        ));
4644        assert!(matches!(
4645            scanner.next_token().unwrap().unwrap().1,
4646            TokenType::Comment(ref comment) if comment.text == " first"
4647        ));
4648        assert!(scanner.tokens.is_empty());
4649        assert!(matches!(
4650            scanner.next_token().unwrap().unwrap().1,
4651            TokenType::Comment(ref comment) if comment.text == " second"
4652        ));
4653    }
4654
4655    #[test]
4656    fn scanner_emits_quoted_scalar_comment_before_scanning_following_value() {
4657        let mut scanner = Scanner::new(StrInput::new("\"key\" # quoted\n: value\n"));
4658
4659        assert!(matches!(
4660            scanner.next_token().unwrap().unwrap().1,
4661            TokenType::StreamStart(_)
4662        ));
4663        assert!(matches!(
4664            scanner.next_token().unwrap().unwrap().1,
4665            TokenType::Scalar(ScalarStyle::DoubleQuoted, ref value) if value == "key"
4666        ));
4667        assert!(matches!(
4668            scanner.next_token().unwrap().unwrap().1,
4669            TokenType::Comment(ref comment) if comment.text == " quoted"
4670        ));
4671    }
4672
4673    #[test]
4674    fn flow_scalar_comment_disables_adjacent_value_lookahead() {
4675        let mut scanner = Scanner::new(StrInput::new("\"key\"\n# quoted\n: value\n"));
4676
4677        scanner.fetch_flow_scalar(false).unwrap();
4678
4679        assert_eq!(scanner.adjacent_value_allowed_at, usize::MAX);
4680        assert!(matches!(
4681            scanner.tokens.front().unwrap().1,
4682            QueuedTokenType::Scalar(ScalarStyle::DoubleQuoted, ref value) if value == "key"
4683        ));
4684        assert!(scanner.tokens.iter().any(|QueuedToken(_, token)| matches!(
4685            token,
4686            QueuedTokenType::Comment(comment) if comment.text == " quoted"
4687        )));
4688    }
4689
4690    #[test]
4691    fn deferred_error_waits_for_all_comment_tokens() {
4692        let mut scanner = Scanner::new(StrInput::new("# first\n# second\n@\n"));
4693
4694        assert!(matches!(
4695            scanner.next_token().unwrap().unwrap().1,
4696            TokenType::StreamStart(_)
4697        ));
4698        assert!(matches!(
4699            scanner.next_token().unwrap().unwrap().1,
4700            TokenType::Comment(ref comment) if comment.text == " first"
4701        ));
4702        assert!(matches!(
4703            scanner.next_token().unwrap().unwrap().1,
4704            TokenType::Comment(ref comment) if comment.text == " second"
4705        ));
4706
4707        let error = scanner.next_token().unwrap_err();
4708
4709        assert!(error.info().contains("unexpected character"));
4710    }
4711
4712    /// Ensure anchors scanned from `StrInput` are returned as `Cow::Borrowed`.
4713    #[test]
4714    fn anchor_name_is_borrowed_for_str_input() {
4715        let mut scanner = Scanner::new(StrInput::new("&anch\n"));
4716
4717        loop {
4718            let tok = scanner
4719                .next_token()
4720                .expect("valid YAML must scan without errors")
4721                .expect("scanner must eventually produce a token");
4722            if let TokenType::Anchor(name) = tok.1 {
4723                assert!(matches!(name, Cow::Borrowed("anch")));
4724                break;
4725            }
4726        }
4727    }
4728
4729    /// Ensure aliases scanned from `StrInput` are returned as `Cow::Borrowed`.
4730    #[test]
4731    fn anchor_name_rejects_non_printable_control_chars() {
4732        let mut scanner = Scanner::new(StrInput::new("&foo\u{0001}\n"));
4733
4734        loop {
4735            let tok = scanner
4736                .next_token()
4737                .expect("scanning should not fail")
4738                .expect("scanner must eventually produce a token");
4739            if let TokenType::Anchor(name) = tok.1 {
4740                assert!(matches!(name, Cow::Borrowed("foo")));
4741                let next = scanner.next_token().expect("scanning should not fail");
4742                if let Some(Token(_, TokenType::Scalar(_, rest))) = next {
4743                    assert!(rest.starts_with('\u{0001}'));
4744                }
4745                break;
4746            }
4747        }
4748    }
4749
4750    #[test]
4751    fn alias_name_rejects_non_printable_control_chars() {
4752        let mut scanner = Scanner::new(StrInput::new("*foo\u{0001}\n"));
4753
4754        loop {
4755            let tok = scanner
4756                .next_token()
4757                .expect("scanning should not fail")
4758                .expect("scanner must eventually produce a token");
4759            if let TokenType::Alias(name) = tok.1 {
4760                assert!(matches!(name, Cow::Borrowed("foo")));
4761                let next = scanner.next_token().expect("scanning should not fail");
4762                if let Some(Token(_, TokenType::Scalar(_, rest))) = next {
4763                    assert!(rest.starts_with('\u{0001}'));
4764                }
4765                break;
4766            }
4767        }
4768    }
4769
4770    #[test]
4771    fn alias_name_is_borrowed_for_str_input() {
4772        let mut scanner = Scanner::new(StrInput::new("*anch\n"));
4773
4774        loop {
4775            let tok = scanner
4776                .next_token()
4777                .expect("valid YAML must scan without errors")
4778                .expect("scanner must eventually produce a token");
4779            if let TokenType::Alias(name) = tok.1 {
4780                assert!(matches!(name, Cow::Borrowed("anch")));
4781                break;
4782            }
4783        }
4784    }
4785
4786    #[test]
4787    fn alias_name_scans_colon_as_part_of_name() {
4788        let mut scanner = Scanner::new(StrInput::new("*foo: bar\n"));
4789
4790        loop {
4791            let tok = scanner
4792                .next_token()
4793                .expect("scanner must not fail before alias token")
4794                .expect("scanner must eventually emit an alias token");
4795
4796            if let TokenType::Alias(name) = tok.1 {
4797                assert_eq!(name.as_ref(), "foo:");
4798                break;
4799            }
4800        }
4801    }
4802
4803    #[test]
4804    fn anchor_name_scans_colon_as_part_of_name() {
4805        let mut scanner = Scanner::new(StrInput::new("&foo: bar\n"));
4806
4807        loop {
4808            let tok = scanner
4809                .next_token()
4810                .expect("scanner must not fail before anchor token")
4811                .expect("scanner must eventually emit an anchor token");
4812
4813            if let TokenType::Anchor(name) = tok.1 {
4814                assert_eq!(name.as_ref(), "foo:");
4815                break;
4816            }
4817        }
4818    }
4819
4820    /// Ensure `%TAG` directive handle and prefix are borrowed when they are verbatim (no escapes).
4821    #[test]
4822    fn tag_directive_parts_are_borrowed_for_str_input() {
4823        let mut scanner = Scanner::new(StrInput::new("%TAG !e! tag:example.com,2000:app/\n"));
4824
4825        loop {
4826            let tok = scanner
4827                .next_token()
4828                .expect("valid YAML must scan without errors")
4829                .expect("scanner must eventually produce a token");
4830            if let TokenType::TagDirective(handle, prefix) = tok.1 {
4831                assert!(matches!(handle, Cow::Borrowed("!e!")));
4832                assert!(matches!(prefix, Cow::Borrowed("tag:example.com,2000:app/")));
4833                break;
4834            }
4835        }
4836    }
4837
4838    #[test]
4839    fn tag_directive_parts_are_owned_for_buffered_input() {
4840        let mut scanner = Scanner::new(BufferedInput::new(
4841            "%TAG !e! tag:example.com,2000:app/\n".chars(),
4842        ));
4843
4844        loop {
4845            let tok = scanner
4846                .next_token()
4847                .expect("valid YAML must scan without errors")
4848                .expect("scanner must eventually produce a token");
4849            if let TokenType::TagDirective(handle, prefix) = tok.1 {
4850                assert!(matches!(handle, Cow::Owned(_)));
4851                assert_eq!(&*handle, "!e!");
4852                assert!(matches!(prefix, Cow::Owned(_)));
4853                assert_eq!(&*prefix, "tag:example.com,2000:app/");
4854                break;
4855            }
4856        }
4857    }
4858
4859    #[test]
4860    fn buffered_tag_directive_decodes_prefix_escape() {
4861        let mut scanner = Scanner::new(BufferedInput::new(
4862            "%TAG !e! %74ag:example.com,2000:app/\n".chars(),
4863        ));
4864
4865        loop {
4866            let tok = scanner
4867                .next_token()
4868                .expect("valid YAML must scan without errors")
4869                .expect("scanner must eventually produce a token");
4870            if let TokenType::TagDirective(handle, prefix) = tok.1 {
4871                assert_eq!(&*handle, "!e!");
4872                assert!(matches!(prefix, Cow::Owned(_)));
4873                assert_eq!(&*prefix, "tag:example.com,2000:app/");
4874                break;
4875            }
4876        }
4877    }
4878
4879    #[test]
4880    fn local_tag_combines_handle_text_with_escaped_suffix() {
4881        let mut scanner = Scanner::new(StrInput::new("!foo%20bar value\n"));
4882
4883        loop {
4884            let tok = scanner
4885                .next_token()
4886                .expect("valid YAML must scan without errors")
4887                .expect("scanner must eventually produce a token");
4888            if let TokenType::Tag(handle, suffix) = tok.1 {
4889                assert!(matches!(handle, Cow::Borrowed("!")));
4890                assert!(matches!(suffix, Cow::Owned(_)));
4891                assert_eq!(&*suffix, "foo bar");
4892                break;
4893            }
4894        }
4895    }
4896
4897    #[test]
4898    fn secondary_tag_requires_suffix_in_borrowed_and_buffered_paths() {
4899        let expected = "while parsing a tag, did not find expected tag URI";
4900
4901        assert_eq!(first_scanner_error_info("!! value\n"), expected);
4902        assert_eq!(first_buffered_scanner_error_info("!! value\n"), expected);
4903    }
4904
4905    #[test]
4906    fn plain_scalar_is_borrowed_when_whitespace_free_for_str_input() {
4907        let mut scanner = Scanner::new(StrInput::new("foo\n"));
4908
4909        loop {
4910            let tok = scanner
4911                .next_token()
4912                .expect("valid YAML must scan without errors")
4913                .expect("scanner must eventually produce a token");
4914            if let TokenType::Scalar(_, value) = tok.1 {
4915                assert!(matches!(value, Cow::Borrowed("foo")));
4916                break;
4917            }
4918        }
4919    }
4920
4921    #[test]
4922    fn plain_scalar_is_borrowed_when_whitespace_present_for_str_input() {
4923        let mut scanner = Scanner::new(StrInput::new("foo bar\n"));
4924
4925        loop {
4926            let tok = scanner
4927                .next_token()
4928                .expect("valid YAML must scan without errors")
4929                .expect("scanner must eventually produce a token");
4930            if let TokenType::Scalar(_, value) = tok.1 {
4931                assert!(matches!(value, Cow::Borrowed("foo bar")));
4932                break;
4933            }
4934        }
4935    }
4936
4937    #[test]
4938    fn single_quoted_scalar_is_borrowed_when_verbatim_for_str_input() {
4939        let mut scanner = Scanner::new(StrInput::new("'foo bar'\n"));
4940
4941        loop {
4942            let tok = scanner
4943                .next_token()
4944                .expect("valid YAML must scan without errors")
4945                .expect("scanner must eventually produce a token");
4946            if let TokenType::Scalar(_, value) = tok.1 {
4947                assert!(matches!(value, Cow::Borrowed("foo bar")));
4948                break;
4949            }
4950        }
4951    }
4952
4953    #[test]
4954    fn single_quoted_scalar_is_owned_when_quote_is_escaped_for_str_input() {
4955        let mut scanner = Scanner::new(StrInput::new("'foo''bar'\n"));
4956
4957        loop {
4958            let tok = scanner
4959                .next_token()
4960                .expect("valid YAML must scan without errors")
4961                .expect("scanner must eventually produce a token");
4962            if let TokenType::Scalar(_, value) = tok.1 {
4963                assert!(matches!(value, Cow::Owned(_)));
4964                assert_eq!(&*value, "foo'bar");
4965                break;
4966            }
4967        }
4968    }
4969
4970    #[test]
4971    fn double_quoted_scalar_is_borrowed_when_verbatim_for_str_input() {
4972        let mut scanner = Scanner::new(StrInput::new("\"foo bar\"\n"));
4973
4974        loop {
4975            let tok = scanner
4976                .next_token()
4977                .expect("valid YAML must scan without errors")
4978                .expect("scanner must eventually produce a token");
4979            if let TokenType::Scalar(_, value) = tok.1 {
4980                assert!(matches!(value, Cow::Borrowed("foo bar")));
4981                break;
4982            }
4983        }
4984    }
4985
4986    #[test]
4987    fn double_quoted_scalar_is_owned_when_escape_sequence_present_for_str_input() {
4988        let mut scanner = Scanner::new(StrInput::new("\"foo\\nbar\"\n"));
4989
4990        loop {
4991            let tok = scanner
4992                .next_token()
4993                .expect("valid YAML must scan without errors")
4994                .expect("scanner must eventually produce a token");
4995            if let TokenType::Scalar(_, value) = tok.1 {
4996                assert!(matches!(value, Cow::Owned(_)));
4997                assert_eq!(&*value, "foo\nbar");
4998                break;
4999            }
5000        }
5001    }
5002
5003    #[test]
5004    fn plain_key_is_borrowed_for_str_input() {
5005        // Keys are just scalars in a key position; they should also be borrowed.
5006        let mut scanner = Scanner::new(StrInput::new("mykey: value\n"));
5007
5008        let mut found_key = false;
5009        let mut key_value: Option<Cow<'_, str>> = None;
5010
5011        loop {
5012            let tok = scanner
5013                .next_token()
5014                .expect("valid YAML must scan without errors");
5015            let Some(tok) = tok else { break };
5016
5017            if matches!(tok.1, TokenType::Key) {
5018                found_key = true;
5019            } else if found_key {
5020                if let TokenType::Scalar(_, value) = tok.1 {
5021                    key_value = Some(value);
5022                    break;
5023                }
5024            }
5025        }
5026
5027        assert!(found_key, "expected to find a Key token");
5028        let key_value = key_value.expect("expected to find a scalar after Key token");
5029        assert!(
5030            matches!(key_value, Cow::Borrowed("mykey")),
5031            "key should be borrowed, got: {key_value:?}"
5032        );
5033    }
5034
5035    #[test]
5036    fn quoted_key_is_borrowed_when_verbatim_for_str_input() {
5037        let mut scanner = Scanner::new(StrInput::new("\"mykey\": value\n"));
5038
5039        let mut found_key = false;
5040        let mut key_value: Option<Cow<'_, str>> = None;
5041
5042        loop {
5043            let tok = scanner
5044                .next_token()
5045                .expect("valid YAML must scan without errors");
5046            let Some(tok) = tok else { break };
5047
5048            if matches!(tok.1, TokenType::Key) {
5049                found_key = true;
5050            } else if found_key {
5051                if let TokenType::Scalar(_, value) = tok.1 {
5052                    key_value = Some(value);
5053                    break;
5054                }
5055            }
5056        }
5057
5058        assert!(found_key, "expected to find a Key token");
5059        let key_value = key_value.expect("expected to find a scalar after Key token");
5060        assert!(
5061            matches!(key_value, Cow::Borrowed("mykey")),
5062            "quoted key should be borrowed when verbatim, got: {key_value:?}"
5063        );
5064    }
5065
5066    #[test]
5067    fn tag_handle_and_suffix_are_borrowed_for_str_input() {
5068        // Test a tag like !!str which should have handle="!!" and suffix="str"
5069        let mut scanner = Scanner::new(StrInput::new("!!str foo\n"));
5070
5071        loop {
5072            let tok = scanner
5073                .next_token()
5074                .expect("valid YAML must scan without errors")
5075                .expect("scanner must eventually produce a token");
5076            if let TokenType::Tag(handle, suffix) = tok.1 {
5077                assert!(
5078                    matches!(handle, Cow::Borrowed("!!")),
5079                    "tag handle should be borrowed, got: {handle:?}"
5080                );
5081                assert!(
5082                    matches!(suffix, Cow::Borrowed("str")),
5083                    "tag suffix should be borrowed, got: {suffix:?}"
5084                );
5085                break;
5086            }
5087        }
5088    }
5089
5090    #[test]
5091    fn local_tag_suffix_is_borrowed_for_str_input() {
5092        // Test a local tag like !mytag which should have handle="!" and suffix="mytag"
5093        let mut scanner = Scanner::new(StrInput::new("!mytag foo\n"));
5094
5095        loop {
5096            let tok = scanner
5097                .next_token()
5098                .expect("valid YAML must scan without errors")
5099                .expect("scanner must eventually produce a token");
5100            if let TokenType::Tag(handle, suffix) = tok.1 {
5101                assert!(
5102                    matches!(handle, Cow::Borrowed("!")),
5103                    "local tag handle should be '!', got: {handle:?}"
5104                );
5105                assert!(
5106                    matches!(suffix, Cow::Borrowed("mytag")),
5107                    "local tag suffix should be borrowed, got: {suffix:?}"
5108                );
5109                break;
5110            }
5111        }
5112    }
5113
5114    #[test]
5115    fn tag_with_uri_escape_is_owned_for_str_input() {
5116        // Test a tag with URI escape like !my%20tag - suffix must be owned due to decoding
5117        let mut scanner = Scanner::new(StrInput::new("!!my%20tag foo\n"));
5118
5119        loop {
5120            let tok = scanner
5121                .next_token()
5122                .expect("valid YAML must scan without errors")
5123                .expect("scanner must eventually produce a token");
5124            if let TokenType::Tag(handle, suffix) = tok.1 {
5125                assert!(
5126                    matches!(handle, Cow::Borrowed("!!")),
5127                    "tag handle should still be borrowed, got: {handle:?}"
5128                );
5129                assert!(
5130                    matches!(suffix, Cow::Owned(_)),
5131                    "tag suffix with URI escape should be owned, got: {suffix:?}"
5132                );
5133                assert_eq!(&*suffix, "my tag");
5134                break;
5135            }
5136        }
5137    }
5138
5139    #[test]
5140    fn flow_scalar_buffer_tracks_pending_whitespace() {
5141        let mut borrowed = super::FlowScalarBuf::new_borrowed(2);
5142
5143        borrowed.note_pending_ws(5, 8);
5144        borrowed.commit_pending_ws();
5145        assert!(matches!(
5146            borrowed,
5147            super::FlowScalarBuf::Borrowed {
5148                end: 8,
5149                pending_ws_start: None,
5150                pending_ws_end: 8,
5151                ..
5152            }
5153        ));
5154
5155        borrowed.note_pending_ws(9, 11);
5156        borrowed.discard_pending_ws();
5157        assert!(matches!(
5158            borrowed,
5159            super::FlowScalarBuf::Borrowed {
5160                end: 8,
5161                pending_ws_start: None,
5162                pending_ws_end: 8,
5163                ..
5164            }
5165        ));
5166        assert!(borrowed.as_owned_mut().is_none());
5167
5168        let mut owned = super::FlowScalarBuf::new_owned();
5169        owned.as_owned_mut().unwrap().push_str("owned");
5170        assert!(matches!(owned, super::FlowScalarBuf::Owned(ref s) if s == "owned"));
5171    }
5172
5173    fn first_scanner_error_info(input: &str) -> String {
5174        first_scanner_error(input).info().to_owned()
5175    }
5176
5177    fn first_buffered_scanner_error_info(input: &str) -> String {
5178        let mut scanner = Scanner::new(BufferedInput::new(input.chars()));
5179        loop {
5180            match scanner.next_token() {
5181                Ok(Some(_)) => {}
5182                Ok(None) => panic!("expected scanner error"),
5183                Err(error) => return error.info().to_owned(),
5184            }
5185        }
5186    }
5187
5188    fn first_scanner_error(input: &str) -> ScanError {
5189        let mut scanner = Scanner::new(StrInput::new(input));
5190        loop {
5191            match scanner.next_token() {
5192                Ok(Some(_)) => {}
5193                Ok(None) => panic!("expected scanner error"),
5194                Err(error) => return error,
5195            }
5196        }
5197    }
5198
5199    fn first_scalar_value(input: &str) -> String {
5200        let mut scanner = Scanner::new(StrInput::new(input));
5201        loop {
5202            match scanner.next_token().expect("scanner should not error") {
5203                Some(Token(_, TokenType::Scalar(_, value))) => return value.into_owned(),
5204                Some(_) => {}
5205                None => panic!("expected scalar token"),
5206            }
5207        }
5208    }
5209
5210    fn first_buffered_scalar_value(input: &str) -> String {
5211        let mut scanner = Scanner::new(BufferedInput::new(input.chars()));
5212        loop {
5213            match scanner.next_token().expect("scanner should not error") {
5214                Some(Token(_, TokenType::Scalar(_, value))) => return value.into_owned(),
5215                Some(_) => {}
5216                None => panic!("expected scalar token"),
5217            }
5218        }
5219    }
5220
5221    #[test]
5222    fn iterator_next_records_error_and_then_stays_empty() {
5223        let mut scanner = Scanner::new(StrInput::new("\"unterminated"));
5224
5225        while scanner.next().is_some() {}
5226
5227        let error = scanner
5228            .get_error()
5229            .expect("scanner should retain the error");
5230        assert_eq!(error.info(), "unclosed quote");
5231        assert!(scanner.next().is_none());
5232    }
5233
5234    #[test]
5235    fn next_token_returns_none_after_stream_end() {
5236        let mut scanner = Scanner::new(StrInput::new(""));
5237
5238        while let Some(token) = scanner.next_token().unwrap() {
5239            if matches!(token.1, TokenType::StreamEnd) {
5240                break;
5241            }
5242        }
5243
5244        assert!(scanner.stream_started());
5245        assert!(scanner.stream_ended());
5246        assert!(scanner.next_token().unwrap().is_none());
5247    }
5248
5249    #[test]
5250    fn directive_name_must_be_present() {
5251        assert_eq!(
5252            first_scanner_error_info("%\n"),
5253            "while scanning a directive, could not find expected directive name"
5254        );
5255    }
5256
5257    #[test]
5258    fn yaml_directive_requires_dot_between_version_numbers() {
5259        assert_eq!(
5260            first_scanner_error_info("%YAML 1\n"),
5261            "while scanning a YAML directive, did not find expected digit or '.' character"
5262        );
5263    }
5264
5265    #[test]
5266    fn yaml_directive_requires_major_version_number() {
5267        assert_eq!(
5268            first_scanner_error_info("%YAML .2\n"),
5269            "while scanning a YAML directive, did not find expected version number"
5270        );
5271    }
5272
5273    #[test]
5274    fn yaml_directive_rejects_extremely_long_version_number() {
5275        assert_eq!(
5276            first_scanner_error_info("%YAML 1234567890.2\n"),
5277            "while scanning a YAML directive, found extremely long version number"
5278        );
5279    }
5280
5281    #[test]
5282    fn tag_directive_handle_must_end_with_bang() {
5283        assert_eq!(
5284            first_scanner_error_info("%TAG !bad tag:example.com,2024:\n"),
5285            "while parsing a tag directive, did not find expected '!'"
5286        );
5287    }
5288
5289    #[test]
5290    fn tag_directive_handle_must_start_with_bang() {
5291        assert_eq!(
5292            first_scanner_error_info("%TAG bad! tag:example.com,2024:\n"),
5293            "while scanning a tag, did not find expected '!'"
5294        );
5295        assert_eq!(
5296            first_buffered_scanner_error_info("%TAG bad! tag:example.com,2024:\n"),
5297            "while scanning a tag, did not find expected '!'"
5298        );
5299    }
5300
5301    #[test]
5302    fn tag_directive_prefix_must_start_with_tag_character() {
5303        assert_eq!(
5304            first_scanner_error_info("%TAG !e! `bad\n"),
5305            "invalid global tag character"
5306        );
5307    }
5308
5309    #[test]
5310    fn tag_directive_prefix_must_end_before_invalid_content() {
5311        assert_eq!(
5312            first_scanner_error_info("%TAG !e! tag:example.com^suffix\n"),
5313            "while scanning TAG, did not find expected whitespace or line break"
5314        );
5315    }
5316
5317    #[test]
5318    fn tag_directive_prefix_with_uri_escape_is_owned_and_decoded() {
5319        let mut scanner =
5320            Scanner::new(StrInput::new("%TAG !e! tag:example.com,2024:some%20app/\n"));
5321
5322        loop {
5323            let token = scanner
5324                .next_token()
5325                .expect("valid directive should scan")
5326                .expect("scanner must produce a directive token");
5327            if let TokenType::TagDirective(handle, prefix) = token.1 {
5328                assert!(matches!(handle, Cow::Borrowed("!e!")));
5329                assert!(matches!(prefix, Cow::Owned(_)));
5330                assert_eq!(&*prefix, "tag:example.com,2024:some app/");
5331                break;
5332            }
5333        }
5334    }
5335
5336    #[test]
5337    fn bare_bang_tag_scans_as_non_specific_tag() {
5338        let mut scanner = Scanner::new(StrInput::new("! foo\n"));
5339
5340        loop {
5341            let token = scanner
5342                .next_token()
5343                .expect("valid tag should scan")
5344                .expect("scanner must produce a tag token");
5345            if let TokenType::Tag(handle, suffix) = token.1 {
5346                assert_eq!(&*handle, "");
5347                assert_eq!(&*suffix, "!");
5348                break;
5349            }
5350        }
5351    }
5352
5353    #[test]
5354    fn tag_requires_separation_after_suffix() {
5355        assert_eq!(
5356            first_scanner_error_info("!foo,bar\n"),
5357            "while scanning a tag, did not find expected whitespace or line break"
5358        );
5359    }
5360
5361    #[test]
5362    fn verbatim_tag_requires_uri() {
5363        assert_eq!(
5364            first_scanner_error_info("!<> foo\n"),
5365            "while parsing a tag, did not find expected tag URI"
5366        );
5367    }
5368
5369    #[test]
5370    fn verbatim_tag_requires_closing_angle_bracket() {
5371        assert_eq!(
5372            first_scanner_error_info("!<tag:yaml.org,2002:str foo\n"),
5373            "while scanning a verbatim tag, did not find the expected '>'"
5374        );
5375    }
5376
5377    #[test]
5378    fn tag_uri_escape_requires_hex_digits() {
5379        assert_eq!(
5380            first_scanner_error_info("!!bad%zz foo\n"),
5381            "while parsing a tag, found an invalid escape sequence"
5382        );
5383    }
5384
5385    #[test]
5386    fn tag_uri_escape_rejects_bad_leading_utf8_byte() {
5387        assert_eq!(
5388            first_scanner_error_info("!!bad%80 foo\n"),
5389            "while parsing a tag, found an incorrect leading UTF-8 byte"
5390        );
5391    }
5392
5393    #[test]
5394    fn tag_uri_escape_rejects_bad_trailing_utf8_byte() {
5395        assert_eq!(
5396            first_scanner_error_info("!!bad%C2%41 foo\n"),
5397            "while parsing a tag, found an incorrect trailing UTF-8 byte"
5398        );
5399    }
5400
5401    #[test]
5402    fn tag_uri_escape_rejects_invalid_utf8_codepoint() {
5403        assert_eq!(
5404            first_scanner_error_info("!!bad%F4%90%80%80 foo\n"),
5405            "while parsing a tag, found an invalid UTF-8 codepoint"
5406        );
5407    }
5408
5409    #[test]
5410    fn anchors_and_aliases_require_names() {
5411        let expected =
5412            "while scanning an anchor or alias, did not find expected alphabetic or numeric character";
5413
5414        assert_eq!(first_scanner_error_info("& \n"), expected);
5415        assert_eq!(first_scanner_error_info("* \n"), expected);
5416    }
5417
5418    #[test]
5419    fn document_end_marker_rejects_trailing_content() {
5420        assert_eq!(
5421            first_scanner_error_info("... trailing\n"),
5422            "invalid content after document end marker"
5423        );
5424    }
5425
5426    #[test]
5427    fn reserved_indicators_are_rejected_outside_directives() {
5428        assert_eq!(
5429            first_scanner_error_info(" @\n"),
5430            "unexpected character: `@'"
5431        );
5432    }
5433
5434    #[test]
5435    fn flow_block_entry_indicator_is_rejected() {
5436        assert_eq!(
5437            first_scanner_error_info("[- ]\n"),
5438            r#""-" is only valid inside a block"#
5439        );
5440    }
5441
5442    #[test]
5443    fn block_entry_after_tabbed_separator_reports_specific_error() {
5444        assert_eq!(
5445            first_scanner_error_info("-\t- value\n"),
5446            "'-' must be followed by a valid YAML whitespace"
5447        );
5448    }
5449
5450    #[test]
5451    fn document_indicator_reports_unclosed_flow_collection() {
5452        assert_eq!(first_scanner_error_info("[\n---\n"), "unclosed bracket '['");
5453    }
5454
5455    #[test]
5456    fn block_scalar_header_rejects_trailing_content() {
5457        assert_eq!(
5458            first_scanner_error_info("|+ trailing\n"),
5459            "while scanning a block scalar, did not find expected comment or line break"
5460        );
5461    }
5462
5463    #[test]
5464    fn block_scalar_rejects_zero_indent_indicator() {
5465        let expected = "while scanning a block scalar, found an indentation indicator equal to 0";
5466
5467        assert_eq!(first_scanner_error_info("|0\n"), expected);
5468        assert_eq!(first_scanner_error_info("|+0\n"), expected);
5469    }
5470
5471    #[test]
5472    fn empty_block_scalar_at_eof_honors_chomping() {
5473        assert_eq!(first_scalar_value("|\n"), "");
5474        assert_eq!(first_scalar_value("|-\n"), "");
5475        assert_eq!(first_scalar_value("|+\n"), "");
5476        assert_eq!(first_scalar_value("|+\n\n"), "\n");
5477        assert_eq!(first_scalar_value("|+\n   "), "\n");
5478    }
5479
5480    #[test]
5481    fn buffered_block_scalar_reads_content_past_lookahead_window() {
5482        assert_eq!(
5483            first_buffered_scalar_value("|\n  abcdefghijklmnopqrstuvwxyz\n"),
5484            "abcdefghijklmnopqrstuvwxyz\n"
5485        );
5486    }
5487
5488    #[test]
5489    fn explicit_indent_block_scalar_can_end_at_document_marker() {
5490        assert_eq!(first_scalar_value("|1\n...\n"), "");
5491    }
5492
5493    #[test]
5494    fn root_explicit_indent_block_scalar_rejects_underindented_content() {
5495        assert_eq!(
5496            first_scanner_error_info("|2\nx\n"),
5497            "wrongly indented line in block scalar"
5498        );
5499    }
5500
5501    #[test]
5502    fn quoted_scalar_rejects_document_indicator_at_line_start() {
5503        assert_eq!(
5504            first_scanner_error_info("\"one\n---\ntwo\"\n"),
5505            "while scanning a quoted scalar, found unexpected document indicator"
5506        );
5507    }
5508
5509    #[test]
5510    fn quoted_scalar_rejects_tab_indentation_after_line_break() {
5511        assert_eq!(
5512            first_scanner_error_info("a: \"one\n\tbad\"\n"),
5513            "tab cannot be used as indentation"
5514        );
5515    }
5516
5517    #[test]
5518    fn quoted_scalar_rejects_underindented_continuation() {
5519        assert_eq!(
5520            first_scanner_error_info("a: \"one\nbad\"\n"),
5521            "invalid indentation in multiline quoted scalar"
5522        );
5523    }
5524
5525    #[test]
5526    fn quoted_scalar_trailing_content_error_names_quote_style() {
5527        assert_eq!(
5528            first_scanner_error_info("'foo' trailing\n"),
5529            "invalid trailing content after single-quoted scalar"
5530        );
5531        assert_eq!(
5532            first_scanner_error_info("\"foo\" trailing\n"),
5533            "invalid trailing content after double-quoted scalar"
5534        );
5535    }
5536
5537    #[test]
5538    fn quoted_scalar_escape_errors_cover_hex_and_surrogate_edges() {
5539        assert_eq!(
5540            first_scanner_error_info("\"\\xG0\"\n"),
5541            "while parsing a quoted scalar, did not find expected hexadecimal number"
5542        );
5543        assert_eq!(
5544            first_scanner_error_info("\"\\uD800\\uGGGG\"\n"),
5545            "while parsing a quoted scalar, did not find expected hexadecimal number for low surrogate"
5546        );
5547        assert_eq!(
5548            first_scanner_error_info("\"\\uD800\\u0041\"\n"),
5549            "while parsing a quoted scalar, found invalid low surrogate"
5550        );
5551        assert_eq!(
5552            first_scanner_error_info("\"\\U00110000\"\n"),
5553            "while parsing a quoted scalar, found invalid Unicode character escape code"
5554        );
5555    }
5556
5557    #[test]
5558    fn indented_flow_scalar_reports_invalid_indentation() {
5559        assert_eq!(
5560            first_scanner_error_info("a:\n  [\nfoo]\n"),
5561            "invalid indentation"
5562        );
5563    }
5564
5565    #[test]
5566    fn required_simple_key_requires_value_at_stream_end() {
5567        let error = first_scanner_error("a:\n&b\n- c\n");
5568
5569        assert_eq!(error.info(), "simple key expected ':'");
5570        assert_eq!(error.marker().index(), 3);
5571        assert_eq!(error.marker().line(), 2);
5572        assert_eq!(error.marker().col(), 0);
5573        assert_eq!(
5574            alloc::format!("{error}"),
5575            "simple key expected ':' at char 3 line 2 column 1"
5576        );
5577    }
5578
5579    #[test]
5580    fn plain_scalar_rejects_dash_before_flow_indicator() {
5581        assert_eq!(
5582            first_scanner_error_info("[-]\n"),
5583            "plain scalar cannot start with '-' followed by ,[]{}"
5584        );
5585    }
5586
5587    #[test]
5588    fn explicit_key_rejects_tab_after_indicator() {
5589        assert_eq!(
5590            first_scanner_error_info("? \tfoo\n"),
5591            "tabs disallowed in this context"
5592        );
5593    }
5594
5595    #[test]
5596    fn flow_mapping_rejects_adjacent_collection_value_after_plain_key() {
5597        assert_eq!(
5598            first_scanner_error_info("[a:[]]\n"),
5599            "':' may not precede any of `[{` in flow mapping"
5600        );
5601    }
5602
5603    #[test]
5604    fn implicit_flow_mapping_colon_cannot_move_to_next_line() {
5605        assert_eq!(
5606            first_scanner_error_info("[foo\n: bar]\n"),
5607            "illegal placement of ':' indicator"
5608        );
5609    }
5610
5611    #[test]
5612    fn stale_simple_key_token_position_is_a_scan_error() {
5613        let mut scanner = Scanner::new(StrInput::new(": value\n"));
5614        scanner.fetch_stream_start();
5615        scanner.tokens.clear();
5616        scanner.tokens_parsed = 1;
5617
5618        let simple_key = scanner
5619            .simple_keys
5620            .last_mut()
5621            .expect("stream start should create a simple key slot");
5622        simple_key.possible = true;
5623        simple_key.token_number = 0;
5624
5625        let error = scanner
5626            .fetch_value()
5627            .expect_err("stale simple key should be reported as a scan error");
5628        assert_eq!(error.info(), "simple key is no longer valid");
5629    }
5630
5631    #[test]
5632    fn issue14_alias_scanner_consumes_colon_as_name_character() {
5633        let mut scanner = Scanner::new(StrInput::new("*foo: bar\n"));
5634
5635        assert!(matches!(
5636            scanner.next_token().unwrap().unwrap().1,
5637            TokenType::StreamStart(_)
5638        ));
5639
5640        let token = scanner.next_token().unwrap().unwrap();
5641
5642        assert!(
5643            matches!(token.1, TokenType::Alias(ref name) if name.as_ref() == "foo:"),
5644            "expected `*foo: bar` to start with Alias(\"foo:\"), got {token:?}"
5645        );
5646    }
5647
5648    #[test]
5649    fn issue14_anchor_scanner_consumes_colon_as_name_character() {
5650        let mut scanner = Scanner::new(StrInput::new("&foo: bar\n"));
5651
5652        assert!(matches!(
5653            scanner.next_token().unwrap().unwrap().1,
5654            TokenType::StreamStart(_)
5655        ));
5656
5657        let token = scanner.next_token().unwrap().unwrap();
5658
5659        assert!(
5660            matches!(token.1, TokenType::Anchor(ref name) if name.as_ref() == "foo:"),
5661            "expected `&foo: bar` to start with Anchor(\"foo:\"), got {token:?}"
5662        );
5663    }
5664}
granit_parser/scanner.rs

granit_parser/
scanner.rs