Skip to main content

granit_parser/
scanner.rs

1//! Home to the YAML Scanner.
2//!
3//! The scanner is the lowest-level parsing utility. It is the lexer / tokenizer, reading input a
4//! character at a time and emitting tokens that can later be interpreted by the [`crate::parser`]
5//! to check for more context and validity.
6//!
7//! Due to the grammar of YAML, the scanner has to have some context and is not error-free.
8
9#![allow(clippy::cast_possible_wrap)]
10#![allow(clippy::cast_sign_loss)]
11
12use alloc::{
13    borrow::{Cow, ToOwned},
14    collections::VecDeque,
15    string::String,
16    vec::Vec,
17};
18use core::{char, fmt};
19
20use crate::{
21    char_traits::{
22        as_hex, is_anchor_char, is_blank_or_breakz, is_bom, is_break, is_breakz, is_flow, is_hex,
23        is_tag_char, is_uri_char,
24    },
25    input::{BorrowedInput, SkipTabs},
26};
27
/// Upper bound, in characters, on how far the scanner may look ahead while it is deciding
/// whether a token is a simple key.
const SIMPLE_KEY_MAX_LOOKAHEAD: usize = 1_024;
30
/// The character encoding of the scanned input.
///
/// UTF-8 is currently the only supported encoding.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TEncoding {
    /// The input is encoded as UTF-8.
    Utf8,
}
37
/// How a YAML scalar was written in the source.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum ScalarStyle {
    /// A plain (unquoted) scalar.
    Plain,
    /// A scalar enclosed in single quotes.
    SingleQuoted,
    /// A scalar enclosed in double quotes.
    DoubleQuoted,

    /// A literal block scalar (introduced by `|`).
    ///
    /// See [8.1.2](https://yaml.org/spec/1.2.2/#812-literal-style).
    /// Every indented character of a literal block is content, white space included. Characters
    /// cannot be escaped and long lines cannot be broken.
    Literal,
    /// A folded block scalar (introduced by `>`).
    ///
    /// See [8.1.3](https://yaml.org/spec/1.2.2/#813-folded-style).
    /// Every indented character of a folded block is content, white space included. Characters
    /// cannot be escaped, but the content is subject to line folding, which allows breaking long
    /// lines.
    Folded,
}
62
/// The offsets stored inside a [`Marker`].
///
/// YAML input may be a complete `&str` (stable backing storage) or a streaming character source.
/// With a stable input both a character index and a byte offset can be tracked. With a streaming
/// input a byte offset is generally not useful (it may not map to any meaningful underlying
/// file or source), which is why the byte offset is optional.
#[derive(Debug, Default, Clone, Copy)]
pub struct MarkerOffsets {
    /// Position in the source, counted in characters.
    chars: usize,
    /// Position in the source, counted in bytes, when it is known.
    bytes: Option<usize>,
}
76
77impl PartialEq for MarkerOffsets {
78    fn eq(&self, other: &Self) -> bool {
79        // Byte offsets are an optional diagnostic enhancement and may differ between input
80        // backends (e.g., `&str` vs streaming). Equality is therefore based on the character
81        // position only.
82        self.chars == other.chars
83    }
84}
85
86impl Eq for MarkerOffsets {}
87
88/// A location in a YAML document.
89#[derive(Clone, Copy, PartialEq, Debug, Eq, Default)]
90pub struct Marker {
91    /// Offsets in the source.
92    offsets: MarkerOffsets,
93    /// The line (1-indexed).
94    line: usize,
95    /// The column (0-indexed).
96    col: usize,
97}
98
99impl Marker {
100    /// Create a new [`Marker`] at the given position.
101    #[must_use]
102    pub fn new(index: usize, line: usize, col: usize) -> Marker {
103        Marker {
104            offsets: MarkerOffsets {
105                chars: index,
106                bytes: None,
107            },
108            line,
109            col,
110        }
111    }
112
113    /// Return a copy of the marker with the given optional byte offset.
114    #[must_use]
115    pub fn with_byte_offset(mut self, byte_offset: Option<usize>) -> Marker {
116        self.offsets.bytes = byte_offset;
117        self
118    }
119
120    /// Return the index (in characters) of the marker in the source.
121    #[must_use]
122    pub fn index(&self) -> usize {
123        self.offsets.chars
124    }
125
126    /// Return the byte offset of the marker in the source, if available.
127    #[must_use]
128    pub fn byte_offset(&self) -> Option<usize> {
129        self.offsets.bytes
130    }
131
132    /// Return the line of the marker in the source.
133    #[must_use]
134    pub fn line(&self) -> usize {
135        self.line
136    }
137
138    /// Return the column of the marker in the source.
139    #[must_use]
140    pub fn col(&self) -> usize {
141        self.col
142    }
143}
144
145/// A range of locations in a YAML document.
146#[derive(Clone, Copy, PartialEq, Debug, Eq, Default)]
147pub struct Span {
148    /// The start (inclusive) of the range.
149    pub start: Marker,
150    /// The end (exclusive) of the range.
151    pub end: Marker,
152
153    /// Optional indentation hint associated with this span.
154    ///
155    /// This is only meaningful for certain parser-emitted events (notably: block mapping keys).
156    /// When indentation is not meaningful or cannot be provided, it must be `None`.
157    pub indent: Option<usize>,
158}
159
160impl Span {
161    /// Create a new [`Span`] for the given range.
162    #[must_use]
163    pub fn new(start: Marker, end: Marker) -> Span {
164        Span {
165            start,
166            end,
167            indent: None,
168        }
169    }
170
171    /// Create an empty [`Span`] at a given location.
172    ///
173    /// An empty span doesn't contain any characters, but its position may still be meaningful.
174    /// For example, for an indented sequence [`SequenceEnd`] has a location but an empty span.
175    ///
176    /// [`SequenceEnd`]: crate::Event::SequenceEnd
177    #[must_use]
178    pub fn empty(mark: Marker) -> Span {
179        Span {
180            start: mark,
181            end: mark,
182            indent: None,
183        }
184    }
185
186    /// Return a copy of this [`Span`] with the given indentation hint.
187    #[must_use]
188    pub fn with_indent(mut self, indent: Option<usize>) -> Span {
189        self.indent = indent;
190        self
191    }
192
193    /// Return the length of the span (in characters).
194    #[must_use]
195    pub fn len(&self) -> usize {
196        self.end.index() - self.start.index()
197    }
198
199    /// Return whether the [`Span`] has a length of zero.
200    #[must_use]
201    pub fn is_empty(&self) -> bool {
202        self.len() == 0
203    }
204
205    /// Return the byte range of the span, if available.
206    #[must_use]
207    pub fn byte_range(&self) -> Option<core::ops::Range<usize>> {
208        let start = self.start.byte_offset()?;
209        let end = self.end.byte_offset()?;
210        Some(start..end)
211    }
212
213    /// Return the source text covered by this span, if byte offsets are available
214    /// and the range is valid for the provided input.
215    #[must_use]
216    pub fn slice<'source>(&self, source: &'source str) -> Option<&'source str> {
217        source.get(self.byte_range()?)
218    }
219}
220
/// Where a YAML source comment sits relative to the surrounding YAML.
///
/// These are the placements the parser currently recognizes:
///
/// ```yaml
/// # Above
/// key: value # Right
///
/// # Free
///
/// next: value
///
/// # Last
/// ```
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum Placement {
    /// A comment on its own line, directly before another YAML token.
    ///
    /// Such a comment usually describes the node that follows it. A run of own-line comments
    /// with no blank line between them is `Above` as a whole, so that a comment block attaches
    /// to the next YAML element as a group.
    Above,
    /// A comment on the same line as, and after, YAML content or syntax, such as
    /// `key: value # Right`, or `- # Right` for an empty sequence entry.
    Right,
    /// A comment on its own line that is separated from the YAML tokens around it.
    ///
    /// This is the fallback placement: the comment is not on the same line as content, is not
    /// immediately above a following token, and is not the final comment of the stream.
    /// Consumers should treat a `Free` comment as having no obvious neighboring node.
    #[default]
    Free,
    /// A comment on its own line at the very end of the input stream.
    ///
    /// Blank lines may follow a `Last` comment, but no further YAML token appears before
    /// `StreamEnd`.
    Last,
}
259
260/// A YAML comment captured from the source.
261///
262/// Comments are presentation metadata, not YAML data. This type carries the raw comment payload,
263/// source span, and a best-effort [`Placement`] hint for callers that want to correlate comments
264/// with nearby YAML presentation.
265#[derive(Clone, PartialEq, Debug, Eq)]
266pub struct Comment<'input> {
267    /// Span covering the whole source comment, including `#` and excluding the line break.
268    pub span: Span,
269    /// Raw comment payload exactly after `#`, excluding only the line break.
270    ///
271    /// Leading spaces are preserved, including a single space immediately after `#` when present.
272    pub text: Cow<'input, str>,
273    /// Best-effort placement of this comment relative to nearby YAML content.
274    pub placement: Placement,
275}
276
277impl<'input> Comment<'input> {
278    /// Create a captured YAML comment from a source span and raw payload.
279    ///
280    /// The placement defaults to [`Placement::Free`]. Use [`Comment::with_placement`] when the
281    /// caller already knows a more specific placement.
282    #[must_use]
283    pub fn new(span: Span, text: impl Into<Cow<'input, str>>) -> Self {
284        Self {
285            span,
286            text: text.into(),
287            placement: Placement::Free,
288        }
289    }
290
291    /// Return this comment with the given placement.
292    #[must_use]
293    pub fn with_placement(mut self, placement: Placement) -> Self {
294        self.placement = placement;
295        self
296    }
297
298    /// Return the comment payload with surrounding whitespace removed.
299    ///
300    /// This helper is ergonomic only. The raw [`Self::text`] payload remains unchanged.
301    #[must_use]
302    pub fn trimmed_text(&self) -> &str {
303        self.text.trim()
304    }
305}
306
307impl AsRef<str> for Comment<'_> {
308    fn as_ref(&self) -> &str {
309        self.text.as_ref()
310    }
311}
312
313/// An error that occurred while scanning.
314#[derive(Clone, PartialEq, Debug, Eq)]
315pub struct ScanError {
316    /// The position at which the error happened in the source.
317    mark: Marker,
318    /// Human-readable details about the error.
319    info: String,
320}
321
322impl ScanError {
323    /// Create a new error from a location and an error string.
324    #[must_use]
325    #[cold]
326    pub fn new(loc: Marker, info: String) -> ScanError {
327        ScanError { mark: loc, info }
328    }
329
330    /// Convenience alias for string slices.
331    #[must_use]
332    #[cold]
333    pub fn new_str(loc: Marker, info: &str) -> ScanError {
334        ScanError {
335            mark: loc,
336            info: info.to_owned(),
337        }
338    }
339
340    #[cold]
341    pub(crate) fn into_result<T>(self) -> Result<T, ScanError> {
342        Err(self)
343    }
344
345    /// Return the marker pointing to the error in the source.
346    #[must_use]
347    pub fn marker(&self) -> &Marker {
348        &self.mark
349    }
350
351    /// Return the information string describing the error that happened.
352    #[must_use]
353    pub fn info(&self) -> &str {
354        self.info.as_ref()
355    }
356}
357
358impl fmt::Display for ScanError {
359    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
360        write!(
361            f,
362            "{} at char {} line {} column {}",
363            self.info,
364            self.mark.index(),
365            self.mark.line(),
366            self.mark.col() + 1
367        )
368    }
369}
370
371impl core::error::Error for ScanError {}
372
373/// The contents of a scanner token.
374#[derive(Clone, PartialEq, Debug, Eq)]
375pub enum TokenType<'input> {
376    /// The start of the stream. Sent first, before even [`TokenType::DocumentStart`].
377    StreamStart(TEncoding),
378    /// The end of the stream, EOF.
379    StreamEnd,
380    /// A YAML version directive.
381    VersionDirective(
382        /// Major version number.
383        u32,
384        /// Minor version number.
385        u32,
386    ),
387    /// A YAML tag directive (e.g.: `!!str`, `!foo!bar`, ...).
388    TagDirective(
389        /// Tag directive handle, such as `!` or `!app!`.
390        Cow<'input, str>,
391        /// Tag URI prefix associated with the handle.
392        Cow<'input, str>,
393    ),
394    /// The start of a YAML document (`---`).
395    DocumentStart,
396    /// The end of a YAML document (`...`).
397    DocumentEnd,
398    /// The start of a sequence block.
399    ///
400    /// Sequence blocks are arrays starting with a `-`.
401    BlockSequenceStart,
402    /// The start of a block mapping.
403    ///
404    /// Block mappings are key-value collections written with `key: value` entries.
405    BlockMappingStart,
406    /// End of the corresponding `BlockSequenceStart` or `BlockMappingStart`.
407    BlockEnd,
408    /// Start of an inline sequence (`[ a, b ]`).
409    FlowSequenceStart,
410    /// End of an inline sequence.
411    FlowSequenceEnd,
412    /// Start of an inline mapping (`{ a: b, c: d }`).
413    FlowMappingStart,
414    /// End of an inline mapping.
415    FlowMappingEnd,
416    /// An entry in a block sequence (see [`TokenType::BlockSequenceStart`]).
417    BlockEntry,
418    /// An entry in a flow sequence (see [`TokenType::FlowSequenceStart`]).
419    FlowEntry,
420    /// A key in a mapping.
421    Key,
422    /// A value in a mapping.
423    Value,
424    /// A reference to a previously defined anchor.
425    Alias(Cow<'input, str>),
426    /// A YAML anchor definition introduced by `&`.
427    Anchor(Cow<'input, str>),
428    /// A YAML tag (starting with bangs `!`).
429    Tag(
430        /// The handle of the tag.
431        Cow<'input, str>,
432        /// The suffix of the tag.
433        Cow<'input, str>,
434    ),
435    /// A regular YAML scalar.
436    Scalar(ScalarStyle, Cow<'input, str>),
437    /// A YAML source comment.
438    ///
439    /// The token payload carries the raw text exactly after `#`, the source span, and an initial
440    /// [`Placement`] hint. The token's companion [`Span`] is the same as [`Comment::span`].
441    Comment(
442        /// Captured comment metadata.
443        Comment<'input>,
444    ),
445    /// A reserved YAML directive.
446    ReservedDirective(
447        /// Directive name.
448        String,
449        /// Directive parameters, split on YAML whitespace.
450        Vec<String>,
451    ),
452}
453
454/// A scanner token.
455#[derive(Clone, PartialEq, Debug, Eq)]
456pub struct Token<'input>(
457    /// Source span covered by this token.
458    pub Span,
459    /// Token payload emitted by the scanner.
460    pub TokenType<'input>,
461);
462
463/// Compact comment metadata used only inside the scanner queue.
464///
465/// The queued token already stores the source span, so storing a full public [`Comment`] there
466/// duplicates a large [`Span`] and inflates every queued token.
467#[derive(Clone, PartialEq, Debug, Eq)]
468pub(crate) struct QueuedComment<'input> {
469    pub(crate) text: Cow<'input, str>,
470    pub(crate) placement: Placement,
471}
472
473impl<'input> QueuedComment<'input> {
474    fn into_public(self, span: Span) -> Comment<'input> {
475        Comment::new(span, self.text).with_placement(self.placement)
476    }
477}
478
479impl<'input> From<Comment<'input>> for QueuedComment<'input> {
480    fn from(comment: Comment<'input>) -> Self {
481        Self {
482            text: comment.text,
483            placement: comment.placement,
484        }
485    }
486}
487
488/// Token payload used in the scanner's internal queue.
489///
490/// This mirrors [`TokenType`] but stores comments without their span. Public [`Token`] values are
491/// reconstructed when the scanner emits them.
492#[derive(Clone, PartialEq, Debug, Eq)]
493pub(crate) enum QueuedTokenType<'input> {
494    StreamStart(TEncoding),
495    StreamEnd,
496    VersionDirective(u32, u32),
497    TagDirective(Cow<'input, str>, Cow<'input, str>),
498    DocumentStart,
499    DocumentEnd,
500    BlockSequenceStart,
501    BlockMappingStart,
502    BlockEnd,
503    FlowSequenceStart,
504    FlowSequenceEnd,
505    FlowMappingStart,
506    FlowMappingEnd,
507    BlockEntry,
508    FlowEntry,
509    Key,
510    Value,
511    Alias(Cow<'input, str>),
512    Anchor(Cow<'input, str>),
513    Tag(Cow<'input, str>, Cow<'input, str>),
514    Scalar(ScalarStyle, Cow<'input, str>),
515    Comment(QueuedComment<'input>),
516    ReservedDirective(String, Vec<String>),
517}
518
519impl<'input> QueuedTokenType<'input> {
520    fn into_public(self, span: Span) -> TokenType<'input> {
521        match self {
522            Self::StreamStart(encoding) => TokenType::StreamStart(encoding),
523            Self::StreamEnd => TokenType::StreamEnd,
524            Self::VersionDirective(major, minor) => TokenType::VersionDirective(major, minor),
525            Self::TagDirective(handle, prefix) => TokenType::TagDirective(handle, prefix),
526            Self::DocumentStart => TokenType::DocumentStart,
527            Self::DocumentEnd => TokenType::DocumentEnd,
528            Self::BlockSequenceStart => TokenType::BlockSequenceStart,
529            Self::BlockMappingStart => TokenType::BlockMappingStart,
530            Self::BlockEnd => TokenType::BlockEnd,
531            Self::FlowSequenceStart => TokenType::FlowSequenceStart,
532            Self::FlowSequenceEnd => TokenType::FlowSequenceEnd,
533            Self::FlowMappingStart => TokenType::FlowMappingStart,
534            Self::FlowMappingEnd => TokenType::FlowMappingEnd,
535            Self::BlockEntry => TokenType::BlockEntry,
536            Self::FlowEntry => TokenType::FlowEntry,
537            Self::Key => TokenType::Key,
538            Self::Value => TokenType::Value,
539            Self::Alias(name) => TokenType::Alias(name),
540            Self::Anchor(name) => TokenType::Anchor(name),
541            Self::Tag(handle, suffix) => TokenType::Tag(handle, suffix),
542            Self::Scalar(style, value) => TokenType::Scalar(style, value),
543            Self::Comment(comment) => TokenType::Comment(comment.into_public(span)),
544            Self::ReservedDirective(name, params) => TokenType::ReservedDirective(name, params),
545        }
546    }
547}
548
549impl<'input> From<TokenType<'input>> for QueuedTokenType<'input> {
550    fn from(token: TokenType<'input>) -> Self {
551        match token {
552            TokenType::StreamStart(encoding) => Self::StreamStart(encoding),
553            TokenType::StreamEnd => Self::StreamEnd,
554            TokenType::VersionDirective(major, minor) => Self::VersionDirective(major, minor),
555            TokenType::TagDirective(handle, prefix) => Self::TagDirective(handle, prefix),
556            TokenType::DocumentStart => Self::DocumentStart,
557            TokenType::DocumentEnd => Self::DocumentEnd,
558            TokenType::BlockSequenceStart => Self::BlockSequenceStart,
559            TokenType::BlockMappingStart => Self::BlockMappingStart,
560            TokenType::BlockEnd => Self::BlockEnd,
561            TokenType::FlowSequenceStart => Self::FlowSequenceStart,
562            TokenType::FlowSequenceEnd => Self::FlowSequenceEnd,
563            TokenType::FlowMappingStart => Self::FlowMappingStart,
564            TokenType::FlowMappingEnd => Self::FlowMappingEnd,
565            TokenType::BlockEntry => Self::BlockEntry,
566            TokenType::FlowEntry => Self::FlowEntry,
567            TokenType::Key => Self::Key,
568            TokenType::Value => Self::Value,
569            TokenType::Alias(name) => Self::Alias(name),
570            TokenType::Anchor(name) => Self::Anchor(name),
571            TokenType::Tag(handle, suffix) => Self::Tag(handle, suffix),
572            TokenType::Scalar(style, value) => Self::Scalar(style, value),
573            TokenType::Comment(comment) => Self::Comment(comment.into()),
574            TokenType::ReservedDirective(name, params) => Self::ReservedDirective(name, params),
575        }
576    }
577}
578
579/// A compact token stored by the scanner before it is emitted publicly.
580#[derive(Clone, PartialEq, Debug, Eq)]
581pub(crate) struct QueuedToken<'input>(pub(crate) Span, pub(crate) QueuedTokenType<'input>);
582
583impl<'input> QueuedToken<'input> {
584    fn into_public(self) -> Token<'input> {
585        Token(self.0, self.1.into_public(self.0))
586    }
587}
588
589impl<'input> From<Token<'input>> for QueuedToken<'input> {
590    fn from(token: Token<'input>) -> Self {
591        Self(token.0, token.1.into())
592    }
593}
594
595/// A scalar that was parsed and may correspond to a simple key.
596///
597/// Upon scanning the following YAML:
598/// ```yaml
599/// a: b
600/// ```
601/// We do not know that `a` is a key for a map until we have reached the following `:`. For this
602/// YAML, we would store `a` as a scalar token in the [`Scanner`], but not emit it yet. It would be
603/// kept inside the scanner until more context is fetched and we are able to know whether it is a
604/// plain scalar or a key.
605///
606/// For example, see the following two YAML documents:
607/// ```yaml
608/// ---
609/// a: b # Here, `a` is a key.
610/// ...
611/// ---
612/// a # Here, `a` is a plain scalar.
613/// ...
614/// ```
615/// An instance of [`SimpleKey`] is created in the [`Scanner`] when such ambiguity occurs.
616///
617/// In both documents, scanning `a` would lead to the creation of a [`SimpleKey`] with
618/// [`Self::possible`] set to `true`. The token for `a` would be pushed in the [`Scanner`] but not
619/// yet emitted. Instead, more context would be fetched (through [`Scanner::fetch_more_tokens`]).
620///
621/// In the first document, upon reaching the `:`, the [`SimpleKey`] would be inspected and our
622/// scalar `a` since it is a possible key, would be "turned" into a key. This is done by prepending
623/// a [`TokenType::Key`] to our scalar token in the [`Scanner`]. This way, the
624/// [`crate::parser::Parser`] would read the [`TokenType::Key`] token before the
625/// [`TokenType::Scalar`] token.
626///
627/// In the second document however, reaching EOF would mark the [`SimpleKey`] as no longer possible,
628/// and no [`TokenType::Key`] would be emitted by the scanner.
629#[derive(Clone, PartialEq, Debug, Eq)]
630struct SimpleKey {
631    /// Whether the token this [`SimpleKey`] refers to may still be a key.
632    ///
633    /// Sometimes, when we have more context, we notice that what we thought could be a key no
634    /// longer can be. In that case, [`Self::possible`] is set to `false`.
635    ///
636    /// For instance, let us consider the following invalid YAML:
637    /// ```yaml
638    /// key
639    ///   : value
640    /// ```
641    /// Upon reading the `\n` after `key`, the [`SimpleKey`] that was created for `key` is no longer
642    /// possible and [`Self::possible`] is set to `false`.
643    possible: bool,
644    /// Whether the token this [`SimpleKey`] refers to is required to be a key.
645    ///
646    /// With more context, we may know for sure that the token must be a key. If later input makes
647    /// that impossible, the scanner must report an error instead of silently treating the token as a
648    /// plain scalar.
649    ///
650    /// This happens for simple keys at the current block indentation where the surrounding
651    /// collection requires the next token to be a mapping key.
652    required: bool,
653    /// The index of the token referred to by the [`SimpleKey`].
654    ///
655    /// This is the index in the scanner, which takes into account both the tokens that have been
656    /// emitted and those about to be emitted. See [`Scanner::tokens_parsed`] and
657    /// [`Scanner::tokens`] for more details.
658    token_number: usize,
659    /// The position at which the token the [`SimpleKey`] refers to is.
660    mark: Marker,
661}
662
663impl SimpleKey {
664    /// Create a new [`SimpleKey`] at the given `Marker` and with the given flow level.
665    fn new(mark: Marker) -> SimpleKey {
666        SimpleKey {
667            possible: false,
668            required: false,
669            token_number: 0,
670            mark,
671        }
672    }
673}
674
/// One entry of the stack of indentation levels.
#[derive(Debug, Default, Clone)]
struct Indent {
    /// The former indentation level.
    indent: isize,
    /// Whether closing this indentation level produces a `BlockEnd` token.
    ///
    /// Some indentation levels do not start a block. For instance:
    /// ```yaml
    /// -
    ///   foo # ok
    /// -
    /// bar # ko, bar needs to be indented further than the `-`.
    /// - [
    ///  baz, # ok
    /// quux # ko, quux needs to be indented further than the '-'.
    /// ] # ko, the closing bracket needs to be indented further than the `-`.
    /// ```
    ///
    /// The indentation level opened by a `-` belongs to a single entry of the sequence. If a
    /// `BlockEnd` were emitted each time such a level ends, there would be one `BlockEnd` per
    /// entry, whereas the sequence must be ended by exactly one.
    needs_block_end: bool,
}
699
/// What is known about an implicit mapping.
///
/// An implicit mapping is a mapping inside a flow sequence whose opening `{` was left out:
/// ```yaml
/// [ a: b, c: d ]
/// # Equivalent to
/// [ { a: b }, { c: d } ]
/// # Equivalent to
/// - a: b
/// - c: d
/// ```
///
/// This state has to be tracked with care for every nested flow sequence. In the example above,
/// a [`FlowMappingStart`] event must be emitted on reaching `a` and `c` although no character
/// announces it, and a [`FlowMappingEnd`] event must be emitted on reaching the `,` or the `]`.
/// With incorrect tracking, these events could be left out or emitted out of order.
///
/// [`FlowMappingStart`]: TokenType::FlowMappingStart
/// [`FlowMappingEnd`]: TokenType::FlowMappingEnd
#[derive(Debug, PartialEq)]
enum ImplicitMappingState {
    /// An implicit mapping may follow.
    ///
    /// This is the state right after the opening `[`: more context is needed to tell whether
    /// an implicit mapping follows.
    Possible,
    /// The scanner is inside an implicit mapping.
    ///
    /// This state is not entered right away: the `:` must have been seen first.
    ///
    /// NOTE(review): the meaning of the `u8` payload is not documented here; confirm it
    /// against the code that builds this variant.
    Inside(u8),
}
733
734/// The YAML scanner.
735///
736/// This corresponds to the low-level interface when reading YAML. The scanner emits tokens as they
737/// are read (akin to a lexer), but it also holds sufficient context to be able to disambiguate
738/// some of the constructs. It has understanding of indentation and whitespace and is able to
739/// generate error messages for some invalid YAML constructs.
740///
741/// It is however not a full parser and needs [`crate::parser::Parser`] to fully detect invalid
742/// YAML documents.
743#[derive(Debug)]
744#[allow(clippy::struct_excessive_bools)]
745pub struct Scanner<'input, T> {
746    /// The input source.
747    ///
748    /// This must implement [`Input`].
749    input: T,
750    /// The position of the cursor within the reader.
751    mark: Marker,
752    /// Buffer for tokens to be returned.
753    ///
754    /// This buffer can hold some temporary tokens that are not yet ready to be returned. For
755    /// instance, if we just read a scalar, it can be a value or a key if an implicit mapping
756    /// follows. In this case, the token stays in the `VecDeque` but cannot be returned from
757    /// [`Self::next`] until we have more context.
758    tokens: VecDeque<QueuedToken<'input>>,
759    /// The last error that happened.
760    error: Option<ScanError>,
761    /// Error found after one or more already-scanned comment tokens.
762    deferred_error: Option<ScanError>,
763    /// Whether the input may contain `#` comment indicators.
764    comments_possible: bool,
765
766    /// Whether we have already emitted the `StreamStart` token.
767    stream_start_produced: bool,
768    /// Whether we have already emitted the `StreamEnd` token.
769    stream_end_produced: bool,
770    /// Whether the scanner is still in the prefix of the next document.
771    ///
772    /// A BOM may appear in a document prefix, before directives/comments/content. Once a document
773    /// start marker or any content token is scanned, another BOM is document content and must be
774    /// rejected unless it appears inside a quoted scalar.
775    document_prefix_allowed: bool,
776    /// In some flow contexts, the value of a mapping is allowed to be adjacent to the `:`. When it
777    /// is, the index at which the `:` may be must be stored in `adjacent_value_allowed_at`.
778    adjacent_value_allowed_at: usize,
779    /// Whether a simple key could potentially start at the current position.
780    ///
781    /// Simple keys are the opposite of complex keys which are keys starting with `?`.
782    simple_key_allowed: bool,
783    /// A stack of potential simple keys.
784    ///
785    /// Refer to the documentation of [`SimpleKey`] for a more in-depth explanation of what they
786    /// are.
787    simple_keys: smallvec::SmallVec<[SimpleKey; 8]>,
788    /// The current indentation level.
789    indent: isize,
790    /// List of all block indentation levels we are in (except the current one).
791    indents: smallvec::SmallVec<[Indent; 8]>,
792    /// Level of nesting of flow sequences.
793    flow_level: u8,
794    /// The number of tokens that have been returned from the scanner.
795    ///
796    /// This excludes the tokens from [`Self::tokens`].
797    tokens_parsed: usize,
798    /// Whether a token is ready to be taken from [`Self::tokens`].
799    token_available: bool,
800    /// Whether all characters encountered since the last newline were whitespace.
801    leading_whitespace: bool,
802    /// Whether we started a flow mapping at each flow nesting level.
803    ///
804    /// This is used to detect implicit flow mapping starts such as:
805    /// ```yaml
806    /// [ : foo ] # { null: "foo" }
807    /// ```
808    flow_mapping_started: smallvec::SmallVec<[bool; 8]>,
809    /// An array of states, representing whether flow sequences have implicit mappings.
810    ///
811    /// When a flow mapping is possible (when encountering the first `[` or a `,` in a sequence),
812    /// the state is set to [`Possible`].
813    /// When we encounter the `:`, we know we are in an implicit mapping and can set the state to
814    /// [`Inside`].
815    ///
816    /// There is one entry in this [`Vec`] for each nested flow sequence that we are in.
817    /// The entries are created with the opening `[` and popped with the closing `]`.
818    ///
819    /// [`Possible`]: ImplicitMappingState::Possible
820    /// [`Inside`]: ImplicitMappingState::Inside
821    implicit_flow_mapping_states: smallvec::SmallVec<[ImplicitMappingState; 8]>,
822    /// If a plain scalar was terminated by a `#` comment on its line, we set this
823    /// to detect an illegal multiline continuation on the following line.
824    interrupted_plain_by_comment: Option<Marker>,
825    /// Whether the scanner is still validating whitespace after an explicit `?` key indicator.
826    ///
827    /// This stays set across streamed comment tokens so a tab after the comment run is rejected the
828    /// same way it was when that whitespace was scanned in one pass.
829    explicit_key_tab_check_pending: bool,
830    /// A stack of markers for opening brackets `[` and `{`.
831    flow_markers: smallvec::SmallVec<[(Marker, char); 8]>,
832    buf_leading_break: String,
833    buf_trailing_breaks: String,
834    buf_whitespaces: String,
835}
836
837impl<'input, T: BorrowedInput<'input>> Iterator for Scanner<'input, T> {
838    type Item = Token<'input>;
839
840    fn next(&mut self) -> Option<Self::Item> {
841        if self.error.is_some() {
842            return None;
843        }
844        match self.next_token() {
845            Ok(Some(tok)) => {
846                debug_print!(
847                    "    \x1B[;32m\u{21B3} {:?} \x1B[;36m{:?}\x1B[;m",
848                    tok.1,
849                    tok.0
850                );
851                Some(tok)
852            }
853            Ok(tok) => tok,
854            Err(e) => self.stop_after_error(e),
855        }
856    }
857}
858
/// A convenience alias for scanner functions that may fail without returning a value.
///
/// `Ok(())` means the scanning step completed; `Err` carries the [`ScanError`] that stopped it.
pub type ScanResult = Result<(), ScanError>;
861
/// Accumulator for the text of a flow scalar.
///
/// The buffer either tracks byte ranges (so the text can later be taken verbatim from the input)
/// or holds an owned `String`.
#[derive(Debug)]
enum FlowScalarBuf {
    /// Candidate for `Cow::Borrowed`.
    ///
    /// `start..end` is the committed verbatim range.
    /// `pending_ws_start..pending_ws_end` is a run of blanks that were seen but not yet
    /// committed (they must be dropped if followed by a line break).
    Borrowed {
        start: usize,
        end: usize,
        pending_ws_start: Option<usize>,
        pending_ws_end: usize,
    },
    /// Text held in an owned `String`.
    Owned(String),
}

impl FlowScalarBuf {
    /// Create a borrowed buffer whose committed range is the empty range at `start`, with no
    /// pending blanks.
    #[inline]
    fn new_borrowed(start: usize) -> Self {
        let empty_at = start;
        Self::Borrowed {
            start,
            end: empty_at,
            pending_ws_start: None,
            pending_ws_end: empty_at,
        }
    }

    /// Create an owned buffer holding an empty string.
    #[inline]
    fn new_owned() -> Self {
        Self::Owned(String::default())
    }

    /// Return the owned string for in-place editing, or `None` for a borrowed buffer.
    #[inline]
    fn as_owned_mut(&mut self) -> Option<&mut String> {
        if let Self::Owned(text) = self {
            Some(text)
        } else {
            None
        }
    }

    /// Extend the committed range over the pending blanks, if there are any.
    ///
    /// Owned buffers are left untouched.
    #[inline]
    fn commit_pending_ws(&mut self) {
        let Self::Borrowed {
            end,
            pending_ws_start,
            pending_ws_end,
            ..
        } = self
        else {
            return;
        };

        // `take` clears the pending marker and tells us whether there was one.
        if pending_ws_start.take().is_some() {
            *end = *pending_ws_end;
        }
    }

    /// Record a run of blanks `ws_start..ws_end` that has been seen but not yet committed.
    ///
    /// The start of an already pending run is kept; only its end is moved. Owned buffers are
    /// left untouched.
    #[inline]
    fn note_pending_ws(&mut self, ws_start: usize, ws_end: usize) {
        let Self::Borrowed {
            pending_ws_start,
            pending_ws_end,
            ..
        } = self
        else {
            return;
        };

        *pending_ws_start = pending_ws_start.or(Some(ws_start));
        *pending_ws_end = ws_end;
    }

    /// Forget any pending blanks and reset the pending end to the committed end.
    ///
    /// Owned buffers are left untouched.
    #[inline]
    fn discard_pending_ws(&mut self) {
        match self {
            Self::Borrowed {
                end,
                pending_ws_start,
                pending_ws_end,
                ..
            } => {
                *pending_ws_end = *end;
                *pending_ws_start = None;
            }
            Self::Owned(_) => {}
        }
    }
}
947
948impl<'input, T: BorrowedInput<'input>> Scanner<'input, T> {
949    #[inline]
950    fn promote_flow_scalar_buf_to_owned(
951        &self,
952        start_mark: &Marker,
953        buf: &mut FlowScalarBuf,
954    ) -> Result<(), ScanError> {
955        let FlowScalarBuf::Borrowed {
956            start,
957            end,
958            pending_ws_start: _,
959            pending_ws_end: _,
960        } = *buf
961        else {
962            return Ok(());
963        };
964
965        let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
966            ScanError::new_str(
967                *start_mark,
968                "internal error: input advertised offsets but did not provide a slice",
969            )
970        })?;
971        *buf = FlowScalarBuf::Owned(slice.to_owned());
972        Ok(())
973    }
    /// Try to borrow a slice from the underlying input.
    ///
    /// This method uses the [`BorrowedInput`] trait to safely obtain a slice with the `'input`
    /// lifetime. For inputs that support zero-copy slicing (like `StrInput`), this returns
    /// `Some(&'input str)`. For streaming inputs, this returns `None`.
    ///
    /// Callers in this file pass offsets previously obtained from `self.input.byte_offset()` as
    /// `start` and `end`, and fall back to copying when `None` is returned.
    #[inline]
    fn try_borrow_slice(&self, start: usize, end: usize) -> Option<&'input str> {
        self.input.slice_borrowed(start, end)
    }
983
984    /// Scan a tag handle for a `%TAG` directive as a `Cow<str>`.
985    ///
986    /// For `StrInput`, this will borrow from the input when possible. For other inputs, or if
987    /// borrowing is not possible, it falls back to allocating.
988    fn scan_tag_handle_directive_cow(
989        &mut self,
990        mark: &Marker,
991    ) -> Result<Cow<'input, str>, ScanError> {
992        let Some(start) = self.input.byte_offset() else {
993            return Ok(Cow::Owned(self.scan_tag_handle(true, mark)?));
994        };
995
996        if self.input.look_ch() != '!' {
997            return Err(ScanError::new_str(
998                *mark,
999                "while scanning a tag, did not find expected '!'",
1000            ));
1001        }
1002
1003        // Consume the leading '!'.
1004        self.skip_non_blank();
1005
1006        // Consume ns-word-char (ASCII alphanumeric, '_' or '-') characters.
1007        // This mirrors `StrInput::fetch_while_is_alpha` but avoids allocation.
1008        self.input.lookahead(1);
1009        while self.input.next_is_alpha() {
1010            self.skip_non_blank();
1011            self.input.lookahead(1);
1012        }
1013
1014        // Optional trailing '!'.
1015        if self.input.peek() == '!' {
1016            self.skip_non_blank();
1017        }
1018
1019        let Some(end) = self.input.byte_offset() else {
1020            // Should be impossible if `byte_offset()` was `Some` above, but keep safe fallback.
1021            return Ok(Cow::Owned(self.scan_tag_handle(true, mark)?));
1022        };
1023
1024        let Some(slice) = self.try_borrow_slice(start, end) else {
1025            // Fall back to allocating if zero-copy borrow is not available.
1026            let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
1027                ScanError::new_str(
1028                    *mark,
1029                    "internal error: input advertised slicing but did not provide a slice",
1030                )
1031            })?;
1032            if !slice.ends_with('!') && slice != "!" {
1033                return Err(ScanError::new_str(
1034                    *mark,
1035                    "while parsing a tag directive, did not find expected '!'",
1036                ));
1037            }
1038            return Ok(Cow::Owned(slice.to_owned()));
1039        };
1040
1041        if !slice.ends_with('!') && slice != "!" {
1042            return Err(ScanError::new_str(
1043                *mark,
1044                "while parsing a tag directive, did not find expected '!'",
1045            ));
1046        }
1047
1048        Ok(Cow::Borrowed(slice))
1049    }
1050
1051    /// Scan a tag prefix for a `%TAG` directive as a `Cow<str>`.
1052    ///
1053    /// This borrows from `StrInput` only when no URI escape sequences are encountered. If a `%`
1054    /// escape is present, the prefix must be decoded and therefore allocated.
1055    fn scan_tag_prefix_directive_cow(
1056        &mut self,
1057        start_mark: &Marker,
1058    ) -> Result<Cow<'input, str>, ScanError> {
1059        let Some(start) = self.input.byte_offset() else {
1060            return Ok(Cow::Owned(self.scan_tag_prefix(start_mark)?));
1061        };
1062
1063        // The prefix must start with either '!' (local) or a valid global tag char.
1064        if self.input.look_ch() == '!' {
1065            self.skip_non_blank();
1066        } else if !is_tag_char(self.input.peek()) {
1067            return Err(ScanError::new_str(
1068                *start_mark,
1069                "invalid global tag character",
1070            ));
1071        } else if self.input.peek() == '%' {
1072            // Needs decoding. Fall back to allocating path below.
1073        } else {
1074            self.skip_non_blank();
1075        }
1076
1077        // Consume URI chars while we can stay in the borrowed path.
1078        while is_uri_char(self.input.look_ch()) {
1079            if self.input.peek() == '%' {
1080                break;
1081            }
1082            self.skip_non_blank();
1083        }
1084
1085        // If we encountered an escape sequence, we must decode, therefore allocate.
1086        if self.input.peek() == '%' {
1087            let current = self
1088                .input
1089                .byte_offset()
1090                .expect("byte_offset() must remain available once enabled");
1091            let mut out = if let Some(slice) = self.input.slice_bytes(start, current) {
1092                slice.to_owned()
1093            } else {
1094                String::new()
1095            };
1096
1097            while is_uri_char(self.input.look_ch()) {
1098                if self.input.peek() == '%' {
1099                    out.push(self.scan_uri_escapes(start_mark)?);
1100                } else {
1101                    out.push(self.input.peek());
1102                    self.skip_non_blank();
1103                }
1104            }
1105            return Ok(Cow::Owned(out));
1106        }
1107
1108        let Some(end) = self.input.byte_offset() else {
1109            return Ok(Cow::Owned(self.scan_tag_prefix(start_mark)?));
1110        };
1111
1112        let Some(slice) = self.try_borrow_slice(start, end) else {
1113            // Fall back to allocating if zero-copy borrow is not available.
1114            let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
1115                ScanError::new_str(
1116                    *start_mark,
1117                    "internal error: input advertised slicing but did not provide a slice",
1118                )
1119            })?;
1120            return Ok(Cow::Owned(slice.to_owned()));
1121        };
1122
1123        Ok(Cow::Borrowed(slice))
1124    }
1125    /// Create a scanner over the given input source.
1126    pub fn new(input: T) -> Self {
1127        let initial_byte_offset = input.byte_offset();
1128        let comments_possible = input.may_contain_comments();
1129        Scanner {
1130            input,
1131            mark: Marker::new(0, 1, 0).with_byte_offset(initial_byte_offset),
1132            tokens: VecDeque::with_capacity(64),
1133            error: None,
1134            deferred_error: None,
1135            comments_possible,
1136
1137            stream_start_produced: false,
1138            stream_end_produced: false,
1139            document_prefix_allowed: true,
1140            adjacent_value_allowed_at: 0,
1141            simple_key_allowed: true,
1142            simple_keys: smallvec::SmallVec::new(),
1143            indent: -1,
1144            indents: smallvec::SmallVec::new(),
1145            flow_level: 0,
1146            tokens_parsed: 0,
1147            token_available: false,
1148            leading_whitespace: true,
1149            flow_mapping_started: smallvec::SmallVec::new(),
1150            implicit_flow_mapping_states: smallvec::SmallVec::new(),
1151            flow_markers: smallvec::SmallVec::new(),
1152            interrupted_plain_by_comment: None,
1153            explicit_key_tab_check_pending: false,
1154
1155            buf_leading_break: String::with_capacity(128),
1156            buf_trailing_breaks: String::with_capacity(128),
1157            buf_whitespaces: String::with_capacity(128),
1158        }
1159    }
1160
1161    /// Return a copy of the last error that was encountered, if any.
1162    ///
1163    /// This does not clear the error state and further calls to [`Self::get_error`] will return (a
1164    /// clone of) the same error.
1165    #[inline]
1166    pub fn get_error(&self) -> Option<ScanError> {
1167        self.error.clone().or_else(|| self.deferred_error.clone())
1168    }
1169
    /// Store `error` as the scanner's error and return `None`.
    ///
    /// The `Iterator` implementation checks the stored error first and yields nothing more once
    /// it is set. The return value lets the caller end its iteration in a single expression.
    #[cold]
    fn stop_after_error(&mut self, error: ScanError) -> Option<Token<'input>> {
        self.error = Some(error);
        None
    }
1175
    /// Build the "simple key expected" error, located at the current position.
    #[cold]
    fn simple_key_expected(&self) -> ScanError {
        ScanError::new_str(self.mark, "simple key expected")
    }
1180
    /// Build the "unclosed bracket" error for `bracket`, located at `mark`.
    ///
    /// NOTE(review): `mark` is presumably the position of the opening bracket taken from
    /// `flow_markers`; confirm against the callers.
    #[cold]
    fn unclosed_bracket(mark: Marker, bracket: char) -> ScanError {
        ScanError::new(mark, format!("unclosed bracket '{bracket}'"))
    }
1185
1186    /// Consume the next character. It is assumed the next character is a blank.
1187    #[inline]
1188    fn skip_blank(&mut self) {
1189        self.input.skip();
1190
1191        self.mark.offsets.chars += 1;
1192        self.mark.col += 1;
1193        self.mark.offsets.bytes = self.input.byte_offset();
1194    }
1195
1196    /// Consume the next character. It is assumed the next character is not a blank.
1197    #[inline]
1198    fn skip_non_blank(&mut self) {
1199        self.input.skip();
1200
1201        self.mark.offsets.chars += 1;
1202        self.mark.col += 1;
1203        self.mark.offsets.bytes = self.input.byte_offset();
1204        self.leading_whitespace = false;
1205    }
1206
1207    /// Consume a byte order mark from a document prefix.
1208    ///
1209    /// The source index advances, but the logical column remains unchanged so directives and
1210    /// document markers immediately following the BOM are still recognized as line-start tokens.
1211    #[inline]
1212    fn skip_bom(&mut self) {
1213        self.input.skip();
1214
1215        self.mark.offsets.chars += 1;
1216        self.mark.offsets.bytes = self.input.byte_offset();
1217    }
1218
1219    /// Consume one character that belongs to a comment.
1220    ///
1221    /// Unlike [`Self::skip_non_blank`], this deliberately does not change
1222    /// `leading_whitespace`. Comments are presentation content, so consuming one for either
1223    /// tokenization or skipping should only advance position bookkeeping.
1224    #[inline]
1225    fn skip_comment_char(&mut self) {
1226        self.input.skip();
1227
1228        self.mark.offsets.chars += 1;
1229        self.mark.col += 1;
1230        self.mark.offsets.bytes = self.input.byte_offset();
1231    }
1232
1233    /// Consume the next characters. It is assumed none of the next characters are blanks.
1234    #[inline]
1235    fn skip_n_non_blank(&mut self, count: usize) {
1236        for _ in 0..count {
1237            self.input.skip();
1238            self.mark.offsets.chars += 1;
1239            self.mark.col += 1;
1240        }
1241        self.mark.offsets.bytes = self.input.byte_offset();
1242        self.leading_whitespace = false;
1243    }
1244
1245    /// Consume the next character. It is assumed the next character is a newline.
1246    #[inline]
1247    fn skip_nl(&mut self) {
1248        self.input.skip();
1249
1250        self.mark.offsets.chars += 1;
1251        self.mark.col = 0;
1252        self.mark.line += 1;
1253        self.mark.offsets.bytes = self.input.byte_offset();
1254        self.leading_whitespace = true;
1255    }
1256
1257    /// Consume a line break (either CR, LF, or CRLF), if any. Do nothing if there is none.
1258    #[inline]
1259    fn skip_linebreak(&mut self) {
1260        if self.input.next_2_are('\r', '\n') {
1261            // While technically not a blank, this does not matter as `self.leading_whitespace`
1262            // will be reset by `skip_nl`.
1263            self.skip_blank();
1264            self.skip_nl();
1265        } else if self.input.next_is_break() {
1266            self.skip_nl();
1267        }
1268    }
1269
1270    #[cfg(test)]
1271    fn scan_comment_token(&mut self) -> Result<Token<'input>, ScanError> {
1272        Ok(self.scan_comment_queued_token()?.into_public())
1273    }
1274
1275    fn scan_comment_queued_token(&mut self) -> Result<QueuedToken<'input>, ScanError> {
1276        let start_mark = self.mark;
1277        debug_assert_eq!(self.input.peek(), '#');
1278        let placement = if self.leading_whitespace {
1279            Placement::Free
1280        } else {
1281            Placement::Right
1282        };
1283
1284        self.skip_comment_char();
1285
1286        let text = if let Some(start) = self.input.byte_offset() {
1287            // Stable byte offsets are available; slice the payload once at the end.
1288            let n = self.input.skip_while_non_breakz();
1289            self.mark.offsets.chars += n;
1290            self.mark.col += n;
1291            let byte_offset = self.input.byte_offset();
1292            self.mark.offsets.bytes = byte_offset;
1293            let end = byte_offset.expect("byte_offset must remain available once enabled");
1294
1295            if let Some(slice) = self.try_borrow_slice(start, end) {
1296                Cow::Borrowed(slice)
1297            } else if let Some(slice) = self.input.slice_bytes(start, end) {
1298                // Defensive fallback for third-party inputs that expose offsets but cannot borrow.
1299                Cow::Owned(slice.to_owned())
1300            } else {
1301                return Err(ScanError::new_str(
1302                    start_mark,
1303                    "internal error: input advertised offsets but did not provide a slice",
1304                ));
1305            }
1306        } else {
1307            // Streaming input without stable offsets; collect into an owned string.
1308            let mut owned = String::new();
1309            while !is_breakz(self.input.look_ch()) {
1310                owned.push(self.input.peek());
1311                self.skip_comment_char();
1312            }
1313            Cow::Owned(owned)
1314        };
1315
1316        let end_mark = self.mark;
1317        let span = Span::new(start_mark, end_mark);
1318        Ok(QueuedToken(
1319            span,
1320            QueuedTokenType::Comment(QueuedComment { text, placement }),
1321        ))
1322    }
1323
1324    fn push_comment_token(&mut self) -> ScanResult {
1325        let token = self.scan_comment_queued_token()?;
1326        self.tokens.push_back(token);
1327        Ok(())
1328    }
1329
1330    fn skip_comment(&mut self) {
1331        debug_assert_eq!(self.input.peek(), '#');
1332
1333        self.skip_comment_char();
1334        let n = self.input.skip_while_non_breakz();
1335        self.mark.offsets.chars += n;
1336        self.mark.col += n;
1337        self.mark.offsets.bytes = self.input.byte_offset();
1338    }
1339
    /// Return whether the [`TokenType::StreamStart`] event has been emitted.
    ///
    /// This is `false` for a freshly created scanner; [`Self::fetch_next_token`] fetches the
    /// stream start before anything else while it is still `false`.
    #[inline]
    pub fn stream_started(&self) -> bool {
        self.stream_start_produced
    }
1345
    /// Return whether the [`TokenType::StreamEnd`] event has been emitted.
    ///
    /// This is `false` for a freshly created scanner and becomes `true` once the stream-end token
    /// has been handed out of the token queue.
    #[inline]
    pub fn stream_ended(&self) -> bool {
        self.stream_end_produced
    }
1351
    /// Return the current position in the input stream.
    ///
    /// The returned [`Marker`] is a copy; it does not follow the scanner as it advances.
    #[inline]
    pub fn mark(&self) -> Marker {
        self.mark
    }
1357
    /// Return whether this scanner may emit comment tokens.
    ///
    /// The value is captured once, in [`Self::new`], from the input's `may_contain_comments()`.
    #[inline]
    pub(crate) fn comments_possible(&self) -> bool {
        self.comments_possible
    }
1363
    /// Read and consume a line break (either `\r`, `\n` or `\r\n`).
    ///
    /// Whatever form the break takes in the input, a single `\n` is pushed into `s`.
    ///
    /// # Panics (in debug)
    /// If the next characters do not correspond to a line break (checked by the `debug_assert!`
    /// in [`Self::skip_break`]).
    #[inline]
    fn read_break(&mut self, s: &mut String) {
        self.skip_break();
        s.push('\n');
    }
1375
1376    // Read and consume a line break (either `\r`, `\n` or `\r\n`).
1377    //
1378    // # Panics (in debug)
1379    // If the next characters do not correspond to a line break.
1380    #[inline]
1381    fn skip_break(&mut self) {
1382        let c = self.input.peek();
1383        let nc = self.input.peek_nth(1);
1384        debug_assert!(is_break(c));
1385        if c == '\r' && nc == '\n' {
1386            self.skip_blank();
1387        }
1388        self.skip_nl();
1389    }
1390
1391    /// Insert a token at the given position.
1392    fn insert_token(&mut self, pos: usize, tok: Token<'input>) {
1393        let old_len = self.tokens.len();
1394        assert!(pos <= old_len);
1395        self.tokens.insert(pos, tok.into());
1396    }
1397
    /// Record that a simple key may start at the current position.
    #[inline]
    fn allow_simple_key(&mut self) {
        self.simple_key_allowed = true;
    }
1402
    /// Record that a simple key may not start at the current position.
    #[inline]
    fn disallow_simple_key(&mut self) {
        self.simple_key_allowed = false;
    }
1407
    /// Scan enough input to append one next token to the internal token queue.
    ///
    /// The steps are, in order:
    /// 1. fetch the stream start if it has not been produced yet;
    /// 2. skip whitespace and comments, returning early if that step asks for it;
    /// 3. retire simple-key candidates that can no longer be keys and unroll block indentation to
    ///    the current column;
    /// 4. fetch the stream end at end of input;
    /// 5. at column 0, recognize directives and document start/end markers;
    /// 6. reject content that is indented less than the current block, except for `]`, `}` and
    ///    `,` inside a flow collection;
    /// 7. dispatch on the next one or two characters to the matching `fetch_*` routine, falling
    ///    back to a plain scalar.
    ///
    /// # Errors
    /// Returns `ScanError` when the scanner does not find the next expected token.
    pub fn fetch_next_token(&mut self) -> ScanResult {
        self.input.lookahead(1);

        // The very first token of any stream is the stream start.
        if !self.stream_start_produced {
            self.fetch_stream_start();
            return Ok(());
        }
        // NOTE(review): a `true` result presumably means a comment token was queued and should
        // be handed out before scanning further; confirm against `skip_to_next_token`.
        if self.skip_to_next_token(true)? {
            return Ok(());
        }

        debug_print!(
            "  \x1B[38;5;244m\u{2192} fetch_next_token after whitespace {:?} {:?}\x1B[m",
            self.mark,
            self.input.peek()
        );

        self.stale_simple_keys()?;

        let mark = self.mark;
        self.unroll_indent(mark.col as isize);

        // Buffer enough characters for the document-marker checks and the two-character
        // dispatch below.
        self.input.lookahead(4);

        if self.input.next_is_z() {
            self.fetch_stream_end()?;
            return Ok(());
        }

        // Directives and document markers are only recognized at the start of a line.
        if self.mark.col == 0 {
            if self.input.next_char_is('%') {
                return self.fetch_directive();
            } else if self.input.next_is_document_start() {
                return self.fetch_document_indicator(TokenType::DocumentStart);
            } else if self.input.next_is_document_end() {
                self.fetch_document_indicator(TokenType::DocumentEnd)?;
                // Only blanks may follow a document end marker on its line.
                self.skip_ws_to_eol(SkipTabs::Yes)?;
                if !self.input.next_is_breakz() {
                    return Err(ScanError::new_str(
                        self.mark,
                        "invalid content after document end marker",
                    ));
                }
                return Ok(());
            }
        }

        // Anything reaching this point is document content, so the document prefix is over.
        if self.document_prefix_allowed {
            self.document_prefix_allowed = false;
        }

        // Content indented less than the current block is only tolerated for flow indicators
        // that close or continue an open flow collection.
        if (self.mark.col as isize) < self.indent {
            self.input.lookahead(1);
            let c = self.input.peek();
            if self.flow_level == 0 || !matches!(c, ']' | '}' | ',') {
                return Err(ScanError::new_str(self.mark, "invalid indentation"));
            }
        }

        let c = self.input.peek();
        let nc = self.input.peek_nth(1);
        // Arm order matters: guarded indicator arms come first, and characters whose guard fails
        // fall through to the plain-scalar arms further down.
        match c {
            '[' => self.fetch_flow_collection_start(TokenType::FlowSequenceStart),
            '{' => self.fetch_flow_collection_start(TokenType::FlowMappingStart),
            ']' => self.fetch_flow_collection_end(TokenType::FlowSequenceEnd),
            '}' => self.fetch_flow_collection_end(TokenType::FlowMappingEnd),
            ',' => self.fetch_flow_entry(),
            // `-`, `?` and `:` are indicators only when followed by a blank, a break or the end
            // of input.
            '-' if is_blank_or_breakz(nc) => self.fetch_block_entry(),
            '?' if is_blank_or_breakz(nc) => self.fetch_key(),
            ':' if is_blank_or_breakz(nc) => self.fetch_value(),
            // Inside a flow collection, `:` is also a value indicator when a flow indicator
            // follows or when the current index equals `adjacent_value_allowed_at`.
            ':' if self.flow_level > 0
                && (is_flow(nc) || self.mark.index() == self.adjacent_value_allowed_at) =>
            {
                self.fetch_flow_value()
            }
            // Is it an alias?
            '*' => self.fetch_anchor(true),
            // Is it an anchor?
            '&' => self.fetch_anchor(false),
            '!' => self.fetch_tag(),
            // Is it a literal scalar?
            '|' if self.flow_level == 0 => self.fetch_block_scalar(true),
            // Is it a folded scalar?
            '>' if self.flow_level == 0 => self.fetch_block_scalar(false),
            '\'' => self.fetch_flow_scalar(true),
            '"' => self.fetch_flow_scalar(false),
            // plain scalar
            '-' if !is_blank_or_breakz(nc) => self.fetch_plain_scalar(),
            ':' | '?' if !is_blank_or_breakz(nc) && self.flow_level == 0 => {
                self.fetch_plain_scalar()
            }
            c if is_bom(c) => Err(ScanError::new_str(
                self.mark,
                "a BOM must not appear inside a document",
            )),
            // These characters are rejected at the start of a token.
            '%' | '@' | '`' => Err(ScanError::new(
                self.mark,
                format!("unexpected character: `{c}'"),
            )),
            _ => self.fetch_plain_scalar(),
        }
    }
1514
1515    /// Return the next compact queued token, scanning more input when needed.
1516    ///
1517    /// # Errors
1518    /// Returns `ScanError` when scanning fails to find an expected next token.
1519    pub(crate) fn next_queued_token(&mut self) -> Result<Option<QueuedToken<'input>>, ScanError> {
1520        if self.deferred_error.is_some() {
1521            if !matches!(
1522                self.tokens.front().map(|token| &token.1),
1523                Some(QueuedTokenType::Comment(_))
1524            ) {
1525                if let Some(error) = self.deferred_error.take() {
1526                    return error.into_result();
1527                }
1528            }
1529            self.token_available = true;
1530        }
1531
1532        if self.stream_end_produced {
1533            return Ok(None);
1534        }
1535
1536        if !self.token_available {
1537            if let Err(error) = self.fetch_more_tokens() {
1538                if matches!(
1539                    self.tokens.front().map(|token| &token.1),
1540                    Some(QueuedTokenType::Comment(_))
1541                ) {
1542                    self.deferred_error = Some(error);
1543                } else {
1544                    return Err(error);
1545                }
1546            }
1547        }
1548        let Some(t) = self.tokens.pop_front() else {
1549            return Err(ScanError::new_str(
1550                self.mark,
1551                "did not find expected next token",
1552            ));
1553        };
1554        self.token_available = false;
1555        self.tokens_parsed += 1;
1556
1557        let is_stream_end = matches!(t.1, QueuedTokenType::StreamEnd);
1558        if is_stream_end {
1559            self.stream_end_produced = true;
1560        }
1561        Ok(Some(t))
1562    }
1563
1564    /// Return the next queued token, scanning more input when needed.
1565    ///
1566    /// # Errors
1567    /// Returns `ScanError` when scanning fails to find an expected next token.
1568    pub fn next_token(&mut self) -> Result<Option<Token<'input>>, ScanError> {
1569        Ok(self.next_queued_token()?.map(QueuedToken::into_public))
1570    }
1571
1572    /// Scan more input until a token is ready to be returned.
1573    ///
1574    /// # Errors
1575    /// Returns `ScanError` when scanning fails.
1576    pub fn fetch_more_tokens(&mut self) -> ScanResult {
1577        let mut need_more;
1578        loop {
1579            if self.tokens.is_empty() {
1580                need_more = true;
1581            } else {
1582                need_more = false;
1583                // Stale potential keys that we know won't be keys.
1584                self.stale_simple_keys()?;
1585                if !matches!(
1586                    self.tokens.front().map(|token| &token.1),
1587                    Some(QueuedTokenType::Comment(_))
1588                ) {
1589                    // If our next token to be emitted may be a key, fetch more context.
1590                    for sk in &self.simple_keys {
1591                        if sk.possible && sk.token_number == self.tokens_parsed {
1592                            need_more = true;
1593                            break;
1594                        }
1595                    }
1596                }
1597            }
1598
1599            // Stop fetching immediately after document end/start markers
1600            // to allow the parser to emit the event before reading more content.
1601            if let Some(token) = self.tokens.back() {
1602                if matches!(
1603                    token.1,
1604                    QueuedTokenType::DocumentEnd | QueuedTokenType::DocumentStart
1605                ) {
1606                    break;
1607                }
1608            }
1609
1610            if !need_more {
1611                break;
1612            }
1613            self.fetch_next_token()?;
1614        }
1615        self.token_available = true;
1616
1617        Ok(())
1618    }
1619
1620    /// Mark simple keys that can no longer be keys as such.
1621    ///
1622    /// This function sets `possible` to `false` to each key that, now we have more context, we
1623    /// know will not be keys.
1624    ///
1625    /// # Errors
1626    /// This function returns an error if one of the keys becoming impossible was required to be a
1627    /// key.
1628    fn stale_simple_keys(&mut self) -> ScanResult {
1629        for sk in &mut self.simple_keys {
1630            let is_line_stale = self.flow_level == 0 && sk.mark.line < self.mark.line;
1631            // The length cap applies in flow contexts too; otherwise token buffering can grow
1632            // without bound while the scanner waits to see whether a later ':' resolves the key.
1633            let is_length_stale =
1634                self.mark.index().saturating_sub(sk.mark.index()) > SIMPLE_KEY_MAX_LOOKAHEAD;
1635
1636            if sk.possible && (is_line_stale || is_length_stale) {
1637                if sk.required {
1638                    return Err(ScanError::new_str(self.mark, "simple key expect ':'"));
1639                }
1640                sk.possible = false;
1641            }
1642        }
1643        Ok(())
1644    }
1645
1646    /// Skip over whitespace (`\t`, ` `, `\n`, `\r`) until the next non-comment token.
1647    ///
1648    /// Comments encountered while skipping are queued as [`TokenType::Comment`] tokens so the
1649    /// parser can emit them as presentation events. If `stop_after_comment` is true, the function
1650    /// returns after queuing one comment so callers can emit it before scanning later comments.
1651    ///
1652    /// # Errors
1653    /// This function returns an error if a tab is encountered where there should not be
1654    /// one.
1655    fn skip_to_next_token(&mut self, stop_after_comment: bool) -> Result<bool, ScanError> {
1656        // Hot-path helper: consume a single logical line break and apply simple-key rules.
1657        // (Kept local to ensure the compiler can inline it easily.)
1658        let consume_linebreak = |this: &mut Self| {
1659            this.input.lookahead(2);
1660            this.skip_linebreak();
1661            if this.flow_level == 0 {
1662                this.allow_simple_key();
1663            }
1664        };
1665
1666        loop {
1667            let ch = self.input.look_ch();
1668            if self.explicit_key_tab_check_pending {
1669                match ch {
1670                    '\t' => {
1671                        return Err(ScanError::new_str(
1672                            self.mark(),
1673                            "tabs disallowed in this context",
1674                        ));
1675                    }
1676                    ' ' | '\n' | '\r' | '#' => {}
1677                    _ => self.explicit_key_tab_check_pending = false,
1678                }
1679            }
1680
1681            match ch {
1682                // Tabs may not be used as indentation (block context only).
1683                '\t' => {
1684                    if self.is_within_block()
1685                        && self.leading_whitespace
1686                        && (self.mark.col as isize) < self.indent
1687                    {
1688                        self.skip_ws_to_eol(SkipTabs::Yes)?;
1689
1690                        // If we have content on that line with a tab, return an error.
1691                        if !self.input.next_is_breakz() {
1692                            return Err(ScanError::new_str(
1693                                self.mark,
1694                                "tabs disallowed within this context (block indentation)",
1695                            ));
1696                        }
1697
1698                        // Micro-opt: if we stopped on a line break, consume it now (avoids another loop trip).
1699                        if matches!(self.input.look_ch(), '\n' | '\r') {
1700                            consume_linebreak(self);
1701                        }
1702                    } else {
1703                        // Non-indentation tab behaves like blank.
1704                        self.skip_blank();
1705                    }
1706                }
1707
1708                ' ' => self.skip_blank(),
1709
1710                '\n' | '\r' => consume_linebreak(self),
1711
1712                c if is_bom(c)
1713                    && self.document_prefix_allowed
1714                    && self.flow_level == 0
1715                    && self.mark.col == 0 =>
1716                {
1717                    self.skip_bom();
1718                }
1719
1720                '#' => {
1721                    self.push_comment_token()?;
1722
1723                    // Micro-opt: comment-only lines are common; consume the following line break here.
1724                    if matches!(self.input.look_ch(), '\n' | '\r') {
1725                        consume_linebreak(self);
1726                    }
1727                    if stop_after_comment {
1728                        return Ok(true);
1729                    }
1730                }
1731
1732                _ => break,
1733            }
1734        }
1735
1736        // If a plain scalar was interrupted by a comment, and the next line could
1737        // continue the scalar in block context, this is invalid.
1738        if let Some(err_mark) = self.interrupted_plain_by_comment.take() {
1739            // BS4K should only trigger when the continuation would start on the immediate next
1740            // line (no intervening empty/comment-only lines). A blank line resets the folding
1741            // opportunity and thus should not error.
1742            let is_immediate_next_line = self.mark.line == err_mark.line + 1;
1743
1744            // Optimization: do the cheap checks first; only then request extra lookahead / do deeper checks.
1745            if self.flow_level == 0
1746                && is_immediate_next_line
1747                && (self.mark.col as isize) > self.indent
1748            {
1749                // Ensure enough lookahead for:
1750                // - the checks below (peek/peek_nth)
1751                // - document indicator detection which needs 4 chars.
1752                self.input.lookahead(4);
1753
1754                if !self.input.next_is_z()
1755                    && !self.input.next_is_document_indicator()
1756                    && self.input.next_can_be_plain_scalar(false)
1757                {
1758                    return Err(ScanError::new_str(
1759                        err_mark,
1760                        "comment intercepting the multiline text",
1761                    ));
1762                }
1763            }
1764        }
1765
1766        Ok(false)
1767    }
1768
1769    /// Skip over YAML whitespace (` `, `\n`, `\r`).
1770    ///
1771    /// If `stop_after_comment` is true, the function returns after queuing one comment so callers
1772    /// can emit it before scanning later comments.
1773    ///
1774    /// # Errors
1775    /// This function returns an error if no whitespace was found.
1776    fn skip_yaml_whitespace(&mut self, stop_after_comment: bool) -> Result<bool, ScanError> {
1777        let mut need_whitespace = true;
1778        loop {
1779            match self.input.look_ch() {
1780                ' ' => {
1781                    self.skip_blank();
1782
1783                    need_whitespace = false;
1784                }
1785                '\n' | '\r' => {
1786                    self.input.lookahead(2);
1787                    self.skip_linebreak();
1788                    if self.flow_level == 0 {
1789                        self.allow_simple_key();
1790                    }
1791                    need_whitespace = false;
1792                }
1793                '#' => {
1794                    if need_whitespace {
1795                        self.skip_comment();
1796                    } else {
1797                        self.push_comment_token()?;
1798                        if stop_after_comment {
1799                            return Ok(true);
1800                        }
1801                    }
1802                }
1803                _ => break,
1804            }
1805        }
1806
1807        if need_whitespace {
1808            Err(ScanError::new_str(self.mark(), "expected whitespace"))
1809        } else {
1810            Ok(false)
1811        }
1812    }
1813
1814    fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> Result<SkipTabs, ScanError> {
1815        debug_assert!(!matches!(skip_tabs, SkipTabs::Result(..)));
1816
1817        if !self.comments_possible {
1818            let (chars_consumed, result) = self.input.skip_ws_to_eol(skip_tabs);
1819            self.mark.col += chars_consumed;
1820            self.mark.offsets.chars += chars_consumed;
1821            self.mark.offsets.bytes = self.input.byte_offset();
1822            return result.map_err(|msg| ScanError::new_str(self.mark, msg));
1823        }
1824
1825        let (chars_consumed, whitespace) = self.input.skip_ws_to_eol_blanks(skip_tabs);
1826        self.mark.col += chars_consumed;
1827        self.mark.offsets.chars += chars_consumed;
1828        self.mark.offsets.bytes = self.input.byte_offset();
1829
1830        if self.input.look_ch() != '#' {
1831            return Ok(whitespace);
1832        }
1833
1834        if !whitespace.found_tabs() && !whitespace.has_valid_yaml_ws() {
1835            return Err(ScanError::new_str(
1836                self.mark,
1837                "comments must be separated from other tokens by whitespace",
1838            ));
1839        }
1840
1841        self.push_comment_token()?;
1842        Ok(whitespace)
1843    }
1844
1845    fn fetch_stream_start(&mut self) {
1846        let mark = self.mark;
1847        self.indent = -1;
1848        self.stream_start_produced = true;
1849        self.allow_simple_key();
1850        self.tokens
1851            .push_back(Token(Span::empty(mark), TokenType::StreamStart(TEncoding::Utf8)).into());
1852        self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0)));
1853    }
1854
1855    fn fetch_stream_end(&mut self) -> ScanResult {
1856        // force new line
1857        if self.mark.col != 0 {
1858            self.mark.col = 0;
1859            self.mark.line += 1;
1860        }
1861
1862        if let Some((mark, bracket)) = self.flow_markers.pop() {
1863            return Err(Self::unclosed_bracket(mark, bracket));
1864        }
1865
1866        // If the stream ended, we won't have more context. We can stall all the simple keys we
1867        // had. If one was required, however, that was an error and we must propagate it.
1868        for sk in &mut self.simple_keys {
1869            if sk.required && sk.possible {
1870                return Err(self.simple_key_expected());
1871            }
1872            sk.possible = false;
1873        }
1874
1875        self.unroll_indent(-1);
1876        self.remove_simple_key()?;
1877        self.disallow_simple_key();
1878
1879        self.tokens
1880            .push_back(Token(Span::empty(self.mark), TokenType::StreamEnd).into());
1881        Ok(())
1882    }
1883
1884    fn fetch_directive(&mut self) -> ScanResult {
1885        self.unroll_indent(-1);
1886        self.remove_simple_key()?;
1887
1888        self.disallow_simple_key();
1889
1890        let token_index = self.tokens.len();
1891        let tok = self.scan_directive()?;
1892        self.insert_token(token_index, tok);
1893
1894        Ok(())
1895    }
1896
1897    fn scan_directive(&mut self) -> Result<Token<'input>, ScanError> {
1898        let start_mark = self.mark;
1899        self.skip_non_blank();
1900
1901        let name = self.scan_directive_name()?;
1902        let tok = match name.as_ref() {
1903            "YAML" => self.scan_version_directive_value(&start_mark)?,
1904            "TAG" => self.scan_tag_directive_value(&start_mark)?,
1905            _ => {
1906                let mut params = Vec::new();
1907                while self.input.next_is_blank() {
1908                    let n_blanks = self.input.skip_while_blank();
1909                    self.mark.offsets.chars += n_blanks;
1910                    self.mark.col += n_blanks;
1911                    self.mark.offsets.bytes = self.input.byte_offset();
1912
1913                    if !is_blank_or_breakz(self.input.peek()) {
1914                        let mut param = String::new();
1915                        let n_chars = self.input.fetch_while_is_yaml_non_space(&mut param);
1916                        self.mark.offsets.chars += n_chars;
1917                        self.mark.col += n_chars;
1918                        self.mark.offsets.bytes = self.input.byte_offset();
1919                        params.push(param);
1920                    }
1921                }
1922
1923                Token(
1924                    Span::new(start_mark, self.mark),
1925                    TokenType::ReservedDirective(name, params),
1926                )
1927            }
1928        };
1929
1930        self.skip_ws_to_eol(SkipTabs::Yes)?;
1931
1932        if self.input.next_is_breakz() {
1933            self.input.lookahead(2);
1934            self.skip_linebreak();
1935            Ok(tok)
1936        } else {
1937            Err(ScanError::new_str(
1938                start_mark,
1939                "while scanning a directive, did not find expected comment or line break",
1940            ))
1941        }
1942    }
1943
1944    fn scan_version_directive_value(&mut self, mark: &Marker) -> Result<Token<'input>, ScanError> {
1945        let n_blanks = self.input.skip_while_blank();
1946        self.mark.offsets.chars += n_blanks;
1947        self.mark.col += n_blanks;
1948        self.mark.offsets.bytes = self.input.byte_offset();
1949
1950        let major = self.scan_version_directive_number(mark)?;
1951
1952        if self.input.peek() != '.' {
1953            return Err(ScanError::new_str(
1954                *mark,
1955                "while scanning a YAML directive, did not find expected digit or '.' character",
1956            ));
1957        }
1958        self.skip_non_blank();
1959
1960        let minor = self.scan_version_directive_number(mark)?;
1961
1962        Ok(Token(
1963            Span::new(*mark, self.mark),
1964            TokenType::VersionDirective(major, minor),
1965        ))
1966    }
1967
1968    fn scan_directive_name(&mut self) -> Result<String, ScanError> {
1969        let start_mark = self.mark;
1970        let mut string = String::new();
1971
1972        let n_chars = self.input.fetch_while_is_yaml_non_space(&mut string);
1973        self.mark.offsets.chars += n_chars;
1974        self.mark.col += n_chars;
1975        self.mark.offsets.bytes = self.input.byte_offset();
1976
1977        if string.is_empty() {
1978            return Err(ScanError::new_str(
1979                start_mark,
1980                "while scanning a directive, could not find expected directive name",
1981            ));
1982        }
1983
1984        if !is_blank_or_breakz(self.input.peek()) {
1985            return Err(ScanError::new_str(
1986                start_mark,
1987                "while scanning a directive, found unexpected non-alphabetical character",
1988            ));
1989        }
1990
1991        Ok(string)
1992    }
1993
1994    fn scan_version_directive_number(&mut self, mark: &Marker) -> Result<u32, ScanError> {
1995        let mut val = 0u32;
1996        let mut length = 0usize;
1997        while let Some(digit) = self.input.look_ch().to_digit(10) {
1998            if length + 1 > 9 {
1999                return Err(ScanError::new_str(
2000                    *mark,
2001                    "while scanning a YAML directive, found extremely long version number",
2002                ));
2003            }
2004            length += 1;
2005            val = val * 10 + digit;
2006            self.skip_non_blank();
2007        }
2008
2009        if length == 0 {
2010            return Err(ScanError::new_str(
2011                *mark,
2012                "while scanning a YAML directive, did not find expected version number",
2013            ));
2014        }
2015
2016        Ok(val)
2017    }
2018
2019    fn scan_tag_directive_value(&mut self, mark: &Marker) -> Result<Token<'input>, ScanError> {
2020        let n_blanks = self.input.skip_while_blank();
2021        self.mark.offsets.chars += n_blanks;
2022        self.mark.col += n_blanks;
2023        self.mark.offsets.bytes = self.input.byte_offset();
2024
2025        let handle = self.scan_tag_handle_directive_cow(mark)?;
2026
2027        let n_blanks = self.input.skip_while_blank();
2028        self.mark.offsets.chars += n_blanks;
2029        self.mark.col += n_blanks;
2030        self.mark.offsets.bytes = self.input.byte_offset();
2031
2032        let prefix = self.scan_tag_prefix_directive_cow(mark)?;
2033
2034        self.input.lookahead(1);
2035
2036        if self.input.next_is_blank_or_breakz() {
2037            Ok(Token(
2038                Span::new(*mark, self.mark),
2039                TokenType::TagDirective(handle, prefix),
2040            ))
2041        } else {
2042            Err(ScanError::new_str(
2043                *mark,
2044                "while scanning TAG, did not find expected whitespace or line break",
2045            ))
2046        }
2047    }
2048
2049    fn fetch_tag(&mut self) -> ScanResult {
2050        self.save_simple_key();
2051        self.disallow_simple_key();
2052
2053        let tok = self.scan_tag()?;
2054        self.tokens.push_back(tok.into());
2055        Ok(())
2056    }
2057
2058    fn scan_tag(&mut self) -> Result<Token<'input>, ScanError> {
2059        let start_mark = self.mark;
2060
2061        // Check if the tag is in the canonical form (verbatim).
2062        self.input.lookahead(2);
2063
2064        // If byte_offset is not available, use the original owned-only path.
2065        if self.input.byte_offset().is_none() {
2066            return self.scan_tag_owned(&start_mark);
2067        }
2068
2069        let (handle, suffix): (Cow<'input, str>, Cow<'input, str>) =
2070            if self.input.nth_char_is(1, '<') {
2071                // Verbatim tags always need owned strings (URI escapes).
2072                let suffix = self.scan_verbatim_tag(&start_mark)?;
2073                (Cow::Owned(String::new()), Cow::Owned(suffix))
2074            } else {
2075                // The tag has either the '!suffix' or the '!handle!suffix'
2076                let handle = self.scan_tag_handle_cow(&start_mark)?;
2077                // Check if it is, indeed, handle.
2078                if handle.len() >= 2 && handle.starts_with('!') && handle.ends_with('!') {
2079                    // A tag handle starting with "!!" is a secondary tag handle.
2080                    let suffix = self.scan_tag_shorthand_suffix_cow(&start_mark, true)?;
2081                    (handle, suffix)
2082                } else {
2083                    // Not a real handle, it's part of the suffix.
2084                    // E.g., "!foo" -> handle="!", suffix="foo"
2085                    // The "handle" we scanned is actually "!" + suffix_part1.
2086                    // We need to also scan any remaining suffix characters.
2087                    let remaining_suffix =
2088                        self.scan_tag_shorthand_suffix_cow(&start_mark, false)?;
2089
2090                    // Extract suffix from handle (skip leading '!') and combine with remaining.
2091                    let suffix = if handle.len() > 1 {
2092                        if remaining_suffix.is_empty() {
2093                            // The suffix is just what's in handle after '!'
2094                            match handle {
2095                                Cow::Borrowed(s) => Cow::Borrowed(&s[1..]),
2096                                Cow::Owned(s) => Cow::Owned(s[1..].to_owned()),
2097                            }
2098                        } else {
2099                            // Combine handle (minus leading '!') with remaining suffix.
2100                            let mut combined = handle[1..].to_owned();
2101                            combined.push_str(&remaining_suffix);
2102                            Cow::Owned(combined)
2103                        }
2104                    } else {
2105                        // handle is just "!", suffix is whatever we scanned after
2106                        remaining_suffix
2107                    };
2108
2109                    // A special case: the '!' tag.  Set the handle to '' and the
2110                    // suffix to '!'.
2111                    if suffix.is_empty() {
2112                        (Cow::Borrowed(""), Cow::Borrowed("!"))
2113                    } else {
2114                        (Cow::Borrowed("!"), suffix)
2115                    }
2116                }
2117            };
2118
2119        if is_blank_or_breakz(self.input.look_ch())
2120            || (self.flow_level > 0 && matches!(self.input.peek(), ',' | ']' | '}'))
2121        {
2122            // YAML example 7.2 allows a tag to annotate an empty scalar when a separator or flow
2123            // delimiter follows.
2124            Ok(Token(
2125                Span::new(start_mark, self.mark),
2126                TokenType::Tag(handle, suffix),
2127            ))
2128        } else {
2129            Err(ScanError::new_str(
2130                start_mark,
2131                "while scanning a tag, did not find expected whitespace or line break",
2132            ))
2133        }
2134    }
2135
2136    /// Original owned-only tag scanning path for inputs without `byte_offset` support.
2137    fn scan_tag_owned(&mut self, start_mark: &Marker) -> Result<Token<'input>, ScanError> {
2138        let mut handle = String::new();
2139        let mut suffix;
2140
2141        if self.input.nth_char_is(1, '<') {
2142            suffix = self.scan_verbatim_tag(start_mark)?;
2143        } else {
2144            // The tag has either the '!suffix' or the '!handle!suffix'
2145            handle = self.scan_tag_handle(false, start_mark)?;
2146            // Check if it is, indeed, handle.
2147            if handle.len() >= 2 && handle.starts_with('!') && handle.ends_with('!') {
2148                // A tag handle starting with "!!" is a secondary tag handle.
2149                let is_secondary_handle = handle == "!!";
2150                suffix =
2151                    self.scan_tag_shorthand_suffix(false, is_secondary_handle, "", start_mark)?;
2152            } else {
2153                suffix = self.scan_tag_shorthand_suffix(false, false, &handle, start_mark)?;
2154                "!".clone_into(&mut handle);
2155                // A special case: the '!' tag.  Set the handle to '' and the
2156                // suffix to '!'.
2157                if suffix.is_empty() {
2158                    handle.clear();
2159                    "!".clone_into(&mut suffix);
2160                }
2161            }
2162        }
2163
2164        if is_blank_or_breakz(self.input.look_ch())
2165            || (self.flow_level > 0 && matches!(self.input.peek(), ',' | ']' | '}'))
2166        {
2167            // YAML example 7.2 allows a tag to annotate an empty scalar when a separator or flow
2168            // delimiter follows.
2169            Ok(Token(
2170                Span::new(*start_mark, self.mark),
2171                TokenType::Tag(handle.into(), suffix.into()),
2172            ))
2173        } else {
2174            Err(ScanError::new_str(
2175                *start_mark,
2176                "while scanning a tag, did not find expected whitespace or line break",
2177            ))
2178        }
2179    }
2180
2181    /// Scan a tag handle as a `Cow<str>`, borrowing when possible.
2182    ///
2183    /// Tag handles are of the form `!`, `!!`, or `!name!` where name is ASCII alphanumeric.
2184    /// Since they contain no escape sequences, they can always be borrowed from `StrInput`.
2185    fn scan_tag_handle_cow(&mut self, mark: &Marker) -> Result<Cow<'input, str>, ScanError> {
2186        let Some(start) = self.input.byte_offset() else {
2187            return Ok(Cow::Owned(self.scan_tag_handle(false, mark)?));
2188        };
2189
2190        if self.input.look_ch() != '!' {
2191            return Err(ScanError::new_str(
2192                *mark,
2193                "while scanning a tag, did not find expected '!'",
2194            ));
2195        }
2196
2197        // Consume the leading '!'.
2198        self.skip_non_blank();
2199
2200        // Consume ns-word-char (ASCII alphanumeric, '_' or '-') characters.
2201        self.input.lookahead(1);
2202        while self.input.next_is_alpha() {
2203            self.skip_non_blank();
2204            self.input.lookahead(1);
2205        }
2206
2207        // Optional trailing '!'.
2208        if self.input.peek() == '!' {
2209            self.skip_non_blank();
2210        }
2211
2212        let Some(end) = self.input.byte_offset() else {
2213            return Ok(Cow::Owned(self.scan_tag_handle(false, mark)?));
2214        };
2215
2216        if let Some(slice) = self.try_borrow_slice(start, end) {
2217            Ok(Cow::Borrowed(slice))
2218        } else {
2219            let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
2220                ScanError::new_str(
2221                    *mark,
2222                    "internal error: input advertised slicing but did not provide a slice",
2223                )
2224            })?;
2225            Ok(Cow::Owned(slice.to_owned()))
2226        }
2227    }
2228
2229    /// Scan a tag shorthand suffix as a `Cow<str>`, borrowing when possible.
2230    ///
2231    /// The suffix can be borrowed only if no `%` URI escape sequences are present.
2232    fn scan_tag_shorthand_suffix_cow(
2233        &mut self,
2234        mark: &Marker,
2235        require_non_empty: bool,
2236    ) -> Result<Cow<'input, str>, ScanError> {
2237        let Some(start) = self.input.byte_offset() else {
2238            return Ok(Cow::Owned(
2239                self.scan_tag_shorthand_suffix(false, false, "", mark)?,
2240            ));
2241        };
2242
2243        // Scan tag characters, checking for URI escapes.
2244        while is_tag_char(self.input.look_ch()) {
2245            if self.input.peek() == '%' {
2246                // URI escape found - must decode, so fall back to owned path.
2247                let current = self
2248                    .input
2249                    .byte_offset()
2250                    .expect("byte_offset() must remain available once enabled");
2251                let mut out = if let Some(slice) = self.input.slice_bytes(start, current) {
2252                    slice.to_owned()
2253                } else {
2254                    String::new()
2255                };
2256
2257                // Continue scanning with owned buffer.
2258                while is_tag_char(self.input.look_ch()) {
2259                    if self.input.peek() == '%' {
2260                        out.push(self.scan_uri_escapes(mark)?);
2261                    } else {
2262                        out.push(self.input.peek());
2263                        self.skip_non_blank();
2264                    }
2265                }
2266                return Ok(Cow::Owned(out));
2267            }
2268            self.skip_non_blank();
2269        }
2270
2271        let Some(end) = self.input.byte_offset() else {
2272            return Ok(Cow::Owned(
2273                self.scan_tag_shorthand_suffix(false, false, "", mark)?,
2274            ));
2275        };
2276
2277        if require_non_empty && start == end {
2278            return Err(ScanError::new_str(
2279                *mark,
2280                "while parsing a tag, did not find expected tag URI",
2281            ));
2282        }
2283
2284        if let Some(slice) = self.try_borrow_slice(start, end) {
2285            Ok(Cow::Borrowed(slice))
2286        } else {
2287            let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
2288                ScanError::new_str(
2289                    *mark,
2290                    "internal error: input advertised slicing but did not provide a slice",
2291                )
2292            })?;
2293            Ok(Cow::Owned(slice.to_owned()))
2294        }
2295    }
2296
2297    fn scan_tag_handle(&mut self, directive: bool, mark: &Marker) -> Result<String, ScanError> {
2298        let mut string = String::new();
2299        if self.input.look_ch() != '!' {
2300            return Err(ScanError::new_str(
2301                *mark,
2302                "while scanning a tag, did not find expected '!'",
2303            ));
2304        }
2305
2306        string.push(self.input.peek());
2307        self.skip_non_blank();
2308
2309        let n_chars = self.input.fetch_while_is_alpha(&mut string);
2310        self.mark.offsets.chars += n_chars;
2311        self.mark.col += n_chars;
2312        self.mark.offsets.bytes = self.input.byte_offset();
2313
2314        // Check if the trailing character is '!' and copy it.
2315        if self.input.peek() == '!' {
2316            string.push(self.input.peek());
2317            self.skip_non_blank();
2318        } else if directive && string != "!" {
2319            // It's either the '!' tag or not really a tag handle.  If it's a %TAG
2320            // directive, it's an error.  If it's a tag token, it must be a part of
2321            // URI.
2322            return Err(ScanError::new_str(
2323                *mark,
2324                "while parsing a tag directive, did not find expected '!'",
2325            ));
2326        }
2327        Ok(string)
2328    }
2329
2330    /// Scan for a tag prefix (6.8.2.2).
2331    ///
2332    /// There are 2 kinds of tag prefixes:
2333    ///   - Local: Starts with a `!`, contains only URI chars (`!foo`)
2334    ///   - Global: Starts with a tag char, contains then URI chars (`!foo,2000:app/`)
2335    fn scan_tag_prefix(&mut self, start_mark: &Marker) -> Result<String, ScanError> {
2336        let mut string = String::new();
2337
2338        if self.input.look_ch() == '!' {
2339            // If we have a local tag, insert and skip `!`.
2340            string.push(self.input.peek());
2341            self.skip_non_blank();
2342        } else if !is_tag_char(self.input.peek()) {
2343            // Otherwise, check if the first global tag character is valid.
2344            return Err(ScanError::new_str(
2345                *start_mark,
2346                "invalid global tag character",
2347            ));
2348        } else if self.input.peek() == '%' {
2349            // If it is valid and an escape sequence, escape it.
2350            string.push(self.scan_uri_escapes(start_mark)?);
2351        } else {
2352            // Otherwise, push the first character.
2353            string.push(self.input.peek());
2354            self.skip_non_blank();
2355        }
2356
2357        while is_uri_char(self.input.look_ch()) {
2358            if self.input.peek() == '%' {
2359                string.push(self.scan_uri_escapes(start_mark)?);
2360            } else {
2361                string.push(self.input.peek());
2362                self.skip_non_blank();
2363            }
2364        }
2365
2366        Ok(string)
2367    }
2368
2369    /// Scan for a verbatim tag.
2370    ///
2371    /// The prefixing `!<` must _not_ have been skipped.
2372    fn scan_verbatim_tag(&mut self, start_mark: &Marker) -> Result<String, ScanError> {
2373        // Eat `!<`
2374        self.skip_non_blank();
2375        self.skip_non_blank();
2376
2377        let mut string = String::new();
2378        while is_uri_char(self.input.look_ch()) {
2379            if self.input.peek() == '%' {
2380                string.push(self.scan_uri_escapes(start_mark)?);
2381            } else {
2382                string.push(self.input.peek());
2383                self.skip_non_blank();
2384            }
2385        }
2386
2387        if string.is_empty() {
2388            return Err(ScanError::new_str(
2389                *start_mark,
2390                "while parsing a tag, did not find expected tag URI",
2391            ));
2392        }
2393
2394        if self.input.peek() != '>' {
2395            return Err(ScanError::new_str(
2396                *start_mark,
2397                "while scanning a verbatim tag, did not find the expected '>'",
2398            ));
2399        }
2400        self.skip_non_blank();
2401
2402        Ok(string)
2403    }
2404
2405    fn scan_tag_shorthand_suffix(
2406        &mut self,
2407        _directive: bool,
2408        _is_secondary: bool,
2409        head: &str,
2410        mark: &Marker,
2411    ) -> Result<String, ScanError> {
2412        let mut length = head.len();
2413        let mut string = String::new();
2414
2415        // Copy the head if needed.
2416        // Note that we don't copy the leading '!' character.
2417        if length > 1 {
2418            string.extend(head.chars().skip(1));
2419        }
2420
2421        while is_tag_char(self.input.look_ch()) {
2422            // Check if it is a URI-escape sequence.
2423            if self.input.peek() == '%' {
2424                string.push(self.scan_uri_escapes(mark)?);
2425            } else {
2426                string.push(self.input.peek());
2427                self.skip_non_blank();
2428            }
2429
2430            length += 1;
2431        }
2432
2433        if length == 0 {
2434            return Err(ScanError::new_str(
2435                *mark,
2436                "while parsing a tag, did not find expected tag URI",
2437            ));
2438        }
2439
2440        Ok(string)
2441    }
2442
2443    fn scan_uri_escapes(&mut self, mark: &Marker) -> Result<char, ScanError> {
2444        let mut width = 0usize;
2445        let mut bytes = [0u8; 4];
2446        let mut bytes_len = 0usize;
2447        loop {
2448            self.input.lookahead(3);
2449
2450            let c = self.input.peek_nth(1);
2451            let nc = self.input.peek_nth(2);
2452
2453            if !(self.input.peek() == '%' && is_hex(c) && is_hex(nc)) {
2454                return Err(ScanError::new_str(
2455                    *mark,
2456                    "while parsing a tag, found an invalid escape sequence",
2457                ));
2458            }
2459
2460            let byte = u8::try_from((as_hex(c) << 4) + as_hex(nc))
2461                .expect("two hex nibbles always fit in a byte");
2462            if width == 0 {
2463                width = match byte {
2464                    _ if byte & 0x80 == 0x00 => 1,
2465                    _ if byte & 0xE0 == 0xC0 => 2,
2466                    _ if byte & 0xF0 == 0xE0 => 3,
2467                    _ if byte & 0xF8 == 0xF0 => 4,
2468                    _ => {
2469                        return Err(ScanError::new_str(
2470                            *mark,
2471                            "while parsing a tag, found an incorrect leading UTF-8 byte",
2472                        ));
2473                    }
2474                };
2475            } else if byte & 0xc0 != 0x80 {
2476                return Err(ScanError::new_str(
2477                    *mark,
2478                    "while parsing a tag, found an incorrect trailing UTF-8 byte",
2479                ));
2480            }
2481
2482            bytes[bytes_len] = byte;
2483            bytes_len += 1;
2484
2485            self.skip_n_non_blank(3);
2486
2487            width -= 1;
2488            if width == 0 {
2489                break;
2490            }
2491        }
2492
2493        let s = core::str::from_utf8(&bytes[..bytes_len]).map_err(|_| {
2494            ScanError::new_str(
2495                *mark,
2496                "while parsing a tag, found an invalid UTF-8 codepoint",
2497            )
2498        })?;
2499
2500        let mut chars = s.chars();
2501        match (chars.next(), chars.next()) {
2502            (Some(ch), None) => Ok(ch),
2503            _ => Err(ScanError::new_str(
2504                *mark,
2505                "while parsing a tag, found an invalid UTF-8 codepoint",
2506            )),
2507        }
2508    }
2509
2510    fn fetch_anchor(&mut self, alias: bool) -> ScanResult {
2511        self.save_simple_key();
2512        self.disallow_simple_key();
2513
2514        let tok = self.scan_anchor(alias)?;
2515
2516        self.tokens.push_back(tok.into());
2517
2518        Ok(())
2519    }
2520
2521    fn scan_anchor(&mut self, alias: bool) -> Result<Token<'input>, ScanError> {
2522        let start_mark = self.mark;
2523
2524        // Skip `&` / `*`.
2525        self.skip_non_blank();
2526
2527        // Borrow from input when possible.
2528        if let Some(start) = self.input.byte_offset() {
2529            while is_anchor_char(self.input.look_ch()) {
2530                self.skip_non_blank();
2531            }
2532
2533            let end = self
2534                .input
2535                .byte_offset()
2536                .expect("byte_offset() must remain available once enabled");
2537
2538            if start == end {
2539                return Err(ScanError::new_str(start_mark, "while scanning an anchor or alias, did not find expected alphabetic or numeric character"));
2540            }
2541
2542            let cow = if let Some(slice) = self.try_borrow_slice(start, end) {
2543                Cow::Borrowed(slice)
2544            } else if let Some(slice) = self.input.slice_bytes(start, end) {
2545                Cow::Owned(slice.to_owned())
2546            } else {
2547                return Err(ScanError::new_str(
2548                    start_mark,
2549                    "internal error: input advertised slicing but did not provide a slice",
2550                ));
2551            };
2552
2553            let tok = if alias {
2554                TokenType::Alias(cow)
2555            } else {
2556                TokenType::Anchor(cow)
2557            };
2558            return Ok(Token(Span::new(start_mark, self.mark), tok));
2559        }
2560
2561        let mut string = String::new();
2562        while is_anchor_char(self.input.look_ch()) {
2563            string.push(self.input.peek());
2564            self.skip_non_blank();
2565        }
2566
2567        if string.is_empty() {
2568            return Err(ScanError::new_str(start_mark, "while scanning an anchor or alias, did not find expected alphabetic or numeric character"));
2569        }
2570
2571        let tok = if alias {
2572            TokenType::Alias(string.into())
2573        } else {
2574            TokenType::Anchor(string.into())
2575        };
2576        Ok(Token(Span::new(start_mark, self.mark), tok))
2577    }
2578
2579    fn fetch_flow_collection_start(&mut self, tok: TokenType<'input>) -> ScanResult {
2580        // The indicators '[' and '{' may start a simple key.
2581        self.save_simple_key();
2582
2583        let start_mark = self.mark;
2584        let indicator = self.input.peek();
2585        self.flow_markers.push((start_mark, indicator));
2586
2587        self.roll_one_col_indent();
2588        self.increase_flow_level()?;
2589
2590        self.allow_simple_key();
2591
2592        self.skip_non_blank();
2593
2594        if tok == TokenType::FlowMappingStart {
2595            self.flow_mapping_started.push(true);
2596        } else {
2597            self.flow_mapping_started.push(false);
2598            self.implicit_flow_mapping_states
2599                .push(ImplicitMappingState::Possible);
2600        }
2601
2602        let token_index = self.tokens.len();
2603        self.skip_ws_to_eol(SkipTabs::Yes)?;
2604
2605        self.insert_token(token_index, Token(Span::new(start_mark, self.mark), tok));
2606        Ok(())
2607    }
2608
2609    fn fetch_flow_collection_end(&mut self, tok: TokenType<'input>) -> ScanResult {
2610        // A closing bracket without a corresponding opening is invalid YAML.
2611        if self.flow_level == 0 {
2612            return Err(ScanError::new_str(self.mark, "misplaced bracket"));
2613        }
2614
2615        let Some((open_mark, open_ch)) = self.flow_markers.pop() else {
2616            return Err(ScanError::new_str(self.mark, "misplaced bracket"));
2617        };
2618
2619        let (expected_open, actual_close) = match tok {
2620            TokenType::FlowSequenceEnd => ('[', ']'),
2621            TokenType::FlowMappingEnd => ('{', '}'),
2622            _ => unreachable!("flow collection end called with non-closing token"),
2623        };
2624
2625        if open_ch != expected_open {
2626            return Err(ScanError::new(
2627                open_mark,
2628                format!("mismatched bracket '{open_ch}' closed by '{actual_close}'"),
2629            ));
2630        }
2631
2632        let flow_level = self.flow_level;
2633
2634        self.remove_simple_key()?;
2635
2636        if matches!(tok, TokenType::FlowSequenceEnd) {
2637            self.end_implicit_mapping(self.mark, flow_level);
2638            // We are out exiting the flow sequence, nesting goes down 1 level.
2639            self.implicit_flow_mapping_states.pop();
2640        }
2641        self.flow_mapping_started.pop();
2642
2643        self.decrease_flow_level();
2644
2645        self.disallow_simple_key();
2646
2647        let start_mark = self.mark;
2648        self.skip_non_blank();
2649        let token_index = self.tokens.len();
2650        self.skip_ws_to_eol(SkipTabs::Yes)?;
2651
2652        // A flow collection within a flow mapping can be a key. In that case, the value may be
2653        // adjacent to the `:`.
2654        // ```yaml
2655        // - [ {a: b}:value ]
2656        // ```
2657        if self.flow_level > 0 {
2658            self.adjacent_value_allowed_at = self.mark.index();
2659        }
2660
2661        self.insert_token(token_index, Token(Span::new(start_mark, self.mark), tok));
2662        Ok(())
2663    }
2664
2665    /// Push the `FlowEntry` token and skip over the `,`.
2666    fn fetch_flow_entry(&mut self) -> ScanResult {
2667        self.remove_simple_key()?;
2668        self.allow_simple_key();
2669
2670        self.end_implicit_mapping(self.mark, self.flow_level);
2671        if self.current_flow_collection_is_sequence() {
2672            self.set_current_flow_mapping_started(false);
2673        }
2674
2675        let start_mark = self.mark;
2676        self.skip_non_blank();
2677        let token_index = self.tokens.len();
2678        self.skip_ws_to_eol(SkipTabs::Yes)?;
2679
2680        self.insert_token(
2681            token_index,
2682            Token(Span::new(start_mark, self.mark), TokenType::FlowEntry),
2683        );
2684        Ok(())
2685    }
2686
2687    fn increase_flow_level(&mut self) -> ScanResult {
2688        self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0)));
2689        self.flow_level = self
2690            .flow_level
2691            .checked_add(1)
2692            .ok_or_else(|| ScanError::new_str(self.mark, "recursion limit exceeded"))?;
2693        Ok(())
2694    }
2695
2696    fn decrease_flow_level(&mut self) {
2697        if self.flow_level > 0 {
2698            self.flow_level -= 1;
2699            self.simple_keys.pop().unwrap();
2700        }
2701    }
2702
2703    /// Push the `Block*` token(s) and skip over the `-`.
2704    ///
2705    /// Add an indentation level and push a `BlockSequenceStart` token if needed, then push a
2706    /// `BlockEntry` token.
2707    /// This function only skips over the `-` and does not fetch the entry value.
2708    fn fetch_block_entry(&mut self) -> ScanResult {
2709        if self.flow_level > 0 {
2710            // - * only allowed in block
2711            return Err(ScanError::new_str(
2712                self.mark,
2713                r#""-" is only valid inside a block"#,
2714            ));
2715        }
2716        // Check if we are allowed to start a new entry.
2717        if !self.simple_key_allowed {
2718            return Err(ScanError::new_str(
2719                self.mark,
2720                "block sequence entries are not allowed in this context",
2721            ));
2722        }
2723
2724        // ???, fixes test G9HC.
2725        if let Some(QueuedToken(span, QueuedTokenType::Anchor(..) | QueuedTokenType::Tag(..))) =
2726            self.tokens.back()
2727        {
2728            if self.mark.col == 0 && span.start.col == 0 && self.indent > -1 {
2729                return Err(ScanError::new_str(
2730                    span.start,
2731                    "invalid indentation for anchor",
2732                ));
2733            }
2734        }
2735
2736        // Skip over the `-`.
2737        let mark = self.mark;
2738        self.skip_non_blank();
2739
2740        // generate BLOCK-SEQUENCE-START if indented
2741        self.roll_indent(mark.col, None, TokenType::BlockSequenceStart, mark);
2742        let token_index = self.tokens.len();
2743        let found_tabs = self.skip_ws_to_eol(SkipTabs::Yes)?.found_tabs();
2744        self.input.lookahead(2);
2745        if found_tabs && self.input.next_char_is('-') && is_blank_or_breakz(self.input.peek_nth(1))
2746        {
2747            return Err(ScanError::new_str(
2748                self.mark,
2749                "'-' must be followed by a valid YAML whitespace",
2750            ));
2751        }
2752
2753        self.skip_ws_to_eol(SkipTabs::No)?;
2754        self.input.lookahead(1);
2755        if self.input.next_is_break() || self.input.next_is_flow() {
2756            self.roll_one_col_indent();
2757        }
2758
2759        self.remove_simple_key()?;
2760        self.allow_simple_key();
2761
2762        self.insert_token(
2763            token_index,
2764            Token(Span::empty(self.mark), TokenType::BlockEntry),
2765        );
2766
2767        Ok(())
2768    }
2769
2770    fn fetch_document_indicator(&mut self, t: TokenType<'input>) -> ScanResult {
2771        if let Some((mark, bracket)) = self.flow_markers.pop() {
2772            return Err(ScanError::new(
2773                mark,
2774                format!("unclosed bracket '{bracket}'"),
2775            ));
2776        }
2777
2778        self.unroll_indent(-1);
2779        self.remove_simple_key()?;
2780        self.disallow_simple_key();
2781
2782        let mark = self.mark;
2783
2784        self.skip_n_non_blank(3);
2785
2786        self.document_prefix_allowed = matches!(t, TokenType::DocumentEnd);
2787        self.tokens
2788            .push_back(Token(Span::new(mark, self.mark), t).into());
2789        Ok(())
2790    }
2791
2792    fn fetch_block_scalar(&mut self, literal: bool) -> ScanResult {
2793        self.save_simple_key();
2794        self.allow_simple_key();
2795        let tok = self.scan_block_scalar(literal)?;
2796
2797        self.tokens.push_back(tok.into());
2798        Ok(())
2799    }
2800
    /// Scan a block scalar (`|` literal or `>` folded) and return its `Scalar` token.
    ///
    /// The scan proceeds in these steps:
    /// 1. Skip the `|` / `>` indicator and parse the optional header: a chomping indicator
    ///    (`+` keep, `-` strip, clip when absent) and an indentation indicator (a digit from 1
    ///    to 9), accepted in either order.
    /// 2. Require the rest of the header line to be empty up to the line break or end of input.
    /// 3. Determine the content indentation, either from the explicit indicator or from the
    ///    first lines of the content.
    /// 4. Read the content lines, folding line breaks when `literal` is `false`.
    /// 5. Apply the chomping rule to the trailing line breaks.
    ///
    /// # Errors
    /// Returns an error if the indentation indicator is `0`, if the header line contains
    /// unexpected characters, if the content starts with a tab, or if the first content line
    /// is indented less than the block's indentation. Errors from `skip_ws_to_eol` are
    /// propagated.
    #[allow(clippy::too_many_lines)]
    fn scan_block_scalar(&mut self, literal: bool) -> Result<Token<'input>, ScanError> {
        let start_mark = self.mark;
        let mut chomping = Chomping::Clip;
        // Value of the explicit indentation indicator; 0 means none was given.
        let mut increment: usize = 0;
        // Column at which content lines start; 0 means "not determined yet".
        let mut indent: usize = 0;
        let mut trailing_blank: bool;
        let mut leading_blank: bool = false;
        let style = if literal {
            ScalarStyle::Literal
        } else {
            ScalarStyle::Folded
        };

        // Accumulated scalar contents.
        let mut string = String::new();
        // Line break that ended the previous content line.
        let mut leading_break = String::new();
        // Line breaks of the empty lines skipped since the previous content line.
        let mut trailing_breaks = String::new();
        // Line break that ended the header line.
        let mut chomping_break = String::new();

        // skip '|' or '>'
        self.skip_non_blank();
        self.unroll_non_block_indents();

        // Parse the header. The chomping and indentation indicators may come in either order.
        if self.input.look_ch() == '+' || self.input.peek() == '-' {
            // Chomping indicator first, optionally followed by an indentation indicator.
            if self.input.peek() == '+' {
                chomping = Chomping::Keep;
            } else {
                chomping = Chomping::Strip;
            }
            self.skip_non_blank();
            self.input.lookahead(1);
            if self.input.next_is_digit() {
                if self.input.peek() == '0' {
                    return Err(ScanError::new_str(
                        start_mark,
                        "while scanning a block scalar, found an indentation indicator equal to 0",
                    ));
                }
                increment = (self.input.peek() as usize) - ('0' as usize);
                self.skip_non_blank();
            }
        } else if self.input.next_is_digit() {
            // Indentation indicator first, optionally followed by a chomping indicator.
            if self.input.peek() == '0' {
                return Err(ScanError::new_str(
                    start_mark,
                    "while scanning a block scalar, found an indentation indicator equal to 0",
                ));
            }

            increment = (self.input.peek() as usize) - ('0' as usize);
            self.skip_non_blank();
            self.input.lookahead(1);
            if self.input.peek() == '+' || self.input.peek() == '-' {
                if self.input.peek() == '+' {
                    chomping = Chomping::Keep;
                } else {
                    chomping = Chomping::Strip;
                }
                self.skip_non_blank();
            }
        }

        // Skip whatever `skip_ws_to_eol` accepts on the remainder of the header line.
        self.skip_ws_to_eol(SkipTabs::Yes)?;

        // Check if we are at the end of the line.
        self.input.lookahead(1);
        if !self.input.next_is_breakz() {
            return Err(ScanError::new_str(
                start_mark,
                "while scanning a block scalar, did not find expected comment or line break",
            ));
        }

        // Consume the line break ending the header line and remember it for chomping.
        if self.input.next_is_break() {
            self.input.lookahead(2);
            self.read_break(&mut chomping_break);
        }

        if self.input.look_ch() == '\t' {
            return Err(ScanError::new_str(
                start_mark,
                "a block scalar content cannot start with a tab",
            ));
        }

        // An explicit indentation indicator is relative to the current indentation level, or
        // absolute when no indentation level is open (`self.indent` is negative).
        if increment > 0 {
            indent = if self.indent >= 0 {
                (self.indent + increment as isize) as usize
            } else {
                increment
            }
        }

        // Scan the leading line breaks and determine the indentation level if needed.
        if indent == 0 {
            self.skip_block_scalar_first_line_indent(&mut indent, &mut trailing_breaks);
        } else {
            self.skip_block_scalar_indent(indent, &mut trailing_breaks);
        }

        // We have an end-of-stream with no content, e.g.:
        // ```yaml
        // - |+
        // ```
        if self.input.next_is_z() {
            let contents = match chomping {
                // We strip trailing line breaks. Nothing remains.
                Chomping::Strip => String::new(),
                // There was no newline after the chomping indicator.
                _ if self.mark.line == start_mark.line() => String::new(),
                // We clip lines, and there was a newline after the chomping indicator.
                // All other breaks are ignored.
                Chomping::Clip => chomping_break,
                // We keep lines. There was a newline after the chomping indicator but nothing
                // else.
                Chomping::Keep if trailing_breaks.is_empty() => chomping_break,
                // Otherwise, the newline after chomping is ignored.
                Chomping::Keep => trailing_breaks,
            };
            return Ok(Token(
                Span::new(start_mark, self.mark),
                TokenType::Scalar(style, contents.into()),
            ));
        }

        // The first non-empty line is less indented than the block scalar, yet more indented
        // than the enclosing indentation level: this is an error, with one exception below.
        if self.mark.col < indent && (self.mark.col as isize) > self.indent {
            if self.indent < 0 && self.mark.col == 0 {
                self.input.lookahead(4);
                if self.input.next_is_document_start()
                    || self.input.next_is_document_end()
                    || self.input.peek() == '#'
                {
                    // At the root level, an explicit indentation indicator can still yield an
                    // empty scalar when the next line is a document marker or comment.
                    // In this case, the scalar is terminated rather than under-indented.
                } else {
                    return Err(ScanError::new_str(
                        self.mark,
                        "wrongly indented line in block scalar",
                    ));
                }
            } else {
                return Err(ScanError::new_str(
                    self.mark,
                    "wrongly indented line in block scalar",
                ));
            }
        }

        let mut line_buffer = String::with_capacity(100);
        // NOTE: this shadows the `start_mark` taken at the indicator. The span of the token
        // returned below therefore starts at the current position, not at the `|` / `>`.
        let start_mark = self.mark;
        // Read content lines for as long as they start exactly at the block's indentation.
        while self.mark.col == indent && !self.input.next_is_z() {
            // With a zero indentation, a document end marker terminates the scalar.
            if indent == 0 {
                self.input.lookahead(4);
                if self.input.next_is_document_end() {
                    break;
                }
            }

            // We are at the first content character of a content line.
            trailing_blank = self.input.next_is_blank();
            // Folding: in a folded scalar, the break between two lines that both start with a
            // non-blank character is replaced by the breaks of the empty lines in between or,
            // if there were none, by a single space. In every other case the breaks are kept.
            if !literal && !leading_break.is_empty() && !leading_blank && !trailing_blank {
                string.push_str(&trailing_breaks);
                if trailing_breaks.is_empty() {
                    string.push(' ');
                }
            } else {
                string.push_str(&leading_break);
                string.push_str(&trailing_breaks);
            }

            leading_break.clear();
            trailing_breaks.clear();

            // Remember whether this line starts with a blank, for folding the next break.
            leading_blank = self.input.next_is_blank();

            self.scan_block_scalar_content_line(&mut string, &mut line_buffer);

            // break on EOF
            self.input.lookahead(2);
            if self.input.next_is_z() {
                break;
            }

            self.read_break(&mut leading_break);

            // Eat the following indentation spaces and line breaks.
            self.skip_block_scalar_indent(indent, &mut trailing_breaks);
        }

        // Chomp the tail.
        if chomping != Chomping::Strip {
            string.push_str(&leading_break);
            // If we had reached an eof but the last character wasn't an end-of-line, check if the
            // last line was indented at least as the rest of the scalar, then we need to consider
            // there is a newline.
            if self.input.next_is_z() && self.mark.col >= indent.max(1) {
                string.push('\n');
            }
        }

        // Only the "keep" chomping retains the breaks of the trailing empty lines.
        if chomping == Chomping::Keep {
            string.push_str(&trailing_breaks);
        }

        Ok(Token(
            Span::new(start_mark, self.mark),
            TokenType::Scalar(style, string.into()),
        ))
    }
3011
3012    /// Retrieve the contents of the line, parsing it as a block scalar.
3013    ///
3014    /// The contents will be appended to `string`. `line_buffer` is used as a temporary buffer to
3015    /// store bytes before pushing them to `string` and thus avoiding reallocating more than
3016    /// necessary. `line_buffer` is assumed to be empty upon calling this function. It will be
3017    /// `clear`ed before the end of the function.
3018    ///
3019    /// This function assumes the first character to read is the first content character in the
3020    /// line. This function does not consume the line break character(s) after the line.
3021    fn scan_block_scalar_content_line(&mut self, string: &mut String, line_buffer: &mut String) {
3022        // Start by evaluating characters in the buffer.
3023        while !self.input.buf_is_empty() && !self.input.next_is_breakz() {
3024            string.push(self.input.peek());
3025            // We may technically skip non-blank characters. However, the only distinction is
3026            // to determine what is leading whitespace and what is not. Here, we read the
3027            // contents of the line until either EOF or a line break. We know we will not read
3028            // `self.leading_whitespace` until the end of the line, where it will be reset.
3029            // This allows us to call a slightly less expensive function.
3030            self.skip_blank();
3031        }
3032
3033        // All characters that were in the buffer were consumed. We need to check if more
3034        // follow.
3035        if self.input.buf_is_empty() {
3036            // We will read all consecutive non-breakz characters. We push them into a
3037            // temporary buffer. The main difference with going through `self.buffer` is that
3038            // characters are appended here as their real size (1B for ASCII, or up to 4 bytes for
3039            // UTF-8). We can then use the internal `line_buffer` `Vec` to push data into `string`
3040            // (using `String::push_str`).
3041
3042            // line_buffer is empty at this point so we can compute n_chars here as well
3043            let mut n_chars = 0;
3044            debug_assert!(line_buffer.is_empty());
3045            while let Some(c) = self.input.raw_read_non_breakz_ch() {
3046                line_buffer.push(c);
3047                n_chars += 1;
3048            }
3049
3050            // We need to manually update our position; we haven't called a `skip` function.
3051            self.mark.col += n_chars;
3052            self.mark.offsets.chars += n_chars;
3053            self.mark.offsets.bytes = self.input.byte_offset();
3054
3055            // We can now append our bytes to our `string`.
3056            string.reserve(line_buffer.len());
3057            string.push_str(line_buffer);
3058            // This clears the _contents_ without touching the _capacity_.
3059            line_buffer.clear();
3060        }
3061    }
3062
    /// Skip the block scalar indentation and empty lines.
    ///
    /// On each line, spaces are consumed until column `indent` is reached or a non-space
    /// character is found. If the line then ends with a line break, the break is appended to
    /// `breaks` and the next line is processed the same way. The function returns as soon as
    /// the next character is not a line break.
    fn skip_block_scalar_indent(&mut self, indent: usize, breaks: &mut String) {
        loop {
            // Consume all spaces. Tabs cannot be used as indentation.
            if indent < self.input.bufmaxlen() - 2 {
                // The indentation is smaller than the lookahead buffer: a single
                // maximum-size lookahead is enough before consuming the spaces.
                self.input.lookahead(self.input.bufmaxlen());
                while self.mark.col < indent && self.input.peek() == ' ' {
                    self.skip_blank();
                }
            } else {
                // The indentation may exceed what the buffer holds at once: refill the
                // buffer repeatedly and consume spaces until done.
                loop {
                    self.input.lookahead(self.input.bufmaxlen());
                    while !self.input.buf_is_empty()
                        && self.mark.col < indent
                        && self.input.peek() == ' '
                    {
                        self.skip_blank();
                    }
                    // If we reached our indent, we can break. We must also break if we have
                    // reached content or EOF; that is, the buffer is not empty and the next
                    // character is not a space.
                    if self.mark.col == indent
                        || (!self.input.buf_is_empty() && self.input.peek() != ' ')
                    {
                        break;
                    }
                }
                // Make sure enough characters are buffered for the line break check below.
                self.input.lookahead(2);
            }

            // If our current line is empty, skip over the break and continue looping.
            if self.input.next_is_break() {
                self.read_break(breaks);
            } else {
                // Otherwise, we have a content line. Return control.
                break;
            }
        }
    }
3102
3103    /// Determine the indentation level for a block scalar from the first line of its contents.
3104    ///
3105    /// The function skips over whitespace-only lines and sets `indent` to the longest
3106    /// whitespace line that was encountered.
3107    fn skip_block_scalar_first_line_indent(&mut self, indent: &mut usize, breaks: &mut String) {
3108        let mut max_indent = 0;
3109        loop {
3110            // Consume all spaces. Tabs cannot be used as indentation.
3111            while self.input.look_ch() == ' ' {
3112                self.skip_blank();
3113            }
3114
3115            if self.mark.col > max_indent {
3116                max_indent = self.mark.col;
3117            }
3118
3119            if self.input.next_is_break() {
3120                // If our current line is empty, skip over the break and continue looping.
3121                self.input.lookahead(2);
3122                self.read_break(breaks);
3123            } else {
3124                // Otherwise, we have a content line. Return control.
3125                break;
3126            }
3127        }
3128
3129        // In case a YAML document looks like:
3130        // ```yaml
3131        // |
3132        // foo
3133        // bar
3134        // ```
3135        // We need to set the indent to 0 and not 1. In all other cases, the indent must be at
3136        // least 1. When in the above example, `self.indent` will be set to -1.
3137        *indent = max_indent.max((self.indent + 1) as usize);
3138        if self.indent > 0 {
3139            *indent = (*indent).max(1);
3140        }
3141    }
3142
3143    fn fetch_flow_scalar(&mut self, single: bool) -> ScanResult {
3144        self.save_simple_key();
3145        self.disallow_simple_key();
3146
3147        let token_index = self.tokens.len();
3148        let tok = self.scan_flow_scalar(single)?;
3149
3150        // From spec: To ensure JSON compatibility, if a key inside a flow mapping is JSON-like,
3151        // YAML allows the following value to be specified adjacent to the “:”.
3152        if self.skip_to_next_token(true)? {
3153            self.adjacent_value_allowed_at = usize::MAX;
3154        } else {
3155            self.adjacent_value_allowed_at = self.mark.index();
3156        }
3157
3158        self.insert_token(token_index, tok);
3159        Ok(())
3160    }
3161
3162    #[allow(clippy::too_many_lines)]
3163    fn scan_flow_scalar(&mut self, single: bool) -> Result<Token<'input>, ScanError> {
3164        let start_mark = self.mark;
3165
3166        // Output scalar contents.
3167        let mut buf = match self.input.byte_offset() {
3168            Some(off) => FlowScalarBuf::new_borrowed(off + self.input.peek().len_utf8()),
3169            None => FlowScalarBuf::new_owned(),
3170        };
3171
3172        // Scratch used to consume the *first* line break in a break run without emitting it.
3173        // (The first break folds to ' ' or to nothing depending on escaping rules.)
3174        let mut break_scratch = String::new();
3175
3176        /* Eat the left quote. */
3177        self.skip_non_blank();
3178
3179        loop {
3180            /* Check for a document indicator. */
3181            self.input.lookahead(4);
3182
3183            if self.mark.col == 0 && self.input.next_is_document_indicator() {
3184                return Err(ScanError::new_str(
3185                    start_mark,
3186                    "while scanning a quoted scalar, found unexpected document indicator",
3187                ));
3188            }
3189
3190            if self.input.next_is_z() {
3191                return Err(ScanError::new_str(start_mark, "unclosed quote"));
3192            }
3193
3194            // Do not enforce block indentation inside quoted (flow) scalars.
3195            // YAML allows line breaks within quoted scalars.
3196            let mut leading_blanks = false;
3197            self.consume_flow_scalar_non_whitespace_chars(
3198                single,
3199                &mut buf,
3200                &mut leading_blanks,
3201                &start_mark,
3202            )?;
3203
3204            match self.input.look_ch() {
3205                '\'' if single => break,
3206                '"' if !single => break,
3207                _ => {}
3208            }
3209
3210            // --- Faster whitespace / line break handling (no temporary Strings) ---
3211            //
3212            // Instead of:
3213            //   - collecting blanks into `whitespaces` and then copying them
3214            //   - collecting breaks into `leading_break` / `trailing_breaks` and then copying
3215            //
3216            // We do:
3217            //   - append trailing blanks directly to `string`, remember where they started,
3218            //     and truncate them if a line break follows.
3219            //   - for line breaks: consume the first break into a scratch (discarded),
3220            //     append subsequent breaks directly to `string`.
3221            //
3222            // These flags replace temporary-string emptiness checks:
3223            //   has_leading_break  <=> !leading_break.is_empty()
3224            //   has_trailing_breaks <=> !trailing_breaks.is_empty()
3225            let mut trailing_ws_start: Option<usize> = None;
3226            let mut has_leading_break = false;
3227            let mut has_trailing_breaks = false;
3228
3229            // For the borrowed path: track the (byte) start of a pending whitespace run.
3230            let mut pending_ws_start: Option<usize> = None;
3231
3232            // Consume blank characters.
3233            while self.input.next_is_blank() || self.input.next_is_break() {
3234                if self.input.next_is_blank() {
3235                    // Consume a space or a tab character.
3236                    if leading_blanks {
3237                        if self.input.peek() == '\t' && (self.mark.col as isize) < self.indent {
3238                            return Err(ScanError::new_str(
3239                                self.mark,
3240                                "tab cannot be used as indentation",
3241                            ));
3242                        }
3243                        self.skip_blank();
3244                    } else {
3245                        // Append to output immediately; if a break appears next, we'll truncate.
3246                        match buf {
3247                            FlowScalarBuf::Owned(ref mut string) => {
3248                                if trailing_ws_start.is_none() {
3249                                    trailing_ws_start = Some(string.len());
3250                                }
3251                                string.push(self.input.peek());
3252                            }
3253                            FlowScalarBuf::Borrowed { .. } => {
3254                                if pending_ws_start.is_none() {
3255                                    pending_ws_start = self.input.byte_offset();
3256                                }
3257                            }
3258                        }
3259                        self.skip_blank();
3260
3261                        if let (FlowScalarBuf::Borrowed { .. }, Some(ws_start), Some(ws_end)) =
3262                            (&mut buf, pending_ws_start, self.input.byte_offset())
3263                        {
3264                            buf.note_pending_ws(ws_start, ws_end);
3265                        }
3266                    }
3267                } else {
3268                    self.input.lookahead(2);
3269
3270                    // Check if it is a first line break.
3271                    if leading_blanks {
3272                        // Second+ line break in a run: preserve it.
3273                        match buf {
3274                            FlowScalarBuf::Owned(ref mut string) => self.read_break(string),
3275                            FlowScalarBuf::Borrowed { .. } => {
3276                                self.promote_flow_scalar_buf_to_owned(&start_mark, &mut buf)?;
3277                                let Some(string) = buf.as_owned_mut() else {
3278                                    unreachable!()
3279                                };
3280                                self.read_break(string);
3281                            }
3282                        }
3283                        has_trailing_breaks = true;
3284                    } else {
3285                        // First break: drop any trailing blanks we appended, then consume the break.
3286                        if let Some(pos) = trailing_ws_start.take() {
3287                            if let FlowScalarBuf::Owned(ref mut string) = buf {
3288                                string.truncate(pos);
3289                            }
3290                        }
3291
3292                        if pending_ws_start.take().is_some() {
3293                            // Trailing blanks before a break are discarded => transformation.
3294                            if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
3295                                self.promote_flow_scalar_buf_to_owned(&start_mark, &mut buf)?;
3296                            }
3297                            buf.discard_pending_ws();
3298                        } else {
3299                            buf.commit_pending_ws();
3300                        }
3301
3302                        break_scratch.clear();
3303                        self.read_break(&mut break_scratch);
3304                        // Keep `break_scratch` content (ignored) until next clear; no need to clear twice.
3305
3306                        has_leading_break = true;
3307                        leading_blanks = true;
3308                    }
3309                }
3310
3311                self.input.lookahead(1);
3312            }
3313
3314            // If we had a line break inside a quoted (flow) scalar, validate indentation
3315            // of the continuation line in block context.
3316            if leading_blanks && has_leading_break && self.flow_level == 0 {
3317                let next_ch = self.input.peek();
3318                let is_closing_quote = (single && next_ch == '\'') || (!single && next_ch == '"');
3319                if !is_closing_quote && (self.mark.col as isize) <= self.indent {
3320                    return Err(ScanError::new_str(
3321                        self.mark,
3322                        "invalid indentation in multiline quoted scalar",
3323                    ));
3324                }
3325            }
3326
3327            // Join the whitespace or fold line breaks.
3328            if leading_blanks {
3329                // Folding rule:
3330                //   if there was no leading break, preserve the pending whitespace already emitted
3331                //   if there was a leading break but no trailing breaks, fold to one space
3332                //   otherwise, preserve the trailing breaks already emitted
3333                if has_leading_break && !has_trailing_breaks {
3334                    match buf {
3335                        FlowScalarBuf::Owned(ref mut string) => string.push(' '),
3336                        FlowScalarBuf::Borrowed { .. } => {
3337                            self.promote_flow_scalar_buf_to_owned(&start_mark, &mut buf)?;
3338                            let Some(string) = buf.as_owned_mut() else {
3339                                unreachable!()
3340                            };
3341                            string.push(' ');
3342                        }
3343                    }
3344                }
3345            }
3346            // else: trailing blanks are already appended to `string`
3347        } // loop
3348
3349        // Eat the right quote.
3350        self.skip_non_blank();
3351        let end_mark = self.mark;
3352
3353        // Ensure there is no invalid trailing content.
3354        self.skip_ws_to_eol(SkipTabs::Yes)?;
3355        match self.input.peek() {
3356            // These can be encountered in flow sequences or mappings.
3357            ',' | '}' | ']' if self.flow_level > 0 => {}
3358            // An end-of-line / end-of-stream is fine. No trailing content.
3359            c if is_breakz(c) => {}
3360            // ':' can be encountered if our scalar is a key.
3361            // Outside of flow contexts, keys cannot span multiple lines
3362            ':' if self.flow_level == 0 && start_mark.line == self.mark.line => {}
3363            // Inside a flow context, this is allowed.
3364            ':' if self.flow_level > 0 => {}
3365            _ => {
3366                return Err(ScanError::new_str(
3367                    self.mark,
3368                    "invalid trailing content after double-quoted scalar",
3369                ));
3370            }
3371        }
3372
3373        let style = if single {
3374            ScalarStyle::SingleQuoted
3375        } else {
3376            ScalarStyle::DoubleQuoted
3377        };
3378
3379        let contents = match buf {
3380            FlowScalarBuf::Owned(string) => Cow::Owned(string),
3381            FlowScalarBuf::Borrowed {
3382                start,
3383                mut end,
3384                pending_ws_start,
3385                pending_ws_end,
3386            } => {
3387                // If we ended after a whitespace run, it is part of the output (no break followed).
3388                if pending_ws_start.is_some() {
3389                    end = pending_ws_end;
3390                }
3391                if let Some(slice) = self.try_borrow_slice(start, end) {
3392                    Cow::Borrowed(slice)
3393                } else {
3394                    let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
3395                        ScanError::new_str(
3396                            start_mark,
3397                            "internal error: input advertised offsets but did not provide a slice",
3398                        )
3399                    })?;
3400                    Cow::Owned(slice.to_owned())
3401                }
3402            }
3403        };
3404
3405        Ok(Token(
3406            Span::new(start_mark, end_mark),
3407            TokenType::Scalar(style, contents),
3408        ))
3409    }
3410
3411    /// Consume successive non-whitespace characters from a flow scalar.
3412    ///
3413    /// This function resolves escape sequences and stops upon encountering a whitespace, the end
3414    /// of the stream or the closing character for the scalar (`'` for single quoted scalars, `"`
3415    /// for double quoted scalars).
3416    ///
3417    /// # Errors
3418    /// Return an error if an invalid escape sequence is found.
3419    fn consume_flow_scalar_non_whitespace_chars(
3420        &mut self,
3421        single: bool,
3422        buf: &mut FlowScalarBuf,
3423        leading_blanks: &mut bool,
3424        start_mark: &Marker,
3425    ) -> Result<(), ScanError> {
3426        self.input.lookahead(2);
3427        while !is_blank_or_breakz(self.input.peek()) {
3428            match self.input.peek() {
3429                // Check for an escaped single quote.
3430                '\'' if self.input.peek_nth(1) == '\'' && single => {
3431                    if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
3432                        buf.commit_pending_ws();
3433                        self.promote_flow_scalar_buf_to_owned(start_mark, buf)?;
3434                    }
3435                    let Some(string) = buf.as_owned_mut() else {
3436                        unreachable!()
3437                    };
3438                    string.push('\'');
3439                    self.skip_n_non_blank(2);
3440                }
3441                // Check for the right quote.
3442                '\'' if single => break,
3443                '"' if !single => break,
3444                // Check for an escaped line break.
3445                '\\' if !single && is_break(self.input.peek_nth(1)) => {
3446                    self.input.lookahead(3);
3447                    if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
3448                        buf.commit_pending_ws();
3449                        self.promote_flow_scalar_buf_to_owned(start_mark, buf)?;
3450                    }
3451                    self.skip_non_blank();
3452                    self.skip_linebreak();
3453                    *leading_blanks = true;
3454                    break;
3455                }
3456                // Check for an escape sequence.
3457                '\\' if !single => {
3458                    if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
3459                        buf.commit_pending_ws();
3460                        self.promote_flow_scalar_buf_to_owned(start_mark, buf)?;
3461                    }
3462                    let Some(string) = buf.as_owned_mut() else {
3463                        unreachable!()
3464                    };
3465                    string.push(self.resolve_flow_scalar_escape_sequence(start_mark)?);
3466                }
3467                c => {
3468                    match buf {
3469                        FlowScalarBuf::Owned(ref mut string) => {
3470                            string.push(c);
3471                        }
3472                        FlowScalarBuf::Borrowed { .. } => {
3473                            buf.commit_pending_ws();
3474                        }
3475                    }
3476                    self.skip_non_blank();
3477
3478                    if let Some(new_end) = self.input.byte_offset() {
3479                        if let FlowScalarBuf::Borrowed { end, .. } = buf {
3480                            *end = new_end;
3481                        }
3482                    }
3483                }
3484            }
3485            self.input.lookahead(2);
3486        }
3487        Ok(())
3488    }
3489
3490    /// Escape the sequence we encounter in a flow scalar.
3491    ///
3492    /// `self.input.peek()` must point to the `\` starting the escape sequence.
3493    ///
3494    /// # Errors
3495    /// Return an error if an invalid escape sequence is found.
3496    fn resolve_flow_scalar_escape_sequence(
3497        &mut self,
3498        start_mark: &Marker,
3499    ) -> Result<char, ScanError> {
3500        let mut code_length = 0usize;
3501        let mut ret = '\0';
3502
3503        match self.input.peek_nth(1) {
3504            '0' => ret = '\0',
3505            'a' => ret = '\x07',
3506            'b' => ret = '\x08',
3507            't' | '\t' => ret = '\t',
3508            'n' => ret = '\n',
3509            'v' => ret = '\x0b',
3510            'f' => ret = '\x0c',
3511            'r' => ret = '\x0d',
3512            'e' => ret = '\x1b',
3513            ' ' => ret = '\x20',
3514            '"' => ret = '"',
3515            '/' => ret = '/',
3516            '\\' => ret = '\\',
3517            // Unicode next line (#x85)
3518            'N' => ret = char::from_u32(0x85).unwrap(),
3519            // Unicode non-breaking space (#xA0)
3520            '_' => ret = char::from_u32(0xA0).unwrap(),
3521            // Unicode line separator (#x2028)
3522            'L' => ret = char::from_u32(0x2028).unwrap(),
3523            // Unicode paragraph separator (#x2029)
3524            'P' => ret = char::from_u32(0x2029).unwrap(),
3525            'x' => code_length = 2,
3526            'u' => code_length = 4,
3527            'U' => code_length = 8,
3528            _ => {
3529                return Err(ScanError::new_str(
3530                    *start_mark,
3531                    "while parsing a quoted scalar, found unknown escape character",
3532                ))
3533            }
3534        }
3535        self.skip_n_non_blank(2);
3536
3537        // Consume an arbitrary escape code.
3538        if code_length > 0 {
3539            self.input.lookahead(code_length);
3540            let mut value = 0u32;
3541            for i in 0..code_length {
3542                let c = self.input.peek_nth(i);
3543                if !is_hex(c) {
3544                    return Err(ScanError::new_str(
3545                        *start_mark,
3546                        "while parsing a quoted scalar, did not find expected hexadecimal number",
3547                    ));
3548                }
3549                value = (value << 4) + as_hex(c);
3550            }
3551
3552            self.skip_n_non_blank(code_length);
3553
3554            // Handle JSON surrogate pairs: high surrogate followed by low surrogate
3555            if code_length == 4 && (0xD800..=0xDBFF).contains(&value) {
3556                self.input.lookahead(2);
3557                if self.input.peek() == '\\' && self.input.peek_nth(1) == 'u' {
3558                    self.skip_n_non_blank(2);
3559                    self.input.lookahead(4);
3560                    let mut low_value = 0u32;
3561                    for i in 0..4 {
3562                        let c = self.input.peek_nth(i);
3563                        if !is_hex(c) {
3564                            return Err(ScanError::new_str(
3565                                *start_mark,
3566                                "while parsing a quoted scalar, did not find expected hexadecimal number for low surrogate",
3567                            ));
3568                        }
3569                        low_value = (low_value << 4) + as_hex(c);
3570                    }
3571                    if (0xDC00..=0xDFFF).contains(&low_value) {
3572                        value = 0x10000 + (((value - 0xD800) << 10) | (low_value - 0xDC00));
3573                        self.skip_n_non_blank(4);
3574                    } else {
3575                        return Err(ScanError::new_str(
3576                            *start_mark,
3577                            "while parsing a quoted scalar, found invalid low surrogate",
3578                        ));
3579                    }
3580                } else {
3581                    return Err(ScanError::new_str(
3582                        *start_mark,
3583                        "while parsing a quoted scalar, found high surrogate without following low surrogate",
3584                    ));
3585                }
3586            } else if code_length == 4 && (0xDC00..=0xDFFF).contains(&value) {
3587                return Err(ScanError::new_str(
3588                    *start_mark,
3589                    "while parsing a quoted scalar, found unpaired low surrogate",
3590                ));
3591            }
3592
3593            let Some(ch) = char::from_u32(value) else {
3594                return Err(ScanError::new_str(
3595                    *start_mark,
3596                    "while parsing a quoted scalar, found invalid Unicode character escape code",
3597                ));
3598            };
3599            ret = ch;
3600        }
3601        Ok(ret)
3602    }
3603
3604    fn fetch_plain_scalar(&mut self) -> ScanResult {
3605        self.save_simple_key();
3606        self.disallow_simple_key();
3607
3608        let token_index = self.tokens.len();
3609        let tok = self.scan_plain_scalar()?;
3610
3611        self.insert_token(token_index, tok);
3612        Ok(())
3613    }
3614
3615    /// Scan for a plain scalar.
3616    ///
3617    /// Plain scalars are the most readable but restricted style. They may span multiple lines in
3618    /// some contexts.
3619    #[allow(clippy::too_many_lines)]
3620    fn scan_plain_scalar(&mut self) -> Result<Token<'input>, ScanError> {
3621        self.unroll_non_block_indents();
3622        let indent = self.indent + 1;
3623        let start_mark = self.mark;
3624
3625        if self.flow_level > 0 && (start_mark.col as isize) < indent {
3626            return Err(ScanError::new_str(
3627                start_mark,
3628                "invalid indentation in flow construct",
3629            ));
3630        }
3631
3632        let mut string = String::with_capacity(32);
3633        self.buf_whitespaces.clear();
3634        self.buf_leading_break.clear();
3635        self.buf_trailing_breaks.clear();
3636        let mut end_mark = self.mark;
3637
3638        loop {
3639            self.input.lookahead(4);
3640            if (self.mark.col == 0 && self.input.next_is_document_indicator())
3641                || self.input.peek() == '#'
3642            {
3643                // BS4K: If a `#` starts a comment after some separation spaces following content
3644                // of a plain scalar in block context, and there is potential continuation on the
3645                // next line, this is invalid. We cannot decide yet if there will be continuation,
3646                // so record that a comment interrupted a plain scalar.
3647                if self.input.peek() == '#'
3648                    && !string.is_empty()
3649                    && !self.buf_whitespaces.is_empty()
3650                    && self.flow_level == 0
3651                {
3652                    self.interrupted_plain_by_comment = Some(self.mark);
3653                }
3654                break;
3655            }
3656
3657            if self.flow_level > 0 && self.input.peek() == '-' && is_flow(self.input.peek_nth(1)) {
3658                return Err(ScanError::new_str(
3659                    self.mark,
3660                    "plain scalar cannot start with '-' followed by ,[]{}",
3661                ));
3662            }
3663
3664            if !self.input.next_is_blank_or_breakz()
3665                && self.input.next_can_be_plain_scalar(self.flow_level > 0)
3666            {
3667                if self.leading_whitespace {
3668                    if self.buf_leading_break.is_empty() {
3669                        string.push_str(&self.buf_leading_break);
3670                        string.push_str(&self.buf_trailing_breaks);
3671                        self.buf_trailing_breaks.clear();
3672                        self.buf_leading_break.clear();
3673                    } else {
3674                        if self.buf_trailing_breaks.is_empty() {
3675                            string.push(' ');
3676                        } else {
3677                            string.push_str(&self.buf_trailing_breaks);
3678                            self.buf_trailing_breaks.clear();
3679                        }
3680                        self.buf_leading_break.clear();
3681                    }
3682                    self.leading_whitespace = false;
3683                } else if !self.buf_whitespaces.is_empty() {
3684                    string.push_str(&self.buf_whitespaces);
3685                    self.buf_whitespaces.clear();
3686                }
3687
3688                // We can unroll the first iteration of the loop.
3689                string.push(self.input.peek());
3690                self.skip_non_blank();
3691                string.reserve(self.input.bufmaxlen());
3692
3693                // Add content non-blank characters to the scalar.
3694                let mut end = false;
3695                while !end {
3696                    // Fill the buffer once and process all characters in the buffer until the next
3697                    // fetch. Note that `next_can_be_plain_scalar` needs 2 lookahead characters,
3698                    // hence the `for` loop looping `self.input.bufmaxlen() - 1` times.
3699                    self.input.lookahead(self.input.bufmaxlen());
3700                    let (stop, chars_consumed) = self.input.fetch_plain_scalar_chunk(
3701                        &mut string,
3702                        self.input.bufmaxlen() - 1,
3703                        self.flow_level > 0,
3704                    );
3705                    end = stop;
3706                    self.mark.offsets.chars += chars_consumed;
3707                    self.mark.col += chars_consumed;
3708                    self.mark.offsets.bytes = self.input.byte_offset();
3709                }
3710                end_mark = self.mark;
3711            }
3712
3713            // We may reach the end of a plain scalar if:
3714            //  - We reach eof
3715            //  - We reach ": "
3716            //  - We find a flow character in a flow context
3717            if !(self.input.next_is_blank() || self.input.next_is_break()) {
3718                break;
3719            }
3720
3721            // Process blank characters.
3722            self.input.lookahead(2);
3723            while self.input.next_is_blank_or_break() {
3724                if self.input.next_is_blank() {
3725                    if !self.leading_whitespace {
3726                        self.buf_whitespaces.push(self.input.peek());
3727                        self.skip_blank();
3728                    } else if (self.mark.col as isize) < indent && self.input.peek() == '\t' {
3729                        // Tabs in an indentation columns are allowed if and only if the line is
3730                        // empty. Skip to the end of the line.
3731                        self.skip_ws_to_eol(SkipTabs::Yes)?;
3732                        if !self.input.next_is_breakz() {
3733                            return Err(ScanError::new_str(
3734                                start_mark,
3735                                "while scanning a plain scalar, found a tab",
3736                            ));
3737                        }
3738                    } else {
3739                        self.skip_blank();
3740                    }
3741                } else {
3742                    // Check if it is a first line break
3743                    if self.leading_whitespace {
3744                        self.skip_break();
3745                        self.buf_trailing_breaks.push('\n');
3746                    } else {
3747                        self.buf_whitespaces.clear();
3748                        self.skip_break();
3749                        self.buf_leading_break.push('\n');
3750                        self.leading_whitespace = true;
3751                    }
3752                }
3753                self.input.lookahead(2);
3754            }
3755
3756            // check indentation level
3757            if self.flow_level == 0 && (self.mark.col as isize) < indent {
3758                break;
3759            }
3760        }
3761
3762        if self.leading_whitespace {
3763            self.allow_simple_key();
3764        }
3765
3766        if string.is_empty() {
3767            // `fetch_plain_scalar` must absolutely consume at least one byte. Otherwise,
3768            // `fetch_next_token` will never stop calling it. An empty plain scalar may happen with
3769            // erroneous inputs such as "{...".
3770            Err(ScanError::new_str(
3771                start_mark,
3772                "unexpected end of plain scalar",
3773            ))
3774        } else {
3775            let contents = if let (Some(start), Some(end)) =
3776                (start_mark.byte_offset(), end_mark.byte_offset())
3777            {
3778                match self.try_borrow_slice(start, end) {
3779                    Some(slice) if slice == string => Cow::Borrowed(slice),
3780                    _ => Cow::Owned(string),
3781                }
3782            } else {
3783                Cow::Owned(string)
3784            };
3785
3786            Ok(Token(
3787                Span::new(start_mark, end_mark),
3788                TokenType::Scalar(ScalarStyle::Plain, contents),
3789            ))
3790        }
3791    }
3792
3793    fn fetch_key(&mut self) -> ScanResult {
3794        let start_mark = self.mark;
3795        if self.flow_level == 0 {
3796            // Check if we are allowed to start a new key (not necessarily simple).
3797            if !self.simple_key_allowed {
3798                return Err(ScanError::new_str(
3799                    self.mark,
3800                    "mapping keys are not allowed in this context",
3801                ));
3802            }
3803            self.roll_indent(
3804                start_mark.col,
3805                None,
3806                TokenType::BlockMappingStart,
3807                start_mark,
3808            );
3809        } else {
3810            // The scanner, upon emitting a `Key`, will prepend a `MappingStart` event.
3811            self.set_current_flow_mapping_started(true);
3812        }
3813
3814        self.remove_simple_key()?;
3815
3816        if self.flow_level == 0 {
3817            self.allow_simple_key();
3818        } else {
3819            self.disallow_simple_key();
3820        }
3821
3822        self.skip_non_blank();
3823        let token_index = self.tokens.len();
3824        self.explicit_key_tab_check_pending = false;
3825        let stopped_after_comment = self.skip_yaml_whitespace(true)?;
3826        if self.input.peek() == '\t' {
3827            return Err(ScanError::new_str(
3828                self.mark(),
3829                "tabs disallowed in this context",
3830            ));
3831        }
3832        self.explicit_key_tab_check_pending = stopped_after_comment;
3833        self.insert_token(
3834            token_index,
3835            Token(Span::new(start_mark, self.mark), TokenType::Key),
3836        );
3837        Ok(())
3838    }
3839
3840    /// Fetch a value in a mapping inside of a flow collection.
3841    ///
3842    /// This must not be called if [`self.flow_level`] is 0. This ensures the rules surrounding
3843    /// values in flow collections are respected prior to calling [`fetch_value`].
3844    ///
3845    /// [`self.flow_level`]: Self::flow_level
3846    /// [`fetch_value`]: Self::fetch_value
3847    fn fetch_flow_value(&mut self) -> ScanResult {
3848        let nc = self.input.peek_nth(1);
3849
3850        // If we encounter a ':' inside a flow collection and it is not immediately
3851        // followed by a blank or breakz:
3852        //   - We must check whether an adjacent value is allowed
3853        //     `["a":[]]` is valid. If the key is double-quoted, no need for a space. This
3854        //     is needed for JSON compatibility.
3855        //   - If not, we must ensure there is a space after the ':' and before its value.
3856        //     `[a: []]` is valid while `[a:[]]` isn't. `[a:b]` is treated as `["a:b"]`.
3857        //   - But if the value is empty (null), then it's okay.
3858        // The last line is for YAMLs like `[a:]`. The ':' is followed by a ']' (which is a
3859        // flow character), but the ']' is not the value. The value is an invisible empty
3860        // space which is represented as null ('~').
3861        if self.mark.index() != self.adjacent_value_allowed_at && (nc == '[' || nc == '{') {
3862            return Err(ScanError::new_str(
3863                self.mark,
3864                "':' may not precede any of `[{` in flow mapping",
3865            ));
3866        }
3867
3868        self.fetch_value()
3869    }
3870
3871    /// Fetch a value from a mapping (after a `:`).
3872    fn fetch_value(&mut self) -> ScanResult {
3873        let sk = self.simple_keys.last().unwrap().clone();
3874        let start_mark = self.mark;
3875        let is_implicit_flow_mapping = self.current_flow_collection_is_sequence()
3876            && !self.current_flow_mapping_started()
3877            && !self.implicit_flow_mapping_states.is_empty();
3878        if is_implicit_flow_mapping {
3879            *self.implicit_flow_mapping_states.last_mut().unwrap() =
3880                ImplicitMappingState::Inside(self.flow_level);
3881        }
3882
3883        // Skip over ':'.
3884        self.skip_non_blank();
3885        // Error detection: if ':' is followed by tab(s) without any space, and then what looks
3886        // like a value, emit a helpful error. The check for '-' or alphanumeric is an intentional
3887        // heuristic that catches common cases (e.g., `key:\tvalue`, `key:\t-item`) without
3888        // rejecting valid YAML like `key:\t|` (block scalar) or `key:\t"quoted"`.
3889        // Note: This heuristic won't catch Unicode value starters like `key:\täöü`, but such
3890        // cases will still fail to parse correctly (just with a less specific error message).
3891        let mut trailing_tokens = VecDeque::new();
3892        if self.input.look_ch() == '\t' {
3893            let trailing_token_index = self.tokens.len();
3894            let whitespace = self.skip_ws_to_eol(SkipTabs::Yes)?;
3895            trailing_tokens = self.tokens.split_off(trailing_token_index);
3896
3897            if !whitespace.has_valid_yaml_ws()
3898                && (self.input.peek() == '-' || self.input.next_is_alpha())
3899            {
3900                return Err(ScanError::new_str(
3901                    self.mark,
3902                    "':' must be followed by a valid YAML whitespace",
3903                ));
3904            }
3905        }
3906
3907        if sk.possible {
3908            // insert simple key
3909            let tok = Token(Span::empty(sk.mark), TokenType::Key);
3910            self.insert_token(sk.token_number - self.tokens_parsed, tok);
3911            if is_implicit_flow_mapping {
3912                if sk.mark.line < start_mark.line {
3913                    return Err(ScanError::new_str(
3914                        start_mark,
3915                        "illegal placement of ':' indicator",
3916                    ));
3917                }
3918                self.insert_token(
3919                    sk.token_number - self.tokens_parsed,
3920                    Token(Span::empty(sk.mark), TokenType::FlowMappingStart),
3921                );
3922            }
3923
3924            // Add the BLOCK-MAPPING-START token if needed.
3925            self.roll_indent(
3926                sk.mark.col,
3927                Some(sk.token_number),
3928                TokenType::BlockMappingStart,
3929                sk.mark,
3930            );
3931            self.roll_one_col_indent();
3932
3933            self.simple_keys.last_mut().unwrap().possible = false;
3934            self.disallow_simple_key();
3935        } else {
3936            if is_implicit_flow_mapping {
3937                self.tokens
3938                    .push_back(Token(Span::empty(start_mark), TokenType::FlowMappingStart).into());
3939            }
3940            // The ':' indicator follows a complex key.
3941            if self.flow_level == 0 {
3942                if !self.simple_key_allowed {
3943                    return Err(ScanError::new_str(
3944                        start_mark,
3945                        "mapping values are not allowed in this context",
3946                    ));
3947                }
3948
3949                self.roll_indent(
3950                    start_mark.col,
3951                    None,
3952                    TokenType::BlockMappingStart,
3953                    start_mark,
3954                );
3955            }
3956            self.roll_one_col_indent();
3957
3958            if self.flow_level == 0 {
3959                self.allow_simple_key();
3960            } else {
3961                self.disallow_simple_key();
3962            }
3963        }
3964        self.tokens
3965            .push_back(Token(Span::empty(start_mark), TokenType::Value).into());
3966        self.tokens.append(&mut trailing_tokens);
3967
3968        Ok(())
3969    }
3970
3971    /// Add an indentation level to the stack with the given block token, if needed.
3972    ///
3973    /// An indentation level is added only if:
3974    ///   - We are not in a flow-style construct (which don't have indentation per-se).
3975    ///   - The current column is further indented than the last indent we have registered.
3976    fn roll_indent(
3977        &mut self,
3978        col: usize,
3979        number: Option<usize>,
3980        tok: TokenType<'input>,
3981        mark: Marker,
3982    ) {
3983        if self.flow_level > 0 {
3984            return;
3985        }
3986
3987        // If the last indent was a non-block indent, remove it.
3988        // This means that we prepared an indent that we thought we wouldn't use, but realized just
3989        // now that it is a block indent.
3990        if self.indent <= col as isize {
3991            if let Some(indent) = self.indents.last() {
3992                if !indent.needs_block_end {
3993                    self.indent = indent.indent;
3994                    self.indents.pop();
3995                }
3996            }
3997        }
3998
3999        if self.indent < col as isize {
4000            self.indents.push(Indent {
4001                indent: self.indent,
4002                needs_block_end: true,
4003            });
4004            self.indent = col as isize;
4005            let tokens_parsed = self.tokens_parsed;
4006            match number {
4007                Some(n) => self.insert_token(n - tokens_parsed, Token(Span::empty(mark), tok)),
4008                None => self.tokens.push_back(Token(Span::empty(mark), tok).into()),
4009            }
4010        }
4011    }
4012
4013    /// Pop indentation levels from the stack as much as needed.
4014    ///
4015    /// Indentation levels are popped from the stack while they are further indented than `col`.
4016    /// If we are in a flow-style construct (which don't have indentation per-se), this function
4017    /// does nothing.
4018    fn unroll_indent(&mut self, col: isize) {
4019        if self.flow_level > 0 {
4020            return;
4021        }
4022        while self.indent > col {
4023            let indent = self.indents.pop().unwrap();
4024            self.indent = indent.indent;
4025            if indent.needs_block_end {
4026                self.tokens
4027                    .push_back(Token(Span::empty(self.mark), TokenType::BlockEnd).into());
4028            }
4029        }
4030    }
4031
4032    /// Add an indentation level of 1 column that does not start a block.
4033    ///
4034    /// See the documentation of [`Indent::needs_block_end`] for more details.
4035    /// An indentation is not added if we are inside a flow level or if the last indent is already
4036    /// a non-block indent.
4037    fn roll_one_col_indent(&mut self) {
4038        if self.flow_level == 0 && self.indents.last().is_some_and(|x| x.needs_block_end) {
4039            self.indents.push(Indent {
4040                indent: self.indent,
4041                needs_block_end: false,
4042            });
4043            self.indent += 1;
4044        }
4045    }
4046
4047    /// Unroll all last indents created with [`Self::roll_one_col_indent`].
4048    fn unroll_non_block_indents(&mut self) {
4049        while let Some(indent) = self.indents.last() {
4050            if indent.needs_block_end {
4051                break;
4052            }
4053            self.indent = indent.indent;
4054            self.indents.pop();
4055        }
4056    }
4057
4058    /// Mark the next token to be inserted as a potential simple key.
4059    fn save_simple_key(&mut self) {
4060        if self.simple_key_allowed {
4061            let required = self.flow_level == 0
4062                && self.indent == (self.mark.col as isize)
4063                && self.indents.last().unwrap().needs_block_end;
4064
4065            if let Some(last) = self.simple_keys.last_mut() {
4066                *last = SimpleKey {
4067                    mark: self.mark,
4068                    possible: true,
4069                    required,
4070                    token_number: self.tokens_parsed + self.tokens.len(),
4071                };
4072            }
4073        }
4074    }
4075
4076    fn remove_simple_key(&mut self) -> ScanResult {
4077        let last = self.simple_keys.last_mut().unwrap();
4078        if last.possible && last.required {
4079            return Err(self.simple_key_expected());
4080        }
4081
4082        last.possible = false;
4083        Ok(())
4084    }
4085
4086    /// Return whether the scanner is inside a block but outside of a flow sequence.
4087    fn is_within_block(&self) -> bool {
4088        !self.indents.is_empty()
4089    }
4090
4091    /// If an implicit mapping had started, end it.
4092    ///
4093    /// This function does not pop the state in [`implicit_flow_mapping_states`].
4094    ///
4095    /// [`implicit_flow_mapping_states`]: Self::implicit_flow_mapping_states
4096    fn end_implicit_mapping(&mut self, mark: Marker, flow_level: u8) {
4097        if self
4098            .implicit_flow_mapping_states
4099            .last()
4100            .is_some_and(|state| *state == ImplicitMappingState::Inside(flow_level))
4101        {
4102            *self.implicit_flow_mapping_states.last_mut().unwrap() = ImplicitMappingState::Possible;
4103            self.set_current_flow_mapping_started(false);
4104            self.tokens
4105                .push_back(Token(Span::empty(mark), TokenType::FlowMappingEnd).into());
4106        }
4107    }
4108
4109    fn current_flow_collection_is_sequence(&self) -> bool {
4110        self.flow_markers
4111            .last()
4112            .is_some_and(|(_, bracket)| *bracket == '[')
4113    }
4114
4115    fn current_flow_mapping_started(&self) -> bool {
4116        self.flow_mapping_started.last().copied().unwrap_or(false)
4117    }
4118
4119    fn set_current_flow_mapping_started(&mut self, started: bool) {
4120        if let Some(current) = self.flow_mapping_started.last_mut() {
4121            *current = started;
4122        }
4123    }
4124}
4125
/// Chomping, how final line breaks and trailing empty lines are interpreted.
///
/// See YAML spec 8.1.1.2.
///
/// The enum is a plain, field-less value type, so it derives the same set of traits as
/// the other small public enums of this module (`Clone`, `Copy`, `Debug`, equality).
#[derive(Clone, Copy, PartialEq, Debug, Eq)]
pub enum Chomping {
    /// The final line break and any trailing empty lines are excluded.
    Strip,
    /// The final line break is preserved, but trailing empty lines are excluded.
    Clip,
    /// The final line break and trailing empty lines are included.
    Keep,
}
4138
4139#[cfg(test)]
4140mod test {
4141    use alloc::{
4142        borrow::{Cow, ToOwned},
4143        rc::Rc,
4144        string::String,
4145        vec::Vec,
4146    };
4147    use core::cell::Cell;
4148
4149    use crate::{
4150        input::{str::StrInput, BorrowedInput, BufferedInput, Input},
4151        scanner::{
4152            Comment, Marker, Placement, QueuedToken, QueuedTokenType, ScalarStyle, Scanner, Span,
4153            TEncoding, Token, TokenType,
4154        },
4155    };
4156
4157    struct CountingChars {
4158        chars: alloc::vec::IntoIter<char>,
4159        read: Rc<Cell<usize>>,
4160    }
4161
4162    impl Iterator for CountingChars {
4163        type Item = char;
4164
4165        fn next(&mut self) -> Option<Self::Item> {
4166            let next = self.chars.next();
4167            if next.is_some() {
4168                self.read.set(self.read.get() + 1);
4169            }
4170            next
4171        }
4172    }
4173
4174    struct SlicingOnlyInput<'input> {
4175        inner: StrInput<'input>,
4176        expose_slice: bool,
4177    }
4178
4179    impl<'input> SlicingOnlyInput<'input> {
4180        fn new(source: &'input str, expose_slice: bool) -> Self {
4181            Self {
4182                inner: StrInput::new(source),
4183                expose_slice,
4184            }
4185        }
4186    }
4187
4188    impl Input for SlicingOnlyInput<'_> {
4189        fn lookahead(&mut self, count: usize) {
4190            self.inner.lookahead(count);
4191        }
4192
4193        fn buflen(&self) -> usize {
4194            self.inner.buflen()
4195        }
4196
4197        fn bufmaxlen(&self) -> usize {
4198            self.inner.bufmaxlen()
4199        }
4200
4201        fn raw_read_ch(&mut self) -> char {
4202            self.inner.raw_read_ch()
4203        }
4204
4205        fn raw_read_non_breakz_ch(&mut self) -> Option<char> {
4206            self.inner.raw_read_non_breakz_ch()
4207        }
4208
4209        fn skip(&mut self) {
4210            self.inner.skip();
4211        }
4212
4213        fn skip_n(&mut self, count: usize) {
4214            self.inner.skip_n(count);
4215        }
4216
4217        fn peek(&self) -> char {
4218            self.inner.peek()
4219        }
4220
4221        fn peek_nth(&self, n: usize) -> char {
4222            self.inner.peek_nth(n)
4223        }
4224
4225        fn byte_offset(&self) -> Option<usize> {
4226            self.inner.byte_offset()
4227        }
4228
4229        fn slice_bytes(&self, start: usize, end: usize) -> Option<&str> {
4230            if self.expose_slice {
4231                self.inner.slice_bytes(start, end)
4232            } else {
4233                None
4234            }
4235        }
4236    }
4237
4238    impl<'input> BorrowedInput<'input> for SlicingOnlyInput<'input> {
4239        fn slice_borrowed(&self, _start: usize, _end: usize) -> Option<&'input str> {
4240            None
4241        }
4242    }
4243
4244    #[test]
4245    fn test_is_anchor_char() {
4246        use super::is_anchor_char;
4247        assert!(is_anchor_char('x'));
4248    }
4249
4250    #[test]
4251    fn flow_simple_key_length_limit_bounds_buffering() {
4252        let mut yaml = String::from("[\n\"start\"\n");
4253        for _ in 0..600 {
4254            yaml.push_str("\"x\"\n");
4255        }
4256        let total_chars = yaml.chars().count();
4257        let read = Rc::new(Cell::new(0));
4258        let chars = yaml.chars().collect::<Vec<_>>().into_iter();
4259        let mut scanner = Scanner::new(BufferedInput::new(CountingChars {
4260            chars,
4261            read: Rc::clone(&read),
4262        }));
4263
4264        assert!(matches!(
4265            scanner.next_token().unwrap().unwrap().1,
4266            TokenType::StreamStart(_)
4267        ));
4268
4269        let token = scanner.next_token().unwrap().unwrap();
4270        assert!(matches!(token.1, TokenType::FlowSequenceStart));
4271
4272        let token = scanner.next_token().unwrap().unwrap();
4273        assert!(matches!(
4274            token.1,
4275            TokenType::Scalar(_, ref value) if value == "start"
4276        ));
4277        assert!(
4278            read.get() < total_chars,
4279            "scanner consumed all {total_chars} chars before yielding the first flow scalar"
4280        );
4281        assert!(
4282            read.get() <= super::SIMPLE_KEY_MAX_LOOKAHEAD + 128,
4283            "scanner read {} chars before yielding the first flow scalar",
4284            read.get()
4285        );
4286    }
4287
4288    #[test]
4289    fn comment_capture_does_not_change_leading_whitespace() {
4290        let mut scanner = Scanner::new(StrInput::new("# comment\n"));
4291
4292        let token = scanner.scan_comment_token().unwrap();
4293
4294        assert!(scanner.leading_whitespace);
4295        assert!(matches!(token.1, TokenType::Comment(ref comment) if comment.text == " comment"));
4296
4297        let mut scanner = Scanner::new(BufferedInput::new("# streaming\n".chars()));
4298        scanner.input.lookahead(1);
4299
4300        let token = scanner.scan_comment_token().unwrap();
4301
4302        assert!(scanner.leading_whitespace);
4303        assert!(matches!(token.1, TokenType::Comment(ref comment) if comment.text == " streaming"));
4304    }
4305
4306    #[test]
4307    fn comment_capture_falls_back_to_owned_slice_when_borrow_unavailable() {
4308        let mut scanner = Scanner::new(SlicingOnlyInput::new("# sliced\n", true));
4309        scanner.input.lookahead(2);
4310        assert_eq!(scanner.input.peek_nth(1), ' ');
4311
4312        let token = scanner.scan_comment_token().unwrap();
4313
4314        assert!(matches!(token.1, TokenType::Comment(ref comment)
4315            if matches!(comment.text, Cow::Owned(ref text) if text == " sliced")));
4316    }
4317
4318    #[test]
4319    fn comment_capture_errors_when_offsets_have_no_slice() {
4320        let mut scanner = Scanner::new(SlicingOnlyInput::new("# broken\n", false));
4321
4322        let error = scanner.scan_comment_token().unwrap_err();
4323
4324        assert_eq!(
4325            error.info(),
4326            "internal error: input advertised offsets but did not provide a slice"
4327        );
4328    }
4329
4330    #[test]
4331    fn queued_token_roundtrips_public_token_variants() {
4332        let span = Span::new(Marker::new(0, 1, 0), Marker::new(7, 1, 7));
4333        let tokens = [
4334            Token(span, TokenType::StreamStart(TEncoding::Utf8)),
4335            Token(span, TokenType::StreamEnd),
4336            Token(span, TokenType::VersionDirective(1, 2)),
4337            Token(
4338                span,
4339                TokenType::TagDirective(Cow::Borrowed("!app!"), Cow::Borrowed("tag:app.example,")),
4340            ),
4341            Token(span, TokenType::DocumentStart),
4342            Token(span, TokenType::DocumentEnd),
4343            Token(span, TokenType::BlockSequenceStart),
4344            Token(span, TokenType::BlockMappingStart),
4345            Token(span, TokenType::BlockEnd),
4346            Token(span, TokenType::FlowSequenceStart),
4347            Token(span, TokenType::FlowSequenceEnd),
4348            Token(span, TokenType::FlowMappingStart),
4349            Token(span, TokenType::FlowMappingEnd),
4350            Token(span, TokenType::BlockEntry),
4351            Token(span, TokenType::FlowEntry),
4352            Token(span, TokenType::Key),
4353            Token(span, TokenType::Value),
4354            Token(span, TokenType::Alias(Cow::Borrowed("alias"))),
4355            Token(span, TokenType::Anchor(Cow::Borrowed("anchor"))),
4356            Token(
4357                span,
4358                TokenType::Tag(Cow::Borrowed("!"), Cow::Borrowed("tag")),
4359            ),
4360            Token(
4361                span,
4362                TokenType::Scalar(ScalarStyle::Literal, Cow::Borrowed("scalar")),
4363            ),
4364            Token(
4365                span,
4366                TokenType::Comment(
4367                    Comment::new(span, Cow::Borrowed(" comment")).with_placement(Placement::Right),
4368                ),
4369            ),
4370            Token(
4371                span,
4372                TokenType::ReservedDirective(
4373                    "reserved".to_owned(),
4374                    vec!["one".to_owned(), "two".to_owned()],
4375                ),
4376            ),
4377        ];
4378
4379        for token in tokens {
4380            let queued: QueuedToken = token.clone().into();
4381
4382            assert_eq!(queued.into_public(), token);
4383        }
4384    }
4385
4386    #[test]
4387    fn comment_skipping_path_consumes_comment_without_tokenizing_it() {
4388        let mut scanner = Scanner::new(StrInput::new("# skipped\nnext: value\n"));
4389
4390        scanner.skip_yaml_whitespace(false).unwrap();
4391
4392        assert!(scanner.tokens.is_empty());
4393        assert_eq!(scanner.mark.line(), 2);
4394        assert_eq!(scanner.mark.col(), 0);
4395    }
4396
4397    #[test]
4398    fn yaml_whitespace_can_stop_after_queued_comment() {
4399        let mut scanner = Scanner::new(StrInput::new(" # queued\n# later\n"));
4400
4401        assert!(scanner.skip_yaml_whitespace(true).unwrap());
4402
4403        assert_eq!(scanner.tokens.len(), 1);
4404        assert!(matches!(
4405            scanner.tokens.front().unwrap().1,
4406            QueuedTokenType::Comment(ref comment) if comment.text == " queued"
4407        ));
4408        assert_eq!(scanner.mark.line(), 1);
4409        assert_eq!(scanner.mark.col(), 9);
4410    }
4411
4412    #[test]
4413    fn token_skip_can_stop_after_queued_comment() {
4414        let mut scanner = Scanner::new(StrInput::new("# first\n# second\n"));
4415
4416        assert!(scanner.skip_to_next_token(true).unwrap());
4417
4418        assert_eq!(scanner.tokens.len(), 1);
4419        assert!(matches!(
4420            scanner.tokens.front().unwrap().1,
4421            QueuedTokenType::Comment(ref comment) if comment.text == " first"
4422        ));
4423        assert_eq!(scanner.mark.line(), 2);
4424        assert_eq!(scanner.mark.col(), 0);
4425    }
4426
4427    #[test]
4428    fn scanner_emits_first_leading_comment_before_scanning_next_comment() {
4429        let mut scanner = Scanner::new(StrInput::new("# first\n# second\nkey: value\n"));
4430
4431        assert!(matches!(
4432            scanner.next_token().unwrap().unwrap().1,
4433            TokenType::StreamStart(_)
4434        ));
4435        assert!(matches!(
4436            scanner.next_token().unwrap().unwrap().1,
4437            TokenType::Comment(ref comment) if comment.text == " first"
4438        ));
4439        assert!(scanner.tokens.is_empty());
4440        assert!(matches!(
4441            scanner.next_token().unwrap().unwrap().1,
4442            TokenType::Comment(ref comment) if comment.text == " second"
4443        ));
4444    }
4445
4446    #[test]
4447    fn scanner_emits_quoted_scalar_comment_before_scanning_following_value() {
4448        let mut scanner = Scanner::new(StrInput::new("\"key\" # quoted\n: value\n"));
4449
4450        assert!(matches!(
4451            scanner.next_token().unwrap().unwrap().1,
4452            TokenType::StreamStart(_)
4453        ));
4454        assert!(matches!(
4455            scanner.next_token().unwrap().unwrap().1,
4456            TokenType::Scalar(ScalarStyle::DoubleQuoted, ref value) if value == "key"
4457        ));
4458        assert!(matches!(
4459            scanner.next_token().unwrap().unwrap().1,
4460            TokenType::Comment(ref comment) if comment.text == " quoted"
4461        ));
4462    }
4463
4464    #[test]
4465    fn flow_scalar_comment_disables_adjacent_value_lookahead() {
4466        let mut scanner = Scanner::new(StrInput::new("\"key\"\n# quoted\n: value\n"));
4467
4468        scanner.fetch_flow_scalar(false).unwrap();
4469
4470        assert_eq!(scanner.adjacent_value_allowed_at, usize::MAX);
4471        assert!(matches!(
4472            scanner.tokens.front().unwrap().1,
4473            QueuedTokenType::Scalar(ScalarStyle::DoubleQuoted, ref value) if value == "key"
4474        ));
4475        assert!(scanner.tokens.iter().any(|QueuedToken(_, token)| matches!(
4476            token,
4477            QueuedTokenType::Comment(comment) if comment.text == " quoted"
4478        )));
4479    }
4480
4481    #[test]
4482    fn deferred_error_waits_for_all_comment_tokens() {
4483        let mut scanner = Scanner::new(StrInput::new("# first\n# second\n@\n"));
4484
4485        assert!(matches!(
4486            scanner.next_token().unwrap().unwrap().1,
4487            TokenType::StreamStart(_)
4488        ));
4489        assert!(matches!(
4490            scanner.next_token().unwrap().unwrap().1,
4491            TokenType::Comment(ref comment) if comment.text == " first"
4492        ));
4493        assert!(matches!(
4494            scanner.next_token().unwrap().unwrap().1,
4495            TokenType::Comment(ref comment) if comment.text == " second"
4496        ));
4497
4498        let error = scanner.next_token().unwrap_err();
4499
4500        assert!(error.info().contains("unexpected character"));
4501    }
4502
4503    /// Ensure anchors scanned from `StrInput` are returned as `Cow::Borrowed`.
4504    #[test]
4505    fn anchor_name_is_borrowed_for_str_input() {
4506        let mut scanner = Scanner::new(StrInput::new("&anch\n"));
4507
4508        loop {
4509            let tok = scanner
4510                .next_token()
4511                .expect("valid YAML must scan without errors")
4512                .expect("scanner must eventually produce a token");
4513            if let TokenType::Anchor(name) = tok.1 {
4514                assert!(matches!(name, Cow::Borrowed("anch")));
4515                break;
4516            }
4517        }
4518    }
4519
4520    /// Ensure aliases scanned from `StrInput` are returned as `Cow::Borrowed`.
4521    #[test]
4522    fn anchor_name_rejects_non_printable_control_chars() {
4523        let mut scanner = Scanner::new(StrInput::new("&foo\u{0001}\n"));
4524
4525        loop {
4526            let tok = scanner
4527                .next_token()
4528                .expect("scanning should not fail")
4529                .expect("scanner must eventually produce a token");
4530            if let TokenType::Anchor(name) = tok.1 {
4531                assert!(matches!(name, Cow::Borrowed("foo")));
4532                let next = scanner.next_token().expect("scanning should not fail");
4533                if let Some(Token(_, TokenType::Scalar(_, rest))) = next {
4534                    assert!(rest.starts_with('\u{0001}'));
4535                }
4536                break;
4537            }
4538        }
4539    }
4540
4541    #[test]
4542    fn alias_name_rejects_non_printable_control_chars() {
4543        let mut scanner = Scanner::new(StrInput::new("*foo\u{0001}\n"));
4544
4545        loop {
4546            let tok = scanner
4547                .next_token()
4548                .expect("scanning should not fail")
4549                .expect("scanner must eventually produce a token");
4550            if let TokenType::Alias(name) = tok.1 {
4551                assert!(matches!(name, Cow::Borrowed("foo")));
4552                let next = scanner.next_token().expect("scanning should not fail");
4553                if let Some(Token(_, TokenType::Scalar(_, rest))) = next {
4554                    assert!(rest.starts_with('\u{0001}'));
4555                }
4556                break;
4557            }
4558        }
4559    }
4560
4561    #[test]
4562    fn alias_name_is_borrowed_for_str_input() {
4563        let mut scanner = Scanner::new(StrInput::new("*anch\n"));
4564
4565        loop {
4566            let tok = scanner
4567                .next_token()
4568                .expect("valid YAML must scan without errors")
4569                .expect("scanner must eventually produce a token");
4570            if let TokenType::Alias(name) = tok.1 {
4571                assert!(matches!(name, Cow::Borrowed("anch")));
4572                break;
4573            }
4574        }
4575    }
4576
4577    /// Ensure `%TAG` directive handle and prefix are borrowed when they are verbatim (no escapes).
4578    #[test]
4579    fn tag_directive_parts_are_borrowed_for_str_input() {
4580        let mut scanner = Scanner::new(StrInput::new("%TAG !e! tag:example.com,2000:app/\n"));
4581
4582        loop {
4583            let tok = scanner
4584                .next_token()
4585                .expect("valid YAML must scan without errors")
4586                .expect("scanner must eventually produce a token");
4587            if let TokenType::TagDirective(handle, prefix) = tok.1 {
4588                assert!(matches!(handle, Cow::Borrowed("!e!")));
4589                assert!(matches!(prefix, Cow::Borrowed("tag:example.com,2000:app/")));
4590                break;
4591            }
4592        }
4593    }
4594
4595    #[test]
4596    fn plain_scalar_is_borrowed_when_whitespace_free_for_str_input() {
4597        let mut scanner = Scanner::new(StrInput::new("foo\n"));
4598
4599        loop {
4600            let tok = scanner
4601                .next_token()
4602                .expect("valid YAML must scan without errors")
4603                .expect("scanner must eventually produce a token");
4604            if let TokenType::Scalar(_, value) = tok.1 {
4605                assert!(matches!(value, Cow::Borrowed("foo")));
4606                break;
4607            }
4608        }
4609    }
4610
4611    #[test]
4612    fn plain_scalar_is_borrowed_when_whitespace_present_for_str_input() {
4613        let mut scanner = Scanner::new(StrInput::new("foo bar\n"));
4614
4615        loop {
4616            let tok = scanner
4617                .next_token()
4618                .expect("valid YAML must scan without errors")
4619                .expect("scanner must eventually produce a token");
4620            if let TokenType::Scalar(_, value) = tok.1 {
4621                assert!(matches!(value, Cow::Borrowed("foo bar")));
4622                break;
4623            }
4624        }
4625    }
4626
4627    #[test]
4628    fn single_quoted_scalar_is_borrowed_when_verbatim_for_str_input() {
4629        let mut scanner = Scanner::new(StrInput::new("'foo bar'\n"));
4630
4631        loop {
4632            let tok = scanner
4633                .next_token()
4634                .expect("valid YAML must scan without errors")
4635                .expect("scanner must eventually produce a token");
4636            if let TokenType::Scalar(_, value) = tok.1 {
4637                assert!(matches!(value, Cow::Borrowed("foo bar")));
4638                break;
4639            }
4640        }
4641    }
4642
4643    #[test]
4644    fn single_quoted_scalar_is_owned_when_quote_is_escaped_for_str_input() {
4645        let mut scanner = Scanner::new(StrInput::new("'foo''bar'\n"));
4646
4647        loop {
4648            let tok = scanner
4649                .next_token()
4650                .expect("valid YAML must scan without errors")
4651                .expect("scanner must eventually produce a token");
4652            if let TokenType::Scalar(_, value) = tok.1 {
4653                assert!(matches!(value, Cow::Owned(_)));
4654                assert_eq!(&*value, "foo'bar");
4655                break;
4656            }
4657        }
4658    }
4659
4660    #[test]
4661    fn double_quoted_scalar_is_borrowed_when_verbatim_for_str_input() {
4662        let mut scanner = Scanner::new(StrInput::new("\"foo bar\"\n"));
4663
4664        loop {
4665            let tok = scanner
4666                .next_token()
4667                .expect("valid YAML must scan without errors")
4668                .expect("scanner must eventually produce a token");
4669            if let TokenType::Scalar(_, value) = tok.1 {
4670                assert!(matches!(value, Cow::Borrowed("foo bar")));
4671                break;
4672            }
4673        }
4674    }
4675
4676    #[test]
4677    fn double_quoted_scalar_is_owned_when_escape_sequence_present_for_str_input() {
4678        let mut scanner = Scanner::new(StrInput::new("\"foo\\nbar\"\n"));
4679
4680        loop {
4681            let tok = scanner
4682                .next_token()
4683                .expect("valid YAML must scan without errors")
4684                .expect("scanner must eventually produce a token");
4685            if let TokenType::Scalar(_, value) = tok.1 {
4686                assert!(matches!(value, Cow::Owned(_)));
4687                assert_eq!(&*value, "foo\nbar");
4688                break;
4689            }
4690        }
4691    }
4692
4693    #[test]
4694    fn plain_key_is_borrowed_for_str_input() {
4695        // Keys are just scalars in a key position; they should also be borrowed.
4696        let mut scanner = Scanner::new(StrInput::new("mykey: value\n"));
4697
4698        let mut found_key = false;
4699        let mut key_value: Option<Cow<'_, str>> = None;
4700
4701        loop {
4702            let tok = scanner
4703                .next_token()
4704                .expect("valid YAML must scan without errors");
4705            let Some(tok) = tok else { break };
4706
4707            if matches!(tok.1, TokenType::Key) {
4708                found_key = true;
4709            } else if found_key {
4710                if let TokenType::Scalar(_, value) = tok.1 {
4711                    key_value = Some(value);
4712                    break;
4713                }
4714            }
4715        }
4716
4717        assert!(found_key, "expected to find a Key token");
4718        let key_value = key_value.expect("expected to find a scalar after Key token");
4719        assert!(
4720            matches!(key_value, Cow::Borrowed("mykey")),
4721            "key should be borrowed, got: {key_value:?}"
4722        );
4723    }
4724
4725    #[test]
4726    fn quoted_key_is_borrowed_when_verbatim_for_str_input() {
4727        let mut scanner = Scanner::new(StrInput::new("\"mykey\": value\n"));
4728
4729        let mut found_key = false;
4730        let mut key_value: Option<Cow<'_, str>> = None;
4731
4732        loop {
4733            let tok = scanner
4734                .next_token()
4735                .expect("valid YAML must scan without errors");
4736            let Some(tok) = tok else { break };
4737
4738            if matches!(tok.1, TokenType::Key) {
4739                found_key = true;
4740            } else if found_key {
4741                if let TokenType::Scalar(_, value) = tok.1 {
4742                    key_value = Some(value);
4743                    break;
4744                }
4745            }
4746        }
4747
4748        assert!(found_key, "expected to find a Key token");
4749        let key_value = key_value.expect("expected to find a scalar after Key token");
4750        assert!(
4751            matches!(key_value, Cow::Borrowed("mykey")),
4752            "quoted key should be borrowed when verbatim, got: {key_value:?}"
4753        );
4754    }
4755
4756    #[test]
4757    fn tag_handle_and_suffix_are_borrowed_for_str_input() {
4758        // Test a tag like !!str which should have handle="!!" and suffix="str"
4759        let mut scanner = Scanner::new(StrInput::new("!!str foo\n"));
4760
4761        loop {
4762            let tok = scanner
4763                .next_token()
4764                .expect("valid YAML must scan without errors")
4765                .expect("scanner must eventually produce a token");
4766            if let TokenType::Tag(handle, suffix) = tok.1 {
4767                assert!(
4768                    matches!(handle, Cow::Borrowed("!!")),
4769                    "tag handle should be borrowed, got: {handle:?}"
4770                );
4771                assert!(
4772                    matches!(suffix, Cow::Borrowed("str")),
4773                    "tag suffix should be borrowed, got: {suffix:?}"
4774                );
4775                break;
4776            }
4777        }
4778    }
4779
4780    #[test]
4781    fn local_tag_suffix_is_borrowed_for_str_input() {
4782        // Test a local tag like !mytag which should have handle="!" and suffix="mytag"
4783        let mut scanner = Scanner::new(StrInput::new("!mytag foo\n"));
4784
4785        loop {
4786            let tok = scanner
4787                .next_token()
4788                .expect("valid YAML must scan without errors")
4789                .expect("scanner must eventually produce a token");
4790            if let TokenType::Tag(handle, suffix) = tok.1 {
4791                assert!(
4792                    matches!(handle, Cow::Borrowed("!")),
4793                    "local tag handle should be '!', got: {handle:?}"
4794                );
4795                assert!(
4796                    matches!(suffix, Cow::Borrowed("mytag")),
4797                    "local tag suffix should be borrowed, got: {suffix:?}"
4798                );
4799                break;
4800            }
4801        }
4802    }
4803
4804    #[test]
4805    fn tag_with_uri_escape_is_owned_for_str_input() {
4806        // Test a tag with URI escape like !my%20tag - suffix must be owned due to decoding
4807        let mut scanner = Scanner::new(StrInput::new("!!my%20tag foo\n"));
4808
4809        loop {
4810            let tok = scanner
4811                .next_token()
4812                .expect("valid YAML must scan without errors")
4813                .expect("scanner must eventually produce a token");
4814            if let TokenType::Tag(handle, suffix) = tok.1 {
4815                assert!(
4816                    matches!(handle, Cow::Borrowed("!!")),
4817                    "tag handle should still be borrowed, got: {handle:?}"
4818                );
4819                assert!(
4820                    matches!(suffix, Cow::Owned(_)),
4821                    "tag suffix with URI escape should be owned, got: {suffix:?}"
4822                );
4823                assert_eq!(&*suffix, "my tag");
4824                break;
4825            }
4826        }
4827    }
4828
4829    #[test]
4830    fn flow_scalar_buffer_tracks_pending_whitespace() {
4831        let mut borrowed = super::FlowScalarBuf::new_borrowed(2);
4832
4833        borrowed.note_pending_ws(5, 8);
4834        borrowed.commit_pending_ws();
4835        assert!(matches!(
4836            borrowed,
4837            super::FlowScalarBuf::Borrowed {
4838                end: 8,
4839                pending_ws_start: None,
4840                pending_ws_end: 8,
4841                ..
4842            }
4843        ));
4844
4845        borrowed.note_pending_ws(9, 11);
4846        borrowed.discard_pending_ws();
4847        assert!(matches!(
4848            borrowed,
4849            super::FlowScalarBuf::Borrowed {
4850                end: 8,
4851                pending_ws_start: None,
4852                pending_ws_end: 8,
4853                ..
4854            }
4855        ));
4856        assert!(borrowed.as_owned_mut().is_none());
4857
4858        let mut owned = super::FlowScalarBuf::new_owned();
4859        owned.as_owned_mut().unwrap().push_str("owned");
4860        assert!(matches!(owned, super::FlowScalarBuf::Owned(ref s) if s == "owned"));
4861    }
4862
4863    fn first_scanner_error_info(input: &str) -> String {
4864        let mut scanner = Scanner::new(StrInput::new(input));
4865        loop {
4866            match scanner.next_token() {
4867                Ok(Some(_)) => {}
4868                Ok(None) => panic!("expected scanner error"),
4869                Err(error) => return error.info().to_owned(),
4870            }
4871        }
4872    }
4873
4874    fn first_scalar_value(input: &str) -> String {
4875        let mut scanner = Scanner::new(StrInput::new(input));
4876        loop {
4877            match scanner.next_token().expect("scanner should not error") {
4878                Some(Token(_, TokenType::Scalar(_, value))) => return value.into_owned(),
4879                Some(_) => {}
4880                None => panic!("expected scalar token"),
4881            }
4882        }
4883    }
4884
4885    #[test]
4886    fn iterator_next_records_error_and_then_stays_empty() {
4887        let mut scanner = Scanner::new(StrInput::new("\"unterminated"));
4888
4889        while scanner.next().is_some() {}
4890
4891        let error = scanner
4892            .get_error()
4893            .expect("scanner should retain the error");
4894        assert_eq!(error.info(), "unclosed quote");
4895        assert!(scanner.next().is_none());
4896    }
4897
4898    #[test]
4899    fn next_token_returns_none_after_stream_end() {
4900        let mut scanner = Scanner::new(StrInput::new(""));
4901
4902        while let Some(token) = scanner.next_token().unwrap() {
4903            if matches!(token.1, TokenType::StreamEnd) {
4904                break;
4905            }
4906        }
4907
4908        assert!(scanner.stream_started());
4909        assert!(scanner.stream_ended());
4910        assert!(scanner.next_token().unwrap().is_none());
4911    }
4912
4913    #[test]
4914    fn directive_name_must_be_present() {
4915        assert_eq!(
4916            first_scanner_error_info("%\n"),
4917            "while scanning a directive, could not find expected directive name"
4918        );
4919    }
4920
4921    #[test]
4922    fn yaml_directive_requires_dot_between_version_numbers() {
4923        assert_eq!(
4924            first_scanner_error_info("%YAML 1\n"),
4925            "while scanning a YAML directive, did not find expected digit or '.' character"
4926        );
4927    }
4928
4929    #[test]
4930    fn yaml_directive_requires_major_version_number() {
4931        assert_eq!(
4932            first_scanner_error_info("%YAML .2\n"),
4933            "while scanning a YAML directive, did not find expected version number"
4934        );
4935    }
4936
4937    #[test]
4938    fn yaml_directive_rejects_extremely_long_version_number() {
4939        assert_eq!(
4940            first_scanner_error_info("%YAML 1234567890.2\n"),
4941            "while scanning a YAML directive, found extremely long version number"
4942        );
4943    }
4944
4945    #[test]
4946    fn tag_directive_handle_must_end_with_bang() {
4947        assert_eq!(
4948            first_scanner_error_info("%TAG !bad tag:example.com,2024:\n"),
4949            "while parsing a tag directive, did not find expected '!'"
4950        );
4951    }
4952
4953    #[test]
4954    fn tag_directive_handle_must_start_with_bang() {
4955        assert_eq!(
4956            first_scanner_error_info("%TAG bad! tag:example.com,2024:\n"),
4957            "while scanning a tag, did not find expected '!'"
4958        );
4959    }
4960
4961    #[test]
4962    fn tag_directive_prefix_must_start_with_tag_character() {
4963        assert_eq!(
4964            first_scanner_error_info("%TAG !e! `bad\n"),
4965            "invalid global tag character"
4966        );
4967    }
4968
4969    #[test]
4970    fn tag_directive_prefix_must_end_before_invalid_content() {
4971        assert_eq!(
4972            first_scanner_error_info("%TAG !e! tag:example.com^suffix\n"),
4973            "while scanning TAG, did not find expected whitespace or line break"
4974        );
4975    }
4976
4977    #[test]
4978    fn tag_directive_prefix_with_uri_escape_is_owned_and_decoded() {
4979        let mut scanner =
4980            Scanner::new(StrInput::new("%TAG !e! tag:example.com,2024:some%20app/\n"));
4981
4982        loop {
4983            let token = scanner
4984                .next_token()
4985                .expect("valid directive should scan")
4986                .expect("scanner must produce a directive token");
4987            if let TokenType::TagDirective(handle, prefix) = token.1 {
4988                assert!(matches!(handle, Cow::Borrowed("!e!")));
4989                assert!(matches!(prefix, Cow::Owned(_)));
4990                assert_eq!(&*prefix, "tag:example.com,2024:some app/");
4991                break;
4992            }
4993        }
4994    }
4995
4996    #[test]
4997    fn bare_bang_tag_scans_as_non_specific_tag() {
4998        let mut scanner = Scanner::new(StrInput::new("! foo\n"));
4999
5000        loop {
5001            let token = scanner
5002                .next_token()
5003                .expect("valid tag should scan")
5004                .expect("scanner must produce a tag token");
5005            if let TokenType::Tag(handle, suffix) = token.1 {
5006                assert_eq!(&*handle, "");
5007                assert_eq!(&*suffix, "!");
5008                break;
5009            }
5010        }
5011    }
5012
5013    #[test]
5014    fn tag_requires_separation_after_suffix() {
5015        assert_eq!(
5016            first_scanner_error_info("!foo,bar\n"),
5017            "while scanning a tag, did not find expected whitespace or line break"
5018        );
5019    }
5020
5021    #[test]
5022    fn verbatim_tag_requires_uri() {
5023        assert_eq!(
5024            first_scanner_error_info("!<> foo\n"),
5025            "while parsing a tag, did not find expected tag URI"
5026        );
5027    }
5028
5029    #[test]
5030    fn verbatim_tag_requires_closing_angle_bracket() {
5031        assert_eq!(
5032            first_scanner_error_info("!<tag:yaml.org,2002:str foo\n"),
5033            "while scanning a verbatim tag, did not find the expected '>'"
5034        );
5035    }
5036
5037    #[test]
5038    fn tag_uri_escape_requires_hex_digits() {
5039        assert_eq!(
5040            first_scanner_error_info("!!bad%zz foo\n"),
5041            "while parsing a tag, found an invalid escape sequence"
5042        );
5043    }
5044
5045    #[test]
5046    fn tag_uri_escape_rejects_bad_leading_utf8_byte() {
5047        assert_eq!(
5048            first_scanner_error_info("!!bad%80 foo\n"),
5049            "while parsing a tag, found an incorrect leading UTF-8 byte"
5050        );
5051    }
5052
5053    #[test]
5054    fn tag_uri_escape_rejects_bad_trailing_utf8_byte() {
5055        assert_eq!(
5056            first_scanner_error_info("!!bad%C2%41 foo\n"),
5057            "while parsing a tag, found an incorrect trailing UTF-8 byte"
5058        );
5059    }
5060
5061    #[test]
5062    fn tag_uri_escape_rejects_invalid_utf8_codepoint() {
5063        assert_eq!(
5064            first_scanner_error_info("!!bad%F4%90%80%80 foo\n"),
5065            "while parsing a tag, found an invalid UTF-8 codepoint"
5066        );
5067    }
5068
5069    #[test]
5070    fn anchors_and_aliases_require_names() {
5071        let expected =
5072            "while scanning an anchor or alias, did not find expected alphabetic or numeric character";
5073
5074        assert_eq!(first_scanner_error_info("& \n"), expected);
5075        assert_eq!(first_scanner_error_info("* \n"), expected);
5076    }
5077
5078    #[test]
5079    fn document_end_marker_rejects_trailing_content() {
5080        assert_eq!(
5081            first_scanner_error_info("... trailing\n"),
5082            "invalid content after document end marker"
5083        );
5084    }
5085
5086    #[test]
5087    fn reserved_indicators_are_rejected_outside_directives() {
5088        assert_eq!(
5089            first_scanner_error_info(" @\n"),
5090            "unexpected character: `@'"
5091        );
5092    }
5093
5094    #[test]
5095    fn flow_block_entry_indicator_is_rejected() {
5096        assert_eq!(
5097            first_scanner_error_info("[- ]\n"),
5098            r#""-" is only valid inside a block"#
5099        );
5100    }
5101
5102    #[test]
5103    fn block_entry_after_tabbed_separator_reports_specific_error() {
5104        assert_eq!(
5105            first_scanner_error_info("-\t- value\n"),
5106            "'-' must be followed by a valid YAML whitespace"
5107        );
5108    }
5109
5110    #[test]
5111    fn document_indicator_reports_unclosed_flow_collection() {
5112        assert_eq!(first_scanner_error_info("[\n---\n"), "unclosed bracket '['");
5113    }
5114
5115    #[test]
5116    fn block_scalar_header_rejects_trailing_content() {
5117        assert_eq!(
5118            first_scanner_error_info("|+ trailing\n"),
5119            "while scanning a block scalar, did not find expected comment or line break"
5120        );
5121    }
5122
5123    #[test]
5124    fn block_scalar_rejects_zero_indent_indicator() {
5125        let expected = "while scanning a block scalar, found an indentation indicator equal to 0";
5126
5127        assert_eq!(first_scanner_error_info("|0\n"), expected);
5128        assert_eq!(first_scanner_error_info("|+0\n"), expected);
5129    }
5130
5131    #[test]
5132    fn empty_block_scalar_at_eof_honors_chomping() {
5133        assert_eq!(first_scalar_value("|-\n"), "");
5134        assert_eq!(first_scalar_value("|+\n"), "\n");
5135    }
5136
5137    #[test]
5138    fn explicit_indent_block_scalar_can_end_at_document_marker() {
5139        assert_eq!(first_scalar_value("|1\n...\n"), "");
5140    }
5141
5142    #[test]
5143    fn root_explicit_indent_block_scalar_rejects_underindented_content() {
5144        assert_eq!(
5145            first_scanner_error_info("|2\nx\n"),
5146            "wrongly indented line in block scalar"
5147        );
5148    }
5149
5150    #[test]
5151    fn quoted_scalar_rejects_document_indicator_at_line_start() {
5152        assert_eq!(
5153            first_scanner_error_info("\"one\n---\ntwo\"\n"),
5154            "while scanning a quoted scalar, found unexpected document indicator"
5155        );
5156    }
5157
5158    #[test]
5159    fn quoted_scalar_rejects_tab_indentation_after_line_break() {
5160        assert_eq!(
5161            first_scanner_error_info("a: \"one\n\tbad\"\n"),
5162            "tab cannot be used as indentation"
5163        );
5164    }
5165
5166    #[test]
5167    fn quoted_scalar_rejects_underindented_continuation() {
5168        assert_eq!(
5169            first_scanner_error_info("a: \"one\nbad\"\n"),
5170            "invalid indentation in multiline quoted scalar"
5171        );
5172    }
5173
5174    #[test]
5175    fn indented_flow_scalar_reports_invalid_indentation() {
5176        assert_eq!(
5177            first_scanner_error_info("a:\n  [\nfoo]\n"),
5178            "invalid indentation"
5179        );
5180    }
5181
5182    #[test]
5183    fn required_simple_key_requires_value_at_stream_end() {
5184        assert_eq!(
5185            first_scanner_error_info("a:\n&b\n- c\n"),
5186            "simple key expect ':'"
5187        );
5188    }
5189
5190    #[test]
5191    fn plain_scalar_rejects_dash_before_flow_indicator() {
5192        assert_eq!(
5193            first_scanner_error_info("[-]\n"),
5194            "plain scalar cannot start with '-' followed by ,[]{}"
5195        );
5196    }
5197
5198    #[test]
5199    fn explicit_key_rejects_tab_after_indicator() {
5200        assert_eq!(
5201            first_scanner_error_info("? \tfoo\n"),
5202            "tabs disallowed in this context"
5203        );
5204    }
5205
5206    #[test]
5207    fn flow_mapping_rejects_adjacent_collection_value_after_plain_key() {
5208        assert_eq!(
5209            first_scanner_error_info("[a:[]]\n"),
5210            "':' may not precede any of `[{` in flow mapping"
5211        );
5212    }
5213
5214    #[test]
5215    fn implicit_flow_mapping_colon_cannot_move_to_next_line() {
5216        assert_eq!(
5217            first_scanner_error_info("[foo\n: bar]\n"),
5218            "illegal placement of ':' indicator"
5219        );
5220    }
5221}