granit_parser/
scanner.rs

1//! Home to the YAML Scanner.
2//!
3//! The scanner is the lowest-level parsing utility. It is the lexer / tokenizer, reading input a
4//! character at a time and emitting tokens that can later be interpreted by the [`crate::parser`]
5//! to check for more context and validity.
6//!
7//! Due to the grammar of YAML, the scanner has to have some context and is not error-free.
8
9#![allow(clippy::cast_possible_wrap)]
10#![allow(clippy::cast_sign_loss)]
11
12use alloc::{
13    borrow::{Cow, ToOwned},
14    collections::VecDeque,
15    string::String,
16    vec::Vec,
17};
18use core::{char, fmt};
19
20use crate::{
21    char_traits::{
22        as_hex, is_anchor_char, is_blank_or_breakz, is_bom, is_break, is_breakz, is_flow, is_hex,
23        is_tag_char, is_uri_char,
24    },
25    input::{BorrowedInput, SkipTabs},
26};
27
/// Maximum number of characters the scanner may look ahead while disambiguating a simple key.
///
/// NOTE(review): the code that enforces this limit is not in this part of the file. The value
/// presumably mirrors the YAML 1.2 rule that an implicit key must not span more than 1024
/// Unicode characters — confirm against the use site.
const SIMPLE_KEY_MAX_LOOKAHEAD: usize = 1024;
30
/// Character encoding of the scanned input.
///
/// UTF-8 is the only encoding supported at the moment.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TEncoding {
    /// The input is encoded as UTF-8.
    Utf8,
}
37
/// The presentation style a YAML scalar was written in.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum ScalarStyle {
    /// An unquoted (plain) scalar.
    Plain,
    /// A scalar enclosed in single quotes.
    SingleQuoted,
    /// A scalar enclosed in double quotes.
    DoubleQuoted,

    /// A literal block scalar (introduced by `|`).
    ///
    /// See [8.1.2](https://yaml.org/spec/1.2.2/#812-literal-style).
    /// Every indented character of a literal block is content, white space included. Characters
    /// cannot be escaped and long lines cannot be broken.
    Literal,
    /// A folded block scalar (introduced by `>`).
    ///
    /// See [8.1.3](https://yaml.org/spec/1.2.2/#813-folded-style).
    /// Every indented character of a folded block is content, white space included. Characters
    /// cannot be escaped, but the content is subject to line folding, which allows long lines
    /// to be broken.
    Folded,
}
62
/// Offset information for a [`Marker`].
///
/// A YAML input is either a complete `&str` (stable backing storage) or a streaming character
/// source. With a stable input both a character index and a byte offset can be tracked. With a
/// streaming input a byte offset is generally not useful (it may not correspond to any
/// meaningful underlying file/source), which is why it is optional.
#[derive(Debug, Default, Clone, Copy)]
pub struct MarkerOffsets {
    /// Position in the source, counted in characters.
    chars: usize,
    /// Position in the source, counted in bytes, when the input can provide it.
    bytes: Option<usize>,
}

impl PartialEq for MarkerOffsets {
    fn eq(&self, other: &Self) -> bool {
        // Only the character position takes part in the comparison: the byte offset is an
        // optional diagnostic enhancement that may differ between input backends (e.g. `&str`
        // vs streaming).
        let Self { chars: lhs, bytes: _ } = *self;
        let Self { chars: rhs, bytes: _ } = *other;
        lhs == rhs
    }
}

impl Eq for MarkerOffsets {}
87
88/// A location in a YAML document.
89#[derive(Clone, Copy, PartialEq, Debug, Eq, Default)]
90pub struct Marker {
91    /// Offsets in the source.
92    offsets: MarkerOffsets,
93    /// The line (1-indexed).
94    line: usize,
95    /// The column (0-indexed).
96    col: usize,
97}
98
99impl Marker {
100    /// Create a new [`Marker`] at the given position.
101    #[must_use]
102    pub fn new(index: usize, line: usize, col: usize) -> Marker {
103        Marker {
104            offsets: MarkerOffsets {
105                chars: index,
106                bytes: None,
107            },
108            line,
109            col,
110        }
111    }
112
113    /// Return a copy of the marker with the given optional byte offset.
114    #[must_use]
115    pub fn with_byte_offset(mut self, byte_offset: Option<usize>) -> Marker {
116        self.offsets.bytes = byte_offset;
117        self
118    }
119
120    /// Return the index (in characters) of the marker in the source.
121    #[must_use]
122    pub fn index(&self) -> usize {
123        self.offsets.chars
124    }
125
126    /// Return the byte offset of the marker in the source, if available.
127    #[must_use]
128    pub fn byte_offset(&self) -> Option<usize> {
129        self.offsets.bytes
130    }
131
132    /// Return the line of the marker in the source.
133    #[must_use]
134    pub fn line(&self) -> usize {
135        self.line
136    }
137
138    /// Return the column of the marker in the source.
139    #[must_use]
140    pub fn col(&self) -> usize {
141        self.col
142    }
143}
144
145/// A range of locations in a YAML document.
146#[derive(Clone, Copy, PartialEq, Debug, Eq, Default)]
147pub struct Span {
148    /// The start (inclusive) of the range.
149    pub start: Marker,
150    /// The end (exclusive) of the range.
151    pub end: Marker,
152
153    /// Optional indentation hint associated with this span.
154    ///
155    /// This is only meaningful for certain parser-emitted events (notably: block mapping keys).
156    /// When indentation is not meaningful or cannot be provided, it must be `None`.
157    pub indent: Option<usize>,
158
159    /// Optional source marker for the explicit tag token attached to this node.
160    ///
161    /// This is only meaningful for parser-emitted node events that carry a resolved tag, such as
162    /// [`Event::Scalar`](crate::Event::Scalar),
163    /// [`Event::SequenceStart`](crate::Event::SequenceStart), or
164    /// [`Event::MappingStart`](crate::Event::MappingStart). The normal [`Span::start`] and
165    /// [`Span::end`] continue to cover the node value or collection; `tag_start` points to the
166    /// tag token when that token appears at a different source location.
167    pub tag_start: Option<Marker>,
168}
169
170impl Span {
171    /// Create a new [`Span`] for the given range.
172    #[must_use]
173    pub fn new(start: Marker, end: Marker) -> Span {
174        Span {
175            start,
176            end,
177            indent: None,
178            tag_start: None,
179        }
180    }
181
182    /// Create an empty [`Span`] at a given location.
183    ///
184    /// An empty span doesn't contain any characters, but its position may still be meaningful.
185    /// For example, for an indented sequence [`SequenceEnd`] has a location but an empty span.
186    ///
187    /// [`SequenceEnd`]: crate::Event::SequenceEnd
188    #[must_use]
189    pub fn empty(mark: Marker) -> Span {
190        Span {
191            start: mark,
192            end: mark,
193            indent: None,
194            tag_start: None,
195        }
196    }
197
198    /// Return a copy of this [`Span`] with the given indentation hint.
199    #[must_use]
200    pub fn with_indent(mut self, indent: Option<usize>) -> Span {
201        self.indent = indent;
202        self
203    }
204
205    /// Return a copy of this [`Span`] with the given explicit tag-token start marker.
206    #[must_use]
207    pub fn with_tag_start(mut self, tag_start: Option<Marker>) -> Span {
208        self.tag_start = tag_start;
209        self
210    }
211
212    /// Return the source marker of the explicit tag token attached to this node, if any.
213    ///
214    /// The regular span still covers the node value or collection. This accessor is useful for
215    /// diagnostics that should point at the tag itself, especially when a tagged block collection
216    /// begins on a later line than the tag token.
217    #[must_use]
218    pub fn tag_start(&self) -> Option<Marker> {
219        self.tag_start
220    }
221
222    /// Return the length of the span (in characters).
223    #[must_use]
224    pub fn len(&self) -> usize {
225        self.end.index() - self.start.index()
226    }
227
228    /// Return whether the [`Span`] has a length of zero.
229    #[must_use]
230    pub fn is_empty(&self) -> bool {
231        self.len() == 0
232    }
233
234    /// Return the byte range of the span, if available.
235    #[must_use]
236    pub fn byte_range(&self) -> Option<core::ops::Range<usize>> {
237        let start = self.start.byte_offset()?;
238        let end = self.end.byte_offset()?;
239        Some(start..end)
240    }
241
242    /// Return the source text covered by this span, if byte offsets are available
243    /// and the range is valid for the provided input.
244    #[must_use]
245    pub fn slice<'source>(&self, source: &'source str) -> Option<&'source str> {
246        source.get(self.byte_range()?)
247    }
248}
249
/// Where a YAML source comment sits relative to the surrounding content.
///
/// These are the placements the parser currently recognizes:
///
/// ```yaml
/// # Above
/// key: value # Right
///
/// # Free
///
/// next: value
///
/// # Last
/// ```
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum Placement {
    /// A comment on its own line, directly before another YAML token.
    ///
    /// Such a comment usually describes the node that follows it. A run of own-line comments
    /// with no blank line between them is also `Above`, so the whole comment block can be
    /// attached to the next YAML element as a group.
    Above,
    /// A comment on the same line as, and after, YAML content or syntax. Examples include
    /// `key: value # Right` and `- # Right` for an empty sequence entry.
    Right,
    /// A comment on its own line that is separated from the YAML tokens around it.
    ///
    /// This is the fallback used when a comment is not a same-line comment, is not immediately
    /// above a following token, and is not the final comment in the stream. Consumers should
    /// treat `Free` as having no obvious neighboring node.
    #[default]
    Free,
    /// A comment on its own line at the end of the input stream.
    ///
    /// Blank lines may follow a `Last` comment, but no further YAML token appears before
    /// `StreamEnd`.
    Last,
}
288
289/// A YAML comment captured from the source.
290///
291/// Comments are presentation metadata, not YAML data. This type carries the raw comment payload,
292/// source span, and a best-effort [`Placement`] hint for callers that want to correlate comments
293/// with nearby YAML presentation.
294#[derive(Clone, PartialEq, Debug, Eq)]
295pub struct Comment<'input> {
296    /// Span covering the whole source comment, including `#` and excluding the line break.
297    pub span: Span,
298    /// Raw comment payload exactly after `#`, excluding only the line break.
299    ///
300    /// Leading spaces are preserved, including a single space immediately after `#` when present.
301    pub text: Cow<'input, str>,
302    /// Best-effort placement of this comment relative to nearby YAML content.
303    pub placement: Placement,
304}
305
306impl<'input> Comment<'input> {
307    /// Create a captured YAML comment from a source span and raw payload.
308    ///
309    /// The placement defaults to [`Placement::Free`]. Use [`Comment::with_placement`] when the
310    /// caller already knows a more specific placement.
311    #[must_use]
312    pub fn new(span: Span, text: impl Into<Cow<'input, str>>) -> Self {
313        Self {
314            span,
315            text: text.into(),
316            placement: Placement::Free,
317        }
318    }
319
320    /// Return this comment with the given placement.
321    #[must_use]
322    pub fn with_placement(mut self, placement: Placement) -> Self {
323        self.placement = placement;
324        self
325    }
326
327    /// Return the comment payload with surrounding whitespace removed.
328    ///
329    /// This helper is ergonomic only. The raw [`Self::text`] payload remains unchanged.
330    #[must_use]
331    pub fn trimmed_text(&self) -> &str {
332        self.text.trim()
333    }
334}
335
336impl AsRef<str> for Comment<'_> {
337    fn as_ref(&self) -> &str {
338        self.text.as_ref()
339    }
340}
341
342/// An error that occurred while scanning.
343#[derive(Clone, PartialEq, Debug, Eq)]
344pub struct ScanError {
345    /// The position at which the error happened in the source.
346    mark: Marker,
347    /// Human-readable details about the error.
348    info: String,
349}
350
351impl ScanError {
352    /// Create a new error from a location and an error string.
353    #[must_use]
354    #[cold]
355    pub fn new(loc: Marker, info: String) -> ScanError {
356        ScanError { mark: loc, info }
357    }
358
359    /// Convenience alias for string slices.
360    #[must_use]
361    #[cold]
362    pub fn new_str(loc: Marker, info: &str) -> ScanError {
363        ScanError {
364            mark: loc,
365            info: info.to_owned(),
366        }
367    }
368
369    #[cold]
370    pub(crate) fn into_result<T>(self) -> Result<T, ScanError> {
371        Err(self)
372    }
373
374    /// Return the marker pointing to the error in the source.
375    #[must_use]
376    pub fn marker(&self) -> &Marker {
377        &self.mark
378    }
379
380    /// Return the information string describing the error that happened.
381    #[must_use]
382    pub fn info(&self) -> &str {
383        self.info.as_ref()
384    }
385}
386
387impl fmt::Display for ScanError {
388    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
389        write!(
390            f,
391            "{} at char {} line {} column {}",
392            self.info,
393            self.mark.index(),
394            self.mark.line(),
395            self.mark.col() + 1
396        )
397    }
398}
399
400impl core::error::Error for ScanError {}
401
402/// The contents of a scanner token.
403#[derive(Clone, PartialEq, Debug, Eq)]
404pub enum TokenType<'input> {
405    /// The start of the stream. Sent first, before even [`TokenType::DocumentStart`].
406    StreamStart(TEncoding),
407    /// The end of the stream, EOF.
408    StreamEnd,
409    /// A YAML version directive.
410    VersionDirective(
411        /// Major version number.
412        u32,
413        /// Minor version number.
414        u32,
415    ),
416    /// A YAML tag directive (e.g.: `!!str`, `!foo!bar`, ...).
417    TagDirective(
418        /// Tag directive handle, such as `!` or `!app!`.
419        Cow<'input, str>,
420        /// Tag URI prefix associated with the handle.
421        Cow<'input, str>,
422    ),
423    /// The start of a YAML document (`---`).
424    DocumentStart,
425    /// The end of a YAML document (`...`).
426    DocumentEnd,
427    /// The start of a sequence block.
428    ///
429    /// Sequence blocks are arrays starting with a `-`.
430    BlockSequenceStart,
431    /// The start of a block mapping.
432    ///
433    /// Block mappings are key-value collections written with `key: value` entries.
434    BlockMappingStart,
435    /// End of the corresponding `BlockSequenceStart` or `BlockMappingStart`.
436    BlockEnd,
437    /// Start of an inline sequence (`[ a, b ]`).
438    FlowSequenceStart,
439    /// End of an inline sequence.
440    FlowSequenceEnd,
441    /// Start of an inline mapping (`{ a: b, c: d }`).
442    FlowMappingStart,
443    /// End of an inline mapping.
444    FlowMappingEnd,
445    /// An entry in a block sequence (see [`TokenType::BlockSequenceStart`]).
446    BlockEntry,
447    /// An entry in a flow sequence (see [`TokenType::FlowSequenceStart`]).
448    FlowEntry,
449    /// A key in a mapping.
450    Key,
451    /// A value in a mapping.
452    Value,
453    /// A reference to a previously defined anchor.
454    Alias(Cow<'input, str>),
455    /// A YAML anchor definition introduced by `&`.
456    Anchor(Cow<'input, str>),
457    /// A YAML tag (starting with bangs `!`).
458    Tag(
459        /// The handle of the tag.
460        Cow<'input, str>,
461        /// The suffix of the tag.
462        Cow<'input, str>,
463    ),
464    /// A regular YAML scalar.
465    Scalar(ScalarStyle, Cow<'input, str>),
466    /// A YAML source comment.
467    ///
468    /// The token payload carries the raw text exactly after `#`, the source span, and an initial
469    /// [`Placement`] hint. The token's companion [`Span`] is the same as [`Comment::span`].
470    Comment(
471        /// Captured comment metadata.
472        Comment<'input>,
473    ),
474    /// A reserved YAML directive.
475    ReservedDirective(
476        /// Directive name.
477        String,
478        /// Directive parameters, split on YAML whitespace.
479        Vec<String>,
480    ),
481}
482
483/// A scanner token.
484#[derive(Clone, PartialEq, Debug, Eq)]
485pub struct Token<'input>(
486    /// Source span covered by this token.
487    pub Span,
488    /// Token payload emitted by the scanner.
489    pub TokenType<'input>,
490);
491
492/// Compact comment metadata used only inside the scanner queue.
493///
494/// The queued token already stores the source span, so storing a full public [`Comment`] there
495/// duplicates a large [`Span`] and inflates every queued token.
496#[derive(Clone, PartialEq, Debug, Eq)]
497pub(crate) struct QueuedComment<'input> {
498    pub(crate) text: Cow<'input, str>,
499    pub(crate) placement: Placement,
500}
501
502impl<'input> QueuedComment<'input> {
503    fn into_public(self, span: Span) -> Comment<'input> {
504        Comment::new(span, self.text).with_placement(self.placement)
505    }
506}
507
508impl<'input> From<Comment<'input>> for QueuedComment<'input> {
509    fn from(comment: Comment<'input>) -> Self {
510        Self {
511            text: comment.text,
512            placement: comment.placement,
513        }
514    }
515}
516
517/// Token payload used in the scanner's internal queue.
518///
519/// This mirrors [`TokenType`] but stores comments without their span. Public [`Token`] values are
520/// reconstructed when the scanner emits them.
521#[derive(Clone, PartialEq, Debug, Eq)]
522pub(crate) enum QueuedTokenType<'input> {
523    StreamStart(TEncoding),
524    StreamEnd,
525    VersionDirective(u32, u32),
526    TagDirective(Cow<'input, str>, Cow<'input, str>),
527    DocumentStart,
528    DocumentEnd,
529    BlockSequenceStart,
530    BlockMappingStart,
531    BlockEnd,
532    FlowSequenceStart,
533    FlowSequenceEnd,
534    FlowMappingStart,
535    FlowMappingEnd,
536    BlockEntry,
537    FlowEntry,
538    Key,
539    Value,
540    Alias(Cow<'input, str>),
541    Anchor(Cow<'input, str>),
542    Tag(Cow<'input, str>, Cow<'input, str>),
543    Scalar(ScalarStyle, Cow<'input, str>),
544    Comment(QueuedComment<'input>),
545    ReservedDirective(String, Vec<String>),
546}
547
548impl<'input> QueuedTokenType<'input> {
549    fn into_public(self, span: Span) -> TokenType<'input> {
550        match self {
551            Self::StreamStart(encoding) => TokenType::StreamStart(encoding),
552            Self::StreamEnd => TokenType::StreamEnd,
553            Self::VersionDirective(major, minor) => TokenType::VersionDirective(major, minor),
554            Self::TagDirective(handle, prefix) => TokenType::TagDirective(handle, prefix),
555            Self::DocumentStart => TokenType::DocumentStart,
556            Self::DocumentEnd => TokenType::DocumentEnd,
557            Self::BlockSequenceStart => TokenType::BlockSequenceStart,
558            Self::BlockMappingStart => TokenType::BlockMappingStart,
559            Self::BlockEnd => TokenType::BlockEnd,
560            Self::FlowSequenceStart => TokenType::FlowSequenceStart,
561            Self::FlowSequenceEnd => TokenType::FlowSequenceEnd,
562            Self::FlowMappingStart => TokenType::FlowMappingStart,
563            Self::FlowMappingEnd => TokenType::FlowMappingEnd,
564            Self::BlockEntry => TokenType::BlockEntry,
565            Self::FlowEntry => TokenType::FlowEntry,
566            Self::Key => TokenType::Key,
567            Self::Value => TokenType::Value,
568            Self::Alias(name) => TokenType::Alias(name),
569            Self::Anchor(name) => TokenType::Anchor(name),
570            Self::Tag(handle, suffix) => TokenType::Tag(handle, suffix),
571            Self::Scalar(style, value) => TokenType::Scalar(style, value),
572            Self::Comment(comment) => TokenType::Comment(comment.into_public(span)),
573            Self::ReservedDirective(name, params) => TokenType::ReservedDirective(name, params),
574        }
575    }
576}
577
578impl<'input> From<TokenType<'input>> for QueuedTokenType<'input> {
579    fn from(token: TokenType<'input>) -> Self {
580        match token {
581            TokenType::StreamStart(encoding) => Self::StreamStart(encoding),
582            TokenType::StreamEnd => Self::StreamEnd,
583            TokenType::VersionDirective(major, minor) => Self::VersionDirective(major, minor),
584            TokenType::TagDirective(handle, prefix) => Self::TagDirective(handle, prefix),
585            TokenType::DocumentStart => Self::DocumentStart,
586            TokenType::DocumentEnd => Self::DocumentEnd,
587            TokenType::BlockSequenceStart => Self::BlockSequenceStart,
588            TokenType::BlockMappingStart => Self::BlockMappingStart,
589            TokenType::BlockEnd => Self::BlockEnd,
590            TokenType::FlowSequenceStart => Self::FlowSequenceStart,
591            TokenType::FlowSequenceEnd => Self::FlowSequenceEnd,
592            TokenType::FlowMappingStart => Self::FlowMappingStart,
593            TokenType::FlowMappingEnd => Self::FlowMappingEnd,
594            TokenType::BlockEntry => Self::BlockEntry,
595            TokenType::FlowEntry => Self::FlowEntry,
596            TokenType::Key => Self::Key,
597            TokenType::Value => Self::Value,
598            TokenType::Alias(name) => Self::Alias(name),
599            TokenType::Anchor(name) => Self::Anchor(name),
600            TokenType::Tag(handle, suffix) => Self::Tag(handle, suffix),
601            TokenType::Scalar(style, value) => Self::Scalar(style, value),
602            TokenType::Comment(comment) => Self::Comment(comment.into()),
603            TokenType::ReservedDirective(name, params) => Self::ReservedDirective(name, params),
604        }
605    }
606}
607
608/// A compact token stored by the scanner before it is emitted publicly.
609#[derive(Clone, PartialEq, Debug, Eq)]
610pub(crate) struct QueuedToken<'input>(pub(crate) Span, pub(crate) QueuedTokenType<'input>);
611
612impl<'input> QueuedToken<'input> {
613    fn into_public(self) -> Token<'input> {
614        Token(self.0, self.1.into_public(self.0))
615    }
616}
617
618impl<'input> From<Token<'input>> for QueuedToken<'input> {
619    fn from(token: Token<'input>) -> Self {
620        Self(token.0, token.1.into())
621    }
622}
623
624/// A scalar that was parsed and may correspond to a simple key.
625///
626/// Upon scanning the following YAML:
627/// ```yaml
628/// a: b
629/// ```
630/// We do not know that `a` is a key for a map until we have reached the following `:`. For this
631/// YAML, we would store `a` as a scalar token in the [`Scanner`], but not emit it yet. It would be
632/// kept inside the scanner until more context is fetched and we are able to know whether it is a
633/// plain scalar or a key.
634///
635/// For example, see the following two YAML documents:
636/// ```yaml
637/// ---
638/// a: b # Here, `a` is a key.
639/// ...
640/// ---
641/// a # Here, `a` is a plain scalar.
642/// ...
643/// ```
644/// An instance of [`SimpleKey`] is created in the [`Scanner`] when such ambiguity occurs.
645///
646/// In both documents, scanning `a` would lead to the creation of a [`SimpleKey`] with
647/// [`Self::possible`] set to `true`. The token for `a` would be pushed in the [`Scanner`] but not
648/// yet emitted. Instead, more context would be fetched (through [`Scanner::fetch_more_tokens`]).
649///
650/// In the first document, upon reaching the `:`, the [`SimpleKey`] would be inspected and our
651/// scalar `a` since it is a possible key, would be "turned" into a key. This is done by prepending
652/// a [`TokenType::Key`] to our scalar token in the [`Scanner`]. This way, the
653/// [`crate::parser::Parser`] would read the [`TokenType::Key`] token before the
654/// [`TokenType::Scalar`] token.
655///
656/// In the second document however, reaching EOF would mark the [`SimpleKey`] as no longer possible,
657/// and no [`TokenType::Key`] would be emitted by the scanner.
658#[derive(Clone, PartialEq, Debug, Eq)]
659struct SimpleKey {
660    /// Whether the token this [`SimpleKey`] refers to may still be a key.
661    ///
662    /// Sometimes, when we have more context, we notice that what we thought could be a key no
663    /// longer can be. In that case, [`Self::possible`] is set to `false`.
664    ///
665    /// For instance, let us consider the following invalid YAML:
666    /// ```yaml
667    /// key
668    ///   : value
669    /// ```
670    /// Upon reading the `\n` after `key`, the [`SimpleKey`] that was created for `key` is no longer
671    /// possible and [`Self::possible`] is set to `false`.
672    possible: bool,
673    /// Whether the token this [`SimpleKey`] refers to is required to be a key.
674    ///
675    /// With more context, we may know for sure that the token must be a key. If later input makes
676    /// that impossible, the scanner must report an error instead of silently treating the token as a
677    /// plain scalar.
678    ///
679    /// This happens for simple keys at the current block indentation where the surrounding
680    /// collection requires the next token to be a mapping key.
681    required: bool,
682    /// The index of the token referred to by the [`SimpleKey`].
683    ///
684    /// This is the index in the scanner, which takes into account both the tokens that have been
685    /// emitted and those about to be emitted. See [`Scanner::tokens_parsed`] and
686    /// [`Scanner::tokens`] for more details.
687    token_number: usize,
688    /// The position at which the token the [`SimpleKey`] refers to is.
689    mark: Marker,
690}
691
692impl SimpleKey {
693    /// Create a new [`SimpleKey`] at the given `Marker` and with the given flow level.
694    fn new(mark: Marker) -> SimpleKey {
695        SimpleKey {
696            possible: false,
697            required: false,
698            token_number: 0,
699            mark,
700        }
701    }
702}
703
/// One entry of the indentation stack.
#[derive(Debug, Default, Clone)]
struct Indent {
    /// The former indentation level.
    indent: isize,
    /// Whether closing this indentation level generates a `BlockEnd` token.
    ///
    /// Some indentation levels do not start a block. For example:
    /// ```yaml
    /// -
    ///   foo # ok
    /// -
    /// bar # ko, bar needs to be indented further than the `-`.
    /// - [
    ///  baz, # ok
    /// quux # ko, quux needs to be indented further than the '-'.
    /// ] # ko, the closing bracket needs to be indented further than the `-`.
    /// ```
    ///
    /// The indentation level created by the `-` belongs to a single entry of the sequence. If a
    /// `BlockEnd` were emitted each time such a level ends, there would be one `BlockEnd` per
    /// entry, whereas exactly one is needed to end the sequence.
    needs_block_end: bool,
}
728
/// What the scanner knows about an implicit mapping.
///
/// An implicit mapping is a mapping inside a flow sequence whose opening `{` has been omitted:
/// ```yaml
/// [ a: b, c: d ]
/// # Equivalent to
/// [ { a: b }, { c: d } ]
/// # Equivalent to
/// - a: b
/// - c: d
/// ```
///
/// This state has to be tracked carefully for each nested flow sequence. In the example above, a
/// [`FlowMappingStart`] event must be emitted when `a` and `c` are encountered, although no
/// character announces it. Likewise, a [`FlowMappingEnd`] event must be emitted on reaching the
/// `,` or the `]`. If the state is tracked incorrectly, these events may be left out or emitted
/// out-of-order.
///
/// [`FlowMappingStart`]: TokenType::FlowMappingStart
/// [`FlowMappingEnd`]: TokenType::FlowMappingEnd
#[derive(PartialEq, Debug)]
enum ImplicitMappingState {
    /// An implicit mapping may follow.
    ///
    /// This is the state right after the opening `[` has been encountered. More context is
    /// needed to know whether an implicit mapping follows.
    Possible,
    /// The scanner is inside the implicit mapping.
    ///
    /// This state is not set immediately: the `:` must have been encountered to know.
    ///
    /// NOTE(review): the meaning of the `u8` payload is not visible in this part of the file —
    /// confirm it against the code that constructs this variant.
    Inside(u8),
}
762
763/// The YAML scanner.
764///
765/// This corresponds to the low-level interface when reading YAML. The scanner emits tokens as they
766/// are read (akin to a lexer), but it also holds sufficient context to be able to disambiguate
767/// some of the constructs. It has understanding of indentation and whitespace and is able to
768/// generate error messages for some invalid YAML constructs.
769///
770/// It is however not a full parser and needs [`crate::parser::Parser`] to fully detect invalid
771/// YAML documents.
772#[derive(Debug)]
773#[allow(clippy::struct_excessive_bools)]
774pub struct Scanner<'input, T> {
775    /// The input source.
776    ///
777    /// This must implement [`Input`].
778    input: T,
779    /// The position of the cursor within the reader.
780    mark: Marker,
781    /// Buffer for tokens to be returned.
782    ///
783    /// This buffer can hold some temporary tokens that are not yet ready to be returned. For
784    /// instance, if we just read a scalar, it can be a value or a key if an implicit mapping
785    /// follows. In this case, the token stays in the `VecDeque` but cannot be returned from
786    /// [`Self::next`] until we have more context.
787    tokens: VecDeque<QueuedToken<'input>>,
788    /// The last error that happened.
789    error: Option<ScanError>,
790    /// Error found after one or more already-scanned comment tokens.
791    deferred_error: Option<ScanError>,
792    /// Whether the input may contain `#` comment indicators.
793    comments_possible: bool,
794
795    /// Whether we have already emitted the `StreamStart` token.
796    stream_start_produced: bool,
797    /// Whether we have already emitted the `StreamEnd` token.
798    stream_end_produced: bool,
799    /// Whether the scanner is still in the prefix of the next document.
800    ///
801    /// A BOM may appear in a document prefix, before directives/comments/content. Once a document
802    /// start marker or any content token is scanned, another BOM is document content and must be
803    /// rejected unless it appears inside a quoted scalar.
804    document_prefix_allowed: bool,
805    /// In some flow contexts, the value of a mapping is allowed to be adjacent to the `:`. When it
806    /// is, the index at which the `:` may be must be stored in `adjacent_value_allowed_at`.
807    adjacent_value_allowed_at: usize,
808    /// Whether a simple key could potentially start at the current position.
809    ///
810    /// Simple keys are the opposite of complex keys which are keys starting with `?`.
811    simple_key_allowed: bool,
812    /// A stack of potential simple keys.
813    ///
814    /// Refer to the documentation of [`SimpleKey`] for a more in-depth explanation of what they
815    /// are.
816    simple_keys: smallvec::SmallVec<[SimpleKey; 8]>,
817    /// The current indentation level.
818    indent: isize,
819    /// List of all block indentation levels we are in (except the current one).
820    indents: smallvec::SmallVec<[Indent; 8]>,
821    /// Level of nesting of flow sequences.
822    flow_level: u8,
823    /// The number of tokens that have been returned from the scanner.
824    ///
825    /// This excludes the tokens from [`Self::tokens`].
826    tokens_parsed: usize,
827    /// Whether a token is ready to be taken from [`Self::tokens`].
828    token_available: bool,
829    /// Whether all characters encountered since the last newline were whitespace.
830    leading_whitespace: bool,
831    /// Whether we started a flow mapping at each flow nesting level.
832    ///
833    /// This is used to detect implicit flow mapping starts such as:
834    /// ```yaml
835    /// [ : foo ] # { null: "foo" }
836    /// ```
837    flow_mapping_started: smallvec::SmallVec<[bool; 8]>,
838    /// An array of states, representing whether flow sequences have implicit mappings.
839    ///
840    /// When a flow mapping is possible (when encountering the first `[` or a `,` in a sequence),
841    /// the state is set to [`Possible`].
842    /// When we encounter the `:`, we know we are in an implicit mapping and can set the state to
843    /// [`Inside`].
844    ///
845    /// There is one entry in this [`Vec`] for each nested flow sequence that we are in.
846    /// The entries are created with the opening `[` and popped with the closing `]`.
847    ///
848    /// [`Possible`]: ImplicitMappingState::Possible
849    /// [`Inside`]: ImplicitMappingState::Inside
850    implicit_flow_mapping_states: smallvec::SmallVec<[ImplicitMappingState; 8]>,
851    /// If a plain scalar was terminated by a `#` comment on its line, we set this
852    /// to detect an illegal multiline continuation on the following line.
853    interrupted_plain_by_comment: Option<Marker>,
854    /// Whether the scanner is still validating whitespace after an explicit `?` key indicator.
855    ///
856    /// This stays set across streamed comment tokens so a tab after the comment run is rejected the
857    /// same way it was when that whitespace was scanned in one pass.
858    explicit_key_tab_check_pending: bool,
859    /// A stack of markers for opening brackets `[` and `{`.
860    flow_markers: smallvec::SmallVec<[(Marker, char); 8]>,
861    buf_leading_break: String,
862    buf_trailing_breaks: String,
863    buf_whitespaces: String,
864}
865
866impl<'input, T: BorrowedInput<'input>> Iterator for Scanner<'input, T> {
867    type Item = Token<'input>;
868
869    fn next(&mut self) -> Option<Self::Item> {
870        if self.error.is_some() {
871            return None;
872        }
873        match self.next_token() {
874            Ok(Some(tok)) => {
875                debug_print!(
876                    "    \x1B[;32m\u{21B3} {:?} \x1B[;36m{:?}\x1B[;m",
877                    tok.1,
878                    tok.0
879                );
880                Some(tok)
881            }
882            Ok(tok) => tok,
883            Err(e) => self.stop_after_error(e),
884        }
885    }
886}
887
/// A convenience alias for scanner functions that may fail without returning a value.
///
/// `Ok(())` means the step succeeded; a failure carries the [`ScanError`] describing it.
pub type ScanResult = Result<(), ScanError>;
890
/// Text accumulator used while scanning a flow scalar.
///
/// The buffer either tracks offsets into the input (so the scalar can later be borrowed) or
/// owns a `String` that content is appended to.
#[derive(Debug)]
enum FlowScalarBuf {
    /// Candidate for `Cow::Borrowed`.
    ///
    /// `start..end` is the committed verbatim range.
    /// `pending_ws_start..pending_ws_end` is a run of blanks that were seen but not yet
    /// committed (they must be dropped if followed by a line break).
    Borrowed {
        start: usize,
        end: usize,
        pending_ws_start: Option<usize>,
        pending_ws_end: usize,
    },
    /// Content that has been copied into an owned string.
    Owned(String),
}

impl FlowScalarBuf {
    /// Create an empty borrowed buffer whose committed range is `start..start`.
    #[inline]
    fn new_borrowed(start: usize) -> Self {
        Self::Borrowed {
            start,
            end: start,
            pending_ws_start: None,
            pending_ws_end: start,
        }
    }

    /// Create an empty owned buffer.
    #[inline]
    fn new_owned() -> Self {
        Self::Owned(String::new())
    }

    /// Return the owned string for appending, or `None` while the buffer is still borrowed.
    #[inline]
    fn as_owned_mut(&mut self) -> Option<&mut String> {
        if let Self::Owned(text) = self {
            Some(text)
        } else {
            None
        }
    }

    /// Fold a pending blank run into the committed range. Does nothing when no run is pending
    /// or when the buffer is owned.
    #[inline]
    fn commit_pending_ws(&mut self) {
        let Self::Borrowed {
            end,
            pending_ws_start,
            pending_ws_end,
            ..
        } = self
        else {
            return;
        };

        // `take` clears the pending marker and tells us whether there was one.
        if pending_ws_start.take().is_some() {
            *end = *pending_ws_end;
        }
    }

    /// Record that blanks spanning `ws_start..ws_end` were seen but are not committed yet.
    ///
    /// The first recorded start is kept; the end always moves to the latest `ws_end`. Owned
    /// buffers ignore the call.
    #[inline]
    fn note_pending_ws(&mut self, ws_start: usize, ws_end: usize) {
        let Self::Borrowed {
            pending_ws_start,
            pending_ws_end,
            ..
        } = self
        else {
            return;
        };

        *pending_ws_start = (*pending_ws_start).or(Some(ws_start));
        *pending_ws_end = ws_end;
    }

    /// Forget any pending blank run, resetting the pending end to the committed end. Owned
    /// buffers ignore the call.
    #[inline]
    fn discard_pending_ws(&mut self) {
        let Self::Borrowed {
            end,
            pending_ws_start,
            pending_ws_end,
            ..
        } = self
        else {
            return;
        };

        *pending_ws_end = *end;
        *pending_ws_start = None;
    }
}
976
977impl<'input, T: BorrowedInput<'input>> Scanner<'input, T> {
978    #[inline]
979    fn promote_flow_scalar_buf_to_owned(
980        &self,
981        start_mark: &Marker,
982        buf: &mut FlowScalarBuf,
983    ) -> Result<(), ScanError> {
984        let FlowScalarBuf::Borrowed {
985            start,
986            end,
987            pending_ws_start: _,
988            pending_ws_end: _,
989        } = *buf
990        else {
991            return Ok(());
992        };
993
994        let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
995            ScanError::new_str(
996                *start_mark,
997                "internal error: input advertised offsets but did not provide a slice",
998            )
999        })?;
1000        *buf = FlowScalarBuf::Owned(slice.to_owned());
1001        Ok(())
1002    }
    /// Try to borrow a slice from the underlying input.
    ///
    /// This method uses the [`BorrowedInput`] trait to safely obtain a slice with the `'input`
    /// lifetime. For inputs that support zero-copy slicing (like `StrInput`), this returns
    /// `Some(&'input str)`. For streaming inputs, this returns `None`.
    ///
    /// Callers in this file pass `start` and `end` values obtained from the input's
    /// `byte_offset()`.
    #[inline]
    fn try_borrow_slice(&self, start: usize, end: usize) -> Option<&'input str> {
        // Thin delegate: the input decides whether a borrow with the `'input` lifetime exists.
        self.input.slice_borrowed(start, end)
    }

1012
1013    /// Scan a tag handle for a `%TAG` directive as a `Cow<str>`.
1014    ///
1015    /// For `StrInput`, this will borrow from the input when possible. For other inputs, or if
1016    /// borrowing is not possible, it falls back to allocating.
1017    fn scan_tag_handle_directive_cow(
1018        &mut self,
1019        mark: &Marker,
1020    ) -> Result<Cow<'input, str>, ScanError> {
1021        let Some(start) = self.input.byte_offset() else {
1022            return Ok(Cow::Owned(self.scan_tag_handle(true, mark)?));
1023        };
1024
1025        if self.input.look_ch() != '!' {
1026            return Err(ScanError::new_str(
1027                *mark,
1028                "while scanning a tag, did not find expected '!'",
1029            ));
1030        }
1031
1032        // Consume the leading '!'.
1033        self.skip_non_blank();
1034
1035        // Consume ns-word-char (ASCII alphanumeric, '_' or '-') characters.
1036        // This mirrors `StrInput::fetch_while_is_alpha` but avoids allocation.
1037        self.input.lookahead(1);
1038        while self.input.next_is_alpha() {
1039            self.skip_non_blank();
1040            self.input.lookahead(1);
1041        }
1042
1043        // Optional trailing '!'.
1044        if self.input.peek() == '!' {
1045            self.skip_non_blank();
1046        }
1047
1048        let Some(end) = self.input.byte_offset() else {
1049            // Should be impossible if `byte_offset()` was `Some` above, but keep safe fallback.
1050            return Ok(Cow::Owned(self.scan_tag_handle(true, mark)?));
1051        };
1052
1053        let Some(slice) = self.try_borrow_slice(start, end) else {
1054            // Fall back to allocating if zero-copy borrow is not available.
1055            let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
1056                ScanError::new_str(
1057                    *mark,
1058                    "internal error: input advertised slicing but did not provide a slice",
1059                )
1060            })?;
1061            if !slice.ends_with('!') && slice != "!" {
1062                return Err(ScanError::new_str(
1063                    *mark,
1064                    "while parsing a tag directive, did not find expected '!'",
1065                ));
1066            }
1067            return Ok(Cow::Owned(slice.to_owned()));
1068        };
1069
1070        if !slice.ends_with('!') && slice != "!" {
1071            return Err(ScanError::new_str(
1072                *mark,
1073                "while parsing a tag directive, did not find expected '!'",
1074            ));
1075        }
1076
1077        Ok(Cow::Borrowed(slice))
1078    }
1079
1080    /// Scan a tag prefix for a `%TAG` directive as a `Cow<str>`.
1081    ///
1082    /// This borrows from `StrInput` only when no URI escape sequences are encountered. If a `%`
1083    /// escape is present, the prefix must be decoded and therefore allocated.
1084    fn scan_tag_prefix_directive_cow(
1085        &mut self,
1086        start_mark: &Marker,
1087    ) -> Result<Cow<'input, str>, ScanError> {
1088        let Some(start) = self.input.byte_offset() else {
1089            return Ok(Cow::Owned(self.scan_tag_prefix(start_mark)?));
1090        };
1091
1092        // The prefix must start with either '!' (local) or a valid global tag char.
1093        if self.input.look_ch() == '!' {
1094            self.skip_non_blank();
1095        } else if !is_tag_char(self.input.peek()) {
1096            return Err(ScanError::new_str(
1097                *start_mark,
1098                "invalid global tag character",
1099            ));
1100        } else if self.input.peek() == '%' {
1101            // Needs decoding. Fall back to allocating path below.
1102        } else {
1103            self.skip_non_blank();
1104        }
1105
1106        // Consume URI chars while we can stay in the borrowed path.
1107        while is_uri_char(self.input.look_ch()) {
1108            if self.input.peek() == '%' {
1109                break;
1110            }
1111            self.skip_non_blank();
1112        }
1113
1114        // If we encountered an escape sequence, we must decode, therefore allocate.
1115        if self.input.peek() == '%' {
1116            let current = self
1117                .input
1118                .byte_offset()
1119                .expect("byte_offset() must remain available once enabled");
1120            let mut out = if let Some(slice) = self.input.slice_bytes(start, current) {
1121                slice.to_owned()
1122            } else {
1123                String::new()
1124            };
1125
1126            while is_uri_char(self.input.look_ch()) {
1127                if self.input.peek() == '%' {
1128                    out.push(self.scan_uri_escapes(start_mark)?);
1129                } else {
1130                    out.push(self.input.peek());
1131                    self.skip_non_blank();
1132                }
1133            }
1134            return Ok(Cow::Owned(out));
1135        }
1136
1137        let Some(end) = self.input.byte_offset() else {
1138            return Ok(Cow::Owned(self.scan_tag_prefix(start_mark)?));
1139        };
1140
1141        let Some(slice) = self.try_borrow_slice(start, end) else {
1142            // Fall back to allocating if zero-copy borrow is not available.
1143            let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
1144                ScanError::new_str(
1145                    *start_mark,
1146                    "internal error: input advertised slicing but did not provide a slice",
1147                )
1148            })?;
1149            return Ok(Cow::Owned(slice.to_owned()));
1150        };
1151
1152        Ok(Cow::Borrowed(slice))
1153    }
1154    /// Create a scanner over the given input source.
1155    pub fn new(input: T) -> Self {
1156        let initial_byte_offset = input.byte_offset();
1157        let comments_possible = input.may_contain_comments();
1158        Scanner {
1159            input,
1160            mark: Marker::new(0, 1, 0).with_byte_offset(initial_byte_offset),
1161            tokens: VecDeque::with_capacity(64),
1162            error: None,
1163            deferred_error: None,
1164            comments_possible,
1165
1166            stream_start_produced: false,
1167            stream_end_produced: false,
1168            document_prefix_allowed: true,
1169            adjacent_value_allowed_at: 0,
1170            simple_key_allowed: true,
1171            simple_keys: smallvec::SmallVec::new(),
1172            indent: -1,
1173            indents: smallvec::SmallVec::new(),
1174            flow_level: 0,
1175            tokens_parsed: 0,
1176            token_available: false,
1177            leading_whitespace: true,
1178            flow_mapping_started: smallvec::SmallVec::new(),
1179            implicit_flow_mapping_states: smallvec::SmallVec::new(),
1180            flow_markers: smallvec::SmallVec::new(),
1181            interrupted_plain_by_comment: None,
1182            explicit_key_tab_check_pending: false,
1183
1184            buf_leading_break: String::with_capacity(128),
1185            buf_trailing_breaks: String::with_capacity(128),
1186            buf_whitespaces: String::with_capacity(128),
1187        }
1188    }
1189
1190    /// Return a copy of the last error that was encountered, if any.
1191    ///
1192    /// This does not clear the error state and further calls to [`Self::get_error`] will return (a
1193    /// clone of) the same error.
1194    #[inline]
1195    pub fn get_error(&self) -> Option<ScanError> {
1196        self.error.clone().or_else(|| self.deferred_error.clone())
1197    }
1198
    /// Record `error` as the scanner's error and return `None`.
    ///
    /// The `Iterator` implementation uses this on failure; because `self.error` is now set,
    /// its later `next()` calls return `None` immediately.
    #[cold]
    fn stop_after_error(&mut self, error: ScanError) -> Option<Token<'input>> {
        self.error = Some(error);
        None
    }
1204
    /// Build a "simple key expected" error located at the current scanner position.
    #[cold]
    fn simple_key_expected(&self) -> ScanError {
        ScanError::new_str(self.mark, "simple key expected")
    }
1209
    /// Build an "unclosed bracket" error located at `mark` that names the offending `bracket`.
    #[cold]
    fn unclosed_bracket(mark: Marker, bracket: char) -> ScanError {
        ScanError::new(mark, format!("unclosed bracket '{bracket}'"))
    }
1214
1215    /// Consume the next character. It is assumed the next character is a blank.
1216    #[inline]
1217    fn skip_blank(&mut self) {
1218        self.input.skip();
1219
1220        self.mark.offsets.chars += 1;
1221        self.mark.col += 1;
1222        self.mark.offsets.bytes = self.input.byte_offset();
1223    }
1224
1225    /// Consume the next character. It is assumed the next character is not a blank.
1226    #[inline]
1227    fn skip_non_blank(&mut self) {
1228        self.input.skip();
1229
1230        self.mark.offsets.chars += 1;
1231        self.mark.col += 1;
1232        self.mark.offsets.bytes = self.input.byte_offset();
1233        self.leading_whitespace = false;
1234    }
1235
1236    /// Consume a byte order mark from a document prefix.
1237    ///
1238    /// The source index advances, but the logical column remains unchanged so directives and
1239    /// document markers immediately following the BOM are still recognized as line-start tokens.
1240    #[inline]
1241    fn skip_bom(&mut self) {
1242        self.input.skip();
1243
1244        self.mark.offsets.chars += 1;
1245        self.mark.offsets.bytes = self.input.byte_offset();
1246    }
1247
1248    /// Consume one character that belongs to a comment.
1249    ///
1250    /// Unlike [`Self::skip_non_blank`], this deliberately does not change
1251    /// `leading_whitespace`. Comments are presentation content, so consuming one for either
1252    /// tokenization or skipping should only advance position bookkeeping.
1253    #[inline]
1254    fn skip_comment_char(&mut self) {
1255        self.input.skip();
1256
1257        self.mark.offsets.chars += 1;
1258        self.mark.col += 1;
1259        self.mark.offsets.bytes = self.input.byte_offset();
1260    }
1261
1262    /// Consume the next characters. It is assumed none of the next characters are blanks.
1263    #[inline]
1264    fn skip_n_non_blank(&mut self, count: usize) {
1265        for _ in 0..count {
1266            self.input.skip();
1267            self.mark.offsets.chars += 1;
1268            self.mark.col += 1;
1269        }
1270        self.mark.offsets.bytes = self.input.byte_offset();
1271        self.leading_whitespace = false;
1272    }
1273
1274    /// Consume the next character. It is assumed the next character is a newline.
1275    #[inline]
1276    fn skip_nl(&mut self) {
1277        self.input.skip();
1278
1279        self.mark.offsets.chars += 1;
1280        self.mark.col = 0;
1281        self.mark.line += 1;
1282        self.mark.offsets.bytes = self.input.byte_offset();
1283        self.leading_whitespace = true;
1284    }
1285
1286    /// Consume a line break (either CR, LF, or CRLF), if any. Do nothing if there is none.
1287    #[inline]
1288    fn skip_linebreak(&mut self) {
1289        if self.input.next_2_are('\r', '\n') {
1290            // While technically not a blank, this does not matter as `self.leading_whitespace`
1291            // will be reset by `skip_nl`.
1292            self.skip_blank();
1293            self.skip_nl();
1294        } else if self.input.next_is_break() {
1295            self.skip_nl();
1296        }
1297    }
1298
1299    #[cfg(test)]
1300    fn scan_comment_token(&mut self) -> Result<Token<'input>, ScanError> {
1301        Ok(self.scan_comment_queued_token()?.into_public())
1302    }
1303
1304    fn scan_comment_queued_token(&mut self) -> Result<QueuedToken<'input>, ScanError> {
1305        let start_mark = self.mark;
1306        debug_assert_eq!(self.input.peek(), '#');
1307        let placement = if self.leading_whitespace {
1308            Placement::Free
1309        } else {
1310            Placement::Right
1311        };
1312
1313        self.skip_comment_char();
1314
1315        let text = if let Some(start) = self.input.byte_offset() {
1316            // Stable byte offsets are available; slice the payload once at the end.
1317            let n = self.input.skip_while_non_breakz();
1318            self.mark.offsets.chars += n;
1319            self.mark.col += n;
1320            let byte_offset = self.input.byte_offset();
1321            self.mark.offsets.bytes = byte_offset;
1322            let end = byte_offset.expect("byte_offset must remain available once enabled");
1323
1324            if let Some(slice) = self.try_borrow_slice(start, end) {
1325                Cow::Borrowed(slice)
1326            } else if let Some(slice) = self.input.slice_bytes(start, end) {
1327                // Defensive fallback for third-party inputs that expose offsets but cannot borrow.
1328                Cow::Owned(slice.to_owned())
1329            } else {
1330                return Err(ScanError::new_str(
1331                    start_mark,
1332                    "internal error: input advertised offsets but did not provide a slice",
1333                ));
1334            }
1335        } else {
1336            // Streaming input without stable offsets; collect into an owned string.
1337            let mut owned = String::new();
1338            while !is_breakz(self.input.look_ch()) {
1339                owned.push(self.input.peek());
1340                self.skip_comment_char();
1341            }
1342            Cow::Owned(owned)
1343        };
1344
1345        let end_mark = self.mark;
1346        let span = Span::new(start_mark, end_mark);
1347        Ok(QueuedToken(
1348            span,
1349            QueuedTokenType::Comment(QueuedComment { text, placement }),
1350        ))
1351    }
1352
1353    fn push_comment_token(&mut self) -> ScanResult {
1354        let token = self.scan_comment_queued_token()?;
1355        self.tokens.push_back(token);
1356        Ok(())
1357    }
1358
1359    fn skip_comment(&mut self) {
1360        debug_assert_eq!(self.input.peek(), '#');
1361
1362        self.skip_comment_char();
1363        let n = self.input.skip_while_non_breakz();
1364        self.mark.offsets.chars += n;
1365        self.mark.col += n;
1366        self.mark.offsets.bytes = self.input.byte_offset();
1367    }
1368
    /// Return whether the [`TokenType::StreamStart`] event has been emitted.
    ///
    /// This simply reports the scanner's `stream_start_produced` flag.
    #[inline]
    pub fn stream_started(&self) -> bool {
        self.stream_start_produced
    }
1374
    /// Return whether the [`TokenType::StreamEnd`] event has been emitted.
    ///
    /// This reports the `stream_end_produced` flag, which is set once the stream-end token has
    /// been handed out of the token queue.
    #[inline]
    pub fn stream_ended(&self) -> bool {
        self.stream_end_produced
    }
1380
    /// Return the current position in the input stream.
    ///
    /// The returned [`Marker`] is a copy; later scanning does not change it.
    #[inline]
    pub fn mark(&self) -> Marker {
        self.mark
    }
1386
    /// Return whether this scanner may emit comment tokens.
    ///
    /// The value is captured once at construction from the input's `may_contain_comments()`.
    #[inline]
    pub(crate) fn comments_possible(&self) -> bool {
        self.comments_possible
    }
1392
    /// Read and consume a line break (either `\r`, `\n` or `\r\n`).
    ///
    /// Whatever form the break takes in the input, a single `\n` is pushed into `s`.
    ///
    /// # Panics (in debug)
    /// If the next characters do not correspond to a line break.
    #[inline]
    fn read_break(&mut self, s: &mut String) {
        // Consume first, then record the normalized break.
        self.skip_break();
        s.push('\n');
    }
1404
1405    // Read and consume a line break (either `\r`, `\n` or `\r\n`).
1406    //
1407    // # Panics (in debug)
1408    // If the next characters do not correspond to a line break.
1409    #[inline]
1410    fn skip_break(&mut self) {
1411        let c = self.input.peek();
1412        let nc = self.input.peek_nth(1);
1413        debug_assert!(is_break(c));
1414        if c == '\r' && nc == '\n' {
1415            self.skip_blank();
1416        }
1417        self.skip_nl();
1418    }
1419
1420    /// Insert a token at the given position.
1421    fn insert_token(&mut self, pos: usize, tok: Token<'input>) {
1422        let old_len = self.tokens.len();
1423        assert!(pos <= old_len);
1424        self.tokens.insert(pos, tok.into());
1425    }
1426
1427    fn simple_key_token_index(&self, sk: &SimpleKey, mark: Marker) -> Result<usize, ScanError> {
1428        let Some(index) = sk.token_number.checked_sub(self.tokens_parsed) else {
1429            return Err(ScanError::new_str(mark, "simple key is no longer valid"));
1430        };
1431        if index > self.tokens.len() {
1432            return Err(ScanError::new_str(mark, "simple key is no longer valid"));
1433        }
1434        Ok(index)
1435    }
1436
    /// Mark that a simple key may start at the current position.
    #[inline]
    fn allow_simple_key(&mut self) {
        self.simple_key_allowed = true;
    }
1441
    /// Mark that a simple key may not start at the current position.
    #[inline]
    fn disallow_simple_key(&mut self) {
        self.simple_key_allowed = false;
    }
1446
    /// Scan enough input to append one next token to the internal token queue.
    ///
    /// The first call only produces the stream-start token. Later calls skip to the next token
    /// (returning early when that step reports `true`, presumably because it queued a comment),
    /// retire stale simple keys, unroll block indentation to the current column, and finally
    /// dispatch on the next one or two characters to the matching `fetch_*` routine.
    ///
    /// # Errors
    /// Returns `ScanError` when the scanner does not find the next expected token.
    pub fn fetch_next_token(&mut self) -> ScanResult {
        self.input.lookahead(1);

        if !self.stream_start_produced {
            self.fetch_stream_start();
            return Ok(());
        }
        if self.skip_to_next_token(true)? {
            return Ok(());
        }

        debug_print!(
            "  \x1B[38;5;244m\u{2192} fetch_next_token after whitespace {:?} {:?}\x1B[m",
            self.mark,
            self.input.peek()
        );

        self.stale_simple_keys()?;

        let mark = self.mark;
        self.unroll_indent(mark.col as isize);

        // Up to four characters are inspected below (document markers plus what follows them).
        self.input.lookahead(4);

        if self.input.next_is_z() {
            self.fetch_stream_end()?;
            return Ok(());
        }

        // Directives and document markers are only recognized at the very start of a line.
        if self.mark.col == 0 {
            if self.input.next_char_is('%') {
                return self.fetch_directive();
            } else if self.input.next_is_document_start() {
                return self.fetch_document_indicator(TokenType::DocumentStart);
            } else if self.input.next_is_document_end() {
                self.fetch_document_indicator(TokenType::DocumentEnd)?;
                // Only whitespace may follow the document end marker on its line.
                self.skip_ws_to_eol(SkipTabs::Yes)?;
                if !self.input.next_is_breakz() {
                    return Err(ScanError::new_str(
                        self.mark,
                        "invalid content after document end marker",
                    ));
                }
                return Ok(());
            }
        }

        // Reaching this point means the next token was not handled above as a directive or a
        // document marker, so the document prefix is over.
        if self.document_prefix_allowed {
            self.document_prefix_allowed = false;
        }

        // Content left of the current indent is only tolerated inside a flow collection, and
        // there only for a closing bracket or a `,`.
        if (self.mark.col as isize) < self.indent {
            self.input.lookahead(1);
            let c = self.input.peek();
            if self.flow_level == 0 || !matches!(c, ']' | '}' | ',') {
                return Err(ScanError::new_str(self.mark, "invalid indentation"));
            }
        }

        // Dispatch on the current character `c`, using the following character `nc` to tell
        // indicators apart from the start of a plain scalar. Arm order matters: guarded arms
        // are tried top to bottom.
        let c = self.input.peek();
        let nc = self.input.peek_nth(1);
        match c {
            '[' => self.fetch_flow_collection_start(TokenType::FlowSequenceStart),
            '{' => self.fetch_flow_collection_start(TokenType::FlowMappingStart),
            ']' => self.fetch_flow_collection_end(TokenType::FlowSequenceEnd),
            '}' => self.fetch_flow_collection_end(TokenType::FlowMappingEnd),
            ',' => self.fetch_flow_entry(),
            '-' if is_blank_or_breakz(nc) => self.fetch_block_entry(),
            '?' if is_blank_or_breakz(nc) => self.fetch_key(),
            ':' if is_blank_or_breakz(nc) => self.fetch_value(),
            // In flow context a `:` also introduces a value when a flow indicator follows it or
            // when it sits exactly at the recorded adjacent-value position.
            ':' if self.flow_level > 0
                && (is_flow(nc) || self.mark.index() == self.adjacent_value_allowed_at) =>
            {
                self.fetch_flow_value()
            }
            // Is it an alias?
            '*' => self.fetch_anchor(true),
            // Is it an anchor?
            '&' => self.fetch_anchor(false),
            '!' => self.fetch_tag(),
            // Is it a literal scalar?
            '|' if self.flow_level == 0 => self.fetch_block_scalar(true),
            // Is it a folded scalar?
            '>' if self.flow_level == 0 => self.fetch_block_scalar(false),
            '\'' => self.fetch_flow_scalar(true),
            '"' => self.fetch_flow_scalar(false),
            // plain scalar
            '-' if !is_blank_or_breakz(nc) => self.fetch_plain_scalar(),
            ':' | '?' if !is_blank_or_breakz(nc) && self.flow_level == 0 => {
                self.fetch_plain_scalar()
            }
            // The document prefix ended above, so a BOM here is inside the document.
            c if is_bom(c) => Err(ScanError::new_str(
                self.mark,
                "a BOM must not appear inside a document",
            )),
            // These characters are rejected as the start of a token.
            '%' | '@' | '`' => Err(ScanError::new(
                self.mark,
                format!("unexpected character: `{c}'"),
            )),
            _ => self.fetch_plain_scalar(),
        }
    }
1553
    /// Return the next compact queued token, scanning more input when needed.
    ///
    /// Returns `Ok(None)` once the stream-end token has been handed out.
    ///
    /// A scan error raised while a comment token sits at the front of the queue is held back in
    /// `deferred_error`: queued comments are returned first, and the error is reported by the
    /// first later call that finds something other than a comment at the front.
    ///
    /// # Errors
    /// Returns `ScanError` when scanning fails to find an expected next token.
    pub(crate) fn next_queued_token(&mut self) -> Result<Option<QueuedToken<'input>>, ScanError> {
        if self.deferred_error.is_some() {
            // No comment left to deliver ahead of the error: surface it now.
            if !matches!(
                self.tokens.front().map(|token| &token.1),
                Some(QueuedTokenType::Comment(_))
            ) {
                if let Some(error) = self.deferred_error.take() {
                    return error.into_result();
                }
            }
            // A comment is still queued; hand it out without scanning any further.
            self.token_available = true;
        }

        if self.stream_end_produced {
            return Ok(None);
        }

        if !self.token_available {
            if let Err(error) = self.fetch_more_tokens() {
                // Keep the error for later if a comment scanned before it is still waiting.
                if matches!(
                    self.tokens.front().map(|token| &token.1),
                    Some(QueuedTokenType::Comment(_))
                ) {
                    self.deferred_error = Some(error);
                } else {
                    return Err(error);
                }
            }
        }
        let Some(t) = self.tokens.pop_front() else {
            return Err(ScanError::new_str(
                self.mark,
                "did not find expected next token",
            ));
        };
        self.token_available = false;
        // Counts tokens handed out; simple keys compare their token number against this.
        self.tokens_parsed += 1;

        let is_stream_end = matches!(t.1, QueuedTokenType::StreamEnd);
        if is_stream_end {
            self.stream_end_produced = true;
        }
        Ok(Some(t))
    }
1602
1603    /// Return the next queued token, scanning more input when needed.
1604    ///
1605    /// # Errors
1606    /// Returns `ScanError` when scanning fails to find an expected next token.
1607    pub fn next_token(&mut self) -> Result<Option<Token<'input>>, ScanError> {
1608        Ok(self.next_queued_token()?.map(QueuedToken::into_public))
1609    }
1610
1611    /// Scan more input until a token is ready to be returned.
1612    ///
1613    /// # Errors
1614    /// Returns `ScanError` when scanning fails.
1615    pub fn fetch_more_tokens(&mut self) -> ScanResult {
1616        let mut need_more;
1617        loop {
1618            if self.tokens.is_empty() {
1619                need_more = true;
1620            } else {
1621                need_more = false;
1622                // Stale potential keys that we know won't be keys.
1623                self.stale_simple_keys()?;
1624                if !matches!(
1625                    self.tokens.front().map(|token| &token.1),
1626                    Some(QueuedTokenType::Comment(_))
1627                ) {
1628                    // If our next token to be emitted may be a key, fetch more context.
1629                    for sk in &self.simple_keys {
1630                        if sk.possible && sk.token_number == self.tokens_parsed {
1631                            need_more = true;
1632                            break;
1633                        }
1634                    }
1635                }
1636            }
1637
1638            // Stop fetching immediately after document end/start markers
1639            // to allow the parser to emit the event before reading more content.
1640            if let Some(token) = self.tokens.back() {
1641                if matches!(
1642                    token.1,
1643                    QueuedTokenType::DocumentEnd | QueuedTokenType::DocumentStart
1644                ) {
1645                    break;
1646                }
1647            }
1648
1649            if !need_more {
1650                break;
1651            }
1652            self.fetch_next_token()?;
1653        }
1654        self.token_available = true;
1655
1656        Ok(())
1657    }
1658
1659    /// Mark simple keys that can no longer be keys as such.
1660    ///
1661    /// This function sets `possible` to `false` to each key that, now we have more context, we
1662    /// know will not be keys.
1663    ///
1664    /// # Errors
1665    /// This function returns an error if one of the keys becoming impossible was required to be a
1666    /// key.
1667    fn stale_simple_keys(&mut self) -> ScanResult {
1668        for sk in &mut self.simple_keys {
1669            let is_line_stale = self.flow_level == 0 && sk.mark.line < self.mark.line;
1670            // The length cap applies in flow contexts too; otherwise token buffering can grow
1671            // without bound while the scanner waits to see whether a later ':' resolves the key.
1672            let is_length_stale =
1673                self.mark.index().saturating_sub(sk.mark.index()) > SIMPLE_KEY_MAX_LOOKAHEAD;
1674
1675            if sk.possible && (is_line_stale || is_length_stale) {
1676                if sk.required {
1677                    return Err(ScanError::new_str(self.mark, "simple key expect ':'"));
1678                }
1679                sk.possible = false;
1680            }
1681        }
1682        Ok(())
1683    }
1684
    /// Skip over whitespace (`\t`, ` `, `\n`, `\r`) until the next non-comment token.
    ///
    /// Comments encountered while skipping are queued as [`TokenType::Comment`] tokens so the
    /// parser can emit them as presentation events. If `stop_after_comment` is true, the function
    /// returns after queuing one comment so callers can emit it before scanning later comments.
    ///
    /// A byte order mark is also skipped, but only at column 0, outside flow context and while
    /// `document_prefix_allowed` is set. Consuming a line break in block context (flow level 0)
    /// re-enables simple keys.
    ///
    /// # Returns
    /// `Ok(true)` if the function returned early right after queuing a comment (only possible
    /// when `stop_after_comment` is true), `Ok(false)` once a character that is neither
    /// whitespace nor a comment start has been reached.
    ///
    /// # Errors
    /// This function returns an error if a tab is encountered where there should not be
    /// one, or if a comment interrupted a plain scalar and the immediately following line could
    /// continue that scalar in block context.
    fn skip_to_next_token(&mut self, stop_after_comment: bool) -> Result<bool, ScanError> {
        // Hot-path helper: consume a single logical line break and apply simple-key rules.
        // (Kept local to ensure the compiler can inline it easily.)
        let consume_linebreak = |this: &mut Self| {
            this.input.lookahead(2);
            this.skip_linebreak();
            if this.flow_level == 0 {
                this.allow_simple_key();
            }
        };

        loop {
            let ch = self.input.look_ch();
            // While the explicit-key tab check is pending, a tab is rejected outright; blanks,
            // line breaks and comment starts keep the check pending, anything else clears it.
            if self.explicit_key_tab_check_pending {
                match ch {
                    '\t' => {
                        return Err(ScanError::new_str(
                            self.mark(),
                            "tabs disallowed in this context",
                        ));
                    }
                    ' ' | '\n' | '\r' | '#' => {}
                    _ => self.explicit_key_tab_check_pending = false,
                }
            }

            match ch {
                // Tabs may not be used as indentation (block context only).
                '\t' => {
                    if self.is_within_block()
                        && self.leading_whitespace
                        && (self.mark.col as isize) < self.indent
                    {
                        self.skip_ws_to_eol(SkipTabs::Yes)?;

                        // If we have content on that line with a tab, return an error.
                        if !self.input.next_is_breakz() {
                            return Err(ScanError::new_str(
                                self.mark,
                                "tabs disallowed within this context (block indentation)",
                            ));
                        }

                        // Micro-opt: if we stopped on a line break, consume it now (avoids another loop trip).
                        if matches!(self.input.look_ch(), '\n' | '\r') {
                            consume_linebreak(self);
                        }
                    } else {
                        // Non-indentation tab behaves like blank.
                        self.skip_blank();
                    }
                }

                ' ' => self.skip_blank(),

                '\n' | '\r' => consume_linebreak(self),

                // A byte order mark is only skipped where a document prefix may appear.
                c if is_bom(c)
                    && self.document_prefix_allowed
                    && self.flow_level == 0
                    && self.mark.col == 0 =>
                {
                    self.skip_bom();
                }

                '#' => {
                    self.push_comment_token()?;

                    // Micro-opt: comment-only lines are common; consume the following line break here.
                    if matches!(self.input.look_ch(), '\n' | '\r') {
                        consume_linebreak(self);
                    }
                    if stop_after_comment {
                        return Ok(true);
                    }
                }

                // Anything else is the start of the next token.
                _ => break,
            }
        }

        // If a plain scalar was interrupted by a comment, and the next line could
        // continue the scalar in block context, this is invalid.
        // `take()` clears the pending marker, so the check runs at most once per interruption.
        if let Some(err_mark) = self.interrupted_plain_by_comment.take() {
            // BS4K should only trigger when the continuation would start on the immediate next
            // line (no intervening empty/comment-only lines). A blank line resets the folding
            // opportunity and thus should not error.
            let is_immediate_next_line = self.mark.line == err_mark.line + 1;

            // Optimization: do the cheap checks first; only then request extra lookahead / do deeper checks.
            if self.flow_level == 0
                && is_immediate_next_line
                && (self.mark.col as isize) > self.indent
            {
                // Ensure enough lookahead for:
                // - the checks below (peek/peek_nth)
                // - document indicator detection which needs 4 chars.
                self.input.lookahead(4);

                if !self.input.next_is_z()
                    && !self.input.next_is_document_indicator()
                    && self.input.next_can_be_plain_scalar(false)
                {
                    return Err(ScanError::new_str(
                        err_mark,
                        "comment intercepting the multiline text",
                    ));
                }
            }
        }

        Ok(false)
    }
1807
1808    /// Skip over YAML whitespace (` `, `\n`, `\r`).
1809    ///
1810    /// If `stop_after_comment` is true, the function returns after queuing one comment so callers
1811    /// can emit it before scanning later comments.
1812    ///
1813    /// # Errors
1814    /// This function returns an error if no whitespace was found.
1815    fn skip_yaml_whitespace(&mut self, stop_after_comment: bool) -> Result<bool, ScanError> {
1816        let mut need_whitespace = true;
1817        loop {
1818            match self.input.look_ch() {
1819                ' ' => {
1820                    self.skip_blank();
1821
1822                    need_whitespace = false;
1823                }
1824                '\n' | '\r' => {
1825                    self.input.lookahead(2);
1826                    self.skip_linebreak();
1827                    if self.flow_level == 0 {
1828                        self.allow_simple_key();
1829                    }
1830                    need_whitespace = false;
1831                }
1832                '#' => {
1833                    if need_whitespace {
1834                        self.skip_comment();
1835                    } else {
1836                        self.push_comment_token()?;
1837                        if stop_after_comment {
1838                            return Ok(true);
1839                        }
1840                    }
1841                }
1842                _ => break,
1843            }
1844        }
1845
1846        if need_whitespace {
1847            Err(ScanError::new_str(self.mark(), "expected whitespace"))
1848        } else {
1849            Ok(false)
1850        }
1851    }
1852
1853    fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> Result<SkipTabs, ScanError> {
1854        debug_assert!(!matches!(skip_tabs, SkipTabs::Result(..)));
1855
1856        if !self.comments_possible {
1857            let (chars_consumed, result) = self.input.skip_ws_to_eol(skip_tabs);
1858            self.mark.col += chars_consumed;
1859            self.mark.offsets.chars += chars_consumed;
1860            self.mark.offsets.bytes = self.input.byte_offset();
1861            return result.map_err(|msg| ScanError::new_str(self.mark, msg));
1862        }
1863
1864        let (chars_consumed, whitespace) = self.input.skip_ws_to_eol_blanks(skip_tabs);
1865        self.mark.col += chars_consumed;
1866        self.mark.offsets.chars += chars_consumed;
1867        self.mark.offsets.bytes = self.input.byte_offset();
1868
1869        if self.input.look_ch() != '#' {
1870            return Ok(whitespace);
1871        }
1872
1873        if !whitespace.found_tabs() && !whitespace.has_valid_yaml_ws() {
1874            return Err(ScanError::new_str(
1875                self.mark,
1876                "comments must be separated from other tokens by whitespace",
1877            ));
1878        }
1879
1880        self.push_comment_token()?;
1881        Ok(whitespace)
1882    }
1883
1884    fn fetch_stream_start(&mut self) {
1885        let mark = self.mark;
1886        self.indent = -1;
1887        self.stream_start_produced = true;
1888        self.allow_simple_key();
1889        self.tokens
1890            .push_back(Token(Span::empty(mark), TokenType::StreamStart(TEncoding::Utf8)).into());
1891        self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0)));
1892    }
1893
1894    fn fetch_stream_end(&mut self) -> ScanResult {
1895        // force new line
1896        if self.mark.col != 0 {
1897            self.mark.col = 0;
1898            self.mark.line += 1;
1899        }
1900
1901        if let Some((mark, bracket)) = self.flow_markers.pop() {
1902            return Err(Self::unclosed_bracket(mark, bracket));
1903        }
1904
1905        // If the stream ended, we won't have more context. We can stall all the simple keys we
1906        // had. If one was required, however, that was an error and we must propagate it.
1907        for sk in &mut self.simple_keys {
1908            if sk.required && sk.possible {
1909                return Err(self.simple_key_expected());
1910            }
1911            sk.possible = false;
1912        }
1913
1914        self.unroll_indent(-1);
1915        self.remove_simple_key()?;
1916        self.disallow_simple_key();
1917
1918        self.tokens
1919            .push_back(Token(Span::empty(self.mark), TokenType::StreamEnd).into());
1920        Ok(())
1921    }
1922
1923    fn fetch_directive(&mut self) -> ScanResult {
1924        self.unroll_indent(-1);
1925        self.remove_simple_key()?;
1926
1927        self.disallow_simple_key();
1928
1929        let token_index = self.tokens.len();
1930        let tok = self.scan_directive()?;
1931        self.insert_token(token_index, tok);
1932
1933        Ok(())
1934    }
1935
1936    fn scan_directive(&mut self) -> Result<Token<'input>, ScanError> {
1937        let start_mark = self.mark;
1938        self.skip_non_blank();
1939
1940        let name = self.scan_directive_name()?;
1941        let tok = match name.as_ref() {
1942            "YAML" => self.scan_version_directive_value(&start_mark)?,
1943            "TAG" => self.scan_tag_directive_value(&start_mark)?,
1944            _ => {
1945                let mut params = Vec::new();
1946                while self.input.next_is_blank() {
1947                    let n_blanks = self.input.skip_while_blank();
1948                    self.mark.offsets.chars += n_blanks;
1949                    self.mark.col += n_blanks;
1950                    self.mark.offsets.bytes = self.input.byte_offset();
1951
1952                    if !is_blank_or_breakz(self.input.peek()) {
1953                        let mut param = String::new();
1954                        let n_chars = self.input.fetch_while_is_yaml_non_space(&mut param);
1955                        self.mark.offsets.chars += n_chars;
1956                        self.mark.col += n_chars;
1957                        self.mark.offsets.bytes = self.input.byte_offset();
1958                        params.push(param);
1959                    }
1960                }
1961
1962                Token(
1963                    Span::new(start_mark, self.mark),
1964                    TokenType::ReservedDirective(name, params),
1965                )
1966            }
1967        };
1968
1969        self.skip_ws_to_eol(SkipTabs::Yes)?;
1970
1971        if self.input.next_is_breakz() {
1972            self.input.lookahead(2);
1973            self.skip_linebreak();
1974            Ok(tok)
1975        } else {
1976            Err(ScanError::new_str(
1977                start_mark,
1978                "while scanning a directive, did not find expected comment or line break",
1979            ))
1980        }
1981    }
1982
1983    fn scan_version_directive_value(&mut self, mark: &Marker) -> Result<Token<'input>, ScanError> {
1984        let n_blanks = self.input.skip_while_blank();
1985        self.mark.offsets.chars += n_blanks;
1986        self.mark.col += n_blanks;
1987        self.mark.offsets.bytes = self.input.byte_offset();
1988
1989        let major = self.scan_version_directive_number(mark)?;
1990
1991        if self.input.peek() != '.' {
1992            return Err(ScanError::new_str(
1993                *mark,
1994                "while scanning a YAML directive, did not find expected digit or '.' character",
1995            ));
1996        }
1997        self.skip_non_blank();
1998
1999        let minor = self.scan_version_directive_number(mark)?;
2000
2001        Ok(Token(
2002            Span::new(*mark, self.mark),
2003            TokenType::VersionDirective(major, minor),
2004        ))
2005    }
2006
2007    fn scan_directive_name(&mut self) -> Result<String, ScanError> {
2008        let start_mark = self.mark;
2009        let mut string = String::new();
2010
2011        let n_chars = self.input.fetch_while_is_yaml_non_space(&mut string);
2012        self.mark.offsets.chars += n_chars;
2013        self.mark.col += n_chars;
2014        self.mark.offsets.bytes = self.input.byte_offset();
2015
2016        if string.is_empty() {
2017            return Err(ScanError::new_str(
2018                start_mark,
2019                "while scanning a directive, could not find expected directive name",
2020            ));
2021        }
2022
2023        if !is_blank_or_breakz(self.input.peek()) {
2024            return Err(ScanError::new_str(
2025                start_mark,
2026                "while scanning a directive, found unexpected non-alphabetical character",
2027            ));
2028        }
2029
2030        Ok(string)
2031    }
2032
2033    fn scan_version_directive_number(&mut self, mark: &Marker) -> Result<u32, ScanError> {
2034        let mut val = 0u32;
2035        let mut length = 0usize;
2036        while let Some(digit) = self.input.look_ch().to_digit(10) {
2037            if length + 1 > 9 {
2038                return Err(ScanError::new_str(
2039                    *mark,
2040                    "while scanning a YAML directive, found extremely long version number",
2041                ));
2042            }
2043            length += 1;
2044            val = val * 10 + digit;
2045            self.skip_non_blank();
2046        }
2047
2048        if length == 0 {
2049            return Err(ScanError::new_str(
2050                *mark,
2051                "while scanning a YAML directive, did not find expected version number",
2052            ));
2053        }
2054
2055        Ok(val)
2056    }
2057
2058    fn scan_tag_directive_value(&mut self, mark: &Marker) -> Result<Token<'input>, ScanError> {
2059        let n_blanks = self.input.skip_while_blank();
2060        self.mark.offsets.chars += n_blanks;
2061        self.mark.col += n_blanks;
2062        self.mark.offsets.bytes = self.input.byte_offset();
2063
2064        let handle = self.scan_tag_handle_directive_cow(mark)?;
2065
2066        let n_blanks = self.input.skip_while_blank();
2067        self.mark.offsets.chars += n_blanks;
2068        self.mark.col += n_blanks;
2069        self.mark.offsets.bytes = self.input.byte_offset();
2070
2071        let prefix = self.scan_tag_prefix_directive_cow(mark)?;
2072
2073        self.input.lookahead(1);
2074
2075        if self.input.next_is_blank_or_breakz() {
2076            Ok(Token(
2077                Span::new(*mark, self.mark),
2078                TokenType::TagDirective(handle, prefix),
2079            ))
2080        } else {
2081            Err(ScanError::new_str(
2082                *mark,
2083                "while scanning TAG, did not find expected whitespace or line break",
2084            ))
2085        }
2086    }
2087
2088    fn fetch_tag(&mut self) -> ScanResult {
2089        self.save_simple_key();
2090        self.disallow_simple_key();
2091
2092        let tok = self.scan_tag()?;
2093        self.tokens.push_back(tok.into());
2094        Ok(())
2095    }
2096
    /// Scan a tag at the current position into a `TokenType::Tag(handle, suffix)` token.
    ///
    /// Three shapes are handled: a verbatim tag (second character is `<`), a `!handle!suffix`
    /// shorthand, and a `!suffix` shorthand. The lone `!` tag is reported with an empty handle
    /// and `!` as suffix. Handle and suffix are `Cow`s so they can borrow from the input when the
    /// helpers manage to; if the input exposes no byte offset, the owned-only path
    /// (`scan_tag_owned`) is used instead.
    ///
    /// # Errors
    /// Returns an error if the handle or suffix is malformed, or if the tag is not followed by a
    /// blank, a line break, the end of input or, inside a flow collection, one of `,`, `]`, `}`.
    fn scan_tag(&mut self) -> Result<Token<'input>, ScanError> {
        let start_mark = self.mark;

        // Check if the tag is in the canonical form (verbatim).
        self.input.lookahead(2);

        // If byte_offset is not available, use the original owned-only path.
        if self.input.byte_offset().is_none() {
            return self.scan_tag_owned(&start_mark);
        }

        let (handle, suffix): (Cow<'input, str>, Cow<'input, str>) =
            if self.input.nth_char_is(1, '<') {
                // Verbatim tags always need owned strings (URI escapes).
                let suffix = self.scan_verbatim_tag(&start_mark)?;
                (Cow::Owned(String::new()), Cow::Owned(suffix))
            } else {
                // The tag has either the '!suffix' or the '!handle!suffix'
                let handle = self.scan_tag_handle_cow(&start_mark)?;
                // Check if it is, indeed, handle.
                if handle.len() >= 2 && handle.starts_with('!') && handle.ends_with('!') {
                    // A real handle (`!!` or `!name!`) must be followed by a non-empty suffix.
                    let suffix = self.scan_tag_shorthand_suffix_cow(&start_mark, true)?;
                    (handle, suffix)
                } else {
                    // Not a real handle, it's part of the suffix.
                    // E.g., "!foo" -> handle="!", suffix="foo"
                    // The "handle" we scanned is actually "!" + suffix_part1.
                    // We need to also scan any remaining suffix characters.
                    let remaining_suffix =
                        self.scan_tag_shorthand_suffix_cow(&start_mark, false)?;

                    // Extract suffix from handle (skip leading '!') and combine with remaining.
                    let suffix = if handle.len() > 1 {
                        if remaining_suffix.is_empty() {
                            // The suffix is just what's in handle after '!'
                            // (a borrowed handle stays borrowed, so no allocation happens here).
                            match handle {
                                Cow::Borrowed(s) => Cow::Borrowed(&s[1..]),
                                Cow::Owned(s) => Cow::Owned(s[1..].to_owned()),
                            }
                        } else {
                            // Combine handle (minus leading '!') with remaining suffix.
                            let mut combined = handle[1..].to_owned();
                            combined.push_str(&remaining_suffix);
                            Cow::Owned(combined)
                        }
                    } else {
                        // handle is just "!", suffix is whatever we scanned after
                        remaining_suffix
                    };

                    // A special case: the '!' tag.  Set the handle to '' and the
                    // suffix to '!'.
                    if suffix.is_empty() {
                        (Cow::Borrowed(""), Cow::Borrowed("!"))
                    } else {
                        (Cow::Borrowed("!"), suffix)
                    }
                }
            };

        if is_blank_or_breakz(self.input.look_ch())
            || (self.flow_level > 0 && matches!(self.input.peek(), ',' | ']' | '}'))
        {
            // YAML example 7.2 allows a tag to annotate an empty scalar when a separator or flow
            // delimiter follows.
            Ok(Token(
                Span::new(start_mark, self.mark),
                TokenType::Tag(handle, suffix),
            ))
        } else {
            Err(ScanError::new_str(
                start_mark,
                "while scanning a tag, did not find expected whitespace or line break",
            ))
        }
    }
2174
2175    /// Original owned-only tag scanning path for inputs without `byte_offset` support.
2176    fn scan_tag_owned(&mut self, start_mark: &Marker) -> Result<Token<'input>, ScanError> {
2177        let mut handle = String::new();
2178        let mut suffix;
2179
2180        if self.input.nth_char_is(1, '<') {
2181            suffix = self.scan_verbatim_tag(start_mark)?;
2182        } else {
2183            // The tag has either the '!suffix' or the '!handle!suffix'
2184            handle = self.scan_tag_handle(false, start_mark)?;
2185            // Check if it is, indeed, handle.
2186            if handle.len() >= 2 && handle.starts_with('!') && handle.ends_with('!') {
2187                // A tag handle starting with "!!" is a secondary tag handle.
2188                let is_secondary_handle = handle == "!!";
2189                suffix =
2190                    self.scan_tag_shorthand_suffix(false, is_secondary_handle, "", start_mark)?;
2191            } else {
2192                suffix = self.scan_tag_shorthand_suffix(false, false, &handle, start_mark)?;
2193                "!".clone_into(&mut handle);
2194                // A special case: the '!' tag.  Set the handle to '' and the
2195                // suffix to '!'.
2196                if suffix.is_empty() {
2197                    handle.clear();
2198                    "!".clone_into(&mut suffix);
2199                }
2200            }
2201        }
2202
2203        if is_blank_or_breakz(self.input.look_ch())
2204            || (self.flow_level > 0 && matches!(self.input.peek(), ',' | ']' | '}'))
2205        {
2206            // YAML example 7.2 allows a tag to annotate an empty scalar when a separator or flow
2207            // delimiter follows.
2208            Ok(Token(
2209                Span::new(*start_mark, self.mark),
2210                TokenType::Tag(handle.into(), suffix.into()),
2211            ))
2212        } else {
2213            Err(ScanError::new_str(
2214                *start_mark,
2215                "while scanning a tag, did not find expected whitespace or line break",
2216            ))
2217        }
2218    }
2219
2220    /// Scan a tag handle as a `Cow<str>`, borrowing when possible.
2221    ///
2222    /// Tag handles are of the form `!`, `!!`, or `!name!` where name is ASCII alphanumeric.
2223    /// Since they contain no escape sequences, they can always be borrowed from `StrInput`.
2224    fn scan_tag_handle_cow(&mut self, mark: &Marker) -> Result<Cow<'input, str>, ScanError> {
2225        let Some(start) = self.input.byte_offset() else {
2226            return Ok(Cow::Owned(self.scan_tag_handle(false, mark)?));
2227        };
2228
2229        if self.input.look_ch() != '!' {
2230            return Err(ScanError::new_str(
2231                *mark,
2232                "while scanning a tag, did not find expected '!'",
2233            ));
2234        }
2235
2236        // Consume the leading '!'.
2237        self.skip_non_blank();
2238
2239        // Consume ns-word-char (ASCII alphanumeric, '_' or '-') characters.
2240        self.input.lookahead(1);
2241        while self.input.next_is_alpha() {
2242            self.skip_non_blank();
2243            self.input.lookahead(1);
2244        }
2245
2246        // Optional trailing '!'.
2247        if self.input.peek() == '!' {
2248            self.skip_non_blank();
2249        }
2250
2251        let Some(end) = self.input.byte_offset() else {
2252            return Ok(Cow::Owned(self.scan_tag_handle(false, mark)?));
2253        };
2254
2255        if let Some(slice) = self.try_borrow_slice(start, end) {
2256            Ok(Cow::Borrowed(slice))
2257        } else {
2258            let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
2259                ScanError::new_str(
2260                    *mark,
2261                    "internal error: input advertised slicing but did not provide a slice",
2262                )
2263            })?;
2264            Ok(Cow::Owned(slice.to_owned()))
2265        }
2266    }
2267
2268    /// Scan a tag shorthand suffix as a `Cow<str>`, borrowing when possible.
2269    ///
2270    /// The suffix can be borrowed only if no `%` URI escape sequences are present.
2271    fn scan_tag_shorthand_suffix_cow(
2272        &mut self,
2273        mark: &Marker,
2274        require_non_empty: bool,
2275    ) -> Result<Cow<'input, str>, ScanError> {
2276        let Some(start) = self.input.byte_offset() else {
2277            return Ok(Cow::Owned(
2278                self.scan_tag_shorthand_suffix(false, false, "", mark)?,
2279            ));
2280        };
2281
2282        // Scan tag characters, checking for URI escapes.
2283        while is_tag_char(self.input.look_ch()) {
2284            if self.input.peek() == '%' {
2285                // URI escape found - must decode, so fall back to owned path.
2286                let current = self
2287                    .input
2288                    .byte_offset()
2289                    .expect("byte_offset() must remain available once enabled");
2290                let mut out = if let Some(slice) = self.input.slice_bytes(start, current) {
2291                    slice.to_owned()
2292                } else {
2293                    String::new()
2294                };
2295
2296                // Continue scanning with owned buffer.
2297                while is_tag_char(self.input.look_ch()) {
2298                    if self.input.peek() == '%' {
2299                        out.push(self.scan_uri_escapes(mark)?);
2300                    } else {
2301                        out.push(self.input.peek());
2302                        self.skip_non_blank();
2303                    }
2304                }
2305                return Ok(Cow::Owned(out));
2306            }
2307            self.skip_non_blank();
2308        }
2309
2310        let Some(end) = self.input.byte_offset() else {
2311            return Ok(Cow::Owned(
2312                self.scan_tag_shorthand_suffix(false, false, "", mark)?,
2313            ));
2314        };
2315
2316        if require_non_empty && start == end {
2317            return Err(ScanError::new_str(
2318                *mark,
2319                "while parsing a tag, did not find expected tag URI",
2320            ));
2321        }
2322
2323        if let Some(slice) = self.try_borrow_slice(start, end) {
2324            Ok(Cow::Borrowed(slice))
2325        } else {
2326            let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
2327                ScanError::new_str(
2328                    *mark,
2329                    "internal error: input advertised slicing but did not provide a slice",
2330                )
2331            })?;
2332            Ok(Cow::Owned(slice.to_owned()))
2333        }
2334    }
2335
2336    fn scan_tag_handle(&mut self, directive: bool, mark: &Marker) -> Result<String, ScanError> {
2337        let mut string = String::new();
2338        if self.input.look_ch() != '!' {
2339            return Err(ScanError::new_str(
2340                *mark,
2341                "while scanning a tag, did not find expected '!'",
2342            ));
2343        }
2344
2345        string.push(self.input.peek());
2346        self.skip_non_blank();
2347
2348        let n_chars = self.input.fetch_while_is_alpha(&mut string);
2349        self.mark.offsets.chars += n_chars;
2350        self.mark.col += n_chars;
2351        self.mark.offsets.bytes = self.input.byte_offset();
2352
2353        // Check if the trailing character is '!' and copy it.
2354        if self.input.peek() == '!' {
2355            string.push(self.input.peek());
2356            self.skip_non_blank();
2357        } else if directive && string != "!" {
2358            // It's either the '!' tag or not really a tag handle.  If it's a %TAG
2359            // directive, it's an error.  If it's a tag token, it must be a part of
2360            // URI.
2361            return Err(ScanError::new_str(
2362                *mark,
2363                "while parsing a tag directive, did not find expected '!'",
2364            ));
2365        }
2366        Ok(string)
2367    }
2368
2369    /// Scan for a tag prefix (6.8.2.2).
2370    ///
2371    /// There are 2 kinds of tag prefixes:
2372    ///   - Local: Starts with a `!`, contains only URI chars (`!foo`)
2373    ///   - Global: Starts with a tag char, contains then URI chars (`!foo,2000:app/`)
2374    fn scan_tag_prefix(&mut self, start_mark: &Marker) -> Result<String, ScanError> {
2375        let mut string = String::new();
2376
2377        if self.input.look_ch() == '!' {
2378            // If we have a local tag, insert and skip `!`.
2379            string.push(self.input.peek());
2380            self.skip_non_blank();
2381        } else if !is_tag_char(self.input.peek()) {
2382            // Otherwise, check if the first global tag character is valid.
2383            return Err(ScanError::new_str(
2384                *start_mark,
2385                "invalid global tag character",
2386            ));
2387        } else if self.input.peek() == '%' {
2388            // If it is valid and an escape sequence, escape it.
2389            string.push(self.scan_uri_escapes(start_mark)?);
2390        } else {
2391            // Otherwise, push the first character.
2392            string.push(self.input.peek());
2393            self.skip_non_blank();
2394        }
2395
2396        while is_uri_char(self.input.look_ch()) {
2397            if self.input.peek() == '%' {
2398                string.push(self.scan_uri_escapes(start_mark)?);
2399            } else {
2400                string.push(self.input.peek());
2401                self.skip_non_blank();
2402            }
2403        }
2404
2405        Ok(string)
2406    }
2407
2408    /// Scan for a verbatim tag.
2409    ///
2410    /// The prefixing `!<` must _not_ have been skipped.
2411    fn scan_verbatim_tag(&mut self, start_mark: &Marker) -> Result<String, ScanError> {
2412        // Eat `!<`
2413        self.skip_non_blank();
2414        self.skip_non_blank();
2415
2416        let mut string = String::new();
2417        while is_uri_char(self.input.look_ch()) {
2418            if self.input.peek() == '%' {
2419                string.push(self.scan_uri_escapes(start_mark)?);
2420            } else {
2421                string.push(self.input.peek());
2422                self.skip_non_blank();
2423            }
2424        }
2425
2426        if string.is_empty() {
2427            return Err(ScanError::new_str(
2428                *start_mark,
2429                "while parsing a tag, did not find expected tag URI",
2430            ));
2431        }
2432
2433        if self.input.peek() != '>' {
2434            return Err(ScanError::new_str(
2435                *start_mark,
2436                "while scanning a verbatim tag, did not find the expected '>'",
2437            ));
2438        }
2439        self.skip_non_blank();
2440
2441        Ok(string)
2442    }
2443
2444    fn scan_tag_shorthand_suffix(
2445        &mut self,
2446        _directive: bool,
2447        _is_secondary: bool,
2448        head: &str,
2449        mark: &Marker,
2450    ) -> Result<String, ScanError> {
2451        let mut length = head.len();
2452        let mut string = String::new();
2453
2454        // Copy the head if needed.
2455        // Note that we don't copy the leading '!' character.
2456        if length > 1 {
2457            string.extend(head.chars().skip(1));
2458        }
2459
2460        while is_tag_char(self.input.look_ch()) {
2461            // Check if it is a URI-escape sequence.
2462            if self.input.peek() == '%' {
2463                string.push(self.scan_uri_escapes(mark)?);
2464            } else {
2465                string.push(self.input.peek());
2466                self.skip_non_blank();
2467            }
2468
2469            length += 1;
2470        }
2471
2472        if length == 0 {
2473            return Err(ScanError::new_str(
2474                *mark,
2475                "while parsing a tag, did not find expected tag URI",
2476            ));
2477        }
2478
2479        Ok(string)
2480    }
2481
2482    fn scan_uri_escapes(&mut self, mark: &Marker) -> Result<char, ScanError> {
2483        let mut width = 0usize;
2484        let mut bytes = [0u8; 4];
2485        let mut bytes_len = 0usize;
2486        loop {
2487            self.input.lookahead(3);
2488
2489            let c = self.input.peek_nth(1);
2490            let nc = self.input.peek_nth(2);
2491
2492            if !(self.input.peek() == '%' && is_hex(c) && is_hex(nc)) {
2493                return Err(ScanError::new_str(
2494                    *mark,
2495                    "while parsing a tag, found an invalid escape sequence",
2496                ));
2497            }
2498
2499            let byte = u8::try_from((as_hex(c) << 4) + as_hex(nc))
2500                .expect("two hex nibbles always fit in a byte");
2501            if width == 0 {
2502                width = match byte {
2503                    _ if byte & 0x80 == 0x00 => 1,
2504                    _ if byte & 0xE0 == 0xC0 => 2,
2505                    _ if byte & 0xF0 == 0xE0 => 3,
2506                    _ if byte & 0xF8 == 0xF0 => 4,
2507                    _ => {
2508                        return Err(ScanError::new_str(
2509                            *mark,
2510                            "while parsing a tag, found an incorrect leading UTF-8 byte",
2511                        ));
2512                    }
2513                };
2514            } else if byte & 0xc0 != 0x80 {
2515                return Err(ScanError::new_str(
2516                    *mark,
2517                    "while parsing a tag, found an incorrect trailing UTF-8 byte",
2518                ));
2519            }
2520
2521            bytes[bytes_len] = byte;
2522            bytes_len += 1;
2523
2524            self.skip_n_non_blank(3);
2525
2526            width -= 1;
2527            if width == 0 {
2528                break;
2529            }
2530        }
2531
2532        let s = core::str::from_utf8(&bytes[..bytes_len]).map_err(|_| {
2533            ScanError::new_str(
2534                *mark,
2535                "while parsing a tag, found an invalid UTF-8 codepoint",
2536            )
2537        })?;
2538
2539        let mut chars = s.chars();
2540        match (chars.next(), chars.next()) {
2541            (Some(ch), None) => Ok(ch),
2542            _ => Err(ScanError::new_str(
2543                *mark,
2544                "while parsing a tag, found an invalid UTF-8 codepoint",
2545            )),
2546        }
2547    }
2548
2549    fn fetch_anchor(&mut self, alias: bool) -> ScanResult {
2550        self.save_simple_key();
2551        self.disallow_simple_key();
2552
2553        let tok = self.scan_anchor(alias)?;
2554
2555        self.tokens.push_back(tok.into());
2556
2557        Ok(())
2558    }
2559
2560    fn scan_anchor(&mut self, alias: bool) -> Result<Token<'input>, ScanError> {
2561        let start_mark = self.mark;
2562
2563        // Skip `&` / `*`.
2564        self.skip_non_blank();
2565
2566        // Borrow from input when possible.
2567        if let Some(start) = self.input.byte_offset() {
2568            while is_anchor_char(self.input.look_ch()) {
2569                self.skip_non_blank();
2570            }
2571
2572            let end = self
2573                .input
2574                .byte_offset()
2575                .expect("byte_offset() must remain available once enabled");
2576
2577            if start == end {
2578                return Err(ScanError::new_str(start_mark, "while scanning an anchor or alias, did not find expected alphabetic or numeric character"));
2579            }
2580
2581            let cow = if let Some(slice) = self.try_borrow_slice(start, end) {
2582                Cow::Borrowed(slice)
2583            } else if let Some(slice) = self.input.slice_bytes(start, end) {
2584                Cow::Owned(slice.to_owned())
2585            } else {
2586                return Err(ScanError::new_str(
2587                    start_mark,
2588                    "internal error: input advertised slicing but did not provide a slice",
2589                ));
2590            };
2591
2592            let tok = if alias {
2593                TokenType::Alias(cow)
2594            } else {
2595                TokenType::Anchor(cow)
2596            };
2597            return Ok(Token(Span::new(start_mark, self.mark), tok));
2598        }
2599
2600        let mut string = String::new();
2601        while is_anchor_char(self.input.look_ch()) {
2602            string.push(self.input.peek());
2603            self.skip_non_blank();
2604        }
2605
2606        if string.is_empty() {
2607            return Err(ScanError::new_str(start_mark, "while scanning an anchor or alias, did not find expected alphabetic or numeric character"));
2608        }
2609
2610        let tok = if alias {
2611            TokenType::Alias(string.into())
2612        } else {
2613            TokenType::Anchor(string.into())
2614        };
2615        Ok(Token(Span::new(start_mark, self.mark), tok))
2616    }
2617
2618    fn fetch_flow_collection_start(&mut self, tok: TokenType<'input>) -> ScanResult {
2619        // The indicators '[' and '{' may start a simple key.
2620        self.save_simple_key();
2621
2622        let start_mark = self.mark;
2623        let indicator = self.input.peek();
2624        self.flow_markers.push((start_mark, indicator));
2625
2626        self.roll_one_col_indent();
2627        self.increase_flow_level()?;
2628
2629        self.allow_simple_key();
2630
2631        self.skip_non_blank();
2632
2633        if tok == TokenType::FlowMappingStart {
2634            self.flow_mapping_started.push(true);
2635        } else {
2636            self.flow_mapping_started.push(false);
2637            self.implicit_flow_mapping_states
2638                .push(ImplicitMappingState::Possible);
2639        }
2640
2641        let token_index = self.tokens.len();
2642        self.skip_ws_to_eol(SkipTabs::Yes)?;
2643
2644        self.insert_token(token_index, Token(Span::new(start_mark, self.mark), tok));
2645        Ok(())
2646    }
2647
2648    fn fetch_flow_collection_end(&mut self, tok: TokenType<'input>) -> ScanResult {
2649        // A closing bracket without a corresponding opening is invalid YAML.
2650        if self.flow_level == 0 {
2651            return Err(ScanError::new_str(self.mark, "misplaced bracket"));
2652        }
2653
2654        let Some((open_mark, open_ch)) = self.flow_markers.pop() else {
2655            return Err(ScanError::new_str(self.mark, "misplaced bracket"));
2656        };
2657
2658        let (expected_open, actual_close) = match tok {
2659            TokenType::FlowSequenceEnd => ('[', ']'),
2660            TokenType::FlowMappingEnd => ('{', '}'),
2661            _ => unreachable!("flow collection end called with non-closing token"),
2662        };
2663
2664        if open_ch != expected_open {
2665            return Err(ScanError::new(
2666                open_mark,
2667                format!("mismatched bracket '{open_ch}' closed by '{actual_close}'"),
2668            ));
2669        }
2670
2671        let flow_level = self.flow_level;
2672
2673        self.remove_simple_key()?;
2674
2675        if matches!(tok, TokenType::FlowSequenceEnd) {
2676            self.end_implicit_mapping(self.mark, flow_level);
2677            // We are out exiting the flow sequence, nesting goes down 1 level.
2678            self.implicit_flow_mapping_states.pop();
2679        }
2680        self.flow_mapping_started.pop();
2681
2682        self.decrease_flow_level();
2683
2684        self.disallow_simple_key();
2685
2686        let start_mark = self.mark;
2687        self.skip_non_blank();
2688        let token_index = self.tokens.len();
2689        self.skip_ws_to_eol(SkipTabs::Yes)?;
2690
2691        // A flow collection within a flow mapping can be a key. In that case, the value may be
2692        // adjacent to the `:`.
2693        // ```yaml
2694        // - [ {a: b}:value ]
2695        // ```
2696        if self.flow_level > 0 {
2697            self.adjacent_value_allowed_at = self.mark.index();
2698        }
2699
2700        self.insert_token(token_index, Token(Span::new(start_mark, self.mark), tok));
2701        Ok(())
2702    }
2703
2704    /// Push the `FlowEntry` token and skip over the `,`.
2705    fn fetch_flow_entry(&mut self) -> ScanResult {
2706        self.remove_simple_key()?;
2707        self.allow_simple_key();
2708
2709        self.end_implicit_mapping(self.mark, self.flow_level);
2710        if self.current_flow_collection_is_sequence() {
2711            self.set_current_flow_mapping_started(false);
2712        }
2713
2714        let start_mark = self.mark;
2715        self.skip_non_blank();
2716        let token_index = self.tokens.len();
2717        self.skip_ws_to_eol(SkipTabs::Yes)?;
2718
2719        self.insert_token(
2720            token_index,
2721            Token(Span::new(start_mark, self.mark), TokenType::FlowEntry),
2722        );
2723        Ok(())
2724    }
2725
2726    fn increase_flow_level(&mut self) -> ScanResult {
2727        self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0)));
2728        self.flow_level = self
2729            .flow_level
2730            .checked_add(1)
2731            .ok_or_else(|| ScanError::new_str(self.mark, "recursion limit exceeded"))?;
2732        Ok(())
2733    }
2734
2735    fn decrease_flow_level(&mut self) {
2736        if self.flow_level > 0 {
2737            self.flow_level -= 1;
2738            self.simple_keys.pop().unwrap();
2739        }
2740    }
2741
2742    /// Push the `Block*` token(s) and skip over the `-`.
2743    ///
2744    /// Add an indentation level and push a `BlockSequenceStart` token if needed, then push a
2745    /// `BlockEntry` token.
2746    /// This function only skips over the `-` and does not fetch the entry value.
2747    fn fetch_block_entry(&mut self) -> ScanResult {
2748        if self.flow_level > 0 {
2749            // - * only allowed in block
2750            return Err(ScanError::new_str(
2751                self.mark,
2752                r#""-" is only valid inside a block"#,
2753            ));
2754        }
2755        // Check if we are allowed to start a new entry.
2756        if !self.simple_key_allowed {
2757            return Err(ScanError::new_str(
2758                self.mark,
2759                "block sequence entries are not allowed in this context",
2760            ));
2761        }
2762
2763        // ???, fixes test G9HC.
2764        if let Some(QueuedToken(span, QueuedTokenType::Anchor(..) | QueuedTokenType::Tag(..))) =
2765            self.tokens.back()
2766        {
2767            if self.mark.col == 0 && span.start.col == 0 && self.indent > -1 {
2768                return Err(ScanError::new_str(
2769                    span.start,
2770                    "invalid indentation for anchor",
2771                ));
2772            }
2773        }
2774
2775        // Skip over the `-`.
2776        let mark = self.mark;
2777        self.skip_non_blank();
2778
2779        // generate BLOCK-SEQUENCE-START if indented
2780        self.roll_indent(mark.col, None, TokenType::BlockSequenceStart, mark);
2781        let token_index = self.tokens.len();
2782        let found_tabs = self.skip_ws_to_eol(SkipTabs::Yes)?.found_tabs();
2783        self.input.lookahead(2);
2784        if found_tabs && self.input.next_char_is('-') && is_blank_or_breakz(self.input.peek_nth(1))
2785        {
2786            return Err(ScanError::new_str(
2787                self.mark,
2788                "'-' must be followed by a valid YAML whitespace",
2789            ));
2790        }
2791
2792        self.skip_ws_to_eol(SkipTabs::No)?;
2793        self.input.lookahead(1);
2794        if self.input.next_is_break() || self.input.next_is_flow() {
2795            self.roll_one_col_indent();
2796        }
2797
2798        self.remove_simple_key()?;
2799        self.allow_simple_key();
2800
2801        self.insert_token(
2802            token_index,
2803            Token(Span::empty(self.mark), TokenType::BlockEntry),
2804        );
2805
2806        Ok(())
2807    }
2808
2809    fn fetch_document_indicator(&mut self, t: TokenType<'input>) -> ScanResult {
2810        if let Some((mark, bracket)) = self.flow_markers.pop() {
2811            return Err(ScanError::new(
2812                mark,
2813                format!("unclosed bracket '{bracket}'"),
2814            ));
2815        }
2816
2817        self.unroll_indent(-1);
2818        self.remove_simple_key()?;
2819        self.disallow_simple_key();
2820
2821        let mark = self.mark;
2822
2823        self.skip_n_non_blank(3);
2824
2825        self.document_prefix_allowed = matches!(t, TokenType::DocumentEnd);
2826        self.tokens
2827            .push_back(Token(Span::new(mark, self.mark), t).into());
2828        Ok(())
2829    }
2830
2831    fn fetch_block_scalar(&mut self, literal: bool) -> ScanResult {
2832        self.save_simple_key();
2833        self.allow_simple_key();
2834        let tok = self.scan_block_scalar(literal)?;
2835
2836        self.tokens.push_back(tok.into());
2837        Ok(())
2838    }
2839
2840    #[allow(clippy::too_many_lines)]
2841    fn scan_block_scalar(&mut self, literal: bool) -> Result<Token<'input>, ScanError> {
2842        let start_mark = self.mark;
2843        let mut chomping = Chomping::Clip;
2844        let mut increment: usize = 0;
2845        let mut indent: usize = 0;
2846        let mut trailing_blank: bool;
2847        let mut leading_blank: bool = false;
2848        let style = if literal {
2849            ScalarStyle::Literal
2850        } else {
2851            ScalarStyle::Folded
2852        };
2853
2854        let mut string = String::new();
2855        let mut leading_break = String::new();
2856        let mut trailing_breaks = String::new();
2857        let mut chomping_break = String::new();
2858
2859        // skip '|' or '>'
2860        self.skip_non_blank();
2861        self.unroll_non_block_indents();
2862
2863        if self.input.look_ch() == '+' || self.input.peek() == '-' {
2864            if self.input.peek() == '+' {
2865                chomping = Chomping::Keep;
2866            } else {
2867                chomping = Chomping::Strip;
2868            }
2869            self.skip_non_blank();
2870            self.input.lookahead(1);
2871            if self.input.next_is_digit() {
2872                if self.input.peek() == '0' {
2873                    return Err(ScanError::new_str(
2874                        start_mark,
2875                        "while scanning a block scalar, found an indentation indicator equal to 0",
2876                    ));
2877                }
2878                increment = (self.input.peek() as usize) - ('0' as usize);
2879                self.skip_non_blank();
2880            }
2881        } else if self.input.next_is_digit() {
2882            if self.input.peek() == '0' {
2883                return Err(ScanError::new_str(
2884                    start_mark,
2885                    "while scanning a block scalar, found an indentation indicator equal to 0",
2886                ));
2887            }
2888
2889            increment = (self.input.peek() as usize) - ('0' as usize);
2890            self.skip_non_blank();
2891            self.input.lookahead(1);
2892            if self.input.peek() == '+' || self.input.peek() == '-' {
2893                if self.input.peek() == '+' {
2894                    chomping = Chomping::Keep;
2895                } else {
2896                    chomping = Chomping::Strip;
2897                }
2898                self.skip_non_blank();
2899            }
2900        }
2901
2902        self.skip_ws_to_eol(SkipTabs::Yes)?;
2903
2904        // Check if we are at the end of the line.
2905        self.input.lookahead(1);
2906        if !self.input.next_is_breakz() {
2907            return Err(ScanError::new_str(
2908                start_mark,
2909                "while scanning a block scalar, did not find expected comment or line break",
2910            ));
2911        }
2912
2913        if self.input.next_is_break() {
2914            self.input.lookahead(2);
2915            self.read_break(&mut chomping_break);
2916        }
2917
2918        if self.input.look_ch() == '\t' {
2919            return Err(ScanError::new_str(
2920                start_mark,
2921                "a block scalar content cannot start with a tab",
2922            ));
2923        }
2924
2925        if increment > 0 {
2926            indent = if self.indent >= 0 {
2927                (self.indent + increment as isize) as usize
2928            } else {
2929                increment
2930            }
2931        }
2932
2933        // Scan the leading line breaks and determine the indentation level if needed.
2934        if indent == 0 {
2935            self.skip_block_scalar_first_line_indent(&mut indent, &mut trailing_breaks);
2936        } else {
2937            self.skip_block_scalar_indent(indent, &mut trailing_breaks);
2938        }
2939
2940        // We have an end-of-stream with no content, e.g.:
2941        // ```yaml
2942        // - |+
2943        // ```
2944        if self.input.next_is_z() {
2945            let contents = match chomping {
2946                // We strip trailing line breaks. Nothing remains.
2947                Chomping::Strip => String::new(),
2948                // There was no newline after the chomping indicator.
2949                _ if self.mark.line == start_mark.line() => String::new(),
2950                // We clip lines, and there was a newline after the chomping indicator.
2951                // All other breaks are ignored.
2952                Chomping::Clip => chomping_break,
2953                // We keep lines. There was a newline after the chomping indicator but nothing
2954                // else.
2955                Chomping::Keep if trailing_breaks.is_empty() => chomping_break,
2956                // Otherwise, the newline after chomping is ignored.
2957                Chomping::Keep => trailing_breaks,
2958            };
2959
2960            let span = if contents.trim().is_empty() {
2961                Span::new(start_mark, self.mark)
2962            } else {
2963                Span::new(start_mark, self.mark).with_indent(Some(indent))
2964            };
2965
2966            return Ok(Token(span, TokenType::Scalar(style, contents.into())));
2967        }
2968
2969        if self.mark.col < indent && (self.mark.col as isize) > self.indent {
2970            if self.indent < 0 && self.mark.col == 0 {
2971                self.input.lookahead(4);
2972                if self.input.next_is_document_start()
2973                    || self.input.next_is_document_end()
2974                    || self.input.peek() == '#'
2975                {
2976                    // At the root level, an explicit indentation indicator can still yield an
2977                    // empty scalar when the next line is a document marker or comment.
2978                    // In this case, the scalar is terminated rather than under-indented.
2979                } else {
2980                    return Err(ScanError::new_str(
2981                        self.mark,
2982                        "wrongly indented line in block scalar",
2983                    ));
2984                }
2985            } else {
2986                return Err(ScanError::new_str(
2987                    self.mark,
2988                    "wrongly indented line in block scalar",
2989                ));
2990            }
2991        }
2992
2993        let mut line_buffer = String::with_capacity(100);
2994        let start_mark = self.mark;
2995        while self.mark.col == indent && !self.input.next_is_z() {
2996            if indent == 0 {
2997                self.input.lookahead(4);
2998                if self.input.next_is_document_end() {
2999                    break;
3000                }
3001            }
3002
3003            // We are at the first content character of a content line.
3004            trailing_blank = self.input.next_is_blank();
3005            if !literal && !leading_break.is_empty() && !leading_blank && !trailing_blank {
3006                string.push_str(&trailing_breaks);
3007                if trailing_breaks.is_empty() {
3008                    string.push(' ');
3009                }
3010            } else {
3011                string.push_str(&leading_break);
3012                string.push_str(&trailing_breaks);
3013            }
3014
3015            leading_break.clear();
3016            trailing_breaks.clear();
3017
3018            leading_blank = self.input.next_is_blank();
3019
3020            self.scan_block_scalar_content_line(&mut string, &mut line_buffer);
3021
3022            // break on EOF
3023            self.input.lookahead(2);
3024            if self.input.next_is_z() {
3025                break;
3026            }
3027
3028            self.read_break(&mut leading_break);
3029
3030            // Eat the following indentation spaces and line breaks.
3031            self.skip_block_scalar_indent(indent, &mut trailing_breaks);
3032        }
3033
3034        // Chomp the tail.
3035        if chomping != Chomping::Strip {
3036            string.push_str(&leading_break);
3037            // If we had reached an eof but the last character wasn't an end-of-line, check if the
3038            // last line was indented at least as the rest of the scalar, then we need to consider
3039            // there is a newline.
3040            if self.input.next_is_z() && self.mark.col >= indent.max(1) {
3041                string.push('\n');
3042            }
3043        }
3044
3045        if chomping == Chomping::Keep {
3046            string.push_str(&trailing_breaks);
3047        }
3048
3049        let span = if string.trim().is_empty() {
3050            Span::new(start_mark, self.mark)
3051        } else {
3052            Span::new(start_mark, self.mark).with_indent(Some(indent))
3053        };
3054
3055        Ok(Token(span, TokenType::Scalar(style, string.into())))
3056    }
3057
3058    /// Retrieve the contents of the line, parsing it as a block scalar.
3059    ///
3060    /// The contents will be appended to `string`. `line_buffer` is used as a temporary buffer to
3061    /// store bytes before pushing them to `string` and thus avoiding reallocating more than
3062    /// necessary. `line_buffer` is assumed to be empty upon calling this function. It will be
3063    /// `clear`ed before the end of the function.
3064    ///
3065    /// This function assumes the first character to read is the first content character in the
3066    /// line. This function does not consume the line break character(s) after the line.
3067    fn scan_block_scalar_content_line(&mut self, string: &mut String, line_buffer: &mut String) {
3068        // Start by evaluating characters in the buffer.
3069        while !self.input.buf_is_empty() && !self.input.next_is_breakz() {
3070            string.push(self.input.peek());
3071            // We may technically skip non-blank characters. However, the only distinction is
3072            // to determine what is leading whitespace and what is not. Here, we read the
3073            // contents of the line until either EOF or a line break. We know we will not read
3074            // `self.leading_whitespace` until the end of the line, where it will be reset.
3075            // This allows us to call a slightly less expensive function.
3076            self.skip_blank();
3077        }
3078
3079        // All characters that were in the buffer were consumed. We need to check if more
3080        // follow.
3081        if self.input.buf_is_empty() {
3082            // We will read all consecutive non-breakz characters. We push them into a
3083            // temporary buffer. The main difference with going through `self.buffer` is that
3084            // characters are appended here as their real size (1B for ASCII, or up to 4 bytes for
3085            // UTF-8). We can then use the internal `line_buffer` `Vec` to push data into `string`
3086            // (using `String::push_str`).
3087
3088            // line_buffer is empty at this point so we can compute n_chars here as well
3089            let mut n_chars = 0;
3090            debug_assert!(line_buffer.is_empty());
3091            while let Some(c) = self.input.raw_read_non_breakz_ch() {
3092                line_buffer.push(c);
3093                n_chars += 1;
3094            }
3095
3096            // We need to manually update our position; we haven't called a `skip` function.
3097            self.mark.col += n_chars;
3098            self.mark.offsets.chars += n_chars;
3099            self.mark.offsets.bytes = self.input.byte_offset();
3100
3101            // We can now append our bytes to our `string`.
3102            string.reserve(line_buffer.len());
3103            string.push_str(line_buffer);
3104            // This clears the _contents_ without touching the _capacity_.
3105            line_buffer.clear();
3106        }
3107    }
3108
3109    /// Skip the block scalar indentation and empty lines.
3110    fn skip_block_scalar_indent(&mut self, indent: usize, breaks: &mut String) {
3111        loop {
3112            // Consume all spaces. Tabs cannot be used as indentation.
3113            if indent < self.input.bufmaxlen() - 2 {
3114                self.input.lookahead(self.input.bufmaxlen());
3115                while self.mark.col < indent && self.input.peek() == ' ' {
3116                    self.skip_blank();
3117                }
3118            } else {
3119                loop {
3120                    self.input.lookahead(self.input.bufmaxlen());
3121                    while !self.input.buf_is_empty()
3122                        && self.mark.col < indent
3123                        && self.input.peek() == ' '
3124                    {
3125                        self.skip_blank();
3126                    }
3127                    // If we reached our indent, we can break. We must also break if we have
3128                    // reached content or EOF; that is, the buffer is not empty and the next
3129                    // character is not a space.
3130                    if self.mark.col == indent
3131                        || (!self.input.buf_is_empty() && self.input.peek() != ' ')
3132                    {
3133                        break;
3134                    }
3135                }
3136                self.input.lookahead(2);
3137            }
3138
3139            // If our current line is empty, skip over the break and continue looping.
3140            if self.input.next_is_break() {
3141                self.read_break(breaks);
3142            } else {
3143                // Otherwise, we have a content line. Return control.
3144                break;
3145            }
3146        }
3147    }
3148
3149    /// Determine the indentation level for a block scalar from the first line of its contents.
3150    ///
3151    /// The function skips over whitespace-only lines and sets `indent` to the longest
3152    /// whitespace line that was encountered.
3153    fn skip_block_scalar_first_line_indent(&mut self, indent: &mut usize, breaks: &mut String) {
3154        let mut max_indent = 0;
3155        loop {
3156            // Consume all spaces. Tabs cannot be used as indentation.
3157            while self.input.look_ch() == ' ' {
3158                self.skip_blank();
3159            }
3160
3161            if self.mark.col > max_indent {
3162                max_indent = self.mark.col;
3163            }
3164
3165            if self.input.next_is_break() {
3166                // If our current line is empty, skip over the break and continue looping.
3167                self.input.lookahead(2);
3168                self.read_break(breaks);
3169            } else {
3170                // Otherwise, we have a content line. Return control.
3171                break;
3172            }
3173        }
3174
3175        // In case a YAML document looks like:
3176        // ```yaml
3177        // |
3178        // foo
3179        // bar
3180        // ```
3181        // We need to set the indent to 0 and not 1. In all other cases, the indent must be at
3182        // least 1. When in the above example, `self.indent` will be set to -1.
3183        *indent = max_indent.max((self.indent + 1) as usize);
3184        if self.indent > 0 {
3185            *indent = (*indent).max(1);
3186        }
3187    }
3188
3189    fn fetch_flow_scalar(&mut self, single: bool) -> ScanResult {
3190        self.save_simple_key();
3191        self.disallow_simple_key();
3192
3193        let token_index = self.tokens.len();
3194        let tok = self.scan_flow_scalar(single)?;
3195
3196        // From spec: To ensure JSON compatibility, if a key inside a flow mapping is JSON-like,
3197        // YAML allows the following value to be specified adjacent to the “:”.
3198        if self.skip_to_next_token(true)? {
3199            self.adjacent_value_allowed_at = usize::MAX;
3200        } else {
3201            self.adjacent_value_allowed_at = self.mark.index();
3202        }
3203
3204        self.insert_token(token_index, tok);
3205        Ok(())
3206    }
3207
3208    #[allow(clippy::too_many_lines)]
3209    fn scan_flow_scalar(&mut self, single: bool) -> Result<Token<'input>, ScanError> {
3210        let start_mark = self.mark;
3211
3212        // Output scalar contents.
3213        let mut buf = match self.input.byte_offset() {
3214            Some(off) => FlowScalarBuf::new_borrowed(off + self.input.peek().len_utf8()),
3215            None => FlowScalarBuf::new_owned(),
3216        };
3217
3218        // Scratch used to consume the *first* line break in a break run without emitting it.
3219        // (The first break folds to ' ' or to nothing depending on escaping rules.)
3220        let mut break_scratch = String::new();
3221
3222        /* Eat the left quote. */
3223        self.skip_non_blank();
3224
3225        loop {
3226            /* Check for a document indicator. */
3227            self.input.lookahead(4);
3228
3229            if self.mark.col == 0 && self.input.next_is_document_indicator() {
3230                return Err(ScanError::new_str(
3231                    start_mark,
3232                    "while scanning a quoted scalar, found unexpected document indicator",
3233                ));
3234            }
3235
3236            if self.input.next_is_z() {
3237                return Err(ScanError::new_str(start_mark, "unclosed quote"));
3238            }
3239
3240            // Do not enforce block indentation inside quoted (flow) scalars.
3241            // YAML allows line breaks within quoted scalars.
3242            let mut leading_blanks = false;
3243            self.consume_flow_scalar_non_whitespace_chars(
3244                single,
3245                &mut buf,
3246                &mut leading_blanks,
3247                &start_mark,
3248            )?;
3249
3250            match self.input.look_ch() {
3251                '\'' if single => break,
3252                '"' if !single => break,
3253                _ => {}
3254            }
3255
3256            // --- Faster whitespace / line break handling (no temporary Strings) ---
3257            //
3258            // Instead of:
3259            //   - collecting blanks into `whitespaces` and then copying them
3260            //   - collecting breaks into `leading_break` / `trailing_breaks` and then copying
3261            //
3262            // We do:
3263            //   - append trailing blanks directly to `string`, remember where they started,
3264            //     and truncate them if a line break follows.
3265            //   - for line breaks: consume the first break into a scratch (discarded),
3266            //     append subsequent breaks directly to `string`.
3267            //
3268            // These flags replace temporary-string emptiness checks:
3269            //   has_leading_break  <=> !leading_break.is_empty()
3270            //   has_trailing_breaks <=> !trailing_breaks.is_empty()
3271            let mut trailing_ws_start: Option<usize> = None;
3272            let mut has_leading_break = false;
3273            let mut has_trailing_breaks = false;
3274
3275            // For the borrowed path: track the (byte) start of a pending whitespace run.
3276            let mut pending_ws_start: Option<usize> = None;
3277
3278            // Consume blank characters.
3279            while self.input.next_is_blank() || self.input.next_is_break() {
3280                if self.input.next_is_blank() {
3281                    // Consume a space or a tab character.
3282                    if leading_blanks {
3283                        if self.input.peek() == '\t' && (self.mark.col as isize) < self.indent {
3284                            return Err(ScanError::new_str(
3285                                self.mark,
3286                                "tab cannot be used as indentation",
3287                            ));
3288                        }
3289                        self.skip_blank();
3290                    } else {
3291                        // Append to output immediately; if a break appears next, we'll truncate.
3292                        match buf {
3293                            FlowScalarBuf::Owned(ref mut string) => {
3294                                if trailing_ws_start.is_none() {
3295                                    trailing_ws_start = Some(string.len());
3296                                }
3297                                string.push(self.input.peek());
3298                            }
3299                            FlowScalarBuf::Borrowed { .. } => {
3300                                if pending_ws_start.is_none() {
3301                                    pending_ws_start = self.input.byte_offset();
3302                                }
3303                            }
3304                        }
3305                        self.skip_blank();
3306
3307                        if let (FlowScalarBuf::Borrowed { .. }, Some(ws_start), Some(ws_end)) =
3308                            (&mut buf, pending_ws_start, self.input.byte_offset())
3309                        {
3310                            buf.note_pending_ws(ws_start, ws_end);
3311                        }
3312                    }
3313                } else {
3314                    self.input.lookahead(2);
3315
3316                    // Check if it is a first line break.
3317                    if leading_blanks {
3318                        // Second+ line break in a run: preserve it.
3319                        match buf {
3320                            FlowScalarBuf::Owned(ref mut string) => self.read_break(string),
3321                            FlowScalarBuf::Borrowed { .. } => {
3322                                self.promote_flow_scalar_buf_to_owned(&start_mark, &mut buf)?;
3323                                let Some(string) = buf.as_owned_mut() else {
3324                                    unreachable!()
3325                                };
3326                                self.read_break(string);
3327                            }
3328                        }
3329                        has_trailing_breaks = true;
3330                    } else {
3331                        // First break: drop any trailing blanks we appended, then consume the break.
3332                        if let Some(pos) = trailing_ws_start.take() {
3333                            if let FlowScalarBuf::Owned(ref mut string) = buf {
3334                                string.truncate(pos);
3335                            }
3336                        }
3337
3338                        if pending_ws_start.take().is_some() {
3339                            // Trailing blanks before a break are discarded => transformation.
3340                            if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
3341                                self.promote_flow_scalar_buf_to_owned(&start_mark, &mut buf)?;
3342                            }
3343                            buf.discard_pending_ws();
3344                        } else {
3345                            buf.commit_pending_ws();
3346                        }
3347
3348                        break_scratch.clear();
3349                        self.read_break(&mut break_scratch);
3350                        // Keep `break_scratch` content (ignored) until next clear; no need to clear twice.
3351
3352                        has_leading_break = true;
3353                        leading_blanks = true;
3354                    }
3355                }
3356
3357                self.input.lookahead(1);
3358            }
3359
3360            // If we had a line break inside a quoted (flow) scalar, validate indentation
3361            // of the continuation line in block context.
3362            if leading_blanks && has_leading_break && self.flow_level == 0 {
3363                let next_ch = self.input.peek();
3364                let is_closing_quote = (single && next_ch == '\'') || (!single && next_ch == '"');
3365                if !is_closing_quote && (self.mark.col as isize) <= self.indent {
3366                    return Err(ScanError::new_str(
3367                        self.mark,
3368                        "invalid indentation in multiline quoted scalar",
3369                    ));
3370                }
3371            }
3372
3373            // Join the whitespace or fold line breaks.
3374            if leading_blanks {
3375                // Folding rule:
3376                //   if there was no leading break, preserve the pending whitespace already emitted
3377                //   if there was a leading break but no trailing breaks, fold to one space
3378                //   otherwise, preserve the trailing breaks already emitted
3379                if has_leading_break && !has_trailing_breaks {
3380                    match buf {
3381                        FlowScalarBuf::Owned(ref mut string) => string.push(' '),
3382                        FlowScalarBuf::Borrowed { .. } => {
3383                            self.promote_flow_scalar_buf_to_owned(&start_mark, &mut buf)?;
3384                            let Some(string) = buf.as_owned_mut() else {
3385                                unreachable!()
3386                            };
3387                            string.push(' ');
3388                        }
3389                    }
3390                }
3391            }
3392            // else: trailing blanks are already appended to `string`
3393        } // loop
3394
3395        // Eat the right quote.
3396        self.skip_non_blank();
3397        let end_mark = self.mark;
3398
3399        // Ensure there is no invalid trailing content.
3400        self.skip_ws_to_eol(SkipTabs::Yes)?;
3401        match self.input.peek() {
3402            // These can be encountered in flow sequences or mappings.
3403            ',' | '}' | ']' if self.flow_level > 0 => {}
3404            // An end-of-line / end-of-stream is fine. No trailing content.
3405            c if is_breakz(c) => {}
3406            // ':' can be encountered if our scalar is a key.
3407            // Outside of flow contexts, keys cannot span multiple lines
3408            ':' if self.flow_level == 0 && start_mark.line == self.mark.line => {}
3409            // Inside a flow context, this is allowed.
3410            ':' if self.flow_level > 0 => {}
3411            _ => {
3412                return Err(ScanError::new_str(
3413                    self.mark,
3414                    "invalid trailing content after double-quoted scalar",
3415                ));
3416            }
3417        }
3418
3419        let style = if single {
3420            ScalarStyle::SingleQuoted
3421        } else {
3422            ScalarStyle::DoubleQuoted
3423        };
3424
3425        let contents = match buf {
3426            FlowScalarBuf::Owned(string) => Cow::Owned(string),
3427            FlowScalarBuf::Borrowed {
3428                start,
3429                mut end,
3430                pending_ws_start,
3431                pending_ws_end,
3432            } => {
3433                // If we ended after a whitespace run, it is part of the output (no break followed).
3434                if pending_ws_start.is_some() {
3435                    end = pending_ws_end;
3436                }
3437                if let Some(slice) = self.try_borrow_slice(start, end) {
3438                    Cow::Borrowed(slice)
3439                } else {
3440                    let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
3441                        ScanError::new_str(
3442                            start_mark,
3443                            "internal error: input advertised offsets but did not provide a slice",
3444                        )
3445                    })?;
3446                    Cow::Owned(slice.to_owned())
3447                }
3448            }
3449        };
3450
3451        Ok(Token(
3452            Span::new(start_mark, end_mark),
3453            TokenType::Scalar(style, contents),
3454        ))
3455    }
3456
3457    /// Consume successive non-whitespace characters from a flow scalar.
3458    ///
3459    /// This function resolves escape sequences and stops upon encountering a whitespace, the end
3460    /// of the stream or the closing character for the scalar (`'` for single quoted scalars, `"`
3461    /// for double quoted scalars).
3462    ///
3463    /// # Errors
3464    /// Return an error if an invalid escape sequence is found.
3465    fn consume_flow_scalar_non_whitespace_chars(
3466        &mut self,
3467        single: bool,
3468        buf: &mut FlowScalarBuf,
3469        leading_blanks: &mut bool,
3470        start_mark: &Marker,
3471    ) -> Result<(), ScanError> {
3472        self.input.lookahead(2);
3473        while !is_blank_or_breakz(self.input.peek()) {
3474            match self.input.peek() {
3475                // Check for an escaped single quote.
3476                '\'' if self.input.peek_nth(1) == '\'' && single => {
3477                    if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
3478                        buf.commit_pending_ws();
3479                        self.promote_flow_scalar_buf_to_owned(start_mark, buf)?;
3480                    }
3481                    let Some(string) = buf.as_owned_mut() else {
3482                        unreachable!()
3483                    };
3484                    string.push('\'');
3485                    self.skip_n_non_blank(2);
3486                }
3487                // Check for the right quote.
3488                '\'' if single => break,
3489                '"' if !single => break,
3490                // Check for an escaped line break.
3491                '\\' if !single && is_break(self.input.peek_nth(1)) => {
3492                    self.input.lookahead(3);
3493                    if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
3494                        buf.commit_pending_ws();
3495                        self.promote_flow_scalar_buf_to_owned(start_mark, buf)?;
3496                    }
3497                    self.skip_non_blank();
3498                    self.skip_linebreak();
3499                    *leading_blanks = true;
3500                    break;
3501                }
3502                // Check for an escape sequence.
3503                '\\' if !single => {
3504                    if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
3505                        buf.commit_pending_ws();
3506                        self.promote_flow_scalar_buf_to_owned(start_mark, buf)?;
3507                    }
3508                    let Some(string) = buf.as_owned_mut() else {
3509                        unreachable!()
3510                    };
3511                    string.push(self.resolve_flow_scalar_escape_sequence(start_mark)?);
3512                }
3513                c => {
3514                    match buf {
3515                        FlowScalarBuf::Owned(ref mut string) => {
3516                            string.push(c);
3517                        }
3518                        FlowScalarBuf::Borrowed { .. } => {
3519                            buf.commit_pending_ws();
3520                        }
3521                    }
3522                    self.skip_non_blank();
3523
3524                    if let Some(new_end) = self.input.byte_offset() {
3525                        if let FlowScalarBuf::Borrowed { end, .. } = buf {
3526                            *end = new_end;
3527                        }
3528                    }
3529                }
3530            }
3531            self.input.lookahead(2);
3532        }
3533        Ok(())
3534    }
3535
3536    /// Escape the sequence we encounter in a flow scalar.
3537    ///
3538    /// `self.input.peek()` must point to the `\` starting the escape sequence.
3539    ///
3540    /// # Errors
3541    /// Return an error if an invalid escape sequence is found.
3542    fn resolve_flow_scalar_escape_sequence(
3543        &mut self,
3544        start_mark: &Marker,
3545    ) -> Result<char, ScanError> {
3546        let mut code_length = 0usize;
3547        let mut ret = '\0';
3548
3549        match self.input.peek_nth(1) {
3550            '0' => ret = '\0',
3551            'a' => ret = '\x07',
3552            'b' => ret = '\x08',
3553            't' | '\t' => ret = '\t',
3554            'n' => ret = '\n',
3555            'v' => ret = '\x0b',
3556            'f' => ret = '\x0c',
3557            'r' => ret = '\x0d',
3558            'e' => ret = '\x1b',
3559            ' ' => ret = '\x20',
3560            '"' => ret = '"',
3561            '/' => ret = '/',
3562            '\\' => ret = '\\',
3563            // Unicode next line (#x85)
3564            'N' => ret = char::from_u32(0x85).unwrap(),
3565            // Unicode non-breaking space (#xA0)
3566            '_' => ret = char::from_u32(0xA0).unwrap(),
3567            // Unicode line separator (#x2028)
3568            'L' => ret = char::from_u32(0x2028).unwrap(),
3569            // Unicode paragraph separator (#x2029)
3570            'P' => ret = char::from_u32(0x2029).unwrap(),
3571            'x' => code_length = 2,
3572            'u' => code_length = 4,
3573            'U' => code_length = 8,
3574            _ => {
3575                return Err(ScanError::new_str(
3576                    *start_mark,
3577                    "while parsing a quoted scalar, found unknown escape character",
3578                ))
3579            }
3580        }
3581        self.skip_n_non_blank(2);
3582
3583        // Consume an arbitrary escape code.
3584        if code_length > 0 {
3585            self.input.lookahead(code_length);
3586            let mut value = 0u32;
3587            for i in 0..code_length {
3588                let c = self.input.peek_nth(i);
3589                if !is_hex(c) {
3590                    return Err(ScanError::new_str(
3591                        *start_mark,
3592                        "while parsing a quoted scalar, did not find expected hexadecimal number",
3593                    ));
3594                }
3595                value = (value << 4) + as_hex(c);
3596            }
3597
3598            self.skip_n_non_blank(code_length);
3599
3600            // Handle JSON surrogate pairs: high surrogate followed by low surrogate
3601            if code_length == 4 && (0xD800..=0xDBFF).contains(&value) {
3602                self.input.lookahead(2);
3603                if self.input.peek() == '\\' && self.input.peek_nth(1) == 'u' {
3604                    self.skip_n_non_blank(2);
3605                    self.input.lookahead(4);
3606                    let mut low_value = 0u32;
3607                    for i in 0..4 {
3608                        let c = self.input.peek_nth(i);
3609                        if !is_hex(c) {
3610                            return Err(ScanError::new_str(
3611                                *start_mark,
3612                                "while parsing a quoted scalar, did not find expected hexadecimal number for low surrogate",
3613                            ));
3614                        }
3615                        low_value = (low_value << 4) + as_hex(c);
3616                    }
3617                    if (0xDC00..=0xDFFF).contains(&low_value) {
3618                        value = 0x10000 + (((value - 0xD800) << 10) | (low_value - 0xDC00));
3619                        self.skip_n_non_blank(4);
3620                    } else {
3621                        return Err(ScanError::new_str(
3622                            *start_mark,
3623                            "while parsing a quoted scalar, found invalid low surrogate",
3624                        ));
3625                    }
3626                } else {
3627                    return Err(ScanError::new_str(
3628                        *start_mark,
3629                        "while parsing a quoted scalar, found high surrogate without following low surrogate",
3630                    ));
3631                }
3632            } else if code_length == 4 && (0xDC00..=0xDFFF).contains(&value) {
3633                return Err(ScanError::new_str(
3634                    *start_mark,
3635                    "while parsing a quoted scalar, found unpaired low surrogate",
3636                ));
3637            }
3638
3639            let Some(ch) = char::from_u32(value) else {
3640                return Err(ScanError::new_str(
3641                    *start_mark,
3642                    "while parsing a quoted scalar, found invalid Unicode character escape code",
3643                ));
3644            };
3645            ret = ch;
3646        }
3647        Ok(ret)
3648    }
3649
3650    fn fetch_plain_scalar(&mut self) -> ScanResult {
3651        self.save_simple_key();
3652        self.disallow_simple_key();
3653
3654        let token_index = self.tokens.len();
3655        let tok = self.scan_plain_scalar()?;
3656
3657        self.insert_token(token_index, tok);
3658        Ok(())
3659    }
3660
3661    /// Scan for a plain scalar.
3662    ///
3663    /// Plain scalars are the most readable but restricted style. They may span multiple lines in
3664    /// some contexts.
3665    #[allow(clippy::too_many_lines)]
3666    fn scan_plain_scalar(&mut self) -> Result<Token<'input>, ScanError> {
3667        self.unroll_non_block_indents();
3668        let indent = self.indent + 1;
3669        let start_mark = self.mark;
3670
3671        if self.flow_level > 0 && (start_mark.col as isize) < indent {
3672            return Err(ScanError::new_str(
3673                start_mark,
3674                "invalid indentation in flow construct",
3675            ));
3676        }
3677
3678        let mut string = String::with_capacity(32);
3679        self.buf_whitespaces.clear();
3680        self.buf_leading_break.clear();
3681        self.buf_trailing_breaks.clear();
3682        let mut end_mark = self.mark;
3683
3684        loop {
3685            self.input.lookahead(4);
3686            if (self.mark.col == 0 && self.input.next_is_document_indicator())
3687                || self.input.peek() == '#'
3688            {
3689                // BS4K: If a `#` starts a comment after some separation spaces following content
3690                // of a plain scalar in block context, and there is potential continuation on the
3691                // next line, this is invalid. We cannot decide yet if there will be continuation,
3692                // so record that a comment interrupted a plain scalar.
3693                if self.input.peek() == '#'
3694                    && !string.is_empty()
3695                    && !self.buf_whitespaces.is_empty()
3696                    && self.flow_level == 0
3697                {
3698                    self.interrupted_plain_by_comment = Some(self.mark);
3699                }
3700                break;
3701            }
3702
3703            if self.flow_level > 0 && self.input.peek() == '-' && is_flow(self.input.peek_nth(1)) {
3704                return Err(ScanError::new_str(
3705                    self.mark,
3706                    "plain scalar cannot start with '-' followed by ,[]{}",
3707                ));
3708            }
3709
3710            if !self.input.next_is_blank_or_breakz()
3711                && self.input.next_can_be_plain_scalar(self.flow_level > 0)
3712            {
3713                if self.leading_whitespace {
3714                    if self.buf_leading_break.is_empty() {
3715                        string.push_str(&self.buf_leading_break);
3716                        string.push_str(&self.buf_trailing_breaks);
3717                        self.buf_trailing_breaks.clear();
3718                        self.buf_leading_break.clear();
3719                    } else {
3720                        if self.buf_trailing_breaks.is_empty() {
3721                            string.push(' ');
3722                        } else {
3723                            string.push_str(&self.buf_trailing_breaks);
3724                            self.buf_trailing_breaks.clear();
3725                        }
3726                        self.buf_leading_break.clear();
3727                    }
3728                    self.leading_whitespace = false;
3729                } else if !self.buf_whitespaces.is_empty() {
3730                    string.push_str(&self.buf_whitespaces);
3731                    self.buf_whitespaces.clear();
3732                }
3733
3734                // We can unroll the first iteration of the loop.
3735                string.push(self.input.peek());
3736                self.skip_non_blank();
3737                string.reserve(self.input.bufmaxlen());
3738
3739                // Add content non-blank characters to the scalar.
3740                let mut end = false;
3741                while !end {
3742                    // Fill the buffer once and process all characters in the buffer until the next
3743                    // fetch. Note that `next_can_be_plain_scalar` needs 2 lookahead characters,
3744                    // hence the `for` loop looping `self.input.bufmaxlen() - 1` times.
3745                    self.input.lookahead(self.input.bufmaxlen());
3746                    let (stop, chars_consumed) = self.input.fetch_plain_scalar_chunk(
3747                        &mut string,
3748                        self.input.bufmaxlen() - 1,
3749                        self.flow_level > 0,
3750                    );
3751                    end = stop;
3752                    self.mark.offsets.chars += chars_consumed;
3753                    self.mark.col += chars_consumed;
3754                    self.mark.offsets.bytes = self.input.byte_offset();
3755                }
3756                end_mark = self.mark;
3757            }
3758
3759            // We may reach the end of a plain scalar if:
3760            //  - We reach eof
3761            //  - We reach ": "
3762            //  - We find a flow character in a flow context
3763            if !(self.input.next_is_blank() || self.input.next_is_break()) {
3764                break;
3765            }
3766
3767            // Process blank characters.
3768            self.input.lookahead(2);
3769            while self.input.next_is_blank_or_break() {
3770                if self.input.next_is_blank() {
3771                    if !self.leading_whitespace {
3772                        self.buf_whitespaces.push(self.input.peek());
3773                        self.skip_blank();
3774                    } else if (self.mark.col as isize) < indent && self.input.peek() == '\t' {
3775                        // Tabs in an indentation columns are allowed if and only if the line is
3776                        // empty. Skip to the end of the line.
3777                        self.skip_ws_to_eol(SkipTabs::Yes)?;
3778                        if !self.input.next_is_breakz() {
3779                            return Err(ScanError::new_str(
3780                                start_mark,
3781                                "while scanning a plain scalar, found a tab",
3782                            ));
3783                        }
3784                    } else {
3785                        self.skip_blank();
3786                    }
3787                } else {
3788                    // Check if it is a first line break
3789                    if self.leading_whitespace {
3790                        self.skip_break();
3791                        self.buf_trailing_breaks.push('\n');
3792                    } else {
3793                        self.buf_whitespaces.clear();
3794                        self.skip_break();
3795                        self.buf_leading_break.push('\n');
3796                        self.leading_whitespace = true;
3797                    }
3798                }
3799                self.input.lookahead(2);
3800            }
3801
3802            // check indentation level
3803            if self.flow_level == 0 && (self.mark.col as isize) < indent {
3804                break;
3805            }
3806        }
3807
3808        if self.leading_whitespace {
3809            self.allow_simple_key();
3810        }
3811
3812        if string.is_empty() {
3813            // `fetch_plain_scalar` must absolutely consume at least one byte. Otherwise,
3814            // `fetch_next_token` will never stop calling it. An empty plain scalar may happen with
3815            // erroneous inputs such as "{...".
3816            Err(ScanError::new_str(
3817                start_mark,
3818                "unexpected end of plain scalar",
3819            ))
3820        } else {
3821            let contents = if let (Some(start), Some(end)) =
3822                (start_mark.byte_offset(), end_mark.byte_offset())
3823            {
3824                match self.try_borrow_slice(start, end) {
3825                    Some(slice) if slice == string => Cow::Borrowed(slice),
3826                    _ => Cow::Owned(string),
3827                }
3828            } else {
3829                Cow::Owned(string)
3830            };
3831
3832            Ok(Token(
3833                Span::new(start_mark, end_mark),
3834                TokenType::Scalar(ScalarStyle::Plain, contents),
3835            ))
3836        }
3837    }
3838
3839    fn fetch_key(&mut self) -> ScanResult {
3840        let start_mark = self.mark;
3841        if self.flow_level == 0 {
3842            // Check if we are allowed to start a new key (not necessarily simple).
3843            if !self.simple_key_allowed {
3844                return Err(ScanError::new_str(
3845                    self.mark,
3846                    "mapping keys are not allowed in this context",
3847                ));
3848            }
3849            self.roll_indent(
3850                start_mark.col,
3851                None,
3852                TokenType::BlockMappingStart,
3853                start_mark,
3854            );
3855        } else {
3856            // The scanner, upon emitting a `Key`, will prepend a `MappingStart` event.
3857            self.set_current_flow_mapping_started(true);
3858        }
3859
3860        self.remove_simple_key()?;
3861
3862        if self.flow_level == 0 {
3863            self.allow_simple_key();
3864        } else {
3865            self.disallow_simple_key();
3866        }
3867
3868        self.skip_non_blank();
3869        let token_index = self.tokens.len();
3870        self.explicit_key_tab_check_pending = false;
3871        let stopped_after_comment = self.skip_yaml_whitespace(true)?;
3872        if self.input.peek() == '\t' {
3873            return Err(ScanError::new_str(
3874                self.mark(),
3875                "tabs disallowed in this context",
3876            ));
3877        }
3878        self.explicit_key_tab_check_pending = stopped_after_comment;
3879        self.insert_token(
3880            token_index,
3881            Token(Span::new(start_mark, self.mark), TokenType::Key),
3882        );
3883        Ok(())
3884    }
3885
3886    /// Fetch a value in a mapping inside of a flow collection.
3887    ///
3888    /// This must not be called if [`self.flow_level`] is 0. This ensures the rules surrounding
3889    /// values in flow collections are respected prior to calling [`fetch_value`].
3890    ///
3891    /// [`self.flow_level`]: Self::flow_level
3892    /// [`fetch_value`]: Self::fetch_value
3893    fn fetch_flow_value(&mut self) -> ScanResult {
3894        let nc = self.input.peek_nth(1);
3895
3896        // If we encounter a ':' inside a flow collection and it is not immediately
3897        // followed by a blank or breakz:
3898        //   - We must check whether an adjacent value is allowed
3899        //     `["a":[]]` is valid. If the key is double-quoted, no need for a space. This
3900        //     is needed for JSON compatibility.
3901        //   - If not, we must ensure there is a space after the ':' and before its value.
3902        //     `[a: []]` is valid while `[a:[]]` isn't. `[a:b]` is treated as `["a:b"]`.
3903        //   - But if the value is empty (null), then it's okay.
3904        // The last line is for YAMLs like `[a:]`. The ':' is followed by a ']' (which is a
3905        // flow character), but the ']' is not the value. The value is an invisible empty
3906        // space which is represented as null ('~').
3907        if self.mark.index() != self.adjacent_value_allowed_at && (nc == '[' || nc == '{') {
3908            return Err(ScanError::new_str(
3909                self.mark,
3910                "':' may not precede any of `[{` in flow mapping",
3911            ));
3912        }
3913
3914        self.fetch_value()
3915    }
3916
    /// Fetch a value from a mapping (after a `:`).
    ///
    /// Consumes the `:` and queues a [`TokenType::Value`] token located at the `:`. Before that
    /// token, the tokens opening the mapping are queued as needed:
    ///   - If the current simple key candidate is still possible, a [`TokenType::Key`] is
    ///     inserted at the candidate's token position (and a block mapping start is rolled in
    ///     through [`Self::roll_indent`]).
    ///   - Otherwise the `:` is treated as following a complex key.
    ///   - When the `:` starts an implicit mapping inside a flow sequence, a
    ///     [`TokenType::FlowMappingStart`] is queued as well.
    ///
    /// # Errors
    /// Returns an error if:
    ///   - the `:` is followed by tab(s) only and then by `-` or an alphabetic character,
    ///   - the `:` of an implicit flow mapping is on a later line than its simple key,
    ///   - a complex-key `:` appears in block context where simple keys are not allowed,
    ///   - [`Self::skip_ws_to_eol`] or [`Self::simple_key_token_index`] fails.
    ///
    /// # Panics
    /// Panics if `self.simple_keys` is empty.
    fn fetch_value(&mut self) -> ScanResult {
        // Work on a snapshot of the simple key candidate; the stored one is updated further down.
        let sk = self.simple_keys.last().unwrap().clone();
        // Position of the ':' itself; used for the `Value` token and for error reporting.
        let start_mark = self.mark;
        // A ':' directly inside a `[...]` whose mapping has not been started yet opens an
        // implicit (single-pair) flow mapping.
        let is_implicit_flow_mapping = self.current_flow_collection_is_sequence()
            && !self.current_flow_mapping_started()
            && !self.implicit_flow_mapping_states.is_empty();
        if is_implicit_flow_mapping {
            // Remember at which flow level the implicit mapping was entered.
            *self.implicit_flow_mapping_states.last_mut().unwrap() =
                ImplicitMappingState::Inside(self.flow_level);
        }

        // Skip over ':'.
        self.skip_non_blank();
        // Error detection: if ':' is followed by tab(s) without any space, and then what looks
        // like a value, emit a helpful error. The check for '-' or alphanumeric is an intentional
        // heuristic that catches common cases (e.g., `key:\tvalue`, `key:\t-item`) without
        // rejecting valid YAML like `key:\t|` (block scalar) or `key:\t"quoted"`.
        // Note: This heuristic won't catch Unicode value starters like `key:\täöü`, but such
        // cases will still fail to parse correctly (just with a less specific error message).
        let mut trailing_tokens = VecDeque::new();
        if self.input.look_ch() == '\t' {
            // Any token queued while skipping whitespace is detached from the queue here and
            // re-appended after the `Value` token at the end of this function.
            let trailing_token_index = self.tokens.len();
            let whitespace = self.skip_ws_to_eol(SkipTabs::Yes)?;
            trailing_tokens = self.tokens.split_off(trailing_token_index);

            if !whitespace.has_valid_yaml_ws()
                && (self.input.peek() == '-' || self.input.next_is_alpha())
            {
                return Err(ScanError::new_str(
                    self.mark,
                    "':' must be followed by a valid YAML whitespace",
                ));
            }
        }

        if sk.possible {
            // The ':' terminates a simple key: retroactively insert the key-related tokens at
            // the position where the key started.
            let token_index = self.simple_key_token_index(&sk, start_mark)?;
            // insert simple key
            let tok = Token(Span::empty(sk.mark), TokenType::Key);
            self.insert_token(token_index, tok);
            if is_implicit_flow_mapping {
                // The key of an implicit flow mapping must be on the same line as its ':'.
                if sk.mark.line < start_mark.line {
                    return Err(ScanError::new_str(
                        start_mark,
                        "illegal placement of ':' indicator",
                    ));
                }
                // Inserted at the same index, so it ends up in front of the `Key` token.
                self.insert_token(
                    token_index,
                    Token(Span::empty(sk.mark), TokenType::FlowMappingStart),
                );
            }

            // Add the BLOCK-MAPPING-START token if needed.
            self.roll_indent(
                sk.mark.col,
                Some(sk.token_number),
                TokenType::BlockMappingStart,
                sk.mark,
            );
            self.roll_one_col_indent();

            // The candidate has been consumed; a value cannot itself start with a simple key.
            self.simple_keys.last_mut().unwrap().possible = false;
            self.disallow_simple_key();
        } else {
            if is_implicit_flow_mapping {
                self.tokens
                    .push_back(Token(Span::empty(start_mark), TokenType::FlowMappingStart).into());
            }
            // The ':' indicator follows a complex key.
            if self.flow_level == 0 {
                if !self.simple_key_allowed {
                    return Err(ScanError::new_str(
                        start_mark,
                        "mapping values are not allowed in this context",
                    ));
                }

                self.roll_indent(
                    start_mark.col,
                    None,
                    TokenType::BlockMappingStart,
                    start_mark,
                );
            }
            self.roll_one_col_indent();

            // After a complex-key ':', a simple key may follow in block context only.
            if self.flow_level == 0 {
                self.allow_simple_key();
            } else {
                self.disallow_simple_key();
            }
        }
        self.tokens
            .push_back(Token(Span::empty(start_mark), TokenType::Value).into());
        // Re-attach whatever was queued while skipping the tab-led whitespace after the ':'.
        self.tokens.append(&mut trailing_tokens);

        Ok(())
    }
4017
4018    /// Add an indentation level to the stack with the given block token, if needed.
4019    ///
4020    /// An indentation level is added only if:
4021    ///   - We are not in a flow-style construct (which don't have indentation per-se).
4022    ///   - The current column is further indented than the last indent we have registered.
4023    fn roll_indent(
4024        &mut self,
4025        col: usize,
4026        number: Option<usize>,
4027        tok: TokenType<'input>,
4028        mark: Marker,
4029    ) {
4030        if self.flow_level > 0 {
4031            return;
4032        }
4033
4034        // If the last indent was a non-block indent, remove it.
4035        // This means that we prepared an indent that we thought we wouldn't use, but realized just
4036        // now that it is a block indent.
4037        if self.indent <= col as isize {
4038            if let Some(indent) = self.indents.last() {
4039                if !indent.needs_block_end {
4040                    self.indent = indent.indent;
4041                    self.indents.pop();
4042                }
4043            }
4044        }
4045
4046        if self.indent < col as isize {
4047            self.indents.push(Indent {
4048                indent: self.indent,
4049                needs_block_end: true,
4050            });
4051            self.indent = col as isize;
4052            let tokens_parsed = self.tokens_parsed;
4053            match number {
4054                Some(n) => self.insert_token(n - tokens_parsed, Token(Span::empty(mark), tok)),
4055                None => self.tokens.push_back(Token(Span::empty(mark), tok).into()),
4056            }
4057        }
4058    }
4059
4060    /// Pop indentation levels from the stack as much as needed.
4061    ///
4062    /// Indentation levels are popped from the stack while they are further indented than `col`.
4063    /// If we are in a flow-style construct (which don't have indentation per-se), this function
4064    /// does nothing.
4065    fn unroll_indent(&mut self, col: isize) {
4066        if self.flow_level > 0 {
4067            return;
4068        }
4069        while self.indent > col {
4070            let indent = self.indents.pop().unwrap();
4071            self.indent = indent.indent;
4072            if indent.needs_block_end {
4073                self.tokens
4074                    .push_back(Token(Span::empty(self.mark), TokenType::BlockEnd).into());
4075            }
4076        }
4077    }
4078
4079    /// Add an indentation level of 1 column that does not start a block.
4080    ///
4081    /// See the documentation of [`Indent::needs_block_end`] for more details.
4082    /// An indentation is not added if we are inside a flow level or if the last indent is already
4083    /// a non-block indent.
4084    fn roll_one_col_indent(&mut self) {
4085        if self.flow_level == 0 && self.indents.last().is_some_and(|x| x.needs_block_end) {
4086            self.indents.push(Indent {
4087                indent: self.indent,
4088                needs_block_end: false,
4089            });
4090            self.indent += 1;
4091        }
4092    }
4093
4094    /// Unroll all last indents created with [`Self::roll_one_col_indent`].
4095    fn unroll_non_block_indents(&mut self) {
4096        while let Some(indent) = self.indents.last() {
4097            if indent.needs_block_end {
4098                break;
4099            }
4100            self.indent = indent.indent;
4101            self.indents.pop();
4102        }
4103    }
4104
4105    /// Mark the next token to be inserted as a potential simple key.
4106    fn save_simple_key(&mut self) {
4107        if self.simple_key_allowed {
4108            let required = self.flow_level == 0
4109                && self.indent == (self.mark.col as isize)
4110                && self.indents.last().unwrap().needs_block_end;
4111
4112            if let Some(last) = self.simple_keys.last_mut() {
4113                *last = SimpleKey {
4114                    mark: self.mark,
4115                    possible: true,
4116                    required,
4117                    token_number: self.tokens_parsed + self.tokens.len(),
4118                };
4119            }
4120        }
4121    }
4122
4123    fn remove_simple_key(&mut self) -> ScanResult {
4124        let last = self.simple_keys.last_mut().unwrap();
4125        if last.possible && last.required {
4126            return Err(self.simple_key_expected());
4127        }
4128
4129        last.possible = false;
4130        Ok(())
4131    }
4132
    /// Return whether the scanner is inside a block but outside of a flow sequence.
    ///
    /// Concretely, this is `true` whenever at least one indentation level is registered in
    /// `self.indents`.
    // NOTE(review): only the indent stack is inspected here; `flow_level` is not consulted, so
    // the "outside of a flow sequence" part relies on invariants kept elsewhere — confirm.
    fn is_within_block(&self) -> bool {
        !self.indents.is_empty()
    }
4137
4138    /// If an implicit mapping had started, end it.
4139    ///
4140    /// This function does not pop the state in [`implicit_flow_mapping_states`].
4141    ///
4142    /// [`implicit_flow_mapping_states`]: Self::implicit_flow_mapping_states
4143    fn end_implicit_mapping(&mut self, mark: Marker, flow_level: u8) {
4144        if self
4145            .implicit_flow_mapping_states
4146            .last()
4147            .is_some_and(|state| *state == ImplicitMappingState::Inside(flow_level))
4148        {
4149            *self.implicit_flow_mapping_states.last_mut().unwrap() = ImplicitMappingState::Possible;
4150            self.set_current_flow_mapping_started(false);
4151            self.tokens
4152                .push_back(Token(Span::empty(mark), TokenType::FlowMappingEnd).into());
4153        }
4154    }
4155
4156    fn current_flow_collection_is_sequence(&self) -> bool {
4157        self.flow_markers
4158            .last()
4159            .is_some_and(|(_, bracket)| *bracket == '[')
4160    }
4161
4162    fn current_flow_mapping_started(&self) -> bool {
4163        self.flow_mapping_started.last().copied().unwrap_or(false)
4164    }
4165
4166    fn set_current_flow_mapping_started(&mut self, started: bool) {
4167        if let Some(current) = self.flow_mapping_started.last_mut() {
4168            *current = started;
4169        }
4170    }
4171}
4172
/// Chomping, how final line breaks and trailing empty lines are interpreted.
///
/// See [YAML spec 8.1.1.2](https://yaml.org/spec/1.2.2/#8112-block-chomping-indicator).
///
/// Like the other public enums of this module, this type is `Copy` and `Debug` so that it can
/// be passed around by value and show up in diagnostics and assertion messages.
#[derive(Clone, Copy, PartialEq, Debug, Eq)]
pub enum Chomping {
    /// The final line break and any trailing empty lines are excluded.
    Strip,
    /// The final line break is preserved, but trailing empty lines are excluded.
    Clip,
    /// The final line break and trailing empty lines are included.
    Keep,
}
4185
4186#[cfg(test)]
4187mod test {
4188    use alloc::{
4189        borrow::{Cow, ToOwned},
4190        rc::Rc,
4191        string::String,
4192        vec::Vec,
4193    };
4194    use core::cell::Cell;
4195
4196    use crate::{
4197        input::{str::StrInput, BorrowedInput, BufferedInput, Input},
4198        scanner::{
4199            Comment, Marker, Placement, QueuedToken, QueuedTokenType, ScalarStyle, Scanner, Span,
4200            TEncoding, Token, TokenType,
4201        },
4202    };
4203
4204    struct CountingChars {
4205        chars: alloc::vec::IntoIter<char>,
4206        read: Rc<Cell<usize>>,
4207    }
4208
4209    impl Iterator for CountingChars {
4210        type Item = char;
4211
4212        fn next(&mut self) -> Option<Self::Item> {
4213            let next = self.chars.next();
4214            if next.is_some() {
4215                self.read.set(self.read.get() + 1);
4216            }
4217            next
4218        }
4219    }
4220
4221    struct SlicingOnlyInput<'input> {
4222        inner: StrInput<'input>,
4223        expose_slice: bool,
4224    }
4225
4226    impl<'input> SlicingOnlyInput<'input> {
4227        fn new(source: &'input str, expose_slice: bool) -> Self {
4228            Self {
4229                inner: StrInput::new(source),
4230                expose_slice,
4231            }
4232        }
4233    }
4234
4235    impl Input for SlicingOnlyInput<'_> {
4236        fn lookahead(&mut self, count: usize) {
4237            self.inner.lookahead(count);
4238        }
4239
4240        fn buflen(&self) -> usize {
4241            self.inner.buflen()
4242        }
4243
4244        fn bufmaxlen(&self) -> usize {
4245            self.inner.bufmaxlen()
4246        }
4247
4248        fn raw_read_ch(&mut self) -> char {
4249            self.inner.raw_read_ch()
4250        }
4251
4252        fn raw_read_non_breakz_ch(&mut self) -> Option<char> {
4253            self.inner.raw_read_non_breakz_ch()
4254        }
4255
4256        fn skip(&mut self) {
4257            self.inner.skip();
4258        }
4259
4260        fn skip_n(&mut self, count: usize) {
4261            self.inner.skip_n(count);
4262        }
4263
4264        fn peek(&self) -> char {
4265            self.inner.peek()
4266        }
4267
4268        fn peek_nth(&self, n: usize) -> char {
4269            self.inner.peek_nth(n)
4270        }
4271
4272        fn byte_offset(&self) -> Option<usize> {
4273            self.inner.byte_offset()
4274        }
4275
4276        fn slice_bytes(&self, start: usize, end: usize) -> Option<&str> {
4277            if self.expose_slice {
4278                self.inner.slice_bytes(start, end)
4279            } else {
4280                None
4281            }
4282        }
4283    }
4284
    // Borrowing is deliberately unsupported: this input never hands out an `'input` slice, no
    // matter which range is requested.
    impl<'input> BorrowedInput<'input> for SlicingOnlyInput<'input> {
        /// Always `None`; both arguments are ignored.
        fn slice_borrowed(&self, _start: usize, _end: usize) -> Option<&'input str> {
            None
        }
    }
4290
4291    #[test]
4292    fn anchor_character_set_allows_colon_and_rejects_flow_indicators() {
4293        use super::is_anchor_char;
4294
4295        assert!(is_anchor_char('x'));
4296        assert!(is_anchor_char('-'));
4297        assert!(is_anchor_char('_'));
4298        assert!(is_anchor_char(':'));
4299        assert!(is_anchor_char('#'));
4300        assert!(is_anchor_char('/'));
4301        assert!(is_anchor_char('?'));
4302
4303        for c in [',', '[', ']', '{', '}', ' ', '\t', '\n', '\r', '\0'] {
4304            assert!(
4305                !is_anchor_char(c),
4306                "character {c:?} must not be accepted in anchor/alias names"
4307            );
4308        }
4309    }
4310
4311    #[test]
4312    fn flow_simple_key_length_limit_bounds_buffering() {
4313        let mut yaml = String::from("[\n\"start\"\n");
4314        for _ in 0..600 {
4315            yaml.push_str("\"x\"\n");
4316        }
4317        let total_chars = yaml.chars().count();
4318        let read = Rc::new(Cell::new(0));
4319        let chars = yaml.chars().collect::<Vec<_>>().into_iter();
4320        let mut scanner = Scanner::new(BufferedInput::new(CountingChars {
4321            chars,
4322            read: Rc::clone(&read),
4323        }));
4324
4325        assert!(matches!(
4326            scanner.next_token().unwrap().unwrap().1,
4327            TokenType::StreamStart(_)
4328        ));
4329
4330        let token = scanner.next_token().unwrap().unwrap();
4331        assert!(matches!(token.1, TokenType::FlowSequenceStart));
4332
4333        let token = scanner.next_token().unwrap().unwrap();
4334        assert!(matches!(
4335            token.1,
4336            TokenType::Scalar(_, ref value) if value == "start"
4337        ));
4338        assert!(
4339            read.get() < total_chars,
4340            "scanner consumed all {total_chars} chars before yielding the first flow scalar"
4341        );
4342        assert!(
4343            read.get() <= super::SIMPLE_KEY_MAX_LOOKAHEAD + 128,
4344            "scanner read {} chars before yielding the first flow scalar",
4345            read.get()
4346        );
4347    }
4348
4349    #[test]
4350    fn comment_capture_does_not_change_leading_whitespace() {
4351        let mut scanner = Scanner::new(StrInput::new("# comment\n"));
4352
4353        let token = scanner.scan_comment_token().unwrap();
4354
4355        assert!(scanner.leading_whitespace);
4356        assert!(matches!(token.1, TokenType::Comment(ref comment) if comment.text == " comment"));
4357
4358        let mut scanner = Scanner::new(BufferedInput::new("# streaming\n".chars()));
4359        scanner.input.lookahead(1);
4360
4361        let token = scanner.scan_comment_token().unwrap();
4362
4363        assert!(scanner.leading_whitespace);
4364        assert!(matches!(token.1, TokenType::Comment(ref comment) if comment.text == " streaming"));
4365    }
4366
4367    #[test]
4368    fn comment_capture_falls_back_to_owned_slice_when_borrow_unavailable() {
4369        let mut scanner = Scanner::new(SlicingOnlyInput::new("# sliced\n", true));
4370        scanner.input.lookahead(2);
4371        assert_eq!(scanner.input.peek_nth(1), ' ');
4372
4373        let token = scanner.scan_comment_token().unwrap();
4374
4375        assert!(matches!(token.1, TokenType::Comment(ref comment)
4376            if matches!(comment.text, Cow::Owned(ref text) if text == " sliced")));
4377    }
4378
4379    #[test]
4380    fn comment_capture_errors_when_offsets_have_no_slice() {
4381        let mut scanner = Scanner::new(SlicingOnlyInput::new("# broken\n", false));
4382
4383        let error = scanner.scan_comment_token().unwrap_err();
4384
4385        assert_eq!(
4386            error.info(),
4387            "internal error: input advertised offsets but did not provide a slice"
4388        );
4389    }
4390
    /// Converting a public [`Token`] into a [`QueuedToken`] and back with `into_public` must
    /// yield the original token, for each token variant listed below.
    #[test]
    fn queued_token_roundtrips_public_token_variants() {
        // The same arbitrary span is attached to every fixture token.
        let span = Span::new(Marker::new(0, 1, 0), Marker::new(7, 1, 7));
        // One fixture per token variant exercised by this test.
        let tokens = [
            Token(span, TokenType::StreamStart(TEncoding::Utf8)),
            Token(span, TokenType::StreamEnd),
            Token(span, TokenType::VersionDirective(1, 2)),
            Token(
                span,
                TokenType::TagDirective(Cow::Borrowed("!app!"), Cow::Borrowed("tag:app.example,")),
            ),
            Token(span, TokenType::DocumentStart),
            Token(span, TokenType::DocumentEnd),
            Token(span, TokenType::BlockSequenceStart),
            Token(span, TokenType::BlockMappingStart),
            Token(span, TokenType::BlockEnd),
            Token(span, TokenType::FlowSequenceStart),
            Token(span, TokenType::FlowSequenceEnd),
            Token(span, TokenType::FlowMappingStart),
            Token(span, TokenType::FlowMappingEnd),
            Token(span, TokenType::BlockEntry),
            Token(span, TokenType::FlowEntry),
            Token(span, TokenType::Key),
            Token(span, TokenType::Value),
            Token(span, TokenType::Alias(Cow::Borrowed("alias"))),
            Token(span, TokenType::Anchor(Cow::Borrowed("anchor"))),
            Token(
                span,
                TokenType::Tag(Cow::Borrowed("!"), Cow::Borrowed("tag")),
            ),
            Token(
                span,
                TokenType::Scalar(ScalarStyle::Literal, Cow::Borrowed("scalar")),
            ),
            Token(
                span,
                TokenType::Comment(
                    Comment::new(span, Cow::Borrowed(" comment")).with_placement(Placement::Right),
                ),
            ),
            Token(
                span,
                TokenType::ReservedDirective(
                    "reserved".to_owned(),
                    vec!["one".to_owned(), "two".to_owned()],
                ),
            ),
        ];

        for token in tokens {
            // Convert a clone so the original stays available for the comparison.
            let queued: QueuedToken = token.clone().into();

            assert_eq!(queued.into_public(), token);
        }
    }
4446
4447    #[test]
4448    fn comment_skipping_path_consumes_comment_without_tokenizing_it() {
4449        let mut scanner = Scanner::new(StrInput::new("# skipped\nnext: value\n"));
4450
4451        scanner.skip_yaml_whitespace(false).unwrap();
4452
4453        assert!(scanner.tokens.is_empty());
4454        assert_eq!(scanner.mark.line(), 2);
4455        assert_eq!(scanner.mark.col(), 0);
4456    }
4457
4458    #[test]
4459    fn yaml_whitespace_can_stop_after_queued_comment() {
4460        let mut scanner = Scanner::new(StrInput::new(" # queued\n# later\n"));
4461
4462        assert!(scanner.skip_yaml_whitespace(true).unwrap());
4463
4464        assert_eq!(scanner.tokens.len(), 1);
4465        assert!(matches!(
4466            scanner.tokens.front().unwrap().1,
4467            QueuedTokenType::Comment(ref comment) if comment.text == " queued"
4468        ));
4469        assert_eq!(scanner.mark.line(), 1);
4470        assert_eq!(scanner.mark.col(), 9);
4471    }
4472
4473    #[test]
4474    fn token_skip_can_stop_after_queued_comment() {
4475        let mut scanner = Scanner::new(StrInput::new("# first\n# second\n"));
4476
4477        assert!(scanner.skip_to_next_token(true).unwrap());
4478
4479        assert_eq!(scanner.tokens.len(), 1);
4480        assert!(matches!(
4481            scanner.tokens.front().unwrap().1,
4482            QueuedTokenType::Comment(ref comment) if comment.text == " first"
4483        ));
4484        assert_eq!(scanner.mark.line(), 2);
4485        assert_eq!(scanner.mark.col(), 0);
4486    }
4487
4488    #[test]
4489    fn scanner_emits_first_leading_comment_before_scanning_next_comment() {
4490        let mut scanner = Scanner::new(StrInput::new("# first\n# second\nkey: value\n"));
4491
4492        assert!(matches!(
4493            scanner.next_token().unwrap().unwrap().1,
4494            TokenType::StreamStart(_)
4495        ));
4496        assert!(matches!(
4497            scanner.next_token().unwrap().unwrap().1,
4498            TokenType::Comment(ref comment) if comment.text == " first"
4499        ));
4500        assert!(scanner.tokens.is_empty());
4501        assert!(matches!(
4502            scanner.next_token().unwrap().unwrap().1,
4503            TokenType::Comment(ref comment) if comment.text == " second"
4504        ));
4505    }
4506
4507    #[test]
4508    fn scanner_emits_quoted_scalar_comment_before_scanning_following_value() {
4509        let mut scanner = Scanner::new(StrInput::new("\"key\" # quoted\n: value\n"));
4510
4511        assert!(matches!(
4512            scanner.next_token().unwrap().unwrap().1,
4513            TokenType::StreamStart(_)
4514        ));
4515        assert!(matches!(
4516            scanner.next_token().unwrap().unwrap().1,
4517            TokenType::Scalar(ScalarStyle::DoubleQuoted, ref value) if value == "key"
4518        ));
4519        assert!(matches!(
4520            scanner.next_token().unwrap().unwrap().1,
4521            TokenType::Comment(ref comment) if comment.text == " quoted"
4522        ));
4523    }
4524
4525    #[test]
4526    fn flow_scalar_comment_disables_adjacent_value_lookahead() {
4527        let mut scanner = Scanner::new(StrInput::new("\"key\"\n# quoted\n: value\n"));
4528
4529        scanner.fetch_flow_scalar(false).unwrap();
4530
4531        assert_eq!(scanner.adjacent_value_allowed_at, usize::MAX);
4532        assert!(matches!(
4533            scanner.tokens.front().unwrap().1,
4534            QueuedTokenType::Scalar(ScalarStyle::DoubleQuoted, ref value) if value == "key"
4535        ));
4536        assert!(scanner.tokens.iter().any(|QueuedToken(_, token)| matches!(
4537            token,
4538            QueuedTokenType::Comment(comment) if comment.text == " quoted"
4539        )));
4540    }
4541
4542    #[test]
4543    fn deferred_error_waits_for_all_comment_tokens() {
4544        let mut scanner = Scanner::new(StrInput::new("# first\n# second\n@\n"));
4545
4546        assert!(matches!(
4547            scanner.next_token().unwrap().unwrap().1,
4548            TokenType::StreamStart(_)
4549        ));
4550        assert!(matches!(
4551            scanner.next_token().unwrap().unwrap().1,
4552            TokenType::Comment(ref comment) if comment.text == " first"
4553        ));
4554        assert!(matches!(
4555            scanner.next_token().unwrap().unwrap().1,
4556            TokenType::Comment(ref comment) if comment.text == " second"
4557        ));
4558
4559        let error = scanner.next_token().unwrap_err();
4560
4561        assert!(error.info().contains("unexpected character"));
4562    }
4563
4564    /// Ensure anchors scanned from `StrInput` are returned as `Cow::Borrowed`.
4565    #[test]
4566    fn anchor_name_is_borrowed_for_str_input() {
4567        let mut scanner = Scanner::new(StrInput::new("&anch\n"));
4568
4569        loop {
4570            let tok = scanner
4571                .next_token()
4572                .expect("valid YAML must scan without errors")
4573                .expect("scanner must eventually produce a token");
4574            if let TokenType::Anchor(name) = tok.1 {
4575                assert!(matches!(name, Cow::Borrowed("anch")));
4576                break;
4577            }
4578        }
4579    }
4580
    /// Ensure an anchor name stops in front of a non-printable control character: `&foo\u{0001}`
    /// yields the anchor `foo`, and the control character is not part of the name.
    #[test]
    fn anchor_name_rejects_non_printable_control_chars() {
        let mut scanner = Scanner::new(StrInput::new("&foo\u{0001}\n"));

        loop {
            let tok = scanner
                .next_token()
                .expect("scanning should not fail")
                .expect("scanner must eventually produce a token");
            if let TokenType::Anchor(name) = tok.1 {
                assert!(matches!(name, Cow::Borrowed("foo")));
                let next = scanner.next_token().expect("scanning should not fail");
                // Only checked when the next token is a scalar: it must then start with the
                // control character that was left out of the anchor name.
                if let Some(Token(_, TokenType::Scalar(_, rest))) = next {
                    assert!(rest.starts_with('\u{0001}'));
                }
                break;
            }
        }
    }
4601
4602    #[test]
4603    fn alias_name_rejects_non_printable_control_chars() {
4604        let mut scanner = Scanner::new(StrInput::new("*foo\u{0001}\n"));
4605
4606        loop {
4607            let tok = scanner
4608                .next_token()
4609                .expect("scanning should not fail")
4610                .expect("scanner must eventually produce a token");
4611            if let TokenType::Alias(name) = tok.1 {
4612                assert!(matches!(name, Cow::Borrowed("foo")));
4613                let next = scanner.next_token().expect("scanning should not fail");
4614                if let Some(Token(_, TokenType::Scalar(_, rest))) = next {
4615                    assert!(rest.starts_with('\u{0001}'));
4616                }
4617                break;
4618            }
4619        }
4620    }
4621
4622    #[test]
4623    fn alias_name_is_borrowed_for_str_input() {
4624        let mut scanner = Scanner::new(StrInput::new("*anch\n"));
4625
4626        loop {
4627            let tok = scanner
4628                .next_token()
4629                .expect("valid YAML must scan without errors")
4630                .expect("scanner must eventually produce a token");
4631            if let TokenType::Alias(name) = tok.1 {
4632                assert!(matches!(name, Cow::Borrowed("anch")));
4633                break;
4634            }
4635        }
4636    }
4637
4638    #[test]
4639    fn alias_name_scans_colon_as_part_of_name() {
4640        let mut scanner = Scanner::new(StrInput::new("*foo: bar\n"));
4641
4642        loop {
4643            let tok = scanner
4644                .next_token()
4645                .expect("scanner must not fail before alias token")
4646                .expect("scanner must eventually emit an alias token");
4647
4648            if let TokenType::Alias(name) = tok.1 {
4649                assert_eq!(name.as_ref(), "foo:");
4650                break;
4651            }
4652        }
4653    }
4654
4655    #[test]
4656    fn anchor_name_scans_colon_as_part_of_name() {
4657        let mut scanner = Scanner::new(StrInput::new("&foo: bar\n"));
4658
4659        loop {
4660            let tok = scanner
4661                .next_token()
4662                .expect("scanner must not fail before anchor token")
4663                .expect("scanner must eventually emit an anchor token");
4664
4665            if let TokenType::Anchor(name) = tok.1 {
4666                assert_eq!(name.as_ref(), "foo:");
4667                break;
4668            }
4669        }
4670    }
4671
4672    /// Ensure `%TAG` directive handle and prefix are borrowed when they are verbatim (no escapes).
4673    #[test]
4674    fn tag_directive_parts_are_borrowed_for_str_input() {
4675        let mut scanner = Scanner::new(StrInput::new("%TAG !e! tag:example.com,2000:app/\n"));
4676
4677        loop {
4678            let tok = scanner
4679                .next_token()
4680                .expect("valid YAML must scan without errors")
4681                .expect("scanner must eventually produce a token");
4682            if let TokenType::TagDirective(handle, prefix) = tok.1 {
4683                assert!(matches!(handle, Cow::Borrowed("!e!")));
4684                assert!(matches!(prefix, Cow::Borrowed("tag:example.com,2000:app/")));
4685                break;
4686            }
4687        }
4688    }
4689
4690    #[test]
4691    fn plain_scalar_is_borrowed_when_whitespace_free_for_str_input() {
4692        let mut scanner = Scanner::new(StrInput::new("foo\n"));
4693
4694        loop {
4695            let tok = scanner
4696                .next_token()
4697                .expect("valid YAML must scan without errors")
4698                .expect("scanner must eventually produce a token");
4699            if let TokenType::Scalar(_, value) = tok.1 {
4700                assert!(matches!(value, Cow::Borrowed("foo")));
4701                break;
4702            }
4703        }
4704    }
4705
4706    #[test]
4707    fn plain_scalar_is_borrowed_when_whitespace_present_for_str_input() {
4708        let mut scanner = Scanner::new(StrInput::new("foo bar\n"));
4709
4710        loop {
4711            let tok = scanner
4712                .next_token()
4713                .expect("valid YAML must scan without errors")
4714                .expect("scanner must eventually produce a token");
4715            if let TokenType::Scalar(_, value) = tok.1 {
4716                assert!(matches!(value, Cow::Borrowed("foo bar")));
4717                break;
4718            }
4719        }
4720    }
4721
4722    #[test]
4723    fn single_quoted_scalar_is_borrowed_when_verbatim_for_str_input() {
4724        let mut scanner = Scanner::new(StrInput::new("'foo bar'\n"));
4725
4726        loop {
4727            let tok = scanner
4728                .next_token()
4729                .expect("valid YAML must scan without errors")
4730                .expect("scanner must eventually produce a token");
4731            if let TokenType::Scalar(_, value) = tok.1 {
4732                assert!(matches!(value, Cow::Borrowed("foo bar")));
4733                break;
4734            }
4735        }
4736    }
4737
4738    #[test]
4739    fn single_quoted_scalar_is_owned_when_quote_is_escaped_for_str_input() {
4740        let mut scanner = Scanner::new(StrInput::new("'foo''bar'\n"));
4741
4742        loop {
4743            let tok = scanner
4744                .next_token()
4745                .expect("valid YAML must scan without errors")
4746                .expect("scanner must eventually produce a token");
4747            if let TokenType::Scalar(_, value) = tok.1 {
4748                assert!(matches!(value, Cow::Owned(_)));
4749                assert_eq!(&*value, "foo'bar");
4750                break;
4751            }
4752        }
4753    }
4754
4755    #[test]
4756    fn double_quoted_scalar_is_borrowed_when_verbatim_for_str_input() {
4757        let mut scanner = Scanner::new(StrInput::new("\"foo bar\"\n"));
4758
4759        loop {
4760            let tok = scanner
4761                .next_token()
4762                .expect("valid YAML must scan without errors")
4763                .expect("scanner must eventually produce a token");
4764            if let TokenType::Scalar(_, value) = tok.1 {
4765                assert!(matches!(value, Cow::Borrowed("foo bar")));
4766                break;
4767            }
4768        }
4769    }
4770
4771    #[test]
4772    fn double_quoted_scalar_is_owned_when_escape_sequence_present_for_str_input() {
4773        let mut scanner = Scanner::new(StrInput::new("\"foo\\nbar\"\n"));
4774
4775        loop {
4776            let tok = scanner
4777                .next_token()
4778                .expect("valid YAML must scan without errors")
4779                .expect("scanner must eventually produce a token");
4780            if let TokenType::Scalar(_, value) = tok.1 {
4781                assert!(matches!(value, Cow::Owned(_)));
4782                assert_eq!(&*value, "foo\nbar");
4783                break;
4784            }
4785        }
4786    }
4787
4788    #[test]
4789    fn plain_key_is_borrowed_for_str_input() {
4790        // Keys are just scalars in a key position; they should also be borrowed.
4791        let mut scanner = Scanner::new(StrInput::new("mykey: value\n"));
4792
4793        let mut found_key = false;
4794        let mut key_value: Option<Cow<'_, str>> = None;
4795
4796        loop {
4797            let tok = scanner
4798                .next_token()
4799                .expect("valid YAML must scan without errors");
4800            let Some(tok) = tok else { break };
4801
4802            if matches!(tok.1, TokenType::Key) {
4803                found_key = true;
4804            } else if found_key {
4805                if let TokenType::Scalar(_, value) = tok.1 {
4806                    key_value = Some(value);
4807                    break;
4808                }
4809            }
4810        }
4811
4812        assert!(found_key, "expected to find a Key token");
4813        let key_value = key_value.expect("expected to find a scalar after Key token");
4814        assert!(
4815            matches!(key_value, Cow::Borrowed("mykey")),
4816            "key should be borrowed, got: {key_value:?}"
4817        );
4818    }
4819
4820    #[test]
4821    fn quoted_key_is_borrowed_when_verbatim_for_str_input() {
4822        let mut scanner = Scanner::new(StrInput::new("\"mykey\": value\n"));
4823
4824        let mut found_key = false;
4825        let mut key_value: Option<Cow<'_, str>> = None;
4826
4827        loop {
4828            let tok = scanner
4829                .next_token()
4830                .expect("valid YAML must scan without errors");
4831            let Some(tok) = tok else { break };
4832
4833            if matches!(tok.1, TokenType::Key) {
4834                found_key = true;
4835            } else if found_key {
4836                if let TokenType::Scalar(_, value) = tok.1 {
4837                    key_value = Some(value);
4838                    break;
4839                }
4840            }
4841        }
4842
4843        assert!(found_key, "expected to find a Key token");
4844        let key_value = key_value.expect("expected to find a scalar after Key token");
4845        assert!(
4846            matches!(key_value, Cow::Borrowed("mykey")),
4847            "quoted key should be borrowed when verbatim, got: {key_value:?}"
4848        );
4849    }
4850
4851    #[test]
4852    fn tag_handle_and_suffix_are_borrowed_for_str_input() {
4853        // Test a tag like !!str which should have handle="!!" and suffix="str"
4854        let mut scanner = Scanner::new(StrInput::new("!!str foo\n"));
4855
4856        loop {
4857            let tok = scanner
4858                .next_token()
4859                .expect("valid YAML must scan without errors")
4860                .expect("scanner must eventually produce a token");
4861            if let TokenType::Tag(handle, suffix) = tok.1 {
4862                assert!(
4863                    matches!(handle, Cow::Borrowed("!!")),
4864                    "tag handle should be borrowed, got: {handle:?}"
4865                );
4866                assert!(
4867                    matches!(suffix, Cow::Borrowed("str")),
4868                    "tag suffix should be borrowed, got: {suffix:?}"
4869                );
4870                break;
4871            }
4872        }
4873    }
4874
4875    #[test]
4876    fn local_tag_suffix_is_borrowed_for_str_input() {
4877        // Test a local tag like !mytag which should have handle="!" and suffix="mytag"
4878        let mut scanner = Scanner::new(StrInput::new("!mytag foo\n"));
4879
4880        loop {
4881            let tok = scanner
4882                .next_token()
4883                .expect("valid YAML must scan without errors")
4884                .expect("scanner must eventually produce a token");
4885            if let TokenType::Tag(handle, suffix) = tok.1 {
4886                assert!(
4887                    matches!(handle, Cow::Borrowed("!")),
4888                    "local tag handle should be '!', got: {handle:?}"
4889                );
4890                assert!(
4891                    matches!(suffix, Cow::Borrowed("mytag")),
4892                    "local tag suffix should be borrowed, got: {suffix:?}"
4893                );
4894                break;
4895            }
4896        }
4897    }
4898
4899    #[test]
4900    fn tag_with_uri_escape_is_owned_for_str_input() {
4901        // Test a tag with URI escape like !my%20tag - suffix must be owned due to decoding
4902        let mut scanner = Scanner::new(StrInput::new("!!my%20tag foo\n"));
4903
4904        loop {
4905            let tok = scanner
4906                .next_token()
4907                .expect("valid YAML must scan without errors")
4908                .expect("scanner must eventually produce a token");
4909            if let TokenType::Tag(handle, suffix) = tok.1 {
4910                assert!(
4911                    matches!(handle, Cow::Borrowed("!!")),
4912                    "tag handle should still be borrowed, got: {handle:?}"
4913                );
4914                assert!(
4915                    matches!(suffix, Cow::Owned(_)),
4916                    "tag suffix with URI escape should be owned, got: {suffix:?}"
4917                );
4918                assert_eq!(&*suffix, "my tag");
4919                break;
4920            }
4921        }
4922    }
4923
4924    #[test]
4925    fn flow_scalar_buffer_tracks_pending_whitespace() {
4926        let mut borrowed = super::FlowScalarBuf::new_borrowed(2);
4927
4928        borrowed.note_pending_ws(5, 8);
4929        borrowed.commit_pending_ws();
4930        assert!(matches!(
4931            borrowed,
4932            super::FlowScalarBuf::Borrowed {
4933                end: 8,
4934                pending_ws_start: None,
4935                pending_ws_end: 8,
4936                ..
4937            }
4938        ));
4939
4940        borrowed.note_pending_ws(9, 11);
4941        borrowed.discard_pending_ws();
4942        assert!(matches!(
4943            borrowed,
4944            super::FlowScalarBuf::Borrowed {
4945                end: 8,
4946                pending_ws_start: None,
4947                pending_ws_end: 8,
4948                ..
4949            }
4950        ));
4951        assert!(borrowed.as_owned_mut().is_none());
4952
4953        let mut owned = super::FlowScalarBuf::new_owned();
4954        owned.as_owned_mut().unwrap().push_str("owned");
4955        assert!(matches!(owned, super::FlowScalarBuf::Owned(ref s) if s == "owned"));
4956    }
4957
4958    fn first_scanner_error_info(input: &str) -> String {
4959        let mut scanner = Scanner::new(StrInput::new(input));
4960        loop {
4961            match scanner.next_token() {
4962                Ok(Some(_)) => {}
4963                Ok(None) => panic!("expected scanner error"),
4964                Err(error) => return error.info().to_owned(),
4965            }
4966        }
4967    }
4968
4969    fn first_scalar_value(input: &str) -> String {
4970        let mut scanner = Scanner::new(StrInput::new(input));
4971        loop {
4972            match scanner.next_token().expect("scanner should not error") {
4973                Some(Token(_, TokenType::Scalar(_, value))) => return value.into_owned(),
4974                Some(_) => {}
4975                None => panic!("expected scalar token"),
4976            }
4977        }
4978    }
4979
4980    #[test]
4981    fn iterator_next_records_error_and_then_stays_empty() {
4982        let mut scanner = Scanner::new(StrInput::new("\"unterminated"));
4983
4984        while scanner.next().is_some() {}
4985
4986        let error = scanner
4987            .get_error()
4988            .expect("scanner should retain the error");
4989        assert_eq!(error.info(), "unclosed quote");
4990        assert!(scanner.next().is_none());
4991    }
4992
4993    #[test]
4994    fn next_token_returns_none_after_stream_end() {
4995        let mut scanner = Scanner::new(StrInput::new(""));
4996
4997        while let Some(token) = scanner.next_token().unwrap() {
4998            if matches!(token.1, TokenType::StreamEnd) {
4999                break;
5000            }
5001        }
5002
5003        assert!(scanner.stream_started());
5004        assert!(scanner.stream_ended());
5005        assert!(scanner.next_token().unwrap().is_none());
5006    }
5007
5008    #[test]
5009    fn directive_name_must_be_present() {
5010        assert_eq!(
5011            first_scanner_error_info("%\n"),
5012            "while scanning a directive, could not find expected directive name"
5013        );
5014    }
5015
5016    #[test]
5017    fn yaml_directive_requires_dot_between_version_numbers() {
5018        assert_eq!(
5019            first_scanner_error_info("%YAML 1\n"),
5020            "while scanning a YAML directive, did not find expected digit or '.' character"
5021        );
5022    }
5023
5024    #[test]
5025    fn yaml_directive_requires_major_version_number() {
5026        assert_eq!(
5027            first_scanner_error_info("%YAML .2\n"),
5028            "while scanning a YAML directive, did not find expected version number"
5029        );
5030    }
5031
5032    #[test]
5033    fn yaml_directive_rejects_extremely_long_version_number() {
5034        assert_eq!(
5035            first_scanner_error_info("%YAML 1234567890.2\n"),
5036            "while scanning a YAML directive, found extremely long version number"
5037        );
5038    }
5039
5040    #[test]
5041    fn tag_directive_handle_must_end_with_bang() {
5042        assert_eq!(
5043            first_scanner_error_info("%TAG !bad tag:example.com,2024:\n"),
5044            "while parsing a tag directive, did not find expected '!'"
5045        );
5046    }
5047
5048    #[test]
5049    fn tag_directive_handle_must_start_with_bang() {
5050        assert_eq!(
5051            first_scanner_error_info("%TAG bad! tag:example.com,2024:\n"),
5052            "while scanning a tag, did not find expected '!'"
5053        );
5054    }
5055
5056    #[test]
5057    fn tag_directive_prefix_must_start_with_tag_character() {
5058        assert_eq!(
5059            first_scanner_error_info("%TAG !e! `bad\n"),
5060            "invalid global tag character"
5061        );
5062    }
5063
5064    #[test]
5065    fn tag_directive_prefix_must_end_before_invalid_content() {
5066        assert_eq!(
5067            first_scanner_error_info("%TAG !e! tag:example.com^suffix\n"),
5068            "while scanning TAG, did not find expected whitespace or line break"
5069        );
5070    }
5071
5072    #[test]
5073    fn tag_directive_prefix_with_uri_escape_is_owned_and_decoded() {
5074        let mut scanner =
5075            Scanner::new(StrInput::new("%TAG !e! tag:example.com,2024:some%20app/\n"));
5076
5077        loop {
5078            let token = scanner
5079                .next_token()
5080                .expect("valid directive should scan")
5081                .expect("scanner must produce a directive token");
5082            if let TokenType::TagDirective(handle, prefix) = token.1 {
5083                assert!(matches!(handle, Cow::Borrowed("!e!")));
5084                assert!(matches!(prefix, Cow::Owned(_)));
5085                assert_eq!(&*prefix, "tag:example.com,2024:some app/");
5086                break;
5087            }
5088        }
5089    }
5090
5091    #[test]
5092    fn bare_bang_tag_scans_as_non_specific_tag() {
5093        let mut scanner = Scanner::new(StrInput::new("! foo\n"));
5094
5095        loop {
5096            let token = scanner
5097                .next_token()
5098                .expect("valid tag should scan")
5099                .expect("scanner must produce a tag token");
5100            if let TokenType::Tag(handle, suffix) = token.1 {
5101                assert_eq!(&*handle, "");
5102                assert_eq!(&*suffix, "!");
5103                break;
5104            }
5105        }
5106    }
5107
5108    #[test]
5109    fn tag_requires_separation_after_suffix() {
5110        assert_eq!(
5111            first_scanner_error_info("!foo,bar\n"),
5112            "while scanning a tag, did not find expected whitespace or line break"
5113        );
5114    }
5115
5116    #[test]
5117    fn verbatim_tag_requires_uri() {
5118        assert_eq!(
5119            first_scanner_error_info("!<> foo\n"),
5120            "while parsing a tag, did not find expected tag URI"
5121        );
5122    }
5123
5124    #[test]
5125    fn verbatim_tag_requires_closing_angle_bracket() {
5126        assert_eq!(
5127            first_scanner_error_info("!<tag:yaml.org,2002:str foo\n"),
5128            "while scanning a verbatim tag, did not find the expected '>'"
5129        );
5130    }
5131
5132    #[test]
5133    fn tag_uri_escape_requires_hex_digits() {
5134        assert_eq!(
5135            first_scanner_error_info("!!bad%zz foo\n"),
5136            "while parsing a tag, found an invalid escape sequence"
5137        );
5138    }
5139
5140    #[test]
5141    fn tag_uri_escape_rejects_bad_leading_utf8_byte() {
5142        assert_eq!(
5143            first_scanner_error_info("!!bad%80 foo\n"),
5144            "while parsing a tag, found an incorrect leading UTF-8 byte"
5145        );
5146    }
5147
5148    #[test]
5149    fn tag_uri_escape_rejects_bad_trailing_utf8_byte() {
5150        assert_eq!(
5151            first_scanner_error_info("!!bad%C2%41 foo\n"),
5152            "while parsing a tag, found an incorrect trailing UTF-8 byte"
5153        );
5154    }
5155
5156    #[test]
5157    fn tag_uri_escape_rejects_invalid_utf8_codepoint() {
5158        assert_eq!(
5159            first_scanner_error_info("!!bad%F4%90%80%80 foo\n"),
5160            "while parsing a tag, found an invalid UTF-8 codepoint"
5161        );
5162    }
5163
5164    #[test]
5165    fn anchors_and_aliases_require_names() {
5166        let expected =
5167            "while scanning an anchor or alias, did not find expected alphabetic or numeric character";
5168
5169        assert_eq!(first_scanner_error_info("& \n"), expected);
5170        assert_eq!(first_scanner_error_info("* \n"), expected);
5171    }
5172
5173    #[test]
5174    fn document_end_marker_rejects_trailing_content() {
5175        assert_eq!(
5176            first_scanner_error_info("... trailing\n"),
5177            "invalid content after document end marker"
5178        );
5179    }
5180
5181    #[test]
5182    fn reserved_indicators_are_rejected_outside_directives() {
5183        assert_eq!(
5184            first_scanner_error_info(" @\n"),
5185            "unexpected character: `@'"
5186        );
5187    }
5188
5189    #[test]
5190    fn flow_block_entry_indicator_is_rejected() {
5191        assert_eq!(
5192            first_scanner_error_info("[- ]\n"),
5193            r#""-" is only valid inside a block"#
5194        );
5195    }
5196
5197    #[test]
5198    fn block_entry_after_tabbed_separator_reports_specific_error() {
5199        assert_eq!(
5200            first_scanner_error_info("-\t- value\n"),
5201            "'-' must be followed by a valid YAML whitespace"
5202        );
5203    }
5204
5205    #[test]
5206    fn document_indicator_reports_unclosed_flow_collection() {
5207        assert_eq!(first_scanner_error_info("[\n---\n"), "unclosed bracket '['");
5208    }
5209
5210    #[test]
5211    fn block_scalar_header_rejects_trailing_content() {
5212        assert_eq!(
5213            first_scanner_error_info("|+ trailing\n"),
5214            "while scanning a block scalar, did not find expected comment or line break"
5215        );
5216    }
5217
5218    #[test]
5219    fn block_scalar_rejects_zero_indent_indicator() {
5220        let expected = "while scanning a block scalar, found an indentation indicator equal to 0";
5221
5222        assert_eq!(first_scanner_error_info("|0\n"), expected);
5223        assert_eq!(first_scanner_error_info("|+0\n"), expected);
5224    }
5225
5226    #[test]
5227    fn empty_block_scalar_at_eof_honors_chomping() {
5228        assert_eq!(first_scalar_value("|-\n"), "");
5229        assert_eq!(first_scalar_value("|+\n"), "\n");
5230    }
5231
5232    #[test]
5233    fn explicit_indent_block_scalar_can_end_at_document_marker() {
5234        assert_eq!(first_scalar_value("|1\n...\n"), "");
5235    }
5236
5237    #[test]
5238    fn root_explicit_indent_block_scalar_rejects_underindented_content() {
5239        assert_eq!(
5240            first_scanner_error_info("|2\nx\n"),
5241            "wrongly indented line in block scalar"
5242        );
5243    }
5244
5245    #[test]
5246    fn quoted_scalar_rejects_document_indicator_at_line_start() {
5247        assert_eq!(
5248            first_scanner_error_info("\"one\n---\ntwo\"\n"),
5249            "while scanning a quoted scalar, found unexpected document indicator"
5250        );
5251    }
5252
5253    #[test]
5254    fn quoted_scalar_rejects_tab_indentation_after_line_break() {
5255        assert_eq!(
5256            first_scanner_error_info("a: \"one\n\tbad\"\n"),
5257            "tab cannot be used as indentation"
5258        );
5259    }
5260
5261    #[test]
5262    fn quoted_scalar_rejects_underindented_continuation() {
5263        assert_eq!(
5264            first_scanner_error_info("a: \"one\nbad\"\n"),
5265            "invalid indentation in multiline quoted scalar"
5266        );
5267    }
5268
5269    #[test]
5270    fn indented_flow_scalar_reports_invalid_indentation() {
5271        assert_eq!(
5272            first_scanner_error_info("a:\n  [\nfoo]\n"),
5273            "invalid indentation"
5274        );
5275    }
5276
5277    #[test]
5278    fn required_simple_key_requires_value_at_stream_end() {
5279        assert_eq!(
5280            first_scanner_error_info("a:\n&b\n- c\n"),
5281            "simple key expect ':'"
5282        );
5283    }
5284
5285    #[test]
5286    fn plain_scalar_rejects_dash_before_flow_indicator() {
5287        assert_eq!(
5288            first_scanner_error_info("[-]\n"),
5289            "plain scalar cannot start with '-' followed by ,[]{}"
5290        );
5291    }
5292
5293    #[test]
5294    fn explicit_key_rejects_tab_after_indicator() {
5295        assert_eq!(
5296            first_scanner_error_info("? \tfoo\n"),
5297            "tabs disallowed in this context"
5298        );
5299    }
5300
5301    #[test]
5302    fn flow_mapping_rejects_adjacent_collection_value_after_plain_key() {
5303        assert_eq!(
5304            first_scanner_error_info("[a:[]]\n"),
5305            "':' may not precede any of `[{` in flow mapping"
5306        );
5307    }
5308
5309    #[test]
5310    fn implicit_flow_mapping_colon_cannot_move_to_next_line() {
5311        assert_eq!(
5312            first_scanner_error_info("[foo\n: bar]\n"),
5313            "illegal placement of ':' indicator"
5314        );
5315    }
5316
5317    #[test]
5318    fn stale_simple_key_token_position_is_a_scan_error() {
5319        let mut scanner = Scanner::new(StrInput::new(": value\n"));
5320        scanner.fetch_stream_start();
5321        scanner.tokens.clear();
5322        scanner.tokens_parsed = 1;
5323
5324        let simple_key = scanner
5325            .simple_keys
5326            .last_mut()
5327            .expect("stream start should create a simple key slot");
5328        simple_key.possible = true;
5329        simple_key.token_number = 0;
5330
5331        let error = scanner
5332            .fetch_value()
5333            .expect_err("stale simple key should be reported as a scan error");
5334        assert_eq!(error.info(), "simple key is no longer valid");
5335    }
5336
5337    #[test]
5338    fn issue14_alias_scanner_consumes_colon_as_name_character() {
5339        let mut scanner = Scanner::new(StrInput::new("*foo: bar\n"));
5340
5341        assert!(matches!(
5342            scanner.next_token().unwrap().unwrap().1,
5343            TokenType::StreamStart(_)
5344        ));
5345
5346        let token = scanner.next_token().unwrap().unwrap();
5347
5348        assert!(
5349            matches!(token.1, TokenType::Alias(ref name) if name.as_ref() == "foo:"),
5350            "expected `*foo: bar` to start with Alias(\"foo:\"), got {token:?}"
5351        );
5352    }
5353
5354    #[test]
5355    fn issue14_anchor_scanner_consumes_colon_as_name_character() {
5356        let mut scanner = Scanner::new(StrInput::new("&foo: bar\n"));
5357
5358        assert!(matches!(
5359            scanner.next_token().unwrap().unwrap().1,
5360            TokenType::StreamStart(_)
5361        ));
5362
5363        let token = scanner.next_token().unwrap().unwrap();
5364
5365        assert!(
5366            matches!(token.1, TokenType::Anchor(ref name) if name.as_ref() == "foo:"),
5367            "expected `&foo: bar` to start with Anchor(\"foo:\"), got {token:?}"
5368        );
5369    }
5370}
granit_parser/scanner.rs

granit_parser/
scanner.rs