granit_parser/
scanner.rs

1//! Home to the YAML Scanner.
2//!
3//! The scanner is the lowest-level parsing utility. It is the lexer / tokenizer, reading input a
4//! character at a time and emitting tokens that can later be interpreted by the [`crate::parser`]
5//! to check for more context and validity.
6//!
7//! Due to the grammar of YAML, the scanner has to have some context and is not error-free.
8
9#![allow(clippy::cast_possible_wrap)]
10#![allow(clippy::cast_sign_loss)]
11
12use alloc::{
13    borrow::{Cow, ToOwned},
14    collections::VecDeque,
15    string::String,
16    vec::Vec,
17};
18use core::{char, fmt};
19
20use crate::{
21    char_traits::{
22        as_hex, is_anchor_char, is_blank_or_breakz, is_bom, is_break, is_breakz, is_flow, is_hex,
23        is_tag_char, is_uri_char,
24    },
25    input::{BorrowedInput, SkipTabs},
26};
27
/// Maximum number of characters the scanner may look ahead while disambiguating a simple key.
///
/// NOTE(review): the value presumably mirrors the 1024-character limit the YAML specification
/// places on implicit keys; confirm against the code that reads this constant.
const SIMPLE_KEY_MAX_LOOKAHEAD: usize = 1024;
30
/// The character encoding of the scanned input.
///
/// UTF-8 is the only encoding supported at the moment.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TEncoding {
    /// The input is encoded as UTF-8.
    Utf8,
}
37
/// How a YAML scalar was written in the source.
///
/// The variant order is significant: the derived ordering follows the declaration order.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum ScalarStyle {
    /// A plain (unquoted) scalar.
    Plain,
    /// A scalar enclosed in single quotes.
    SingleQuoted,
    /// A scalar enclosed in double quotes.
    DoubleQuoted,

    /// A literal block scalar, introduced by `|`.
    ///
    /// See [8.1.2](https://yaml.org/spec/1.2.2/#812-literal-style).
    /// Every indented character of a literal block is content, white space included. Characters
    /// cannot be escaped and long lines cannot be broken.
    Literal,
    /// A folded block scalar, introduced by `>`.
    ///
    /// See [8.1.3](https://yaml.org/spec/1.2.2/#813-folded-style).
    /// Every indented character of a folded block is content, white space included. Characters
    /// cannot be escaped, but the content undergoes line folding, which makes it possible to
    /// break long lines.
    Folded,
}
62
/// The offsets carried by a [`Marker`].
///
/// A YAML input is either a complete `&str` (stable backing storage) or a streaming character
/// source. With a stable input both a character index and a byte offset can be tracked. With a
/// streaming input a byte offset is generally not useful (it may not map to any meaningful
/// underlying file/source), which is why the byte offset is optional.
#[derive(Debug, Default, Clone, Copy)]
pub struct MarkerOffsets {
    /// Index into the source, counted in characters.
    chars: usize,
    /// Offset into the source, counted in bytes, when one is available.
    bytes: Option<usize>,
}

impl PartialEq for MarkerOffsets {
    /// Compare two offsets by their character index only.
    fn eq(&self, rhs: &Self) -> bool {
        // The byte offset is an optional diagnostic extra that can differ from one input
        // backend to another (`&str` versus streaming), so it is deliberately left out of the
        // comparison.
        let (lhs_chars, rhs_chars) = (self.chars, rhs.chars);
        lhs_chars == rhs_chars
    }
}

impl Eq for MarkerOffsets {}
87
88/// A location in a YAML document.
89#[derive(Clone, Copy, PartialEq, Debug, Eq, Default)]
90pub struct Marker {
91    /// Offsets in the source.
92    offsets: MarkerOffsets,
93    /// The line (1-indexed).
94    line: usize,
95    /// The column (0-indexed).
96    col: usize,
97}
98
99impl Marker {
100    /// Create a new [`Marker`] at the given position.
101    #[must_use]
102    pub fn new(index: usize, line: usize, col: usize) -> Marker {
103        Marker {
104            offsets: MarkerOffsets {
105                chars: index,
106                bytes: None,
107            },
108            line,
109            col,
110        }
111    }
112
113    /// Return a copy of the marker with the given optional byte offset.
114    #[must_use]
115    pub fn with_byte_offset(mut self, byte_offset: Option<usize>) -> Marker {
116        self.offsets.bytes = byte_offset;
117        self
118    }
119
120    /// Return the index (in characters) of the marker in the source.
121    #[must_use]
122    pub fn index(&self) -> usize {
123        self.offsets.chars
124    }
125
126    /// Return the byte offset of the marker in the source, if available.
127    #[must_use]
128    pub fn byte_offset(&self) -> Option<usize> {
129        self.offsets.bytes
130    }
131
132    /// Return the line of the marker in the source.
133    #[must_use]
134    pub fn line(&self) -> usize {
135        self.line
136    }
137
138    /// Return the column of the marker in the source.
139    #[must_use]
140    pub fn col(&self) -> usize {
141        self.col
142    }
143}
144
145/// A range of locations in a YAML document.
146#[derive(Clone, Copy, PartialEq, Debug, Eq, Default)]
147pub struct Span {
148    /// The start (inclusive) of the range.
149    pub start: Marker,
150    /// The end (exclusive) of the range.
151    pub end: Marker,
152
153    /// Optional indentation hint associated with this span.
154    ///
155    /// This is only meaningful for certain parser-emitted events (notably: block mapping keys).
156    /// When indentation is not meaningful or cannot be provided, it must be `None`.
157    pub indent: Option<usize>,
158
159    /// Optional source marker for the explicit tag token attached to this node.
160    ///
161    /// This is only meaningful for parser-emitted node events that carry a resolved tag, such as
162    /// [`Event::Scalar`](crate::Event::Scalar),
163    /// [`Event::SequenceStart`](crate::Event::SequenceStart), or
164    /// [`Event::MappingStart`](crate::Event::MappingStart). The normal [`Span::start`] and
165    /// [`Span::end`] continue to cover the node value or collection; `tag_start` points to the
166    /// tag token when that token appears at a different source location.
167    pub tag_start: Option<Marker>,
168}
169
170impl Span {
171    /// Create a new [`Span`] for the given range.
172    #[must_use]
173    pub fn new(start: Marker, end: Marker) -> Span {
174        Span {
175            start,
176            end,
177            indent: None,
178            tag_start: None,
179        }
180    }
181
182    /// Create an empty [`Span`] at a given location.
183    ///
184    /// An empty span doesn't contain any characters, but its position may still be meaningful.
185    /// For example, for an indented sequence [`SequenceEnd`] has a location but an empty span.
186    ///
187    /// [`SequenceEnd`]: crate::Event::SequenceEnd
188    #[must_use]
189    pub fn empty(mark: Marker) -> Span {
190        Span {
191            start: mark,
192            end: mark,
193            indent: None,
194            tag_start: None,
195        }
196    }
197
198    /// Return a copy of this [`Span`] with the given indentation hint.
199    #[must_use]
200    pub fn with_indent(mut self, indent: Option<usize>) -> Span {
201        self.indent = indent;
202        self
203    }
204
205    /// Return a copy of this [`Span`] with the given explicit tag-token start marker.
206    #[must_use]
207    pub fn with_tag_start(mut self, tag_start: Option<Marker>) -> Span {
208        self.tag_start = tag_start;
209        self
210    }
211
212    /// Return the source marker of the explicit tag token attached to this node, if any.
213    ///
214    /// The regular span still covers the node value or collection. This accessor is useful for
215    /// diagnostics that should point at the tag itself, especially when a tagged block collection
216    /// begins on a later line than the tag token.
217    #[must_use]
218    pub fn tag_start(&self) -> Option<Marker> {
219        self.tag_start
220    }
221
222    /// Return the length of the span (in characters).
223    #[must_use]
224    pub fn len(&self) -> usize {
225        self.end.index() - self.start.index()
226    }
227
228    /// Return whether the [`Span`] has a length of zero.
229    #[must_use]
230    pub fn is_empty(&self) -> bool {
231        self.len() == 0
232    }
233
234    /// Return the byte range of the span, if available.
235    #[must_use]
236    pub fn byte_range(&self) -> Option<core::ops::Range<usize>> {
237        let start = self.start.byte_offset()?;
238        let end = self.end.byte_offset()?;
239        Some(start..end)
240    }
241
242    /// Return the source text covered by this span, if byte offsets are available
243    /// and the range is valid for the provided input.
244    #[must_use]
245    pub fn slice<'source>(&self, source: &'source str) -> Option<&'source str> {
246        source.get(self.byte_range()?)
247    }
248}
249
/// Where a YAML source comment sits relative to the surrounding content.
///
/// These are the placements the parser currently recognizes:
///
/// ```yaml
/// # Above
/// key: value # Right
///
/// # Free
///
/// next: value
///
/// # Last
/// ```
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum Placement {
    /// A comment on its own line, immediately before another YAML token.
    ///
    /// Such a comment usually describes, visually, the node that follows it. A run of own-line
    /// comments with no blank line between them is `Above` as a whole, so that a comment block
    /// can be attached to the next YAML element as a group.
    Above,
    /// A comment on the same line as, and after, YAML content or syntax. `key: value # Right`
    /// is one example; `- # Right` for an empty sequence entry is another.
    Right,
    /// A standalone comment on its own line, separated from the YAML tokens around it.
    ///
    /// This is the fallback: it applies to a comment that is not on the same line as content,
    /// is not immediately above a following token, and is not the final comment of the stream.
    /// Consumers should consider that a `Free` comment has no obvious neighboring node.
    #[default]
    Free,
    /// A comment on its own line at the end of the input stream.
    ///
    /// Blank lines may follow a `Last` comment, but no other YAML token appears before
    /// `StreamEnd`.
    Last,
}
288
289/// A YAML comment captured from the source.
290///
291/// Comments are presentation metadata, not YAML data. This type carries the raw comment payload,
292/// source span, and a best-effort [`Placement`] hint for callers that want to correlate comments
293/// with nearby YAML presentation.
294#[derive(Clone, PartialEq, Debug, Eq)]
295pub struct Comment<'input> {
296    /// Span covering the whole source comment, including `#` and excluding the line break.
297    pub span: Span,
298    /// Raw comment payload exactly after `#`, excluding only the line break.
299    ///
300    /// Leading spaces are preserved, including a single space immediately after `#` when present.
301    pub text: Cow<'input, str>,
302    /// Best-effort placement of this comment relative to nearby YAML content.
303    pub placement: Placement,
304}
305
306impl<'input> Comment<'input> {
307    /// Create a captured YAML comment from a source span and raw payload.
308    ///
309    /// The placement defaults to [`Placement::Free`]. Use [`Comment::with_placement`] when the
310    /// caller already knows a more specific placement.
311    #[must_use]
312    pub fn new(span: Span, text: impl Into<Cow<'input, str>>) -> Self {
313        Self {
314            span,
315            text: text.into(),
316            placement: Placement::Free,
317        }
318    }
319
320    /// Return this comment with the given placement.
321    #[must_use]
322    pub fn with_placement(mut self, placement: Placement) -> Self {
323        self.placement = placement;
324        self
325    }
326
327    /// Return the comment payload with surrounding whitespace removed.
328    ///
329    /// This helper is ergonomic only. The raw [`Self::text`] payload remains unchanged.
330    #[must_use]
331    pub fn trimmed_text(&self) -> &str {
332        self.text.trim()
333    }
334}
335
336impl AsRef<str> for Comment<'_> {
337    fn as_ref(&self) -> &str {
338        self.text.as_ref()
339    }
340}
341
342/// An error that occurred while scanning.
343#[derive(Clone, PartialEq, Debug, Eq)]
344pub struct ScanError {
345    /// The position at which the error happened in the source.
346    mark: Marker,
347    /// Human-readable details about the error.
348    info: String,
349}
350
351impl ScanError {
352    /// Create a new error from a location and an error string.
353    #[must_use]
354    #[cold]
355    pub fn new(loc: Marker, info: String) -> ScanError {
356        ScanError { mark: loc, info }
357    }
358
359    /// Convenience alias for string slices.
360    #[must_use]
361    #[cold]
362    pub fn new_str(loc: Marker, info: &str) -> ScanError {
363        ScanError {
364            mark: loc,
365            info: info.to_owned(),
366        }
367    }
368
369    #[cold]
370    pub(crate) fn into_result<T>(self) -> Result<T, ScanError> {
371        Err(self)
372    }
373
374    /// Return the marker pointing to the error in the source.
375    #[must_use]
376    pub fn marker(&self) -> &Marker {
377        &self.mark
378    }
379
380    /// Return the information string describing the error that happened.
381    #[must_use]
382    pub fn info(&self) -> &str {
383        self.info.as_ref()
384    }
385}
386
387impl fmt::Display for ScanError {
388    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
389        write!(
390            f,
391            "{} at char {} line {} column {}",
392            self.info,
393            self.mark.index(),
394            self.mark.line(),
395            self.mark.col() + 1
396        )
397    }
398}
399
400impl core::error::Error for ScanError {}
401
402/// The contents of a scanner token.
403#[derive(Clone, PartialEq, Debug, Eq)]
404pub enum TokenType<'input> {
405    /// The start of the stream. Sent first, before even [`TokenType::DocumentStart`].
406    StreamStart(TEncoding),
407    /// The end of the stream, EOF.
408    StreamEnd,
409    /// A YAML version directive.
410    VersionDirective(
411        /// Major version number.
412        u32,
413        /// Minor version number.
414        u32,
415    ),
416    /// A YAML tag directive (e.g.: `!!str`, `!foo!bar`, ...).
417    TagDirective(
418        /// Tag directive handle, such as `!` or `!app!`.
419        Cow<'input, str>,
420        /// Tag URI prefix associated with the handle.
421        Cow<'input, str>,
422    ),
423    /// The start of a YAML document (`---`).
424    DocumentStart,
425    /// The end of a YAML document (`...`).
426    DocumentEnd,
427    /// The start of a sequence block.
428    ///
429    /// Sequence blocks are arrays starting with a `-`.
430    BlockSequenceStart,
431    /// The start of a block mapping.
432    ///
433    /// Block mappings are key-value collections written with `key: value` entries.
434    BlockMappingStart,
435    /// End of the corresponding `BlockSequenceStart` or `BlockMappingStart`.
436    BlockEnd,
437    /// Start of an inline sequence (`[ a, b ]`).
438    FlowSequenceStart,
439    /// End of an inline sequence.
440    FlowSequenceEnd,
441    /// Start of an inline mapping (`{ a: b, c: d }`).
442    FlowMappingStart,
443    /// End of an inline mapping.
444    FlowMappingEnd,
445    /// An entry in a block sequence (see [`TokenType::BlockSequenceStart`]).
446    BlockEntry,
447    /// An entry in a flow sequence (see [`TokenType::FlowSequenceStart`]).
448    FlowEntry,
449    /// A key in a mapping.
450    Key,
451    /// A value in a mapping.
452    Value,
453    /// A reference to a previously defined anchor.
454    Alias(Cow<'input, str>),
455    /// A YAML anchor definition introduced by `&`.
456    Anchor(Cow<'input, str>),
457    /// A YAML tag (starting with bangs `!`).
458    Tag(
459        /// The handle of the tag.
460        Cow<'input, str>,
461        /// The suffix of the tag.
462        Cow<'input, str>,
463    ),
464    /// A regular YAML scalar.
465    Scalar(ScalarStyle, Cow<'input, str>),
466    /// A YAML source comment.
467    ///
468    /// The token payload carries the raw text exactly after `#`, the source span, and an initial
469    /// [`Placement`] hint. The token's companion [`Span`] is the same as [`Comment::span`].
470    Comment(
471        /// Captured comment metadata.
472        Comment<'input>,
473    ),
474    /// A reserved YAML directive.
475    ReservedDirective(
476        /// Directive name.
477        String,
478        /// Directive parameters, split on YAML whitespace.
479        Vec<String>,
480    ),
481}
482
483/// A scanner token.
484#[derive(Clone, PartialEq, Debug, Eq)]
485pub struct Token<'input>(
486    /// Source span covered by this token.
487    pub Span,
488    /// Token payload emitted by the scanner.
489    pub TokenType<'input>,
490);
491
492/// Compact comment metadata used only inside the scanner queue.
493///
494/// The queued token already stores the source span, so storing a full public [`Comment`] there
495/// duplicates a large [`Span`] and inflates every queued token.
496#[derive(Clone, PartialEq, Debug, Eq)]
497pub(crate) struct QueuedComment<'input> {
498    pub(crate) text: Cow<'input, str>,
499    pub(crate) placement: Placement,
500}
501
502impl<'input> QueuedComment<'input> {
503    fn into_public(self, span: Span) -> Comment<'input> {
504        Comment::new(span, self.text).with_placement(self.placement)
505    }
506}
507
508impl<'input> From<Comment<'input>> for QueuedComment<'input> {
509    fn from(comment: Comment<'input>) -> Self {
510        Self {
511            text: comment.text,
512            placement: comment.placement,
513        }
514    }
515}
516
517/// Token payload used in the scanner's internal queue.
518///
519/// This mirrors [`TokenType`] but stores comments without their span. Public [`Token`] values are
520/// reconstructed when the scanner emits them.
521#[derive(Clone, PartialEq, Debug, Eq)]
522pub(crate) enum QueuedTokenType<'input> {
523    StreamStart(TEncoding),
524    StreamEnd,
525    VersionDirective(u32, u32),
526    TagDirective(Cow<'input, str>, Cow<'input, str>),
527    DocumentStart,
528    DocumentEnd,
529    BlockSequenceStart,
530    BlockMappingStart,
531    BlockEnd,
532    FlowSequenceStart,
533    FlowSequenceEnd,
534    FlowMappingStart,
535    FlowMappingEnd,
536    BlockEntry,
537    FlowEntry,
538    Key,
539    Value,
540    Alias(Cow<'input, str>),
541    Anchor(Cow<'input, str>),
542    Tag(Cow<'input, str>, Cow<'input, str>),
543    Scalar(ScalarStyle, Cow<'input, str>),
544    Comment(QueuedComment<'input>),
545    ReservedDirective(String, Vec<String>),
546}
547
548impl<'input> QueuedTokenType<'input> {
549    fn into_public(self, span: Span) -> TokenType<'input> {
550        match self {
551            Self::StreamStart(encoding) => TokenType::StreamStart(encoding),
552            Self::StreamEnd => TokenType::StreamEnd,
553            Self::VersionDirective(major, minor) => TokenType::VersionDirective(major, minor),
554            Self::TagDirective(handle, prefix) => TokenType::TagDirective(handle, prefix),
555            Self::DocumentStart => TokenType::DocumentStart,
556            Self::DocumentEnd => TokenType::DocumentEnd,
557            Self::BlockSequenceStart => TokenType::BlockSequenceStart,
558            Self::BlockMappingStart => TokenType::BlockMappingStart,
559            Self::BlockEnd => TokenType::BlockEnd,
560            Self::FlowSequenceStart => TokenType::FlowSequenceStart,
561            Self::FlowSequenceEnd => TokenType::FlowSequenceEnd,
562            Self::FlowMappingStart => TokenType::FlowMappingStart,
563            Self::FlowMappingEnd => TokenType::FlowMappingEnd,
564            Self::BlockEntry => TokenType::BlockEntry,
565            Self::FlowEntry => TokenType::FlowEntry,
566            Self::Key => TokenType::Key,
567            Self::Value => TokenType::Value,
568            Self::Alias(name) => TokenType::Alias(name),
569            Self::Anchor(name) => TokenType::Anchor(name),
570            Self::Tag(handle, suffix) => TokenType::Tag(handle, suffix),
571            Self::Scalar(style, value) => TokenType::Scalar(style, value),
572            Self::Comment(comment) => TokenType::Comment(comment.into_public(span)),
573            Self::ReservedDirective(name, params) => TokenType::ReservedDirective(name, params),
574        }
575    }
576}
577
578impl<'input> From<TokenType<'input>> for QueuedTokenType<'input> {
579    fn from(token: TokenType<'input>) -> Self {
580        match token {
581            TokenType::StreamStart(encoding) => Self::StreamStart(encoding),
582            TokenType::StreamEnd => Self::StreamEnd,
583            TokenType::VersionDirective(major, minor) => Self::VersionDirective(major, minor),
584            TokenType::TagDirective(handle, prefix) => Self::TagDirective(handle, prefix),
585            TokenType::DocumentStart => Self::DocumentStart,
586            TokenType::DocumentEnd => Self::DocumentEnd,
587            TokenType::BlockSequenceStart => Self::BlockSequenceStart,
588            TokenType::BlockMappingStart => Self::BlockMappingStart,
589            TokenType::BlockEnd => Self::BlockEnd,
590            TokenType::FlowSequenceStart => Self::FlowSequenceStart,
591            TokenType::FlowSequenceEnd => Self::FlowSequenceEnd,
592            TokenType::FlowMappingStart => Self::FlowMappingStart,
593            TokenType::FlowMappingEnd => Self::FlowMappingEnd,
594            TokenType::BlockEntry => Self::BlockEntry,
595            TokenType::FlowEntry => Self::FlowEntry,
596            TokenType::Key => Self::Key,
597            TokenType::Value => Self::Value,
598            TokenType::Alias(name) => Self::Alias(name),
599            TokenType::Anchor(name) => Self::Anchor(name),
600            TokenType::Tag(handle, suffix) => Self::Tag(handle, suffix),
601            TokenType::Scalar(style, value) => Self::Scalar(style, value),
602            TokenType::Comment(comment) => Self::Comment(comment.into()),
603            TokenType::ReservedDirective(name, params) => Self::ReservedDirective(name, params),
604        }
605    }
606}
607
608/// A compact token stored by the scanner before it is emitted publicly.
609#[derive(Clone, PartialEq, Debug, Eq)]
610pub(crate) struct QueuedToken<'input>(pub(crate) Span, pub(crate) QueuedTokenType<'input>);
611
612impl<'input> QueuedToken<'input> {
613    fn into_public(self) -> Token<'input> {
614        Token(self.0, self.1.into_public(self.0))
615    }
616}
617
618impl<'input> From<Token<'input>> for QueuedToken<'input> {
619    fn from(token: Token<'input>) -> Self {
620        Self(token.0, token.1.into())
621    }
622}
623
624/// A scalar that was parsed and may correspond to a simple key.
625///
626/// Upon scanning the following YAML:
627/// ```yaml
628/// a: b
629/// ```
630/// We do not know that `a` is a key for a map until we have reached the following `:`. For this
631/// YAML, we would store `a` as a scalar token in the [`Scanner`], but not emit it yet. It would be
632/// kept inside the scanner until more context is fetched and we are able to know whether it is a
633/// plain scalar or a key.
634///
635/// For example, see the following two YAML documents:
636/// ```yaml
637/// ---
638/// a: b # Here, `a` is a key.
639/// ...
640/// ---
641/// a # Here, `a` is a plain scalar.
642/// ...
643/// ```
644/// An instance of [`SimpleKey`] is created in the [`Scanner`] when such ambiguity occurs.
645///
646/// In both documents, scanning `a` would lead to the creation of a [`SimpleKey`] with
647/// [`Self::possible`] set to `true`. The token for `a` would be pushed in the [`Scanner`] but not
648/// yet emitted. Instead, more context would be fetched (through [`Scanner::fetch_more_tokens`]).
649///
650/// In the first document, upon reaching the `:`, the [`SimpleKey`] would be inspected and our
651/// scalar `a` since it is a possible key, would be "turned" into a key. This is done by prepending
652/// a [`TokenType::Key`] to our scalar token in the [`Scanner`]. This way, the
653/// [`crate::parser::Parser`] would read the [`TokenType::Key`] token before the
654/// [`TokenType::Scalar`] token.
655///
656/// In the second document however, reaching EOF would mark the [`SimpleKey`] as no longer possible,
657/// and no [`TokenType::Key`] would be emitted by the scanner.
658#[derive(Clone, PartialEq, Debug, Eq)]
659struct SimpleKey {
660    /// Whether the token this [`SimpleKey`] refers to may still be a key.
661    ///
662    /// Sometimes, when we have more context, we notice that what we thought could be a key no
663    /// longer can be. In that case, [`Self::possible`] is set to `false`.
664    ///
665    /// For instance, let us consider the following invalid YAML:
666    /// ```yaml
667    /// key
668    ///   : value
669    /// ```
670    /// Upon reading the `\n` after `key`, the [`SimpleKey`] that was created for `key` is no longer
671    /// possible and [`Self::possible`] is set to `false`.
672    possible: bool,
673    /// Whether the token this [`SimpleKey`] refers to is required to be a key.
674    ///
675    /// With more context, we may know for sure that the token must be a key. If later input makes
676    /// that impossible, the scanner must report an error instead of silently treating the token as a
677    /// plain scalar.
678    ///
679    /// This happens for simple keys at the current block indentation where the surrounding
680    /// collection requires the next token to be a mapping key.
681    required: bool,
682    /// The index of the token referred to by the [`SimpleKey`].
683    ///
684    /// This is the index in the scanner, which takes into account both the tokens that have been
685    /// emitted and those about to be emitted. See [`Scanner::tokens_parsed`] and
686    /// [`Scanner::tokens`] for more details.
687    token_number: usize,
688    /// The position at which the token the [`SimpleKey`] refers to is.
689    mark: Marker,
690}
691
692impl SimpleKey {
693    /// Create a new [`SimpleKey`] at the given `Marker` and with the given flow level.
694    fn new(mark: Marker) -> SimpleKey {
695        SimpleKey {
696            possible: false,
697            required: false,
698            token_number: 0,
699            mark,
700        }
701    }
702}
703
/// One entry of the stack of indentation levels.
#[derive(Debug, Default, Clone)]
struct Indent {
    /// The former indentation level.
    indent: isize,
    /// Whether closing this indentation level produces a `BlockEnd` token.
    ///
    /// Not every indentation level starts a block. For instance:
    /// ```yaml
    /// -
    ///   foo # ok
    /// -
    /// bar # ko, bar needs to be indented further than the `-`.
    /// - [
    ///  baz, # ok
    /// quux # ko, quux needs to be indented further than the '-'.
    /// ] # ko, the closing bracket needs to be indented further than the `-`.
    /// ```
    ///
    /// The `-` creates an indentation level that belongs to a single entry of the sequence. If a
    /// `BlockEnd` were emitted whenever such a level ends, there would be one `BlockEnd` per
    /// entry, whereas exactly one is needed to end the sequence.
    needs_block_end: bool,
}
728
/// What is known about an implicit mapping.
///
/// An implicit mapping is a mapping inside a flow sequence whose opening `{` has been left out:
/// ```yaml
/// [ a: b, c: d ]
/// # Equivalent to
/// [ { a: b }, { c: d } ]
/// # Equivalent to
/// - a: b
/// - c: d
/// ```
///
/// This state has to be tracked with care for every nested flow sequence. In the example above,
/// a [`FlowMappingStart`] event must be emitted on `a` and on `c` even though no character
/// announces it, and a [`FlowMappingEnd`] event must likewise be emitted on the `,` or the `]`.
/// When the state is tracked incorrectly, these events may be left out or emitted out-of-order.
///
/// [`FlowMappingStart`]: TokenType::FlowMappingStart
/// [`FlowMappingEnd`]: TokenType::FlowMappingEnd
#[derive(PartialEq, Debug)]
enum ImplicitMappingState {
    /// An implicit mapping may follow.
    ///
    /// This is the state right after the opening `[` has been encountered: more context is
    /// needed to tell whether an implicit mapping follows.
    Possible,
    /// The scanner is inside the implicit mapping.
    ///
    /// This state is not entered immediately: the `:` must have been encountered first.
    ///
    /// NOTE(review): the meaning of the `u8` payload is not documented here; presumably a flow
    /// nesting level. Confirm against the code that constructs this variant.
    Inside(u8),
}
762
763/// The YAML scanner.
764///
765/// This corresponds to the low-level interface when reading YAML. The scanner emits tokens as they
766/// are read (akin to a lexer), but it also holds sufficient context to be able to disambiguate
767/// some of the constructs. It has understanding of indentation and whitespace and is able to
768/// generate error messages for some invalid YAML constructs.
769///
770/// It is however not a full parser and needs [`crate::parser::Parser`] to fully detect invalid
771/// YAML documents.
772#[derive(Debug)]
773#[allow(clippy::struct_excessive_bools)]
774pub struct Scanner<'input, T> {
775    /// The input source.
776    ///
777    /// This must implement [`Input`].
778    input: T,
779    /// The position of the cursor within the reader.
780    mark: Marker,
781    /// Buffer for tokens to be returned.
782    ///
783    /// This buffer can hold some temporary tokens that are not yet ready to be returned. For
784    /// instance, if we just read a scalar, it can be a value or a key if an implicit mapping
785    /// follows. In this case, the token stays in the `VecDeque` but cannot be returned from
786    /// [`Self::next`] until we have more context.
787    tokens: VecDeque<QueuedToken<'input>>,
788    /// The last error that happened.
789    error: Option<ScanError>,
790    /// Error found after one or more already-scanned comment tokens.
791    deferred_error: Option<ScanError>,
792    /// Whether the input may contain `#` comment indicators.
793    comments_possible: bool,
794
795    /// Whether we have already emitted the `StreamStart` token.
796    stream_start_produced: bool,
797    /// Whether we have already emitted the `StreamEnd` token.
798    stream_end_produced: bool,
799    /// Whether the scanner is still in the prefix of the next document.
800    ///
801    /// A BOM may appear in a document prefix, before directives/comments/content. Once a document
802    /// start marker or any content token is scanned, another BOM is document content and must be
803    /// rejected unless it appears inside a quoted scalar.
804    document_prefix_allowed: bool,
805    /// In some flow contexts, the value of a mapping is allowed to be adjacent to the `:`. When it
806    /// is, the index at which the `:` may be must be stored in `adjacent_value_allowed_at`.
807    adjacent_value_allowed_at: usize,
808    /// Whether a simple key could potentially start at the current position.
809    ///
810    /// Simple keys are the opposite of complex keys which are keys starting with `?`.
811    simple_key_allowed: bool,
812    /// A stack of potential simple keys.
813    ///
814    /// Refer to the documentation of [`SimpleKey`] for a more in-depth explanation of what they
815    /// are.
816    simple_keys: smallvec::SmallVec<[SimpleKey; 8]>,
817    /// The current indentation level.
818    indent: isize,
819    /// List of all block indentation levels we are in (except the current one).
820    indents: smallvec::SmallVec<[Indent; 8]>,
821    /// Level of nesting of flow sequences.
822    flow_level: u8,
823    /// The number of tokens that have been returned from the scanner.
824    ///
825    /// This excludes the tokens from [`Self::tokens`].
826    tokens_parsed: usize,
827    /// Whether a token is ready to be taken from [`Self::tokens`].
828    token_available: bool,
829    /// Whether all characters encountered since the last newline were whitespace.
830    leading_whitespace: bool,
831    /// Whether we started a flow mapping at each flow nesting level.
832    ///
833    /// This is used to detect implicit flow mapping starts such as:
834    /// ```yaml
835    /// [ : foo ] # { null: "foo" }
836    /// ```
837    flow_mapping_started: smallvec::SmallVec<[bool; 8]>,
838    /// An array of states, representing whether flow sequences have implicit mappings.
839    ///
840    /// When a flow mapping is possible (when encountering the first `[` or a `,` in a sequence),
841    /// the state is set to [`Possible`].
842    /// When we encounter the `:`, we know we are in an implicit mapping and can set the state to
843    /// [`Inside`].
844    ///
845    /// There is one entry in this [`Vec`] for each nested flow sequence that we are in.
846    /// The entries are created with the opening `[` and popped with the closing `]`.
847    ///
848    /// [`Possible`]: ImplicitMappingState::Possible
849    /// [`Inside`]: ImplicitMappingState::Inside
850    implicit_flow_mapping_states: smallvec::SmallVec<[ImplicitMappingState; 8]>,
851    /// If a plain scalar was terminated by a `#` comment on its line, we set this
852    /// to detect an illegal multiline continuation on the following line.
853    interrupted_plain_by_comment: Option<Marker>,
854    /// Whether the scanner is still validating whitespace after an explicit `?` key indicator.
855    ///
856    /// This stays set across streamed comment tokens so a tab after the comment run is rejected the
857    /// same way it was when that whitespace was scanned in one pass.
858    explicit_key_tab_check_pending: bool,
859    /// A stack of markers for opening brackets `[` and `{`.
860    flow_markers: smallvec::SmallVec<[(Marker, char); 8]>,
861    buf_leading_break: String,
862    buf_trailing_breaks: String,
863    buf_whitespaces: String,
864}
865
866impl<'input, T: BorrowedInput<'input>> Iterator for Scanner<'input, T> {
867    type Item = Token<'input>;
868
869    fn next(&mut self) -> Option<Self::Item> {
870        if self.error.is_some() {
871            return None;
872        }
873        match self.next_token() {
874            Ok(Some(tok)) => {
875                debug_print!(
876                    "    \x1B[;32m\u{21B3} {:?} \x1B[;36m{:?}\x1B[;m",
877                    tok.1,
878                    tok.0
879                );
880                Some(tok)
881            }
882            Ok(tok) => tok,
883            Err(e) => self.stop_after_error(e),
884        }
885    }
886}
887
/// A convenience alias for scanner functions that may fail without returning a value.
///
/// `Ok(())` signals success; `Err` carries the [`ScanError`] describing the failure.
pub type ScanResult = Result<(), ScanError>;
890
/// Accumulator for the text of a flow scalar.
///
/// The buffer either describes a range of the input (so the scalar can later be borrowed) or
/// owns a `String`. All whitespace bookkeeping methods are no-ops on the owned variant.
#[derive(Debug)]
enum FlowScalarBuf {
    /// Candidate for `Cow::Borrowed`.
    ///
    /// `start..end` is the committed verbatim range.
    /// `pending_ws_start..pending_ws_end` is a run of blanks that were seen but not yet
    /// committed (they must be dropped if followed by a line break).
    Borrowed {
        start: usize,
        end: usize,
        pending_ws_start: Option<usize>,
        pending_ws_end: usize,
    },
    /// The scalar text is held in an owned string.
    Owned(String),
}

impl FlowScalarBuf {
    /// Create a borrowed buffer whose committed range is the empty range at `start`, with no
    /// pending whitespace.
    #[inline]
    fn new_borrowed(start: usize) -> Self {
        let (end, pending_ws_end) = (start, start);
        Self::Borrowed {
            start,
            end,
            pending_ws_start: None,
            pending_ws_end,
        }
    }

    /// Create an owned buffer holding an empty string.
    #[inline]
    fn new_owned() -> Self {
        Self::Owned(String::new())
    }

    /// Return the owned string for mutation, or `None` while the buffer is still borrowed.
    #[inline]
    fn as_owned_mut(&mut self) -> Option<&mut String> {
        if let Self::Owned(text) = self {
            Some(text)
        } else {
            None
        }
    }

    /// Fold the pending whitespace run, if any, into the committed range.
    #[inline]
    fn commit_pending_ws(&mut self) {
        let Self::Borrowed {
            end,
            pending_ws_start,
            pending_ws_end,
            ..
        } = self
        else {
            return;
        };

        // `take` clears the pending marker; the range only grows if a run was pending.
        if pending_ws_start.take().is_some() {
            *end = *pending_ws_end;
        }
    }

    /// Record blanks in `ws_start..ws_end` as pending.
    ///
    /// The start of an already-pending run is kept; only its end is moved to `ws_end`.
    #[inline]
    fn note_pending_ws(&mut self, ws_start: usize, ws_end: usize) {
        let Self::Borrowed {
            pending_ws_start,
            pending_ws_end,
            ..
        } = self
        else {
            return;
        };

        let _ = pending_ws_start.get_or_insert(ws_start);
        *pending_ws_end = ws_end;
    }

    /// Forget the pending whitespace run and reset its end to the committed end.
    #[inline]
    fn discard_pending_ws(&mut self) {
        let Self::Borrowed {
            end,
            pending_ws_start,
            pending_ws_end,
            ..
        } = self
        else {
            return;
        };

        *pending_ws_end = *end;
        *pending_ws_start = None;
    }
}
976
977impl<'input, T: BorrowedInput<'input>> Scanner<'input, T> {
978    #[inline]
979    fn promote_flow_scalar_buf_to_owned(
980        &self,
981        start_mark: &Marker,
982        buf: &mut FlowScalarBuf,
983    ) -> Result<(), ScanError> {
984        let FlowScalarBuf::Borrowed {
985            start,
986            end,
987            pending_ws_start: _,
988            pending_ws_end: _,
989        } = *buf
990        else {
991            return Ok(());
992        };
993
994        let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
995            ScanError::new_str(
996                *start_mark,
997                "internal error: input advertised offsets but did not provide a slice",
998            )
999        })?;
1000        *buf = FlowScalarBuf::Owned(slice.to_owned());
1001        Ok(())
1002    }
    /// Try to borrow a slice from the underlying input.
    ///
    /// This method uses the [`BorrowedInput`] trait to safely obtain a slice with the `'input`
    /// lifetime. For inputs that support zero-copy slicing (like `StrInput`), this returns
    /// `Some(&'input str)`. For streaming inputs, this returns `None`.
    ///
    /// `start` and `end` are forwarded unchanged to `slice_borrowed`; callers in this file pass
    /// values obtained from the input's `byte_offset()`.
    #[inline]
    fn try_borrow_slice(&self, start: usize, end: usize) -> Option<&'input str> {
        self.input.slice_borrowed(start, end)
    }
1012
1013    /// Scan a tag handle for a `%TAG` directive as a `Cow<str>`.
1014    ///
1015    /// For `StrInput`, this will borrow from the input when possible. For other inputs, or if
1016    /// borrowing is not possible, it falls back to allocating.
1017    fn scan_tag_handle_directive_cow(
1018        &mut self,
1019        mark: &Marker,
1020    ) -> Result<Cow<'input, str>, ScanError> {
1021        let Some(start) = self.input.byte_offset() else {
1022            return Ok(Cow::Owned(self.scan_tag_handle(true, mark)?));
1023        };
1024
1025        if self.input.look_ch() != '!' {
1026            return Err(ScanError::new_str(
1027                *mark,
1028                "while scanning a tag, did not find expected '!'",
1029            ));
1030        }
1031
1032        // Consume the leading '!'.
1033        self.skip_non_blank();
1034
1035        // Consume ns-word-char (ASCII alphanumeric, '_' or '-') characters.
1036        // This mirrors `StrInput::fetch_while_is_alpha` but avoids allocation.
1037        self.input.lookahead(1);
1038        while self.input.next_is_alpha() {
1039            self.skip_non_blank();
1040            self.input.lookahead(1);
1041        }
1042
1043        // Optional trailing '!'.
1044        if self.input.peek() == '!' {
1045            self.skip_non_blank();
1046        }
1047
1048        let Some(end) = self.input.byte_offset() else {
1049            // Should be impossible if `byte_offset()` was `Some` above, but keep safe fallback.
1050            return Ok(Cow::Owned(self.scan_tag_handle(true, mark)?));
1051        };
1052
1053        let Some(slice) = self.try_borrow_slice(start, end) else {
1054            // Fall back to allocating if zero-copy borrow is not available.
1055            let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
1056                ScanError::new_str(
1057                    *mark,
1058                    "internal error: input advertised slicing but did not provide a slice",
1059                )
1060            })?;
1061            if !slice.ends_with('!') && slice != "!" {
1062                return Err(ScanError::new_str(
1063                    *mark,
1064                    "while parsing a tag directive, did not find expected '!'",
1065                ));
1066            }
1067            return Ok(Cow::Owned(slice.to_owned()));
1068        };
1069
1070        if !slice.ends_with('!') && slice != "!" {
1071            return Err(ScanError::new_str(
1072                *mark,
1073                "while parsing a tag directive, did not find expected '!'",
1074            ));
1075        }
1076
1077        Ok(Cow::Borrowed(slice))
1078    }
1079
1080    /// Scan a tag prefix for a `%TAG` directive as a `Cow<str>`.
1081    ///
1082    /// This borrows from `StrInput` only when no URI escape sequences are encountered. If a `%`
1083    /// escape is present, the prefix must be decoded and therefore allocated.
1084    fn scan_tag_prefix_directive_cow(
1085        &mut self,
1086        start_mark: &Marker,
1087    ) -> Result<Cow<'input, str>, ScanError> {
1088        let Some(start) = self.input.byte_offset() else {
1089            return Ok(Cow::Owned(self.scan_tag_prefix(start_mark)?));
1090        };
1091
1092        // The prefix must start with either '!' (local) or a valid global tag char.
1093        if self.input.look_ch() == '!' {
1094            self.skip_non_blank();
1095        } else if !is_tag_char(self.input.peek()) {
1096            return Err(ScanError::new_str(
1097                *start_mark,
1098                "invalid global tag character",
1099            ));
1100        } else if self.input.peek() == '%' {
1101            // Needs decoding. Fall back to allocating path below.
1102        } else {
1103            self.skip_non_blank();
1104        }
1105
1106        // Consume URI chars while we can stay in the borrowed path.
1107        while is_uri_char(self.input.look_ch()) {
1108            if self.input.peek() == '%' {
1109                break;
1110            }
1111            self.skip_non_blank();
1112        }
1113
1114        // If we encountered an escape sequence, we must decode, therefore allocate.
1115        if self.input.peek() == '%' {
1116            let current = self
1117                .input
1118                .byte_offset()
1119                .expect("byte_offset() must remain available once enabled");
1120            let mut out = if let Some(slice) = self.input.slice_bytes(start, current) {
1121                slice.to_owned()
1122            } else {
1123                String::new()
1124            };
1125
1126            while is_uri_char(self.input.look_ch()) {
1127                if self.input.peek() == '%' {
1128                    out.push(self.scan_uri_escapes(start_mark)?);
1129                } else {
1130                    out.push(self.input.peek());
1131                    self.skip_non_blank();
1132                }
1133            }
1134            return Ok(Cow::Owned(out));
1135        }
1136
1137        let Some(end) = self.input.byte_offset() else {
1138            return Ok(Cow::Owned(self.scan_tag_prefix(start_mark)?));
1139        };
1140
1141        let Some(slice) = self.try_borrow_slice(start, end) else {
1142            // Fall back to allocating if zero-copy borrow is not available.
1143            let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
1144                ScanError::new_str(
1145                    *start_mark,
1146                    "internal error: input advertised slicing but did not provide a slice",
1147                )
1148            })?;
1149            return Ok(Cow::Owned(slice.to_owned()));
1150        };
1151
1152        Ok(Cow::Borrowed(slice))
1153    }
1154    /// Create a scanner over the given input source.
1155    pub fn new(input: T) -> Self {
1156        let initial_byte_offset = input.byte_offset();
1157        let comments_possible = input.may_contain_comments();
1158        Scanner {
1159            input,
1160            mark: Marker::new(0, 1, 0).with_byte_offset(initial_byte_offset),
1161            tokens: VecDeque::with_capacity(64),
1162            error: None,
1163            deferred_error: None,
1164            comments_possible,
1165
1166            stream_start_produced: false,
1167            stream_end_produced: false,
1168            document_prefix_allowed: true,
1169            adjacent_value_allowed_at: 0,
1170            simple_key_allowed: true,
1171            simple_keys: smallvec::SmallVec::new(),
1172            indent: -1,
1173            indents: smallvec::SmallVec::new(),
1174            flow_level: 0,
1175            tokens_parsed: 0,
1176            token_available: false,
1177            leading_whitespace: true,
1178            flow_mapping_started: smallvec::SmallVec::new(),
1179            implicit_flow_mapping_states: smallvec::SmallVec::new(),
1180            flow_markers: smallvec::SmallVec::new(),
1181            interrupted_plain_by_comment: None,
1182            explicit_key_tab_check_pending: false,
1183
1184            buf_leading_break: String::with_capacity(128),
1185            buf_trailing_breaks: String::with_capacity(128),
1186            buf_whitespaces: String::with_capacity(128),
1187        }
1188    }
1189
1190    /// Return a copy of the last error that was encountered, if any.
1191    ///
1192    /// This does not clear the error state and further calls to [`Self::get_error`] will return (a
1193    /// clone of) the same error.
1194    #[inline]
1195    pub fn get_error(&self) -> Option<ScanError> {
1196        self.error.clone().or_else(|| self.deferred_error.clone())
1197    }
1198
    /// Record `error` as the scanner's last error and return `None`.
    ///
    /// The `None` return value lets `Iterator::next` end the iteration in a single expression.
    #[cold]
    fn stop_after_error(&mut self, error: ScanError) -> Option<Token<'input>> {
        self.error = Some(error);
        None
    }
1204
    /// Build the "simple key expected" error located at the current mark.
    #[cold]
    fn simple_key_expected(&self) -> ScanError {
        ScanError::new_str(self.mark, "simple key expected")
    }
1209
    /// Build the "unclosed bracket" error for `bracket`, located at `mark`.
    #[cold]
    fn unclosed_bracket(mark: Marker, bracket: char) -> ScanError {
        ScanError::new(mark, format!("unclosed bracket '{bracket}'"))
    }
1214
1215    /// Consume the next character. It is assumed the next character is a blank.
1216    #[inline]
1217    fn skip_blank(&mut self) {
1218        self.input.skip();
1219
1220        self.mark.offsets.chars += 1;
1221        self.mark.col += 1;
1222        self.mark.offsets.bytes = self.input.byte_offset();
1223    }
1224
1225    /// Consume the next character. It is assumed the next character is not a blank.
1226    #[inline]
1227    fn skip_non_blank(&mut self) {
1228        self.input.skip();
1229
1230        self.mark.offsets.chars += 1;
1231        self.mark.col += 1;
1232        self.mark.offsets.bytes = self.input.byte_offset();
1233        self.leading_whitespace = false;
1234    }
1235
1236    /// Consume a byte order mark from a document prefix.
1237    ///
1238    /// The source index advances, but the logical column remains unchanged so directives and
1239    /// document markers immediately following the BOM are still recognized as line-start tokens.
1240    #[inline]
1241    fn skip_bom(&mut self) {
1242        self.input.skip();
1243
1244        self.mark.offsets.chars += 1;
1245        self.mark.offsets.bytes = self.input.byte_offset();
1246    }
1247
1248    /// Consume one character that belongs to a comment.
1249    ///
1250    /// Unlike [`Self::skip_non_blank`], this deliberately does not change
1251    /// `leading_whitespace`. Comments are presentation content, so consuming one for either
1252    /// tokenization or skipping should only advance position bookkeeping.
1253    #[inline]
1254    fn skip_comment_char(&mut self) {
1255        self.input.skip();
1256
1257        self.mark.offsets.chars += 1;
1258        self.mark.col += 1;
1259        self.mark.offsets.bytes = self.input.byte_offset();
1260    }
1261
1262    /// Consume the next characters. It is assumed none of the next characters are blanks.
1263    #[inline]
1264    fn skip_n_non_blank(&mut self, count: usize) {
1265        for _ in 0..count {
1266            self.input.skip();
1267            self.mark.offsets.chars += 1;
1268            self.mark.col += 1;
1269        }
1270        self.mark.offsets.bytes = self.input.byte_offset();
1271        self.leading_whitespace = false;
1272    }
1273
1274    /// Consume the next character. It is assumed the next character is a newline.
1275    #[inline]
1276    fn skip_nl(&mut self) {
1277        self.input.skip();
1278
1279        self.mark.offsets.chars += 1;
1280        self.mark.col = 0;
1281        self.mark.line += 1;
1282        self.mark.offsets.bytes = self.input.byte_offset();
1283        self.leading_whitespace = true;
1284    }
1285
1286    /// Consume a line break (either CR, LF, or CRLF), if any. Do nothing if there is none.
1287    #[inline]
1288    fn skip_linebreak(&mut self) {
1289        if self.input.next_2_are('\r', '\n') {
1290            // While technically not a blank, this does not matter as `self.leading_whitespace`
1291            // will be reset by `skip_nl`.
1292            self.skip_blank();
1293            self.skip_nl();
1294        } else if self.input.next_is_break() {
1295            self.skip_nl();
1296        }
1297    }
1298
1299    #[cfg(test)]
1300    fn scan_comment_token(&mut self) -> Result<Token<'input>, ScanError> {
1301        Ok(self.scan_comment_queued_token()?.into_public())
1302    }
1303
1304    fn scan_comment_queued_token(&mut self) -> Result<QueuedToken<'input>, ScanError> {
1305        let start_mark = self.mark;
1306        debug_assert_eq!(self.input.peek(), '#');
1307        let placement = if self.leading_whitespace {
1308            Placement::Free
1309        } else {
1310            Placement::Right
1311        };
1312
1313        self.skip_comment_char();
1314
1315        let text = if let Some(start) = self.input.byte_offset() {
1316            // Stable byte offsets are available; slice the payload once at the end.
1317            let n = self.input.skip_while_non_breakz();
1318            self.mark.offsets.chars += n;
1319            self.mark.col += n;
1320            let byte_offset = self.input.byte_offset();
1321            self.mark.offsets.bytes = byte_offset;
1322            let end = byte_offset.expect("byte_offset must remain available once enabled");
1323
1324            if let Some(slice) = self.try_borrow_slice(start, end) {
1325                Cow::Borrowed(slice)
1326            } else if let Some(slice) = self.input.slice_bytes(start, end) {
1327                // Defensive fallback for third-party inputs that expose offsets but cannot borrow.
1328                Cow::Owned(slice.to_owned())
1329            } else {
1330                return Err(ScanError::new_str(
1331                    start_mark,
1332                    "internal error: input advertised offsets but did not provide a slice",
1333                ));
1334            }
1335        } else {
1336            // Streaming input without stable offsets; collect into an owned string.
1337            let mut owned = String::new();
1338            while !is_breakz(self.input.look_ch()) {
1339                owned.push(self.input.peek());
1340                self.skip_comment_char();
1341            }
1342            Cow::Owned(owned)
1343        };
1344
1345        let end_mark = self.mark;
1346        let span = Span::new(start_mark, end_mark);
1347        Ok(QueuedToken(
1348            span,
1349            QueuedTokenType::Comment(QueuedComment { text, placement }),
1350        ))
1351    }
1352
1353    fn push_comment_token(&mut self) -> ScanResult {
1354        let token = self.scan_comment_queued_token()?;
1355        self.tokens.push_back(token);
1356        Ok(())
1357    }
1358
1359    fn skip_comment(&mut self) {
1360        debug_assert_eq!(self.input.peek(), '#');
1361
1362        self.skip_comment_char();
1363        let n = self.input.skip_while_non_breakz();
1364        self.mark.offsets.chars += n;
1365        self.mark.col += n;
1366        self.mark.offsets.bytes = self.input.byte_offset();
1367    }
1368
    /// Return whether the [`TokenType::StreamStart`] token has already been produced.
    ///
    /// This reports the `stream_start_produced` flag as is.
    #[inline]
    pub fn stream_started(&self) -> bool {
        self.stream_start_produced
    }
1374
    /// Return whether the [`TokenType::StreamEnd`] token has already been handed out.
    ///
    /// This reports the `stream_end_produced` flag as is.
    #[inline]
    pub fn stream_ended(&self) -> bool {
        self.stream_end_produced
    }
1380
    /// Return the current position in the input stream.
    ///
    /// The returned [`Marker`] is a copy; it does not follow the scanner as it advances.
    #[inline]
    pub fn mark(&self) -> Marker {
        self.mark
    }
1386
    /// Return whether this scanner may emit comment tokens.
    ///
    /// This reports the `comments_possible` flag as is.
    #[inline]
    pub(crate) fn comments_possible(&self) -> bool {
        self.comments_possible
    }
1392
    /// Read and consume a line break (either `\r`, `\n` or `\r\n`).
    ///
    /// Whatever form the break has in the input, a single `\n` is pushed into `s`.
    ///
    /// # Panics (in debug)
    /// If the next characters do not correspond to a line break.
    #[inline]
    fn read_break(&mut self, s: &mut String) {
        self.skip_break();
        s.push('\n');
    }
1404
1405    // Read and consume a line break (either `\r`, `\n` or `\r\n`).
1406    //
1407    // # Panics (in debug)
1408    // If the next characters do not correspond to a line break.
1409    #[inline]
1410    fn skip_break(&mut self) {
1411        let c = self.input.peek();
1412        let nc = self.input.peek_nth(1);
1413        debug_assert!(is_break(c));
1414        if c == '\r' && nc == '\n' {
1415            self.skip_blank();
1416        }
1417        self.skip_nl();
1418    }
1419
1420    /// Insert a token at the given position.
1421    fn insert_token(&mut self, pos: usize, tok: Token<'input>) {
1422        let old_len = self.tokens.len();
1423        assert!(pos <= old_len);
1424        self.tokens.insert(pos, tok.into());
1425    }
1426
1427    fn simple_key_token_index(&self, sk: &SimpleKey, mark: Marker) -> Result<usize, ScanError> {
1428        let Some(index) = sk.token_number.checked_sub(self.tokens_parsed) else {
1429            return Err(ScanError::new_str(mark, "simple key is no longer valid"));
1430        };
1431        if index > self.tokens.len() {
1432            return Err(ScanError::new_str(mark, "simple key is no longer valid"));
1433        }
1434        Ok(index)
1435    }
1436
    /// Record that a simple key may start at the current position.
    #[inline]
    fn allow_simple_key(&mut self) {
        self.simple_key_allowed = true;
    }
1441
    /// Record that a simple key may not start at the current position.
    #[inline]
    fn disallow_simple_key(&mut self) {
        self.simple_key_allowed = false;
    }
1446
    /// Scan enough input to append one next token to the internal token queue.
    ///
    /// The steps are, in order: produce the stream start if it has not been produced yet; skip
    /// whitespace (returning early when `skip_to_next_token` reports `true`); retire stale
    /// simple keys and unroll indentation to the current column; then dispatch on the next
    /// one or two characters to the matching `fetch_*` routine.
    ///
    /// # Errors
    /// Returns `ScanError` when the scanner does not find the next expected token.
    pub fn fetch_next_token(&mut self) -> ScanResult {
        self.input.lookahead(1);

        // The very first call only produces the stream start.
        if !self.stream_start_produced {
            self.fetch_stream_start();
            return Ok(());
        }
        // NOTE(review): `true` presumably means a comment token was queued and should be handed
        // out before scanning further; confirm against `skip_to_next_token`.
        if self.skip_to_next_token(true)? {
            return Ok(());
        }

        debug_print!(
            "  \x1B[38;5;244m\u{2192} fetch_next_token after whitespace {:?} {:?}\x1B[m",
            self.mark,
            self.input.peek()
        );

        self.stale_simple_keys()?;

        let mark = self.mark;
        self.unroll_indent(mark.col as isize);

        // Up to four characters are inspected below.
        self.input.lookahead(4);

        if self.input.next_is_z() {
            self.fetch_stream_end()?;
            return Ok(());
        }

        // Directives and document markers are only recognized at the start of a line.
        if self.mark.col == 0 {
            if self.input.next_char_is('%') {
                return self.fetch_directive();
            } else if self.input.next_is_document_start() {
                return self.fetch_document_indicator(TokenType::DocumentStart);
            } else if self.input.next_is_document_end() {
                self.fetch_document_indicator(TokenType::DocumentEnd)?;
                // Only whitespace may follow the document end marker on its line.
                self.skip_ws_to_eol(SkipTabs::Yes)?;
                if !self.input.next_is_breakz() {
                    return Err(ScanError::new_str(
                        self.mark,
                        "invalid content after document end marker",
                    ));
                }
                return Ok(());
            }
        }

        // From here on we are scanning content, so the document prefix is over.
        if self.document_prefix_allowed {
            self.document_prefix_allowed = false;
        }

        // Content left of the current indentation is rejected, except for the flow
        // indicators `]`, `}` and `,` while inside a flow collection.
        if (self.mark.col as isize) < self.indent {
            self.input.lookahead(1);
            let c = self.input.peek();
            if self.flow_level == 0 || !matches!(c, ']' | '}' | ',') {
                return Err(ScanError::new_str(self.mark, "invalid indentation"));
            }
        }

        // Dispatch on the next character `c`, using the one after it (`nc`) to disambiguate.
        // The arms are tried top to bottom, so their order matters.
        let c = self.input.peek();
        let nc = self.input.peek_nth(1);
        match c {
            '[' => self.fetch_flow_collection_start(TokenType::FlowSequenceStart),
            '{' => self.fetch_flow_collection_start(TokenType::FlowMappingStart),
            ']' => self.fetch_flow_collection_end(TokenType::FlowSequenceEnd),
            '}' => self.fetch_flow_collection_end(TokenType::FlowMappingEnd),
            ',' => self.fetch_flow_entry(),
            // `-`, `?` and `:` are indicators when followed by a blank, a break or the end.
            '-' if is_blank_or_breakz(nc) => self.fetch_block_entry(),
            '?' if is_blank_or_breakz(nc) => self.fetch_key(),
            ':' if is_blank_or_breakz(nc) => self.fetch_value(),
            // Inside a flow collection, `:` is also a value indicator when followed by a flow
            // character or when it sits at the recorded adjacent-value position.
            ':' if self.flow_level > 0
                && (is_flow(nc) || self.mark.index() == self.adjacent_value_allowed_at) =>
            {
                self.fetch_flow_value()
            }
            // Is it an alias?
            '*' => self.fetch_anchor(true),
            // Is it an anchor?
            '&' => self.fetch_anchor(false),
            '!' => self.fetch_tag(),
            // Is it a literal scalar?
            '|' if self.flow_level == 0 => self.fetch_block_scalar(true),
            // Is it a folded scalar?
            '>' if self.flow_level == 0 => self.fetch_block_scalar(false),
            '\'' => self.fetch_flow_scalar(true),
            '"' => self.fetch_flow_scalar(false),
            // plain scalar
            '-' if !is_blank_or_breakz(nc) => self.fetch_plain_scalar(),
            ':' | '?' if !is_blank_or_breakz(nc) && self.flow_level == 0 => {
                self.fetch_plain_scalar()
            }
            c if is_bom(c) => Err(ScanError::new_str(
                self.mark,
                "a BOM must not appear inside a document",
            )),
            // These characters cannot start a token here.
            '%' | '@' | '`' => Err(ScanError::new(
                self.mark,
                format!("unexpected character: `{c}'"),
            )),
            // Anything else starts a plain scalar.
            _ => self.fetch_plain_scalar(),
        }
    }
1553
    /// Return the next compact queued token, scanning more input when needed.
    ///
    /// Returns `Ok(None)` once the stream end token has been handed out.
    ///
    /// An error raised while a comment token sits at the front of the queue is not returned
    /// right away. It is stored in `deferred_error`, the queued tokens are handed out, and the
    /// error is returned by the first later call that finds a non-comment token (or nothing)
    /// at the front of the queue.
    ///
    /// # Errors
    /// Returns `ScanError` when scanning fails to find an expected next token.
    pub(crate) fn next_queued_token(&mut self) -> Result<Option<QueuedToken<'input>>, ScanError> {
        if self.deferred_error.is_some() {
            // Surface the deferred error as soon as the front of the queue is not a comment.
            if !matches!(
                self.tokens.front().map(|token| &token.1),
                Some(QueuedTokenType::Comment(_))
            ) {
                if let Some(error) = self.deferred_error.take() {
                    return error.into_result();
                }
            }
            // A comment is still queued: hand it out without scanning any further.
            self.token_available = true;
        }

        if self.stream_end_produced {
            return Ok(None);
        }

        if !self.token_available {
            if let Err(error) = self.fetch_more_tokens() {
                // Defer the error when a comment is at the front of the queue, so the comment
                // is returned first; otherwise fail immediately.
                if matches!(
                    self.tokens.front().map(|token| &token.1),
                    Some(QueuedTokenType::Comment(_))
                ) {
                    self.deferred_error = Some(error);
                } else {
                    return Err(error);
                }
            }
        }
        let Some(t) = self.tokens.pop_front() else {
            return Err(ScanError::new_str(
                self.mark,
                "did not find expected next token",
            ));
        };
        self.token_available = false;
        self.tokens_parsed += 1;

        // Handing out the stream end makes every later call return `Ok(None)`.
        let is_stream_end = matches!(t.1, QueuedTokenType::StreamEnd);
        if is_stream_end {
            self.stream_end_produced = true;
        }
        Ok(Some(t))
    }
1602
1603    /// Return the next queued token, scanning more input when needed.
1604    ///
1605    /// # Errors
1606    /// Returns `ScanError` when scanning fails to find an expected next token.
1607    pub fn next_token(&mut self) -> Result<Option<Token<'input>>, ScanError> {
1608        Ok(self.next_queued_token()?.map(QueuedToken::into_public))
1609    }
1610
1611    /// Scan more input until a token is ready to be returned.
1612    ///
1613    /// # Errors
1614    /// Returns `ScanError` when scanning fails.
1615    pub fn fetch_more_tokens(&mut self) -> ScanResult {
1616        let mut need_more;
1617        loop {
1618            if self.tokens.is_empty() {
1619                need_more = true;
1620            } else {
1621                need_more = false;
1622                // Stale potential keys that we know won't be keys.
1623                self.stale_simple_keys()?;
1624                if !matches!(
1625                    self.tokens.front().map(|token| &token.1),
1626                    Some(QueuedTokenType::Comment(_))
1627                ) {
1628                    // If our next token to be emitted may be a key, fetch more context.
1629                    for sk in &self.simple_keys {
1630                        if sk.possible && sk.token_number == self.tokens_parsed {
1631                            need_more = true;
1632                            break;
1633                        }
1634                    }
1635                }
1636            }
1637
1638            // Stop fetching immediately after document end/start markers
1639            // to allow the parser to emit the event before reading more content.
1640            if let Some(token) = self.tokens.back() {
1641                if matches!(
1642                    token.1,
1643                    QueuedTokenType::DocumentEnd | QueuedTokenType::DocumentStart
1644                ) {
1645                    break;
1646                }
1647            }
1648
1649            if !need_more {
1650                break;
1651            }
1652            self.fetch_next_token()?;
1653        }
1654        self.token_available = true;
1655
1656        Ok(())
1657    }
1658
1659    /// Mark simple keys that can no longer be keys as such.
1660    ///
1661    /// This function sets `possible` to `false` to each key that, now we have more context, we
1662    /// know will not be keys.
1663    ///
1664    /// # Errors
1665    /// This function returns an error if one of the keys becoming impossible was required to be a
1666    /// key.
1667    fn stale_simple_keys(&mut self) -> ScanResult {
1668        for sk in &mut self.simple_keys {
1669            let is_line_stale = self.flow_level == 0 && sk.mark.line < self.mark.line;
1670            // The length cap applies in flow contexts too; otherwise token buffering can grow
1671            // without bound while the scanner waits to see whether a later ':' resolves the key.
1672            let is_length_stale =
1673                self.mark.index().saturating_sub(sk.mark.index()) > SIMPLE_KEY_MAX_LOOKAHEAD;
1674
1675            if sk.possible && (is_line_stale || is_length_stale) {
1676                if sk.required {
1677                    return Err(ScanError::new_str(self.mark, "simple key expect ':'"));
1678                }
1679                sk.possible = false;
1680            }
1681        }
1682        Ok(())
1683    }
1684
    /// Skip over whitespace (`\t`, ` `, `\n`, `\r`) until the next non-comment token.
    ///
    /// Comments encountered while skipping are queued as [`TokenType::Comment`] tokens so the
    /// parser can emit them as presentation events. If `stop_after_comment` is true, the function
    /// returns after queuing one comment so callers can emit it before scanning later comments.
    ///
    /// A byte-order mark is also skipped when it appears at column 0 outside flow context while
    /// `document_prefix_allowed` is set. Every consumed line break re-allows simple keys when
    /// the scanner is outside flow context.
    ///
    /// # Returns
    /// `Ok(true)` when the function returned early right after queuing a comment (only possible
    /// with `stop_after_comment`); `Ok(false)` once a character that is not whitespace, a
    /// skippable BOM or a comment start has been reached.
    ///
    /// # Errors
    /// This function returns an error if a tab is encountered where there should not be
    /// one, or if a comment interrupted a plain scalar that the immediately following line could
    /// continue.
    fn skip_to_next_token(&mut self, stop_after_comment: bool) -> Result<bool, ScanError> {
        // Hot-path helper: consume a single logical line break and apply simple-key rules.
        // (Kept local to ensure the compiler can inline it easily.)
        let consume_linebreak = |this: &mut Self| {
            this.input.lookahead(2);
            this.skip_linebreak();
            if this.flow_level == 0 {
                this.allow_simple_key();
            }
        };

        loop {
            let ch = self.input.look_ch();
            // While the explicit-key tab check is pending, a tab is an error. Spaces, line
            // breaks and comment starts keep the check pending; any other character clears it.
            if self.explicit_key_tab_check_pending {
                match ch {
                    '\t' => {
                        return Err(ScanError::new_str(
                            self.mark(),
                            "tabs disallowed in this context",
                        ));
                    }
                    ' ' | '\n' | '\r' | '#' => {}
                    _ => self.explicit_key_tab_check_pending = false,
                }
            }

            match ch {
                // Tabs may not be used as indentation (block context only).
                '\t' => {
                    if self.is_within_block()
                        && self.leading_whitespace
                        && (self.mark.col as isize) < self.indent
                    {
                        self.skip_ws_to_eol(SkipTabs::Yes)?;

                        // If we have content on that line with a tab, return an error.
                        if !self.input.next_is_breakz() {
                            return Err(ScanError::new_str(
                                self.mark,
                                "tabs disallowed within this context (block indentation)",
                            ));
                        }

                        // Micro-opt: if we stopped on a line break, consume it now (avoids another loop trip).
                        if matches!(self.input.look_ch(), '\n' | '\r') {
                            consume_linebreak(self);
                        }
                    } else {
                        // Non-indentation tab behaves like blank.
                        self.skip_blank();
                    }
                }

                ' ' => self.skip_blank(),

                '\n' | '\r' => consume_linebreak(self),

                // A BOM is only skipped at column 0, outside flow context, while a document
                // prefix is still allowed; otherwise it falls through to the `_` arm below.
                c if is_bom(c)
                    && self.document_prefix_allowed
                    && self.flow_level == 0
                    && self.mark.col == 0 =>
                {
                    self.skip_bom();
                }

                '#' => {
                    self.push_comment_token()?;

                    // Micro-opt: comment-only lines are common; consume the following line break here.
                    if matches!(self.input.look_ch(), '\n' | '\r') {
                        consume_linebreak(self);
                    }
                    // Early exit: note that this skips the interrupted-plain-scalar check below.
                    if stop_after_comment {
                        return Ok(true);
                    }
                }

                _ => break,
            }
        }

        // If a plain scalar was interrupted by a comment, and the next line could
        // continue the scalar in block context, this is invalid.
        // `take()` clears the pending mark, so the check runs at most once per interruption.
        if let Some(err_mark) = self.interrupted_plain_by_comment.take() {
            // BS4K should only trigger when the continuation would start on the immediate next
            // line (no intervening empty/comment-only lines). A blank line resets the folding
            // opportunity and thus should not error.
            let is_immediate_next_line = self.mark.line == err_mark.line + 1;

            // Optimization: do the cheap checks first; only then request extra lookahead / do deeper checks.
            if self.flow_level == 0
                && is_immediate_next_line
                && (self.mark.col as isize) > self.indent
            {
                // Ensure enough lookahead for:
                // - the checks below (peek/peek_nth)
                // - document indicator detection which needs 4 chars.
                self.input.lookahead(4);

                if !self.input.next_is_z()
                    && !self.input.next_is_document_indicator()
                    && self.input.next_can_be_plain_scalar(false)
                {
                    return Err(ScanError::new_str(
                        err_mark,
                        "comment intercepting the multiline text",
                    ));
                }
            }
        }

        Ok(false)
    }
1807
1808    /// Skip over YAML whitespace (` `, `\n`, `\r`).
1809    ///
1810    /// If `stop_after_comment` is true, the function returns after queuing one comment so callers
1811    /// can emit it before scanning later comments.
1812    ///
1813    /// # Errors
1814    /// This function returns an error if no whitespace was found.
1815    fn skip_yaml_whitespace(&mut self, stop_after_comment: bool) -> Result<bool, ScanError> {
1816        let mut need_whitespace = true;
1817        loop {
1818            match self.input.look_ch() {
1819                ' ' => {
1820                    self.skip_blank();
1821
1822                    need_whitespace = false;
1823                }
1824                '\n' | '\r' => {
1825                    self.input.lookahead(2);
1826                    self.skip_linebreak();
1827                    if self.flow_level == 0 {
1828                        self.allow_simple_key();
1829                    }
1830                    need_whitespace = false;
1831                }
1832                '#' => {
1833                    if need_whitespace {
1834                        self.skip_comment();
1835                    } else {
1836                        self.push_comment_token()?;
1837                        if stop_after_comment {
1838                            return Ok(true);
1839                        }
1840                    }
1841                }
1842                _ => break,
1843            }
1844        }
1845
1846        if need_whitespace {
1847            Err(ScanError::new_str(self.mark(), "expected whitespace"))
1848        } else {
1849            Ok(false)
1850        }
1851    }
1852
1853    fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> Result<SkipTabs, ScanError> {
1854        debug_assert!(!matches!(skip_tabs, SkipTabs::Result(..)));
1855
1856        if !self.comments_possible {
1857            let (chars_consumed, result) = self.input.skip_ws_to_eol(skip_tabs);
1858            self.mark.col += chars_consumed;
1859            self.mark.offsets.chars += chars_consumed;
1860            self.mark.offsets.bytes = self.input.byte_offset();
1861            return result.map_err(|msg| ScanError::new_str(self.mark, msg));
1862        }
1863
1864        let (chars_consumed, whitespace) = self.input.skip_ws_to_eol_blanks(skip_tabs);
1865        self.mark.col += chars_consumed;
1866        self.mark.offsets.chars += chars_consumed;
1867        self.mark.offsets.bytes = self.input.byte_offset();
1868
1869        if self.input.look_ch() != '#' {
1870            return Ok(whitespace);
1871        }
1872
1873        if !whitespace.found_tabs() && !whitespace.has_valid_yaml_ws() {
1874            return Err(ScanError::new_str(
1875                self.mark,
1876                "comments must be separated from other tokens by whitespace",
1877            ));
1878        }
1879
1880        self.push_comment_token()?;
1881        Ok(whitespace)
1882    }
1883
1884    fn fetch_stream_start(&mut self) {
1885        let mark = self.mark;
1886        self.indent = -1;
1887        self.stream_start_produced = true;
1888        self.allow_simple_key();
1889        self.tokens
1890            .push_back(Token(Span::empty(mark), TokenType::StreamStart(TEncoding::Utf8)).into());
1891        self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0)));
1892    }
1893
1894    fn fetch_stream_end(&mut self) -> ScanResult {
1895        // force new line
1896        if self.mark.col != 0 {
1897            self.mark.col = 0;
1898            self.mark.line += 1;
1899        }
1900
1901        if let Some((mark, bracket)) = self.flow_markers.pop() {
1902            return Err(Self::unclosed_bracket(mark, bracket));
1903        }
1904
1905        // If the stream ended, we won't have more context. We can stall all the simple keys we
1906        // had. If one was required, however, that was an error and we must propagate it.
1907        for sk in &mut self.simple_keys {
1908            if sk.required && sk.possible {
1909                return Err(self.simple_key_expected());
1910            }
1911            sk.possible = false;
1912        }
1913
1914        self.unroll_indent(-1);
1915        self.remove_simple_key()?;
1916        self.disallow_simple_key();
1917
1918        self.tokens
1919            .push_back(Token(Span::empty(self.mark), TokenType::StreamEnd).into());
1920        Ok(())
1921    }
1922
1923    fn fetch_directive(&mut self) -> ScanResult {
1924        self.unroll_indent(-1);
1925        self.remove_simple_key()?;
1926
1927        self.disallow_simple_key();
1928
1929        let token_index = self.tokens.len();
1930        let tok = self.scan_directive()?;
1931        self.insert_token(token_index, tok);
1932
1933        Ok(())
1934    }
1935
1936    fn scan_directive(&mut self) -> Result<Token<'input>, ScanError> {
1937        let start_mark = self.mark;
1938        self.skip_non_blank();
1939
1940        let name = self.scan_directive_name()?;
1941        let tok = match name.as_ref() {
1942            "YAML" => self.scan_version_directive_value(&start_mark)?,
1943            "TAG" => self.scan_tag_directive_value(&start_mark)?,
1944            _ => {
1945                let mut params = Vec::new();
1946                while self.input.next_is_blank() {
1947                    let n_blanks = self.input.skip_while_blank();
1948                    self.mark.offsets.chars += n_blanks;
1949                    self.mark.col += n_blanks;
1950                    self.mark.offsets.bytes = self.input.byte_offset();
1951
1952                    if !is_blank_or_breakz(self.input.peek()) {
1953                        let mut param = String::new();
1954                        let n_chars = self.input.fetch_while_is_yaml_non_space(&mut param);
1955                        self.mark.offsets.chars += n_chars;
1956                        self.mark.col += n_chars;
1957                        self.mark.offsets.bytes = self.input.byte_offset();
1958                        params.push(param);
1959                    }
1960                }
1961
1962                Token(
1963                    Span::new(start_mark, self.mark),
1964                    TokenType::ReservedDirective(name, params),
1965                )
1966            }
1967        };
1968
1969        self.skip_ws_to_eol(SkipTabs::Yes)?;
1970
1971        if self.input.next_is_breakz() {
1972            self.input.lookahead(2);
1973            self.skip_linebreak();
1974            Ok(tok)
1975        } else {
1976            Err(ScanError::new_str(
1977                start_mark,
1978                "while scanning a directive, did not find expected comment or line break",
1979            ))
1980        }
1981    }
1982
1983    fn scan_version_directive_value(&mut self, mark: &Marker) -> Result<Token<'input>, ScanError> {
1984        let n_blanks = self.input.skip_while_blank();
1985        self.mark.offsets.chars += n_blanks;
1986        self.mark.col += n_blanks;
1987        self.mark.offsets.bytes = self.input.byte_offset();
1988
1989        let major = self.scan_version_directive_number(mark)?;
1990
1991        if self.input.peek() != '.' {
1992            return Err(ScanError::new_str(
1993                *mark,
1994                "while scanning a YAML directive, did not find expected digit or '.' character",
1995            ));
1996        }
1997        self.skip_non_blank();
1998
1999        let minor = self.scan_version_directive_number(mark)?;
2000
2001        Ok(Token(
2002            Span::new(*mark, self.mark),
2003            TokenType::VersionDirective(major, minor),
2004        ))
2005    }
2006
2007    fn scan_directive_name(&mut self) -> Result<String, ScanError> {
2008        let start_mark = self.mark;
2009        let mut string = String::new();
2010
2011        let n_chars = self.input.fetch_while_is_yaml_non_space(&mut string);
2012        self.mark.offsets.chars += n_chars;
2013        self.mark.col += n_chars;
2014        self.mark.offsets.bytes = self.input.byte_offset();
2015
2016        if string.is_empty() {
2017            return Err(ScanError::new_str(
2018                start_mark,
2019                "while scanning a directive, could not find expected directive name",
2020            ));
2021        }
2022
2023        if !is_blank_or_breakz(self.input.peek()) {
2024            return Err(ScanError::new_str(
2025                start_mark,
2026                "while scanning a directive, found unexpected non-alphabetical character",
2027            ));
2028        }
2029
2030        Ok(string)
2031    }
2032
2033    fn scan_version_directive_number(&mut self, mark: &Marker) -> Result<u32, ScanError> {
2034        let mut val = 0u32;
2035        let mut length = 0usize;
2036        while let Some(digit) = self.input.look_ch().to_digit(10) {
2037            if length + 1 > 9 {
2038                return Err(ScanError::new_str(
2039                    *mark,
2040                    "while scanning a YAML directive, found extremely long version number",
2041                ));
2042            }
2043            length += 1;
2044            val = val * 10 + digit;
2045            self.skip_non_blank();
2046        }
2047
2048        if length == 0 {
2049            return Err(ScanError::new_str(
2050                *mark,
2051                "while scanning a YAML directive, did not find expected version number",
2052            ));
2053        }
2054
2055        Ok(val)
2056    }
2057
2058    fn scan_tag_directive_value(&mut self, mark: &Marker) -> Result<Token<'input>, ScanError> {
2059        let n_blanks = self.input.skip_while_blank();
2060        self.mark.offsets.chars += n_blanks;
2061        self.mark.col += n_blanks;
2062        self.mark.offsets.bytes = self.input.byte_offset();
2063
2064        let handle = self.scan_tag_handle_directive_cow(mark)?;
2065
2066        let n_blanks = self.input.skip_while_blank();
2067        self.mark.offsets.chars += n_blanks;
2068        self.mark.col += n_blanks;
2069        self.mark.offsets.bytes = self.input.byte_offset();
2070
2071        let prefix = self.scan_tag_prefix_directive_cow(mark)?;
2072
2073        self.input.lookahead(1);
2074
2075        if self.input.next_is_blank_or_breakz() {
2076            Ok(Token(
2077                Span::new(*mark, self.mark),
2078                TokenType::TagDirective(handle, prefix),
2079            ))
2080        } else {
2081            Err(ScanError::new_str(
2082                *mark,
2083                "while scanning TAG, did not find expected whitespace or line break",
2084            ))
2085        }
2086    }
2087
2088    fn fetch_tag(&mut self) -> ScanResult {
2089        self.save_simple_key();
2090        self.disallow_simple_key();
2091
2092        let tok = self.scan_tag()?;
2093        self.tokens.push_back(tok.into());
2094        Ok(())
2095    }
2096
2097    fn scan_tag(&mut self) -> Result<Token<'input>, ScanError> {
2098        let start_mark = self.mark;
2099
2100        // Check if the tag is in the canonical form (verbatim).
2101        self.input.lookahead(2);
2102
2103        // If byte_offset is not available, use the original owned-only path.
2104        if self.input.byte_offset().is_none() {
2105            return self.scan_tag_owned(&start_mark);
2106        }
2107
2108        let (handle, suffix): (Cow<'input, str>, Cow<'input, str>) =
2109            if self.input.nth_char_is(1, '<') {
2110                // Verbatim tags always need owned strings (URI escapes).
2111                let suffix = self.scan_verbatim_tag(&start_mark)?;
2112                (Cow::Owned(String::new()), Cow::Owned(suffix))
2113            } else {
2114                // The tag has either the '!suffix' or the '!handle!suffix'
2115                let handle = self.scan_tag_handle_cow(&start_mark)?;
2116                // Check if it is, indeed, handle.
2117                if handle.len() >= 2 && handle.starts_with('!') && handle.ends_with('!') {
2118                    // A tag handle starting with "!!" is a secondary tag handle.
2119                    let suffix = self.scan_tag_shorthand_suffix_cow(&start_mark, true)?;
2120                    (handle, suffix)
2121                } else {
2122                    // Not a real handle, it's part of the suffix.
2123                    // E.g., "!foo" -> handle="!", suffix="foo"
2124                    // The "handle" we scanned is actually "!" + suffix_part1.
2125                    // We need to also scan any remaining suffix characters.
2126                    let remaining_suffix =
2127                        self.scan_tag_shorthand_suffix_cow(&start_mark, false)?;
2128
2129                    // Extract suffix from handle (skip leading '!') and combine with remaining.
2130                    let suffix = if handle.len() > 1 {
2131                        if remaining_suffix.is_empty() {
2132                            // The suffix is just what's in handle after '!'
2133                            match handle {
2134                                Cow::Borrowed(s) => Cow::Borrowed(&s[1..]),
2135                                Cow::Owned(s) => Cow::Owned(s[1..].to_owned()),
2136                            }
2137                        } else {
2138                            // Combine handle (minus leading '!') with remaining suffix.
2139                            let mut combined = handle[1..].to_owned();
2140                            combined.push_str(&remaining_suffix);
2141                            Cow::Owned(combined)
2142                        }
2143                    } else {
2144                        // handle is just "!", suffix is whatever we scanned after
2145                        remaining_suffix
2146                    };
2147
2148                    // A special case: the '!' tag.  Set the handle to '' and the
2149                    // suffix to '!'.
2150                    if suffix.is_empty() {
2151                        (Cow::Borrowed(""), Cow::Borrowed("!"))
2152                    } else {
2153                        (Cow::Borrowed("!"), suffix)
2154                    }
2155                }
2156            };
2157
2158        if is_blank_or_breakz(self.input.look_ch())
2159            || (self.flow_level > 0 && matches!(self.input.peek(), ',' | ']' | '}'))
2160        {
2161            // YAML example 7.2 allows a tag to annotate an empty scalar when a separator or flow
2162            // delimiter follows.
2163            Ok(Token(
2164                Span::new(start_mark, self.mark),
2165                TokenType::Tag(handle, suffix),
2166            ))
2167        } else {
2168            Err(ScanError::new_str(
2169                start_mark,
2170                "while scanning a tag, did not find expected whitespace or line break",
2171            ))
2172        }
2173    }
2174
2175    /// Original owned-only tag scanning path for inputs without `byte_offset` support.
2176    fn scan_tag_owned(&mut self, start_mark: &Marker) -> Result<Token<'input>, ScanError> {
2177        let mut handle = String::new();
2178        let mut suffix;
2179
2180        if self.input.nth_char_is(1, '<') {
2181            suffix = self.scan_verbatim_tag(start_mark)?;
2182        } else {
2183            // The tag has either the '!suffix' or the '!handle!suffix'
2184            handle = self.scan_tag_handle(false, start_mark)?;
2185            // Check if it is, indeed, handle.
2186            if handle.len() >= 2 && handle.starts_with('!') && handle.ends_with('!') {
2187                // A tag handle starting with "!!" is a secondary tag handle.
2188                let is_secondary_handle = handle == "!!";
2189                suffix =
2190                    self.scan_tag_shorthand_suffix(false, is_secondary_handle, "", start_mark)?;
2191            } else {
2192                suffix = self.scan_tag_shorthand_suffix(false, false, &handle, start_mark)?;
2193                "!".clone_into(&mut handle);
2194                // A special case: the '!' tag.  Set the handle to '' and the
2195                // suffix to '!'.
2196                if suffix.is_empty() {
2197                    handle.clear();
2198                    "!".clone_into(&mut suffix);
2199                }
2200            }
2201        }
2202
2203        if is_blank_or_breakz(self.input.look_ch())
2204            || (self.flow_level > 0 && matches!(self.input.peek(), ',' | ']' | '}'))
2205        {
2206            // YAML example 7.2 allows a tag to annotate an empty scalar when a separator or flow
2207            // delimiter follows.
2208            Ok(Token(
2209                Span::new(*start_mark, self.mark),
2210                TokenType::Tag(handle.into(), suffix.into()),
2211            ))
2212        } else {
2213            Err(ScanError::new_str(
2214                *start_mark,
2215                "while scanning a tag, did not find expected whitespace or line break",
2216            ))
2217        }
2218    }
2219
2220    /// Scan a tag handle as a `Cow<str>`, borrowing when possible.
2221    ///
2222    /// Tag handles are of the form `!`, `!!`, or `!name!` where name is ASCII alphanumeric.
2223    /// Since they contain no escape sequences, they can always be borrowed from `StrInput`.
2224    fn scan_tag_handle_cow(&mut self, mark: &Marker) -> Result<Cow<'input, str>, ScanError> {
2225        let Some(start) = self.input.byte_offset() else {
2226            return Ok(Cow::Owned(self.scan_tag_handle(false, mark)?));
2227        };
2228
2229        if self.input.look_ch() != '!' {
2230            return Err(ScanError::new_str(
2231                *mark,
2232                "while scanning a tag, did not find expected '!'",
2233            ));
2234        }
2235
2236        // Consume the leading '!'.
2237        self.skip_non_blank();
2238
2239        // Consume ns-word-char (ASCII alphanumeric, '_' or '-') characters.
2240        self.input.lookahead(1);
2241        while self.input.next_is_alpha() {
2242            self.skip_non_blank();
2243            self.input.lookahead(1);
2244        }
2245
2246        // Optional trailing '!'.
2247        if self.input.peek() == '!' {
2248            self.skip_non_blank();
2249        }
2250
2251        let Some(end) = self.input.byte_offset() else {
2252            return Ok(Cow::Owned(self.scan_tag_handle(false, mark)?));
2253        };
2254
2255        if let Some(slice) = self.try_borrow_slice(start, end) {
2256            Ok(Cow::Borrowed(slice))
2257        } else {
2258            let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
2259                ScanError::new_str(
2260                    *mark,
2261                    "internal error: input advertised slicing but did not provide a slice",
2262                )
2263            })?;
2264            Ok(Cow::Owned(slice.to_owned()))
2265        }
2266    }
2267
2268    /// Scan a tag shorthand suffix as a `Cow<str>`, borrowing when possible.
2269    ///
2270    /// The suffix can be borrowed only if no `%` URI escape sequences are present.
2271    fn scan_tag_shorthand_suffix_cow(
2272        &mut self,
2273        mark: &Marker,
2274        require_non_empty: bool,
2275    ) -> Result<Cow<'input, str>, ScanError> {
2276        let Some(start) = self.input.byte_offset() else {
2277            return Ok(Cow::Owned(
2278                self.scan_tag_shorthand_suffix(false, false, "", mark)?,
2279            ));
2280        };
2281
2282        // Scan tag characters, checking for URI escapes.
2283        while is_tag_char(self.input.look_ch()) {
2284            if self.input.peek() == '%' {
2285                // URI escape found - must decode, so fall back to owned path.
2286                let current = self
2287                    .input
2288                    .byte_offset()
2289                    .expect("byte_offset() must remain available once enabled");
2290                let mut out = if let Some(slice) = self.input.slice_bytes(start, current) {
2291                    slice.to_owned()
2292                } else {
2293                    String::new()
2294                };
2295
2296                // Continue scanning with owned buffer.
2297                while is_tag_char(self.input.look_ch()) {
2298                    if self.input.peek() == '%' {
2299                        out.push(self.scan_uri_escapes(mark)?);
2300                    } else {
2301                        out.push(self.input.peek());
2302                        self.skip_non_blank();
2303                    }
2304                }
2305                return Ok(Cow::Owned(out));
2306            }
2307            self.skip_non_blank();
2308        }
2309
2310        let Some(end) = self.input.byte_offset() else {
2311            return Ok(Cow::Owned(
2312                self.scan_tag_shorthand_suffix(false, false, "", mark)?,
2313            ));
2314        };
2315
2316        if require_non_empty && start == end {
2317            return Err(ScanError::new_str(
2318                *mark,
2319                "while parsing a tag, did not find expected tag URI",
2320            ));
2321        }
2322
2323        if let Some(slice) = self.try_borrow_slice(start, end) {
2324            Ok(Cow::Borrowed(slice))
2325        } else {
2326            let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
2327                ScanError::new_str(
2328                    *mark,
2329                    "internal error: input advertised slicing but did not provide a slice",
2330                )
2331            })?;
2332            Ok(Cow::Owned(slice.to_owned()))
2333        }
2334    }
2335
2336    fn scan_tag_handle(&mut self, directive: bool, mark: &Marker) -> Result<String, ScanError> {
2337        let mut string = String::new();
2338        if self.input.look_ch() != '!' {
2339            return Err(ScanError::new_str(
2340                *mark,
2341                "while scanning a tag, did not find expected '!'",
2342            ));
2343        }
2344
2345        string.push(self.input.peek());
2346        self.skip_non_blank();
2347
2348        let n_chars = self.input.fetch_while_is_alpha(&mut string);
2349        self.mark.offsets.chars += n_chars;
2350        self.mark.col += n_chars;
2351        self.mark.offsets.bytes = self.input.byte_offset();
2352
2353        // Check if the trailing character is '!' and copy it.
2354        if self.input.peek() == '!' {
2355            string.push(self.input.peek());
2356            self.skip_non_blank();
2357        } else if directive && string != "!" {
2358            // It's either the '!' tag or not really a tag handle.  If it's a %TAG
2359            // directive, it's an error.  If it's a tag token, it must be a part of
2360            // URI.
2361            return Err(ScanError::new_str(
2362                *mark,
2363                "while parsing a tag directive, did not find expected '!'",
2364            ));
2365        }
2366        Ok(string)
2367    }
2368
2369    /// Scan for a tag prefix (6.8.2.2).
2370    ///
2371    /// There are 2 kinds of tag prefixes:
2372    ///   - Local: Starts with a `!`, contains only URI chars (`!foo`)
2373    ///   - Global: Starts with a tag char, contains then URI chars (`!foo,2000:app/`)
2374    fn scan_tag_prefix(&mut self, start_mark: &Marker) -> Result<String, ScanError> {
2375        let mut string = String::new();
2376
2377        if self.input.look_ch() == '!' {
2378            // If we have a local tag, insert and skip `!`.
2379            string.push(self.input.peek());
2380            self.skip_non_blank();
2381        } else if !is_tag_char(self.input.peek()) {
2382            // Otherwise, check if the first global tag character is valid.
2383            return Err(ScanError::new_str(
2384                *start_mark,
2385                "invalid global tag character",
2386            ));
2387        } else if self.input.peek() == '%' {
2388            // If it is valid and an escape sequence, escape it.
2389            string.push(self.scan_uri_escapes(start_mark)?);
2390        } else {
2391            // Otherwise, push the first character.
2392            string.push(self.input.peek());
2393            self.skip_non_blank();
2394        }
2395
2396        while is_uri_char(self.input.look_ch()) {
2397            if self.input.peek() == '%' {
2398                string.push(self.scan_uri_escapes(start_mark)?);
2399            } else {
2400                string.push(self.input.peek());
2401                self.skip_non_blank();
2402            }
2403        }
2404
2405        Ok(string)
2406    }
2407
2408    /// Scan for a verbatim tag.
2409    ///
2410    /// The prefixing `!<` must _not_ have been skipped.
2411    fn scan_verbatim_tag(&mut self, start_mark: &Marker) -> Result<String, ScanError> {
2412        // Eat `!<`
2413        self.skip_non_blank();
2414        self.skip_non_blank();
2415
2416        let mut string = String::new();
2417        while is_uri_char(self.input.look_ch()) {
2418            if self.input.peek() == '%' {
2419                string.push(self.scan_uri_escapes(start_mark)?);
2420            } else {
2421                string.push(self.input.peek());
2422                self.skip_non_blank();
2423            }
2424        }
2425
2426        if string.is_empty() {
2427            return Err(ScanError::new_str(
2428                *start_mark,
2429                "while parsing a tag, did not find expected tag URI",
2430            ));
2431        }
2432
2433        if self.input.peek() != '>' {
2434            return Err(ScanError::new_str(
2435                *start_mark,
2436                "while scanning a verbatim tag, did not find the expected '>'",
2437            ));
2438        }
2439        self.skip_non_blank();
2440
2441        Ok(string)
2442    }
2443
2444    fn scan_tag_shorthand_suffix(
2445        &mut self,
2446        _directive: bool,
2447        _is_secondary: bool,
2448        head: &str,
2449        mark: &Marker,
2450    ) -> Result<String, ScanError> {
2451        let mut length = head.len();
2452        let mut string = String::new();
2453
2454        // Copy the head if needed.
2455        // Note that we don't copy the leading '!' character.
2456        if length > 1 {
2457            string.extend(head.chars().skip(1));
2458        }
2459
2460        while is_tag_char(self.input.look_ch()) {
2461            // Check if it is a URI-escape sequence.
2462            if self.input.peek() == '%' {
2463                string.push(self.scan_uri_escapes(mark)?);
2464            } else {
2465                string.push(self.input.peek());
2466                self.skip_non_blank();
2467            }
2468
2469            length += 1;
2470        }
2471
2472        if length == 0 {
2473            return Err(ScanError::new_str(
2474                *mark,
2475                "while parsing a tag, did not find expected tag URI",
2476            ));
2477        }
2478
2479        Ok(string)
2480    }
2481
2482    fn scan_uri_escapes(&mut self, mark: &Marker) -> Result<char, ScanError> {
2483        let mut width = 0usize;
2484        let mut bytes = [0u8; 4];
2485        let mut bytes_len = 0usize;
2486        loop {
2487            self.input.lookahead(3);
2488
2489            let c = self.input.peek_nth(1);
2490            let nc = self.input.peek_nth(2);
2491
2492            if !(self.input.peek() == '%' && is_hex(c) && is_hex(nc)) {
2493                return Err(ScanError::new_str(
2494                    *mark,
2495                    "while parsing a tag, found an invalid escape sequence",
2496                ));
2497            }
2498
2499            let byte = u8::try_from((as_hex(c) << 4) + as_hex(nc))
2500                .expect("two hex nibbles always fit in a byte");
2501            if width == 0 {
2502                width = match byte {
2503                    _ if byte & 0x80 == 0x00 => 1,
2504                    _ if byte & 0xE0 == 0xC0 => 2,
2505                    _ if byte & 0xF0 == 0xE0 => 3,
2506                    _ if byte & 0xF8 == 0xF0 => 4,
2507                    _ => {
2508                        return Err(ScanError::new_str(
2509                            *mark,
2510                            "while parsing a tag, found an incorrect leading UTF-8 byte",
2511                        ));
2512                    }
2513                };
2514            } else if byte & 0xc0 != 0x80 {
2515                return Err(ScanError::new_str(
2516                    *mark,
2517                    "while parsing a tag, found an incorrect trailing UTF-8 byte",
2518                ));
2519            }
2520
2521            bytes[bytes_len] = byte;
2522            bytes_len += 1;
2523
2524            self.skip_n_non_blank(3);
2525
2526            width -= 1;
2527            if width == 0 {
2528                break;
2529            }
2530        }
2531
2532        let s = core::str::from_utf8(&bytes[..bytes_len]).map_err(|_| {
2533            ScanError::new_str(
2534                *mark,
2535                "while parsing a tag, found an invalid UTF-8 codepoint",
2536            )
2537        })?;
2538
2539        let mut chars = s.chars();
2540        match (chars.next(), chars.next()) {
2541            (Some(ch), None) => Ok(ch),
2542            _ => Err(ScanError::new_str(
2543                *mark,
2544                "while parsing a tag, found an invalid UTF-8 codepoint",
2545            )),
2546        }
2547    }
2548
2549    fn fetch_anchor(&mut self, alias: bool) -> ScanResult {
2550        self.save_simple_key();
2551        self.disallow_simple_key();
2552
2553        let tok = self.scan_anchor(alias)?;
2554
2555        self.tokens.push_back(tok.into());
2556
2557        Ok(())
2558    }
2559
2560    fn scan_anchor(&mut self, alias: bool) -> Result<Token<'input>, ScanError> {
2561        let start_mark = self.mark;
2562
2563        // Skip `&` / `*`.
2564        self.skip_non_blank();
2565
2566        // Borrow from input when possible.
2567        if let Some(start) = self.input.byte_offset() {
2568            while is_anchor_char(self.input.look_ch()) {
2569                self.skip_non_blank();
2570            }
2571
2572            let end = self
2573                .input
2574                .byte_offset()
2575                .expect("byte_offset() must remain available once enabled");
2576
2577            if start == end {
2578                return Err(ScanError::new_str(start_mark, "while scanning an anchor or alias, did not find expected alphabetic or numeric character"));
2579            }
2580
2581            let cow = if let Some(slice) = self.try_borrow_slice(start, end) {
2582                Cow::Borrowed(slice)
2583            } else if let Some(slice) = self.input.slice_bytes(start, end) {
2584                Cow::Owned(slice.to_owned())
2585            } else {
2586                return Err(ScanError::new_str(
2587                    start_mark,
2588                    "internal error: input advertised slicing but did not provide a slice",
2589                ));
2590            };
2591
2592            let tok = if alias {
2593                TokenType::Alias(cow)
2594            } else {
2595                TokenType::Anchor(cow)
2596            };
2597            return Ok(Token(Span::new(start_mark, self.mark), tok));
2598        }
2599
2600        let mut string = String::new();
2601        while is_anchor_char(self.input.look_ch()) {
2602            string.push(self.input.peek());
2603            self.skip_non_blank();
2604        }
2605
2606        if string.is_empty() {
2607            return Err(ScanError::new_str(start_mark, "while scanning an anchor or alias, did not find expected alphabetic or numeric character"));
2608        }
2609
2610        let tok = if alias {
2611            TokenType::Alias(string.into())
2612        } else {
2613            TokenType::Anchor(string.into())
2614        };
2615        Ok(Token(Span::new(start_mark, self.mark), tok))
2616    }
2617
2618    fn fetch_flow_collection_start(&mut self, tok: TokenType<'input>) -> ScanResult {
2619        // The indicators '[' and '{' may start a simple key.
2620        self.save_simple_key();
2621
2622        let start_mark = self.mark;
2623        let indicator = self.input.peek();
2624        self.flow_markers.push((start_mark, indicator));
2625
2626        self.roll_one_col_indent();
2627        self.increase_flow_level()?;
2628
2629        self.allow_simple_key();
2630
2631        self.skip_non_blank();
2632
2633        if tok == TokenType::FlowMappingStart {
2634            self.flow_mapping_started.push(true);
2635        } else {
2636            self.flow_mapping_started.push(false);
2637            self.implicit_flow_mapping_states
2638                .push(ImplicitMappingState::Possible);
2639        }
2640
2641        let token_index = self.tokens.len();
2642        self.skip_ws_to_eol(SkipTabs::Yes)?;
2643
2644        self.insert_token(token_index, Token(Span::new(start_mark, self.mark), tok));
2645        Ok(())
2646    }
2647
2648    fn fetch_flow_collection_end(&mut self, tok: TokenType<'input>) -> ScanResult {
2649        // A closing bracket without a corresponding opening is invalid YAML.
2650        if self.flow_level == 0 {
2651            return Err(ScanError::new_str(self.mark, "misplaced bracket"));
2652        }
2653
2654        let Some((open_mark, open_ch)) = self.flow_markers.pop() else {
2655            return Err(ScanError::new_str(self.mark, "misplaced bracket"));
2656        };
2657
2658        let (expected_open, actual_close) = match tok {
2659            TokenType::FlowSequenceEnd => ('[', ']'),
2660            TokenType::FlowMappingEnd => ('{', '}'),
2661            _ => unreachable!("flow collection end called with non-closing token"),
2662        };
2663
2664        if open_ch != expected_open {
2665            return Err(ScanError::new(
2666                open_mark,
2667                format!("mismatched bracket '{open_ch}' closed by '{actual_close}'"),
2668            ));
2669        }
2670
2671        let flow_level = self.flow_level;
2672
2673        self.remove_simple_key()?;
2674
2675        if matches!(tok, TokenType::FlowSequenceEnd) {
2676            self.end_implicit_mapping(self.mark, flow_level);
2677            // We are out exiting the flow sequence, nesting goes down 1 level.
2678            self.implicit_flow_mapping_states.pop();
2679        }
2680        self.flow_mapping_started.pop();
2681
2682        self.decrease_flow_level();
2683
2684        self.disallow_simple_key();
2685
2686        let start_mark = self.mark;
2687        self.skip_non_blank();
2688        let token_index = self.tokens.len();
2689        self.skip_ws_to_eol(SkipTabs::Yes)?;
2690
2691        // A flow collection within a flow mapping can be a key. In that case, the value may be
2692        // adjacent to the `:`.
2693        // ```yaml
2694        // - [ {a: b}:value ]
2695        // ```
2696        if self.flow_level > 0 {
2697            self.adjacent_value_allowed_at = self.mark.index();
2698        }
2699
2700        self.insert_token(token_index, Token(Span::new(start_mark, self.mark), tok));
2701        Ok(())
2702    }
2703
2704    /// Push the `FlowEntry` token and skip over the `,`.
2705    fn fetch_flow_entry(&mut self) -> ScanResult {
2706        self.remove_simple_key()?;
2707        self.allow_simple_key();
2708
2709        self.end_implicit_mapping(self.mark, self.flow_level);
2710        if self.current_flow_collection_is_sequence() {
2711            self.set_current_flow_mapping_started(false);
2712        }
2713
2714        let start_mark = self.mark;
2715        self.skip_non_blank();
2716        let token_index = self.tokens.len();
2717        self.skip_ws_to_eol(SkipTabs::Yes)?;
2718
2719        self.insert_token(
2720            token_index,
2721            Token(Span::new(start_mark, self.mark), TokenType::FlowEntry),
2722        );
2723        Ok(())
2724    }
2725
2726    fn increase_flow_level(&mut self) -> ScanResult {
2727        self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0)));
2728        self.flow_level = self
2729            .flow_level
2730            .checked_add(1)
2731            .ok_or_else(|| ScanError::new_str(self.mark, "recursion limit exceeded"))?;
2732        Ok(())
2733    }
2734
2735    fn decrease_flow_level(&mut self) {
2736        if self.flow_level > 0 {
2737            self.flow_level -= 1;
2738            self.simple_keys.pop().unwrap();
2739        }
2740    }
2741
2742    /// Push the `Block*` token(s) and skip over the `-`.
2743    ///
2744    /// Add an indentation level and push a `BlockSequenceStart` token if needed, then push a
2745    /// `BlockEntry` token.
2746    /// This function only skips over the `-` and does not fetch the entry value.
2747    fn fetch_block_entry(&mut self) -> ScanResult {
2748        if self.flow_level > 0 {
2749            // - * only allowed in block
2750            return Err(ScanError::new_str(
2751                self.mark,
2752                r#""-" is only valid inside a block"#,
2753            ));
2754        }
2755        // Check if we are allowed to start a new entry.
2756        if !self.simple_key_allowed {
2757            return Err(ScanError::new_str(
2758                self.mark,
2759                "block sequence entries are not allowed in this context",
2760            ));
2761        }
2762
2763        // ???, fixes test G9HC.
2764        if let Some(QueuedToken(span, QueuedTokenType::Anchor(..) | QueuedTokenType::Tag(..))) =
2765            self.tokens.back()
2766        {
2767            if self.mark.col == 0 && span.start.col == 0 && self.indent > -1 {
2768                return Err(ScanError::new_str(
2769                    span.start,
2770                    "invalid indentation for anchor",
2771                ));
2772            }
2773        }
2774
2775        // Skip over the `-`.
2776        let mark = self.mark;
2777        self.skip_non_blank();
2778
2779        // generate BLOCK-SEQUENCE-START if indented
2780        self.roll_indent(mark.col, None, TokenType::BlockSequenceStart, mark);
2781        let token_index = self.tokens.len();
2782        let found_tabs = self.skip_ws_to_eol(SkipTabs::Yes)?.found_tabs();
2783        self.input.lookahead(2);
2784        if found_tabs && self.input.next_char_is('-') && is_blank_or_breakz(self.input.peek_nth(1))
2785        {
2786            return Err(ScanError::new_str(
2787                self.mark,
2788                "'-' must be followed by a valid YAML whitespace",
2789            ));
2790        }
2791
2792        self.skip_ws_to_eol(SkipTabs::No)?;
2793        self.input.lookahead(1);
2794        if self.input.next_is_break() || self.input.next_is_flow() {
2795            self.roll_one_col_indent();
2796        }
2797
2798        self.remove_simple_key()?;
2799        self.allow_simple_key();
2800
2801        self.insert_token(
2802            token_index,
2803            Token(Span::empty(self.mark), TokenType::BlockEntry),
2804        );
2805
2806        Ok(())
2807    }
2808
2809    fn fetch_document_indicator(&mut self, t: TokenType<'input>) -> ScanResult {
2810        if let Some((mark, bracket)) = self.flow_markers.pop() {
2811            return Err(ScanError::new(
2812                mark,
2813                format!("unclosed bracket '{bracket}'"),
2814            ));
2815        }
2816
2817        self.unroll_indent(-1);
2818        self.remove_simple_key()?;
2819        self.disallow_simple_key();
2820
2821        let mark = self.mark;
2822
2823        self.skip_n_non_blank(3);
2824
2825        self.document_prefix_allowed = matches!(t, TokenType::DocumentEnd);
2826        self.tokens
2827            .push_back(Token(Span::new(mark, self.mark), t).into());
2828        Ok(())
2829    }
2830
2831    fn fetch_block_scalar(&mut self, literal: bool) -> ScanResult {
2832        self.save_simple_key();
2833        self.allow_simple_key();
2834        let tok = self.scan_block_scalar(literal)?;
2835
2836        self.tokens.push_back(tok.into());
2837        Ok(())
2838    }
2839
2840    #[allow(clippy::too_many_lines)]
2841    fn scan_block_scalar(&mut self, literal: bool) -> Result<Token<'input>, ScanError> {
2842        let start_mark = self.mark;
2843        let mut chomping = Chomping::Clip;
2844        let mut increment: usize = 0;
2845        let mut indent: usize = 0;
2846        let mut trailing_blank: bool;
2847        let mut leading_blank: bool = false;
2848        let style = if literal {
2849            ScalarStyle::Literal
2850        } else {
2851            ScalarStyle::Folded
2852        };
2853
2854        let mut string = String::new();
2855        let mut leading_break = String::new();
2856        let mut trailing_breaks = String::new();
2857        let mut chomping_break = String::new();
2858
2859        // skip '|' or '>'
2860        self.skip_non_blank();
2861        self.unroll_non_block_indents();
2862
2863        if self.input.look_ch() == '+' || self.input.peek() == '-' {
2864            if self.input.peek() == '+' {
2865                chomping = Chomping::Keep;
2866            } else {
2867                chomping = Chomping::Strip;
2868            }
2869            self.skip_non_blank();
2870            self.input.lookahead(1);
2871            if self.input.next_is_digit() {
2872                if self.input.peek() == '0' {
2873                    return Err(ScanError::new_str(
2874                        start_mark,
2875                        "while scanning a block scalar, found an indentation indicator equal to 0",
2876                    ));
2877                }
2878                increment = (self.input.peek() as usize) - ('0' as usize);
2879                self.skip_non_blank();
2880            }
2881        } else if self.input.next_is_digit() {
2882            if self.input.peek() == '0' {
2883                return Err(ScanError::new_str(
2884                    start_mark,
2885                    "while scanning a block scalar, found an indentation indicator equal to 0",
2886                ));
2887            }
2888
2889            increment = (self.input.peek() as usize) - ('0' as usize);
2890            self.skip_non_blank();
2891            self.input.lookahead(1);
2892            if self.input.peek() == '+' || self.input.peek() == '-' {
2893                if self.input.peek() == '+' {
2894                    chomping = Chomping::Keep;
2895                } else {
2896                    chomping = Chomping::Strip;
2897                }
2898                self.skip_non_blank();
2899            }
2900        }
2901
2902        self.skip_ws_to_eol(SkipTabs::Yes)?;
2903
2904        // Check if we are at the end of the line.
2905        self.input.lookahead(1);
2906        if !self.input.next_is_breakz() {
2907            return Err(ScanError::new_str(
2908                start_mark,
2909                "while scanning a block scalar, did not find expected comment or line break",
2910            ));
2911        }
2912
2913        if self.input.next_is_break() {
2914            self.input.lookahead(2);
2915            self.read_break(&mut chomping_break);
2916        }
2917
2918        if self.input.look_ch() == '\t' {
2919            return Err(ScanError::new_str(
2920                start_mark,
2921                "a block scalar content cannot start with a tab",
2922            ));
2923        }
2924
2925        if increment > 0 {
2926            indent = if self.indent >= 0 {
2927                (self.indent + increment as isize) as usize
2928            } else {
2929                increment
2930            }
2931        }
2932
2933        // Scan the leading line breaks and determine the indentation level if needed.
2934        if indent == 0 {
2935            self.skip_block_scalar_first_line_indent(&mut indent, &mut trailing_breaks);
2936        } else {
2937            self.skip_block_scalar_indent(indent, &mut trailing_breaks);
2938        }
2939
2940        // We have an end-of-stream with no content, e.g.:
2941        // ```yaml
2942        // - |+
2943        // ```
2944        if self.input.next_is_z() {
2945            let contents = match chomping {
2946                // We strip trailing line breaks. Nothing remains.
2947                Chomping::Strip => String::new(),
2948                // There was no newline after the chomping indicator.
2949                _ if self.mark.line == start_mark.line() => String::new(),
2950                // We clip lines, and there was a newline after the chomping indicator.
2951                // All other breaks are ignored.
2952                Chomping::Clip => chomping_break,
2953                // We keep lines. There was a newline after the chomping indicator but nothing
2954                // else.
2955                Chomping::Keep if trailing_breaks.is_empty() => chomping_break,
2956                // Otherwise, the newline after chomping is ignored.
2957                Chomping::Keep => trailing_breaks,
2958            };
2959            return Ok(Token(
2960                Span::new(start_mark, self.mark),
2961                TokenType::Scalar(style, contents.into()),
2962            ));
2963        }
2964
2965        if self.mark.col < indent && (self.mark.col as isize) > self.indent {
2966            if self.indent < 0 && self.mark.col == 0 {
2967                self.input.lookahead(4);
2968                if self.input.next_is_document_start()
2969                    || self.input.next_is_document_end()
2970                    || self.input.peek() == '#'
2971                {
2972                    // At the root level, an explicit indentation indicator can still yield an
2973                    // empty scalar when the next line is a document marker or comment.
2974                    // In this case, the scalar is terminated rather than under-indented.
2975                } else {
2976                    return Err(ScanError::new_str(
2977                        self.mark,
2978                        "wrongly indented line in block scalar",
2979                    ));
2980                }
2981            } else {
2982                return Err(ScanError::new_str(
2983                    self.mark,
2984                    "wrongly indented line in block scalar",
2985                ));
2986            }
2987        }
2988
2989        let mut line_buffer = String::with_capacity(100);
2990        let start_mark = self.mark;
2991        while self.mark.col == indent && !self.input.next_is_z() {
2992            if indent == 0 {
2993                self.input.lookahead(4);
2994                if self.input.next_is_document_end() {
2995                    break;
2996                }
2997            }
2998
2999            // We are at the first content character of a content line.
3000            trailing_blank = self.input.next_is_blank();
3001            if !literal && !leading_break.is_empty() && !leading_blank && !trailing_blank {
3002                string.push_str(&trailing_breaks);
3003                if trailing_breaks.is_empty() {
3004                    string.push(' ');
3005                }
3006            } else {
3007                string.push_str(&leading_break);
3008                string.push_str(&trailing_breaks);
3009            }
3010
3011            leading_break.clear();
3012            trailing_breaks.clear();
3013
3014            leading_blank = self.input.next_is_blank();
3015
3016            self.scan_block_scalar_content_line(&mut string, &mut line_buffer);
3017
3018            // break on EOF
3019            self.input.lookahead(2);
3020            if self.input.next_is_z() {
3021                break;
3022            }
3023
3024            self.read_break(&mut leading_break);
3025
3026            // Eat the following indentation spaces and line breaks.
3027            self.skip_block_scalar_indent(indent, &mut trailing_breaks);
3028        }
3029
3030        // Chomp the tail.
3031        if chomping != Chomping::Strip {
3032            string.push_str(&leading_break);
3033            // If we had reached an eof but the last character wasn't an end-of-line, check if the
3034            // last line was indented at least as the rest of the scalar, then we need to consider
3035            // there is a newline.
3036            if self.input.next_is_z() && self.mark.col >= indent.max(1) {
3037                string.push('\n');
3038            }
3039        }
3040
3041        if chomping == Chomping::Keep {
3042            string.push_str(&trailing_breaks);
3043        }
3044
3045        Ok(Token(
3046            Span::new(start_mark, self.mark),
3047            TokenType::Scalar(style, string.into()),
3048        ))
3049    }
3050
3051    /// Retrieve the contents of the line, parsing it as a block scalar.
3052    ///
3053    /// The contents will be appended to `string`. `line_buffer` is used as a temporary buffer to
3054    /// store bytes before pushing them to `string` and thus avoiding reallocating more than
3055    /// necessary. `line_buffer` is assumed to be empty upon calling this function. It will be
3056    /// `clear`ed before the end of the function.
3057    ///
3058    /// This function assumes the first character to read is the first content character in the
3059    /// line. This function does not consume the line break character(s) after the line.
3060    fn scan_block_scalar_content_line(&mut self, string: &mut String, line_buffer: &mut String) {
3061        // Start by evaluating characters in the buffer.
3062        while !self.input.buf_is_empty() && !self.input.next_is_breakz() {
3063            string.push(self.input.peek());
3064            // We may technically skip non-blank characters. However, the only distinction is
3065            // to determine what is leading whitespace and what is not. Here, we read the
3066            // contents of the line until either EOF or a line break. We know we will not read
3067            // `self.leading_whitespace` until the end of the line, where it will be reset.
3068            // This allows us to call a slightly less expensive function.
3069            self.skip_blank();
3070        }
3071
3072        // All characters that were in the buffer were consumed. We need to check if more
3073        // follow.
3074        if self.input.buf_is_empty() {
3075            // We will read all consecutive non-breakz characters. We push them into a
3076            // temporary buffer. The main difference with going through `self.buffer` is that
3077            // characters are appended here as their real size (1B for ASCII, or up to 4 bytes for
3078            // UTF-8). We can then use the internal `line_buffer` `Vec` to push data into `string`
3079            // (using `String::push_str`).
3080
3081            // line_buffer is empty at this point so we can compute n_chars here as well
3082            let mut n_chars = 0;
3083            debug_assert!(line_buffer.is_empty());
3084            while let Some(c) = self.input.raw_read_non_breakz_ch() {
3085                line_buffer.push(c);
3086                n_chars += 1;
3087            }
3088
3089            // We need to manually update our position; we haven't called a `skip` function.
3090            self.mark.col += n_chars;
3091            self.mark.offsets.chars += n_chars;
3092            self.mark.offsets.bytes = self.input.byte_offset();
3093
3094            // We can now append our bytes to our `string`.
3095            string.reserve(line_buffer.len());
3096            string.push_str(line_buffer);
3097            // This clears the _contents_ without touching the _capacity_.
3098            line_buffer.clear();
3099        }
3100    }
3101
3102    /// Skip the block scalar indentation and empty lines.
3103    fn skip_block_scalar_indent(&mut self, indent: usize, breaks: &mut String) {
3104        loop {
3105            // Consume all spaces. Tabs cannot be used as indentation.
3106            if indent < self.input.bufmaxlen() - 2 {
3107                self.input.lookahead(self.input.bufmaxlen());
3108                while self.mark.col < indent && self.input.peek() == ' ' {
3109                    self.skip_blank();
3110                }
3111            } else {
3112                loop {
3113                    self.input.lookahead(self.input.bufmaxlen());
3114                    while !self.input.buf_is_empty()
3115                        && self.mark.col < indent
3116                        && self.input.peek() == ' '
3117                    {
3118                        self.skip_blank();
3119                    }
3120                    // If we reached our indent, we can break. We must also break if we have
3121                    // reached content or EOF; that is, the buffer is not empty and the next
3122                    // character is not a space.
3123                    if self.mark.col == indent
3124                        || (!self.input.buf_is_empty() && self.input.peek() != ' ')
3125                    {
3126                        break;
3127                    }
3128                }
3129                self.input.lookahead(2);
3130            }
3131
3132            // If our current line is empty, skip over the break and continue looping.
3133            if self.input.next_is_break() {
3134                self.read_break(breaks);
3135            } else {
3136                // Otherwise, we have a content line. Return control.
3137                break;
3138            }
3139        }
3140    }
3141
3142    /// Determine the indentation level for a block scalar from the first line of its contents.
3143    ///
3144    /// The function skips over whitespace-only lines and sets `indent` to the longest
3145    /// whitespace line that was encountered.
3146    fn skip_block_scalar_first_line_indent(&mut self, indent: &mut usize, breaks: &mut String) {
3147        let mut max_indent = 0;
3148        loop {
3149            // Consume all spaces. Tabs cannot be used as indentation.
3150            while self.input.look_ch() == ' ' {
3151                self.skip_blank();
3152            }
3153
3154            if self.mark.col > max_indent {
3155                max_indent = self.mark.col;
3156            }
3157
3158            if self.input.next_is_break() {
3159                // If our current line is empty, skip over the break and continue looping.
3160                self.input.lookahead(2);
3161                self.read_break(breaks);
3162            } else {
3163                // Otherwise, we have a content line. Return control.
3164                break;
3165            }
3166        }
3167
3168        // In case a YAML document looks like:
3169        // ```yaml
3170        // |
3171        // foo
3172        // bar
3173        // ```
3174        // We need to set the indent to 0 and not 1. In all other cases, the indent must be at
3175        // least 1. When in the above example, `self.indent` will be set to -1.
3176        *indent = max_indent.max((self.indent + 1) as usize);
3177        if self.indent > 0 {
3178            *indent = (*indent).max(1);
3179        }
3180    }
3181
3182    fn fetch_flow_scalar(&mut self, single: bool) -> ScanResult {
3183        self.save_simple_key();
3184        self.disallow_simple_key();
3185
3186        let token_index = self.tokens.len();
3187        let tok = self.scan_flow_scalar(single)?;
3188
3189        // From spec: To ensure JSON compatibility, if a key inside a flow mapping is JSON-like,
3190        // YAML allows the following value to be specified adjacent to the “:”.
3191        if self.skip_to_next_token(true)? {
3192            self.adjacent_value_allowed_at = usize::MAX;
3193        } else {
3194            self.adjacent_value_allowed_at = self.mark.index();
3195        }
3196
3197        self.insert_token(token_index, tok);
3198        Ok(())
3199    }
3200
3201    #[allow(clippy::too_many_lines)]
3202    fn scan_flow_scalar(&mut self, single: bool) -> Result<Token<'input>, ScanError> {
3203        let start_mark = self.mark;
3204
3205        // Output scalar contents.
3206        let mut buf = match self.input.byte_offset() {
3207            Some(off) => FlowScalarBuf::new_borrowed(off + self.input.peek().len_utf8()),
3208            None => FlowScalarBuf::new_owned(),
3209        };
3210
3211        // Scratch used to consume the *first* line break in a break run without emitting it.
3212        // (The first break folds to ' ' or to nothing depending on escaping rules.)
3213        let mut break_scratch = String::new();
3214
3215        /* Eat the left quote. */
3216        self.skip_non_blank();
3217
3218        loop {
3219            /* Check for a document indicator. */
3220            self.input.lookahead(4);
3221
3222            if self.mark.col == 0 && self.input.next_is_document_indicator() {
3223                return Err(ScanError::new_str(
3224                    start_mark,
3225                    "while scanning a quoted scalar, found unexpected document indicator",
3226                ));
3227            }
3228
3229            if self.input.next_is_z() {
3230                return Err(ScanError::new_str(start_mark, "unclosed quote"));
3231            }
3232
3233            // Do not enforce block indentation inside quoted (flow) scalars.
3234            // YAML allows line breaks within quoted scalars.
3235            let mut leading_blanks = false;
3236            self.consume_flow_scalar_non_whitespace_chars(
3237                single,
3238                &mut buf,
3239                &mut leading_blanks,
3240                &start_mark,
3241            )?;
3242
3243            match self.input.look_ch() {
3244                '\'' if single => break,
3245                '"' if !single => break,
3246                _ => {}
3247            }
3248
3249            // --- Faster whitespace / line break handling (no temporary Strings) ---
3250            //
3251            // Instead of:
3252            //   - collecting blanks into `whitespaces` and then copying them
3253            //   - collecting breaks into `leading_break` / `trailing_breaks` and then copying
3254            //
3255            // We do:
3256            //   - append trailing blanks directly to `string`, remember where they started,
3257            //     and truncate them if a line break follows.
3258            //   - for line breaks: consume the first break into a scratch (discarded),
3259            //     append subsequent breaks directly to `string`.
3260            //
3261            // These flags replace temporary-string emptiness checks:
3262            //   has_leading_break  <=> !leading_break.is_empty()
3263            //   has_trailing_breaks <=> !trailing_breaks.is_empty()
3264            let mut trailing_ws_start: Option<usize> = None;
3265            let mut has_leading_break = false;
3266            let mut has_trailing_breaks = false;
3267
3268            // For the borrowed path: track the (byte) start of a pending whitespace run.
3269            let mut pending_ws_start: Option<usize> = None;
3270
3271            // Consume blank characters.
3272            while self.input.next_is_blank() || self.input.next_is_break() {
3273                if self.input.next_is_blank() {
3274                    // Consume a space or a tab character.
3275                    if leading_blanks {
3276                        if self.input.peek() == '\t' && (self.mark.col as isize) < self.indent {
3277                            return Err(ScanError::new_str(
3278                                self.mark,
3279                                "tab cannot be used as indentation",
3280                            ));
3281                        }
3282                        self.skip_blank();
3283                    } else {
3284                        // Append to output immediately; if a break appears next, we'll truncate.
3285                        match buf {
3286                            FlowScalarBuf::Owned(ref mut string) => {
3287                                if trailing_ws_start.is_none() {
3288                                    trailing_ws_start = Some(string.len());
3289                                }
3290                                string.push(self.input.peek());
3291                            }
3292                            FlowScalarBuf::Borrowed { .. } => {
3293                                if pending_ws_start.is_none() {
3294                                    pending_ws_start = self.input.byte_offset();
3295                                }
3296                            }
3297                        }
3298                        self.skip_blank();
3299
3300                        if let (FlowScalarBuf::Borrowed { .. }, Some(ws_start), Some(ws_end)) =
3301                            (&mut buf, pending_ws_start, self.input.byte_offset())
3302                        {
3303                            buf.note_pending_ws(ws_start, ws_end);
3304                        }
3305                    }
3306                } else {
3307                    self.input.lookahead(2);
3308
3309                    // Check if it is a first line break.
3310                    if leading_blanks {
3311                        // Second+ line break in a run: preserve it.
3312                        match buf {
3313                            FlowScalarBuf::Owned(ref mut string) => self.read_break(string),
3314                            FlowScalarBuf::Borrowed { .. } => {
3315                                self.promote_flow_scalar_buf_to_owned(&start_mark, &mut buf)?;
3316                                let Some(string) = buf.as_owned_mut() else {
3317                                    unreachable!()
3318                                };
3319                                self.read_break(string);
3320                            }
3321                        }
3322                        has_trailing_breaks = true;
3323                    } else {
3324                        // First break: drop any trailing blanks we appended, then consume the break.
3325                        if let Some(pos) = trailing_ws_start.take() {
3326                            if let FlowScalarBuf::Owned(ref mut string) = buf {
3327                                string.truncate(pos);
3328                            }
3329                        }
3330
3331                        if pending_ws_start.take().is_some() {
3332                            // Trailing blanks before a break are discarded => transformation.
3333                            if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
3334                                self.promote_flow_scalar_buf_to_owned(&start_mark, &mut buf)?;
3335                            }
3336                            buf.discard_pending_ws();
3337                        } else {
3338                            buf.commit_pending_ws();
3339                        }
3340
3341                        break_scratch.clear();
3342                        self.read_break(&mut break_scratch);
3343                        // Keep `break_scratch` content (ignored) until next clear; no need to clear twice.
3344
3345                        has_leading_break = true;
3346                        leading_blanks = true;
3347                    }
3348                }
3349
3350                self.input.lookahead(1);
3351            }
3352
3353            // If we had a line break inside a quoted (flow) scalar, validate indentation
3354            // of the continuation line in block context.
3355            if leading_blanks && has_leading_break && self.flow_level == 0 {
3356                let next_ch = self.input.peek();
3357                let is_closing_quote = (single && next_ch == '\'') || (!single && next_ch == '"');
3358                if !is_closing_quote && (self.mark.col as isize) <= self.indent {
3359                    return Err(ScanError::new_str(
3360                        self.mark,
3361                        "invalid indentation in multiline quoted scalar",
3362                    ));
3363                }
3364            }
3365
3366            // Join the whitespace or fold line breaks.
3367            if leading_blanks {
3368                // Folding rule:
3369                //   if there was no leading break, preserve the pending whitespace already emitted
3370                //   if there was a leading break but no trailing breaks, fold to one space
3371                //   otherwise, preserve the trailing breaks already emitted
3372                if has_leading_break && !has_trailing_breaks {
3373                    match buf {
3374                        FlowScalarBuf::Owned(ref mut string) => string.push(' '),
3375                        FlowScalarBuf::Borrowed { .. } => {
3376                            self.promote_flow_scalar_buf_to_owned(&start_mark, &mut buf)?;
3377                            let Some(string) = buf.as_owned_mut() else {
3378                                unreachable!()
3379                            };
3380                            string.push(' ');
3381                        }
3382                    }
3383                }
3384            }
3385            // else: trailing blanks are already appended to `string`
3386        } // loop
3387
3388        // Eat the right quote.
3389        self.skip_non_blank();
3390        let end_mark = self.mark;
3391
3392        // Ensure there is no invalid trailing content.
3393        self.skip_ws_to_eol(SkipTabs::Yes)?;
3394        match self.input.peek() {
3395            // These can be encountered in flow sequences or mappings.
3396            ',' | '}' | ']' if self.flow_level > 0 => {}
3397            // An end-of-line / end-of-stream is fine. No trailing content.
3398            c if is_breakz(c) => {}
3399            // ':' can be encountered if our scalar is a key.
3400            // Outside of flow contexts, keys cannot span multiple lines
3401            ':' if self.flow_level == 0 && start_mark.line == self.mark.line => {}
3402            // Inside a flow context, this is allowed.
3403            ':' if self.flow_level > 0 => {}
3404            _ => {
3405                return Err(ScanError::new_str(
3406                    self.mark,
3407                    "invalid trailing content after double-quoted scalar",
3408                ));
3409            }
3410        }
3411
3412        let style = if single {
3413            ScalarStyle::SingleQuoted
3414        } else {
3415            ScalarStyle::DoubleQuoted
3416        };
3417
3418        let contents = match buf {
3419            FlowScalarBuf::Owned(string) => Cow::Owned(string),
3420            FlowScalarBuf::Borrowed {
3421                start,
3422                mut end,
3423                pending_ws_start,
3424                pending_ws_end,
3425            } => {
3426                // If we ended after a whitespace run, it is part of the output (no break followed).
3427                if pending_ws_start.is_some() {
3428                    end = pending_ws_end;
3429                }
3430                if let Some(slice) = self.try_borrow_slice(start, end) {
3431                    Cow::Borrowed(slice)
3432                } else {
3433                    let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
3434                        ScanError::new_str(
3435                            start_mark,
3436                            "internal error: input advertised offsets but did not provide a slice",
3437                        )
3438                    })?;
3439                    Cow::Owned(slice.to_owned())
3440                }
3441            }
3442        };
3443
3444        Ok(Token(
3445            Span::new(start_mark, end_mark),
3446            TokenType::Scalar(style, contents),
3447        ))
3448    }
3449
3450    /// Consume successive non-whitespace characters from a flow scalar.
3451    ///
3452    /// This function resolves escape sequences and stops upon encountering a whitespace, the end
3453    /// of the stream or the closing character for the scalar (`'` for single quoted scalars, `"`
3454    /// for double quoted scalars).
3455    ///
3456    /// # Errors
3457    /// Return an error if an invalid escape sequence is found.
3458    fn consume_flow_scalar_non_whitespace_chars(
3459        &mut self,
3460        single: bool,
3461        buf: &mut FlowScalarBuf,
3462        leading_blanks: &mut bool,
3463        start_mark: &Marker,
3464    ) -> Result<(), ScanError> {
3465        self.input.lookahead(2);
3466        while !is_blank_or_breakz(self.input.peek()) {
3467            match self.input.peek() {
3468                // Check for an escaped single quote.
3469                '\'' if self.input.peek_nth(1) == '\'' && single => {
3470                    if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
3471                        buf.commit_pending_ws();
3472                        self.promote_flow_scalar_buf_to_owned(start_mark, buf)?;
3473                    }
3474                    let Some(string) = buf.as_owned_mut() else {
3475                        unreachable!()
3476                    };
3477                    string.push('\'');
3478                    self.skip_n_non_blank(2);
3479                }
3480                // Check for the right quote.
3481                '\'' if single => break,
3482                '"' if !single => break,
3483                // Check for an escaped line break.
3484                '\\' if !single && is_break(self.input.peek_nth(1)) => {
3485                    self.input.lookahead(3);
3486                    if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
3487                        buf.commit_pending_ws();
3488                        self.promote_flow_scalar_buf_to_owned(start_mark, buf)?;
3489                    }
3490                    self.skip_non_blank();
3491                    self.skip_linebreak();
3492                    *leading_blanks = true;
3493                    break;
3494                }
3495                // Check for an escape sequence.
3496                '\\' if !single => {
3497                    if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
3498                        buf.commit_pending_ws();
3499                        self.promote_flow_scalar_buf_to_owned(start_mark, buf)?;
3500                    }
3501                    let Some(string) = buf.as_owned_mut() else {
3502                        unreachable!()
3503                    };
3504                    string.push(self.resolve_flow_scalar_escape_sequence(start_mark)?);
3505                }
3506                c => {
3507                    match buf {
3508                        FlowScalarBuf::Owned(ref mut string) => {
3509                            string.push(c);
3510                        }
3511                        FlowScalarBuf::Borrowed { .. } => {
3512                            buf.commit_pending_ws();
3513                        }
3514                    }
3515                    self.skip_non_blank();
3516
3517                    if let Some(new_end) = self.input.byte_offset() {
3518                        if let FlowScalarBuf::Borrowed { end, .. } = buf {
3519                            *end = new_end;
3520                        }
3521                    }
3522                }
3523            }
3524            self.input.lookahead(2);
3525        }
3526        Ok(())
3527    }
3528
3529    /// Escape the sequence we encounter in a flow scalar.
3530    ///
3531    /// `self.input.peek()` must point to the `\` starting the escape sequence.
3532    ///
3533    /// # Errors
3534    /// Return an error if an invalid escape sequence is found.
3535    fn resolve_flow_scalar_escape_sequence(
3536        &mut self,
3537        start_mark: &Marker,
3538    ) -> Result<char, ScanError> {
3539        let mut code_length = 0usize;
3540        let mut ret = '\0';
3541
3542        match self.input.peek_nth(1) {
3543            '0' => ret = '\0',
3544            'a' => ret = '\x07',
3545            'b' => ret = '\x08',
3546            't' | '\t' => ret = '\t',
3547            'n' => ret = '\n',
3548            'v' => ret = '\x0b',
3549            'f' => ret = '\x0c',
3550            'r' => ret = '\x0d',
3551            'e' => ret = '\x1b',
3552            ' ' => ret = '\x20',
3553            '"' => ret = '"',
3554            '/' => ret = '/',
3555            '\\' => ret = '\\',
3556            // Unicode next line (#x85)
3557            'N' => ret = char::from_u32(0x85).unwrap(),
3558            // Unicode non-breaking space (#xA0)
3559            '_' => ret = char::from_u32(0xA0).unwrap(),
3560            // Unicode line separator (#x2028)
3561            'L' => ret = char::from_u32(0x2028).unwrap(),
3562            // Unicode paragraph separator (#x2029)
3563            'P' => ret = char::from_u32(0x2029).unwrap(),
3564            'x' => code_length = 2,
3565            'u' => code_length = 4,
3566            'U' => code_length = 8,
3567            _ => {
3568                return Err(ScanError::new_str(
3569                    *start_mark,
3570                    "while parsing a quoted scalar, found unknown escape character",
3571                ))
3572            }
3573        }
3574        self.skip_n_non_blank(2);
3575
3576        // Consume an arbitrary escape code.
3577        if code_length > 0 {
3578            self.input.lookahead(code_length);
3579            let mut value = 0u32;
3580            for i in 0..code_length {
3581                let c = self.input.peek_nth(i);
3582                if !is_hex(c) {
3583                    return Err(ScanError::new_str(
3584                        *start_mark,
3585                        "while parsing a quoted scalar, did not find expected hexadecimal number",
3586                    ));
3587                }
3588                value = (value << 4) + as_hex(c);
3589            }
3590
3591            self.skip_n_non_blank(code_length);
3592
3593            // Handle JSON surrogate pairs: high surrogate followed by low surrogate
3594            if code_length == 4 && (0xD800..=0xDBFF).contains(&value) {
3595                self.input.lookahead(2);
3596                if self.input.peek() == '\\' && self.input.peek_nth(1) == 'u' {
3597                    self.skip_n_non_blank(2);
3598                    self.input.lookahead(4);
3599                    let mut low_value = 0u32;
3600                    for i in 0..4 {
3601                        let c = self.input.peek_nth(i);
3602                        if !is_hex(c) {
3603                            return Err(ScanError::new_str(
3604                                *start_mark,
3605                                "while parsing a quoted scalar, did not find expected hexadecimal number for low surrogate",
3606                            ));
3607                        }
3608                        low_value = (low_value << 4) + as_hex(c);
3609                    }
3610                    if (0xDC00..=0xDFFF).contains(&low_value) {
3611                        value = 0x10000 + (((value - 0xD800) << 10) | (low_value - 0xDC00));
3612                        self.skip_n_non_blank(4);
3613                    } else {
3614                        return Err(ScanError::new_str(
3615                            *start_mark,
3616                            "while parsing a quoted scalar, found invalid low surrogate",
3617                        ));
3618                    }
3619                } else {
3620                    return Err(ScanError::new_str(
3621                        *start_mark,
3622                        "while parsing a quoted scalar, found high surrogate without following low surrogate",
3623                    ));
3624                }
3625            } else if code_length == 4 && (0xDC00..=0xDFFF).contains(&value) {
3626                return Err(ScanError::new_str(
3627                    *start_mark,
3628                    "while parsing a quoted scalar, found unpaired low surrogate",
3629                ));
3630            }
3631
3632            let Some(ch) = char::from_u32(value) else {
3633                return Err(ScanError::new_str(
3634                    *start_mark,
3635                    "while parsing a quoted scalar, found invalid Unicode character escape code",
3636                ));
3637            };
3638            ret = ch;
3639        }
3640        Ok(ret)
3641    }
3642
3643    fn fetch_plain_scalar(&mut self) -> ScanResult {
3644        self.save_simple_key();
3645        self.disallow_simple_key();
3646
3647        let token_index = self.tokens.len();
3648        let tok = self.scan_plain_scalar()?;
3649
3650        self.insert_token(token_index, tok);
3651        Ok(())
3652    }
3653
    /// Scan for a plain scalar.
    ///
    /// Plain scalars are the most readable but restricted style. They may span multiple lines in
    /// some contexts.
    ///
    /// Scanning stops at a document indicator in column 0, at a `#`, at a character that is
    /// neither blank nor a valid plain scalar character, or, in block context, when a line is
    /// indented less than required. Line breaks inside the scalar are folded: a single break
    /// becomes a space, and when several breaks follow each other only the ones after the first
    /// are kept.
    ///
    /// The returned token spans from the first character to the end of the last content
    /// character. Its contents borrow from the input when the corresponding input slice is
    /// available and equal to the folded text; otherwise they are owned.
    ///
    /// # Errors
    /// Return an error if:
    ///   - the scalar starts at an invalid indentation inside a flow construct,
    ///   - inside a flow construct, a `-` is directly followed by a flow indicator,
    ///   - a tab is found in the indentation of a non-empty continuation line,
    ///   - no character could be consumed at all.
    #[allow(clippy::too_many_lines)]
    fn scan_plain_scalar(&mut self) -> Result<Token<'input>, ScanError> {
        self.unroll_non_block_indents();
        // Minimum column for content: one past the current indentation level. It is checked
        // against the start column in flow context and against continuation lines in block
        // context.
        let indent = self.indent + 1;
        let start_mark = self.mark;

        if self.flow_level > 0 && (start_mark.col as isize) < indent {
            return Err(ScanError::new_str(
                start_mark,
                "invalid indentation in flow construct",
            ));
        }

        // `string` accumulates the folded scalar text. The three scanner-owned buffers hold
        // whitespace and line breaks that are pending until we know whether content follows.
        let mut string = String::with_capacity(32);
        self.buf_whitespaces.clear();
        self.buf_leading_break.clear();
        self.buf_trailing_breaks.clear();
        // Position right after the last content character consumed so far.
        let mut end_mark = self.mark;

        loop {
            self.input.lookahead(4);
            // A document indicator at the start of a line or a `#` ends the scalar.
            if (self.mark.col == 0 && self.input.next_is_document_indicator())
                || self.input.peek() == '#'
            {
                // BS4K: If a `#` starts a comment after some separation spaces following content
                // of a plain scalar in block context, and there is potential continuation on the
                // next line, this is invalid. We cannot decide yet if there will be continuation,
                // so record that a comment interrupted a plain scalar.
                if self.input.peek() == '#'
                    && !string.is_empty()
                    && !self.buf_whitespaces.is_empty()
                    && self.flow_level == 0
                {
                    self.interrupted_plain_by_comment = Some(self.mark);
                }
                break;
            }

            if self.flow_level > 0 && self.input.peek() == '-' && is_flow(self.input.peek_nth(1)) {
                return Err(ScanError::new_str(
                    self.mark,
                    "plain scalar cannot start with '-' followed by ,[]{}",
                ));
            }

            if !self.input.next_is_blank_or_breakz()
                && self.input.next_can_be_plain_scalar(self.flow_level > 0)
            {
                // Content follows: flush whatever whitespace / line breaks were pending.
                if self.leading_whitespace {
                    if self.buf_leading_break.is_empty() {
                        // No leading break was recorded (so pushing it adds nothing): emit the
                        // trailing breaks as they are.
                        string.push_str(&self.buf_leading_break);
                        string.push_str(&self.buf_trailing_breaks);
                        self.buf_trailing_breaks.clear();
                        self.buf_leading_break.clear();
                    } else {
                        // A leading break was recorded: it folds into a single space unless more
                        // breaks followed, in which case only those are emitted.
                        if self.buf_trailing_breaks.is_empty() {
                            string.push(' ');
                        } else {
                            string.push_str(&self.buf_trailing_breaks);
                            self.buf_trailing_breaks.clear();
                        }
                        self.buf_leading_break.clear();
                    }
                    self.leading_whitespace = false;
                } else if !self.buf_whitespaces.is_empty() {
                    // Blanks between two words on the same line are kept verbatim.
                    string.push_str(&self.buf_whitespaces);
                    self.buf_whitespaces.clear();
                }

                // We can unroll the first iteration of the loop.
                string.push(self.input.peek());
                self.skip_non_blank();
                string.reserve(self.input.bufmaxlen());

                // Add content non-blank characters to the scalar.
                let mut end = false;
                while !end {
                    // Fill the buffer once and process all characters in the buffer until the next
                    // fetch. Note that `next_can_be_plain_scalar` needs 2 lookahead characters,
                    // hence the `for` loop looping `self.input.bufmaxlen() - 1` times.
                    self.input.lookahead(self.input.bufmaxlen());
                    let (stop, chars_consumed) = self.input.fetch_plain_scalar_chunk(
                        &mut string,
                        self.input.bufmaxlen() - 1,
                        self.flow_level > 0,
                    );
                    end = stop;
                    // The chunk helper reports how many characters it consumed; reflect them in
                    // the mark here since no `skip_*` method was involved.
                    self.mark.offsets.chars += chars_consumed;
                    self.mark.col += chars_consumed;
                    self.mark.offsets.bytes = self.input.byte_offset();
                }
                end_mark = self.mark;
            }

            // We may reach the end of a plain scalar if:
            //  - We reach eof
            //  - We reach ": "
            //  - We find a flow character in a flow context
            if !(self.input.next_is_blank() || self.input.next_is_break()) {
                break;
            }

            // Process blank characters.
            self.input.lookahead(2);
            while self.input.next_is_blank_or_break() {
                if self.input.next_is_blank() {
                    if !self.leading_whitespace {
                        // Blank after content on the same line: keep it pending, it is only
                        // emitted if more content follows on this line.
                        self.buf_whitespaces.push(self.input.peek());
                        self.skip_blank();
                    } else if (self.mark.col as isize) < indent && self.input.peek() == '\t' {
                        // Tabs in an indentation columns are allowed if and only if the line is
                        // empty. Skip to the end of the line.
                        self.skip_ws_to_eol(SkipTabs::Yes)?;
                        if !self.input.next_is_breakz() {
                            return Err(ScanError::new_str(
                                start_mark,
                                "while scanning a plain scalar, found a tab",
                            ));
                        }
                    } else {
                        // Leading blank of a continuation line: dropped.
                        self.skip_blank();
                    }
                } else {
                    // Check if it is a first line break
                    if self.leading_whitespace {
                        // Not the first break of the run: remember it so it can be preserved.
                        self.skip_break();
                        self.buf_trailing_breaks.push('\n');
                    } else {
                        // First break of the run: blanks pending before it are discarded.
                        self.buf_whitespaces.clear();
                        self.skip_break();
                        self.buf_leading_break.push('\n');
                        self.leading_whitespace = true;
                    }
                }
                self.input.lookahead(2);
            }

            // check indentation level
            // (in block context, a line indented less than `indent` ends the scalar)
            if self.flow_level == 0 && (self.mark.col as isize) < indent {
                break;
            }
        }

        // The scalar ended after at least one line break: a simple key may start here.
        if self.leading_whitespace {
            self.allow_simple_key();
        }

        if string.is_empty() {
            // `fetch_plain_scalar` must absolutely consume at least one byte. Otherwise,
            // `fetch_next_token` will never stop calling it. An empty plain scalar may happen with
            // erroneous inputs such as "{...".
            Err(ScanError::new_str(
                start_mark,
                "unexpected end of plain scalar",
            ))
        } else {
            // Borrow from the input only when both marks carry byte offsets and the raw input
            // slice is identical to the folded text (i.e. nothing was folded or dropped).
            let contents = if let (Some(start), Some(end)) =
                (start_mark.byte_offset(), end_mark.byte_offset())
            {
                match self.try_borrow_slice(start, end) {
                    Some(slice) if slice == string => Cow::Borrowed(slice),
                    _ => Cow::Owned(string),
                }
            } else {
                Cow::Owned(string)
            };

            Ok(Token(
                Span::new(start_mark, end_mark),
                TokenType::Scalar(ScalarStyle::Plain, contents),
            ))
        }
    }
3831
3832    fn fetch_key(&mut self) -> ScanResult {
3833        let start_mark = self.mark;
3834        if self.flow_level == 0 {
3835            // Check if we are allowed to start a new key (not necessarily simple).
3836            if !self.simple_key_allowed {
3837                return Err(ScanError::new_str(
3838                    self.mark,
3839                    "mapping keys are not allowed in this context",
3840                ));
3841            }
3842            self.roll_indent(
3843                start_mark.col,
3844                None,
3845                TokenType::BlockMappingStart,
3846                start_mark,
3847            );
3848        } else {
3849            // The scanner, upon emitting a `Key`, will prepend a `MappingStart` event.
3850            self.set_current_flow_mapping_started(true);
3851        }
3852
3853        self.remove_simple_key()?;
3854
3855        if self.flow_level == 0 {
3856            self.allow_simple_key();
3857        } else {
3858            self.disallow_simple_key();
3859        }
3860
3861        self.skip_non_blank();
3862        let token_index = self.tokens.len();
3863        self.explicit_key_tab_check_pending = false;
3864        let stopped_after_comment = self.skip_yaml_whitespace(true)?;
3865        if self.input.peek() == '\t' {
3866            return Err(ScanError::new_str(
3867                self.mark(),
3868                "tabs disallowed in this context",
3869            ));
3870        }
3871        self.explicit_key_tab_check_pending = stopped_after_comment;
3872        self.insert_token(
3873            token_index,
3874            Token(Span::new(start_mark, self.mark), TokenType::Key),
3875        );
3876        Ok(())
3877    }
3878
3879    /// Fetch a value in a mapping inside of a flow collection.
3880    ///
3881    /// This must not be called if [`self.flow_level`] is 0. This ensures the rules surrounding
3882    /// values in flow collections are respected prior to calling [`fetch_value`].
3883    ///
3884    /// [`self.flow_level`]: Self::flow_level
3885    /// [`fetch_value`]: Self::fetch_value
3886    fn fetch_flow_value(&mut self) -> ScanResult {
3887        let nc = self.input.peek_nth(1);
3888
3889        // If we encounter a ':' inside a flow collection and it is not immediately
3890        // followed by a blank or breakz:
3891        //   - We must check whether an adjacent value is allowed
3892        //     `["a":[]]` is valid. If the key is double-quoted, no need for a space. This
3893        //     is needed for JSON compatibility.
3894        //   - If not, we must ensure there is a space after the ':' and before its value.
3895        //     `[a: []]` is valid while `[a:[]]` isn't. `[a:b]` is treated as `["a:b"]`.
3896        //   - But if the value is empty (null), then it's okay.
3897        // The last line is for YAMLs like `[a:]`. The ':' is followed by a ']' (which is a
3898        // flow character), but the ']' is not the value. The value is an invisible empty
3899        // space which is represented as null ('~').
3900        if self.mark.index() != self.adjacent_value_allowed_at && (nc == '[' || nc == '{') {
3901            return Err(ScanError::new_str(
3902                self.mark,
3903                "':' may not precede any of `[{` in flow mapping",
3904            ));
3905        }
3906
3907        self.fetch_value()
3908    }
3909
    /// Fetch a value from a mapping (after a `:`).
    ///
    /// Consumes the `:` indicator and queues a [`TokenType::Value`] token. When the last entry
    /// of `simple_keys` is still a possible simple key, the matching [`TokenType::Key`] token
    /// (plus a `FlowMappingStart` or `BlockMappingStart` token where applicable) is inserted
    /// retroactively at the queue position computed for that key.
    ///
    /// # Errors
    /// Returns a `ScanError` when:
    ///   - the `:` is followed by tab(s) without valid YAML whitespace and then by `-` or a
    ///     character accepted by `next_is_alpha`,
    ///   - the `:` of an implicit flow mapping sits on a later line than its simple key,
    ///   - the `:` follows a complex key in block context while simple keys are not allowed,
    ///   - `skip_ws_to_eol` or `simple_key_token_index` report an error.
    ///
    /// # Panics
    /// Panics if `simple_keys` is empty.
    fn fetch_value(&mut self) -> ScanResult {
        // Snapshot the last simple-key candidate and the position of the ':' before any state
        // below is mutated.
        let sk = self.simple_keys.last().unwrap().clone();
        let start_mark = self.mark;
        // True when we are directly inside a `[` flow sequence, no flow mapping has been marked
        // as started for it, and there is an implicit-mapping state to update.
        let is_implicit_flow_mapping = self.current_flow_collection_is_sequence()
            && !self.current_flow_mapping_started()
            && !self.implicit_flow_mapping_states.is_empty();
        if is_implicit_flow_mapping {
            // Record the flow level at which the implicit mapping was entered.
            *self.implicit_flow_mapping_states.last_mut().unwrap() =
                ImplicitMappingState::Inside(self.flow_level);
        }

        // Skip over ':'.
        self.skip_non_blank();
        // Error detection: if ':' is followed by tab(s) without any space, and then what looks
        // like a value, emit a helpful error. The check for '-' or alphanumeric is an intentional
        // heuristic that catches common cases (e.g., `key:\tvalue`, `key:\t-item`) without
        // rejecting valid YAML like `key:\t|` (block scalar) or `key:\t"quoted"`.
        // Note: This heuristic won't catch Unicode value starters like `key:\täöü`, but such
        // cases will still fail to parse correctly (just with a less specific error message).
        //
        // Any token queued while skipping that whitespace is detached here and re-appended after
        // the VALUE token at the end of this function.
        let mut trailing_tokens = VecDeque::new();
        if self.input.look_ch() == '\t' {
            let trailing_token_index = self.tokens.len();
            let whitespace = self.skip_ws_to_eol(SkipTabs::Yes)?;
            trailing_tokens = self.tokens.split_off(trailing_token_index);

            if !whitespace.has_valid_yaml_ws()
                && (self.input.peek() == '-' || self.input.next_is_alpha())
            {
                return Err(ScanError::new_str(
                    self.mark,
                    "':' must be followed by a valid YAML whitespace",
                ));
            }
        }

        if sk.possible {
            // The ':' completes a simple key: retroactively insert the KEY token.
            let token_index = self.simple_key_token_index(&sk, start_mark)?;
            // insert simple key
            let tok = Token(Span::empty(sk.mark), TokenType::Key);
            self.insert_token(token_index, tok);
            if is_implicit_flow_mapping {
                // The key of an implicit flow mapping may not start on an earlier line than
                // its ':'.
                if sk.mark.line < start_mark.line {
                    return Err(ScanError::new_str(
                        start_mark,
                        "illegal placement of ':' indicator",
                    ));
                }
                // Inserted at the same index as the KEY token, so it ends up just before it.
                self.insert_token(
                    token_index,
                    Token(Span::empty(sk.mark), TokenType::FlowMappingStart),
                );
            }

            // Add the BLOCK-MAPPING-START token if needed.
            self.roll_indent(
                sk.mark.col,
                Some(sk.token_number),
                TokenType::BlockMappingStart,
                sk.mark,
            );
            self.roll_one_col_indent();

            // The candidate has been consumed; another simple key cannot directly follow.
            self.simple_keys.last_mut().unwrap().possible = false;
            self.disallow_simple_key();
        } else {
            if is_implicit_flow_mapping {
                self.tokens
                    .push_back(Token(Span::empty(start_mark), TokenType::FlowMappingStart).into());
            }
            // The ':' indicator follows a complex key.
            if self.flow_level == 0 {
                if !self.simple_key_allowed {
                    return Err(ScanError::new_str(
                        start_mark,
                        "mapping values are not allowed in this context",
                    ));
                }

                self.roll_indent(
                    start_mark.col,
                    None,
                    TokenType::BlockMappingStart,
                    start_mark,
                );
            }
            self.roll_one_col_indent();

            // In block context a simple key may follow the ':'; in flow context it may not.
            if self.flow_level == 0 {
                self.allow_simple_key();
            } else {
                self.disallow_simple_key();
            }
        }
        self.tokens
            .push_back(Token(Span::empty(start_mark), TokenType::Value).into());
        // Re-attach whatever was queued while skipping the whitespace after ':'.
        self.tokens.append(&mut trailing_tokens);

        Ok(())
    }
4010
4011    /// Add an indentation level to the stack with the given block token, if needed.
4012    ///
4013    /// An indentation level is added only if:
4014    ///   - We are not in a flow-style construct (which don't have indentation per-se).
4015    ///   - The current column is further indented than the last indent we have registered.
4016    fn roll_indent(
4017        &mut self,
4018        col: usize,
4019        number: Option<usize>,
4020        tok: TokenType<'input>,
4021        mark: Marker,
4022    ) {
4023        if self.flow_level > 0 {
4024            return;
4025        }
4026
4027        // If the last indent was a non-block indent, remove it.
4028        // This means that we prepared an indent that we thought we wouldn't use, but realized just
4029        // now that it is a block indent.
4030        if self.indent <= col as isize {
4031            if let Some(indent) = self.indents.last() {
4032                if !indent.needs_block_end {
4033                    self.indent = indent.indent;
4034                    self.indents.pop();
4035                }
4036            }
4037        }
4038
4039        if self.indent < col as isize {
4040            self.indents.push(Indent {
4041                indent: self.indent,
4042                needs_block_end: true,
4043            });
4044            self.indent = col as isize;
4045            let tokens_parsed = self.tokens_parsed;
4046            match number {
4047                Some(n) => self.insert_token(n - tokens_parsed, Token(Span::empty(mark), tok)),
4048                None => self.tokens.push_back(Token(Span::empty(mark), tok).into()),
4049            }
4050        }
4051    }
4052
4053    /// Pop indentation levels from the stack as much as needed.
4054    ///
4055    /// Indentation levels are popped from the stack while they are further indented than `col`.
4056    /// If we are in a flow-style construct (which don't have indentation per-se), this function
4057    /// does nothing.
4058    fn unroll_indent(&mut self, col: isize) {
4059        if self.flow_level > 0 {
4060            return;
4061        }
4062        while self.indent > col {
4063            let indent = self.indents.pop().unwrap();
4064            self.indent = indent.indent;
4065            if indent.needs_block_end {
4066                self.tokens
4067                    .push_back(Token(Span::empty(self.mark), TokenType::BlockEnd).into());
4068            }
4069        }
4070    }
4071
4072    /// Add an indentation level of 1 column that does not start a block.
4073    ///
4074    /// See the documentation of [`Indent::needs_block_end`] for more details.
4075    /// An indentation is not added if we are inside a flow level or if the last indent is already
4076    /// a non-block indent.
4077    fn roll_one_col_indent(&mut self) {
4078        if self.flow_level == 0 && self.indents.last().is_some_and(|x| x.needs_block_end) {
4079            self.indents.push(Indent {
4080                indent: self.indent,
4081                needs_block_end: false,
4082            });
4083            self.indent += 1;
4084        }
4085    }
4086
4087    /// Unroll all last indents created with [`Self::roll_one_col_indent`].
4088    fn unroll_non_block_indents(&mut self) {
4089        while let Some(indent) = self.indents.last() {
4090            if indent.needs_block_end {
4091                break;
4092            }
4093            self.indent = indent.indent;
4094            self.indents.pop();
4095        }
4096    }
4097
4098    /// Mark the next token to be inserted as a potential simple key.
4099    fn save_simple_key(&mut self) {
4100        if self.simple_key_allowed {
4101            let required = self.flow_level == 0
4102                && self.indent == (self.mark.col as isize)
4103                && self.indents.last().unwrap().needs_block_end;
4104
4105            if let Some(last) = self.simple_keys.last_mut() {
4106                *last = SimpleKey {
4107                    mark: self.mark,
4108                    possible: true,
4109                    required,
4110                    token_number: self.tokens_parsed + self.tokens.len(),
4111                };
4112            }
4113        }
4114    }
4115
4116    fn remove_simple_key(&mut self) -> ScanResult {
4117        let last = self.simple_keys.last_mut().unwrap();
4118        if last.possible && last.required {
4119            return Err(self.simple_key_expected());
4120        }
4121
4122        last.possible = false;
4123        Ok(())
4124    }
4125
4126    /// Return whether the scanner is inside a block but outside of a flow sequence.
4127    fn is_within_block(&self) -> bool {
4128        !self.indents.is_empty()
4129    }
4130
4131    /// If an implicit mapping had started, end it.
4132    ///
4133    /// This function does not pop the state in [`implicit_flow_mapping_states`].
4134    ///
4135    /// [`implicit_flow_mapping_states`]: Self::implicit_flow_mapping_states
4136    fn end_implicit_mapping(&mut self, mark: Marker, flow_level: u8) {
4137        if self
4138            .implicit_flow_mapping_states
4139            .last()
4140            .is_some_and(|state| *state == ImplicitMappingState::Inside(flow_level))
4141        {
4142            *self.implicit_flow_mapping_states.last_mut().unwrap() = ImplicitMappingState::Possible;
4143            self.set_current_flow_mapping_started(false);
4144            self.tokens
4145                .push_back(Token(Span::empty(mark), TokenType::FlowMappingEnd).into());
4146        }
4147    }
4148
4149    fn current_flow_collection_is_sequence(&self) -> bool {
4150        self.flow_markers
4151            .last()
4152            .is_some_and(|(_, bracket)| *bracket == '[')
4153    }
4154
4155    fn current_flow_mapping_started(&self) -> bool {
4156        self.flow_mapping_started.last().copied().unwrap_or(false)
4157    }
4158
4159    fn set_current_flow_mapping_started(&mut self, started: bool) {
4160        if let Some(current) = self.flow_mapping_started.last_mut() {
4161            *current = started;
4162        }
4163    }
4164}
4165
/// Chomping, how final line breaks and trailing empty lines are interpreted.
///
/// See YAML spec 8.1.1.2.
///
/// The enum is a plain, field-less tag, so it is `Copy` and can be printed with `{:?}`.
#[derive(Clone, Copy, PartialEq, Debug, Eq)]
pub enum Chomping {
    /// The final line break and any trailing empty lines are excluded.
    Strip,
    /// The final line break is preserved, but trailing empty lines are excluded.
    Clip,
    /// The final line break and trailing empty lines are included.
    Keep,
}
4178
4179#[cfg(test)]
4180mod test {
4181    use alloc::{
4182        borrow::{Cow, ToOwned},
4183        rc::Rc,
4184        string::String,
4185        vec::Vec,
4186    };
4187    use core::cell::Cell;
4188
4189    use crate::{
4190        input::{str::StrInput, BorrowedInput, BufferedInput, Input},
4191        scanner::{
4192            Comment, Marker, Placement, QueuedToken, QueuedTokenType, ScalarStyle, Scanner, Span,
4193            TEncoding, Token, TokenType,
4194        },
4195    };
4196
    /// Character iterator used by the tests to observe how many characters have been pulled
    /// from the underlying source.
    struct CountingChars {
        /// The characters to hand out.
        chars: alloc::vec::IntoIter<char>,
        /// Shared counter, incremented once per character yielded by the `Iterator` impl.
        read: Rc<Cell<usize>>,
    }
4201
4202    impl Iterator for CountingChars {
4203        type Item = char;
4204
4205        fn next(&mut self) -> Option<Self::Item> {
4206            let next = self.chars.next();
4207            if next.is_some() {
4208                self.read.set(self.read.get() + 1);
4209            }
4210            next
4211        }
4212    }
4213
    /// Test input wrapping a `StrInput` that never hands out borrowed slices and whose
    /// `slice_bytes` can be switched off.
    struct SlicingOnlyInput<'input> {
        /// Wrapped input every other operation is forwarded to.
        inner: StrInput<'input>,
        /// Whether `slice_bytes` forwards to `inner` (`true`) or returns `None` (`false`).
        expose_slice: bool,
    }
4218
4219    impl<'input> SlicingOnlyInput<'input> {
4220        fn new(source: &'input str, expose_slice: bool) -> Self {
4221            Self {
4222                inner: StrInput::new(source),
4223                expose_slice,
4224            }
4225        }
4226    }
4227
4228    impl Input for SlicingOnlyInput<'_> {
4229        fn lookahead(&mut self, count: usize) {
4230            self.inner.lookahead(count);
4231        }
4232
4233        fn buflen(&self) -> usize {
4234            self.inner.buflen()
4235        }
4236
4237        fn bufmaxlen(&self) -> usize {
4238            self.inner.bufmaxlen()
4239        }
4240
4241        fn raw_read_ch(&mut self) -> char {
4242            self.inner.raw_read_ch()
4243        }
4244
4245        fn raw_read_non_breakz_ch(&mut self) -> Option<char> {
4246            self.inner.raw_read_non_breakz_ch()
4247        }
4248
4249        fn skip(&mut self) {
4250            self.inner.skip();
4251        }
4252
4253        fn skip_n(&mut self, count: usize) {
4254            self.inner.skip_n(count);
4255        }
4256
4257        fn peek(&self) -> char {
4258            self.inner.peek()
4259        }
4260
4261        fn peek_nth(&self, n: usize) -> char {
4262            self.inner.peek_nth(n)
4263        }
4264
4265        fn byte_offset(&self) -> Option<usize> {
4266            self.inner.byte_offset()
4267        }
4268
4269        fn slice_bytes(&self, start: usize, end: usize) -> Option<&str> {
4270            if self.expose_slice {
4271                self.inner.slice_bytes(start, end)
4272            } else {
4273                None
4274            }
4275        }
4276    }
4277
    impl<'input> BorrowedInput<'input> for SlicingOnlyInput<'input> {
        /// Never hands out a borrowed slice, whatever range is requested.
        fn slice_borrowed(&self, _start: usize, _end: usize) -> Option<&'input str> {
            None
        }
    }
4283
4284    #[test]
4285    fn test_is_anchor_char() {
4286        use super::is_anchor_char;
4287        assert!(is_anchor_char('x'));
4288    }
4289
4290    #[test]
4291    fn flow_simple_key_length_limit_bounds_buffering() {
4292        let mut yaml = String::from("[\n\"start\"\n");
4293        for _ in 0..600 {
4294            yaml.push_str("\"x\"\n");
4295        }
4296        let total_chars = yaml.chars().count();
4297        let read = Rc::new(Cell::new(0));
4298        let chars = yaml.chars().collect::<Vec<_>>().into_iter();
4299        let mut scanner = Scanner::new(BufferedInput::new(CountingChars {
4300            chars,
4301            read: Rc::clone(&read),
4302        }));
4303
4304        assert!(matches!(
4305            scanner.next_token().unwrap().unwrap().1,
4306            TokenType::StreamStart(_)
4307        ));
4308
4309        let token = scanner.next_token().unwrap().unwrap();
4310        assert!(matches!(token.1, TokenType::FlowSequenceStart));
4311
4312        let token = scanner.next_token().unwrap().unwrap();
4313        assert!(matches!(
4314            token.1,
4315            TokenType::Scalar(_, ref value) if value == "start"
4316        ));
4317        assert!(
4318            read.get() < total_chars,
4319            "scanner consumed all {total_chars} chars before yielding the first flow scalar"
4320        );
4321        assert!(
4322            read.get() <= super::SIMPLE_KEY_MAX_LOOKAHEAD + 128,
4323            "scanner read {} chars before yielding the first flow scalar",
4324            read.get()
4325        );
4326    }
4327
4328    #[test]
4329    fn comment_capture_does_not_change_leading_whitespace() {
4330        let mut scanner = Scanner::new(StrInput::new("# comment\n"));
4331
4332        let token = scanner.scan_comment_token().unwrap();
4333
4334        assert!(scanner.leading_whitespace);
4335        assert!(matches!(token.1, TokenType::Comment(ref comment) if comment.text == " comment"));
4336
4337        let mut scanner = Scanner::new(BufferedInput::new("# streaming\n".chars()));
4338        scanner.input.lookahead(1);
4339
4340        let token = scanner.scan_comment_token().unwrap();
4341
4342        assert!(scanner.leading_whitespace);
4343        assert!(matches!(token.1, TokenType::Comment(ref comment) if comment.text == " streaming"));
4344    }
4345
4346    #[test]
4347    fn comment_capture_falls_back_to_owned_slice_when_borrow_unavailable() {
4348        let mut scanner = Scanner::new(SlicingOnlyInput::new("# sliced\n", true));
4349        scanner.input.lookahead(2);
4350        assert_eq!(scanner.input.peek_nth(1), ' ');
4351
4352        let token = scanner.scan_comment_token().unwrap();
4353
4354        assert!(matches!(token.1, TokenType::Comment(ref comment)
4355            if matches!(comment.text, Cow::Owned(ref text) if text == " sliced")));
4356    }
4357
4358    #[test]
4359    fn comment_capture_errors_when_offsets_have_no_slice() {
4360        let mut scanner = Scanner::new(SlicingOnlyInput::new("# broken\n", false));
4361
4362        let error = scanner.scan_comment_token().unwrap_err();
4363
4364        assert_eq!(
4365            error.info(),
4366            "internal error: input advertised offsets but did not provide a slice"
4367        );
4368    }
4369
4370    #[test]
4371    fn queued_token_roundtrips_public_token_variants() {
4372        let span = Span::new(Marker::new(0, 1, 0), Marker::new(7, 1, 7));
4373        let tokens = [
4374            Token(span, TokenType::StreamStart(TEncoding::Utf8)),
4375            Token(span, TokenType::StreamEnd),
4376            Token(span, TokenType::VersionDirective(1, 2)),
4377            Token(
4378                span,
4379                TokenType::TagDirective(Cow::Borrowed("!app!"), Cow::Borrowed("tag:app.example,")),
4380            ),
4381            Token(span, TokenType::DocumentStart),
4382            Token(span, TokenType::DocumentEnd),
4383            Token(span, TokenType::BlockSequenceStart),
4384            Token(span, TokenType::BlockMappingStart),
4385            Token(span, TokenType::BlockEnd),
4386            Token(span, TokenType::FlowSequenceStart),
4387            Token(span, TokenType::FlowSequenceEnd),
4388            Token(span, TokenType::FlowMappingStart),
4389            Token(span, TokenType::FlowMappingEnd),
4390            Token(span, TokenType::BlockEntry),
4391            Token(span, TokenType::FlowEntry),
4392            Token(span, TokenType::Key),
4393            Token(span, TokenType::Value),
4394            Token(span, TokenType::Alias(Cow::Borrowed("alias"))),
4395            Token(span, TokenType::Anchor(Cow::Borrowed("anchor"))),
4396            Token(
4397                span,
4398                TokenType::Tag(Cow::Borrowed("!"), Cow::Borrowed("tag")),
4399            ),
4400            Token(
4401                span,
4402                TokenType::Scalar(ScalarStyle::Literal, Cow::Borrowed("scalar")),
4403            ),
4404            Token(
4405                span,
4406                TokenType::Comment(
4407                    Comment::new(span, Cow::Borrowed(" comment")).with_placement(Placement::Right),
4408                ),
4409            ),
4410            Token(
4411                span,
4412                TokenType::ReservedDirective(
4413                    "reserved".to_owned(),
4414                    vec!["one".to_owned(), "two".to_owned()],
4415                ),
4416            ),
4417        ];
4418
4419        for token in tokens {
4420            let queued: QueuedToken = token.clone().into();
4421
4422            assert_eq!(queued.into_public(), token);
4423        }
4424    }
4425
4426    #[test]
4427    fn comment_skipping_path_consumes_comment_without_tokenizing_it() {
4428        let mut scanner = Scanner::new(StrInput::new("# skipped\nnext: value\n"));
4429
4430        scanner.skip_yaml_whitespace(false).unwrap();
4431
4432        assert!(scanner.tokens.is_empty());
4433        assert_eq!(scanner.mark.line(), 2);
4434        assert_eq!(scanner.mark.col(), 0);
4435    }
4436
4437    #[test]
4438    fn yaml_whitespace_can_stop_after_queued_comment() {
4439        let mut scanner = Scanner::new(StrInput::new(" # queued\n# later\n"));
4440
4441        assert!(scanner.skip_yaml_whitespace(true).unwrap());
4442
4443        assert_eq!(scanner.tokens.len(), 1);
4444        assert!(matches!(
4445            scanner.tokens.front().unwrap().1,
4446            QueuedTokenType::Comment(ref comment) if comment.text == " queued"
4447        ));
4448        assert_eq!(scanner.mark.line(), 1);
4449        assert_eq!(scanner.mark.col(), 9);
4450    }
4451
4452    #[test]
4453    fn token_skip_can_stop_after_queued_comment() {
4454        let mut scanner = Scanner::new(StrInput::new("# first\n# second\n"));
4455
4456        assert!(scanner.skip_to_next_token(true).unwrap());
4457
4458        assert_eq!(scanner.tokens.len(), 1);
4459        assert!(matches!(
4460            scanner.tokens.front().unwrap().1,
4461            QueuedTokenType::Comment(ref comment) if comment.text == " first"
4462        ));
4463        assert_eq!(scanner.mark.line(), 2);
4464        assert_eq!(scanner.mark.col(), 0);
4465    }
4466
4467    #[test]
4468    fn scanner_emits_first_leading_comment_before_scanning_next_comment() {
4469        let mut scanner = Scanner::new(StrInput::new("# first\n# second\nkey: value\n"));
4470
4471        assert!(matches!(
4472            scanner.next_token().unwrap().unwrap().1,
4473            TokenType::StreamStart(_)
4474        ));
4475        assert!(matches!(
4476            scanner.next_token().unwrap().unwrap().1,
4477            TokenType::Comment(ref comment) if comment.text == " first"
4478        ));
4479        assert!(scanner.tokens.is_empty());
4480        assert!(matches!(
4481            scanner.next_token().unwrap().unwrap().1,
4482            TokenType::Comment(ref comment) if comment.text == " second"
4483        ));
4484    }
4485
4486    #[test]
4487    fn scanner_emits_quoted_scalar_comment_before_scanning_following_value() {
4488        let mut scanner = Scanner::new(StrInput::new("\"key\" # quoted\n: value\n"));
4489
4490        assert!(matches!(
4491            scanner.next_token().unwrap().unwrap().1,
4492            TokenType::StreamStart(_)
4493        ));
4494        assert!(matches!(
4495            scanner.next_token().unwrap().unwrap().1,
4496            TokenType::Scalar(ScalarStyle::DoubleQuoted, ref value) if value == "key"
4497        ));
4498        assert!(matches!(
4499            scanner.next_token().unwrap().unwrap().1,
4500            TokenType::Comment(ref comment) if comment.text == " quoted"
4501        ));
4502    }
4503
4504    #[test]
4505    fn flow_scalar_comment_disables_adjacent_value_lookahead() {
4506        let mut scanner = Scanner::new(StrInput::new("\"key\"\n# quoted\n: value\n"));
4507
4508        scanner.fetch_flow_scalar(false).unwrap();
4509
4510        assert_eq!(scanner.adjacent_value_allowed_at, usize::MAX);
4511        assert!(matches!(
4512            scanner.tokens.front().unwrap().1,
4513            QueuedTokenType::Scalar(ScalarStyle::DoubleQuoted, ref value) if value == "key"
4514        ));
4515        assert!(scanner.tokens.iter().any(|QueuedToken(_, token)| matches!(
4516            token,
4517            QueuedTokenType::Comment(comment) if comment.text == " quoted"
4518        )));
4519    }
4520
4521    #[test]
4522    fn deferred_error_waits_for_all_comment_tokens() {
4523        let mut scanner = Scanner::new(StrInput::new("# first\n# second\n@\n"));
4524
4525        assert!(matches!(
4526            scanner.next_token().unwrap().unwrap().1,
4527            TokenType::StreamStart(_)
4528        ));
4529        assert!(matches!(
4530            scanner.next_token().unwrap().unwrap().1,
4531            TokenType::Comment(ref comment) if comment.text == " first"
4532        ));
4533        assert!(matches!(
4534            scanner.next_token().unwrap().unwrap().1,
4535            TokenType::Comment(ref comment) if comment.text == " second"
4536        ));
4537
4538        let error = scanner.next_token().unwrap_err();
4539
4540        assert!(error.info().contains("unexpected character"));
4541    }
4542
4543    /// Ensure anchors scanned from `StrInput` are returned as `Cow::Borrowed`.
4544    #[test]
4545    fn anchor_name_is_borrowed_for_str_input() {
4546        let mut scanner = Scanner::new(StrInput::new("&anch\n"));
4547
4548        loop {
4549            let tok = scanner
4550                .next_token()
4551                .expect("valid YAML must scan without errors")
4552                .expect("scanner must eventually produce a token");
4553            if let TokenType::Anchor(name) = tok.1 {
4554                assert!(matches!(name, Cow::Borrowed("anch")));
4555                break;
4556            }
4557        }
4558    }
4559
4560    /// Ensure aliases scanned from `StrInput` are returned as `Cow::Borrowed`.
4561    #[test]
4562    fn anchor_name_rejects_non_printable_control_chars() {
4563        let mut scanner = Scanner::new(StrInput::new("&foo\u{0001}\n"));
4564
4565        loop {
4566            let tok = scanner
4567                .next_token()
4568                .expect("scanning should not fail")
4569                .expect("scanner must eventually produce a token");
4570            if let TokenType::Anchor(name) = tok.1 {
4571                assert!(matches!(name, Cow::Borrowed("foo")));
4572                let next = scanner.next_token().expect("scanning should not fail");
4573                if let Some(Token(_, TokenType::Scalar(_, rest))) = next {
4574                    assert!(rest.starts_with('\u{0001}'));
4575                }
4576                break;
4577            }
4578        }
4579    }
4580
4581    #[test]
4582    fn alias_name_rejects_non_printable_control_chars() {
4583        let mut scanner = Scanner::new(StrInput::new("*foo\u{0001}\n"));
4584
4585        loop {
4586            let tok = scanner
4587                .next_token()
4588                .expect("scanning should not fail")
4589                .expect("scanner must eventually produce a token");
4590            if let TokenType::Alias(name) = tok.1 {
4591                assert!(matches!(name, Cow::Borrowed("foo")));
4592                let next = scanner.next_token().expect("scanning should not fail");
4593                if let Some(Token(_, TokenType::Scalar(_, rest))) = next {
4594                    assert!(rest.starts_with('\u{0001}'));
4595                }
4596                break;
4597            }
4598        }
4599    }
4600
4601    #[test]
4602    fn alias_name_is_borrowed_for_str_input() {
4603        let mut scanner = Scanner::new(StrInput::new("*anch\n"));
4604
4605        loop {
4606            let tok = scanner
4607                .next_token()
4608                .expect("valid YAML must scan without errors")
4609                .expect("scanner must eventually produce a token");
4610            if let TokenType::Alias(name) = tok.1 {
4611                assert!(matches!(name, Cow::Borrowed("anch")));
4612                break;
4613            }
4614        }
4615    }
4616
4617    /// Ensure `%TAG` directive handle and prefix are borrowed when they are verbatim (no escapes).
4618    #[test]
4619    fn tag_directive_parts_are_borrowed_for_str_input() {
4620        let mut scanner = Scanner::new(StrInput::new("%TAG !e! tag:example.com,2000:app/\n"));
4621
4622        loop {
4623            let tok = scanner
4624                .next_token()
4625                .expect("valid YAML must scan without errors")
4626                .expect("scanner must eventually produce a token");
4627            if let TokenType::TagDirective(handle, prefix) = tok.1 {
4628                assert!(matches!(handle, Cow::Borrowed("!e!")));
4629                assert!(matches!(prefix, Cow::Borrowed("tag:example.com,2000:app/")));
4630                break;
4631            }
4632        }
4633    }
4634
4635    #[test]
4636    fn plain_scalar_is_borrowed_when_whitespace_free_for_str_input() {
4637        let mut scanner = Scanner::new(StrInput::new("foo\n"));
4638
4639        loop {
4640            let tok = scanner
4641                .next_token()
4642                .expect("valid YAML must scan without errors")
4643                .expect("scanner must eventually produce a token");
4644            if let TokenType::Scalar(_, value) = tok.1 {
4645                assert!(matches!(value, Cow::Borrowed("foo")));
4646                break;
4647            }
4648        }
4649    }
4650
4651    #[test]
4652    fn plain_scalar_is_borrowed_when_whitespace_present_for_str_input() {
4653        let mut scanner = Scanner::new(StrInput::new("foo bar\n"));
4654
4655        loop {
4656            let tok = scanner
4657                .next_token()
4658                .expect("valid YAML must scan without errors")
4659                .expect("scanner must eventually produce a token");
4660            if let TokenType::Scalar(_, value) = tok.1 {
4661                assert!(matches!(value, Cow::Borrowed("foo bar")));
4662                break;
4663            }
4664        }
4665    }
4666
4667    #[test]
4668    fn single_quoted_scalar_is_borrowed_when_verbatim_for_str_input() {
4669        let mut scanner = Scanner::new(StrInput::new("'foo bar'\n"));
4670
4671        loop {
4672            let tok = scanner
4673                .next_token()
4674                .expect("valid YAML must scan without errors")
4675                .expect("scanner must eventually produce a token");
4676            if let TokenType::Scalar(_, value) = tok.1 {
4677                assert!(matches!(value, Cow::Borrowed("foo bar")));
4678                break;
4679            }
4680        }
4681    }
4682
4683    #[test]
4684    fn single_quoted_scalar_is_owned_when_quote_is_escaped_for_str_input() {
4685        let mut scanner = Scanner::new(StrInput::new("'foo''bar'\n"));
4686
4687        loop {
4688            let tok = scanner
4689                .next_token()
4690                .expect("valid YAML must scan without errors")
4691                .expect("scanner must eventually produce a token");
4692            if let TokenType::Scalar(_, value) = tok.1 {
4693                assert!(matches!(value, Cow::Owned(_)));
4694                assert_eq!(&*value, "foo'bar");
4695                break;
4696            }
4697        }
4698    }
4699
4700    #[test]
4701    fn double_quoted_scalar_is_borrowed_when_verbatim_for_str_input() {
4702        let mut scanner = Scanner::new(StrInput::new("\"foo bar\"\n"));
4703
4704        loop {
4705            let tok = scanner
4706                .next_token()
4707                .expect("valid YAML must scan without errors")
4708                .expect("scanner must eventually produce a token");
4709            if let TokenType::Scalar(_, value) = tok.1 {
4710                assert!(matches!(value, Cow::Borrowed("foo bar")));
4711                break;
4712            }
4713        }
4714    }
4715
4716    #[test]
4717    fn double_quoted_scalar_is_owned_when_escape_sequence_present_for_str_input() {
4718        let mut scanner = Scanner::new(StrInput::new("\"foo\\nbar\"\n"));
4719
4720        loop {
4721            let tok = scanner
4722                .next_token()
4723                .expect("valid YAML must scan without errors")
4724                .expect("scanner must eventually produce a token");
4725            if let TokenType::Scalar(_, value) = tok.1 {
4726                assert!(matches!(value, Cow::Owned(_)));
4727                assert_eq!(&*value, "foo\nbar");
4728                break;
4729            }
4730        }
4731    }
4732
4733    #[test]
4734    fn plain_key_is_borrowed_for_str_input() {
4735        // Keys are just scalars in a key position; they should also be borrowed.
4736        let mut scanner = Scanner::new(StrInput::new("mykey: value\n"));
4737
4738        let mut found_key = false;
4739        let mut key_value: Option<Cow<'_, str>> = None;
4740
4741        loop {
4742            let tok = scanner
4743                .next_token()
4744                .expect("valid YAML must scan without errors");
4745            let Some(tok) = tok else { break };
4746
4747            if matches!(tok.1, TokenType::Key) {
4748                found_key = true;
4749            } else if found_key {
4750                if let TokenType::Scalar(_, value) = tok.1 {
4751                    key_value = Some(value);
4752                    break;
4753                }
4754            }
4755        }
4756
4757        assert!(found_key, "expected to find a Key token");
4758        let key_value = key_value.expect("expected to find a scalar after Key token");
4759        assert!(
4760            matches!(key_value, Cow::Borrowed("mykey")),
4761            "key should be borrowed, got: {key_value:?}"
4762        );
4763    }
4764
4765    #[test]
4766    fn quoted_key_is_borrowed_when_verbatim_for_str_input() {
4767        let mut scanner = Scanner::new(StrInput::new("\"mykey\": value\n"));
4768
4769        let mut found_key = false;
4770        let mut key_value: Option<Cow<'_, str>> = None;
4771
4772        loop {
4773            let tok = scanner
4774                .next_token()
4775                .expect("valid YAML must scan without errors");
4776            let Some(tok) = tok else { break };
4777
4778            if matches!(tok.1, TokenType::Key) {
4779                found_key = true;
4780            } else if found_key {
4781                if let TokenType::Scalar(_, value) = tok.1 {
4782                    key_value = Some(value);
4783                    break;
4784                }
4785            }
4786        }
4787
4788        assert!(found_key, "expected to find a Key token");
4789        let key_value = key_value.expect("expected to find a scalar after Key token");
4790        assert!(
4791            matches!(key_value, Cow::Borrowed("mykey")),
4792            "quoted key should be borrowed when verbatim, got: {key_value:?}"
4793        );
4794    }
4795
4796    #[test]
4797    fn tag_handle_and_suffix_are_borrowed_for_str_input() {
4798        // Test a tag like !!str which should have handle="!!" and suffix="str"
4799        let mut scanner = Scanner::new(StrInput::new("!!str foo\n"));
4800
4801        loop {
4802            let tok = scanner
4803                .next_token()
4804                .expect("valid YAML must scan without errors")
4805                .expect("scanner must eventually produce a token");
4806            if let TokenType::Tag(handle, suffix) = tok.1 {
4807                assert!(
4808                    matches!(handle, Cow::Borrowed("!!")),
4809                    "tag handle should be borrowed, got: {handle:?}"
4810                );
4811                assert!(
4812                    matches!(suffix, Cow::Borrowed("str")),
4813                    "tag suffix should be borrowed, got: {suffix:?}"
4814                );
4815                break;
4816            }
4817        }
4818    }
4819
4820    #[test]
4821    fn local_tag_suffix_is_borrowed_for_str_input() {
4822        // Test a local tag like !mytag which should have handle="!" and suffix="mytag"
4823        let mut scanner = Scanner::new(StrInput::new("!mytag foo\n"));
4824
4825        loop {
4826            let tok = scanner
4827                .next_token()
4828                .expect("valid YAML must scan without errors")
4829                .expect("scanner must eventually produce a token");
4830            if let TokenType::Tag(handle, suffix) = tok.1 {
4831                assert!(
4832                    matches!(handle, Cow::Borrowed("!")),
4833                    "local tag handle should be '!', got: {handle:?}"
4834                );
4835                assert!(
4836                    matches!(suffix, Cow::Borrowed("mytag")),
4837                    "local tag suffix should be borrowed, got: {suffix:?}"
4838                );
4839                break;
4840            }
4841        }
4842    }
4843
4844    #[test]
4845    fn tag_with_uri_escape_is_owned_for_str_input() {
4846        // Test a tag with URI escape like !my%20tag - suffix must be owned due to decoding
4847        let mut scanner = Scanner::new(StrInput::new("!!my%20tag foo\n"));
4848
4849        loop {
4850            let tok = scanner
4851                .next_token()
4852                .expect("valid YAML must scan without errors")
4853                .expect("scanner must eventually produce a token");
4854            if let TokenType::Tag(handle, suffix) = tok.1 {
4855                assert!(
4856                    matches!(handle, Cow::Borrowed("!!")),
4857                    "tag handle should still be borrowed, got: {handle:?}"
4858                );
4859                assert!(
4860                    matches!(suffix, Cow::Owned(_)),
4861                    "tag suffix with URI escape should be owned, got: {suffix:?}"
4862                );
4863                assert_eq!(&*suffix, "my tag");
4864                break;
4865            }
4866        }
4867    }
4868
4869    #[test]
4870    fn flow_scalar_buffer_tracks_pending_whitespace() {
4871        let mut borrowed = super::FlowScalarBuf::new_borrowed(2);
4872
4873        borrowed.note_pending_ws(5, 8);
4874        borrowed.commit_pending_ws();
4875        assert!(matches!(
4876            borrowed,
4877            super::FlowScalarBuf::Borrowed {
4878                end: 8,
4879                pending_ws_start: None,
4880                pending_ws_end: 8,
4881                ..
4882            }
4883        ));
4884
4885        borrowed.note_pending_ws(9, 11);
4886        borrowed.discard_pending_ws();
4887        assert!(matches!(
4888            borrowed,
4889            super::FlowScalarBuf::Borrowed {
4890                end: 8,
4891                pending_ws_start: None,
4892                pending_ws_end: 8,
4893                ..
4894            }
4895        ));
4896        assert!(borrowed.as_owned_mut().is_none());
4897
4898        let mut owned = super::FlowScalarBuf::new_owned();
4899        owned.as_owned_mut().unwrap().push_str("owned");
4900        assert!(matches!(owned, super::FlowScalarBuf::Owned(ref s) if s == "owned"));
4901    }
4902
4903    fn first_scanner_error_info(input: &str) -> String {
4904        let mut scanner = Scanner::new(StrInput::new(input));
4905        loop {
4906            match scanner.next_token() {
4907                Ok(Some(_)) => {}
4908                Ok(None) => panic!("expected scanner error"),
4909                Err(error) => return error.info().to_owned(),
4910            }
4911        }
4912    }
4913
4914    fn first_scalar_value(input: &str) -> String {
4915        let mut scanner = Scanner::new(StrInput::new(input));
4916        loop {
4917            match scanner.next_token().expect("scanner should not error") {
4918                Some(Token(_, TokenType::Scalar(_, value))) => return value.into_owned(),
4919                Some(_) => {}
4920                None => panic!("expected scalar token"),
4921            }
4922        }
4923    }
4924
4925    #[test]
4926    fn iterator_next_records_error_and_then_stays_empty() {
4927        let mut scanner = Scanner::new(StrInput::new("\"unterminated"));
4928
4929        while scanner.next().is_some() {}
4930
4931        let error = scanner
4932            .get_error()
4933            .expect("scanner should retain the error");
4934        assert_eq!(error.info(), "unclosed quote");
4935        assert!(scanner.next().is_none());
4936    }
4937
4938    #[test]
4939    fn next_token_returns_none_after_stream_end() {
4940        let mut scanner = Scanner::new(StrInput::new(""));
4941
4942        while let Some(token) = scanner.next_token().unwrap() {
4943            if matches!(token.1, TokenType::StreamEnd) {
4944                break;
4945            }
4946        }
4947
4948        assert!(scanner.stream_started());
4949        assert!(scanner.stream_ended());
4950        assert!(scanner.next_token().unwrap().is_none());
4951    }
4952
4953    #[test]
4954    fn directive_name_must_be_present() {
4955        assert_eq!(
4956            first_scanner_error_info("%\n"),
4957            "while scanning a directive, could not find expected directive name"
4958        );
4959    }
4960
4961    #[test]
4962    fn yaml_directive_requires_dot_between_version_numbers() {
4963        assert_eq!(
4964            first_scanner_error_info("%YAML 1\n"),
4965            "while scanning a YAML directive, did not find expected digit or '.' character"
4966        );
4967    }
4968
4969    #[test]
4970    fn yaml_directive_requires_major_version_number() {
4971        assert_eq!(
4972            first_scanner_error_info("%YAML .2\n"),
4973            "while scanning a YAML directive, did not find expected version number"
4974        );
4975    }
4976
4977    #[test]
4978    fn yaml_directive_rejects_extremely_long_version_number() {
4979        assert_eq!(
4980            first_scanner_error_info("%YAML 1234567890.2\n"),
4981            "while scanning a YAML directive, found extremely long version number"
4982        );
4983    }
4984
4985    #[test]
4986    fn tag_directive_handle_must_end_with_bang() {
4987        assert_eq!(
4988            first_scanner_error_info("%TAG !bad tag:example.com,2024:\n"),
4989            "while parsing a tag directive, did not find expected '!'"
4990        );
4991    }
4992
4993    #[test]
4994    fn tag_directive_handle_must_start_with_bang() {
4995        assert_eq!(
4996            first_scanner_error_info("%TAG bad! tag:example.com,2024:\n"),
4997            "while scanning a tag, did not find expected '!'"
4998        );
4999    }
5000
5001    #[test]
5002    fn tag_directive_prefix_must_start_with_tag_character() {
5003        assert_eq!(
5004            first_scanner_error_info("%TAG !e! `bad\n"),
5005            "invalid global tag character"
5006        );
5007    }
5008
5009    #[test]
5010    fn tag_directive_prefix_must_end_before_invalid_content() {
5011        assert_eq!(
5012            first_scanner_error_info("%TAG !e! tag:example.com^suffix\n"),
5013            "while scanning TAG, did not find expected whitespace or line break"
5014        );
5015    }
5016
5017    #[test]
5018    fn tag_directive_prefix_with_uri_escape_is_owned_and_decoded() {
5019        let mut scanner =
5020            Scanner::new(StrInput::new("%TAG !e! tag:example.com,2024:some%20app/\n"));
5021
5022        loop {
5023            let token = scanner
5024                .next_token()
5025                .expect("valid directive should scan")
5026                .expect("scanner must produce a directive token");
5027            if let TokenType::TagDirective(handle, prefix) = token.1 {
5028                assert!(matches!(handle, Cow::Borrowed("!e!")));
5029                assert!(matches!(prefix, Cow::Owned(_)));
5030                assert_eq!(&*prefix, "tag:example.com,2024:some app/");
5031                break;
5032            }
5033        }
5034    }
5035
5036    #[test]
5037    fn bare_bang_tag_scans_as_non_specific_tag() {
5038        let mut scanner = Scanner::new(StrInput::new("! foo\n"));
5039
5040        loop {
5041            let token = scanner
5042                .next_token()
5043                .expect("valid tag should scan")
5044                .expect("scanner must produce a tag token");
5045            if let TokenType::Tag(handle, suffix) = token.1 {
5046                assert_eq!(&*handle, "");
5047                assert_eq!(&*suffix, "!");
5048                break;
5049            }
5050        }
5051    }
5052
5053    #[test]
5054    fn tag_requires_separation_after_suffix() {
5055        assert_eq!(
5056            first_scanner_error_info("!foo,bar\n"),
5057            "while scanning a tag, did not find expected whitespace or line break"
5058        );
5059    }
5060
5061    #[test]
5062    fn verbatim_tag_requires_uri() {
5063        assert_eq!(
5064            first_scanner_error_info("!<> foo\n"),
5065            "while parsing a tag, did not find expected tag URI"
5066        );
5067    }
5068
5069    #[test]
5070    fn verbatim_tag_requires_closing_angle_bracket() {
5071        assert_eq!(
5072            first_scanner_error_info("!<tag:yaml.org,2002:str foo\n"),
5073            "while scanning a verbatim tag, did not find the expected '>'"
5074        );
5075    }
5076
5077    #[test]
5078    fn tag_uri_escape_requires_hex_digits() {
5079        assert_eq!(
5080            first_scanner_error_info("!!bad%zz foo\n"),
5081            "while parsing a tag, found an invalid escape sequence"
5082        );
5083    }
5084
5085    #[test]
5086    fn tag_uri_escape_rejects_bad_leading_utf8_byte() {
5087        assert_eq!(
5088            first_scanner_error_info("!!bad%80 foo\n"),
5089            "while parsing a tag, found an incorrect leading UTF-8 byte"
5090        );
5091    }
5092
5093    #[test]
5094    fn tag_uri_escape_rejects_bad_trailing_utf8_byte() {
5095        assert_eq!(
5096            first_scanner_error_info("!!bad%C2%41 foo\n"),
5097            "while parsing a tag, found an incorrect trailing UTF-8 byte"
5098        );
5099    }
5100
5101    #[test]
5102    fn tag_uri_escape_rejects_invalid_utf8_codepoint() {
5103        assert_eq!(
5104            first_scanner_error_info("!!bad%F4%90%80%80 foo\n"),
5105            "while parsing a tag, found an invalid UTF-8 codepoint"
5106        );
5107    }
5108
5109    #[test]
5110    fn anchors_and_aliases_require_names() {
5111        let expected =
5112            "while scanning an anchor or alias, did not find expected alphabetic or numeric character";
5113
5114        assert_eq!(first_scanner_error_info("& \n"), expected);
5115        assert_eq!(first_scanner_error_info("* \n"), expected);
5116    }
5117
5118    #[test]
5119    fn document_end_marker_rejects_trailing_content() {
5120        assert_eq!(
5121            first_scanner_error_info("... trailing\n"),
5122            "invalid content after document end marker"
5123        );
5124    }
5125
5126    #[test]
5127    fn reserved_indicators_are_rejected_outside_directives() {
5128        assert_eq!(
5129            first_scanner_error_info(" @\n"),
5130            "unexpected character: `@'"
5131        );
5132    }
5133
5134    #[test]
5135    fn flow_block_entry_indicator_is_rejected() {
5136        assert_eq!(
5137            first_scanner_error_info("[- ]\n"),
5138            r#""-" is only valid inside a block"#
5139        );
5140    }
5141
5142    #[test]
5143    fn block_entry_after_tabbed_separator_reports_specific_error() {
5144        assert_eq!(
5145            first_scanner_error_info("-\t- value\n"),
5146            "'-' must be followed by a valid YAML whitespace"
5147        );
5148    }
5149
5150    #[test]
5151    fn document_indicator_reports_unclosed_flow_collection() {
5152        assert_eq!(first_scanner_error_info("[\n---\n"), "unclosed bracket '['");
5153    }
5154
5155    #[test]
5156    fn block_scalar_header_rejects_trailing_content() {
5157        assert_eq!(
5158            first_scanner_error_info("|+ trailing\n"),
5159            "while scanning a block scalar, did not find expected comment or line break"
5160        );
5161    }
5162
5163    #[test]
5164    fn block_scalar_rejects_zero_indent_indicator() {
5165        let expected = "while scanning a block scalar, found an indentation indicator equal to 0";
5166
5167        assert_eq!(first_scanner_error_info("|0\n"), expected);
5168        assert_eq!(first_scanner_error_info("|+0\n"), expected);
5169    }
5170
5171    #[test]
5172    fn empty_block_scalar_at_eof_honors_chomping() {
5173        assert_eq!(first_scalar_value("|-\n"), "");
5174        assert_eq!(first_scalar_value("|+\n"), "\n");
5175    }
5176
5177    #[test]
5178    fn explicit_indent_block_scalar_can_end_at_document_marker() {
5179        assert_eq!(first_scalar_value("|1\n...\n"), "");
5180    }
5181
5182    #[test]
5183    fn root_explicit_indent_block_scalar_rejects_underindented_content() {
5184        assert_eq!(
5185            first_scanner_error_info("|2\nx\n"),
5186            "wrongly indented line in block scalar"
5187        );
5188    }
5189
5190    #[test]
5191    fn quoted_scalar_rejects_document_indicator_at_line_start() {
5192        assert_eq!(
5193            first_scanner_error_info("\"one\n---\ntwo\"\n"),
5194            "while scanning a quoted scalar, found unexpected document indicator"
5195        );
5196    }
5197
5198    #[test]
5199    fn quoted_scalar_rejects_tab_indentation_after_line_break() {
5200        assert_eq!(
5201            first_scanner_error_info("a: \"one\n\tbad\"\n"),
5202            "tab cannot be used as indentation"
5203        );
5204    }
5205
5206    #[test]
5207    fn quoted_scalar_rejects_underindented_continuation() {
5208        assert_eq!(
5209            first_scanner_error_info("a: \"one\nbad\"\n"),
5210            "invalid indentation in multiline quoted scalar"
5211        );
5212    }
5213
5214    #[test]
5215    fn indented_flow_scalar_reports_invalid_indentation() {
5216        assert_eq!(
5217            first_scanner_error_info("a:\n  [\nfoo]\n"),
5218            "invalid indentation"
5219        );
5220    }
5221
5222    #[test]
5223    fn required_simple_key_requires_value_at_stream_end() {
5224        assert_eq!(
5225            first_scanner_error_info("a:\n&b\n- c\n"),
5226            "simple key expect ':'"
5227        );
5228    }
5229
5230    #[test]
5231    fn plain_scalar_rejects_dash_before_flow_indicator() {
5232        assert_eq!(
5233            first_scanner_error_info("[-]\n"),
5234            "plain scalar cannot start with '-' followed by ,[]{}"
5235        );
5236    }
5237
5238    #[test]
5239    fn explicit_key_rejects_tab_after_indicator() {
5240        assert_eq!(
5241            first_scanner_error_info("? \tfoo\n"),
5242            "tabs disallowed in this context"
5243        );
5244    }
5245
5246    #[test]
5247    fn flow_mapping_rejects_adjacent_collection_value_after_plain_key() {
5248        assert_eq!(
5249            first_scanner_error_info("[a:[]]\n"),
5250            "':' may not precede any of `[{` in flow mapping"
5251        );
5252    }
5253
5254    #[test]
5255    fn implicit_flow_mapping_colon_cannot_move_to_next_line() {
5256        assert_eq!(
5257            first_scanner_error_info("[foo\n: bar]\n"),
5258            "illegal placement of ':' indicator"
5259        );
5260    }
5261
5262    #[test]
5263    fn stale_simple_key_token_position_is_a_scan_error() {
5264        let mut scanner = Scanner::new(StrInput::new(": value\n"));
5265        scanner.fetch_stream_start();
5266        scanner.tokens.clear();
5267        scanner.tokens_parsed = 1;
5268
5269        let simple_key = scanner
5270            .simple_keys
5271            .last_mut()
5272            .expect("stream start should create a simple key slot");
5273        simple_key.possible = true;
5274        simple_key.token_number = 0;
5275
5276        let error = scanner
5277            .fetch_value()
5278            .expect_err("stale simple key should be reported as a scan error");
5279        assert_eq!(error.info(), "simple key is no longer valid");
5280    }
5281}
granit_parser/scanner.rs

granit_parser/
scanner.rs