granit_parser/
scanner.rs

1//! Home to the YAML Scanner.
2//!
3//! The scanner is the lowest-level parsing utility. It is the lexer / tokenizer, reading input a
4//! character at a time and emitting tokens that can later be interpreted by the [`crate::parser`]
5//! to check for more context and validity.
6//!
7//! Due to the grammar of YAML, the scanner has to have some context and is not error-free.
8
9#![allow(clippy::cast_possible_wrap)]
10#![allow(clippy::cast_sign_loss)]
11
12use alloc::{
13    borrow::{Cow, ToOwned},
14    collections::VecDeque,
15    string::String,
16    vec::Vec,
17};
18use core::{char, fmt};
19
20use crate::{
21    char_traits::{
22        as_hex, is_anchor_char, is_blank_or_breakz, is_bom, is_break, is_breakz, is_flow, is_hex,
23        is_tag_char, is_uri_char,
24    },
25    input::{BorrowedInput, SkipTabs},
26};
27
/// Maximum number of characters the scanner may look ahead while disambiguating a simple key.
///
/// The limit is a count of characters, not of bytes.
const SIMPLE_KEY_MAX_LOOKAHEAD: usize = 1024;
30
/// The character encoding of the scanned input.
///
/// UTF-8 is the only encoding supported at the moment.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TEncoding {
    /// The input is UTF-8 encoded.
    Utf8,
}
37
/// How a YAML scalar was written in the source.
///
/// The derived ordering follows the declaration order of the variants.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum ScalarStyle {
    /// A plain (unquoted) scalar.
    Plain,
    /// A scalar enclosed in single quotes.
    SingleQuoted,
    /// A scalar enclosed in double quotes.
    DoubleQuoted,

    /// A literal block scalar (introduced by `|`).
    ///
    /// See [8.1.2](https://yaml.org/spec/1.2.2/#812-literal-style).
    /// Every indented character of a literal block is content, white space included.
    /// Characters cannot be escaped and long lines cannot be broken.
    Literal,
    /// A folded block scalar (introduced by `>`).
    ///
    /// See [8.1.3](https://yaml.org/spec/1.2.2/#813-folded-style).
    /// Every indented character of a folded block is content, white space included.
    /// Characters cannot be escaped, but the content is subject to line folding, which allows
    /// long lines to be broken.
    Folded,
}
62
/// The offsets stored inside a [`Marker`].
///
/// A YAML input is either a complete `&str` (stable backing storage) or a streaming character
/// source. A stable input lets us track a character index and a byte offset side by side. For a
/// streaming input a byte offset is generally not useful (it may not map to any meaningful
/// underlying file/source), which is why the byte offset is optional.
#[derive(Debug, Default, Clone, Copy)]
pub struct MarkerOffsets {
    /// Position in the source, counted in characters.
    chars: usize,
    /// Position in the source, counted in bytes, when the input can provide it.
    bytes: Option<usize>,
}

impl PartialEq for MarkerOffsets {
    /// Compare two offsets by character position only.
    ///
    /// The byte offset is an optional diagnostic extra that can differ from one input backend
    /// to another (e.g., `&str` vs streaming), so it is deliberately left out of the comparison.
    fn eq(&self, other: &Self) -> bool {
        let (ours, theirs) = (self.chars, other.chars);
        ours == theirs
    }
}

impl Eq for MarkerOffsets {}
87
88/// A location in a YAML document.
89#[derive(Clone, Copy, PartialEq, Debug, Eq, Default)]
90pub struct Marker {
91    /// Offsets in the source.
92    offsets: MarkerOffsets,
93    /// The line (1-indexed).
94    line: usize,
95    /// The column (0-indexed).
96    col: usize,
97}
98
99impl Marker {
100    /// Create a new [`Marker`] at the given position.
101    #[must_use]
102    pub fn new(index: usize, line: usize, col: usize) -> Marker {
103        Marker {
104            offsets: MarkerOffsets {
105                chars: index,
106                bytes: None,
107            },
108            line,
109            col,
110        }
111    }
112
113    /// Return a copy of the marker with the given optional byte offset.
114    #[must_use]
115    pub fn with_byte_offset(mut self, byte_offset: Option<usize>) -> Marker {
116        self.offsets.bytes = byte_offset;
117        self
118    }
119
120    /// Return the index (in characters) of the marker in the source.
121    #[must_use]
122    pub fn index(&self) -> usize {
123        self.offsets.chars
124    }
125
126    /// Return the byte offset of the marker in the source, if available.
127    #[must_use]
128    pub fn byte_offset(&self) -> Option<usize> {
129        self.offsets.bytes
130    }
131
132    /// Return the line of the marker in the source.
133    #[must_use]
134    pub fn line(&self) -> usize {
135        self.line
136    }
137
138    /// Return the column of the marker in the source.
139    #[must_use]
140    pub fn col(&self) -> usize {
141        self.col
142    }
143}
144
145/// A range of locations in a YAML document.
146#[derive(Clone, Copy, PartialEq, Debug, Eq, Default)]
147pub struct Span {
148    /// The start (inclusive) of the range.
149    pub start: Marker,
150    /// The end (exclusive) of the range.
151    pub end: Marker,
152
153    /// Optional indentation hint associated with this span.
154    ///
155    /// This is only meaningful for certain parser-emitted events (notably: block mapping keys).
156    /// When indentation is not meaningful or cannot be provided, it must be `None`.
157    pub indent: Option<usize>,
158}
159
160impl Span {
161    /// Create a new [`Span`] for the given range.
162    #[must_use]
163    pub fn new(start: Marker, end: Marker) -> Span {
164        Span {
165            start,
166            end,
167            indent: None,
168        }
169    }
170
171    /// Create an empty [`Span`] at a given location.
172    ///
173    /// An empty span doesn't contain any characters, but its position may still be meaningful.
174    /// For example, for an indented sequence [`SequenceEnd`] has a location but an empty span.
175    ///
176    /// [`SequenceEnd`]: crate::Event::SequenceEnd
177    #[must_use]
178    pub fn empty(mark: Marker) -> Span {
179        Span {
180            start: mark,
181            end: mark,
182            indent: None,
183        }
184    }
185
186    /// Return a copy of this [`Span`] with the given indentation hint.
187    #[must_use]
188    pub fn with_indent(mut self, indent: Option<usize>) -> Span {
189        self.indent = indent;
190        self
191    }
192
193    /// Return the length of the span (in characters).
194    #[must_use]
195    pub fn len(&self) -> usize {
196        self.end.index() - self.start.index()
197    }
198
199    /// Return whether the [`Span`] has a length of zero.
200    #[must_use]
201    pub fn is_empty(&self) -> bool {
202        self.len() == 0
203    }
204
205    /// Return the byte range of the span, if available.
206    #[must_use]
207    pub fn byte_range(&self) -> Option<core::ops::Range<usize>> {
208        let start = self.start.byte_offset()?;
209        let end = self.end.byte_offset()?;
210        Some(start..end)
211    }
212
213    /// Return the source text covered by this span, if byte offsets are available
214    /// and the range is valid for the provided input.
215    #[must_use]
216    pub fn slice<'source>(&self, source: &'source str) -> Option<&'source str> {
217        source.get(self.byte_range()?)
218    }
219}
220
/// Where a YAML source comment sits relative to the surrounding content.
///
/// These are the placements the parser currently recognizes:
///
/// ```yaml
/// # Above
/// key: value # Right
///
/// # Free
///
/// next: value
///
/// # Last
/// ```
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum Placement {
    /// A comment on its own line, directly before another YAML token.
    ///
    /// Such a comment usually describes the node that follows it. Several own-line comments in a
    /// row, with no blank line between them, are all `Above`, which lets a whole comment block
    /// attach to the next YAML element.
    Above,
    /// A comment on the same line as preceding YAML content or syntax, such as
    /// `key: value # Right`, or `- # Right` for an empty sequence entry.
    Right,
    /// A comment on its own line that is set apart from the YAML tokens around it.
    ///
    /// This is what remains when a comment is not on the same line as content, is not directly
    /// above a following token, and is not the last comment of the stream. Consumers should
    /// treat a `Free` comment as having no obvious neighboring node.
    #[default]
    Free,
    /// A comment on its own line at the very end of the input stream.
    ///
    /// Blank lines may follow a `Last` comment, but no other YAML token comes before
    /// `StreamEnd`.
    Last,
}
259
260/// A YAML comment captured from the source.
261///
262/// Comments are presentation metadata, not YAML data. This type carries the raw comment payload,
263/// source span, and a best-effort [`Placement`] hint for callers that want to correlate comments
264/// with nearby YAML presentation.
265#[derive(Clone, PartialEq, Debug, Eq)]
266pub struct Comment<'input> {
267    /// Span covering the whole source comment, including `#` and excluding the line break.
268    pub span: Span,
269    /// Raw comment payload exactly after `#`, excluding only the line break.
270    ///
271    /// Leading spaces are preserved, including a single space immediately after `#` when present.
272    pub text: Cow<'input, str>,
273    /// Best-effort placement of this comment relative to nearby YAML content.
274    pub placement: Placement,
275}
276
277impl<'input> Comment<'input> {
278    /// Create a captured YAML comment from a source span and raw payload.
279    ///
280    /// The placement defaults to [`Placement::Free`]. Use [`Comment::with_placement`] when the
281    /// caller already knows a more specific placement.
282    #[must_use]
283    pub fn new(span: Span, text: impl Into<Cow<'input, str>>) -> Self {
284        Self {
285            span,
286            text: text.into(),
287            placement: Placement::Free,
288        }
289    }
290
291    /// Return this comment with the given placement.
292    #[must_use]
293    pub fn with_placement(mut self, placement: Placement) -> Self {
294        self.placement = placement;
295        self
296    }
297
298    /// Return the comment payload with surrounding whitespace removed.
299    ///
300    /// This helper is ergonomic only. The raw [`Self::text`] payload remains unchanged.
301    #[must_use]
302    pub fn trimmed_text(&self) -> &str {
303        self.text.trim()
304    }
305}
306
307impl AsRef<str> for Comment<'_> {
308    fn as_ref(&self) -> &str {
309        self.text.as_ref()
310    }
311}
312
313/// An error that occurred while scanning.
314#[derive(Clone, PartialEq, Debug, Eq)]
315pub struct ScanError {
316    /// The position at which the error happened in the source.
317    mark: Marker,
318    /// Human-readable details about the error.
319    info: String,
320}
321
322impl ScanError {
323    /// Create a new error from a location and an error string.
324    #[must_use]
325    #[cold]
326    pub fn new(loc: Marker, info: String) -> ScanError {
327        ScanError { mark: loc, info }
328    }
329
330    /// Convenience alias for string slices.
331    #[must_use]
332    #[cold]
333    pub fn new_str(loc: Marker, info: &str) -> ScanError {
334        ScanError {
335            mark: loc,
336            info: info.to_owned(),
337        }
338    }
339
340    #[cold]
341    pub(crate) fn into_result<T>(self) -> Result<T, ScanError> {
342        Err(self)
343    }
344
345    /// Return the marker pointing to the error in the source.
346    #[must_use]
347    pub fn marker(&self) -> &Marker {
348        &self.mark
349    }
350
351    /// Return the information string describing the error that happened.
352    #[must_use]
353    pub fn info(&self) -> &str {
354        self.info.as_ref()
355    }
356}
357
358impl fmt::Display for ScanError {
359    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
360        write!(
361            f,
362            "{} at char {} line {} column {}",
363            self.info,
364            self.mark.index(),
365            self.mark.line(),
366            self.mark.col() + 1
367        )
368    }
369}
370
371impl core::error::Error for ScanError {}
372
373/// The contents of a scanner token.
374#[derive(Clone, PartialEq, Debug, Eq)]
375pub enum TokenType<'input> {
376    /// The start of the stream. Sent first, before even [`TokenType::DocumentStart`].
377    StreamStart(TEncoding),
378    /// The end of the stream, EOF.
379    StreamEnd,
380    /// A YAML version directive.
381    VersionDirective(
382        /// Major version number.
383        u32,
384        /// Minor version number.
385        u32,
386    ),
387    /// A YAML tag directive (e.g.: `!!str`, `!foo!bar`, ...).
388    TagDirective(
389        /// Tag directive handle, such as `!` or `!app!`.
390        Cow<'input, str>,
391        /// Tag URI prefix associated with the handle.
392        Cow<'input, str>,
393    ),
394    /// The start of a YAML document (`---`).
395    DocumentStart,
396    /// The end of a YAML document (`...`).
397    DocumentEnd,
398    /// The start of a sequence block.
399    ///
400    /// Sequence blocks are arrays starting with a `-`.
401    BlockSequenceStart,
402    /// The start of a block mapping.
403    ///
404    /// Block mappings are key-value collections written with `key: value` entries.
405    BlockMappingStart,
406    /// End of the corresponding `BlockSequenceStart` or `BlockMappingStart`.
407    BlockEnd,
408    /// Start of an inline sequence (`[ a, b ]`).
409    FlowSequenceStart,
410    /// End of an inline sequence.
411    FlowSequenceEnd,
412    /// Start of an inline mapping (`{ a: b, c: d }`).
413    FlowMappingStart,
414    /// End of an inline mapping.
415    FlowMappingEnd,
416    /// An entry in a block sequence (see [`TokenType::BlockSequenceStart`]).
417    BlockEntry,
418    /// An entry in a flow sequence (see [`TokenType::FlowSequenceStart`]).
419    FlowEntry,
420    /// A key in a mapping.
421    Key,
422    /// A value in a mapping.
423    Value,
424    /// A reference to a previously defined anchor.
425    Alias(Cow<'input, str>),
426    /// A YAML anchor definition introduced by `&`.
427    Anchor(Cow<'input, str>),
428    /// A YAML tag (starting with bangs `!`).
429    Tag(
430        /// The handle of the tag.
431        Cow<'input, str>,
432        /// The suffix of the tag.
433        Cow<'input, str>,
434    ),
435    /// A regular YAML scalar.
436    Scalar(ScalarStyle, Cow<'input, str>),
437    /// A YAML source comment.
438    ///
439    /// The token payload carries the raw text exactly after `#`, the source span, and an initial
440    /// [`Placement`] hint. The token's companion [`Span`] is the same as [`Comment::span`].
441    Comment(
442        /// Captured comment metadata.
443        Comment<'input>,
444    ),
445    /// A reserved YAML directive.
446    ReservedDirective(
447        /// Directive name.
448        String,
449        /// Directive parameters, split on YAML whitespace.
450        Vec<String>,
451    ),
452}
453
454/// A scanner token.
455#[derive(Clone, PartialEq, Debug, Eq)]
456pub struct Token<'input>(
457    /// Source span covered by this token.
458    pub Span,
459    /// Token payload emitted by the scanner.
460    pub TokenType<'input>,
461);
462
463/// A scalar that was parsed and may correspond to a simple key.
464///
465/// Upon scanning the following YAML:
466/// ```yaml
467/// a: b
468/// ```
469/// We do not know that `a` is a key for a map until we have reached the following `:`. For this
470/// YAML, we would store `a` as a scalar token in the [`Scanner`], but not emit it yet. It would be
471/// kept inside the scanner until more context is fetched and we are able to know whether it is a
472/// plain scalar or a key.
473///
474/// For example, see the following two YAML documents:
475/// ```yaml
476/// ---
477/// a: b # Here, `a` is a key.
478/// ...
479/// ---
480/// a # Here, `a` is a plain scalar.
481/// ...
482/// ```
483/// An instance of [`SimpleKey`] is created in the [`Scanner`] when such ambiguity occurs.
484///
485/// In both documents, scanning `a` would lead to the creation of a [`SimpleKey`] with
486/// [`Self::possible`] set to `true`. The token for `a` would be pushed in the [`Scanner`] but not
487/// yet emitted. Instead, more context would be fetched (through [`Scanner::fetch_more_tokens`]).
488///
489/// In the first document, upon reaching the `:`, the [`SimpleKey`] would be inspected and our
490/// scalar `a` since it is a possible key, would be "turned" into a key. This is done by prepending
491/// a [`TokenType::Key`] to our scalar token in the [`Scanner`]. This way, the
492/// [`crate::parser::Parser`] would read the [`TokenType::Key`] token before the
493/// [`TokenType::Scalar`] token.
494///
495/// In the second document however, reaching EOF would mark the [`SimpleKey`] as no longer possible,
496/// and no [`TokenType::Key`] would be emitted by the scanner.
497#[derive(Clone, PartialEq, Debug, Eq)]
498struct SimpleKey {
499    /// Whether the token this [`SimpleKey`] refers to may still be a key.
500    ///
501    /// Sometimes, when we have more context, we notice that what we thought could be a key no
502    /// longer can be. In that case, [`Self::possible`] is set to `false`.
503    ///
504    /// For instance, let us consider the following invalid YAML:
505    /// ```yaml
506    /// key
507    ///   : value
508    /// ```
509    /// Upon reading the `\n` after `key`, the [`SimpleKey`] that was created for `key` is no longer
510    /// possible and [`Self::possible`] is set to `false`.
511    possible: bool,
512    /// Whether the token this [`SimpleKey`] refers to is required to be a key.
513    ///
514    /// With more context, we may know for sure that the token must be a key. If later input makes
515    /// that impossible, the scanner must report an error instead of silently treating the token as a
516    /// plain scalar.
517    ///
518    /// This happens for simple keys at the current block indentation where the surrounding
519    /// collection requires the next token to be a mapping key.
520    required: bool,
521    /// The index of the token referred to by the [`SimpleKey`].
522    ///
523    /// This is the index in the scanner, which takes into account both the tokens that have been
524    /// emitted and those about to be emitted. See [`Scanner::tokens_parsed`] and
525    /// [`Scanner::tokens`] for more details.
526    token_number: usize,
527    /// The position at which the token the [`SimpleKey`] refers to is.
528    mark: Marker,
529}
530
531impl SimpleKey {
532    /// Create a new [`SimpleKey`] at the given `Marker` and with the given flow level.
533    fn new(mark: Marker) -> SimpleKey {
534        SimpleKey {
535            possible: false,
536            required: false,
537            token_number: 0,
538            mark,
539        }
540    }
541}
542
/// One entry of the indentation stack.
#[derive(Debug, Default, Clone)]
struct Indent {
    /// The former indentation level.
    indent: isize,
    /// Whether closing this indentation level emits a `BlockEnd` token.
    ///
    /// Not every indentation level starts a block. For example:
    /// ```yaml
    /// -
    ///   foo # ok
    /// -
    /// bar # ko, bar needs to be indented further than the `-`.
    /// - [
    ///  baz, # ok
    /// quux # ko, quux needs to be indented further than the '-'.
    /// ] # ko, the closing bracket needs to be indented further than the `-`.
    /// ```
    ///
    /// The level opened by a `-` belongs to one entry of the sequence only. If a `BlockEnd` were
    /// emitted each time such a level closes, there would be one `BlockEnd` per entry, whereas
    /// exactly one is needed to end the sequence.
    needs_block_end: bool,
}
567
/// What is known so far about an implicit mapping.
///
/// An implicit mapping is a mapping inside a flow sequence whose opening `{` has been left out:
/// ```yaml
/// [ a: b, c: d ]
/// # Equivalent to
/// [ { a: b }, { c: d } ]
/// # Equivalent to
/// - a: b
/// - c: d
/// ```
///
/// This state has to be tracked with care for every nested flow sequence. In the example above,
/// a [`FlowMappingStart`] event must be emitted upon meeting `a` and `c`, although no character
/// announces it, and a [`FlowMappingEnd`] event must be emitted upon reaching the `,` or the
/// `]`. Tracking the state incorrectly would cause these events to be left out or emitted
/// out-of-order.
///
/// [`FlowMappingStart`]: TokenType::FlowMappingStart
/// [`FlowMappingEnd`]: TokenType::FlowMappingEnd
#[derive(PartialEq, Debug)]
enum ImplicitMappingState {
    /// An implicit mapping may follow.
    ///
    /// This is the state right after the opening `[`: more context is required to tell whether
    /// an implicit mapping follows.
    Possible,
    /// An implicit mapping is currently open.
    ///
    /// This state is not entered right away: the `:` must have been seen first.
    ///
    /// NOTE(review): the meaning of the `u8` payload is not documented here; confirm it against
    /// the code that constructs this variant.
    Inside(u8),
}
601
/// The YAML scanner.
///
/// This corresponds to the low-level interface when reading YAML. The scanner emits tokens as they
/// are read (akin to a lexer), but it also holds sufficient context to be able to disambiguate
/// some of the constructs. It has understanding of indentation and whitespace and is able to
/// generate error messages for some invalid YAML constructs.
///
/// It is however not a full parser and needs [`crate::parser::Parser`] to fully detect invalid
/// YAML documents.
#[derive(Debug)]
#[allow(clippy::struct_excessive_bools)]
pub struct Scanner<'input, T> {
    /// The input source.
    ///
    /// The scanner's `impl` blocks require `T` to implement [`BorrowedInput`].
    input: T,
    /// The position of the cursor within the reader.
    mark: Marker,
    /// Buffer for tokens to be returned.
    ///
    /// This buffer can hold some temporary tokens that are not yet ready to be returned. For
    /// instance, if we just read a scalar, it can be a value or a key if an implicit mapping
    /// follows. In this case, the token stays in the `VecDeque` but cannot be returned from
    /// [`Self::next`] until we have more context.
    tokens: VecDeque<Token<'input>>,
    /// The last error that happened.
    ///
    /// Once this is set, [`Self::next`] returns `None` without scanning any further.
    error: Option<ScanError>,
    /// Error found after one or more already-scanned comment tokens.
    deferred_error: Option<ScanError>,

    /// Whether we have already emitted the `StreamStart` token.
    stream_start_produced: bool,
    /// Whether we have already emitted the `StreamEnd` token.
    stream_end_produced: bool,
    /// Whether the scanner is still in the prefix of the next document.
    ///
    /// A BOM may appear in a document prefix, before directives/comments/content. Once a document
    /// start marker or any content token is scanned, another BOM is document content and must be
    /// rejected unless it appears inside a quoted scalar.
    document_prefix_allowed: bool,
    /// In some flow contexts, the value of a mapping is allowed to be adjacent to the `:`. When it
    /// is, the index at which the `:` may be must be stored in `adjacent_value_allowed_at`.
    adjacent_value_allowed_at: usize,
    /// Whether a simple key could potentially start at the current position.
    ///
    /// Simple keys are the opposite of complex keys which are keys starting with `?`.
    simple_key_allowed: bool,
    /// A stack of potential simple keys.
    ///
    /// Refer to the documentation of [`SimpleKey`] for a more in-depth explanation of what they
    /// are.
    simple_keys: smallvec::SmallVec<[SimpleKey; 8]>,
    /// The current indentation level.
    indent: isize,
    /// List of all block indentation levels we are in (except the current one).
    indents: smallvec::SmallVec<[Indent; 8]>,
    /// Level of nesting of flow sequences.
    flow_level: u8,
    /// The number of tokens that have been returned from the scanner.
    ///
    /// This excludes the tokens from [`Self::tokens`].
    tokens_parsed: usize,
    /// Whether a token is ready to be taken from [`Self::tokens`].
    token_available: bool,
    /// Whether all characters encountered since the last newline were whitespace.
    leading_whitespace: bool,
    /// Whether we started a flow mapping at each flow nesting level.
    ///
    /// This is used to detect implicit flow mapping starts such as:
    /// ```yaml
    /// [ : foo ] # { null: "foo" }
    /// ```
    flow_mapping_started: smallvec::SmallVec<[bool; 8]>,
    /// An array of states, representing whether flow sequences have implicit mappings.
    ///
    /// When a flow mapping is possible (when encountering the first `[` or a `,` in a sequence),
    /// the state is set to [`Possible`].
    /// When we encounter the `:`, we know we are in an implicit mapping and can set the state to
    /// [`Inside`].
    ///
    /// There is one entry in this [`Vec`] for each nested flow sequence that we are in.
    /// The entries are created with the opening `[` and popped with the closing `]`.
    ///
    /// [`Possible`]: ImplicitMappingState::Possible
    /// [`Inside`]: ImplicitMappingState::Inside
    implicit_flow_mapping_states: smallvec::SmallVec<[ImplicitMappingState; 8]>,
    /// If a plain scalar was terminated by a `#` comment on its line, we set this
    /// to detect an illegal multiline continuation on the following line.
    interrupted_plain_by_comment: Option<Marker>,
    /// A stack of markers for opening brackets `[` and `{`.
    ///
    /// Each entry pairs the position of the bracket with the bracket character itself.
    flow_markers: smallvec::SmallVec<[(Marker, char); 8]>,
    /// NOTE(review): presumably a scratch buffer for the leading line break seen while scanning
    /// a scalar; its uses are not documented here, confirm against the scalar scanning code.
    buf_leading_break: String,
    /// NOTE(review): presumably a scratch buffer for the line breaks trailing the leading one
    /// while scanning a scalar; confirm against the scalar scanning code.
    buf_trailing_breaks: String,
    /// NOTE(review): presumably a scratch buffer for blanks collected while scanning a scalar;
    /// confirm against the scalar scanning code.
    buf_whitespaces: String,
}
697
698impl<'input, T: BorrowedInput<'input>> Iterator for Scanner<'input, T> {
699    type Item = Token<'input>;
700
701    fn next(&mut self) -> Option<Self::Item> {
702        if self.error.is_some() {
703            return None;
704        }
705        match self.next_token() {
706            Ok(Some(tok)) => {
707                debug_print!(
708                    "    \x1B[;32m\u{21B3} {:?} \x1B[;36m{:?}\x1B[;m",
709                    tok.1,
710                    tok.0
711                );
712                Some(tok)
713            }
714            Ok(tok) => tok,
715            Err(e) => self.stop_after_error(e),
716        }
717    }
718}
719
/// A convenience alias for scanner functions that may fail without returning a value.
///
/// Success carries `()`; failure carries the [`ScanError`] describing what went wrong.
pub type ScanResult = Result<(), ScanError>;
722
/// Scratch state for scalar text that is either still a byte range of the input or already an
/// owned copy.
#[derive(Debug)]
enum FlowScalarBuf {
    /// The text can still become a `Cow::Borrowed`.
    ///
    /// `start..end` is the verbatim range committed so far.
    /// `pending_ws_start..pending_ws_end` is a run of blanks that has been seen but is not
    /// committed yet (the run has to be dropped when a line break follows it).
    Borrowed {
        start: usize,
        end: usize,
        pending_ws_start: Option<usize>,
        pending_ws_end: usize,
    },
    /// The text has been copied into an owned `String`.
    Owned(String),
}

impl FlowScalarBuf {
    /// Start a borrowed buffer whose committed range is empty and begins at `start`.
    #[inline]
    fn new_borrowed(start: usize) -> Self {
        let (end, pending_ws_end) = (start, start);
        Self::Borrowed {
            start,
            end,
            pending_ws_start: None,
            pending_ws_end,
        }
    }

    /// Start an owned buffer holding an empty string.
    #[inline]
    fn new_owned() -> Self {
        Self::Owned(String::new())
    }

    /// Give mutable access to the owned string, or `None` while the buffer is borrowed.
    #[inline]
    fn as_owned_mut(&mut self) -> Option<&mut String> {
        if let Self::Owned(text) = self {
            Some(text)
        } else {
            None
        }
    }

    /// Extend the committed range over the pending blanks, if there are any.
    ///
    /// Does nothing for an owned buffer.
    #[inline]
    fn commit_pending_ws(&mut self) {
        let Self::Borrowed {
            end,
            pending_ws_start,
            pending_ws_end,
            ..
        } = self
        else {
            return;
        };
        // `take` clears the pending marker and tells us whether a run was pending.
        if pending_ws_start.take().is_some() {
            *end = *pending_ws_end;
        }
    }

    /// Record a run of blanks `ws_start..ws_end` that is not committed yet.
    ///
    /// The start of an already pending run is kept; only its end moves to `ws_end`.
    /// Does nothing for an owned buffer.
    #[inline]
    fn note_pending_ws(&mut self, ws_start: usize, ws_end: usize) {
        let Self::Borrowed {
            pending_ws_start,
            pending_ws_end,
            ..
        } = self
        else {
            return;
        };
        *pending_ws_start = Some(pending_ws_start.unwrap_or(ws_start));
        *pending_ws_end = ws_end;
    }

    /// Forget the pending blanks, leaving the committed range untouched.
    ///
    /// Does nothing for an owned buffer.
    #[inline]
    fn discard_pending_ws(&mut self) {
        let Self::Borrowed {
            end,
            pending_ws_start,
            pending_ws_end,
            ..
        } = self
        else {
            return;
        };
        *pending_ws_end = *end;
        *pending_ws_start = None;
    }
}
808
809impl<'input, T: BorrowedInput<'input>> Scanner<'input, T> {
810    #[inline]
811    fn promote_flow_scalar_buf_to_owned(
812        &self,
813        start_mark: &Marker,
814        buf: &mut FlowScalarBuf,
815    ) -> Result<(), ScanError> {
816        let FlowScalarBuf::Borrowed {
817            start,
818            end,
819            pending_ws_start: _,
820            pending_ws_end: _,
821        } = *buf
822        else {
823            return Ok(());
824        };
825
826        let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
827            ScanError::new_str(
828                *start_mark,
829                "internal error: input advertised offsets but did not provide a slice",
830            )
831        })?;
832        *buf = FlowScalarBuf::Owned(slice.to_owned());
833        Ok(())
834    }
    /// Try to borrow a slice from the underlying input.
    ///
    /// This method uses the [`BorrowedInput`] trait to safely obtain a slice with the `'input`
    /// lifetime. For inputs that support zero-copy slicing (like `StrInput`), this returns
    /// `Some(&'input str)`. For streaming inputs, this returns `None`.
    ///
    /// `start` and `end` are forwarded unchanged to `slice_borrowed`; callers in this file pass
    /// offsets obtained from `byte_offset()`.
    #[inline]
    fn try_borrow_slice(&self, start: usize, end: usize) -> Option<&'input str> {
        self.input.slice_borrowed(start, end)
    }
844
845    /// Scan a tag handle for a `%TAG` directive as a `Cow<str>`.
846    ///
847    /// For `StrInput`, this will borrow from the input when possible. For other inputs, or if
848    /// borrowing is not possible, it falls back to allocating.
849    fn scan_tag_handle_directive_cow(
850        &mut self,
851        mark: &Marker,
852    ) -> Result<Cow<'input, str>, ScanError> {
853        let Some(start) = self.input.byte_offset() else {
854            return Ok(Cow::Owned(self.scan_tag_handle(true, mark)?));
855        };
856
857        if self.input.look_ch() != '!' {
858            return Err(ScanError::new_str(
859                *mark,
860                "while scanning a tag, did not find expected '!'",
861            ));
862        }
863
864        // Consume the leading '!'.
865        self.skip_non_blank();
866
867        // Consume ns-word-char (ASCII alphanumeric, '_' or '-') characters.
868        // This mirrors `StrInput::fetch_while_is_alpha` but avoids allocation.
869        self.input.lookahead(1);
870        while self.input.next_is_alpha() {
871            self.skip_non_blank();
872            self.input.lookahead(1);
873        }
874
875        // Optional trailing '!'.
876        if self.input.peek() == '!' {
877            self.skip_non_blank();
878        }
879
880        let Some(end) = self.input.byte_offset() else {
881            // Should be impossible if `byte_offset()` was `Some` above, but keep safe fallback.
882            return Ok(Cow::Owned(self.scan_tag_handle(true, mark)?));
883        };
884
885        let Some(slice) = self.try_borrow_slice(start, end) else {
886            // Fall back to allocating if zero-copy borrow is not available.
887            let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
888                ScanError::new_str(
889                    *mark,
890                    "internal error: input advertised slicing but did not provide a slice",
891                )
892            })?;
893            if !slice.ends_with('!') && slice != "!" {
894                return Err(ScanError::new_str(
895                    *mark,
896                    "while parsing a tag directive, did not find expected '!'",
897                ));
898            }
899            return Ok(Cow::Owned(slice.to_owned()));
900        };
901
902        if !slice.ends_with('!') && slice != "!" {
903            return Err(ScanError::new_str(
904                *mark,
905                "while parsing a tag directive, did not find expected '!'",
906            ));
907        }
908
909        Ok(Cow::Borrowed(slice))
910    }
911
912    /// Scan a tag prefix for a `%TAG` directive as a `Cow<str>`.
913    ///
914    /// This borrows from `StrInput` only when no URI escape sequences are encountered. If a `%`
915    /// escape is present, the prefix must be decoded and therefore allocated.
916    fn scan_tag_prefix_directive_cow(
917        &mut self,
918        start_mark: &Marker,
919    ) -> Result<Cow<'input, str>, ScanError> {
920        let Some(start) = self.input.byte_offset() else {
921            return Ok(Cow::Owned(self.scan_tag_prefix(start_mark)?));
922        };
923
924        // The prefix must start with either '!' (local) or a valid global tag char.
925        if self.input.look_ch() == '!' {
926            self.skip_non_blank();
927        } else if !is_tag_char(self.input.peek()) {
928            return Err(ScanError::new_str(
929                *start_mark,
930                "invalid global tag character",
931            ));
932        } else if self.input.peek() == '%' {
933            // Needs decoding. Fall back to allocating path below.
934        } else {
935            self.skip_non_blank();
936        }
937
938        // Consume URI chars while we can stay in the borrowed path.
939        while is_uri_char(self.input.look_ch()) {
940            if self.input.peek() == '%' {
941                break;
942            }
943            self.skip_non_blank();
944        }
945
946        // If we encountered an escape sequence, we must decode, therefore allocate.
947        if self.input.peek() == '%' {
948            let current = self
949                .input
950                .byte_offset()
951                .expect("byte_offset() must remain available once enabled");
952            let mut out = if let Some(slice) = self.input.slice_bytes(start, current) {
953                slice.to_owned()
954            } else {
955                String::new()
956            };
957
958            while is_uri_char(self.input.look_ch()) {
959                if self.input.peek() == '%' {
960                    out.push(self.scan_uri_escapes(start_mark)?);
961                } else {
962                    out.push(self.input.peek());
963                    self.skip_non_blank();
964                }
965            }
966            return Ok(Cow::Owned(out));
967        }
968
969        let Some(end) = self.input.byte_offset() else {
970            return Ok(Cow::Owned(self.scan_tag_prefix(start_mark)?));
971        };
972
973        let Some(slice) = self.try_borrow_slice(start, end) else {
974            // Fall back to allocating if zero-copy borrow is not available.
975            let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
976                ScanError::new_str(
977                    *start_mark,
978                    "internal error: input advertised slicing but did not provide a slice",
979                )
980            })?;
981            return Ok(Cow::Owned(slice.to_owned()));
982        };
983
984        Ok(Cow::Borrowed(slice))
985    }
986    /// Create a scanner over the given input source.
987    pub fn new(input: T) -> Self {
988        let initial_byte_offset = input.byte_offset();
989        Scanner {
990            input,
991            mark: Marker::new(0, 1, 0).with_byte_offset(initial_byte_offset),
992            tokens: VecDeque::with_capacity(64),
993            error: None,
994            deferred_error: None,
995
996            stream_start_produced: false,
997            stream_end_produced: false,
998            document_prefix_allowed: true,
999            adjacent_value_allowed_at: 0,
1000            simple_key_allowed: true,
1001            simple_keys: smallvec::SmallVec::new(),
1002            indent: -1,
1003            indents: smallvec::SmallVec::new(),
1004            flow_level: 0,
1005            tokens_parsed: 0,
1006            token_available: false,
1007            leading_whitespace: true,
1008            flow_mapping_started: smallvec::SmallVec::new(),
1009            implicit_flow_mapping_states: smallvec::SmallVec::new(),
1010            flow_markers: smallvec::SmallVec::new(),
1011            interrupted_plain_by_comment: None,
1012
1013            buf_leading_break: String::with_capacity(128),
1014            buf_trailing_breaks: String::with_capacity(128),
1015            buf_whitespaces: String::with_capacity(128),
1016        }
1017    }
1018
1019    /// Return a copy of the last error that was encountered, if any.
1020    ///
1021    /// This does not clear the error state and further calls to [`Self::get_error`] will return (a
1022    /// clone of) the same error.
1023    #[inline]
1024    pub fn get_error(&self) -> Option<ScanError> {
1025        self.error.clone().or_else(|| self.deferred_error.clone())
1026    }
1027
    /// Record `error` as the scanner's current error and return `None`.
    ///
    /// The return type lets callers that yield `Option<Token>` store the error and stop in a
    /// single expression.
    #[cold]
    fn stop_after_error(&mut self, error: ScanError) -> Option<Token<'input>> {
        self.error = Some(error);
        None
    }
1033
    /// Build the "simple key expected" error, located at the current scanner position.
    #[cold]
    fn simple_key_expected(&self) -> ScanError {
        ScanError::new_str(self.mark, "simple key expected")
    }
1038
    /// Build the "unclosed bracket" error for `bracket`, located at `mark`.
    #[cold]
    fn unclosed_bracket(mark: Marker, bracket: char) -> ScanError {
        ScanError::new(mark, format!("unclosed bracket '{bracket}'"))
    }
1043
1044    /// Consume the next character. It is assumed the next character is a blank.
1045    #[inline]
1046    fn skip_blank(&mut self) {
1047        self.input.skip();
1048
1049        self.mark.offsets.chars += 1;
1050        self.mark.col += 1;
1051        self.mark.offsets.bytes = self.input.byte_offset();
1052    }
1053
1054    /// Consume the next character. It is assumed the next character is not a blank.
1055    #[inline]
1056    fn skip_non_blank(&mut self) {
1057        self.input.skip();
1058
1059        self.mark.offsets.chars += 1;
1060        self.mark.col += 1;
1061        self.mark.offsets.bytes = self.input.byte_offset();
1062        self.leading_whitespace = false;
1063    }
1064
1065    /// Consume a byte order mark from a document prefix.
1066    ///
1067    /// The source index advances, but the logical column remains unchanged so directives and
1068    /// document markers immediately following the BOM are still recognized as line-start tokens.
1069    #[inline]
1070    fn skip_bom(&mut self) {
1071        self.input.skip();
1072
1073        self.mark.offsets.chars += 1;
1074        self.mark.offsets.bytes = self.input.byte_offset();
1075    }
1076
1077    /// Consume one character that belongs to a comment.
1078    ///
1079    /// Unlike [`Self::skip_non_blank`], this deliberately does not change
1080    /// `leading_whitespace`. Comments are presentation content, so consuming one for either
1081    /// tokenization or skipping should only advance position bookkeeping.
1082    #[inline]
1083    fn skip_comment_char(&mut self) {
1084        self.input.skip();
1085
1086        self.mark.offsets.chars += 1;
1087        self.mark.col += 1;
1088        self.mark.offsets.bytes = self.input.byte_offset();
1089    }
1090
1091    /// Consume the next characters. It is assumed none of the next characters are blanks.
1092    #[inline]
1093    fn skip_n_non_blank(&mut self, count: usize) {
1094        for _ in 0..count {
1095            self.input.skip();
1096            self.mark.offsets.chars += 1;
1097            self.mark.col += 1;
1098        }
1099        self.mark.offsets.bytes = self.input.byte_offset();
1100        self.leading_whitespace = false;
1101    }
1102
1103    /// Consume the next character. It is assumed the next character is a newline.
1104    #[inline]
1105    fn skip_nl(&mut self) {
1106        self.input.skip();
1107
1108        self.mark.offsets.chars += 1;
1109        self.mark.col = 0;
1110        self.mark.line += 1;
1111        self.mark.offsets.bytes = self.input.byte_offset();
1112        self.leading_whitespace = true;
1113    }
1114
1115    /// Consume a line break (either CR, LF, or CRLF), if any. Do nothing if there is none.
1116    #[inline]
1117    fn skip_linebreak(&mut self) {
1118        if self.input.next_2_are('\r', '\n') {
1119            // While technically not a blank, this does not matter as `self.leading_whitespace`
1120            // will be reset by `skip_nl`.
1121            self.skip_blank();
1122            self.skip_nl();
1123        } else if self.input.next_is_break() {
1124            self.skip_nl();
1125        }
1126    }
1127
1128    fn scan_comment_token(&mut self) -> Result<Token<'input>, ScanError> {
1129        let start_mark = self.mark;
1130        debug_assert_eq!(self.input.peek(), '#');
1131        let placement = if self.leading_whitespace {
1132            Placement::Free
1133        } else {
1134            Placement::Right
1135        };
1136
1137        self.skip_comment_char();
1138
1139        let text = if let Some(start) = self.input.byte_offset() {
1140            // Stable byte offsets are available; slice the payload once at the end.
1141            let n = self.input.skip_while_non_breakz();
1142            self.mark.offsets.chars += n;
1143            self.mark.col += n;
1144            let byte_offset = self.input.byte_offset();
1145            self.mark.offsets.bytes = byte_offset;
1146            let end = byte_offset.expect("byte_offset must remain available once enabled");
1147
1148            if let Some(slice) = self.try_borrow_slice(start, end) {
1149                Cow::Borrowed(slice)
1150            } else if let Some(slice) = self.input.slice_bytes(start, end) {
1151                // Defensive fallback for third-party inputs that expose offsets but cannot borrow.
1152                Cow::Owned(slice.to_owned())
1153            } else {
1154                return Err(ScanError::new_str(
1155                    start_mark,
1156                    "internal error: input advertised offsets but did not provide a slice",
1157                ));
1158            }
1159        } else {
1160            // Streaming input without stable offsets; collect into an owned string.
1161            let mut owned = String::new();
1162            while !is_breakz(self.input.look_ch()) {
1163                owned.push(self.input.peek());
1164                self.skip_comment_char();
1165            }
1166            Cow::Owned(owned)
1167        };
1168
1169        let end_mark = self.mark;
1170        let span = Span::new(start_mark, end_mark);
1171        Ok(Token(
1172            span,
1173            TokenType::Comment(Comment::new(span, text).with_placement(placement)),
1174        ))
1175    }
1176
1177    fn push_comment_token(&mut self) -> ScanResult {
1178        let token = self.scan_comment_token()?;
1179        self.tokens.push_back(token);
1180        Ok(())
1181    }
1182
1183    fn skip_comment(&mut self) {
1184        debug_assert_eq!(self.input.peek(), '#');
1185
1186        self.skip_comment_char();
1187        let n = self.input.skip_while_non_breakz();
1188        self.mark.offsets.chars += n;
1189        self.mark.col += n;
1190        self.mark.offsets.bytes = self.input.byte_offset();
1191    }
1192
    /// Return whether the [`TokenType::StreamStart`] event has been emitted.
    ///
    /// This reads the `stream_start_produced` flag, which is set when the stream-start token
    /// is queued.
    #[inline]
    pub fn stream_started(&self) -> bool {
        self.stream_start_produced
    }
1198
    /// Return whether the [`TokenType::StreamEnd`] event has been emitted.
    ///
    /// This reads the `stream_end_produced` flag, which is set once the stream-end token has
    /// been handed out by [`Self::next_token`].
    #[inline]
    pub fn stream_ended(&self) -> bool {
        self.stream_end_produced
    }
1204
    /// Return the current position in the input stream.
    ///
    /// The marker is returned by value; later scanning does not affect the returned copy.
    #[inline]
    pub fn mark(&self) -> Marker {
        self.mark
    }
1210
    /// Read and consume a line break (either `\r`, `\n` or `\r\n`).
    ///
    /// Whatever the form of the consumed break, a single `\n` is pushed into `s`.
    ///
    /// # Panics (in debug)
    /// If the next characters do not correspond to a line break.
    #[inline]
    fn read_break(&mut self, s: &mut String) {
        self.skip_break();
        s.push('\n');
    }
1222
1223    // Read and consume a line break (either `\r`, `\n` or `\r\n`).
1224    //
1225    // # Panics (in debug)
1226    // If the next characters do not correspond to a line break.
1227    #[inline]
1228    fn skip_break(&mut self) {
1229        let c = self.input.peek();
1230        let nc = self.input.peek_nth(1);
1231        debug_assert!(is_break(c));
1232        if c == '\r' && nc == '\n' {
1233            self.skip_blank();
1234        }
1235        self.skip_nl();
1236    }
1237
    /// Insert a token at the given position.
    ///
    /// `pos` is an index into the pending token queue; `pos == len` appends at the back.
    ///
    /// # Panics
    /// If `pos` is greater than the current number of queued tokens.
    fn insert_token(&mut self, pos: usize, tok: Token<'input>) {
        let old_len = self.tokens.len();
        assert!(pos <= old_len);
        self.tokens.insert(pos, tok);
    }
1244
    /// Set the `simple_key_allowed` flag.
    #[inline]
    fn allow_simple_key(&mut self) {
        self.simple_key_allowed = true;
    }
1249
    /// Clear the `simple_key_allowed` flag.
    #[inline]
    fn disallow_simple_key(&mut self) {
        self.simple_key_allowed = false;
    }
1254
    /// Scan enough input to append one next token to the internal token queue.
    ///
    /// The very first call only queues the stream-start token. Later calls skip whitespace and
    /// comments, expire stale simple keys, unroll indentation to the current column, and then
    /// dispatch on the next character(s) to the matching `fetch_*` routine.
    ///
    /// # Errors
    /// Returns `ScanError` when the scanner does not find the next expected token.
    pub fn fetch_next_token(&mut self) -> ScanResult {
        self.input.lookahead(1);

        // Nothing has been emitted yet: the stream start always comes first.
        if !self.stream_start_produced {
            self.fetch_stream_start();
            return Ok(());
        }
        self.skip_to_next_token()?;

        debug_print!(
            "  \x1B[38;5;244m\u{2192} fetch_next_token after whitespace {:?} {:?}\x1B[m",
            self.mark,
            self.input.peek()
        );

        self.stale_simple_keys()?;

        let mark = self.mark;
        self.unroll_indent(mark.col as isize);

        // Document indicators need up to 4 characters of lookahead.
        self.input.lookahead(4);

        if self.input.next_is_z() {
            self.fetch_stream_end()?;
            return Ok(());
        }

        // Directives and document markers are only recognized at column 0.
        if self.mark.col == 0 {
            if self.input.next_char_is('%') {
                return self.fetch_directive();
            } else if self.input.next_is_document_start() {
                return self.fetch_document_indicator(TokenType::DocumentStart);
            } else if self.input.next_is_document_end() {
                self.fetch_document_indicator(TokenType::DocumentEnd)?;
                // Only whitespace and comments may follow the end marker on its line.
                self.skip_ws_to_eol(SkipTabs::Yes)?;
                if !self.input.next_is_breakz() {
                    return Err(ScanError::new_str(
                        self.mark,
                        "invalid content after document end marker",
                    ));
                }
                return Ok(());
            }
        }

        // Past this point the next token is not a directive or document marker.
        if self.document_prefix_allowed {
            self.document_prefix_allowed = false;
        }

        // Content left of the current indent is rejected, except for `]`, `}` and `,` while
        // inside a flow collection.
        if (self.mark.col as isize) < self.indent {
            self.input.lookahead(1);
            let c = self.input.peek();
            if self.flow_level == 0 || !matches!(c, ']' | '}' | ',') {
                return Err(ScanError::new_str(self.mark, "invalid indentation"));
            }
        }

        // Dispatch on the next character `c`, using the one after it (`nc`) to disambiguate.
        // Arm order matters: guarded arms are tried top to bottom.
        let c = self.input.peek();
        let nc = self.input.peek_nth(1);
        match c {
            '[' => self.fetch_flow_collection_start(TokenType::FlowSequenceStart),
            '{' => self.fetch_flow_collection_start(TokenType::FlowMappingStart),
            ']' => self.fetch_flow_collection_end(TokenType::FlowSequenceEnd),
            '}' => self.fetch_flow_collection_end(TokenType::FlowMappingEnd),
            ',' => self.fetch_flow_entry(),
            // `-`, `?` and `:` are indicators only when followed by a blank, a break or the
            // end of input.
            '-' if is_blank_or_breakz(nc) => self.fetch_block_entry(),
            '?' if is_blank_or_breakz(nc) => self.fetch_key(),
            ':' if is_blank_or_breakz(nc) => self.fetch_value(),
            // In flow context, `:` is also a value indicator when followed by a flow
            // indicator or when the position equals `adjacent_value_allowed_at`.
            ':' if self.flow_level > 0
                && (is_flow(nc) || self.mark.index() == self.adjacent_value_allowed_at) =>
            {
                self.fetch_flow_value()
            }
            // Is it an alias?
            '*' => self.fetch_anchor(true),
            // Is it an anchor?
            '&' => self.fetch_anchor(false),
            '!' => self.fetch_tag(),
            // Is it a literal scalar?
            '|' if self.flow_level == 0 => self.fetch_block_scalar(true),
            // Is it a folded scalar?
            '>' if self.flow_level == 0 => self.fetch_block_scalar(false),
            '\'' => self.fetch_flow_scalar(true),
            '"' => self.fetch_flow_scalar(false),
            // plain scalar
            '-' if !is_blank_or_breakz(nc) => self.fetch_plain_scalar(),
            ':' | '?' if !is_blank_or_breakz(nc) && self.flow_level == 0 => {
                self.fetch_plain_scalar()
            }
            c if is_bom(c) => Err(ScanError::new_str(
                self.mark,
                "a BOM must not appear inside a document",
            )),
            // These characters cannot start a token here.
            '%' | '@' | '`' => Err(ScanError::new(
                self.mark,
                format!("unexpected character: `{c}'"),
            )),
            // Everything else starts a plain scalar.
            _ => self.fetch_plain_scalar(),
        }
    }
1359
1360    /// Return the next queued token, scanning more input when needed.
1361    ///
1362    /// # Errors
1363    /// Returns `ScanError` when scanning fails to find an expected next token.
1364    pub fn next_token(&mut self) -> Result<Option<Token<'input>>, ScanError> {
1365        if self.deferred_error.is_some() {
1366            if !matches!(
1367                self.tokens.front().map(|token| &token.1),
1368                Some(TokenType::Comment(_))
1369            ) {
1370                if let Some(error) = self.deferred_error.take() {
1371                    return error.into_result();
1372                }
1373            }
1374            self.token_available = true;
1375        }
1376
1377        if self.stream_end_produced {
1378            return Ok(None);
1379        }
1380
1381        if !self.token_available {
1382            if let Err(error) = self.fetch_more_tokens() {
1383                if matches!(
1384                    self.tokens.front().map(|token| &token.1),
1385                    Some(TokenType::Comment(_))
1386                ) {
1387                    self.deferred_error = Some(error);
1388                } else {
1389                    return Err(error);
1390                }
1391            }
1392        }
1393        let Some(t) = self.tokens.pop_front() else {
1394            return Err(ScanError::new_str(
1395                self.mark,
1396                "did not find expected next token",
1397            ));
1398        };
1399        self.token_available = false;
1400        self.tokens_parsed += 1;
1401
1402        if let TokenType::StreamEnd = t.1 {
1403            self.stream_end_produced = true;
1404        }
1405        Ok(Some(t))
1406    }
1407
1408    /// Scan more input until a token is ready to be returned.
1409    ///
1410    /// # Errors
1411    /// Returns `ScanError` when scanning fails.
1412    pub fn fetch_more_tokens(&mut self) -> ScanResult {
1413        let mut need_more;
1414        loop {
1415            if self.tokens.is_empty() {
1416                need_more = true;
1417            } else {
1418                need_more = false;
1419                // Stale potential keys that we know won't be keys.
1420                self.stale_simple_keys()?;
1421                // If our next token to be emitted may be a key, fetch more context.
1422                for sk in &self.simple_keys {
1423                    if sk.possible && sk.token_number == self.tokens_parsed {
1424                        need_more = true;
1425                        break;
1426                    }
1427                }
1428            }
1429
1430            // Stop fetching immediately after document end/start markers
1431            // to allow the parser to emit the event before reading more content.
1432            if let Some(token) = self.tokens.back() {
1433                if matches!(token.1, TokenType::DocumentEnd | TokenType::DocumentStart) {
1434                    break;
1435                }
1436            }
1437
1438            if !need_more {
1439                break;
1440            }
1441            self.fetch_next_token()?;
1442        }
1443        self.token_available = true;
1444
1445        Ok(())
1446    }
1447
1448    /// Mark simple keys that can no longer be keys as such.
1449    ///
1450    /// This function sets `possible` to `false` to each key that, now we have more context, we
1451    /// know will not be keys.
1452    ///
1453    /// # Errors
1454    /// This function returns an error if one of the keys becoming impossible was required to be a
1455    /// key.
1456    fn stale_simple_keys(&mut self) -> ScanResult {
1457        for sk in &mut self.simple_keys {
1458            let is_line_stale = self.flow_level == 0 && sk.mark.line < self.mark.line;
1459            // The length cap applies in flow contexts too; otherwise token buffering can grow
1460            // without bound while the scanner waits to see whether a later ':' resolves the key.
1461            let is_length_stale =
1462                self.mark.index().saturating_sub(sk.mark.index()) > SIMPLE_KEY_MAX_LOOKAHEAD;
1463
1464            if sk.possible && (is_line_stale || is_length_stale) {
1465                if sk.required {
1466                    return Err(ScanError::new_str(self.mark, "simple key expect ':'"));
1467                }
1468                sk.possible = false;
1469            }
1470        }
1471        Ok(())
1472    }
1473
    /// Skip over whitespace (`\t`, ` `, `\n`, `\r`) until the next non-comment token.
    ///
    /// Comments encountered while skipping are queued as [`TokenType::Comment`] tokens so the
    /// parser can emit them as presentation events. A byte order mark is also skipped when it
    /// appears at column 0, outside flow context, while a document prefix is still allowed.
    ///
    /// # Errors
    /// This function returns an error if a tab is encountered where there should not be
    /// one, or if a comment interrupted a plain scalar that the following line would continue.
    fn skip_to_next_token(&mut self) -> ScanResult {
        // Hot-path helper: consume a single logical line break and apply simple-key rules.
        // (Kept local to ensure the compiler can inline it easily.)
        let consume_linebreak = |this: &mut Self| {
            this.input.lookahead(2);
            this.skip_linebreak();
            // A new line in block context may start a simple key.
            if this.flow_level == 0 {
                this.allow_simple_key();
            }
        };

        loop {
            match self.input.look_ch() {
                // Tabs may not be used as indentation (block context only).
                '\t' => {
                    if self.is_within_block()
                        && self.leading_whitespace
                        && (self.mark.col as isize) < self.indent
                    {
                        self.skip_ws_to_eol(SkipTabs::Yes)?;

                        // If we have content on that line with a tab, return an error.
                        if !self.input.next_is_breakz() {
                            return Err(ScanError::new_str(
                                self.mark,
                                "tabs disallowed within this context (block indentation)",
                            ));
                        }

                        // Micro-opt: if we stopped on a line break, consume it now (avoids another loop trip).
                        if matches!(self.input.look_ch(), '\n' | '\r') {
                            consume_linebreak(self);
                        }
                    } else {
                        // Non-indentation tab behaves like blank.
                        self.skip_blank();
                    }
                }

                ' ' => self.skip_blank(),

                '\n' | '\r' => consume_linebreak(self),

                // A BOM is only skipped as part of a document prefix at the start of a line.
                c if is_bom(c)
                    && self.document_prefix_allowed
                    && self.flow_level == 0
                    && self.mark.col == 0 =>
                {
                    self.skip_bom();
                }

                '#' => {
                    self.push_comment_token()?;

                    // Micro-opt: comment-only lines are common; consume the following line break here.
                    if matches!(self.input.look_ch(), '\n' | '\r') {
                        consume_linebreak(self);
                    }
                }

                // Anything else is the start of the next token.
                _ => break,
            }
        }

        // If a plain scalar was interrupted by a comment, and the next line could
        // continue the scalar in block context, this is invalid.
        // `take()` clears the marker, so the check runs at most once per interruption.
        if let Some(err_mark) = self.interrupted_plain_by_comment.take() {
            // BS4K should only trigger when the continuation would start on the immediate next
            // line (no intervening empty/comment-only lines). A blank line resets the folding
            // opportunity and thus should not error.
            let is_immediate_next_line = self.mark.line == err_mark.line + 1;

            // Optimization: do the cheap checks first; only then request extra lookahead / do deeper checks.
            if self.flow_level == 0
                && is_immediate_next_line
                && (self.mark.col as isize) > self.indent
            {
                // Ensure enough lookahead for:
                // - the checks below (peek/peek_nth)
                // - document indicator detection which needs 4 chars.
                self.input.lookahead(4);

                if !self.input.next_is_z()
                    && !self.input.next_is_document_indicator()
                    && self.input.next_can_be_plain_scalar(false)
                {
                    // The error is reported at the recorded interruption mark, not at the
                    // current position.
                    return Err(ScanError::new_str(
                        err_mark,
                        "comment intercepting the multiline text",
                    ));
                }
            }
        }

        Ok(())
    }
1578
1579    /// Skip over YAML whitespace (` `, `\n`, `\r`).
1580    ///
1581    /// # Errors
1582    /// This function returns an error if no whitespace was found.
1583    fn skip_yaml_whitespace(&mut self) -> ScanResult {
1584        let mut need_whitespace = true;
1585        loop {
1586            match self.input.look_ch() {
1587                ' ' => {
1588                    self.skip_blank();
1589
1590                    need_whitespace = false;
1591                }
1592                '\n' | '\r' => {
1593                    self.input.lookahead(2);
1594                    self.skip_linebreak();
1595                    if self.flow_level == 0 {
1596                        self.allow_simple_key();
1597                    }
1598                    need_whitespace = false;
1599                }
1600                '#' => {
1601                    if need_whitespace {
1602                        self.skip_comment();
1603                    } else {
1604                        self.push_comment_token()?;
1605                    }
1606                }
1607                _ => break,
1608            }
1609        }
1610
1611        if need_whitespace {
1612            Err(ScanError::new_str(self.mark(), "expected whitespace"))
1613        } else {
1614            Ok(())
1615        }
1616    }
1617
1618    fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> Result<SkipTabs, ScanError> {
1619        debug_assert!(!matches!(skip_tabs, SkipTabs::Result(..)));
1620
1621        let mut encountered_tab = false;
1622        let mut has_yaml_ws = false;
1623
1624        loop {
1625            match self.input.look_ch() {
1626                ' ' => {
1627                    has_yaml_ws = true;
1628                    self.skip_blank();
1629                }
1630                '\t' if skip_tabs != SkipTabs::No => {
1631                    encountered_tab = true;
1632                    self.skip_blank();
1633                }
1634                '#' if !encountered_tab && !has_yaml_ws => {
1635                    return Err(ScanError::new_str(
1636                        self.mark,
1637                        "comments must be separated from other tokens by whitespace",
1638                    ));
1639                }
1640                '#' => self.push_comment_token()?,
1641                _ => break,
1642            }
1643        }
1644
1645        Ok(SkipTabs::Result(encountered_tab, has_yaml_ws))
1646    }
1647
1648    fn fetch_stream_start(&mut self) {
1649        let mark = self.mark;
1650        self.indent = -1;
1651        self.stream_start_produced = true;
1652        self.allow_simple_key();
1653        self.tokens.push_back(Token(
1654            Span::empty(mark),
1655            TokenType::StreamStart(TEncoding::Utf8),
1656        ));
1657        self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0)));
1658    }
1659
1660    fn fetch_stream_end(&mut self) -> ScanResult {
1661        // force new line
1662        if self.mark.col != 0 {
1663            self.mark.col = 0;
1664            self.mark.line += 1;
1665        }
1666
1667        if let Some((mark, bracket)) = self.flow_markers.pop() {
1668            return Err(Self::unclosed_bracket(mark, bracket));
1669        }
1670
1671        // If the stream ended, we won't have more context. We can stall all the simple keys we
1672        // had. If one was required, however, that was an error and we must propagate it.
1673        for sk in &mut self.simple_keys {
1674            if sk.required && sk.possible {
1675                return Err(self.simple_key_expected());
1676            }
1677            sk.possible = false;
1678        }
1679
1680        self.unroll_indent(-1);
1681        self.remove_simple_key()?;
1682        self.disallow_simple_key();
1683
1684        self.tokens
1685            .push_back(Token(Span::empty(self.mark), TokenType::StreamEnd));
1686        Ok(())
1687    }
1688
1689    fn fetch_directive(&mut self) -> ScanResult {
1690        self.unroll_indent(-1);
1691        self.remove_simple_key()?;
1692
1693        self.disallow_simple_key();
1694
1695        let token_index = self.tokens.len();
1696        let tok = self.scan_directive()?;
1697        self.insert_token(token_index, tok);
1698
1699        Ok(())
1700    }
1701
1702    fn scan_directive(&mut self) -> Result<Token<'input>, ScanError> {
1703        let start_mark = self.mark;
1704        self.skip_non_blank();
1705
1706        let name = self.scan_directive_name()?;
1707        let tok = match name.as_ref() {
1708            "YAML" => self.scan_version_directive_value(&start_mark)?,
1709            "TAG" => self.scan_tag_directive_value(&start_mark)?,
1710            _ => {
1711                let mut params = Vec::new();
1712                while self.input.next_is_blank() {
1713                    let n_blanks = self.input.skip_while_blank();
1714                    self.mark.offsets.chars += n_blanks;
1715                    self.mark.col += n_blanks;
1716                    self.mark.offsets.bytes = self.input.byte_offset();
1717
1718                    if !is_blank_or_breakz(self.input.peek()) {
1719                        let mut param = String::new();
1720                        let n_chars = self.input.fetch_while_is_yaml_non_space(&mut param);
1721                        self.mark.offsets.chars += n_chars;
1722                        self.mark.col += n_chars;
1723                        self.mark.offsets.bytes = self.input.byte_offset();
1724                        params.push(param);
1725                    }
1726                }
1727
1728                Token(
1729                    Span::new(start_mark, self.mark),
1730                    TokenType::ReservedDirective(name, params),
1731                )
1732            }
1733        };
1734
1735        self.skip_ws_to_eol(SkipTabs::Yes)?;
1736
1737        if self.input.next_is_breakz() {
1738            self.input.lookahead(2);
1739            self.skip_linebreak();
1740            Ok(tok)
1741        } else {
1742            Err(ScanError::new_str(
1743                start_mark,
1744                "while scanning a directive, did not find expected comment or line break",
1745            ))
1746        }
1747    }
1748
1749    fn scan_version_directive_value(&mut self, mark: &Marker) -> Result<Token<'input>, ScanError> {
1750        let n_blanks = self.input.skip_while_blank();
1751        self.mark.offsets.chars += n_blanks;
1752        self.mark.col += n_blanks;
1753        self.mark.offsets.bytes = self.input.byte_offset();
1754
1755        let major = self.scan_version_directive_number(mark)?;
1756
1757        if self.input.peek() != '.' {
1758            return Err(ScanError::new_str(
1759                *mark,
1760                "while scanning a YAML directive, did not find expected digit or '.' character",
1761            ));
1762        }
1763        self.skip_non_blank();
1764
1765        let minor = self.scan_version_directive_number(mark)?;
1766
1767        Ok(Token(
1768            Span::new(*mark, self.mark),
1769            TokenType::VersionDirective(major, minor),
1770        ))
1771    }
1772
1773    fn scan_directive_name(&mut self) -> Result<String, ScanError> {
1774        let start_mark = self.mark;
1775        let mut string = String::new();
1776
1777        let n_chars = self.input.fetch_while_is_yaml_non_space(&mut string);
1778        self.mark.offsets.chars += n_chars;
1779        self.mark.col += n_chars;
1780        self.mark.offsets.bytes = self.input.byte_offset();
1781
1782        if string.is_empty() {
1783            return Err(ScanError::new_str(
1784                start_mark,
1785                "while scanning a directive, could not find expected directive name",
1786            ));
1787        }
1788
1789        if !is_blank_or_breakz(self.input.peek()) {
1790            return Err(ScanError::new_str(
1791                start_mark,
1792                "while scanning a directive, found unexpected non-alphabetical character",
1793            ));
1794        }
1795
1796        Ok(string)
1797    }
1798
1799    fn scan_version_directive_number(&mut self, mark: &Marker) -> Result<u32, ScanError> {
1800        let mut val = 0u32;
1801        let mut length = 0usize;
1802        while let Some(digit) = self.input.look_ch().to_digit(10) {
1803            if length + 1 > 9 {
1804                return Err(ScanError::new_str(
1805                    *mark,
1806                    "while scanning a YAML directive, found extremely long version number",
1807                ));
1808            }
1809            length += 1;
1810            val = val * 10 + digit;
1811            self.skip_non_blank();
1812        }
1813
1814        if length == 0 {
1815            return Err(ScanError::new_str(
1816                *mark,
1817                "while scanning a YAML directive, did not find expected version number",
1818            ));
1819        }
1820
1821        Ok(val)
1822    }
1823
1824    fn scan_tag_directive_value(&mut self, mark: &Marker) -> Result<Token<'input>, ScanError> {
1825        let n_blanks = self.input.skip_while_blank();
1826        self.mark.offsets.chars += n_blanks;
1827        self.mark.col += n_blanks;
1828        self.mark.offsets.bytes = self.input.byte_offset();
1829
1830        let handle = self.scan_tag_handle_directive_cow(mark)?;
1831
1832        let n_blanks = self.input.skip_while_blank();
1833        self.mark.offsets.chars += n_blanks;
1834        self.mark.col += n_blanks;
1835        self.mark.offsets.bytes = self.input.byte_offset();
1836
1837        let prefix = self.scan_tag_prefix_directive_cow(mark)?;
1838
1839        self.input.lookahead(1);
1840
1841        if self.input.next_is_blank_or_breakz() {
1842            Ok(Token(
1843                Span::new(*mark, self.mark),
1844                TokenType::TagDirective(handle, prefix),
1845            ))
1846        } else {
1847            Err(ScanError::new_str(
1848                *mark,
1849                "while scanning TAG, did not find expected whitespace or line break",
1850            ))
1851        }
1852    }
1853
1854    fn fetch_tag(&mut self) -> ScanResult {
1855        self.save_simple_key();
1856        self.disallow_simple_key();
1857
1858        let tok = self.scan_tag()?;
1859        self.tokens.push_back(tok);
1860        Ok(())
1861    }
1862
1863    fn scan_tag(&mut self) -> Result<Token<'input>, ScanError> {
1864        let start_mark = self.mark;
1865
1866        // Check if the tag is in the canonical form (verbatim).
1867        self.input.lookahead(2);
1868
1869        // If byte_offset is not available, use the original owned-only path.
1870        if self.input.byte_offset().is_none() {
1871            return self.scan_tag_owned(&start_mark);
1872        }
1873
1874        let (handle, suffix): (Cow<'input, str>, Cow<'input, str>) =
1875            if self.input.nth_char_is(1, '<') {
1876                // Verbatim tags always need owned strings (URI escapes).
1877                let suffix = self.scan_verbatim_tag(&start_mark)?;
1878                (Cow::Owned(String::new()), Cow::Owned(suffix))
1879            } else {
1880                // The tag has either the '!suffix' or the '!handle!suffix'
1881                let handle = self.scan_tag_handle_cow(&start_mark)?;
1882                // Check if it is, indeed, handle.
1883                if handle.len() >= 2 && handle.starts_with('!') && handle.ends_with('!') {
1884                    // A tag handle starting with "!!" is a secondary tag handle.
1885                    let suffix = self.scan_tag_shorthand_suffix_cow(&start_mark, true)?;
1886                    (handle, suffix)
1887                } else {
1888                    // Not a real handle, it's part of the suffix.
1889                    // E.g., "!foo" -> handle="!", suffix="foo"
1890                    // The "handle" we scanned is actually "!" + suffix_part1.
1891                    // We need to also scan any remaining suffix characters.
1892                    let remaining_suffix =
1893                        self.scan_tag_shorthand_suffix_cow(&start_mark, false)?;
1894
1895                    // Extract suffix from handle (skip leading '!') and combine with remaining.
1896                    let suffix = if handle.len() > 1 {
1897                        if remaining_suffix.is_empty() {
1898                            // The suffix is just what's in handle after '!'
1899                            match handle {
1900                                Cow::Borrowed(s) => Cow::Borrowed(&s[1..]),
1901                                Cow::Owned(s) => Cow::Owned(s[1..].to_owned()),
1902                            }
1903                        } else {
1904                            // Combine handle (minus leading '!') with remaining suffix.
1905                            let mut combined = handle[1..].to_owned();
1906                            combined.push_str(&remaining_suffix);
1907                            Cow::Owned(combined)
1908                        }
1909                    } else {
1910                        // handle is just "!", suffix is whatever we scanned after
1911                        remaining_suffix
1912                    };
1913
1914                    // A special case: the '!' tag.  Set the handle to '' and the
1915                    // suffix to '!'.
1916                    if suffix.is_empty() {
1917                        (Cow::Borrowed(""), Cow::Borrowed("!"))
1918                    } else {
1919                        (Cow::Borrowed("!"), suffix)
1920                    }
1921                }
1922            };
1923
1924        if is_blank_or_breakz(self.input.look_ch())
1925            || (self.flow_level > 0 && matches!(self.input.peek(), ',' | ']' | '}'))
1926        {
1927            // YAML example 7.2 allows a tag to annotate an empty scalar when a separator or flow
1928            // delimiter follows.
1929            Ok(Token(
1930                Span::new(start_mark, self.mark),
1931                TokenType::Tag(handle, suffix),
1932            ))
1933        } else {
1934            Err(ScanError::new_str(
1935                start_mark,
1936                "while scanning a tag, did not find expected whitespace or line break",
1937            ))
1938        }
1939    }
1940
1941    /// Original owned-only tag scanning path for inputs without `byte_offset` support.
1942    fn scan_tag_owned(&mut self, start_mark: &Marker) -> Result<Token<'input>, ScanError> {
1943        let mut handle = String::new();
1944        let mut suffix;
1945
1946        if self.input.nth_char_is(1, '<') {
1947            suffix = self.scan_verbatim_tag(start_mark)?;
1948        } else {
1949            // The tag has either the '!suffix' or the '!handle!suffix'
1950            handle = self.scan_tag_handle(false, start_mark)?;
1951            // Check if it is, indeed, handle.
1952            if handle.len() >= 2 && handle.starts_with('!') && handle.ends_with('!') {
1953                // A tag handle starting with "!!" is a secondary tag handle.
1954                let is_secondary_handle = handle == "!!";
1955                suffix =
1956                    self.scan_tag_shorthand_suffix(false, is_secondary_handle, "", start_mark)?;
1957            } else {
1958                suffix = self.scan_tag_shorthand_suffix(false, false, &handle, start_mark)?;
1959                "!".clone_into(&mut handle);
1960                // A special case: the '!' tag.  Set the handle to '' and the
1961                // suffix to '!'.
1962                if suffix.is_empty() {
1963                    handle.clear();
1964                    "!".clone_into(&mut suffix);
1965                }
1966            }
1967        }
1968
1969        if is_blank_or_breakz(self.input.look_ch())
1970            || (self.flow_level > 0 && matches!(self.input.peek(), ',' | ']' | '}'))
1971        {
1972            // YAML example 7.2 allows a tag to annotate an empty scalar when a separator or flow
1973            // delimiter follows.
1974            Ok(Token(
1975                Span::new(*start_mark, self.mark),
1976                TokenType::Tag(handle.into(), suffix.into()),
1977            ))
1978        } else {
1979            Err(ScanError::new_str(
1980                *start_mark,
1981                "while scanning a tag, did not find expected whitespace or line break",
1982            ))
1983        }
1984    }
1985
1986    /// Scan a tag handle as a `Cow<str>`, borrowing when possible.
1987    ///
1988    /// Tag handles are of the form `!`, `!!`, or `!name!` where name is ASCII alphanumeric.
1989    /// Since they contain no escape sequences, they can always be borrowed from `StrInput`.
1990    fn scan_tag_handle_cow(&mut self, mark: &Marker) -> Result<Cow<'input, str>, ScanError> {
1991        let Some(start) = self.input.byte_offset() else {
1992            return Ok(Cow::Owned(self.scan_tag_handle(false, mark)?));
1993        };
1994
1995        if self.input.look_ch() != '!' {
1996            return Err(ScanError::new_str(
1997                *mark,
1998                "while scanning a tag, did not find expected '!'",
1999            ));
2000        }
2001
2002        // Consume the leading '!'.
2003        self.skip_non_blank();
2004
2005        // Consume ns-word-char (ASCII alphanumeric, '_' or '-') characters.
2006        self.input.lookahead(1);
2007        while self.input.next_is_alpha() {
2008            self.skip_non_blank();
2009            self.input.lookahead(1);
2010        }
2011
2012        // Optional trailing '!'.
2013        if self.input.peek() == '!' {
2014            self.skip_non_blank();
2015        }
2016
2017        let Some(end) = self.input.byte_offset() else {
2018            return Ok(Cow::Owned(self.scan_tag_handle(false, mark)?));
2019        };
2020
2021        if let Some(slice) = self.try_borrow_slice(start, end) {
2022            Ok(Cow::Borrowed(slice))
2023        } else {
2024            let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
2025                ScanError::new_str(
2026                    *mark,
2027                    "internal error: input advertised slicing but did not provide a slice",
2028                )
2029            })?;
2030            Ok(Cow::Owned(slice.to_owned()))
2031        }
2032    }
2033
2034    /// Scan a tag shorthand suffix as a `Cow<str>`, borrowing when possible.
2035    ///
2036    /// The suffix can be borrowed only if no `%` URI escape sequences are present.
2037    fn scan_tag_shorthand_suffix_cow(
2038        &mut self,
2039        mark: &Marker,
2040        require_non_empty: bool,
2041    ) -> Result<Cow<'input, str>, ScanError> {
2042        let Some(start) = self.input.byte_offset() else {
2043            return Ok(Cow::Owned(
2044                self.scan_tag_shorthand_suffix(false, false, "", mark)?,
2045            ));
2046        };
2047
2048        // Scan tag characters, checking for URI escapes.
2049        while is_tag_char(self.input.look_ch()) {
2050            if self.input.peek() == '%' {
2051                // URI escape found - must decode, so fall back to owned path.
2052                let current = self
2053                    .input
2054                    .byte_offset()
2055                    .expect("byte_offset() must remain available once enabled");
2056                let mut out = if let Some(slice) = self.input.slice_bytes(start, current) {
2057                    slice.to_owned()
2058                } else {
2059                    String::new()
2060                };
2061
2062                // Continue scanning with owned buffer.
2063                while is_tag_char(self.input.look_ch()) {
2064                    if self.input.peek() == '%' {
2065                        out.push(self.scan_uri_escapes(mark)?);
2066                    } else {
2067                        out.push(self.input.peek());
2068                        self.skip_non_blank();
2069                    }
2070                }
2071                return Ok(Cow::Owned(out));
2072            }
2073            self.skip_non_blank();
2074        }
2075
2076        let Some(end) = self.input.byte_offset() else {
2077            return Ok(Cow::Owned(
2078                self.scan_tag_shorthand_suffix(false, false, "", mark)?,
2079            ));
2080        };
2081
2082        if require_non_empty && start == end {
2083            return Err(ScanError::new_str(
2084                *mark,
2085                "while parsing a tag, did not find expected tag URI",
2086            ));
2087        }
2088
2089        if let Some(slice) = self.try_borrow_slice(start, end) {
2090            Ok(Cow::Borrowed(slice))
2091        } else {
2092            let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
2093                ScanError::new_str(
2094                    *mark,
2095                    "internal error: input advertised slicing but did not provide a slice",
2096                )
2097            })?;
2098            Ok(Cow::Owned(slice.to_owned()))
2099        }
2100    }
2101
2102    fn scan_tag_handle(&mut self, directive: bool, mark: &Marker) -> Result<String, ScanError> {
2103        let mut string = String::new();
2104        if self.input.look_ch() != '!' {
2105            return Err(ScanError::new_str(
2106                *mark,
2107                "while scanning a tag, did not find expected '!'",
2108            ));
2109        }
2110
2111        string.push(self.input.peek());
2112        self.skip_non_blank();
2113
2114        let n_chars = self.input.fetch_while_is_alpha(&mut string);
2115        self.mark.offsets.chars += n_chars;
2116        self.mark.col += n_chars;
2117        self.mark.offsets.bytes = self.input.byte_offset();
2118
2119        // Check if the trailing character is '!' and copy it.
2120        if self.input.peek() == '!' {
2121            string.push(self.input.peek());
2122            self.skip_non_blank();
2123        } else if directive && string != "!" {
2124            // It's either the '!' tag or not really a tag handle.  If it's a %TAG
2125            // directive, it's an error.  If it's a tag token, it must be a part of
2126            // URI.
2127            return Err(ScanError::new_str(
2128                *mark,
2129                "while parsing a tag directive, did not find expected '!'",
2130            ));
2131        }
2132        Ok(string)
2133    }
2134
2135    /// Scan for a tag prefix (6.8.2.2).
2136    ///
2137    /// There are 2 kinds of tag prefixes:
2138    ///   - Local: Starts with a `!`, contains only URI chars (`!foo`)
2139    ///   - Global: Starts with a tag char, contains then URI chars (`!foo,2000:app/`)
2140    fn scan_tag_prefix(&mut self, start_mark: &Marker) -> Result<String, ScanError> {
2141        let mut string = String::new();
2142
2143        if self.input.look_ch() == '!' {
2144            // If we have a local tag, insert and skip `!`.
2145            string.push(self.input.peek());
2146            self.skip_non_blank();
2147        } else if !is_tag_char(self.input.peek()) {
2148            // Otherwise, check if the first global tag character is valid.
2149            return Err(ScanError::new_str(
2150                *start_mark,
2151                "invalid global tag character",
2152            ));
2153        } else if self.input.peek() == '%' {
2154            // If it is valid and an escape sequence, escape it.
2155            string.push(self.scan_uri_escapes(start_mark)?);
2156        } else {
2157            // Otherwise, push the first character.
2158            string.push(self.input.peek());
2159            self.skip_non_blank();
2160        }
2161
2162        while is_uri_char(self.input.look_ch()) {
2163            if self.input.peek() == '%' {
2164                string.push(self.scan_uri_escapes(start_mark)?);
2165            } else {
2166                string.push(self.input.peek());
2167                self.skip_non_blank();
2168            }
2169        }
2170
2171        Ok(string)
2172    }
2173
2174    /// Scan for a verbatim tag.
2175    ///
2176    /// The prefixing `!<` must _not_ have been skipped.
2177    fn scan_verbatim_tag(&mut self, start_mark: &Marker) -> Result<String, ScanError> {
2178        // Eat `!<`
2179        self.skip_non_blank();
2180        self.skip_non_blank();
2181
2182        let mut string = String::new();
2183        while is_uri_char(self.input.look_ch()) {
2184            if self.input.peek() == '%' {
2185                string.push(self.scan_uri_escapes(start_mark)?);
2186            } else {
2187                string.push(self.input.peek());
2188                self.skip_non_blank();
2189            }
2190        }
2191
2192        if string.is_empty() {
2193            return Err(ScanError::new_str(
2194                *start_mark,
2195                "while parsing a tag, did not find expected tag URI",
2196            ));
2197        }
2198
2199        if self.input.peek() != '>' {
2200            return Err(ScanError::new_str(
2201                *start_mark,
2202                "while scanning a verbatim tag, did not find the expected '>'",
2203            ));
2204        }
2205        self.skip_non_blank();
2206
2207        Ok(string)
2208    }
2209
2210    fn scan_tag_shorthand_suffix(
2211        &mut self,
2212        _directive: bool,
2213        _is_secondary: bool,
2214        head: &str,
2215        mark: &Marker,
2216    ) -> Result<String, ScanError> {
2217        let mut length = head.len();
2218        let mut string = String::new();
2219
2220        // Copy the head if needed.
2221        // Note that we don't copy the leading '!' character.
2222        if length > 1 {
2223            string.extend(head.chars().skip(1));
2224        }
2225
2226        while is_tag_char(self.input.look_ch()) {
2227            // Check if it is a URI-escape sequence.
2228            if self.input.peek() == '%' {
2229                string.push(self.scan_uri_escapes(mark)?);
2230            } else {
2231                string.push(self.input.peek());
2232                self.skip_non_blank();
2233            }
2234
2235            length += 1;
2236        }
2237
2238        if length == 0 {
2239            return Err(ScanError::new_str(
2240                *mark,
2241                "while parsing a tag, did not find expected tag URI",
2242            ));
2243        }
2244
2245        Ok(string)
2246    }
2247
2248    fn scan_uri_escapes(&mut self, mark: &Marker) -> Result<char, ScanError> {
2249        let mut width = 0usize;
2250        let mut bytes = [0u8; 4];
2251        let mut bytes_len = 0usize;
2252        loop {
2253            self.input.lookahead(3);
2254
2255            let c = self.input.peek_nth(1);
2256            let nc = self.input.peek_nth(2);
2257
2258            if !(self.input.peek() == '%' && is_hex(c) && is_hex(nc)) {
2259                return Err(ScanError::new_str(
2260                    *mark,
2261                    "while parsing a tag, found an invalid escape sequence",
2262                ));
2263            }
2264
2265            let byte = u8::try_from((as_hex(c) << 4) + as_hex(nc))
2266                .expect("two hex nibbles always fit in a byte");
2267            if width == 0 {
2268                width = match byte {
2269                    _ if byte & 0x80 == 0x00 => 1,
2270                    _ if byte & 0xE0 == 0xC0 => 2,
2271                    _ if byte & 0xF0 == 0xE0 => 3,
2272                    _ if byte & 0xF8 == 0xF0 => 4,
2273                    _ => {
2274                        return Err(ScanError::new_str(
2275                            *mark,
2276                            "while parsing a tag, found an incorrect leading UTF-8 byte",
2277                        ));
2278                    }
2279                };
2280            } else if byte & 0xc0 != 0x80 {
2281                return Err(ScanError::new_str(
2282                    *mark,
2283                    "while parsing a tag, found an incorrect trailing UTF-8 byte",
2284                ));
2285            }
2286
2287            bytes[bytes_len] = byte;
2288            bytes_len += 1;
2289
2290            self.skip_n_non_blank(3);
2291
2292            width -= 1;
2293            if width == 0 {
2294                break;
2295            }
2296        }
2297
2298        let s = core::str::from_utf8(&bytes[..bytes_len]).map_err(|_| {
2299            ScanError::new_str(
2300                *mark,
2301                "while parsing a tag, found an invalid UTF-8 codepoint",
2302            )
2303        })?;
2304
2305        let mut chars = s.chars();
2306        match (chars.next(), chars.next()) {
2307            (Some(ch), None) => Ok(ch),
2308            _ => Err(ScanError::new_str(
2309                *mark,
2310                "while parsing a tag, found an invalid UTF-8 codepoint",
2311            )),
2312        }
2313    }
2314
2315    fn fetch_anchor(&mut self, alias: bool) -> ScanResult {
2316        self.save_simple_key();
2317        self.disallow_simple_key();
2318
2319        let tok = self.scan_anchor(alias)?;
2320
2321        self.tokens.push_back(tok);
2322
2323        Ok(())
2324    }
2325
2326    fn scan_anchor(&mut self, alias: bool) -> Result<Token<'input>, ScanError> {
2327        let start_mark = self.mark;
2328
2329        // Skip `&` / `*`.
2330        self.skip_non_blank();
2331
2332        // Borrow from input when possible.
2333        if let Some(start) = self.input.byte_offset() {
2334            while is_anchor_char(self.input.look_ch()) {
2335                self.skip_non_blank();
2336            }
2337
2338            let end = self
2339                .input
2340                .byte_offset()
2341                .expect("byte_offset() must remain available once enabled");
2342
2343            if start == end {
2344                return Err(ScanError::new_str(start_mark, "while scanning an anchor or alias, did not find expected alphabetic or numeric character"));
2345            }
2346
2347            let cow = if let Some(slice) = self.try_borrow_slice(start, end) {
2348                Cow::Borrowed(slice)
2349            } else if let Some(slice) = self.input.slice_bytes(start, end) {
2350                Cow::Owned(slice.to_owned())
2351            } else {
2352                return Err(ScanError::new_str(
2353                    start_mark,
2354                    "internal error: input advertised slicing but did not provide a slice",
2355                ));
2356            };
2357
2358            let tok = if alias {
2359                TokenType::Alias(cow)
2360            } else {
2361                TokenType::Anchor(cow)
2362            };
2363            return Ok(Token(Span::new(start_mark, self.mark), tok));
2364        }
2365
2366        let mut string = String::new();
2367        while is_anchor_char(self.input.look_ch()) {
2368            string.push(self.input.peek());
2369            self.skip_non_blank();
2370        }
2371
2372        if string.is_empty() {
2373            return Err(ScanError::new_str(start_mark, "while scanning an anchor or alias, did not find expected alphabetic or numeric character"));
2374        }
2375
2376        let tok = if alias {
2377            TokenType::Alias(string.into())
2378        } else {
2379            TokenType::Anchor(string.into())
2380        };
2381        Ok(Token(Span::new(start_mark, self.mark), tok))
2382    }
2383
2384    fn fetch_flow_collection_start(&mut self, tok: TokenType<'input>) -> ScanResult {
2385        // The indicators '[' and '{' may start a simple key.
2386        self.save_simple_key();
2387
2388        let start_mark = self.mark;
2389        let indicator = self.input.peek();
2390        self.flow_markers.push((start_mark, indicator));
2391
2392        self.roll_one_col_indent();
2393        self.increase_flow_level()?;
2394
2395        self.allow_simple_key();
2396
2397        self.skip_non_blank();
2398
2399        if tok == TokenType::FlowMappingStart {
2400            self.flow_mapping_started.push(true);
2401        } else {
2402            self.flow_mapping_started.push(false);
2403            self.implicit_flow_mapping_states
2404                .push(ImplicitMappingState::Possible);
2405        }
2406
2407        let token_index = self.tokens.len();
2408        self.skip_ws_to_eol(SkipTabs::Yes)?;
2409
2410        self.insert_token(token_index, Token(Span::new(start_mark, self.mark), tok));
2411        Ok(())
2412    }
2413
2414    fn fetch_flow_collection_end(&mut self, tok: TokenType<'input>) -> ScanResult {
2415        // A closing bracket without a corresponding opening is invalid YAML.
2416        if self.flow_level == 0 {
2417            return Err(ScanError::new_str(self.mark, "misplaced bracket"));
2418        }
2419
2420        let Some((open_mark, open_ch)) = self.flow_markers.pop() else {
2421            return Err(ScanError::new_str(self.mark, "misplaced bracket"));
2422        };
2423
2424        let (expected_open, actual_close) = match tok {
2425            TokenType::FlowSequenceEnd => ('[', ']'),
2426            TokenType::FlowMappingEnd => ('{', '}'),
2427            _ => unreachable!("flow collection end called with non-closing token"),
2428        };
2429
2430        if open_ch != expected_open {
2431            return Err(ScanError::new(
2432                open_mark,
2433                format!("mismatched bracket '{open_ch}' closed by '{actual_close}'"),
2434            ));
2435        }
2436
2437        let flow_level = self.flow_level;
2438
2439        self.remove_simple_key()?;
2440
2441        if matches!(tok, TokenType::FlowSequenceEnd) {
2442            self.end_implicit_mapping(self.mark, flow_level);
2443            // We are out exiting the flow sequence, nesting goes down 1 level.
2444            self.implicit_flow_mapping_states.pop();
2445        }
2446        self.flow_mapping_started.pop();
2447
2448        self.decrease_flow_level();
2449
2450        self.disallow_simple_key();
2451
2452        let start_mark = self.mark;
2453        self.skip_non_blank();
2454        let token_index = self.tokens.len();
2455        self.skip_ws_to_eol(SkipTabs::Yes)?;
2456
2457        // A flow collection within a flow mapping can be a key. In that case, the value may be
2458        // adjacent to the `:`.
2459        // ```yaml
2460        // - [ {a: b}:value ]
2461        // ```
2462        if self.flow_level > 0 {
2463            self.adjacent_value_allowed_at = self.mark.index();
2464        }
2465
2466        self.insert_token(token_index, Token(Span::new(start_mark, self.mark), tok));
2467        Ok(())
2468    }
2469
2470    /// Push the `FlowEntry` token and skip over the `,`.
2471    fn fetch_flow_entry(&mut self) -> ScanResult {
2472        self.remove_simple_key()?;
2473        self.allow_simple_key();
2474
2475        self.end_implicit_mapping(self.mark, self.flow_level);
2476        if self.current_flow_collection_is_sequence() {
2477            self.set_current_flow_mapping_started(false);
2478        }
2479
2480        let start_mark = self.mark;
2481        self.skip_non_blank();
2482        let token_index = self.tokens.len();
2483        self.skip_ws_to_eol(SkipTabs::Yes)?;
2484
2485        self.insert_token(
2486            token_index,
2487            Token(Span::new(start_mark, self.mark), TokenType::FlowEntry),
2488        );
2489        Ok(())
2490    }
2491
2492    fn increase_flow_level(&mut self) -> ScanResult {
2493        self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0)));
2494        self.flow_level = self
2495            .flow_level
2496            .checked_add(1)
2497            .ok_or_else(|| ScanError::new_str(self.mark, "recursion limit exceeded"))?;
2498        Ok(())
2499    }
2500
2501    fn decrease_flow_level(&mut self) {
2502        if self.flow_level > 0 {
2503            self.flow_level -= 1;
2504            self.simple_keys.pop().unwrap();
2505        }
2506    }
2507
2508    /// Push the `Block*` token(s) and skip over the `-`.
2509    ///
2510    /// Add an indentation level and push a `BlockSequenceStart` token if needed, then push a
2511    /// `BlockEntry` token.
2512    /// This function only skips over the `-` and does not fetch the entry value.
2513    fn fetch_block_entry(&mut self) -> ScanResult {
2514        if self.flow_level > 0 {
2515            // - * only allowed in block
2516            return Err(ScanError::new_str(
2517                self.mark,
2518                r#""-" is only valid inside a block"#,
2519            ));
2520        }
2521        // Check if we are allowed to start a new entry.
2522        if !self.simple_key_allowed {
2523            return Err(ScanError::new_str(
2524                self.mark,
2525                "block sequence entries are not allowed in this context",
2526            ));
2527        }
2528
2529        // ???, fixes test G9HC.
2530        if let Some(Token(span, TokenType::Anchor(..) | TokenType::Tag(..))) = self.tokens.back() {
2531            if self.mark.col == 0 && span.start.col == 0 && self.indent > -1 {
2532                return Err(ScanError::new_str(
2533                    span.start,
2534                    "invalid indentation for anchor",
2535                ));
2536            }
2537        }
2538
2539        // Skip over the `-`.
2540        let mark = self.mark;
2541        self.skip_non_blank();
2542
2543        // generate BLOCK-SEQUENCE-START if indented
2544        self.roll_indent(mark.col, None, TokenType::BlockSequenceStart, mark);
2545        let token_index = self.tokens.len();
2546        let found_tabs = self.skip_ws_to_eol(SkipTabs::Yes)?.found_tabs();
2547        self.input.lookahead(2);
2548        if found_tabs && self.input.next_char_is('-') && is_blank_or_breakz(self.input.peek_nth(1))
2549        {
2550            return Err(ScanError::new_str(
2551                self.mark,
2552                "'-' must be followed by a valid YAML whitespace",
2553            ));
2554        }
2555
2556        self.skip_ws_to_eol(SkipTabs::No)?;
2557        self.input.lookahead(1);
2558        if self.input.next_is_break() || self.input.next_is_flow() {
2559            self.roll_one_col_indent();
2560        }
2561
2562        self.remove_simple_key()?;
2563        self.allow_simple_key();
2564
2565        self.insert_token(
2566            token_index,
2567            Token(Span::empty(self.mark), TokenType::BlockEntry),
2568        );
2569
2570        Ok(())
2571    }
2572
2573    fn fetch_document_indicator(&mut self, t: TokenType<'input>) -> ScanResult {
2574        if let Some((mark, bracket)) = self.flow_markers.pop() {
2575            return Err(ScanError::new(
2576                mark,
2577                format!("unclosed bracket '{bracket}'"),
2578            ));
2579        }
2580
2581        self.unroll_indent(-1);
2582        self.remove_simple_key()?;
2583        self.disallow_simple_key();
2584
2585        let mark = self.mark;
2586
2587        self.skip_n_non_blank(3);
2588
2589        self.document_prefix_allowed = matches!(t, TokenType::DocumentEnd);
2590        self.tokens.push_back(Token(Span::new(mark, self.mark), t));
2591        Ok(())
2592    }
2593
2594    fn fetch_block_scalar(&mut self, literal: bool) -> ScanResult {
2595        self.save_simple_key();
2596        self.allow_simple_key();
2597        let tok = self.scan_block_scalar(literal)?;
2598
2599        self.tokens.push_back(tok);
2600        Ok(())
2601    }
2602
2603    #[allow(clippy::too_many_lines)]
2604    fn scan_block_scalar(&mut self, literal: bool) -> Result<Token<'input>, ScanError> {
2605        let start_mark = self.mark;
2606        let mut chomping = Chomping::Clip;
2607        let mut increment: usize = 0;
2608        let mut indent: usize = 0;
2609        let mut trailing_blank: bool;
2610        let mut leading_blank: bool = false;
2611        let style = if literal {
2612            ScalarStyle::Literal
2613        } else {
2614            ScalarStyle::Folded
2615        };
2616
2617        let mut string = String::new();
2618        let mut leading_break = String::new();
2619        let mut trailing_breaks = String::new();
2620        let mut chomping_break = String::new();
2621
2622        // skip '|' or '>'
2623        self.skip_non_blank();
2624        self.unroll_non_block_indents();
2625
2626        if self.input.look_ch() == '+' || self.input.peek() == '-' {
2627            if self.input.peek() == '+' {
2628                chomping = Chomping::Keep;
2629            } else {
2630                chomping = Chomping::Strip;
2631            }
2632            self.skip_non_blank();
2633            self.input.lookahead(1);
2634            if self.input.next_is_digit() {
2635                if self.input.peek() == '0' {
2636                    return Err(ScanError::new_str(
2637                        start_mark,
2638                        "while scanning a block scalar, found an indentation indicator equal to 0",
2639                    ));
2640                }
2641                increment = (self.input.peek() as usize) - ('0' as usize);
2642                self.skip_non_blank();
2643            }
2644        } else if self.input.next_is_digit() {
2645            if self.input.peek() == '0' {
2646                return Err(ScanError::new_str(
2647                    start_mark,
2648                    "while scanning a block scalar, found an indentation indicator equal to 0",
2649                ));
2650            }
2651
2652            increment = (self.input.peek() as usize) - ('0' as usize);
2653            self.skip_non_blank();
2654            self.input.lookahead(1);
2655            if self.input.peek() == '+' || self.input.peek() == '-' {
2656                if self.input.peek() == '+' {
2657                    chomping = Chomping::Keep;
2658                } else {
2659                    chomping = Chomping::Strip;
2660                }
2661                self.skip_non_blank();
2662            }
2663        }
2664
2665        self.skip_ws_to_eol(SkipTabs::Yes)?;
2666
2667        // Check if we are at the end of the line.
2668        self.input.lookahead(1);
2669        if !self.input.next_is_breakz() {
2670            return Err(ScanError::new_str(
2671                start_mark,
2672                "while scanning a block scalar, did not find expected comment or line break",
2673            ));
2674        }
2675
2676        if self.input.next_is_break() {
2677            self.input.lookahead(2);
2678            self.read_break(&mut chomping_break);
2679        }
2680
2681        if self.input.look_ch() == '\t' {
2682            return Err(ScanError::new_str(
2683                start_mark,
2684                "a block scalar content cannot start with a tab",
2685            ));
2686        }
2687
2688        if increment > 0 {
2689            indent = if self.indent >= 0 {
2690                (self.indent + increment as isize) as usize
2691            } else {
2692                increment
2693            }
2694        }
2695
2696        // Scan the leading line breaks and determine the indentation level if needed.
2697        if indent == 0 {
2698            self.skip_block_scalar_first_line_indent(&mut indent, &mut trailing_breaks);
2699        } else {
2700            self.skip_block_scalar_indent(indent, &mut trailing_breaks);
2701        }
2702
2703        // We have an end-of-stream with no content, e.g.:
2704        // ```yaml
2705        // - |+
2706        // ```
2707        if self.input.next_is_z() {
2708            let contents = match chomping {
2709                // We strip trailing line breaks. Nothing remains.
2710                Chomping::Strip => String::new(),
2711                // There was no newline after the chomping indicator.
2712                _ if self.mark.line == start_mark.line() => String::new(),
2713                // We clip lines, and there was a newline after the chomping indicator.
2714                // All other breaks are ignored.
2715                Chomping::Clip => chomping_break,
2716                // We keep lines. There was a newline after the chomping indicator but nothing
2717                // else.
2718                Chomping::Keep if trailing_breaks.is_empty() => chomping_break,
2719                // Otherwise, the newline after chomping is ignored.
2720                Chomping::Keep => trailing_breaks,
2721            };
2722            return Ok(Token(
2723                Span::new(start_mark, self.mark),
2724                TokenType::Scalar(style, contents.into()),
2725            ));
2726        }
2727
2728        if self.mark.col < indent && (self.mark.col as isize) > self.indent {
2729            if self.indent < 0 && self.mark.col == 0 {
2730                self.input.lookahead(4);
2731                if self.input.next_is_document_start()
2732                    || self.input.next_is_document_end()
2733                    || self.input.peek() == '#'
2734                {
2735                    // At the root level, an explicit indentation indicator can still yield an
2736                    // empty scalar when the next line is a document marker or comment.
2737                    // In this case, the scalar is terminated rather than under-indented.
2738                } else {
2739                    return Err(ScanError::new_str(
2740                        self.mark,
2741                        "wrongly indented line in block scalar",
2742                    ));
2743                }
2744            } else {
2745                return Err(ScanError::new_str(
2746                    self.mark,
2747                    "wrongly indented line in block scalar",
2748                ));
2749            }
2750        }
2751
2752        let mut line_buffer = String::with_capacity(100);
2753        let start_mark = self.mark;
2754        while self.mark.col == indent && !self.input.next_is_z() {
2755            if indent == 0 {
2756                self.input.lookahead(4);
2757                if self.input.next_is_document_end() {
2758                    break;
2759                }
2760            }
2761
2762            // We are at the first content character of a content line.
2763            trailing_blank = self.input.next_is_blank();
2764            if !literal && !leading_break.is_empty() && !leading_blank && !trailing_blank {
2765                string.push_str(&trailing_breaks);
2766                if trailing_breaks.is_empty() {
2767                    string.push(' ');
2768                }
2769            } else {
2770                string.push_str(&leading_break);
2771                string.push_str(&trailing_breaks);
2772            }
2773
2774            leading_break.clear();
2775            trailing_breaks.clear();
2776
2777            leading_blank = self.input.next_is_blank();
2778
2779            self.scan_block_scalar_content_line(&mut string, &mut line_buffer);
2780
2781            // break on EOF
2782            self.input.lookahead(2);
2783            if self.input.next_is_z() {
2784                break;
2785            }
2786
2787            self.read_break(&mut leading_break);
2788
2789            // Eat the following indentation spaces and line breaks.
2790            self.skip_block_scalar_indent(indent, &mut trailing_breaks);
2791        }
2792
2793        // Chomp the tail.
2794        if chomping != Chomping::Strip {
2795            string.push_str(&leading_break);
2796            // If we had reached an eof but the last character wasn't an end-of-line, check if the
2797            // last line was indented at least as the rest of the scalar, then we need to consider
2798            // there is a newline.
2799            if self.input.next_is_z() && self.mark.col >= indent.max(1) {
2800                string.push('\n');
2801            }
2802        }
2803
2804        if chomping == Chomping::Keep {
2805            string.push_str(&trailing_breaks);
2806        }
2807
2808        Ok(Token(
2809            Span::new(start_mark, self.mark),
2810            TokenType::Scalar(style, string.into()),
2811        ))
2812    }
2813
2814    /// Retrieve the contents of the line, parsing it as a block scalar.
2815    ///
2816    /// The contents will be appended to `string`. `line_buffer` is used as a temporary buffer to
2817    /// store bytes before pushing them to `string` and thus avoiding reallocating more than
2818    /// necessary. `line_buffer` is assumed to be empty upon calling this function. It will be
2819    /// `clear`ed before the end of the function.
2820    ///
2821    /// This function assumes the first character to read is the first content character in the
2822    /// line. This function does not consume the line break character(s) after the line.
2823    fn scan_block_scalar_content_line(&mut self, string: &mut String, line_buffer: &mut String) {
2824        // Start by evaluating characters in the buffer.
2825        while !self.input.buf_is_empty() && !self.input.next_is_breakz() {
2826            string.push(self.input.peek());
2827            // We may technically skip non-blank characters. However, the only distinction is
2828            // to determine what is leading whitespace and what is not. Here, we read the
2829            // contents of the line until either EOF or a line break. We know we will not read
2830            // `self.leading_whitespace` until the end of the line, where it will be reset.
2831            // This allows us to call a slightly less expensive function.
2832            self.skip_blank();
2833        }
2834
2835        // All characters that were in the buffer were consumed. We need to check if more
2836        // follow.
2837        if self.input.buf_is_empty() {
2838            // We will read all consecutive non-breakz characters. We push them into a
2839            // temporary buffer. The main difference with going through `self.buffer` is that
2840            // characters are appended here as their real size (1B for ASCII, or up to 4 bytes for
2841            // UTF-8). We can then use the internal `line_buffer` `Vec` to push data into `string`
2842            // (using `String::push_str`).
2843
2844            // line_buffer is empty at this point so we can compute n_chars here as well
2845            let mut n_chars = 0;
2846            debug_assert!(line_buffer.is_empty());
2847            while let Some(c) = self.input.raw_read_non_breakz_ch() {
2848                line_buffer.push(c);
2849                n_chars += 1;
2850            }
2851
2852            // We need to manually update our position; we haven't called a `skip` function.
2853            self.mark.col += n_chars;
2854            self.mark.offsets.chars += n_chars;
2855            self.mark.offsets.bytes = self.input.byte_offset();
2856
2857            // We can now append our bytes to our `string`.
2858            string.reserve(line_buffer.len());
2859            string.push_str(line_buffer);
2860            // This clears the _contents_ without touching the _capacity_.
2861            line_buffer.clear();
2862        }
2863    }
2864
2865    /// Skip the block scalar indentation and empty lines.
2866    fn skip_block_scalar_indent(&mut self, indent: usize, breaks: &mut String) {
2867        loop {
2868            // Consume all spaces. Tabs cannot be used as indentation.
2869            if indent < self.input.bufmaxlen() - 2 {
2870                self.input.lookahead(self.input.bufmaxlen());
2871                while self.mark.col < indent && self.input.peek() == ' ' {
2872                    self.skip_blank();
2873                }
2874            } else {
2875                loop {
2876                    self.input.lookahead(self.input.bufmaxlen());
2877                    while !self.input.buf_is_empty()
2878                        && self.mark.col < indent
2879                        && self.input.peek() == ' '
2880                    {
2881                        self.skip_blank();
2882                    }
2883                    // If we reached our indent, we can break. We must also break if we have
2884                    // reached content or EOF; that is, the buffer is not empty and the next
2885                    // character is not a space.
2886                    if self.mark.col == indent
2887                        || (!self.input.buf_is_empty() && self.input.peek() != ' ')
2888                    {
2889                        break;
2890                    }
2891                }
2892                self.input.lookahead(2);
2893            }
2894
2895            // If our current line is empty, skip over the break and continue looping.
2896            if self.input.next_is_break() {
2897                self.read_break(breaks);
2898            } else {
2899                // Otherwise, we have a content line. Return control.
2900                break;
2901            }
2902        }
2903    }
2904
2905    /// Determine the indentation level for a block scalar from the first line of its contents.
2906    ///
2907    /// The function skips over whitespace-only lines and sets `indent` to the longest
2908    /// whitespace line that was encountered.
2909    fn skip_block_scalar_first_line_indent(&mut self, indent: &mut usize, breaks: &mut String) {
2910        let mut max_indent = 0;
2911        loop {
2912            // Consume all spaces. Tabs cannot be used as indentation.
2913            while self.input.look_ch() == ' ' {
2914                self.skip_blank();
2915            }
2916
2917            if self.mark.col > max_indent {
2918                max_indent = self.mark.col;
2919            }
2920
2921            if self.input.next_is_break() {
2922                // If our current line is empty, skip over the break and continue looping.
2923                self.input.lookahead(2);
2924                self.read_break(breaks);
2925            } else {
2926                // Otherwise, we have a content line. Return control.
2927                break;
2928            }
2929        }
2930
2931        // In case a YAML document looks like:
2932        // ```yaml
2933        // |
2934        // foo
2935        // bar
2936        // ```
2937        // We need to set the indent to 0 and not 1. In all other cases, the indent must be at
2938        // least 1. When in the above example, `self.indent` will be set to -1.
2939        *indent = max_indent.max((self.indent + 1) as usize);
2940        if self.indent > 0 {
2941            *indent = (*indent).max(1);
2942        }
2943    }
2944
2945    fn fetch_flow_scalar(&mut self, single: bool) -> ScanResult {
2946        self.save_simple_key();
2947        self.disallow_simple_key();
2948
2949        let token_index = self.tokens.len();
2950        let tok = self.scan_flow_scalar(single)?;
2951
2952        // From spec: To ensure JSON compatibility, if a key inside a flow mapping is JSON-like,
2953        // YAML allows the following value to be specified adjacent to the “:”.
2954        self.skip_to_next_token()?;
2955        self.adjacent_value_allowed_at = self.mark.index();
2956
2957        self.insert_token(token_index, tok);
2958        Ok(())
2959    }
2960
2961    #[allow(clippy::too_many_lines)]
2962    fn scan_flow_scalar(&mut self, single: bool) -> Result<Token<'input>, ScanError> {
2963        let start_mark = self.mark;
2964
2965        // Output scalar contents.
2966        let mut buf = match self.input.byte_offset() {
2967            Some(off) => FlowScalarBuf::new_borrowed(off + self.input.peek().len_utf8()),
2968            None => FlowScalarBuf::new_owned(),
2969        };
2970
2971        // Scratch used to consume the *first* line break in a break run without emitting it.
2972        // (The first break folds to ' ' or to nothing depending on escaping rules.)
2973        let mut break_scratch = String::new();
2974
2975        /* Eat the left quote. */
2976        self.skip_non_blank();
2977
2978        loop {
2979            /* Check for a document indicator. */
2980            self.input.lookahead(4);
2981
2982            if self.mark.col == 0 && self.input.next_is_document_indicator() {
2983                return Err(ScanError::new_str(
2984                    start_mark,
2985                    "while scanning a quoted scalar, found unexpected document indicator",
2986                ));
2987            }
2988
2989            if self.input.next_is_z() {
2990                return Err(ScanError::new_str(start_mark, "unclosed quote"));
2991            }
2992
2993            // Do not enforce block indentation inside quoted (flow) scalars.
2994            // YAML allows line breaks within quoted scalars.
2995            let mut leading_blanks = false;
2996            self.consume_flow_scalar_non_whitespace_chars(
2997                single,
2998                &mut buf,
2999                &mut leading_blanks,
3000                &start_mark,
3001            )?;
3002
3003            match self.input.look_ch() {
3004                '\'' if single => break,
3005                '"' if !single => break,
3006                _ => {}
3007            }
3008
3009            // --- Faster whitespace / line break handling (no temporary Strings) ---
3010            //
3011            // Instead of:
3012            //   - collecting blanks into `whitespaces` and then copying them
3013            //   - collecting breaks into `leading_break` / `trailing_breaks` and then copying
3014            //
3015            // We do:
3016            //   - append trailing blanks directly to `string`, remember where they started,
3017            //     and truncate them if a line break follows.
3018            //   - for line breaks: consume the first break into a scratch (discarded),
3019            //     append subsequent breaks directly to `string`.
3020            //
3021            // These flags replace temporary-string emptiness checks:
3022            //   has_leading_break  <=> !leading_break.is_empty()
3023            //   has_trailing_breaks <=> !trailing_breaks.is_empty()
3024            let mut trailing_ws_start: Option<usize> = None;
3025            let mut has_leading_break = false;
3026            let mut has_trailing_breaks = false;
3027
3028            // For the borrowed path: track the (byte) start of a pending whitespace run.
3029            let mut pending_ws_start: Option<usize> = None;
3030
3031            // Consume blank characters.
3032            while self.input.next_is_blank() || self.input.next_is_break() {
3033                if self.input.next_is_blank() {
3034                    // Consume a space or a tab character.
3035                    if leading_blanks {
3036                        if self.input.peek() == '\t' && (self.mark.col as isize) < self.indent {
3037                            return Err(ScanError::new_str(
3038                                self.mark,
3039                                "tab cannot be used as indentation",
3040                            ));
3041                        }
3042                        self.skip_blank();
3043                    } else {
3044                        // Append to output immediately; if a break appears next, we'll truncate.
3045                        match buf {
3046                            FlowScalarBuf::Owned(ref mut string) => {
3047                                if trailing_ws_start.is_none() {
3048                                    trailing_ws_start = Some(string.len());
3049                                }
3050                                string.push(self.input.peek());
3051                            }
3052                            FlowScalarBuf::Borrowed { .. } => {
3053                                if pending_ws_start.is_none() {
3054                                    pending_ws_start = self.input.byte_offset();
3055                                }
3056                            }
3057                        }
3058                        self.skip_blank();
3059
3060                        if let (FlowScalarBuf::Borrowed { .. }, Some(ws_start), Some(ws_end)) =
3061                            (&mut buf, pending_ws_start, self.input.byte_offset())
3062                        {
3063                            buf.note_pending_ws(ws_start, ws_end);
3064                        }
3065                    }
3066                } else {
3067                    self.input.lookahead(2);
3068
3069                    // Check if it is a first line break.
3070                    if leading_blanks {
3071                        // Second+ line break in a run: preserve it.
3072                        match buf {
3073                            FlowScalarBuf::Owned(ref mut string) => self.read_break(string),
3074                            FlowScalarBuf::Borrowed { .. } => {
3075                                self.promote_flow_scalar_buf_to_owned(&start_mark, &mut buf)?;
3076                                let Some(string) = buf.as_owned_mut() else {
3077                                    unreachable!()
3078                                };
3079                                self.read_break(string);
3080                            }
3081                        }
3082                        has_trailing_breaks = true;
3083                    } else {
3084                        // First break: drop any trailing blanks we appended, then consume the break.
3085                        if let Some(pos) = trailing_ws_start.take() {
3086                            if let FlowScalarBuf::Owned(ref mut string) = buf {
3087                                string.truncate(pos);
3088                            }
3089                        }
3090
3091                        if pending_ws_start.take().is_some() {
3092                            // Trailing blanks before a break are discarded => transformation.
3093                            if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
3094                                self.promote_flow_scalar_buf_to_owned(&start_mark, &mut buf)?;
3095                            }
3096                            buf.discard_pending_ws();
3097                        } else {
3098                            buf.commit_pending_ws();
3099                        }
3100
3101                        break_scratch.clear();
3102                        self.read_break(&mut break_scratch);
3103                        // Keep `break_scratch` content (ignored) until next clear; no need to clear twice.
3104
3105                        has_leading_break = true;
3106                        leading_blanks = true;
3107                    }
3108                }
3109
3110                self.input.lookahead(1);
3111            }
3112
3113            // If we had a line break inside a quoted (flow) scalar, validate indentation
3114            // of the continuation line in block context.
3115            if leading_blanks && has_leading_break && self.flow_level == 0 {
3116                let next_ch = self.input.peek();
3117                let is_closing_quote = (single && next_ch == '\'') || (!single && next_ch == '"');
3118                if !is_closing_quote && (self.mark.col as isize) <= self.indent {
3119                    return Err(ScanError::new_str(
3120                        self.mark,
3121                        "invalid indentation in multiline quoted scalar",
3122                    ));
3123                }
3124            }
3125
3126            // Join the whitespace or fold line breaks.
3127            if leading_blanks {
3128                // Folding rule:
3129                //   if there was no leading break, preserve the pending whitespace already emitted
3130                //   if there was a leading break but no trailing breaks, fold to one space
3131                //   otherwise, preserve the trailing breaks already emitted
3132                if has_leading_break && !has_trailing_breaks {
3133                    match buf {
3134                        FlowScalarBuf::Owned(ref mut string) => string.push(' '),
3135                        FlowScalarBuf::Borrowed { .. } => {
3136                            self.promote_flow_scalar_buf_to_owned(&start_mark, &mut buf)?;
3137                            let Some(string) = buf.as_owned_mut() else {
3138                                unreachable!()
3139                            };
3140                            string.push(' ');
3141                        }
3142                    }
3143                }
3144            }
3145            // else: trailing blanks are already appended to `string`
3146        } // loop
3147
3148        // Eat the right quote.
3149        self.skip_non_blank();
3150        let end_mark = self.mark;
3151
3152        // Ensure there is no invalid trailing content.
3153        self.skip_ws_to_eol(SkipTabs::Yes)?;
3154        match self.input.peek() {
3155            // These can be encountered in flow sequences or mappings.
3156            ',' | '}' | ']' if self.flow_level > 0 => {}
3157            // An end-of-line / end-of-stream is fine. No trailing content.
3158            c if is_breakz(c) => {}
3159            // ':' can be encountered if our scalar is a key.
3160            // Outside of flow contexts, keys cannot span multiple lines
3161            ':' if self.flow_level == 0 && start_mark.line == self.mark.line => {}
3162            // Inside a flow context, this is allowed.
3163            ':' if self.flow_level > 0 => {}
3164            _ => {
3165                return Err(ScanError::new_str(
3166                    self.mark,
3167                    "invalid trailing content after double-quoted scalar",
3168                ));
3169            }
3170        }
3171
3172        let style = if single {
3173            ScalarStyle::SingleQuoted
3174        } else {
3175            ScalarStyle::DoubleQuoted
3176        };
3177
3178        let contents = match buf {
3179            FlowScalarBuf::Owned(string) => Cow::Owned(string),
3180            FlowScalarBuf::Borrowed {
3181                start,
3182                mut end,
3183                pending_ws_start,
3184                pending_ws_end,
3185            } => {
3186                // If we ended after a whitespace run, it is part of the output (no break followed).
3187                if pending_ws_start.is_some() {
3188                    end = pending_ws_end;
3189                }
3190                if let Some(slice) = self.try_borrow_slice(start, end) {
3191                    Cow::Borrowed(slice)
3192                } else {
3193                    let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
3194                        ScanError::new_str(
3195                            start_mark,
3196                            "internal error: input advertised offsets but did not provide a slice",
3197                        )
3198                    })?;
3199                    Cow::Owned(slice.to_owned())
3200                }
3201            }
3202        };
3203
3204        Ok(Token(
3205            Span::new(start_mark, end_mark),
3206            TokenType::Scalar(style, contents),
3207        ))
3208    }
3209
3210    /// Consume successive non-whitespace characters from a flow scalar.
3211    ///
3212    /// This function resolves escape sequences and stops upon encountering a whitespace, the end
3213    /// of the stream or the closing character for the scalar (`'` for single quoted scalars, `"`
3214    /// for double quoted scalars).
3215    ///
3216    /// # Errors
3217    /// Return an error if an invalid escape sequence is found.
3218    fn consume_flow_scalar_non_whitespace_chars(
3219        &mut self,
3220        single: bool,
3221        buf: &mut FlowScalarBuf,
3222        leading_blanks: &mut bool,
3223        start_mark: &Marker,
3224    ) -> Result<(), ScanError> {
3225        self.input.lookahead(2);
3226        while !is_blank_or_breakz(self.input.peek()) {
3227            match self.input.peek() {
3228                // Check for an escaped single quote.
3229                '\'' if self.input.peek_nth(1) == '\'' && single => {
3230                    if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
3231                        buf.commit_pending_ws();
3232                        self.promote_flow_scalar_buf_to_owned(start_mark, buf)?;
3233                    }
3234                    let Some(string) = buf.as_owned_mut() else {
3235                        unreachable!()
3236                    };
3237                    string.push('\'');
3238                    self.skip_n_non_blank(2);
3239                }
3240                // Check for the right quote.
3241                '\'' if single => break,
3242                '"' if !single => break,
3243                // Check for an escaped line break.
3244                '\\' if !single && is_break(self.input.peek_nth(1)) => {
3245                    self.input.lookahead(3);
3246                    if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
3247                        buf.commit_pending_ws();
3248                        self.promote_flow_scalar_buf_to_owned(start_mark, buf)?;
3249                    }
3250                    self.skip_non_blank();
3251                    self.skip_linebreak();
3252                    *leading_blanks = true;
3253                    break;
3254                }
3255                // Check for an escape sequence.
3256                '\\' if !single => {
3257                    if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
3258                        buf.commit_pending_ws();
3259                        self.promote_flow_scalar_buf_to_owned(start_mark, buf)?;
3260                    }
3261                    let Some(string) = buf.as_owned_mut() else {
3262                        unreachable!()
3263                    };
3264                    string.push(self.resolve_flow_scalar_escape_sequence(start_mark)?);
3265                }
3266                c => {
3267                    match buf {
3268                        FlowScalarBuf::Owned(ref mut string) => {
3269                            string.push(c);
3270                        }
3271                        FlowScalarBuf::Borrowed { .. } => {
3272                            buf.commit_pending_ws();
3273                        }
3274                    }
3275                    self.skip_non_blank();
3276
3277                    if let Some(new_end) = self.input.byte_offset() {
3278                        if let FlowScalarBuf::Borrowed { end, .. } = buf {
3279                            *end = new_end;
3280                        }
3281                    }
3282                }
3283            }
3284            self.input.lookahead(2);
3285        }
3286        Ok(())
3287    }
3288
3289    /// Escape the sequence we encounter in a flow scalar.
3290    ///
3291    /// `self.input.peek()` must point to the `\` starting the escape sequence.
3292    ///
3293    /// # Errors
3294    /// Return an error if an invalid escape sequence is found.
3295    fn resolve_flow_scalar_escape_sequence(
3296        &mut self,
3297        start_mark: &Marker,
3298    ) -> Result<char, ScanError> {
3299        let mut code_length = 0usize;
3300        let mut ret = '\0';
3301
3302        match self.input.peek_nth(1) {
3303            '0' => ret = '\0',
3304            'a' => ret = '\x07',
3305            'b' => ret = '\x08',
3306            't' | '\t' => ret = '\t',
3307            'n' => ret = '\n',
3308            'v' => ret = '\x0b',
3309            'f' => ret = '\x0c',
3310            'r' => ret = '\x0d',
3311            'e' => ret = '\x1b',
3312            ' ' => ret = '\x20',
3313            '"' => ret = '"',
3314            '/' => ret = '/',
3315            '\\' => ret = '\\',
3316            // Unicode next line (#x85)
3317            'N' => ret = char::from_u32(0x85).unwrap(),
3318            // Unicode non-breaking space (#xA0)
3319            '_' => ret = char::from_u32(0xA0).unwrap(),
3320            // Unicode line separator (#x2028)
3321            'L' => ret = char::from_u32(0x2028).unwrap(),
3322            // Unicode paragraph separator (#x2029)
3323            'P' => ret = char::from_u32(0x2029).unwrap(),
3324            'x' => code_length = 2,
3325            'u' => code_length = 4,
3326            'U' => code_length = 8,
3327            _ => {
3328                return Err(ScanError::new_str(
3329                    *start_mark,
3330                    "while parsing a quoted scalar, found unknown escape character",
3331                ))
3332            }
3333        }
3334        self.skip_n_non_blank(2);
3335
3336        // Consume an arbitrary escape code.
3337        if code_length > 0 {
3338            self.input.lookahead(code_length);
3339            let mut value = 0u32;
3340            for i in 0..code_length {
3341                let c = self.input.peek_nth(i);
3342                if !is_hex(c) {
3343                    return Err(ScanError::new_str(
3344                        *start_mark,
3345                        "while parsing a quoted scalar, did not find expected hexadecimal number",
3346                    ));
3347                }
3348                value = (value << 4) + as_hex(c);
3349            }
3350
3351            self.skip_n_non_blank(code_length);
3352
3353            // Handle JSON surrogate pairs: high surrogate followed by low surrogate
3354            if code_length == 4 && (0xD800..=0xDBFF).contains(&value) {
3355                self.input.lookahead(2);
3356                if self.input.peek() == '\\' && self.input.peek_nth(1) == 'u' {
3357                    self.skip_n_non_blank(2);
3358                    self.input.lookahead(4);
3359                    let mut low_value = 0u32;
3360                    for i in 0..4 {
3361                        let c = self.input.peek_nth(i);
3362                        if !is_hex(c) {
3363                            return Err(ScanError::new_str(
3364                                *start_mark,
3365                                "while parsing a quoted scalar, did not find expected hexadecimal number for low surrogate",
3366                            ));
3367                        }
3368                        low_value = (low_value << 4) + as_hex(c);
3369                    }
3370                    if (0xDC00..=0xDFFF).contains(&low_value) {
3371                        value = 0x10000 + (((value - 0xD800) << 10) | (low_value - 0xDC00));
3372                        self.skip_n_non_blank(4);
3373                    } else {
3374                        return Err(ScanError::new_str(
3375                            *start_mark,
3376                            "while parsing a quoted scalar, found invalid low surrogate",
3377                        ));
3378                    }
3379                } else {
3380                    return Err(ScanError::new_str(
3381                        *start_mark,
3382                        "while parsing a quoted scalar, found high surrogate without following low surrogate",
3383                    ));
3384                }
3385            } else if code_length == 4 && (0xDC00..=0xDFFF).contains(&value) {
3386                return Err(ScanError::new_str(
3387                    *start_mark,
3388                    "while parsing a quoted scalar, found unpaired low surrogate",
3389                ));
3390            }
3391
3392            let Some(ch) = char::from_u32(value) else {
3393                return Err(ScanError::new_str(
3394                    *start_mark,
3395                    "while parsing a quoted scalar, found invalid Unicode character escape code",
3396                ));
3397            };
3398            ret = ch;
3399        }
3400        Ok(ret)
3401    }
3402
3403    fn fetch_plain_scalar(&mut self) -> ScanResult {
3404        self.save_simple_key();
3405        self.disallow_simple_key();
3406
3407        let token_index = self.tokens.len();
3408        let tok = self.scan_plain_scalar()?;
3409
3410        self.insert_token(token_index, tok);
3411        Ok(())
3412    }
3413
    /// Scan for a plain scalar.
    ///
    /// Plain scalars are the most readable but restricted style. They may span multiple lines in
    /// some contexts.
    ///
    /// The text is accumulated in an owned buffer. A single line break between two content lines
    /// is folded into one space; additional line breaks are kept as `\n`. Blanks between words
    /// on the same line are preserved, trailing blanks before a line break are dropped.
    ///
    /// If both the start and end marks carry a byte offset and the input slice between them is
    /// equal to the accumulated text, the returned token borrows that slice instead of owning
    /// the buffer.
    ///
    /// # Errors
    /// Return an error if:
    ///   - inside a flow construct, the scalar starts before the expected indentation,
    ///   - inside a flow construct, a `-` is directly followed by a flow indicator,
    ///   - a tab is found in the indentation of a continuation line that is not empty,
    ///   - no character could be consumed at all.
    #[allow(clippy::too_many_lines)]
    fn scan_plain_scalar(&mut self) -> Result<Token<'input>, ScanError> {
        self.unroll_non_block_indents();
        // Minimum column a continuation line must reach in block context (see the indentation
        // check at the bottom of the main loop).
        let indent = self.indent + 1;
        let start_mark = self.mark;

        if self.flow_level > 0 && (start_mark.col as isize) < indent {
            return Err(ScanError::new_str(
                start_mark,
                "invalid indentation in flow construct",
            ));
        }

        let mut string = String::with_capacity(32);
        self.buf_whitespaces.clear();
        self.buf_leading_break.clear();
        self.buf_trailing_breaks.clear();
        // Position right after the last content character; trailing blanks and breaks that are
        // skipped afterwards do not move it.
        let mut end_mark = self.mark;

        // Each iteration consumes one run of content characters followed by one run of blanks
        // and line breaks.
        loop {
            self.input.lookahead(4);
            // A document indicator at column 0 or a `#` ends the scalar.
            if (self.mark.col == 0 && self.input.next_is_document_indicator())
                || self.input.peek() == '#'
            {
                // BS4K: If a `#` starts a comment after some separation spaces following content
                // of a plain scalar in block context, and there is potential continuation on the
                // next line, this is invalid. We cannot decide yet if there will be continuation,
                // so record that a comment interrupted a plain scalar.
                if self.input.peek() == '#'
                    && !string.is_empty()
                    && !self.buf_whitespaces.is_empty()
                    && self.flow_level == 0
                {
                    self.interrupted_plain_by_comment = Some(self.mark);
                }
                break;
            }

            if self.flow_level > 0 && self.input.peek() == '-' && is_flow(self.input.peek_nth(1)) {
                return Err(ScanError::new_str(
                    self.mark,
                    "plain scalar cannot start with '-' followed by ,[]{}",
                ));
            }

            if !self.input.next_is_blank_or_breakz()
                && self.input.next_can_be_plain_scalar(self.flow_level > 0)
            {
                // Flush the whitespace collected since the previous content run before adding
                // new content.
                if self.leading_whitespace {
                    // NOTE(review): in this branch `buf_leading_break` is empty, so pushing it
                    // is a no-op and only the trailing breaks contribute. Looks inherited from
                    // the original port; confirm before simplifying.
                    if self.buf_leading_break.is_empty() {
                        string.push_str(&self.buf_leading_break);
                        string.push_str(&self.buf_trailing_breaks);
                        self.buf_trailing_breaks.clear();
                        self.buf_leading_break.clear();
                    } else {
                        // Line folding: a lone line break becomes a space, otherwise only the
                        // additional line breaks are kept.
                        if self.buf_trailing_breaks.is_empty() {
                            string.push(' ');
                        } else {
                            string.push_str(&self.buf_trailing_breaks);
                            self.buf_trailing_breaks.clear();
                        }
                        self.buf_leading_break.clear();
                    }
                    self.leading_whitespace = false;
                } else if !self.buf_whitespaces.is_empty() {
                    // Blanks between two words of the same line are part of the content.
                    string.push_str(&self.buf_whitespaces);
                    self.buf_whitespaces.clear();
                }

                // We can unroll the first iteration of the loop.
                string.push(self.input.peek());
                self.skip_non_blank();
                string.reserve(self.input.bufmaxlen());

                // Add content non-blank characters to the scalar.
                let mut end = false;
                while !end {
                    // Fill the buffer once and let `fetch_plain_scalar_chunk` process the
                    // buffered characters until the next fetch. Note that
                    // `next_can_be_plain_scalar` needs 2 lookahead characters, hence the
                    // `self.input.bufmaxlen() - 1` passed to the chunk fetcher.
                    self.input.lookahead(self.input.bufmaxlen());
                    let (stop, chars_consumed) = self.input.fetch_plain_scalar_chunk(
                        &mut string,
                        self.input.bufmaxlen() - 1,
                        self.flow_level > 0,
                    );
                    end = stop;
                    // The chunk fetcher advanced the input on its own: bring the mark in sync.
                    self.mark.offsets.chars += chars_consumed;
                    self.mark.col += chars_consumed;
                    self.mark.offsets.bytes = self.input.byte_offset();
                }
                end_mark = self.mark;
            }

            // We may reach the end of a plain scalar if:
            //  - We reach eof
            //  - We reach ": "
            //  - We find a flow character in a flow context
            if !(self.input.next_is_blank() || self.input.next_is_break()) {
                break;
            }

            // Process blank characters.
            self.input.lookahead(2);
            while self.input.next_is_blank_or_break() {
                if self.input.next_is_blank() {
                    if !self.leading_whitespace {
                        // Blank following content on the same line: keep it, it is content if
                        // more text follows on this line.
                        self.buf_whitespaces.push(self.input.peek());
                        self.skip_blank();
                    } else if (self.mark.col as isize) < indent && self.input.peek() == '\t' {
                        // Tabs in indentation columns are allowed if and only if the line is
                        // empty. Skip to the end of the line.
                        self.skip_ws_to_eol(SkipTabs::Yes)?;
                        if !self.input.next_is_breakz() {
                            return Err(ScanError::new_str(
                                start_mark,
                                "while scanning a plain scalar, found a tab",
                            ));
                        }
                    } else {
                        // Blank at the start of a continuation line: never part of the content.
                        self.skip_blank();
                    }
                } else {
                    // Check if it is a first line break
                    if self.leading_whitespace {
                        self.skip_break();
                        self.buf_trailing_breaks.push('\n');
                    } else {
                        // Blanks right before the first line break are dropped.
                        self.buf_whitespaces.clear();
                        self.skip_break();
                        self.buf_leading_break.push('\n');
                        self.leading_whitespace = true;
                    }
                }
                self.input.lookahead(2);
            }

            // check indentation level
            if self.flow_level == 0 && (self.mark.col as isize) < indent {
                break;
            }
        }

        // The scalar ended after at least one line break: a simple key may start here.
        if self.leading_whitespace {
            self.allow_simple_key();
        }

        if string.is_empty() {
            // `fetch_plain_scalar` must absolutely consume at least one byte. Otherwise,
            // `fetch_next_token` will never stop calling it. An empty plain scalar may happen with
            // erroneous inputs such as "{...".
            Err(ScanError::new_str(
                start_mark,
                "unexpected end of plain scalar",
            ))
        } else {
            // Borrow from the input only when the raw slice is exactly the scalar's value, i.e.
            // when building `string` did not alter the text (no folding took place).
            let contents = if let (Some(start), Some(end)) =
                (start_mark.byte_offset(), end_mark.byte_offset())
            {
                match self.try_borrow_slice(start, end) {
                    Some(slice) if slice == string => Cow::Borrowed(slice),
                    _ => Cow::Owned(string),
                }
            } else {
                Cow::Owned(string)
            };

            Ok(Token(
                Span::new(start_mark, end_mark),
                TokenType::Scalar(ScalarStyle::Plain, contents),
            ))
        }
    }
3591
3592    fn fetch_key(&mut self) -> ScanResult {
3593        let start_mark = self.mark;
3594        if self.flow_level == 0 {
3595            // Check if we are allowed to start a new key (not necessarily simple).
3596            if !self.simple_key_allowed {
3597                return Err(ScanError::new_str(
3598                    self.mark,
3599                    "mapping keys are not allowed in this context",
3600                ));
3601            }
3602            self.roll_indent(
3603                start_mark.col,
3604                None,
3605                TokenType::BlockMappingStart,
3606                start_mark,
3607            );
3608        } else {
3609            // The scanner, upon emitting a `Key`, will prepend a `MappingStart` event.
3610            self.set_current_flow_mapping_started(true);
3611        }
3612
3613        self.remove_simple_key()?;
3614
3615        if self.flow_level == 0 {
3616            self.allow_simple_key();
3617        } else {
3618            self.disallow_simple_key();
3619        }
3620
3621        self.skip_non_blank();
3622        let token_index = self.tokens.len();
3623        self.skip_yaml_whitespace()?;
3624        if self.input.peek() == '\t' {
3625            return Err(ScanError::new_str(
3626                self.mark(),
3627                "tabs disallowed in this context",
3628            ));
3629        }
3630        self.insert_token(
3631            token_index,
3632            Token(Span::new(start_mark, self.mark), TokenType::Key),
3633        );
3634        Ok(())
3635    }
3636
3637    /// Fetch a value in a mapping inside of a flow collection.
3638    ///
3639    /// This must not be called if [`self.flow_level`] is 0. This ensures the rules surrounding
3640    /// values in flow collections are respected prior to calling [`fetch_value`].
3641    ///
3642    /// [`self.flow_level`]: Self::flow_level
3643    /// [`fetch_value`]: Self::fetch_value
3644    fn fetch_flow_value(&mut self) -> ScanResult {
3645        let nc = self.input.peek_nth(1);
3646
3647        // If we encounter a ':' inside a flow collection and it is not immediately
3648        // followed by a blank or breakz:
3649        //   - We must check whether an adjacent value is allowed
3650        //     `["a":[]]` is valid. If the key is double-quoted, no need for a space. This
3651        //     is needed for JSON compatibility.
3652        //   - If not, we must ensure there is a space after the ':' and before its value.
3653        //     `[a: []]` is valid while `[a:[]]` isn't. `[a:b]` is treated as `["a:b"]`.
3654        //   - But if the value is empty (null), then it's okay.
3655        // The last line is for YAMLs like `[a:]`. The ':' is followed by a ']' (which is a
3656        // flow character), but the ']' is not the value. The value is an invisible empty
3657        // space which is represented as null ('~').
3658        if self.mark.index() != self.adjacent_value_allowed_at && (nc == '[' || nc == '{') {
3659            return Err(ScanError::new_str(
3660                self.mark,
3661                "':' may not precede any of `[{` in flow mapping",
3662            ));
3663        }
3664
3665        self.fetch_value()
3666    }
3667
    /// Fetch a value from a mapping (after a `:`).
    ///
    /// The `:` at the cursor is consumed and a `Value` token is queued. If a simple key is
    /// pending, the `Key` token (and, where needed, a `FlowMappingStart` or `BlockMappingStart`
    /// token) is inserted retroactively at the position recorded for that key. Otherwise, the
    /// `:` follows a complex key and the mapping start tokens, if any, are emitted at the
    /// current position.
    ///
    /// # Errors
    /// Return an error if:
    ///   - the `:` is followed by tabs only (no valid YAML whitespace) and then by `-` or an
    ///     alphabetic character,
    ///   - in an implicit flow mapping, the simple key sits on an earlier line than the `:`,
    ///   - in block context, the `:` follows a complex key where no key is allowed.
    ///
    /// # Panics
    /// Panics if the simple key stack is empty.
    fn fetch_value(&mut self) -> ScanResult {
        // Snapshot of the innermost simple key candidate, taken before anything is modified.
        let sk = self.simple_keys.last().unwrap().clone();
        let start_mark = self.mark;
        // A `:` inside a flow sequence whose current entry has not started a mapping yet opens
        // an implicit single-pair mapping (`[a: b]`).
        let is_implicit_flow_mapping = self.current_flow_collection_is_sequence()
            && !self.current_flow_mapping_started()
            && !self.implicit_flow_mapping_states.is_empty();
        if is_implicit_flow_mapping {
            *self.implicit_flow_mapping_states.last_mut().unwrap() =
                ImplicitMappingState::Inside(self.flow_level);
        }

        // Skip over ':'.
        self.skip_non_blank();
        // Error detection: if ':' is followed by tab(s) without any space, and then what looks
        // like a value, emit a helpful error. The check for '-' or alphanumeric is an intentional
        // heuristic that catches common cases (e.g., `key:\tvalue`, `key:\t-item`) without
        // rejecting valid YAML like `key:\t|` (block scalar) or `key:\t"quoted"`.
        // Note: This heuristic won't catch Unicode value starters like `key:\täöü`, but such
        // cases will still fail to parse correctly (just with a less specific error message).
        let mut trailing_tokens = VecDeque::new();
        if self.input.look_ch() == '\t' {
            let trailing_token_index = self.tokens.len();
            let whitespace = self.skip_ws_to_eol(SkipTabs::Yes)?;
            // Tokens queued while skipping to the end of the line are set aside; they are
            // re-appended after the `Value` token at the end of this function.
            trailing_tokens = self.tokens.split_off(trailing_token_index);

            if !whitespace.has_valid_yaml_ws()
                && (self.input.peek() == '-' || self.input.next_is_alpha())
            {
                return Err(ScanError::new_str(
                    self.mark,
                    "':' must be followed by a valid YAML whitespace",
                ));
            }
        }

        if sk.possible {
            // insert simple key
            let tok = Token(Span::empty(sk.mark), TokenType::Key);
            // `token_number` counts from the start of the stream; subtracting the tokens already
            // handed out gives the index in the pending queue.
            self.insert_token(sk.token_number - self.tokens_parsed, tok);
            if is_implicit_flow_mapping {
                if sk.mark.line < start_mark.line {
                    return Err(ScanError::new_str(
                        start_mark,
                        "illegal placement of ':' indicator",
                    ));
                }
                // Inserted at the same index as the `Key` token, hence right before it.
                self.insert_token(
                    sk.token_number - self.tokens_parsed,
                    Token(Span::empty(sk.mark), TokenType::FlowMappingStart),
                );
            }

            // Add the BLOCK-MAPPING-START token if needed.
            self.roll_indent(
                sk.mark.col,
                Some(sk.token_number),
                TokenType::BlockMappingStart,
                sk.mark,
            );
            self.roll_one_col_indent();

            // The candidate has been consumed as a key.
            self.simple_keys.last_mut().unwrap().possible = false;
            self.disallow_simple_key();
        } else {
            if is_implicit_flow_mapping {
                self.tokens
                    .push_back(Token(Span::empty(start_mark), TokenType::FlowMappingStart));
            }
            // The ':' indicator follows a complex key.
            if self.flow_level == 0 {
                if !self.simple_key_allowed {
                    return Err(ScanError::new_str(
                        start_mark,
                        "mapping values are not allowed in this context",
                    ));
                }

                self.roll_indent(
                    start_mark.col,
                    None,
                    TokenType::BlockMappingStart,
                    start_mark,
                );
            }
            self.roll_one_col_indent();

            if self.flow_level == 0 {
                self.allow_simple_key();
            } else {
                self.disallow_simple_key();
            }
        }
        self.tokens
            .push_back(Token(Span::empty(start_mark), TokenType::Value));
        self.tokens.append(&mut trailing_tokens);

        Ok(())
    }
3767
3768    /// Add an indentation level to the stack with the given block token, if needed.
3769    ///
3770    /// An indentation level is added only if:
3771    ///   - We are not in a flow-style construct (which don't have indentation per-se).
3772    ///   - The current column is further indented than the last indent we have registered.
3773    fn roll_indent(
3774        &mut self,
3775        col: usize,
3776        number: Option<usize>,
3777        tok: TokenType<'input>,
3778        mark: Marker,
3779    ) {
3780        if self.flow_level > 0 {
3781            return;
3782        }
3783
3784        // If the last indent was a non-block indent, remove it.
3785        // This means that we prepared an indent that we thought we wouldn't use, but realized just
3786        // now that it is a block indent.
3787        if self.indent <= col as isize {
3788            if let Some(indent) = self.indents.last() {
3789                if !indent.needs_block_end {
3790                    self.indent = indent.indent;
3791                    self.indents.pop();
3792                }
3793            }
3794        }
3795
3796        if self.indent < col as isize {
3797            self.indents.push(Indent {
3798                indent: self.indent,
3799                needs_block_end: true,
3800            });
3801            self.indent = col as isize;
3802            let tokens_parsed = self.tokens_parsed;
3803            match number {
3804                Some(n) => self.insert_token(n - tokens_parsed, Token(Span::empty(mark), tok)),
3805                None => self.tokens.push_back(Token(Span::empty(mark), tok)),
3806            }
3807        }
3808    }
3809
3810    /// Pop indentation levels from the stack as much as needed.
3811    ///
3812    /// Indentation levels are popped from the stack while they are further indented than `col`.
3813    /// If we are in a flow-style construct (which don't have indentation per-se), this function
3814    /// does nothing.
3815    fn unroll_indent(&mut self, col: isize) {
3816        if self.flow_level > 0 {
3817            return;
3818        }
3819        while self.indent > col {
3820            let indent = self.indents.pop().unwrap();
3821            self.indent = indent.indent;
3822            if indent.needs_block_end {
3823                self.tokens
3824                    .push_back(Token(Span::empty(self.mark), TokenType::BlockEnd));
3825            }
3826        }
3827    }
3828
3829    /// Add an indentation level of 1 column that does not start a block.
3830    ///
3831    /// See the documentation of [`Indent::needs_block_end`] for more details.
3832    /// An indentation is not added if we are inside a flow level or if the last indent is already
3833    /// a non-block indent.
3834    fn roll_one_col_indent(&mut self) {
3835        if self.flow_level == 0 && self.indents.last().is_some_and(|x| x.needs_block_end) {
3836            self.indents.push(Indent {
3837                indent: self.indent,
3838                needs_block_end: false,
3839            });
3840            self.indent += 1;
3841        }
3842    }
3843
3844    /// Unroll all last indents created with [`Self::roll_one_col_indent`].
3845    fn unroll_non_block_indents(&mut self) {
3846        while let Some(indent) = self.indents.last() {
3847            if indent.needs_block_end {
3848                break;
3849            }
3850            self.indent = indent.indent;
3851            self.indents.pop();
3852        }
3853    }
3854
3855    /// Mark the next token to be inserted as a potential simple key.
3856    fn save_simple_key(&mut self) {
3857        if self.simple_key_allowed {
3858            let required = self.flow_level == 0
3859                && self.indent == (self.mark.col as isize)
3860                && self.indents.last().unwrap().needs_block_end;
3861
3862            if let Some(last) = self.simple_keys.last_mut() {
3863                *last = SimpleKey {
3864                    mark: self.mark,
3865                    possible: true,
3866                    required,
3867                    token_number: self.tokens_parsed + self.tokens.len(),
3868                };
3869            }
3870        }
3871    }
3872
3873    fn remove_simple_key(&mut self) -> ScanResult {
3874        let last = self.simple_keys.last_mut().unwrap();
3875        if last.possible && last.required {
3876            return Err(self.simple_key_expected());
3877        }
3878
3879        last.possible = false;
3880        Ok(())
3881    }
3882
    /// Return whether the scanner is inside a block but outside of a flow sequence.
    ///
    /// Concretely, this reports whether the indent stack (`self.indents`) is non-empty.
    // NOTE(review): `flow_level` is not consulted here, so the "outside of a flow sequence" part
    // presumably relies on how callers use this helper — confirm against call sites.
    fn is_within_block(&self) -> bool {
        !self.indents.is_empty()
    }
3887
3888    /// If an implicit mapping had started, end it.
3889    ///
3890    /// This function does not pop the state in [`implicit_flow_mapping_states`].
3891    ///
3892    /// [`implicit_flow_mapping_states`]: Self::implicit_flow_mapping_states
3893    fn end_implicit_mapping(&mut self, mark: Marker, flow_level: u8) {
3894        if self
3895            .implicit_flow_mapping_states
3896            .last()
3897            .is_some_and(|state| *state == ImplicitMappingState::Inside(flow_level))
3898        {
3899            *self.implicit_flow_mapping_states.last_mut().unwrap() = ImplicitMappingState::Possible;
3900            self.set_current_flow_mapping_started(false);
3901            self.tokens
3902                .push_back(Token(Span::empty(mark), TokenType::FlowMappingEnd));
3903        }
3904    }
3905
3906    fn current_flow_collection_is_sequence(&self) -> bool {
3907        self.flow_markers
3908            .last()
3909            .is_some_and(|(_, bracket)| *bracket == '[')
3910    }
3911
3912    fn current_flow_mapping_started(&self) -> bool {
3913        self.flow_mapping_started.last().copied().unwrap_or(false)
3914    }
3915
3916    fn set_current_flow_mapping_started(&mut self, started: bool) {
3917        if let Some(current) = self.flow_mapping_started.last_mut() {
3918            *current = started;
3919        }
3920    }
3921}
3922
/// Chomping, how final line breaks and trailing empty lines are interpreted.
///
/// See YAML spec 8.1.1.2.
///
/// Like the other small public enums of this module, this is a plain `Copy` value that can be
/// printed with `{:?}`.
#[derive(Clone, Copy, PartialEq, Debug, Eq)]
pub enum Chomping {
    /// The final line break and any trailing empty lines are excluded.
    ///
    /// Corresponds to the `-` chomping indicator of the YAML spec.
    Strip,
    /// The final line break is preserved, but trailing empty lines are excluded.
    ///
    /// This is the behavior the YAML spec prescribes when no chomping indicator is given.
    Clip,
    /// The final line break and trailing empty lines are included.
    ///
    /// Corresponds to the `+` chomping indicator of the YAML spec.
    Keep,
}
3935
3936#[cfg(test)]
3937mod test {
3938    use alloc::{
3939        borrow::{Cow, ToOwned},
3940        rc::Rc,
3941        string::String,
3942        vec::Vec,
3943    };
3944    use core::cell::Cell;
3945
3946    use crate::{
3947        input::{str::StrInput, BorrowedInput, BufferedInput, Input},
3948        scanner::{Scanner, Token, TokenType},
3949    };
3950
3951    struct CountingChars {
3952        chars: alloc::vec::IntoIter<char>,
3953        read: Rc<Cell<usize>>,
3954    }
3955
3956    impl Iterator for CountingChars {
3957        type Item = char;
3958
3959        fn next(&mut self) -> Option<Self::Item> {
3960            let next = self.chars.next();
3961            if next.is_some() {
3962                self.read.set(self.read.get() + 1);
3963            }
3964            next
3965        }
3966    }
3967
    /// Test input wrapping a [`StrInput`] that never lends slices with the input lifetime.
    ///
    /// Every [`Input`] method is forwarded to the wrapped input, except `slice_bytes` which can
    /// be switched off. `slice_borrowed` always returns `None`.
    struct SlicingOnlyInput<'input> {
        /// The wrapped input every call is forwarded to.
        inner: StrInput<'input>,
        /// Whether `slice_bytes` forwards to `inner` (`true`) or always returns `None`.
        expose_slice: bool,
    }

    impl<'input> SlicingOnlyInput<'input> {
        /// Wrap `source`; `expose_slice` selects whether `slice_bytes` is functional.
        fn new(source: &'input str, expose_slice: bool) -> Self {
            Self {
                inner: StrInput::new(source),
                expose_slice,
            }
        }
    }

    // Plain delegation to `inner`, with the exception of `slice_bytes`.
    impl Input for SlicingOnlyInput<'_> {
        fn lookahead(&mut self, count: usize) {
            self.inner.lookahead(count);
        }

        fn buflen(&self) -> usize {
            self.inner.buflen()
        }

        fn bufmaxlen(&self) -> usize {
            self.inner.bufmaxlen()
        }

        fn raw_read_ch(&mut self) -> char {
            self.inner.raw_read_ch()
        }

        fn raw_read_non_breakz_ch(&mut self) -> Option<char> {
            self.inner.raw_read_non_breakz_ch()
        }

        fn skip(&mut self) {
            self.inner.skip();
        }

        fn skip_n(&mut self, count: usize) {
            self.inner.skip_n(count);
        }

        fn peek(&self) -> char {
            self.inner.peek()
        }

        fn peek_nth(&self, n: usize) -> char {
            self.inner.peek_nth(n)
        }

        // Byte offsets are always reported, even when slicing is switched off.
        fn byte_offset(&self) -> Option<usize> {
            self.inner.byte_offset()
        }

        /// Forward to `inner` when `expose_slice` is set, return `None` otherwise.
        fn slice_bytes(&self, start: usize, end: usize) -> Option<&str> {
            if self.expose_slice {
                self.inner.slice_bytes(start, end)
            } else {
                None
            }
        }
    }

    impl<'input> BorrowedInput<'input> for SlicingOnlyInput<'input> {
        /// Never lend a slice living as long as the input.
        fn slice_borrowed(&self, _start: usize, _end: usize) -> Option<&'input str> {
            None
        }
    }
4037
4038    #[test]
4039    fn test_is_anchor_char() {
4040        use super::is_anchor_char;
4041        assert!(is_anchor_char('x'));
4042    }
4043
4044    #[test]
4045    fn flow_simple_key_length_limit_bounds_buffering() {
4046        let mut yaml = String::from("[\n\"start\"\n");
4047        for _ in 0..600 {
4048            yaml.push_str("\"x\"\n");
4049        }
4050        let total_chars = yaml.chars().count();
4051        let read = Rc::new(Cell::new(0));
4052        let chars = yaml.chars().collect::<Vec<_>>().into_iter();
4053        let mut scanner = Scanner::new(BufferedInput::new(CountingChars {
4054            chars,
4055            read: Rc::clone(&read),
4056        }));
4057
4058        assert!(matches!(
4059            scanner.next_token().unwrap().unwrap().1,
4060            TokenType::StreamStart(_)
4061        ));
4062
4063        let token = scanner.next_token().unwrap().unwrap();
4064        assert!(matches!(token.1, TokenType::FlowSequenceStart));
4065
4066        let token = scanner.next_token().unwrap().unwrap();
4067        assert!(matches!(
4068            token.1,
4069            TokenType::Scalar(_, ref value) if value == "start"
4070        ));
4071        assert!(
4072            read.get() < total_chars,
4073            "scanner consumed all {total_chars} chars before yielding the first flow scalar"
4074        );
4075        assert!(
4076            read.get() <= super::SIMPLE_KEY_MAX_LOOKAHEAD + 128,
4077            "scanner read {} chars before yielding the first flow scalar",
4078            read.get()
4079        );
4080    }
4081
4082    #[test]
4083    fn comment_capture_does_not_change_leading_whitespace() {
4084        let mut scanner = Scanner::new(StrInput::new("# comment\n"));
4085
4086        let token = scanner.scan_comment_token().unwrap();
4087
4088        assert!(scanner.leading_whitespace);
4089        assert!(matches!(token.1, TokenType::Comment(ref comment) if comment.text == " comment"));
4090
4091        let mut scanner = Scanner::new(BufferedInput::new("# streaming\n".chars()));
4092        scanner.input.lookahead(1);
4093
4094        let token = scanner.scan_comment_token().unwrap();
4095
4096        assert!(scanner.leading_whitespace);
4097        assert!(matches!(token.1, TokenType::Comment(ref comment) if comment.text == " streaming"));
4098    }
4099
4100    #[test]
4101    fn comment_capture_falls_back_to_owned_slice_when_borrow_unavailable() {
4102        let mut scanner = Scanner::new(SlicingOnlyInput::new("# sliced\n", true));
4103        scanner.input.lookahead(2);
4104        assert_eq!(scanner.input.peek_nth(1), ' ');
4105
4106        let token = scanner.scan_comment_token().unwrap();
4107
4108        assert!(matches!(token.1, TokenType::Comment(ref comment)
4109            if matches!(comment.text, Cow::Owned(ref text) if text == " sliced")));
4110    }
4111
4112    #[test]
4113    fn comment_capture_errors_when_offsets_have_no_slice() {
4114        let mut scanner = Scanner::new(SlicingOnlyInput::new("# broken\n", false));
4115
4116        let error = scanner.scan_comment_token().unwrap_err();
4117
4118        assert_eq!(
4119            error.info(),
4120            "internal error: input advertised offsets but did not provide a slice"
4121        );
4122    }
4123
4124    #[test]
4125    fn comment_skipping_path_consumes_comment_without_tokenizing_it() {
4126        let mut scanner = Scanner::new(StrInput::new("# skipped\nnext: value\n"));
4127
4128        scanner.skip_yaml_whitespace().unwrap();
4129
4130        assert!(scanner.tokens.is_empty());
4131        assert_eq!(scanner.mark.line(), 2);
4132        assert_eq!(scanner.mark.col(), 0);
4133    }
4134
4135    #[test]
4136    fn deferred_error_waits_for_all_comment_tokens() {
4137        let mut scanner = Scanner::new(StrInput::new("# first\n# second\n@\n"));
4138
4139        assert!(matches!(
4140            scanner.next_token().unwrap().unwrap().1,
4141            TokenType::StreamStart(_)
4142        ));
4143        assert!(matches!(
4144            scanner.next_token().unwrap().unwrap().1,
4145            TokenType::Comment(ref comment) if comment.text == " first"
4146        ));
4147        assert!(matches!(
4148            scanner.next_token().unwrap().unwrap().1,
4149            TokenType::Comment(ref comment) if comment.text == " second"
4150        ));
4151
4152        let error = scanner.next_token().unwrap_err();
4153
4154        assert!(error.info().contains("unexpected character"));
4155    }
4156
4157    /// Ensure anchors scanned from `StrInput` are returned as `Cow::Borrowed`.
4158    #[test]
4159    fn anchor_name_is_borrowed_for_str_input() {
4160        let mut scanner = Scanner::new(StrInput::new("&anch\n"));
4161
4162        loop {
4163            let tok = scanner
4164                .next_token()
4165                .expect("valid YAML must scan without errors")
4166                .expect("scanner must eventually produce a token");
4167            if let TokenType::Anchor(name) = tok.1 {
4168                assert!(matches!(name, Cow::Borrowed("anch")));
4169                break;
4170            }
4171        }
4172    }
4173
4174    /// Ensure aliases scanned from `StrInput` are returned as `Cow::Borrowed`.
4175    #[test]
4176    fn anchor_name_rejects_non_printable_control_chars() {
4177        let mut scanner = Scanner::new(StrInput::new("&foo\u{0001}\n"));
4178
4179        loop {
4180            let tok = scanner
4181                .next_token()
4182                .expect("scanning should not fail")
4183                .expect("scanner must eventually produce a token");
4184            if let TokenType::Anchor(name) = tok.1 {
4185                assert!(matches!(name, Cow::Borrowed("foo")));
4186                let next = scanner.next_token().expect("scanning should not fail");
4187                if let Some(Token(_, TokenType::Scalar(_, rest))) = next {
4188                    assert!(rest.starts_with('\u{0001}'));
4189                }
4190                break;
4191            }
4192        }
4193    }
4194
4195    #[test]
4196    fn alias_name_rejects_non_printable_control_chars() {
4197        let mut scanner = Scanner::new(StrInput::new("*foo\u{0001}\n"));
4198
4199        loop {
4200            let tok = scanner
4201                .next_token()
4202                .expect("scanning should not fail")
4203                .expect("scanner must eventually produce a token");
4204            if let TokenType::Alias(name) = tok.1 {
4205                assert!(matches!(name, Cow::Borrowed("foo")));
4206                let next = scanner.next_token().expect("scanning should not fail");
4207                if let Some(Token(_, TokenType::Scalar(_, rest))) = next {
4208                    assert!(rest.starts_with('\u{0001}'));
4209                }
4210                break;
4211            }
4212        }
4213    }
4214
4215    #[test]
4216    fn alias_name_is_borrowed_for_str_input() {
4217        let mut scanner = Scanner::new(StrInput::new("*anch\n"));
4218
4219        loop {
4220            let tok = scanner
4221                .next_token()
4222                .expect("valid YAML must scan without errors")
4223                .expect("scanner must eventually produce a token");
4224            if let TokenType::Alias(name) = tok.1 {
4225                assert!(matches!(name, Cow::Borrowed("anch")));
4226                break;
4227            }
4228        }
4229    }
4230
4231    /// Ensure `%TAG` directive handle and prefix are borrowed when they are verbatim (no escapes).
4232    #[test]
4233    fn tag_directive_parts_are_borrowed_for_str_input() {
4234        let mut scanner = Scanner::new(StrInput::new("%TAG !e! tag:example.com,2000:app/\n"));
4235
4236        loop {
4237            let tok = scanner
4238                .next_token()
4239                .expect("valid YAML must scan without errors")
4240                .expect("scanner must eventually produce a token");
4241            if let TokenType::TagDirective(handle, prefix) = tok.1 {
4242                assert!(matches!(handle, Cow::Borrowed("!e!")));
4243                assert!(matches!(prefix, Cow::Borrowed("tag:example.com,2000:app/")));
4244                break;
4245            }
4246        }
4247    }
4248
4249    #[test]
4250    fn plain_scalar_is_borrowed_when_whitespace_free_for_str_input() {
4251        let mut scanner = Scanner::new(StrInput::new("foo\n"));
4252
4253        loop {
4254            let tok = scanner
4255                .next_token()
4256                .expect("valid YAML must scan without errors")
4257                .expect("scanner must eventually produce a token");
4258            if let TokenType::Scalar(_, value) = tok.1 {
4259                assert!(matches!(value, Cow::Borrowed("foo")));
4260                break;
4261            }
4262        }
4263    }
4264
4265    #[test]
4266    fn plain_scalar_is_borrowed_when_whitespace_present_for_str_input() {
4267        let mut scanner = Scanner::new(StrInput::new("foo bar\n"));
4268
4269        loop {
4270            let tok = scanner
4271                .next_token()
4272                .expect("valid YAML must scan without errors")
4273                .expect("scanner must eventually produce a token");
4274            if let TokenType::Scalar(_, value) = tok.1 {
4275                assert!(matches!(value, Cow::Borrowed("foo bar")));
4276                break;
4277            }
4278        }
4279    }
4280
4281    #[test]
4282    fn single_quoted_scalar_is_borrowed_when_verbatim_for_str_input() {
4283        let mut scanner = Scanner::new(StrInput::new("'foo bar'\n"));
4284
4285        loop {
4286            let tok = scanner
4287                .next_token()
4288                .expect("valid YAML must scan without errors")
4289                .expect("scanner must eventually produce a token");
4290            if let TokenType::Scalar(_, value) = tok.1 {
4291                assert!(matches!(value, Cow::Borrowed("foo bar")));
4292                break;
4293            }
4294        }
4295    }
4296
4297    #[test]
4298    fn single_quoted_scalar_is_owned_when_quote_is_escaped_for_str_input() {
4299        let mut scanner = Scanner::new(StrInput::new("'foo''bar'\n"));
4300
4301        loop {
4302            let tok = scanner
4303                .next_token()
4304                .expect("valid YAML must scan without errors")
4305                .expect("scanner must eventually produce a token");
4306            if let TokenType::Scalar(_, value) = tok.1 {
4307                assert!(matches!(value, Cow::Owned(_)));
4308                assert_eq!(&*value, "foo'bar");
4309                break;
4310            }
4311        }
4312    }
4313
4314    #[test]
4315    fn double_quoted_scalar_is_borrowed_when_verbatim_for_str_input() {
4316        let mut scanner = Scanner::new(StrInput::new("\"foo bar\"\n"));
4317
4318        loop {
4319            let tok = scanner
4320                .next_token()
4321                .expect("valid YAML must scan without errors")
4322                .expect("scanner must eventually produce a token");
4323            if let TokenType::Scalar(_, value) = tok.1 {
4324                assert!(matches!(value, Cow::Borrowed("foo bar")));
4325                break;
4326            }
4327        }
4328    }
4329
4330    #[test]
4331    fn double_quoted_scalar_is_owned_when_escape_sequence_present_for_str_input() {
4332        let mut scanner = Scanner::new(StrInput::new("\"foo\\nbar\"\n"));
4333
4334        loop {
4335            let tok = scanner
4336                .next_token()
4337                .expect("valid YAML must scan without errors")
4338                .expect("scanner must eventually produce a token");
4339            if let TokenType::Scalar(_, value) = tok.1 {
4340                assert!(matches!(value, Cow::Owned(_)));
4341                assert_eq!(&*value, "foo\nbar");
4342                break;
4343            }
4344        }
4345    }
4346
4347    #[test]
4348    fn plain_key_is_borrowed_for_str_input() {
4349        // Keys are just scalars in a key position; they should also be borrowed.
4350        let mut scanner = Scanner::new(StrInput::new("mykey: value\n"));
4351
4352        let mut found_key = false;
4353        let mut key_value: Option<Cow<'_, str>> = None;
4354
4355        loop {
4356            let tok = scanner
4357                .next_token()
4358                .expect("valid YAML must scan without errors");
4359            let Some(tok) = tok else { break };
4360
4361            if matches!(tok.1, TokenType::Key) {
4362                found_key = true;
4363            } else if found_key {
4364                if let TokenType::Scalar(_, value) = tok.1 {
4365                    key_value = Some(value);
4366                    break;
4367                }
4368            }
4369        }
4370
4371        assert!(found_key, "expected to find a Key token");
4372        let key_value = key_value.expect("expected to find a scalar after Key token");
4373        assert!(
4374            matches!(key_value, Cow::Borrowed("mykey")),
4375            "key should be borrowed, got: {key_value:?}"
4376        );
4377    }
4378
4379    #[test]
4380    fn quoted_key_is_borrowed_when_verbatim_for_str_input() {
4381        let mut scanner = Scanner::new(StrInput::new("\"mykey\": value\n"));
4382
4383        let mut found_key = false;
4384        let mut key_value: Option<Cow<'_, str>> = None;
4385
4386        loop {
4387            let tok = scanner
4388                .next_token()
4389                .expect("valid YAML must scan without errors");
4390            let Some(tok) = tok else { break };
4391
4392            if matches!(tok.1, TokenType::Key) {
4393                found_key = true;
4394            } else if found_key {
4395                if let TokenType::Scalar(_, value) = tok.1 {
4396                    key_value = Some(value);
4397                    break;
4398                }
4399            }
4400        }
4401
4402        assert!(found_key, "expected to find a Key token");
4403        let key_value = key_value.expect("expected to find a scalar after Key token");
4404        assert!(
4405            matches!(key_value, Cow::Borrowed("mykey")),
4406            "quoted key should be borrowed when verbatim, got: {key_value:?}"
4407        );
4408    }
4409
4410    #[test]
4411    fn tag_handle_and_suffix_are_borrowed_for_str_input() {
4412        // Test a tag like !!str which should have handle="!!" and suffix="str"
4413        let mut scanner = Scanner::new(StrInput::new("!!str foo\n"));
4414
4415        loop {
4416            let tok = scanner
4417                .next_token()
4418                .expect("valid YAML must scan without errors")
4419                .expect("scanner must eventually produce a token");
4420            if let TokenType::Tag(handle, suffix) = tok.1 {
4421                assert!(
4422                    matches!(handle, Cow::Borrowed("!!")),
4423                    "tag handle should be borrowed, got: {handle:?}"
4424                );
4425                assert!(
4426                    matches!(suffix, Cow::Borrowed("str")),
4427                    "tag suffix should be borrowed, got: {suffix:?}"
4428                );
4429                break;
4430            }
4431        }
4432    }
4433
4434    #[test]
4435    fn local_tag_suffix_is_borrowed_for_str_input() {
4436        // Test a local tag like !mytag which should have handle="!" and suffix="mytag"
4437        let mut scanner = Scanner::new(StrInput::new("!mytag foo\n"));
4438
4439        loop {
4440            let tok = scanner
4441                .next_token()
4442                .expect("valid YAML must scan without errors")
4443                .expect("scanner must eventually produce a token");
4444            if let TokenType::Tag(handle, suffix) = tok.1 {
4445                assert!(
4446                    matches!(handle, Cow::Borrowed("!")),
4447                    "local tag handle should be '!', got: {handle:?}"
4448                );
4449                assert!(
4450                    matches!(suffix, Cow::Borrowed("mytag")),
4451                    "local tag suffix should be borrowed, got: {suffix:?}"
4452                );
4453                break;
4454            }
4455        }
4456    }
4457
4458    #[test]
4459    fn tag_with_uri_escape_is_owned_for_str_input() {
4460        // Test a tag with URI escape like !my%20tag - suffix must be owned due to decoding
4461        let mut scanner = Scanner::new(StrInput::new("!!my%20tag foo\n"));
4462
4463        loop {
4464            let tok = scanner
4465                .next_token()
4466                .expect("valid YAML must scan without errors")
4467                .expect("scanner must eventually produce a token");
4468            if let TokenType::Tag(handle, suffix) = tok.1 {
4469                assert!(
4470                    matches!(handle, Cow::Borrowed("!!")),
4471                    "tag handle should still be borrowed, got: {handle:?}"
4472                );
4473                assert!(
4474                    matches!(suffix, Cow::Owned(_)),
4475                    "tag suffix with URI escape should be owned, got: {suffix:?}"
4476                );
4477                assert_eq!(&*suffix, "my tag");
4478                break;
4479            }
4480        }
4481    }
4482
4483    #[test]
4484    fn flow_scalar_buffer_tracks_pending_whitespace() {
4485        let mut borrowed = super::FlowScalarBuf::new_borrowed(2);
4486
4487        borrowed.note_pending_ws(5, 8);
4488        borrowed.commit_pending_ws();
4489        assert!(matches!(
4490            borrowed,
4491            super::FlowScalarBuf::Borrowed {
4492                end: 8,
4493                pending_ws_start: None,
4494                pending_ws_end: 8,
4495                ..
4496            }
4497        ));
4498
4499        borrowed.note_pending_ws(9, 11);
4500        borrowed.discard_pending_ws();
4501        assert!(matches!(
4502            borrowed,
4503            super::FlowScalarBuf::Borrowed {
4504                end: 8,
4505                pending_ws_start: None,
4506                pending_ws_end: 8,
4507                ..
4508            }
4509        ));
4510        assert!(borrowed.as_owned_mut().is_none());
4511
4512        let mut owned = super::FlowScalarBuf::new_owned();
4513        owned.as_owned_mut().unwrap().push_str("owned");
4514        assert!(matches!(owned, super::FlowScalarBuf::Owned(ref s) if s == "owned"));
4515    }
4516
4517    fn first_scanner_error_info(input: &str) -> String {
4518        let mut scanner = Scanner::new(StrInput::new(input));
4519        loop {
4520            match scanner.next_token() {
4521                Ok(Some(_)) => {}
4522                Ok(None) => panic!("expected scanner error"),
4523                Err(error) => return error.info().to_owned(),
4524            }
4525        }
4526    }
4527
4528    fn first_scalar_value(input: &str) -> String {
4529        let mut scanner = Scanner::new(StrInput::new(input));
4530        loop {
4531            match scanner.next_token().expect("scanner should not error") {
4532                Some(Token(_, TokenType::Scalar(_, value))) => return value.into_owned(),
4533                Some(_) => {}
4534                None => panic!("expected scalar token"),
4535            }
4536        }
4537    }
4538
4539    #[test]
4540    fn iterator_next_records_error_and_then_stays_empty() {
4541        let mut scanner = Scanner::new(StrInput::new("\"unterminated"));
4542
4543        while scanner.next().is_some() {}
4544
4545        let error = scanner
4546            .get_error()
4547            .expect("scanner should retain the error");
4548        assert_eq!(error.info(), "unclosed quote");
4549        assert!(scanner.next().is_none());
4550    }
4551
4552    #[test]
4553    fn next_token_returns_none_after_stream_end() {
4554        let mut scanner = Scanner::new(StrInput::new(""));
4555
4556        while let Some(token) = scanner.next_token().unwrap() {
4557            if matches!(token.1, TokenType::StreamEnd) {
4558                break;
4559            }
4560        }
4561
4562        assert!(scanner.stream_started());
4563        assert!(scanner.stream_ended());
4564        assert!(scanner.next_token().unwrap().is_none());
4565    }
4566
4567    #[test]
4568    fn directive_name_must_be_present() {
4569        assert_eq!(
4570            first_scanner_error_info("%\n"),
4571            "while scanning a directive, could not find expected directive name"
4572        );
4573    }
4574
4575    #[test]
4576    fn yaml_directive_requires_dot_between_version_numbers() {
4577        assert_eq!(
4578            first_scanner_error_info("%YAML 1\n"),
4579            "while scanning a YAML directive, did not find expected digit or '.' character"
4580        );
4581    }
4582
4583    #[test]
4584    fn yaml_directive_requires_major_version_number() {
4585        assert_eq!(
4586            first_scanner_error_info("%YAML .2\n"),
4587            "while scanning a YAML directive, did not find expected version number"
4588        );
4589    }
4590
4591    #[test]
4592    fn yaml_directive_rejects_extremely_long_version_number() {
4593        assert_eq!(
4594            first_scanner_error_info("%YAML 1234567890.2\n"),
4595            "while scanning a YAML directive, found extremely long version number"
4596        );
4597    }
4598
4599    #[test]
4600    fn tag_directive_handle_must_end_with_bang() {
4601        assert_eq!(
4602            first_scanner_error_info("%TAG !bad tag:example.com,2024:\n"),
4603            "while parsing a tag directive, did not find expected '!'"
4604        );
4605    }
4606
4607    #[test]
4608    fn tag_directive_handle_must_start_with_bang() {
4609        assert_eq!(
4610            first_scanner_error_info("%TAG bad! tag:example.com,2024:\n"),
4611            "while scanning a tag, did not find expected '!'"
4612        );
4613    }
4614
4615    #[test]
4616    fn tag_directive_prefix_must_start_with_tag_character() {
4617        assert_eq!(
4618            first_scanner_error_info("%TAG !e! `bad\n"),
4619            "invalid global tag character"
4620        );
4621    }
4622
4623    #[test]
4624    fn tag_directive_prefix_must_end_before_invalid_content() {
4625        assert_eq!(
4626            first_scanner_error_info("%TAG !e! tag:example.com^suffix\n"),
4627            "while scanning TAG, did not find expected whitespace or line break"
4628        );
4629    }
4630
4631    #[test]
4632    fn tag_directive_prefix_with_uri_escape_is_owned_and_decoded() {
4633        let mut scanner =
4634            Scanner::new(StrInput::new("%TAG !e! tag:example.com,2024:some%20app/\n"));
4635
4636        loop {
4637            let token = scanner
4638                .next_token()
4639                .expect("valid directive should scan")
4640                .expect("scanner must produce a directive token");
4641            if let TokenType::TagDirective(handle, prefix) = token.1 {
4642                assert!(matches!(handle, Cow::Borrowed("!e!")));
4643                assert!(matches!(prefix, Cow::Owned(_)));
4644                assert_eq!(&*prefix, "tag:example.com,2024:some app/");
4645                break;
4646            }
4647        }
4648    }
4649
4650    #[test]
4651    fn bare_bang_tag_scans_as_non_specific_tag() {
4652        let mut scanner = Scanner::new(StrInput::new("! foo\n"));
4653
4654        loop {
4655            let token = scanner
4656                .next_token()
4657                .expect("valid tag should scan")
4658                .expect("scanner must produce a tag token");
4659            if let TokenType::Tag(handle, suffix) = token.1 {
4660                assert_eq!(&*handle, "");
4661                assert_eq!(&*suffix, "!");
4662                break;
4663            }
4664        }
4665    }
4666
4667    #[test]
4668    fn tag_requires_separation_after_suffix() {
4669        assert_eq!(
4670            first_scanner_error_info("!foo,bar\n"),
4671            "while scanning a tag, did not find expected whitespace or line break"
4672        );
4673    }
4674
4675    #[test]
4676    fn verbatim_tag_requires_uri() {
4677        assert_eq!(
4678            first_scanner_error_info("!<> foo\n"),
4679            "while parsing a tag, did not find expected tag URI"
4680        );
4681    }
4682
4683    #[test]
4684    fn verbatim_tag_requires_closing_angle_bracket() {
4685        assert_eq!(
4686            first_scanner_error_info("!<tag:yaml.org,2002:str foo\n"),
4687            "while scanning a verbatim tag, did not find the expected '>'"
4688        );
4689    }
4690
4691    #[test]
4692    fn tag_uri_escape_requires_hex_digits() {
4693        assert_eq!(
4694            first_scanner_error_info("!!bad%zz foo\n"),
4695            "while parsing a tag, found an invalid escape sequence"
4696        );
4697    }
4698
4699    #[test]
4700    fn tag_uri_escape_rejects_bad_leading_utf8_byte() {
4701        assert_eq!(
4702            first_scanner_error_info("!!bad%80 foo\n"),
4703            "while parsing a tag, found an incorrect leading UTF-8 byte"
4704        );
4705    }
4706
4707    #[test]
4708    fn tag_uri_escape_rejects_bad_trailing_utf8_byte() {
4709        assert_eq!(
4710            first_scanner_error_info("!!bad%C2%41 foo\n"),
4711            "while parsing a tag, found an incorrect trailing UTF-8 byte"
4712        );
4713    }
4714
4715    #[test]
4716    fn tag_uri_escape_rejects_invalid_utf8_codepoint() {
4717        assert_eq!(
4718            first_scanner_error_info("!!bad%F4%90%80%80 foo\n"),
4719            "while parsing a tag, found an invalid UTF-8 codepoint"
4720        );
4721    }
4722
4723    #[test]
4724    fn anchors_and_aliases_require_names() {
4725        let expected =
4726            "while scanning an anchor or alias, did not find expected alphabetic or numeric character";
4727
4728        assert_eq!(first_scanner_error_info("& \n"), expected);
4729        assert_eq!(first_scanner_error_info("* \n"), expected);
4730    }
4731
4732    #[test]
4733    fn document_end_marker_rejects_trailing_content() {
4734        assert_eq!(
4735            first_scanner_error_info("... trailing\n"),
4736            "invalid content after document end marker"
4737        );
4738    }
4739
4740    #[test]
4741    fn reserved_indicators_are_rejected_outside_directives() {
4742        assert_eq!(
4743            first_scanner_error_info(" @\n"),
4744            "unexpected character: `@'"
4745        );
4746    }
4747
4748    #[test]
4749    fn flow_block_entry_indicator_is_rejected() {
4750        assert_eq!(
4751            first_scanner_error_info("[- ]\n"),
4752            r#""-" is only valid inside a block"#
4753        );
4754    }
4755
4756    #[test]
4757    fn block_entry_after_tabbed_separator_reports_specific_error() {
4758        assert_eq!(
4759            first_scanner_error_info("-\t- value\n"),
4760            "'-' must be followed by a valid YAML whitespace"
4761        );
4762    }
4763
4764    #[test]
4765    fn document_indicator_reports_unclosed_flow_collection() {
4766        assert_eq!(first_scanner_error_info("[\n---\n"), "unclosed bracket '['");
4767    }
4768
4769    #[test]
4770    fn block_scalar_header_rejects_trailing_content() {
4771        assert_eq!(
4772            first_scanner_error_info("|+ trailing\n"),
4773            "while scanning a block scalar, did not find expected comment or line break"
4774        );
4775    }
4776
4777    #[test]
4778    fn block_scalar_rejects_zero_indent_indicator() {
4779        let expected = "while scanning a block scalar, found an indentation indicator equal to 0";
4780
4781        assert_eq!(first_scanner_error_info("|0\n"), expected);
4782        assert_eq!(first_scanner_error_info("|+0\n"), expected);
4783    }
4784
4785    #[test]
4786    fn empty_block_scalar_at_eof_honors_chomping() {
4787        assert_eq!(first_scalar_value("|-\n"), "");
4788        assert_eq!(first_scalar_value("|+\n"), "\n");
4789    }
4790
4791    #[test]
4792    fn explicit_indent_block_scalar_can_end_at_document_marker() {
4793        assert_eq!(first_scalar_value("|1\n...\n"), "");
4794    }
4795
4796    #[test]
4797    fn root_explicit_indent_block_scalar_rejects_underindented_content() {
4798        assert_eq!(
4799            first_scanner_error_info("|2\nx\n"),
4800            "wrongly indented line in block scalar"
4801        );
4802    }
4803
4804    #[test]
4805    fn quoted_scalar_rejects_document_indicator_at_line_start() {
4806        assert_eq!(
4807            first_scanner_error_info("\"one\n---\ntwo\"\n"),
4808            "while scanning a quoted scalar, found unexpected document indicator"
4809        );
4810    }
4811
4812    #[test]
4813    fn quoted_scalar_rejects_tab_indentation_after_line_break() {
4814        assert_eq!(
4815            first_scanner_error_info("a: \"one\n\tbad\"\n"),
4816            "tab cannot be used as indentation"
4817        );
4818    }
4819
4820    #[test]
4821    fn quoted_scalar_rejects_underindented_continuation() {
4822        assert_eq!(
4823            first_scanner_error_info("a: \"one\nbad\"\n"),
4824            "invalid indentation in multiline quoted scalar"
4825        );
4826    }
4827
4828    #[test]
4829    fn indented_flow_scalar_reports_invalid_indentation() {
4830        assert_eq!(
4831            first_scanner_error_info("a:\n  [\nfoo]\n"),
4832            "invalid indentation"
4833        );
4834    }
4835
4836    #[test]
4837    fn required_simple_key_requires_value_at_stream_end() {
4838        assert_eq!(
4839            first_scanner_error_info("a:\n&b\n- c\n"),
4840            "simple key expect ':'"
4841        );
4842    }
4843
4844    #[test]
4845    fn plain_scalar_rejects_dash_before_flow_indicator() {
4846        assert_eq!(
4847            first_scanner_error_info("[-]\n"),
4848            "plain scalar cannot start with '-' followed by ,[]{}"
4849        );
4850    }
4851
4852    #[test]
4853    fn explicit_key_rejects_tab_after_indicator() {
4854        assert_eq!(
4855            first_scanner_error_info("? \tfoo\n"),
4856            "tabs disallowed in this context"
4857        );
4858    }
4859
4860    #[test]
4861    fn flow_mapping_rejects_adjacent_collection_value_after_plain_key() {
4862        assert_eq!(
4863            first_scanner_error_info("[a:[]]\n"),
4864            "':' may not precede any of `[{` in flow mapping"
4865        );
4866    }
4867
4868    #[test]
4869    fn implicit_flow_mapping_colon_cannot_move_to_next_line() {
4870        assert_eq!(
4871            first_scanner_error_info("[foo\n: bar]\n"),
4872            "illegal placement of ':' indicator"
4873        );
4874    }
4875}
granit_parser/scanner.rs

granit_parser/
scanner.rs