Skip to main content

granit_parser/
scanner.rs

1//! Home to the YAML Scanner.
2//!
3//! The scanner is the lowest-level parsing utility. It is the lexer / tokenizer, reading input a
4//! character at a time and emitting tokens that can later be interpreted by the [`crate::parser`]
5//! to check for more context and validity.
6//!
7//! Due to the grammar of YAML, the scanner has to have some context and is not error-free.
8
9#![allow(clippy::cast_possible_wrap)]
10#![allow(clippy::cast_sign_loss)]
11
12use alloc::{
13    borrow::{Cow, ToOwned},
14    collections::VecDeque,
15    string::String,
16    vec::Vec,
17};
18use core::{char, fmt};
19
20use crate::{
21    char_traits::{
22        as_hex, is_anchor_char, is_blank_or_breakz, is_break, is_breakz, is_flow, is_hex,
23        is_tag_char, is_uri_char,
24    },
25    input::{BorrowedInput, SkipTabs},
26};
27
/// Maximum number of characters the scanner may look ahead while disambiguating a simple key.
///
/// NOTE(review): presumably mirrors the 1024-character limit the YAML specification places on
/// implicit keys — confirm against the code that consumes this constant.
const SIMPLE_KEY_MAX_LOOKAHEAD: usize = 1024;
30
/// Character encoding of the scanned input.
///
/// UTF-8 is the only encoding currently supported.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TEncoding {
    /// The input is UTF-8 encoded.
    Utf8,
}
37
/// How a scalar was spelled in the YAML document.
///
/// The variants are declared in a fixed order; the derived ordering follows that declaration
/// order.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum ScalarStyle {
    /// A plain (unquoted) scalar.
    Plain,
    /// A scalar enclosed in single quotes.
    SingleQuoted,
    /// A scalar enclosed in double quotes.
    DoubleQuoted,

    /// A literal block scalar (introduced by `|`).
    ///
    /// See [8.1.2](https://yaml.org/spec/1.2.2/#812-literal-style).
    /// Every indented character of a literal block is content, white space included.
    /// Characters cannot be escaped and long lines cannot be broken.
    Literal,
    /// A folded block scalar (introduced by `>`).
    ///
    /// See [8.1.3](https://yaml.org/spec/1.2.2/#813-folded-style).
    /// Every indented character of a folded block is content, white space included.
    /// Characters cannot be escaped, but the content undergoes line folding, which makes it
    /// possible to break long lines.
    Folded,
}
62
/// The offsets carried by a [`Marker`].
///
/// Input may be a complete `&str` (stable backing storage) or a streaming character source.
/// A stable input lets us record both a character index and a byte offset. With a streaming
/// input a byte offset is generally not useful (it may not map to any meaningful underlying
/// file/source), which is why it is optional.
#[derive(Debug, Default, Clone, Copy)]
pub struct MarkerOffsets {
    /// Position in the source, counted in characters.
    chars: usize,
    /// Position in the source, counted in bytes, when the input can provide one.
    bytes: Option<usize>,
}

impl PartialEq for MarkerOffsets {
    /// Compare two offsets by character position only.
    ///
    /// The byte offset is an optional diagnostic aid that may differ from one input backend to
    /// another (e.g., `&str` vs streaming), so it deliberately takes no part in equality.
    fn eq(&self, other: &Self) -> bool {
        let Self { chars: lhs, .. } = *self;
        let Self { chars: rhs, .. } = *other;
        lhs == rhs
    }
}

impl Eq for MarkerOffsets {}
87
88/// A location in a yaml document.
89#[derive(Clone, Copy, PartialEq, Debug, Eq, Default)]
90pub struct Marker {
91    /// Offsets in the source.
92    offsets: MarkerOffsets,
93    /// The line (1-indexed).
94    line: usize,
95    /// The column (0-indexed).
96    col: usize,
97}
98
99impl Marker {
100    /// Create a new [`Marker`] at the given position.
101    #[must_use]
102    pub fn new(index: usize, line: usize, col: usize) -> Marker {
103        Marker {
104            offsets: MarkerOffsets {
105                chars: index,
106                bytes: None,
107            },
108            line,
109            col,
110        }
111    }
112
113    /// Return a copy of the marker with the given optional byte offset.
114    #[must_use]
115    pub fn with_byte_offset(mut self, byte_offset: Option<usize>) -> Marker {
116        self.offsets.bytes = byte_offset;
117        self
118    }
119
120    /// Return the index (in characters) of the marker in the source.
121    #[must_use]
122    pub fn index(&self) -> usize {
123        self.offsets.chars
124    }
125
126    /// Return the byte offset of the marker in the source, if available.
127    #[must_use]
128    pub fn byte_offset(&self) -> Option<usize> {
129        self.offsets.bytes
130    }
131
132    /// Return the line of the marker in the source.
133    #[must_use]
134    pub fn line(&self) -> usize {
135        self.line
136    }
137
138    /// Return the column of the marker in the source.
139    #[must_use]
140    pub fn col(&self) -> usize {
141        self.col
142    }
143}
144
145/// A range of locations in a Yaml document.
146#[derive(Clone, Copy, PartialEq, Debug, Eq, Default)]
147pub struct Span {
148    /// The start (inclusive) of the range.
149    pub start: Marker,
150    /// The end (exclusive) of the range.
151    pub end: Marker,
152
153    /// Optional indentation hint associated with this span.
154    ///
155    /// This is only meaningful for certain parser-emitted events (notably: block mapping keys).
156    /// When indentation is not meaningful or cannot be provided, it must be `None`.
157    pub indent: Option<usize>,
158}
159
160impl Span {
161    /// Create a new [`Span`] for the given range.
162    #[must_use]
163    pub fn new(start: Marker, end: Marker) -> Span {
164        Span {
165            start,
166            end,
167            indent: None,
168        }
169    }
170
171    /// Create a empty [`Span`] at a given location.
172    ///
173    /// An empty span doesn't contain any characters, but its position may still be meaningful.
174    /// For example, for an indented sequence [`SequenceEnd`] has a location but an empty span.
175    ///
176    /// [`SequenceEnd`]: crate::Event::SequenceEnd
177    #[must_use]
178    pub fn empty(mark: Marker) -> Span {
179        Span {
180            start: mark,
181            end: mark,
182            indent: None,
183        }
184    }
185
186    /// Return a copy of this [`Span`] with the given indentation hint.
187    #[must_use]
188    pub fn with_indent(mut self, indent: Option<usize>) -> Span {
189        self.indent = indent;
190        self
191    }
192
193    /// Return the length of the span (in characters).
194    #[must_use]
195    pub fn len(&self) -> usize {
196        self.end.index() - self.start.index()
197    }
198
199    /// Return whether the [`Span`] has a length of zero.
200    #[must_use]
201    pub fn is_empty(&self) -> bool {
202        self.len() == 0
203    }
204
205    /// Return the byte range of the span, if available.
206    #[must_use]
207    pub fn byte_range(&self) -> Option<core::ops::Range<usize>> {
208        let start = self.start.byte_offset()?;
209        let end = self.end.byte_offset()?;
210        Some(start..end)
211    }
212}
213
214/// An error that occurred while scanning.
215#[derive(Clone, PartialEq, Debug, Eq)]
216pub struct ScanError {
217    /// The position at which the error happened in the source.
218    mark: Marker,
219    /// Human-readable details about the error.
220    info: String,
221}
222
223impl ScanError {
224    /// Create a new error from a location and an error string.
225    #[must_use]
226    #[cold]
227    pub fn new(loc: Marker, info: String) -> ScanError {
228        ScanError { mark: loc, info }
229    }
230
231    /// Convenience alias for string slices.
232    #[must_use]
233    #[cold]
234    pub fn new_str(loc: Marker, info: &str) -> ScanError {
235        ScanError {
236            mark: loc,
237            info: info.to_owned(),
238        }
239    }
240
241    /// Return the marker pointing to the error in the source.
242    #[must_use]
243    pub fn marker(&self) -> &Marker {
244        &self.mark
245    }
246
247    /// Return the information string describing the error that happened.
248    #[must_use]
249    pub fn info(&self) -> &str {
250        self.info.as_ref()
251    }
252}
253
254impl fmt::Display for ScanError {
255    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
256        write!(
257            f,
258            "{} at char {} line {} column {}",
259            self.info,
260            self.mark.index(),
261            self.mark.line(),
262            self.mark.col() + 1
263        )
264    }
265}
266
267impl core::error::Error for ScanError {}
268
269/// The contents of a scanner token.
270#[derive(Clone, PartialEq, Debug, Eq)]
271pub enum TokenType<'input> {
272    /// The start of the stream. Sent first, before even [`TokenType::DocumentStart`].
273    StreamStart(TEncoding),
274    /// The end of the stream, EOF.
275    StreamEnd,
276    /// A YAML version directive.
277    VersionDirective(
278        /// Major
279        u32,
280        /// Minor
281        u32,
282    ),
283    /// A YAML tag directive (e.g.: `!!str`, `!foo!bar`, ...).
284    TagDirective(
285        /// Handle
286        Cow<'input, str>,
287        /// Prefix
288        Cow<'input, str>,
289    ),
290    /// The start of a YAML document (`---`).
291    DocumentStart,
292    /// The end of a YAML document (`...`).
293    DocumentEnd,
294    /// The start of a sequence block.
295    ///
296    /// Sequence blocks are arrays starting with a `-`.
297    BlockSequenceStart,
298    /// The start of a sequence mapping.
299    ///
300    /// Sequence mappings are "dictionaries" with "key: value" entries.
301    BlockMappingStart,
302    /// End of the corresponding `BlockSequenceStart` or `BlockMappingStart`.
303    BlockEnd,
304    /// Start of an inline sequence (`[ a, b ]`).
305    FlowSequenceStart,
306    /// End of an inline sequence.
307    FlowSequenceEnd,
308    /// Start of an inline mapping (`{ a: b, c: d }`).
309    FlowMappingStart,
310    /// End of an inline mapping.
311    FlowMappingEnd,
312    /// An entry in a block sequence (c.f.: [`TokenType::BlockSequenceStart`]).
313    BlockEntry,
314    /// An entry in a flow sequence (c.f.: [`TokenType::FlowSequenceStart`]).
315    FlowEntry,
316    /// A key in a mapping.
317    Key,
318    /// A value in a mapping.
319    Value,
320    /// A reference to an anchor.
321    Alias(Cow<'input, str>),
322    /// A YAML anchor (`&`/`*`).
323    Anchor(Cow<'input, str>),
324    /// A YAML tag (starting with bangs `!`).
325    Tag(
326        /// The handle of the tag.
327        Cow<'input, str>,
328        /// The suffix of the tag.
329        Cow<'input, str>,
330    ),
331    /// A regular YAML scalar.
332    Scalar(ScalarStyle, Cow<'input, str>),
333    /// A reserved YAML directive.
334    ReservedDirective(
335        /// Name
336        String,
337        /// Parameters
338        Vec<String>,
339    ),
340}
341
342/// A scanner token.
343#[derive(Clone, PartialEq, Debug, Eq)]
344pub struct Token<'input>(pub Span, pub TokenType<'input>);
345
346/// A scalar that was parsed and may correspond to a simple key.
347///
348/// Upon scanning the following yaml:
349/// ```yaml
350/// a: b
351/// ```
352/// We do not know that `a` is a key for a map until we have reached the following `:`. For this
353/// YAML, we would store `a` as a scalar token in the [`Scanner`], but not emit it yet. It would be
354/// kept inside the scanner until more context is fetched and we are able to know whether it is a
355/// plain scalar or a key.
356///
357/// For example, see the following 2 yaml documents:
358/// ```yaml
359/// ---
360/// a: b # Here, `a` is a key.
361/// ...
362/// ---
363/// a # Here, `a` is a plain scalar.
364/// ...
365/// ```
366/// An instance of [`SimpleKey`] is created in the [`Scanner`] when such ambiguity occurs.
367///
368/// In both documents, scanning `a` would lead to the creation of a [`SimpleKey`] with
369/// [`Self::possible`] set to `true`. The token for `a` would be pushed in the [`Scanner`] but not
370/// yet emitted. Instead, more context would be fetched (through [`Scanner::fetch_more_tokens`]).
371///
372/// In the first document, upon reaching the `:`, the [`SimpleKey`] would be inspected and our
373/// scalar `a` since it is a possible key, would be "turned" into a key. This is done by prepending
374/// a [`TokenType::Key`] to our scalar token in the [`Scanner`]. This way, the
375/// [`crate::parser::Parser`] would read the [`TokenType::Key`] token before the
376/// [`TokenType::Scalar`] token.
377///
378/// In the second document however, reaching the EOF would stale the [`SimpleKey`] and no
379/// [`TokenType::Key`] would be emitted by the scanner.
380#[derive(Clone, PartialEq, Debug, Eq)]
381struct SimpleKey {
382    /// Whether the token this [`SimpleKey`] refers to may still be a key.
383    ///
384    /// Sometimes, when we have more context, we notice that what we thought could be a key no
385    /// longer can be. In that case, [`Self::possible`] is set to `false`.
386    ///
387    /// For instance, let us consider the following invalid YAML:
388    /// ```yaml
389    /// key
390    ///   : value
391    /// ```
392    /// Upon reading the `\n` after `key`, the [`SimpleKey`] that was created for `key` is staled
393    /// and [`Self::possible`] set to `false`.
394    possible: bool,
395    /// Whether the token this [`SimpleKey`] refers to is required to be a key.
396    ///
397    /// With more context, we may know for sure that the token must be a key. If the YAML is
398    /// invalid, it may happen that the token be deemed not a key. In such event, an error has to
399    /// be raised. This boolean helps us know when to raise such error.
400    ///
401    /// TODO(ethiraric, 30/12/2023): Example of when this happens.
402    required: bool,
403    /// The index of the token referred to by the [`SimpleKey`].
404    ///
405    /// This is the index in the scanner, which takes into account both the tokens that have been
406    /// emitted and those about to be emitted. See [`Scanner::tokens_parsed`] and
407    /// [`Scanner::tokens`] for more details.
408    token_number: usize,
409    /// The position at which the token the [`SimpleKey`] refers to is.
410    mark: Marker,
411}
412
413impl SimpleKey {
414    /// Create a new [`SimpleKey`] at the given `Marker` and with the given flow level.
415    fn new(mark: Marker) -> SimpleKey {
416        SimpleKey {
417            possible: false,
418            required: false,
419            token_number: 0,
420            mark,
421        }
422    }
423}
424
/// One entry of the indentation stack.
#[derive(Debug, Default, Clone)]
struct Indent {
    /// The former indentation level.
    indent: isize,
    /// Whether closing this indentation level produces a `BlockEnd` token.
    ///
    /// Some indentation levels do not start a block. For instance:
    /// ```yaml
    /// -
    ///   foo # ok
    /// -
    /// bar # ko, bar needs to be indented further than the `-`.
    /// - [
    ///  baz, # ok
    /// quux # ko, quux needs to be indented further than the '-'.
    /// ] # ko, the closing bracket needs to be indented further than the `-`.
    /// ```
    ///
    /// The `-` creates an indentation level that covers a single entry of the sequence. If a
    /// `BlockEnd` were emitted each time such a level ends, there would be one `BlockEnd` per
    /// entry, whereas exactly one is needed to end the sequence.
    needs_block_end: bool,
}
449
/// What we know about an implicit mapping.
///
/// An implicit mapping is a mapping inside a flow sequence whose opening `{` has been left out:
/// ```yaml
/// [ a: b, c: d ]
/// # Equivalent to
/// [ { a: b }, { c: d } ]
/// # Equivalent to
/// - a: b
/// - c: d
/// ```
///
/// This state has to be tracked with care for each nested flow sequence: in the example above, a
/// [`FlowMappingStart`] event must be emitted upon meeting `a` and `c` even though no character
/// hints at it, and a [`FlowMappingEnd`] event must be emitted upon reaching the `,` or the `]`.
/// Tracking the state incorrectly may lead to these events being left out or emitted
/// out-of-order.
///
/// [`FlowMappingStart`]: TokenType::FlowMappingStart
/// [`FlowMappingEnd`]: TokenType::FlowMappingEnd
#[derive(PartialEq, Debug)]
enum ImplicitMappingState {
    /// An implicit mapping may follow.
    ///
    /// This is the state right after the opening `[` has been met. More context is required to
    /// tell whether an implicit mapping follows.
    Possible,
    /// We are inside the implicit mapping.
    ///
    /// This state is not set right away: the `:` must have been met for us to know.
    ///
    /// NOTE(review): the meaning of the `u8` payload is not documented here — presumably a flow
    /// nesting level; confirm against the code that sets it.
    Inside(u8),
}
483
484/// The YAML scanner.
485///
486/// This corresponds to the low-level interface when reading YAML. The scanner emits token as they
487/// are read (akin to a lexer), but it also holds sufficient context to be able to disambiguate
488/// some of the constructs. It has understanding of indentation and whitespace and is able to
489/// generate error messages for some invalid YAML constructs.
490///
491/// It is however not a full parser and needs [`crate::parser::Parser`] to fully detect invalid
492/// YAML documents.
493#[derive(Debug)]
494#[allow(clippy::struct_excessive_bools)]
495pub struct Scanner<'input, T> {
496    /// The input source.
497    ///
498    /// This must implement [`Input`].
499    input: T,
500    /// The position of the cursor within the reader.
501    mark: Marker,
502    /// Buffer for tokens to be returned.
503    ///
504    /// This buffer can hold some temporary tokens that are not yet ready to be returned. For
505    /// instance, if we just read a scalar, it can be a value or a key if an implicit mapping
506    /// follows. In this case, the token stays in the `VecDeque` but cannot be returned from
507    /// [`Self::next`] until we have more context.
508    tokens: VecDeque<Token<'input>>,
509    /// The last error that happened.
510    error: Option<ScanError>,
511
512    /// Whether we have already emitted the `StreamStart` token.
513    stream_start_produced: bool,
514    /// Whether we have already emitted the `StreamEnd` token.
515    stream_end_produced: bool,
516    /// In some flow contexts, the value of a mapping is allowed to be adjacent to the `:`. When it
517    /// is, the index at which the `:` may be must be stored in `adjacent_value_allowed_at`.
518    adjacent_value_allowed_at: usize,
519    /// Whether a simple key could potentially start at the current position.
520    ///
521    /// Simple keys are the opposite of complex keys which are keys starting with `?`.
522    simple_key_allowed: bool,
523    /// A stack of potential simple keys.
524    ///
525    /// Refer to the documentation of [`SimpleKey`] for a more in-depth explanation of what they
526    /// are.
527    simple_keys: smallvec::SmallVec<[SimpleKey; 8]>,
528    /// The current indentation level.
529    indent: isize,
530    /// List of all block indentation levels we are in (except the current one).
531    indents: smallvec::SmallVec<[Indent; 8]>,
532    /// Level of nesting of flow sequences.
533    flow_level: u8,
534    /// The number of tokens that have been returned from the scanner.
535    ///
536    /// This excludes the tokens from [`Self::tokens`].
537    tokens_parsed: usize,
538    /// Whether a token is ready to be taken from [`Self::tokens`].
539    token_available: bool,
540    /// Whether all characters encountered since the last newline were whitespace.
541    leading_whitespace: bool,
542    /// Whether we started a flow mapping at each flow nesting level.
543    ///
544    /// This is used to detect implicit flow mapping starts such as:
545    /// ```yaml
546    /// [ : foo ] # { null: "foo" }
547    /// ```
548    flow_mapping_started: smallvec::SmallVec<[bool; 8]>,
549    /// An array of states, representing whether flow sequences have implicit mappings.
550    ///
551    /// When a flow mapping is possible (when encountering the first `[` or a `,` in a sequence),
552    /// the state is set to [`Possible`].
553    /// When we encounter the `:`, we know we are in an implicit mapping and can set the state to
554    /// [`Inside`].
555    ///
556    /// There is one entry in this [`Vec`] for each nested flow sequence that we are in.
557    /// The entries are created with the opening `]` and popped with the closing `]`.
558    ///
559    /// [`Possible`]: ImplicitMappingState::Possible
560    /// [`Inside`]: ImplicitMappingState::Inside
561    implicit_flow_mapping_states: smallvec::SmallVec<[ImplicitMappingState; 8]>,
562    /// If a plain scalar was terminated by a `#` comment on its line, we set this
563    /// to detect an illegal multiline continuation on the following line.
564    interrupted_plain_by_comment: Option<Marker>,
565    /// A stack of markers for opening brackets `[` and `{`.
566    flow_markers: smallvec::SmallVec<[(Marker, char); 8]>,
567    buf_leading_break: String,
568    buf_trailing_breaks: String,
569    buf_whitespaces: String,
570}
571
572impl<'input, T: BorrowedInput<'input>> Iterator for Scanner<'input, T> {
573    type Item = Token<'input>;
574
575    fn next(&mut self) -> Option<Self::Item> {
576        if self.error.is_some() {
577            return None;
578        }
579        match self.next_token() {
580            Ok(Some(tok)) => {
581                debug_print!(
582                    "    \x1B[;32m\u{21B3} {:?} \x1B[;36m{:?}\x1B[;m",
583                    tok.1,
584                    tok.0
585                );
586                Some(tok)
587            }
588            Ok(tok) => tok,
589            Err(e) => {
590                self.error = Some(e);
591                None
592            }
593        }
594    }
595}
596
/// A convenience alias for scanner functions that may fail without returning a value.
///
/// On failure, the [`ScanError`] describes what went wrong and where.
pub type ScanResult = Result<(), ScanError>;
599
/// Accumulator for the text of a flow scalar, either as offsets into the input or as an owned
/// string.
#[derive(Debug)]
enum FlowScalarBuf {
    /// Candidate for `Cow::Borrowed`.
    ///
    /// `start..end` is the range that has been committed verbatim.
    /// `pending_ws_start..pending_ws_end` is a run of blanks that has been seen but not
    /// committed yet (it must be dropped if a line break follows).
    Borrowed {
        start: usize,
        end: usize,
        pending_ws_start: Option<usize>,
        pending_ws_end: usize,
    },
    /// The text is accumulated in an owned `String`.
    Owned(String),
}

impl FlowScalarBuf {
    /// Create a borrowed buffer whose committed range is the empty range at `start`, with no
    /// pending whitespace.
    #[inline]
    fn new_borrowed(start: usize) -> Self {
        let (end, pending_ws_end) = (start, start);
        Self::Borrowed {
            start,
            end,
            pending_ws_start: None,
            pending_ws_end,
        }
    }

    /// Create an owned buffer holding an empty string.
    #[inline]
    fn new_owned() -> Self {
        Self::Owned(String::new())
    }

    /// Give mutable access to the owned string, or `None` if the buffer is borrowed.
    #[inline]
    fn as_owned_mut(&mut self) -> Option<&mut String> {
        if let Self::Owned(text) = self {
            Some(text)
        } else {
            None
        }
    }

    /// Extend the committed range over the pending whitespace run, if there is one.
    ///
    /// Does nothing on an owned buffer.
    #[inline]
    fn commit_pending_ws(&mut self) {
        let Self::Borrowed {
            end,
            pending_ws_start,
            pending_ws_end,
            ..
        } = self
        else {
            return;
        };
        // `take` clears the pending marker and tells us whether there was one.
        if pending_ws_start.take().is_some() {
            *end = *pending_ws_end;
        }
    }

    /// Record that the blanks in `ws_start..ws_end` were seen without committing them.
    ///
    /// The start of the pending run is only set if no run is pending; its end is always moved
    /// to `ws_end`. Does nothing on an owned buffer.
    #[inline]
    fn note_pending_ws(&mut self, ws_start: usize, ws_end: usize) {
        let Self::Borrowed {
            pending_ws_start,
            pending_ws_end,
            ..
        } = self
        else {
            return;
        };
        *pending_ws_start = pending_ws_start.or(Some(ws_start));
        *pending_ws_end = ws_end;
    }

    /// Forget the pending whitespace run, leaving the committed range untouched.
    ///
    /// Does nothing on an owned buffer.
    #[inline]
    fn discard_pending_ws(&mut self) {
        match self {
            Self::Borrowed {
                end,
                pending_ws_start,
                pending_ws_end,
                ..
            } => {
                *pending_ws_end = *end;
                *pending_ws_start = None;
            }
            Self::Owned(_) => {}
        }
    }
}
685
686impl<'input, T: BorrowedInput<'input>> Scanner<'input, T> {
687    #[inline]
688    fn promote_flow_scalar_buf_to_owned(
689        &self,
690        start_mark: &Marker,
691        buf: &mut FlowScalarBuf,
692    ) -> Result<(), ScanError> {
693        let FlowScalarBuf::Borrowed {
694            start,
695            end,
696            pending_ws_start: _,
697            pending_ws_end: _,
698        } = *buf
699        else {
700            return Ok(());
701        };
702
703        let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
704            ScanError::new_str(
705                *start_mark,
706                "internal error: input advertised offsets but did not provide a slice",
707            )
708        })?;
709        *buf = FlowScalarBuf::Owned(slice.to_owned());
710        Ok(())
711    }
    /// Try to borrow a slice from the underlying input.
    ///
    /// This method uses the [`BorrowedInput`] trait to safely obtain a slice with the `'input`
    /// lifetime. For inputs that support zero-copy slicing (like `StrInput`), this returns
    /// `Some(&'input str)`. For streaming inputs, this returns `None`.
    ///
    /// `start` and `end` are forwarded unchanged to the input's `slice_borrowed`; callers in this
    /// file pass values obtained from the input's `byte_offset()`.
    #[inline]
    fn try_borrow_slice(&self, start: usize, end: usize) -> Option<&'input str> {
        self.input.slice_borrowed(start, end)
    }
721
722    /// Scan a tag handle for a `%TAG` directive as a `Cow<str>`.
723    ///
724    /// For `StrInput`, this will borrow from the input when possible. For other inputs, or if
725    /// borrowing is not possible, it falls back to allocating.
726    fn scan_tag_handle_directive_cow(
727        &mut self,
728        mark: &Marker,
729    ) -> Result<Cow<'input, str>, ScanError> {
730        let Some(start) = self.input.byte_offset() else {
731            return Ok(Cow::Owned(self.scan_tag_handle(true, mark)?));
732        };
733
734        if self.input.look_ch() != '!' {
735            return Err(ScanError::new_str(
736                *mark,
737                "while scanning a tag, did not find expected '!'",
738            ));
739        }
740
741        // Consume the leading '!'.
742        self.skip_non_blank();
743
744        // Consume ns-word-char (ASCII alphanumeric, '_' or '-') characters.
745        // This mirrors `StrInput::fetch_while_is_alpha` but avoids allocation.
746        self.input.lookahead(1);
747        while self.input.next_is_alpha() {
748            self.skip_non_blank();
749            self.input.lookahead(1);
750        }
751
752        // Optional trailing '!'.
753        if self.input.peek() == '!' {
754            self.skip_non_blank();
755        }
756
757        let Some(end) = self.input.byte_offset() else {
758            // Should be impossible if `byte_offset()` was `Some` above, but keep safe fallback.
759            return Ok(Cow::Owned(self.scan_tag_handle(true, mark)?));
760        };
761
762        let Some(slice) = self.try_borrow_slice(start, end) else {
763            // Fall back to allocating if zero-copy borrow is not available.
764            let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
765                ScanError::new_str(
766                    *mark,
767                    "internal error: input advertised slicing but did not provide a slice",
768                )
769            })?;
770            if !slice.ends_with('!') && slice != "!" {
771                return Err(ScanError::new_str(
772                    *mark,
773                    "while parsing a tag directive, did not find expected '!'",
774                ));
775            }
776            return Ok(Cow::Owned(slice.to_owned()));
777        };
778
779        if !slice.ends_with('!') && slice != "!" {
780            return Err(ScanError::new_str(
781                *mark,
782                "while parsing a tag directive, did not find expected '!'",
783            ));
784        }
785
786        Ok(Cow::Borrowed(slice))
787    }
788
789    /// Scan a tag prefix for a `%TAG` directive as a `Cow<str>`.
790    ///
791    /// This borrows from `StrInput` only when no URI escape sequences are encountered. If a `%`
792    /// escape is present, the prefix must be decoded and therefore allocated.
793    fn scan_tag_prefix_directive_cow(
794        &mut self,
795        start_mark: &Marker,
796    ) -> Result<Cow<'input, str>, ScanError> {
797        let Some(start) = self.input.byte_offset() else {
798            return Ok(Cow::Owned(self.scan_tag_prefix(start_mark)?));
799        };
800
801        // The prefix must start with either '!' (local) or a valid global tag char.
802        if self.input.look_ch() == '!' {
803            self.skip_non_blank();
804        } else if !is_tag_char(self.input.peek()) {
805            return Err(ScanError::new_str(
806                *start_mark,
807                "invalid global tag character",
808            ));
809        } else if self.input.peek() == '%' {
810            // Needs decoding. Fall back to allocating path below.
811        } else {
812            self.skip_non_blank();
813        }
814
815        // Consume URI chars while we can stay in the borrowed path.
816        while is_uri_char(self.input.look_ch()) {
817            if self.input.peek() == '%' {
818                break;
819            }
820            self.skip_non_blank();
821        }
822
823        // If we encountered an escape sequence, we must decode, therefore allocate.
824        if self.input.peek() == '%' {
825            let current = self
826                .input
827                .byte_offset()
828                .expect("byte_offset() must remain available once enabled");
829            let mut out = if let Some(slice) = self.input.slice_bytes(start, current) {
830                slice.to_owned()
831            } else {
832                String::new()
833            };
834
835            while is_uri_char(self.input.look_ch()) {
836                if self.input.peek() == '%' {
837                    out.push(self.scan_uri_escapes(start_mark)?);
838                } else {
839                    out.push(self.input.peek());
840                    self.skip_non_blank();
841                }
842            }
843            return Ok(Cow::Owned(out));
844        }
845
846        let Some(end) = self.input.byte_offset() else {
847            return Ok(Cow::Owned(self.scan_tag_prefix(start_mark)?));
848        };
849
850        let Some(slice) = self.try_borrow_slice(start, end) else {
851            // Fall back to allocating if zero-copy borrow is not available.
852            let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
853                ScanError::new_str(
854                    *start_mark,
855                    "internal error: input advertised slicing but did not provide a slice",
856                )
857            })?;
858            return Ok(Cow::Owned(slice.to_owned()));
859        };
860
861        Ok(Cow::Borrowed(slice))
862    }
863    /// Creates the YAML tokenizer.
864    pub fn new(input: T) -> Self {
865        let initial_byte_offset = input.byte_offset();
866        Scanner {
867            input,
868            mark: Marker::new(0, 1, 0).with_byte_offset(initial_byte_offset),
869            tokens: VecDeque::with_capacity(64),
870            error: None,
871
872            stream_start_produced: false,
873            stream_end_produced: false,
874            adjacent_value_allowed_at: 0,
875            simple_key_allowed: true,
876            simple_keys: smallvec::SmallVec::new(),
877            indent: -1,
878            indents: smallvec::SmallVec::new(),
879            flow_level: 0,
880            tokens_parsed: 0,
881            token_available: false,
882            leading_whitespace: true,
883            flow_mapping_started: smallvec::SmallVec::new(),
884            implicit_flow_mapping_states: smallvec::SmallVec::new(),
885            flow_markers: smallvec::SmallVec::new(),
886            interrupted_plain_by_comment: None,
887
888            buf_leading_break: String::with_capacity(128),
889            buf_trailing_breaks: String::with_capacity(128),
890            buf_whitespaces: String::with_capacity(128),
891        }
892    }
893
894    /// Get a copy of the last error that was encountered, if any.
895    ///
896    /// This does not clear the error state and further calls to [`Self::get_error`] will return (a
897    /// clone of) the same error.
898    #[inline]
899    pub fn get_error(&self) -> Option<ScanError> {
900        self.error.clone()
901    }
902
903    #[cold]
904    fn simple_key_expected(&self) -> ScanError {
905        ScanError::new_str(self.mark, "simple key expected")
906    }
907
908    #[cold]
909    fn unclosed_bracket(mark: Marker, bracket: char) -> ScanError {
910        ScanError::new(mark, format!("unclosed bracket '{bracket}'"))
911    }
912
913    /// Consume the next character. It is assumed the next character is a blank.
914    #[inline]
915    fn skip_blank(&mut self) {
916        self.input.skip();
917
918        self.mark.offsets.chars += 1;
919        self.mark.col += 1;
920        self.mark.offsets.bytes = self.input.byte_offset();
921    }
922
923    /// Consume the next character. It is assumed the next character is not a blank.
924    #[inline]
925    fn skip_non_blank(&mut self) {
926        self.input.skip();
927
928        self.mark.offsets.chars += 1;
929        self.mark.col += 1;
930        self.mark.offsets.bytes = self.input.byte_offset();
931        self.leading_whitespace = false;
932    }
933
934    /// Consume the next characters. It is assumed none of the next characters are blanks.
935    #[inline]
936    fn skip_n_non_blank(&mut self, count: usize) {
937        for _ in 0..count {
938            self.input.skip();
939            self.mark.offsets.chars += 1;
940            self.mark.col += 1;
941        }
942        self.mark.offsets.bytes = self.input.byte_offset();
943        self.leading_whitespace = false;
944    }
945
946    /// Consume the next character. It is assumed the next character is a newline.
947    #[inline]
948    fn skip_nl(&mut self) {
949        self.input.skip();
950
951        self.mark.offsets.chars += 1;
952        self.mark.col = 0;
953        self.mark.line += 1;
954        self.mark.offsets.bytes = self.input.byte_offset();
955        self.leading_whitespace = true;
956    }
957
958    /// Consume a linebreak (either CR, LF or CRLF), if any. Do nothing if there's none.
959    #[inline]
960    fn skip_linebreak(&mut self) {
961        if self.input.next_2_are('\r', '\n') {
962            // While technically not a blank, this does not matter as `self.leading_whitespace`
963            // will be reset by `skip_nl`.
964            self.skip_blank();
965            self.skip_nl();
966        } else if self.input.next_is_break() {
967            self.skip_nl();
968        }
969    }
970
971    /// Return whether the [`TokenType::StreamStart`] event has been emitted.
972    #[inline]
973    pub fn stream_started(&self) -> bool {
974        self.stream_start_produced
975    }
976
977    /// Return whether the [`TokenType::StreamEnd`] event has been emitted.
978    #[inline]
979    pub fn stream_ended(&self) -> bool {
980        self.stream_end_produced
981    }
982
983    /// Get the current position in the input stream.
984    #[inline]
985    pub fn mark(&self) -> Marker {
986        self.mark
987    }
988
989    // Read and consume a line break (either `\r`, `\n` or `\r\n`).
990    //
991    // A `\n` is pushed into `s`.
992    //
993    // # Panics (in debug)
994    // If the next characters do not correspond to a line break.
995    #[inline]
996    fn read_break(&mut self, s: &mut String) {
997        self.skip_break();
998        s.push('\n');
999    }
1000
1001    // Read and consume a line break (either `\r`, `\n` or `\r\n`).
1002    //
1003    // # Panics (in debug)
1004    // If the next characters do not correspond to a line break.
1005    #[inline]
1006    fn skip_break(&mut self) {
1007        let c = self.input.peek();
1008        let nc = self.input.peek_nth(1);
1009        debug_assert!(is_break(c));
1010        if c == '\r' && nc == '\n' {
1011            self.skip_blank();
1012        }
1013        self.skip_nl();
1014    }
1015
    /// Insert `tok` into the token queue so that it ends up at index `pos`.
    ///
    /// Tokens already at `pos` or later are shifted one slot towards the back.
    ///
    /// # Panics
    /// Panics if `pos` is greater than the current number of queued tokens.
    fn insert_token(&mut self, pos: usize, tok: Token<'input>) {
        let old_len = self.tokens.len();
        // `pos == old_len` is allowed: it appends at the back of the queue.
        assert!(pos <= old_len);
        self.tokens.insert(pos, tok);
    }
1022
1023    #[inline]
1024    fn allow_simple_key(&mut self) {
1025        self.simple_key_allowed = true;
1026    }
1027
1028    #[inline]
1029    fn disallow_simple_key(&mut self) {
1030        self.simple_key_allowed = false;
1031    }
1032
1033    /// Fetch the next token in the stream.
1034    ///
1035    /// # Errors
1036    /// Returns `ScanError` when the scanner does not find the next expected token.
1037    pub fn fetch_next_token(&mut self) -> ScanResult {
1038        self.input.lookahead(1);
1039
1040        if !self.stream_start_produced {
1041            self.fetch_stream_start();
1042            return Ok(());
1043        }
1044        self.skip_to_next_token()?;
1045
1046        debug_print!(
1047            "  \x1B[38;5;244m\u{2192} fetch_next_token after whitespace {:?} {:?}\x1B[m",
1048            self.mark,
1049            self.input.peek()
1050        );
1051
1052        self.stale_simple_keys()?;
1053
1054        let mark = self.mark;
1055        self.unroll_indent(mark.col as isize);
1056
1057        self.input.lookahead(4);
1058
1059        if self.input.next_is_z() {
1060            self.fetch_stream_end()?;
1061            return Ok(());
1062        }
1063
1064        if self.mark.col == 0 {
1065            if self.input.next_char_is('%') {
1066                return self.fetch_directive();
1067            } else if self.input.next_is_document_start() {
1068                return self.fetch_document_indicator(TokenType::DocumentStart);
1069            } else if self.input.next_is_document_end() {
1070                self.fetch_document_indicator(TokenType::DocumentEnd)?;
1071                self.skip_ws_to_eol(SkipTabs::Yes)?;
1072                if !self.input.next_is_breakz() {
1073                    return Err(ScanError::new_str(
1074                        self.mark,
1075                        "invalid content after document end marker",
1076                    ));
1077                }
1078                return Ok(());
1079            }
1080        }
1081
1082        if (self.mark.col as isize) < self.indent {
1083            self.input.lookahead(1);
1084            let c = self.input.peek();
1085            if self.flow_level == 0 || !matches!(c, ']' | '}' | ',') {
1086                return Err(ScanError::new_str(self.mark, "invalid indentation"));
1087            }
1088        }
1089
1090        let c = self.input.peek();
1091        let nc = self.input.peek_nth(1);
1092        match c {
1093            '[' => self.fetch_flow_collection_start(TokenType::FlowSequenceStart),
1094            '{' => self.fetch_flow_collection_start(TokenType::FlowMappingStart),
1095            ']' => self.fetch_flow_collection_end(TokenType::FlowSequenceEnd),
1096            '}' => self.fetch_flow_collection_end(TokenType::FlowMappingEnd),
1097            ',' => self.fetch_flow_entry(),
1098            '-' if is_blank_or_breakz(nc) => self.fetch_block_entry(),
1099            '?' if is_blank_or_breakz(nc) => self.fetch_key(),
1100            ':' if is_blank_or_breakz(nc) => self.fetch_value(),
1101            ':' if self.flow_level > 0
1102                && (is_flow(nc) || self.mark.index() == self.adjacent_value_allowed_at) =>
1103            {
1104                self.fetch_flow_value()
1105            }
1106            // Is it an alias?
1107            '*' => self.fetch_anchor(true),
1108            // Is it an anchor?
1109            '&' => self.fetch_anchor(false),
1110            '!' => self.fetch_tag(),
1111            // Is it a literal scalar?
1112            '|' if self.flow_level == 0 => self.fetch_block_scalar(true),
1113            // Is it a folded scalar?
1114            '>' if self.flow_level == 0 => self.fetch_block_scalar(false),
1115            '\'' => self.fetch_flow_scalar(true),
1116            '"' => self.fetch_flow_scalar(false),
1117            // plain scalar
1118            '-' if !is_blank_or_breakz(nc) => self.fetch_plain_scalar(),
1119            ':' | '?' if !is_blank_or_breakz(nc) && self.flow_level == 0 => {
1120                self.fetch_plain_scalar()
1121            }
1122            '%' | '@' | '`' => Err(ScanError::new(
1123                self.mark,
1124                format!("unexpected character: `{c}'"),
1125            )),
1126            _ => self.fetch_plain_scalar(),
1127        }
1128    }
1129
1130    /// Return the next token in the stream.
1131    /// # Errors
1132    /// Returns `ScanError` when scanning fails to find an expected next token.
1133    pub fn next_token(&mut self) -> Result<Option<Token<'input>>, ScanError> {
1134        if self.stream_end_produced {
1135            return Ok(None);
1136        }
1137
1138        if !self.token_available {
1139            self.fetch_more_tokens()?;
1140        }
1141        let Some(t) = self.tokens.pop_front() else {
1142            return Err(ScanError::new_str(
1143                self.mark,
1144                "did not find expected next token",
1145            ));
1146        };
1147        self.token_available = false;
1148        self.tokens_parsed += 1;
1149
1150        if let TokenType::StreamEnd = t.1 {
1151            self.stream_end_produced = true;
1152        }
1153        Ok(Some(t))
1154    }
1155
1156    /// Fetch tokens from the token stream.
1157    /// # Errors
1158    /// Returns `ScanError` when loading fails.
1159    pub fn fetch_more_tokens(&mut self) -> ScanResult {
1160        let mut need_more;
1161        loop {
1162            if self.tokens.is_empty() {
1163                need_more = true;
1164            } else {
1165                need_more = false;
1166                // Stale potential keys that we know won't be keys.
1167                self.stale_simple_keys()?;
1168                // If our next token to be emitted may be a key, fetch more context.
1169                for sk in &self.simple_keys {
1170                    if sk.possible && sk.token_number == self.tokens_parsed {
1171                        need_more = true;
1172                        break;
1173                    }
1174                }
1175            }
1176
1177            // Stop fetching immediately after document end/start markers
1178            // to allow the parser to emit the event before reading more content.
1179            if let Some(token) = self.tokens.back() {
1180                if matches!(token.1, TokenType::DocumentEnd | TokenType::DocumentStart) {
1181                    break;
1182                }
1183            }
1184
1185            if !need_more {
1186                break;
1187            }
1188            self.fetch_next_token()?;
1189        }
1190        self.token_available = true;
1191
1192        Ok(())
1193    }
1194
1195    /// Mark simple keys that can no longer be keys as such.
1196    ///
1197    /// This function sets `possible` to `false` to each key that, now we have more context, we
1198    /// know will not be keys.
1199    ///
1200    /// # Errors
1201    /// This function returns an error if one of the key we would stale was required to be a key.
1202    fn stale_simple_keys(&mut self) -> ScanResult {
1203        for sk in &mut self.simple_keys {
1204            let is_line_stale = self.flow_level == 0 && sk.mark.line < self.mark.line;
1205            // The length cap applies in flow contexts too; otherwise token buffering can grow
1206            // without bound while the scanner waits to see whether a later ':' resolves the key.
1207            let is_length_stale =
1208                self.mark.index().saturating_sub(sk.mark.index()) > SIMPLE_KEY_MAX_LOOKAHEAD;
1209
1210            if sk.possible && (is_line_stale || is_length_stale) {
1211                if sk.required {
1212                    return Err(ScanError::new_str(self.mark, "simple key expect ':'"));
1213                }
1214                sk.possible = false;
1215            }
1216        }
1217        Ok(())
1218    }
1219
    /// Skip over all whitespace (`\t`, ` `, `\n`, `\r`) and comments until the next token.
    ///
    /// Every consumed line break re-allows simple keys when the scanner is outside flow
    /// collections (`flow_level == 0`).
    ///
    /// # Errors
    /// This function returns an error if a tabulation is encountered where there should not be
    /// one, or if a plain scalar that was interrupted by a comment could be continued on the
    /// line immediately following that comment.
    fn skip_to_next_token(&mut self) -> ScanResult {
        // Hot-path helper: consume a single logical linebreak and apply simple-key rules.
        // (Kept local to ensure the compiler can inline it easily.)
        let consume_linebreak = |this: &mut Self| {
            this.input.lookahead(2);
            this.skip_linebreak();
            if this.flow_level == 0 {
                this.allow_simple_key();
            }
        };

        loop {
            match self.input.look_ch() {
                // Tabs may not be used as indentation (block context only).
                '\t' => {
                    if self.is_within_block()
                        && self.leading_whitespace
                        && (self.mark.col as isize) < self.indent
                    {
                        // Consume the rest of the whitespace run to find out whether the tab
                        // is followed by content on the same line.
                        self.skip_ws_to_eol(SkipTabs::Yes)?;

                        // If we have content on that line with a tab, return an error.
                        if !self.input.next_is_breakz() {
                            return Err(ScanError::new_str(
                                self.mark,
                                "tabs disallowed within this context (block indentation)",
                            ));
                        }

                        // Micro-opt: if we stopped on a linebreak, consume it now (avoids another loop trip).
                        if matches!(self.input.look_ch(), '\n' | '\r') {
                            consume_linebreak(self);
                        }
                    } else {
                        // Non-indentation tab behaves like blank.
                        self.skip_blank();
                    }
                }

                ' ' => self.skip_blank(),

                '\n' | '\r' => consume_linebreak(self),

                '#' => {
                    // Skip the whole comment payload in one go.
                    let n = self.input.skip_while_non_breakz();
                    // The comment stays on one line, so only the column and offsets move.
                    self.mark.offsets.chars += n;
                    self.mark.col += n;
                    self.mark.offsets.bytes = self.input.byte_offset();

                    // Micro-opt: comment-only lines are common; consume the following linebreak here.
                    if matches!(self.input.look_ch(), '\n' | '\r') {
                        consume_linebreak(self);
                    }
                }

                // Anything else is the start of the next token.
                _ => break,
            }
        }

        // If a plain scalar was interrupted by a comment, and the next line could
        // continue the scalar in block context, this is invalid.
        // `take()` clears the pending marker, so the check runs at most once per interruption.
        if let Some(err_mark) = self.interrupted_plain_by_comment.take() {
            // BS4K should only trigger when the continuation would start on the immediate next
            // line (no intervening empty/comment-only lines). A blank line resets the folding
            // opportunity and thus should not error.
            let is_immediate_next_line = self.mark.line == err_mark.line + 1;

            // Optimization: do the cheap checks first; only then request extra lookahead / do deeper checks.
            if self.flow_level == 0
                && is_immediate_next_line
                && (self.mark.col as isize) > self.indent
            {
                // Ensure enough lookahead for:
                // - the checks below (peek/peek_nth)
                // - document indicator detection which needs 4 chars.
                self.input.lookahead(4);

                if !self.input.next_is_z()
                    && !self.input.next_is_document_indicator()
                    && self.input.next_can_be_plain_scalar(false)
                {
                    // The error is reported at the recorded interruption mark, not at the
                    // current position.
                    return Err(ScanError::new_str(
                        err_mark,
                        "comment intercepting the multiline text",
                    ));
                }
            }
        }

        Ok(())
    }
1317
1318    /// Skip over YAML whitespace (` `, `\n`, `\r`).
1319    ///
1320    /// # Errors
1321    /// This function returns an error if no whitespace was found.
1322    fn skip_yaml_whitespace(&mut self) -> ScanResult {
1323        let mut need_whitespace = true;
1324        loop {
1325            match self.input.look_ch() {
1326                ' ' => {
1327                    self.skip_blank();
1328
1329                    need_whitespace = false;
1330                }
1331                '\n' | '\r' => {
1332                    self.input.lookahead(2);
1333                    self.skip_linebreak();
1334                    if self.flow_level == 0 {
1335                        self.allow_simple_key();
1336                    }
1337                    need_whitespace = false;
1338                }
1339                '#' => {
1340                    let comment_length = self.input.skip_while_non_breakz();
1341                    self.mark.offsets.chars += comment_length;
1342                    self.mark.col += comment_length;
1343                    self.mark.offsets.bytes = self.input.byte_offset();
1344                }
1345                _ => break,
1346            }
1347        }
1348
1349        if need_whitespace {
1350            Err(ScanError::new_str(self.mark(), "expected whitespace"))
1351        } else {
1352            Ok(())
1353        }
1354    }
1355
1356    fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> Result<SkipTabs, ScanError> {
1357        let (n_bytes, result) = self.input.skip_ws_to_eol(skip_tabs);
1358        self.mark.col += n_bytes;
1359        self.mark.offsets.chars += n_bytes;
1360        self.mark.offsets.bytes = self.input.byte_offset();
1361        result.map_err(|msg| ScanError::new_str(self.mark, msg))
1362    }
1363
1364    fn fetch_stream_start(&mut self) {
1365        let mark = self.mark;
1366        self.indent = -1;
1367        self.stream_start_produced = true;
1368        self.allow_simple_key();
1369        self.tokens.push_back(Token(
1370            Span::empty(mark),
1371            TokenType::StreamStart(TEncoding::Utf8),
1372        ));
1373        self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0)));
1374    }
1375
1376    fn fetch_stream_end(&mut self) -> ScanResult {
1377        // force new line
1378        if self.mark.col != 0 {
1379            self.mark.col = 0;
1380            self.mark.line += 1;
1381        }
1382
1383        if let Some((mark, bracket)) = self.flow_markers.pop() {
1384            return Err(Self::unclosed_bracket(mark, bracket));
1385        }
1386
1387        // If the stream ended, we won't have more context. We can stall all the simple keys we
1388        // had. If one was required, however, that was an error and we must propagate it.
1389        for sk in &mut self.simple_keys {
1390            if sk.required && sk.possible {
1391                return Err(self.simple_key_expected());
1392            }
1393            sk.possible = false;
1394        }
1395
1396        self.unroll_indent(-1);
1397        self.remove_simple_key()?;
1398        self.disallow_simple_key();
1399
1400        self.tokens
1401            .push_back(Token(Span::empty(self.mark), TokenType::StreamEnd));
1402        Ok(())
1403    }
1404
1405    fn fetch_directive(&mut self) -> ScanResult {
1406        self.unroll_indent(-1);
1407        self.remove_simple_key()?;
1408
1409        self.disallow_simple_key();
1410
1411        let tok = self.scan_directive()?;
1412        self.tokens.push_back(tok);
1413
1414        Ok(())
1415    }
1416
1417    fn scan_directive(&mut self) -> Result<Token<'input>, ScanError> {
1418        let start_mark = self.mark;
1419        self.skip_non_blank();
1420
1421        let name = self.scan_directive_name()?;
1422        let tok = match name.as_ref() {
1423            "YAML" => self.scan_version_directive_value(&start_mark)?,
1424            "TAG" => self.scan_tag_directive_value(&start_mark)?,
1425            _ => {
1426                let mut params = Vec::new();
1427                while self.input.next_is_blank() {
1428                    let n_blanks = self.input.skip_while_blank();
1429                    self.mark.offsets.chars += n_blanks;
1430                    self.mark.col += n_blanks;
1431                    self.mark.offsets.bytes = self.input.byte_offset();
1432
1433                    if !is_blank_or_breakz(self.input.peek()) {
1434                        let mut param = String::new();
1435                        let n_chars = self.input.fetch_while_is_yaml_non_space(&mut param);
1436                        self.mark.offsets.chars += n_chars;
1437                        self.mark.col += n_chars;
1438                        self.mark.offsets.bytes = self.input.byte_offset();
1439                        params.push(param);
1440                    }
1441                }
1442
1443                Token(
1444                    Span::new(start_mark, self.mark),
1445                    TokenType::ReservedDirective(name, params),
1446                )
1447            }
1448        };
1449
1450        self.skip_ws_to_eol(SkipTabs::Yes)?;
1451
1452        if self.input.next_is_breakz() {
1453            self.input.lookahead(2);
1454            self.skip_linebreak();
1455            Ok(tok)
1456        } else {
1457            Err(ScanError::new_str(
1458                start_mark,
1459                "while scanning a directive, did not find expected comment or line break",
1460            ))
1461        }
1462    }
1463
1464    fn scan_version_directive_value(&mut self, mark: &Marker) -> Result<Token<'input>, ScanError> {
1465        let n_blanks = self.input.skip_while_blank();
1466        self.mark.offsets.chars += n_blanks;
1467        self.mark.col += n_blanks;
1468        self.mark.offsets.bytes = self.input.byte_offset();
1469
1470        let major = self.scan_version_directive_number(mark)?;
1471
1472        if self.input.peek() != '.' {
1473            return Err(ScanError::new_str(
1474                *mark,
1475                "while scanning a YAML directive, did not find expected digit or '.' character",
1476            ));
1477        }
1478        self.skip_non_blank();
1479
1480        let minor = self.scan_version_directive_number(mark)?;
1481
1482        Ok(Token(
1483            Span::new(*mark, self.mark),
1484            TokenType::VersionDirective(major, minor),
1485        ))
1486    }
1487
1488    fn scan_directive_name(&mut self) -> Result<String, ScanError> {
1489        let start_mark = self.mark;
1490        let mut string = String::new();
1491
1492        let n_chars = self.input.fetch_while_is_yaml_non_space(&mut string);
1493        self.mark.offsets.chars += n_chars;
1494        self.mark.col += n_chars;
1495        self.mark.offsets.bytes = self.input.byte_offset();
1496
1497        if string.is_empty() {
1498            return Err(ScanError::new_str(
1499                start_mark,
1500                "while scanning a directive, could not find expected directive name",
1501            ));
1502        }
1503
1504        if !is_blank_or_breakz(self.input.peek()) {
1505            return Err(ScanError::new_str(
1506                start_mark,
1507                "while scanning a directive, found unexpected non-alphabetical character",
1508            ));
1509        }
1510
1511        Ok(string)
1512    }
1513
1514    fn scan_version_directive_number(&mut self, mark: &Marker) -> Result<u32, ScanError> {
1515        let mut val = 0u32;
1516        let mut length = 0usize;
1517        while let Some(digit) = self.input.look_ch().to_digit(10) {
1518            if length + 1 > 9 {
1519                return Err(ScanError::new_str(
1520                    *mark,
1521                    "while scanning a YAML directive, found extremely long version number",
1522                ));
1523            }
1524            length += 1;
1525            val = val * 10 + digit;
1526            self.skip_non_blank();
1527        }
1528
1529        if length == 0 {
1530            return Err(ScanError::new_str(
1531                *mark,
1532                "while scanning a YAML directive, did not find expected version number",
1533            ));
1534        }
1535
1536        Ok(val)
1537    }
1538
1539    fn scan_tag_directive_value(&mut self, mark: &Marker) -> Result<Token<'input>, ScanError> {
1540        let n_blanks = self.input.skip_while_blank();
1541        self.mark.offsets.chars += n_blanks;
1542        self.mark.col += n_blanks;
1543        self.mark.offsets.bytes = self.input.byte_offset();
1544
1545        let handle = self.scan_tag_handle_directive_cow(mark)?;
1546
1547        let n_blanks = self.input.skip_while_blank();
1548        self.mark.offsets.chars += n_blanks;
1549        self.mark.col += n_blanks;
1550        self.mark.offsets.bytes = self.input.byte_offset();
1551
1552        let prefix = self.scan_tag_prefix_directive_cow(mark)?;
1553
1554        self.input.lookahead(1);
1555
1556        if self.input.next_is_blank_or_breakz() {
1557            Ok(Token(
1558                Span::new(*mark, self.mark),
1559                TokenType::TagDirective(handle, prefix),
1560            ))
1561        } else {
1562            Err(ScanError::new_str(
1563                *mark,
1564                "while scanning TAG, did not find expected whitespace or line break",
1565            ))
1566        }
1567    }
1568
1569    fn fetch_tag(&mut self) -> ScanResult {
1570        self.save_simple_key();
1571        self.disallow_simple_key();
1572
1573        let tok = self.scan_tag()?;
1574        self.tokens.push_back(tok);
1575        Ok(())
1576    }
1577
    /// Scan a tag at the current position and build a [`TokenType::Tag`] token.
    ///
    /// Three shapes are handled: verbatim tags (second character is `<`), `!handle!suffix`
    /// shorthands, and `!suffix` shorthands. A lone `!` yields an empty handle with `!` as the
    /// suffix. When the input does not expose byte offsets, the work is delegated to
    /// `scan_tag_owned`.
    ///
    /// # Errors
    /// Returns an error if any part of the tag cannot be scanned, or if the tag is followed by
    /// something other than a blank, a line break, the end of the input, or (inside a flow
    /// collection) one of `,`, `]`, `}`.
    fn scan_tag(&mut self) -> Result<Token<'input>, ScanError> {
        let start_mark = self.mark;

        // Check if the tag is in the canonical form (verbatim).
        self.input.lookahead(2);

        // If byte_offset is not available, use the original owned-only path.
        if self.input.byte_offset().is_none() {
            return self.scan_tag_owned(&start_mark);
        }

        let (handle, suffix): (Cow<'input, str>, Cow<'input, str>) =
            if self.input.nth_char_is(1, '<') {
                // Verbatim tags always need owned strings (URI escapes).
                let suffix = self.scan_verbatim_tag(&start_mark)?;
                (Cow::Owned(String::new()), Cow::Owned(suffix))
            } else {
                // The tag has either the '!suffix' or the '!handle!suffix'
                let handle = self.scan_tag_handle_cow(&start_mark)?;
                // Check if it is, indeed, handle.
                if handle.len() >= 2 && handle.starts_with('!') && handle.ends_with('!') {
                    // A tag handle starting with "!!" is a secondary tag handle.
                    let suffix = self.scan_tag_shorthand_suffix_cow(&start_mark, true)?;
                    (handle, suffix)
                } else {
                    // Not a real handle, it's part of the suffix.
                    // E.g., "!foo" -> handle="!", suffix="foo"
                    // The "handle" we scanned is actually "!" + suffix_part1.
                    // We need to also scan any remaining suffix characters.
                    let remaining_suffix =
                        self.scan_tag_shorthand_suffix_cow(&start_mark, false)?;

                    // Extract suffix from handle (skip leading '!') and combine with remaining.
                    // Slicing at byte 1 is safe: '!' is a single-byte character.
                    let suffix = if handle.len() > 1 {
                        if remaining_suffix.is_empty() {
                            // The suffix is just what's in handle after '!'
                            // (a borrowed handle stays borrowed, so no allocation happens).
                            match handle {
                                Cow::Borrowed(s) => Cow::Borrowed(&s[1..]),
                                Cow::Owned(s) => Cow::Owned(s[1..].to_owned()),
                            }
                        } else {
                            // Combine handle (minus leading '!') with remaining suffix.
                            let mut combined = handle[1..].to_owned();
                            combined.push_str(&remaining_suffix);
                            Cow::Owned(combined)
                        }
                    } else {
                        // handle is just "!", suffix is whatever we scanned after
                        remaining_suffix
                    };

                    // A special case: the '!' tag.  Set the handle to '' and the
                    // suffix to '!'.
                    if suffix.is_empty() {
                        (Cow::Borrowed(""), Cow::Borrowed("!"))
                    } else {
                        (Cow::Borrowed("!"), suffix)
                    }
                }
            };

        // A tag must be separated from what follows; inside flow collections the flow
        // indicators ',', ']' and '}' also count as separators.
        if is_blank_or_breakz(self.input.look_ch())
            || (self.flow_level > 0 && matches!(self.input.peek(), ',' | ']' | '}'))
        {
            // XXX: ex 7.2, an empty scalar can follow a secondary tag
            Ok(Token(
                Span::new(start_mark, self.mark),
                TokenType::Tag(handle, suffix),
            ))
        } else {
            Err(ScanError::new_str(
                start_mark,
                "while scanning a tag, did not find expected whitespace or line break",
            ))
        }
    }
1654
1655    /// Original owned-only tag scanning path for inputs without `byte_offset` support.
1656    fn scan_tag_owned(&mut self, start_mark: &Marker) -> Result<Token<'input>, ScanError> {
1657        let mut handle = String::new();
1658        let mut suffix;
1659
1660        if self.input.nth_char_is(1, '<') {
1661            suffix = self.scan_verbatim_tag(start_mark)?;
1662        } else {
1663            // The tag has either the '!suffix' or the '!handle!suffix'
1664            handle = self.scan_tag_handle(false, start_mark)?;
1665            // Check if it is, indeed, handle.
1666            if handle.len() >= 2 && handle.starts_with('!') && handle.ends_with('!') {
1667                // A tag handle starting with "!!" is a secondary tag handle.
1668                let is_secondary_handle = handle == "!!";
1669                suffix =
1670                    self.scan_tag_shorthand_suffix(false, is_secondary_handle, "", start_mark)?;
1671            } else {
1672                suffix = self.scan_tag_shorthand_suffix(false, false, &handle, start_mark)?;
1673                "!".clone_into(&mut handle);
1674                // A special case: the '!' tag.  Set the handle to '' and the
1675                // suffix to '!'.
1676                if suffix.is_empty() {
1677                    handle.clear();
1678                    "!".clone_into(&mut suffix);
1679                }
1680            }
1681        }
1682
1683        if is_blank_or_breakz(self.input.look_ch())
1684            || (self.flow_level > 0 && matches!(self.input.peek(), ',' | ']' | '}'))
1685        {
1686            // XXX: ex 7.2, an empty scalar can follow a secondary tag
1687            Ok(Token(
1688                Span::new(*start_mark, self.mark),
1689                TokenType::Tag(handle.into(), suffix.into()),
1690            ))
1691        } else {
1692            Err(ScanError::new_str(
1693                *start_mark,
1694                "while scanning a tag, did not find expected whitespace or line break",
1695            ))
1696        }
1697    }
1698
1699    /// Scan a tag handle as a `Cow<str>`, borrowing when possible.
1700    ///
1701    /// Tag handles are of the form `!`, `!!`, or `!name!` where name is ASCII alphanumeric.
1702    /// Since they contain no escape sequences, they can always be borrowed from `StrInput`.
1703    fn scan_tag_handle_cow(&mut self, mark: &Marker) -> Result<Cow<'input, str>, ScanError> {
1704        let Some(start) = self.input.byte_offset() else {
1705            return Ok(Cow::Owned(self.scan_tag_handle(false, mark)?));
1706        };
1707
1708        if self.input.look_ch() != '!' {
1709            return Err(ScanError::new_str(
1710                *mark,
1711                "while scanning a tag, did not find expected '!'",
1712            ));
1713        }
1714
1715        // Consume the leading '!'.
1716        self.skip_non_blank();
1717
1718        // Consume ns-word-char (ASCII alphanumeric, '_' or '-') characters.
1719        self.input.lookahead(1);
1720        while self.input.next_is_alpha() {
1721            self.skip_non_blank();
1722            self.input.lookahead(1);
1723        }
1724
1725        // Optional trailing '!'.
1726        if self.input.peek() == '!' {
1727            self.skip_non_blank();
1728        }
1729
1730        let Some(end) = self.input.byte_offset() else {
1731            return Ok(Cow::Owned(self.scan_tag_handle(false, mark)?));
1732        };
1733
1734        if let Some(slice) = self.try_borrow_slice(start, end) {
1735            Ok(Cow::Borrowed(slice))
1736        } else {
1737            let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
1738                ScanError::new_str(
1739                    *mark,
1740                    "internal error: input advertised slicing but did not provide a slice",
1741                )
1742            })?;
1743            Ok(Cow::Owned(slice.to_owned()))
1744        }
1745    }
1746
1747    /// Scan a tag shorthand suffix as a `Cow<str>`, borrowing when possible.
1748    ///
1749    /// The suffix can be borrowed only if no `%` URI escape sequences are present.
1750    fn scan_tag_shorthand_suffix_cow(
1751        &mut self,
1752        mark: &Marker,
1753        require_non_empty: bool,
1754    ) -> Result<Cow<'input, str>, ScanError> {
1755        let Some(start) = self.input.byte_offset() else {
1756            return Ok(Cow::Owned(
1757                self.scan_tag_shorthand_suffix(false, false, "", mark)?,
1758            ));
1759        };
1760
1761        // Scan tag characters, checking for URI escapes.
1762        while is_tag_char(self.input.look_ch()) {
1763            if self.input.peek() == '%' {
1764                // URI escape found - must decode, so fall back to owned path.
1765                let current = self
1766                    .input
1767                    .byte_offset()
1768                    .expect("byte_offset() must remain available once enabled");
1769                let mut out = if let Some(slice) = self.input.slice_bytes(start, current) {
1770                    slice.to_owned()
1771                } else {
1772                    String::new()
1773                };
1774
1775                // Continue scanning with owned buffer.
1776                while is_tag_char(self.input.look_ch()) {
1777                    if self.input.peek() == '%' {
1778                        out.push(self.scan_uri_escapes(mark)?);
1779                    } else {
1780                        out.push(self.input.peek());
1781                        self.skip_non_blank();
1782                    }
1783                }
1784                return Ok(Cow::Owned(out));
1785            }
1786            self.skip_non_blank();
1787        }
1788
1789        let Some(end) = self.input.byte_offset() else {
1790            return Ok(Cow::Owned(
1791                self.scan_tag_shorthand_suffix(false, false, "", mark)?,
1792            ));
1793        };
1794
1795        if require_non_empty && start == end {
1796            return Err(ScanError::new_str(
1797                *mark,
1798                "while parsing a tag, did not find expected tag URI",
1799            ));
1800        }
1801
1802        if let Some(slice) = self.try_borrow_slice(start, end) {
1803            Ok(Cow::Borrowed(slice))
1804        } else {
1805            let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
1806                ScanError::new_str(
1807                    *mark,
1808                    "internal error: input advertised slicing but did not provide a slice",
1809                )
1810            })?;
1811            Ok(Cow::Owned(slice.to_owned()))
1812        }
1813    }
1814
1815    fn scan_tag_handle(&mut self, directive: bool, mark: &Marker) -> Result<String, ScanError> {
1816        let mut string = String::new();
1817        if self.input.look_ch() != '!' {
1818            return Err(ScanError::new_str(
1819                *mark,
1820                "while scanning a tag, did not find expected '!'",
1821            ));
1822        }
1823
1824        string.push(self.input.peek());
1825        self.skip_non_blank();
1826
1827        let n_chars = self.input.fetch_while_is_alpha(&mut string);
1828        self.mark.offsets.chars += n_chars;
1829        self.mark.col += n_chars;
1830        self.mark.offsets.bytes = self.input.byte_offset();
1831
1832        // Check if the trailing character is '!' and copy it.
1833        if self.input.peek() == '!' {
1834            string.push(self.input.peek());
1835            self.skip_non_blank();
1836        } else if directive && string != "!" {
1837            // It's either the '!' tag or not really a tag handle.  If it's a %TAG
1838            // directive, it's an error.  If it's a tag token, it must be a part of
1839            // URI.
1840            return Err(ScanError::new_str(
1841                *mark,
1842                "while parsing a tag directive, did not find expected '!'",
1843            ));
1844        }
1845        Ok(string)
1846    }
1847
1848    /// Scan for a tag prefix (6.8.2.2).
1849    ///
1850    /// There are 2 kinds of tag prefixes:
1851    ///   - Local: Starts with a `!`, contains only URI chars (`!foo`)
1852    ///   - Global: Starts with a tag char, contains then URI chars (`!foo,2000:app/`)
1853    fn scan_tag_prefix(&mut self, start_mark: &Marker) -> Result<String, ScanError> {
1854        let mut string = String::new();
1855
1856        if self.input.look_ch() == '!' {
1857            // If we have a local tag, insert and skip `!`.
1858            string.push(self.input.peek());
1859            self.skip_non_blank();
1860        } else if !is_tag_char(self.input.peek()) {
1861            // Otherwise, check if the first global tag character is valid.
1862            return Err(ScanError::new_str(
1863                *start_mark,
1864                "invalid global tag character",
1865            ));
1866        } else if self.input.peek() == '%' {
1867            // If it is valid and an escape sequence, escape it.
1868            string.push(self.scan_uri_escapes(start_mark)?);
1869        } else {
1870            // Otherwise, push the first character.
1871            string.push(self.input.peek());
1872            self.skip_non_blank();
1873        }
1874
1875        while is_uri_char(self.input.look_ch()) {
1876            if self.input.peek() == '%' {
1877                string.push(self.scan_uri_escapes(start_mark)?);
1878            } else {
1879                string.push(self.input.peek());
1880                self.skip_non_blank();
1881            }
1882        }
1883
1884        Ok(string)
1885    }
1886
1887    /// Scan for a verbatim tag.
1888    ///
1889    /// The prefixing `!<` must _not_ have been skipped.
1890    fn scan_verbatim_tag(&mut self, start_mark: &Marker) -> Result<String, ScanError> {
1891        // Eat `!<`
1892        self.skip_non_blank();
1893        self.skip_non_blank();
1894
1895        let mut string = String::new();
1896        while is_uri_char(self.input.look_ch()) {
1897            if self.input.peek() == '%' {
1898                string.push(self.scan_uri_escapes(start_mark)?);
1899            } else {
1900                string.push(self.input.peek());
1901                self.skip_non_blank();
1902            }
1903        }
1904
1905        if string.is_empty() {
1906            return Err(ScanError::new_str(
1907                *start_mark,
1908                "while parsing a tag, did not find expected tag URI",
1909            ));
1910        }
1911
1912        if self.input.peek() != '>' {
1913            return Err(ScanError::new_str(
1914                *start_mark,
1915                "while scanning a verbatim tag, did not find the expected '>'",
1916            ));
1917        }
1918        self.skip_non_blank();
1919
1920        Ok(string)
1921    }
1922
1923    fn scan_tag_shorthand_suffix(
1924        &mut self,
1925        _directive: bool,
1926        _is_secondary: bool,
1927        head: &str,
1928        mark: &Marker,
1929    ) -> Result<String, ScanError> {
1930        let mut length = head.len();
1931        let mut string = String::new();
1932
1933        // Copy the head if needed.
1934        // Note that we don't copy the leading '!' character.
1935        if length > 1 {
1936            string.extend(head.chars().skip(1));
1937        }
1938
1939        while is_tag_char(self.input.look_ch()) {
1940            // Check if it is a URI-escape sequence.
1941            if self.input.peek() == '%' {
1942                string.push(self.scan_uri_escapes(mark)?);
1943            } else {
1944                string.push(self.input.peek());
1945                self.skip_non_blank();
1946            }
1947
1948            length += 1;
1949        }
1950
1951        if length == 0 {
1952            return Err(ScanError::new_str(
1953                *mark,
1954                "while parsing a tag, did not find expected tag URI",
1955            ));
1956        }
1957
1958        Ok(string)
1959    }
1960
1961    fn scan_uri_escapes(&mut self, mark: &Marker) -> Result<char, ScanError> {
1962        let mut width = 0usize;
1963        let mut bytes = [0u8; 4];
1964        let mut bytes_len = 0usize;
1965        loop {
1966            self.input.lookahead(3);
1967
1968            let c = self.input.peek_nth(1);
1969            let nc = self.input.peek_nth(2);
1970
1971            if !(self.input.peek() == '%' && is_hex(c) && is_hex(nc)) {
1972                return Err(ScanError::new_str(
1973                    *mark,
1974                    "while parsing a tag, found an invalid escape sequence",
1975                ));
1976            }
1977
1978            let byte = u8::try_from((as_hex(c) << 4) + as_hex(nc))
1979                .expect("two hex nibbles always fit in a byte");
1980            if width == 0 {
1981                width = match byte {
1982                    _ if byte & 0x80 == 0x00 => 1,
1983                    _ if byte & 0xE0 == 0xC0 => 2,
1984                    _ if byte & 0xF0 == 0xE0 => 3,
1985                    _ if byte & 0xF8 == 0xF0 => 4,
1986                    _ => {
1987                        return Err(ScanError::new_str(
1988                            *mark,
1989                            "while parsing a tag, found an incorrect leading UTF-8 byte",
1990                        ));
1991                    }
1992                };
1993            } else if byte & 0xc0 != 0x80 {
1994                return Err(ScanError::new_str(
1995                    *mark,
1996                    "while parsing a tag, found an incorrect trailing UTF-8 byte",
1997                ));
1998            }
1999
2000            bytes[bytes_len] = byte;
2001            bytes_len += 1;
2002
2003            self.skip_n_non_blank(3);
2004
2005            width -= 1;
2006            if width == 0 {
2007                break;
2008            }
2009        }
2010
2011        let s = core::str::from_utf8(&bytes[..bytes_len]).map_err(|_| {
2012            ScanError::new_str(
2013                *mark,
2014                "while parsing a tag, found an invalid UTF-8 codepoint",
2015            )
2016        })?;
2017
2018        let mut chars = s.chars();
2019        match (chars.next(), chars.next()) {
2020            (Some(ch), None) => Ok(ch),
2021            _ => Err(ScanError::new_str(
2022                *mark,
2023                "while parsing a tag, found an invalid UTF-8 codepoint",
2024            )),
2025        }
2026    }
2027
2028    fn fetch_anchor(&mut self, alias: bool) -> ScanResult {
2029        self.save_simple_key();
2030        self.disallow_simple_key();
2031
2032        let tok = self.scan_anchor(alias)?;
2033
2034        self.tokens.push_back(tok);
2035
2036        Ok(())
2037    }
2038
2039    fn scan_anchor(&mut self, alias: bool) -> Result<Token<'input>, ScanError> {
2040        let start_mark = self.mark;
2041
2042        // Skip `&` / `*`.
2043        self.skip_non_blank();
2044
2045        // Borrow from input when possible.
2046        if let Some(start) = self.input.byte_offset() {
2047            while is_anchor_char(self.input.look_ch()) {
2048                self.skip_non_blank();
2049            }
2050
2051            let end = self
2052                .input
2053                .byte_offset()
2054                .expect("byte_offset() must remain available once enabled");
2055
2056            if start == end {
2057                return Err(ScanError::new_str(start_mark, "while scanning an anchor or alias, did not find expected alphabetic or numeric character"));
2058            }
2059
2060            let cow = if let Some(slice) = self.try_borrow_slice(start, end) {
2061                Cow::Borrowed(slice)
2062            } else if let Some(slice) = self.input.slice_bytes(start, end) {
2063                Cow::Owned(slice.to_owned())
2064            } else {
2065                return Err(ScanError::new_str(
2066                    start_mark,
2067                    "internal error: input advertised slicing but did not provide a slice",
2068                ));
2069            };
2070
2071            let tok = if alias {
2072                TokenType::Alias(cow)
2073            } else {
2074                TokenType::Anchor(cow)
2075            };
2076            return Ok(Token(Span::new(start_mark, self.mark), tok));
2077        }
2078
2079        let mut string = String::new();
2080        while is_anchor_char(self.input.look_ch()) {
2081            string.push(self.input.peek());
2082            self.skip_non_blank();
2083        }
2084
2085        if string.is_empty() {
2086            return Err(ScanError::new_str(start_mark, "while scanning an anchor or alias, did not find expected alphabetic or numeric character"));
2087        }
2088
2089        let tok = if alias {
2090            TokenType::Alias(string.into())
2091        } else {
2092            TokenType::Anchor(string.into())
2093        };
2094        Ok(Token(Span::new(start_mark, self.mark), tok))
2095    }
2096
2097    fn fetch_flow_collection_start(&mut self, tok: TokenType<'input>) -> ScanResult {
2098        // The indicators '[' and '{' may start a simple key.
2099        self.save_simple_key();
2100
2101        let start_mark = self.mark;
2102        let indicator = self.input.peek();
2103        self.flow_markers.push((start_mark, indicator));
2104
2105        self.roll_one_col_indent();
2106        self.increase_flow_level()?;
2107
2108        self.allow_simple_key();
2109
2110        self.skip_non_blank();
2111
2112        if tok == TokenType::FlowMappingStart {
2113            self.flow_mapping_started.push(true);
2114        } else {
2115            self.flow_mapping_started.push(false);
2116            self.implicit_flow_mapping_states
2117                .push(ImplicitMappingState::Possible);
2118        }
2119
2120        self.skip_ws_to_eol(SkipTabs::Yes)?;
2121
2122        self.tokens
2123            .push_back(Token(Span::new(start_mark, self.mark), tok));
2124        Ok(())
2125    }
2126
2127    fn fetch_flow_collection_end(&mut self, tok: TokenType<'input>) -> ScanResult {
2128        // A closing bracket without a corresponding opening is invalid YAML.
2129        if self.flow_level == 0 {
2130            return Err(ScanError::new_str(self.mark, "misplaced bracket"));
2131        }
2132
2133        let Some((open_mark, open_ch)) = self.flow_markers.pop() else {
2134            return Err(ScanError::new_str(self.mark, "misplaced bracket"));
2135        };
2136
2137        let (expected_open, actual_close) = match tok {
2138            TokenType::FlowSequenceEnd => ('[', ']'),
2139            TokenType::FlowMappingEnd => ('{', '}'),
2140            _ => unreachable!("flow collection end called with non-closing token"),
2141        };
2142
2143        if open_ch != expected_open {
2144            return Err(ScanError::new(
2145                open_mark,
2146                format!("mismatched bracket '{open_ch}' closed by '{actual_close}'"),
2147            ));
2148        }
2149
2150        let flow_level = self.flow_level;
2151
2152        self.remove_simple_key()?;
2153
2154        if matches!(tok, TokenType::FlowSequenceEnd) {
2155            self.end_implicit_mapping(self.mark, flow_level);
2156            // We are out exiting the flow sequence, nesting goes down 1 level.
2157            self.implicit_flow_mapping_states.pop();
2158        }
2159        self.flow_mapping_started.pop();
2160
2161        self.decrease_flow_level();
2162
2163        self.disallow_simple_key();
2164
2165        let start_mark = self.mark;
2166        self.skip_non_blank();
2167        self.skip_ws_to_eol(SkipTabs::Yes)?;
2168
2169        // A flow collection within a flow mapping can be a key. In that case, the value may be
2170        // adjacent to the `:`.
2171        // ```yaml
2172        // - [ {a: b}:value ]
2173        // ```
2174        if self.flow_level > 0 {
2175            self.adjacent_value_allowed_at = self.mark.index();
2176        }
2177
2178        self.tokens
2179            .push_back(Token(Span::new(start_mark, self.mark), tok));
2180        Ok(())
2181    }
2182
2183    /// Push the `FlowEntry` token and skip over the `,`.
2184    fn fetch_flow_entry(&mut self) -> ScanResult {
2185        self.remove_simple_key()?;
2186        self.allow_simple_key();
2187
2188        self.end_implicit_mapping(self.mark, self.flow_level);
2189        if self.current_flow_collection_is_sequence() {
2190            self.set_current_flow_mapping_started(false);
2191        }
2192
2193        let start_mark = self.mark;
2194        self.skip_non_blank();
2195        self.skip_ws_to_eol(SkipTabs::Yes)?;
2196
2197        self.tokens.push_back(Token(
2198            Span::new(start_mark, self.mark),
2199            TokenType::FlowEntry,
2200        ));
2201        Ok(())
2202    }
2203
2204    fn increase_flow_level(&mut self) -> ScanResult {
2205        self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0)));
2206        self.flow_level = self
2207            .flow_level
2208            .checked_add(1)
2209            .ok_or_else(|| ScanError::new_str(self.mark, "recursion limit exceeded"))?;
2210        Ok(())
2211    }
2212
2213    fn decrease_flow_level(&mut self) {
2214        if self.flow_level > 0 {
2215            self.flow_level -= 1;
2216            self.simple_keys.pop().unwrap();
2217        }
2218    }
2219
2220    /// Push the `Block*` token(s) and skip over the `-`.
2221    ///
2222    /// Add an indentation level and push a `BlockSequenceStart` token if needed, then push a
2223    /// `BlockEntry` token.
2224    /// This function only skips over the `-` and does not fetch the entry value.
2225    fn fetch_block_entry(&mut self) -> ScanResult {
2226        if self.flow_level > 0 {
2227            // - * only allowed in block
2228            return Err(ScanError::new_str(
2229                self.mark,
2230                r#""-" is only valid inside a block"#,
2231            ));
2232        }
2233        // Check if we are allowed to start a new entry.
2234        if !self.simple_key_allowed {
2235            return Err(ScanError::new_str(
2236                self.mark,
2237                "block sequence entries are not allowed in this context",
2238            ));
2239        }
2240
2241        // ???, fixes test G9HC.
2242        if let Some(Token(span, TokenType::Anchor(..) | TokenType::Tag(..))) = self.tokens.back() {
2243            if self.mark.col == 0 && span.start.col == 0 && self.indent > -1 {
2244                return Err(ScanError::new_str(
2245                    span.start,
2246                    "invalid indentation for anchor",
2247                ));
2248            }
2249        }
2250
2251        // Skip over the `-`.
2252        let mark = self.mark;
2253        self.skip_non_blank();
2254
2255        // generate BLOCK-SEQUENCE-START if indented
2256        self.roll_indent(mark.col, None, TokenType::BlockSequenceStart, mark);
2257        let found_tabs = self.skip_ws_to_eol(SkipTabs::Yes)?.found_tabs();
2258        self.input.lookahead(2);
2259        if found_tabs && self.input.next_char_is('-') && is_blank_or_breakz(self.input.peek_nth(1))
2260        {
2261            return Err(ScanError::new_str(
2262                self.mark,
2263                "'-' must be followed by a valid YAML whitespace",
2264            ));
2265        }
2266
2267        self.skip_ws_to_eol(SkipTabs::No)?;
2268        self.input.lookahead(1);
2269        if self.input.next_is_break() || self.input.next_is_flow() {
2270            self.roll_one_col_indent();
2271        }
2272
2273        self.remove_simple_key()?;
2274        self.allow_simple_key();
2275
2276        self.tokens
2277            .push_back(Token(Span::empty(self.mark), TokenType::BlockEntry));
2278
2279        Ok(())
2280    }
2281
2282    fn fetch_document_indicator(&mut self, t: TokenType<'input>) -> ScanResult {
2283        if let Some((mark, bracket)) = self.flow_markers.pop() {
2284            return Err(ScanError::new(
2285                mark,
2286                format!("unclosed bracket '{bracket}'"),
2287            ));
2288        }
2289
2290        self.unroll_indent(-1);
2291        self.remove_simple_key()?;
2292        self.disallow_simple_key();
2293
2294        let mark = self.mark;
2295
2296        self.skip_n_non_blank(3);
2297
2298        self.tokens.push_back(Token(Span::new(mark, self.mark), t));
2299        Ok(())
2300    }
2301
2302    fn fetch_block_scalar(&mut self, literal: bool) -> ScanResult {
2303        self.save_simple_key();
2304        self.allow_simple_key();
2305        let tok = self.scan_block_scalar(literal)?;
2306
2307        self.tokens.push_back(tok);
2308        Ok(())
2309    }
2310
    /// Scan a block scalar, starting on its `|` or `>` indicator.
    ///
    /// `literal` selects the style of the resulting scalar: `ScalarStyle::Literal` when `true`,
    /// `ScalarStyle::Folded` otherwise. In the folded style, a single line break between two
    /// lines that do not start with a blank is turned into a space.
    ///
    /// The header may carry a chomping indicator (`+` keeps trailing breaks, `-` strips them)
    /// and a one-digit indentation indicator, in either order. Without an indentation
    /// indicator, the indentation is determined from the lines that follow.
    ///
    /// The span of the returned token starts at the header when the input ends before any
    /// content is found. Otherwise it starts at the position reached once the leading
    /// indentation and breaks have been skipped.
    ///
    /// # Errors
    /// - The indentation indicator is `0`.
    /// - The header is followed by something other than whitespace, a comment or a line break.
    /// - The first character after the header line is a tab.
    /// - A line is indented less than the scalar but more than the enclosing block.
    /// - Any error of `skip_ws_to_eol`.
    #[allow(clippy::too_many_lines)]
    fn scan_block_scalar(&mut self, literal: bool) -> Result<Token<'input>, ScanError> {
        let start_mark = self.mark;
        let mut chomping = Chomping::Clip;
        // Value of the explicit indentation indicator; 0 means none was given.
        let mut increment: usize = 0;
        // Column at which content lines start; 0 means it is not known yet.
        let mut indent: usize = 0;
        let mut trailing_blank: bool;
        let mut leading_blank: bool = false;
        let style = if literal {
            ScalarStyle::Literal
        } else {
            ScalarStyle::Folded
        };

        // `string` accumulates the scalar. The other buffers hold line breaks whose fate
        // (kept, folded or chomped) is only known once the next line has been seen.
        let mut string = String::new();
        let mut leading_break = String::new();
        let mut trailing_breaks = String::new();
        let mut chomping_break = String::new();

        // skip '|' or '>'
        self.skip_non_blank();
        self.unroll_non_block_indents();

        // Header: either a chomping indicator optionally followed by an indentation indicator,
        // or an indentation indicator optionally followed by a chomping indicator.
        if self.input.look_ch() == '+' || self.input.peek() == '-' {
            if self.input.peek() == '+' {
                chomping = Chomping::Keep;
            } else {
                chomping = Chomping::Strip;
            }
            self.skip_non_blank();
            self.input.lookahead(1);
            if self.input.next_is_digit() {
                if self.input.peek() == '0' {
                    return Err(ScanError::new_str(
                        start_mark,
                        "while scanning a block scalar, found an indentation indicator equal to 0",
                    ));
                }
                increment = (self.input.peek() as usize) - ('0' as usize);
                self.skip_non_blank();
            }
        } else if self.input.next_is_digit() {
            if self.input.peek() == '0' {
                return Err(ScanError::new_str(
                    start_mark,
                    "while scanning a block scalar, found an indentation indicator equal to 0",
                ));
            }

            increment = (self.input.peek() as usize) - ('0' as usize);
            self.skip_non_blank();
            self.input.lookahead(1);
            if self.input.peek() == '+' || self.input.peek() == '-' {
                if self.input.peek() == '+' {
                    chomping = Chomping::Keep;
                } else {
                    chomping = Chomping::Strip;
                }
                self.skip_non_blank();
            }
        }

        self.skip_ws_to_eol(SkipTabs::Yes)?;

        // Check if we are at the end of the line.
        self.input.lookahead(1);
        if !self.input.next_is_breakz() {
            return Err(ScanError::new_str(
                start_mark,
                "while scanning a block scalar, did not find expected comment or line break",
            ));
        }

        // The break ending the header line is kept aside: whether it ends up in the scalar
        // depends on the chomping mode and on what follows.
        if self.input.next_is_break() {
            self.input.lookahead(2);
            self.read_break(&mut chomping_break);
        }

        if self.input.look_ch() == '\t' {
            return Err(ScanError::new_str(
                start_mark,
                "a block scalar content cannot start with a tab",
            ));
        }

        // An explicit indentation indicator is relative to the current block indentation
        // (or used as-is when `self.indent` is negative).
        if increment > 0 {
            indent = if self.indent >= 0 {
                (self.indent + increment as isize) as usize
            } else {
                increment
            }
        }

        // Scan the leading line breaks and determine the indentation level if needed.
        if indent == 0 {
            self.skip_block_scalar_first_line_indent(&mut indent, &mut trailing_breaks);
        } else {
            self.skip_block_scalar_indent(indent, &mut trailing_breaks);
        }

        // We have an end-of-stream with no content, e.g.:
        // ```yaml
        // - |+
        // ```
        if self.input.next_is_z() {
            let contents = match chomping {
                // We strip trailing linebreaks. Nothing remain.
                Chomping::Strip => String::new(),
                // There was no newline after the chomping indicator.
                _ if self.mark.line == start_mark.line() => String::new(),
                // We clip lines, and there was a newline after the chomping indicator.
                // All other breaks are ignored.
                Chomping::Clip => chomping_break,
                // We keep lines. There was a newline after the chomping indicator but nothing
                // else.
                Chomping::Keep if trailing_breaks.is_empty() => chomping_break,
                // Otherwise, the newline after chomping is ignored.
                Chomping::Keep => trailing_breaks,
            };
            return Ok(Token(
                Span::new(start_mark, self.mark),
                TokenType::Scalar(style, contents.into()),
            ));
        }

        // The next line is indented less than the scalar content but more than the enclosing
        // block: this is an error, except for the root-level case described below.
        if self.mark.col < indent && (self.mark.col as isize) > self.indent {
            if self.indent < 0 && self.mark.col == 0 {
                self.input.lookahead(4);
                if self.input.next_is_document_start()
                    || self.input.next_is_document_end()
                    || self.input.peek() == '#'
                {
                    // At the root level, an explicit indentation indicator can still yield an
                    // empty scalar when the next line is a document marker or comment.
                    // In this case, the scalar is terminated rather than under-indented.
                } else {
                    return Err(ScanError::new_str(
                        self.mark,
                        "wrongly indented line in block scalar",
                    ));
                }
            } else {
                return Err(ScanError::new_str(
                    self.mark,
                    "wrongly indented line in block scalar",
                ));
            }
        }

        let mut line_buffer = String::with_capacity(100);
        // From here on, `start_mark` is the current position rather than the header: the
        // span of the returned token starts here.
        let start_mark = self.mark;
        // One iteration per content line: a line belongs to the scalar as long as it starts
        // exactly at column `indent`.
        while self.mark.col == indent && !self.input.next_is_z() {
            // With no indentation at all, a document end marker terminates the scalar.
            if indent == 0 {
                self.input.lookahead(4);
                if self.input.next_is_document_end() {
                    break;
                }
            }

            // We are at the first content character of a content line.
            trailing_blank = self.input.next_is_blank();
            if !literal && !leading_break.is_empty() && !leading_blank && !trailing_blank {
                // Folded style, between two lines that do not start with a blank: the break
                // after the previous line is dropped. It is replaced by the empty lines that
                // followed it or, if there were none, by a single space.
                string.push_str(&trailing_breaks);
                if trailing_breaks.is_empty() {
                    string.push(' ');
                }
            } else {
                // Otherwise, every pending line break is kept as-is.
                string.push_str(&leading_break);
                string.push_str(&trailing_breaks);
            }

            leading_break.clear();
            trailing_breaks.clear();

            // Remembered for the folding decision of the next line.
            leading_blank = self.input.next_is_blank();

            self.scan_block_scalar_content_line(&mut string, &mut line_buffer);

            // break on EOF
            self.input.lookahead(2);
            if self.input.next_is_z() {
                break;
            }

            self.read_break(&mut leading_break);

            // Eat the following indentation spaces and line breaks.
            self.skip_block_scalar_indent(indent, &mut trailing_breaks);
        }

        // Chomp the tail.
        if chomping != Chomping::Strip {
            string.push_str(&leading_break);
            // If we had reached an eof but the last character wasn't an end-of-line, check if the
            // last line was indented at least as the rest of the scalar, then we need to consider
            // there is a newline.
            if self.input.next_is_z() && self.mark.col >= indent.max(1) {
                string.push('\n');
            }
        }

        // Only the "keep" mode preserves the empty lines that follow the last content line.
        if chomping == Chomping::Keep {
            string.push_str(&trailing_breaks);
        }

        Ok(Token(
            Span::new(start_mark, self.mark),
            TokenType::Scalar(style, string.into()),
        ))
    }
2521
2522    /// Retrieve the contents of the line, parsing it as a block scalar.
2523    ///
2524    /// The contents will be appended to `string`. `line_buffer` is used as a temporary buffer to
2525    /// store bytes before pushing them to `string` and thus avoiding reallocating more than
2526    /// necessary. `line_buffer` is assumed to be empty upon calling this function. It will be
2527    /// `clear`ed before the end of the function.
2528    ///
2529    /// This function assumed the first character to read is the first content character in the
2530    /// line. This function does not consume the line break character(s) after the line.
2531    fn scan_block_scalar_content_line(&mut self, string: &mut String, line_buffer: &mut String) {
2532        // Start by evaluating characters in the buffer.
2533        while !self.input.buf_is_empty() && !self.input.next_is_breakz() {
2534            string.push(self.input.peek());
2535            // We may technically skip non-blank characters. However, the only distinction is
2536            // to determine what is leading whitespace and what is not. Here, we read the
2537            // contents of the line until either eof or a linebreak. We know we will not read
2538            // `self.leading_whitespace` until the end of the line, where it will be reset.
2539            // This allows us to call a slightly less expensive function.
2540            self.skip_blank();
2541        }
2542
2543        // All characters that were in the buffer were consumed. We need to check if more
2544        // follow.
2545        if self.input.buf_is_empty() {
2546            // We will read all consecutive non-breakz characters. We push them into a
2547            // temporary buffer. The main difference with going through `self.buffer` is that
2548            // characters are appended here as their real size (1B for ascii, or up to 4 bytes for
2549            // UTF-8). We can then use the internal `line_buffer` `Vec` to push data into `string`
2550            // (using `String::push_str`).
2551
2552            // line_buffer is empty at this point so we can compute n_chars here as well
2553            let mut n_chars = 0;
2554            debug_assert!(line_buffer.is_empty());
2555            while let Some(c) = self.input.raw_read_non_breakz_ch() {
2556                line_buffer.push(c);
2557                n_chars += 1;
2558            }
2559
2560            // We need to manually update our position; we haven't called a `skip` function.
2561            self.mark.col += n_chars;
2562            self.mark.offsets.chars += n_chars;
2563            self.mark.offsets.bytes = self.input.byte_offset();
2564
2565            // We can now append our bytes to our `string`.
2566            string.reserve(line_buffer.len());
2567            string.push_str(line_buffer);
2568            // This clears the _contents_ without touching the _capacity_.
2569            line_buffer.clear();
2570        }
2571    }
2572
2573    /// Skip the block scalar indentation and empty lines.
2574    fn skip_block_scalar_indent(&mut self, indent: usize, breaks: &mut String) {
2575        loop {
2576            // Consume all spaces. Tabs cannot be used as indentation.
2577            if indent < self.input.bufmaxlen() - 2 {
2578                self.input.lookahead(self.input.bufmaxlen());
2579                while self.mark.col < indent && self.input.peek() == ' ' {
2580                    self.skip_blank();
2581                }
2582            } else {
2583                loop {
2584                    self.input.lookahead(self.input.bufmaxlen());
2585                    while !self.input.buf_is_empty()
2586                        && self.mark.col < indent
2587                        && self.input.peek() == ' '
2588                    {
2589                        self.skip_blank();
2590                    }
2591                    // If we reached our indent, we can break. We must also break if we have
2592                    // reached content or EOF; that is, the buffer is not empty and the next
2593                    // character is not a space.
2594                    if self.mark.col == indent
2595                        || (!self.input.buf_is_empty() && self.input.peek() != ' ')
2596                    {
2597                        break;
2598                    }
2599                }
2600                self.input.lookahead(2);
2601            }
2602
2603            // If our current line is empty, skip over the break and continue looping.
2604            if self.input.next_is_break() {
2605                self.read_break(breaks);
2606            } else {
2607                // Otherwise, we have a content line. Return control.
2608                break;
2609            }
2610        }
2611    }
2612
2613    /// Determine the indentation level for a block scalar from the first line of its contents.
2614    ///
2615    /// The function skips over whitespace-only lines and sets `indent` to the the longest
2616    /// whitespace line that was encountered.
2617    fn skip_block_scalar_first_line_indent(&mut self, indent: &mut usize, breaks: &mut String) {
2618        let mut max_indent = 0;
2619        loop {
2620            // Consume all spaces. Tabs cannot be used as indentation.
2621            while self.input.look_ch() == ' ' {
2622                self.skip_blank();
2623            }
2624
2625            if self.mark.col > max_indent {
2626                max_indent = self.mark.col;
2627            }
2628
2629            if self.input.next_is_break() {
2630                // If our current line is empty, skip over the break and continue looping.
2631                self.input.lookahead(2);
2632                self.read_break(breaks);
2633            } else {
2634                // Otherwise, we have a content line. Return control.
2635                break;
2636            }
2637        }
2638
2639        // In case a yaml looks like:
2640        // ```yaml
2641        // |
2642        // foo
2643        // bar
2644        // ```
2645        // We need to set the indent to 0 and not 1. In all other cases, the indent must be at
2646        // least 1. When in the above example, `self.indent` will be set to -1.
2647        *indent = max_indent.max((self.indent + 1) as usize);
2648        if self.indent > 0 {
2649            *indent = (*indent).max(1);
2650        }
2651    }
2652
2653    fn fetch_flow_scalar(&mut self, single: bool) -> ScanResult {
2654        self.save_simple_key();
2655        self.disallow_simple_key();
2656
2657        let tok = self.scan_flow_scalar(single)?;
2658
2659        // From spec: To ensure JSON compatibility, if a key inside a flow mapping is JSON-like,
2660        // YAML allows the following value to be specified adjacent to the “:”.
2661        self.skip_to_next_token()?;
2662        self.adjacent_value_allowed_at = self.mark.index();
2663
2664        self.tokens.push_back(tok);
2665        Ok(())
2666    }
2667
2668    #[allow(clippy::too_many_lines)]
2669    fn scan_flow_scalar(&mut self, single: bool) -> Result<Token<'input>, ScanError> {
2670        let start_mark = self.mark;
2671
2672        // Output scalar contents.
2673        let mut buf = match self.input.byte_offset() {
2674            Some(off) => FlowScalarBuf::new_borrowed(off + self.input.peek().len_utf8()),
2675            None => FlowScalarBuf::new_owned(),
2676        };
2677
2678        // Scratch used to consume the *first* line break in a break run without emitting it.
2679        // (The first break folds to ' ' or to nothing depending on escaping rules.)
2680        let mut break_scratch = String::new();
2681
2682        /* Eat the left quote. */
2683        self.skip_non_blank();
2684
2685        loop {
2686            /* Check for a document indicator. */
2687            self.input.lookahead(4);
2688
2689            if self.mark.col == 0 && self.input.next_is_document_indicator() {
2690                return Err(ScanError::new_str(
2691                    start_mark,
2692                    "while scanning a quoted scalar, found unexpected document indicator",
2693                ));
2694            }
2695
2696            if self.input.next_is_z() {
2697                return Err(ScanError::new_str(start_mark, "unclosed quote"));
2698            }
2699
2700            // Do not enforce block indentation inside quoted (flow) scalars.
2701            // YAML allows line breaks within quoted scalars.
2702            let mut leading_blanks = false;
2703            self.consume_flow_scalar_non_whitespace_chars(
2704                single,
2705                &mut buf,
2706                &mut leading_blanks,
2707                &start_mark,
2708            )?;
2709
2710            match self.input.look_ch() {
2711                '\'' if single => break,
2712                '"' if !single => break,
2713                _ => {}
2714            }
2715
2716            // --- Faster whitespace / line break handling (no temporary Strings) ---
2717            //
2718            // Instead of:
2719            //   - collecting blanks into `whitespaces` and then copying
2720            //   - collecting breaks into `leading_break` / `trailing_breaks` and then copying
2721            //
2722            // We do:
2723            //   - append trailing blanks directly to `string`, remember where they started,
2724            //     and truncate them if a line break follows.
2725            //   - for line breaks: consume the first break into a scratch (discarded),
2726            //     append subsequent breaks directly to `string`.
2727            //
2728            // These flags mirror the old "is_empty()" checks:
2729            //   has_leading_break  <=> !leading_break.is_empty()
2730            //   has_trailing_breaks <=> !trailing_breaks.is_empty()
2731            let mut trailing_ws_start: Option<usize> = None;
2732            let mut has_leading_break = false;
2733            let mut has_trailing_breaks = false;
2734
2735            // For the borrowed path: track the (byte) start of a pending whitespace run.
2736            let mut pending_ws_start: Option<usize> = None;
2737
2738            // Consume blank characters.
2739            while self.input.next_is_blank() || self.input.next_is_break() {
2740                if self.input.next_is_blank() {
2741                    // Consume a space or a tab character.
2742                    if leading_blanks {
2743                        if self.input.peek() == '\t' && (self.mark.col as isize) < self.indent {
2744                            return Err(ScanError::new_str(
2745                                self.mark,
2746                                "tab cannot be used as indentation",
2747                            ));
2748                        }
2749                        self.skip_blank();
2750                    } else {
2751                        // Append to output immediately; if a break appears next, we'll truncate.
2752                        match buf {
2753                            FlowScalarBuf::Owned(ref mut string) => {
2754                                if trailing_ws_start.is_none() {
2755                                    trailing_ws_start = Some(string.len());
2756                                }
2757                                string.push(self.input.peek());
2758                            }
2759                            FlowScalarBuf::Borrowed { .. } => {
2760                                if pending_ws_start.is_none() {
2761                                    pending_ws_start = self.input.byte_offset();
2762                                }
2763                            }
2764                        }
2765                        self.skip_blank();
2766
2767                        if let (FlowScalarBuf::Borrowed { .. }, Some(ws_start), Some(ws_end)) =
2768                            (&mut buf, pending_ws_start, self.input.byte_offset())
2769                        {
2770                            buf.note_pending_ws(ws_start, ws_end);
2771                        }
2772                    }
2773                } else {
2774                    self.input.lookahead(2);
2775
2776                    // Check if it is a first line break.
2777                    if leading_blanks {
2778                        // Second+ line break in a run: preserve it.
2779                        match buf {
2780                            FlowScalarBuf::Owned(ref mut string) => self.read_break(string),
2781                            FlowScalarBuf::Borrowed { .. } => {
2782                                self.promote_flow_scalar_buf_to_owned(&start_mark, &mut buf)?;
2783                                let Some(string) = buf.as_owned_mut() else {
2784                                    unreachable!()
2785                                };
2786                                self.read_break(string);
2787                            }
2788                        }
2789                        has_trailing_breaks = true;
2790                    } else {
2791                        // First break: drop any trailing blanks we appended, then consume the break.
2792                        if let Some(pos) = trailing_ws_start.take() {
2793                            if let FlowScalarBuf::Owned(ref mut string) = buf {
2794                                string.truncate(pos);
2795                            }
2796                        }
2797
2798                        if pending_ws_start.take().is_some() {
2799                            // Trailing blanks before a break are discarded => transformation.
2800                            if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
2801                                self.promote_flow_scalar_buf_to_owned(&start_mark, &mut buf)?;
2802                            }
2803                            buf.discard_pending_ws();
2804                        } else {
2805                            buf.commit_pending_ws();
2806                        }
2807
2808                        break_scratch.clear();
2809                        self.read_break(&mut break_scratch);
2810                        // Keep `break_scratch` content (ignored) until next clear; no need to clear twice.
2811
2812                        has_leading_break = true;
2813                        leading_blanks = true;
2814                    }
2815                }
2816
2817                self.input.lookahead(1);
2818            }
2819
2820            // If we had a line break inside a quoted (flow) scalar, validate indentation
2821            // of the continuation line in block context.
2822            if leading_blanks && has_leading_break && self.flow_level == 0 {
2823                let next_ch = self.input.peek();
2824                let is_closing_quote = (single && next_ch == '\'') || (!single && next_ch == '"');
2825                if !is_closing_quote && (self.mark.col as isize) <= self.indent {
2826                    return Err(ScanError::new_str(
2827                        self.mark,
2828                        "invalid indentation in multiline quoted scalar",
2829                    ));
2830                }
2831            }
2832
2833            // Join the whitespaces or fold line breaks.
2834            if leading_blanks {
2835                // Old logic:
2836                //   if leading_break empty => emit trailing_breaks (already emitted now)
2837                //   else if trailing_breaks empty => emit ' '
2838                //   else emit trailing_breaks (already emitted now)
2839                if has_leading_break && !has_trailing_breaks {
2840                    match buf {
2841                        FlowScalarBuf::Owned(ref mut string) => string.push(' '),
2842                        FlowScalarBuf::Borrowed { .. } => {
2843                            self.promote_flow_scalar_buf_to_owned(&start_mark, &mut buf)?;
2844                            let Some(string) = buf.as_owned_mut() else {
2845                                unreachable!()
2846                            };
2847                            string.push(' ');
2848                        }
2849                    }
2850                }
2851            }
2852            // else: trailing blanks are already appended to `string`
2853        } // loop
2854
2855        // Eat the right quote.
2856        self.skip_non_blank();
2857
2858        // Ensure there is no invalid trailing content.
2859        self.skip_ws_to_eol(SkipTabs::Yes)?;
2860        match self.input.peek() {
2861            // These can be encountered in flow sequences or mappings.
2862            ',' | '}' | ']' if self.flow_level > 0 => {}
2863            // An end-of-line / end-of-stream is fine. No trailing content.
2864            c if is_breakz(c) => {}
2865            // ':' can be encountered if our scalar is a key.
2866            // Outside of flow contexts, keys cannot span multiple lines
2867            ':' if self.flow_level == 0 && start_mark.line == self.mark.line => {}
2868            // Inside a flow context, this is allowed.
2869            ':' if self.flow_level > 0 => {}
2870            _ => {
2871                return Err(ScanError::new_str(
2872                    self.mark,
2873                    "invalid trailing content after double-quoted scalar",
2874                ));
2875            }
2876        }
2877
2878        let style = if single {
2879            ScalarStyle::SingleQuoted
2880        } else {
2881            ScalarStyle::DoubleQuoted
2882        };
2883
2884        let contents = match buf {
2885            FlowScalarBuf::Owned(string) => Cow::Owned(string),
2886            FlowScalarBuf::Borrowed {
2887                start,
2888                mut end,
2889                pending_ws_start,
2890                pending_ws_end,
2891            } => {
2892                // If we ended after a whitespace run, it is part of the output (no break followed).
2893                if pending_ws_start.is_some() {
2894                    end = pending_ws_end;
2895                }
2896                if let Some(slice) = self.try_borrow_slice(start, end) {
2897                    Cow::Borrowed(slice)
2898                } else {
2899                    let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
2900                        ScanError::new_str(
2901                            start_mark,
2902                            "internal error: input advertised offsets but did not provide a slice",
2903                        )
2904                    })?;
2905                    Cow::Owned(slice.to_owned())
2906                }
2907            }
2908        };
2909
2910        Ok(Token(
2911            Span::new(start_mark, self.mark),
2912            TokenType::Scalar(style, contents),
2913        ))
2914    }
2915
2916    /// Consume successive non-whitespace characters from a flow scalar.
2917    ///
2918    /// This function resolves escape sequences and stops upon encountering a whitespace, the end
2919    /// of the stream or the closing character for the scalar (`'` for single quoted scalars, `"`
2920    /// for double quoted scalars).
2921    ///
2922    /// # Errors
2923    /// Return an error if an invalid escape sequence is found.
2924    fn consume_flow_scalar_non_whitespace_chars(
2925        &mut self,
2926        single: bool,
2927        buf: &mut FlowScalarBuf,
2928        leading_blanks: &mut bool,
2929        start_mark: &Marker,
2930    ) -> Result<(), ScanError> {
2931        self.input.lookahead(2);
2932        while !is_blank_or_breakz(self.input.peek()) {
2933            match self.input.peek() {
2934                // Check for an escaped single quote.
2935                '\'' if self.input.peek_nth(1) == '\'' && single => {
2936                    if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
2937                        buf.commit_pending_ws();
2938                        self.promote_flow_scalar_buf_to_owned(start_mark, buf)?;
2939                    }
2940                    let Some(string) = buf.as_owned_mut() else {
2941                        unreachable!()
2942                    };
2943                    string.push('\'');
2944                    self.skip_n_non_blank(2);
2945                }
2946                // Check for the right quote.
2947                '\'' if single => break,
2948                '"' if !single => break,
2949                // Check for an escaped line break.
2950                '\\' if !single && is_break(self.input.peek_nth(1)) => {
2951                    self.input.lookahead(3);
2952                    if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
2953                        buf.commit_pending_ws();
2954                        self.promote_flow_scalar_buf_to_owned(start_mark, buf)?;
2955                    }
2956                    self.skip_non_blank();
2957                    self.skip_linebreak();
2958                    *leading_blanks = true;
2959                    break;
2960                }
2961                // Check for an escape sequence.
2962                '\\' if !single => {
2963                    if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
2964                        buf.commit_pending_ws();
2965                        self.promote_flow_scalar_buf_to_owned(start_mark, buf)?;
2966                    }
2967                    let Some(string) = buf.as_owned_mut() else {
2968                        unreachable!()
2969                    };
2970                    string.push(self.resolve_flow_scalar_escape_sequence(start_mark)?);
2971                }
2972                c => {
2973                    match buf {
2974                        FlowScalarBuf::Owned(ref mut string) => {
2975                            string.push(c);
2976                        }
2977                        FlowScalarBuf::Borrowed { .. } => {
2978                            buf.commit_pending_ws();
2979                        }
2980                    }
2981                    self.skip_non_blank();
2982
2983                    if let Some(new_end) = self.input.byte_offset() {
2984                        if let FlowScalarBuf::Borrowed { end, .. } = buf {
2985                            *end = new_end;
2986                        }
2987                    }
2988                }
2989            }
2990            self.input.lookahead(2);
2991        }
2992        Ok(())
2993    }
2994
2995    /// Escape the sequence we encounter in a flow scalar.
2996    ///
2997    /// `self.input.peek()` must point to the `\` starting the escape sequence.
2998    ///
2999    /// # Errors
3000    /// Return an error if an invalid escape sequence is found.
3001    fn resolve_flow_scalar_escape_sequence(
3002        &mut self,
3003        start_mark: &Marker,
3004    ) -> Result<char, ScanError> {
3005        let mut code_length = 0usize;
3006        let mut ret = '\0';
3007
3008        match self.input.peek_nth(1) {
3009            '0' => ret = '\0',
3010            'a' => ret = '\x07',
3011            'b' => ret = '\x08',
3012            't' | '\t' => ret = '\t',
3013            'n' => ret = '\n',
3014            'v' => ret = '\x0b',
3015            'f' => ret = '\x0c',
3016            'r' => ret = '\x0d',
3017            'e' => ret = '\x1b',
3018            ' ' => ret = '\x20',
3019            '"' => ret = '"',
3020            '/' => ret = '/',
3021            '\\' => ret = '\\',
3022            // Unicode next line (#x85)
3023            'N' => ret = char::from_u32(0x85).unwrap(),
3024            // Unicode non-breaking space (#xA0)
3025            '_' => ret = char::from_u32(0xA0).unwrap(),
3026            // Unicode line separator (#x2028)
3027            'L' => ret = char::from_u32(0x2028).unwrap(),
3028            // Unicode paragraph separator (#x2029)
3029            'P' => ret = char::from_u32(0x2029).unwrap(),
3030            'x' => code_length = 2,
3031            'u' => code_length = 4,
3032            'U' => code_length = 8,
3033            _ => {
3034                return Err(ScanError::new_str(
3035                    *start_mark,
3036                    "while parsing a quoted scalar, found unknown escape character",
3037                ))
3038            }
3039        }
3040        self.skip_n_non_blank(2);
3041
3042        // Consume an arbitrary escape code.
3043        if code_length > 0 {
3044            self.input.lookahead(code_length);
3045            let mut value = 0u32;
3046            for i in 0..code_length {
3047                let c = self.input.peek_nth(i);
3048                if !is_hex(c) {
3049                    return Err(ScanError::new_str(
3050                        *start_mark,
3051                        "while parsing a quoted scalar, did not find expected hexadecimal number",
3052                    ));
3053                }
3054                value = (value << 4) + as_hex(c);
3055            }
3056
3057            self.skip_n_non_blank(code_length);
3058
3059            // Handle JSON surrogate pairs: high surrogate followed by low surrogate
3060            if code_length == 4 && (0xD800..=0xDBFF).contains(&value) {
3061                self.input.lookahead(2);
3062                if self.input.peek() == '\\' && self.input.peek_nth(1) == 'u' {
3063                    self.skip_n_non_blank(2);
3064                    self.input.lookahead(4);
3065                    let mut low_value = 0u32;
3066                    for i in 0..4 {
3067                        let c = self.input.peek_nth(i);
3068                        if !is_hex(c) {
3069                            return Err(ScanError::new_str(
3070                                *start_mark,
3071                                "while parsing a quoted scalar, did not find expected hexadecimal number for low surrogate",
3072                            ));
3073                        }
3074                        low_value = (low_value << 4) + as_hex(c);
3075                    }
3076                    if (0xDC00..=0xDFFF).contains(&low_value) {
3077                        value = 0x10000 + (((value - 0xD800) << 10) | (low_value - 0xDC00));
3078                        self.skip_n_non_blank(4);
3079                    } else {
3080                        return Err(ScanError::new_str(
3081                            *start_mark,
3082                            "while parsing a quoted scalar, found invalid low surrogate",
3083                        ));
3084                    }
3085                } else {
3086                    return Err(ScanError::new_str(
3087                        *start_mark,
3088                        "while parsing a quoted scalar, found high surrogate without following low surrogate",
3089                    ));
3090                }
3091            } else if code_length == 4 && (0xDC00..=0xDFFF).contains(&value) {
3092                return Err(ScanError::new_str(
3093                    *start_mark,
3094                    "while parsing a quoted scalar, found unpaired low surrogate",
3095                ));
3096            }
3097
3098            let Some(ch) = char::from_u32(value) else {
3099                return Err(ScanError::new_str(
3100                    *start_mark,
3101                    "while parsing a quoted scalar, found invalid Unicode character escape code",
3102                ));
3103            };
3104            ret = ch;
3105        }
3106        Ok(ret)
3107    }
3108
3109    fn fetch_plain_scalar(&mut self) -> ScanResult {
3110        self.save_simple_key();
3111        self.disallow_simple_key();
3112
3113        let tok = self.scan_plain_scalar()?;
3114
3115        self.tokens.push_back(tok);
3116        Ok(())
3117    }
3118
3119    /// Scan for a plain scalar.
3120    ///
3121    /// Plain scalars are the most readable but restricted style. They may span multiple lines in
3122    /// some contexts.
3123    #[allow(clippy::too_many_lines)]
3124    fn scan_plain_scalar(&mut self) -> Result<Token<'input>, ScanError> {
3125        self.unroll_non_block_indents();
3126        let indent = self.indent + 1;
3127        let start_mark = self.mark;
3128
3129        if self.flow_level > 0 && (start_mark.col as isize) < indent {
3130            return Err(ScanError::new_str(
3131                start_mark,
3132                "invalid indentation in flow construct",
3133            ));
3134        }
3135
3136        let mut string = String::with_capacity(32);
3137        self.buf_whitespaces.clear();
3138        self.buf_leading_break.clear();
3139        self.buf_trailing_breaks.clear();
3140        let mut end_mark = self.mark;
3141
3142        loop {
3143            self.input.lookahead(4);
3144            if (self.mark.col == 0 && self.input.next_is_document_indicator())
3145                || self.input.peek() == '#'
3146            {
3147                // BS4K: If a `#` starts a comment after some separation spaces following content
3148                // of a plain scalar in block context, and there is potential continuation on the
3149                // next line, this is invalid. We cannot decide yet if there will be continuation,
3150                // so record that a comment interrupted a plain scalar.
3151                if self.input.peek() == '#'
3152                    && !string.is_empty()
3153                    && !self.buf_whitespaces.is_empty()
3154                    && self.flow_level == 0
3155                {
3156                    self.interrupted_plain_by_comment = Some(self.mark);
3157                }
3158                break;
3159            }
3160
3161            if self.flow_level > 0 && self.input.peek() == '-' && is_flow(self.input.peek_nth(1)) {
3162                return Err(ScanError::new_str(
3163                    self.mark,
3164                    "plain scalar cannot start with '-' followed by ,[]{}",
3165                ));
3166            }
3167
3168            if !self.input.next_is_blank_or_breakz()
3169                && self.input.next_can_be_plain_scalar(self.flow_level > 0)
3170            {
3171                if self.leading_whitespace {
3172                    if self.buf_leading_break.is_empty() {
3173                        string.push_str(&self.buf_leading_break);
3174                        string.push_str(&self.buf_trailing_breaks);
3175                        self.buf_trailing_breaks.clear();
3176                        self.buf_leading_break.clear();
3177                    } else {
3178                        if self.buf_trailing_breaks.is_empty() {
3179                            string.push(' ');
3180                        } else {
3181                            string.push_str(&self.buf_trailing_breaks);
3182                            self.buf_trailing_breaks.clear();
3183                        }
3184                        self.buf_leading_break.clear();
3185                    }
3186                    self.leading_whitespace = false;
3187                } else if !self.buf_whitespaces.is_empty() {
3188                    string.push_str(&self.buf_whitespaces);
3189                    self.buf_whitespaces.clear();
3190                }
3191
3192                // We can unroll the first iteration of the loop.
3193                string.push(self.input.peek());
3194                self.skip_non_blank();
3195                string.reserve(self.input.bufmaxlen());
3196
3197                // Add content non-blank characters to the scalar.
3198                let mut end = false;
3199                while !end {
3200                    // Fill the buffer once and process all characters in the buffer until the next
3201                    // fetch. Note that `next_can_be_plain_scalar` needs 2 lookahead characters,
3202                    // hence the `for` loop looping `self.input.bufmaxlen() - 1` times.
3203                    self.input.lookahead(self.input.bufmaxlen());
3204                    let (stop, chars_consumed) = self.input.fetch_plain_scalar_chunk(
3205                        &mut string,
3206                        self.input.bufmaxlen() - 1,
3207                        self.flow_level > 0,
3208                    );
3209                    end = stop;
3210                    self.mark.offsets.chars += chars_consumed;
3211                    self.mark.col += chars_consumed;
3212                    self.mark.offsets.bytes = self.input.byte_offset();
3213                }
3214                end_mark = self.mark;
3215            }
3216
3217            // We may reach the end of a plain scalar if:
3218            //  - We reach eof
3219            //  - We reach ": "
3220            //  - We find a flow character in a flow context
3221            if !(self.input.next_is_blank() || self.input.next_is_break()) {
3222                break;
3223            }
3224
3225            // Process blank characters.
3226            self.input.lookahead(2);
3227            while self.input.next_is_blank_or_break() {
3228                if self.input.next_is_blank() {
3229                    if !self.leading_whitespace {
3230                        self.buf_whitespaces.push(self.input.peek());
3231                        self.skip_blank();
3232                    } else if (self.mark.col as isize) < indent && self.input.peek() == '\t' {
3233                        // Tabs in an indentation columns are allowed if and only if the line is
3234                        // empty. Skip to the end of the line.
3235                        self.skip_ws_to_eol(SkipTabs::Yes)?;
3236                        if !self.input.next_is_breakz() {
3237                            return Err(ScanError::new_str(
3238                                start_mark,
3239                                "while scanning a plain scalar, found a tab",
3240                            ));
3241                        }
3242                    } else {
3243                        self.skip_blank();
3244                    }
3245                } else {
3246                    // Check if it is a first line break
3247                    if self.leading_whitespace {
3248                        self.skip_break();
3249                        self.buf_trailing_breaks.push('\n');
3250                    } else {
3251                        self.buf_whitespaces.clear();
3252                        self.skip_break();
3253                        self.buf_leading_break.push('\n');
3254                        self.leading_whitespace = true;
3255                    }
3256                }
3257                self.input.lookahead(2);
3258            }
3259
3260            // check indentation level
3261            if self.flow_level == 0 && (self.mark.col as isize) < indent {
3262                break;
3263            }
3264        }
3265
3266        if self.leading_whitespace {
3267            self.allow_simple_key();
3268        }
3269
3270        if string.is_empty() {
3271            // `fetch_plain_scalar` must absolutely consume at least one byte. Otherwise,
3272            // `fetch_next_token` will never stop calling it. An empty plain scalar may happen with
3273            // erroneous inputs such as "{...".
3274            Err(ScanError::new_str(
3275                start_mark,
3276                "unexpected end of plain scalar",
3277            ))
3278        } else {
3279            let contents = if let (Some(start), Some(end)) =
3280                (start_mark.byte_offset(), end_mark.byte_offset())
3281            {
3282                match self.try_borrow_slice(start, end) {
3283                    Some(slice) if slice == string => Cow::Borrowed(slice),
3284                    _ => Cow::Owned(string),
3285                }
3286            } else {
3287                Cow::Owned(string)
3288            };
3289
3290            Ok(Token(
3291                Span::new(start_mark, end_mark),
3292                TokenType::Scalar(ScalarStyle::Plain, contents),
3293            ))
3294        }
3295    }
3296
3297    fn fetch_key(&mut self) -> ScanResult {
3298        let start_mark = self.mark;
3299        if self.flow_level == 0 {
3300            // Check if we are allowed to start a new key (not necessarily simple).
3301            if !self.simple_key_allowed {
3302                return Err(ScanError::new_str(
3303                    self.mark,
3304                    "mapping keys are not allowed in this context",
3305                ));
3306            }
3307            self.roll_indent(
3308                start_mark.col,
3309                None,
3310                TokenType::BlockMappingStart,
3311                start_mark,
3312            );
3313        } else {
3314            // The scanner, upon emitting a `Key`, will prepend a `MappingStart` event.
3315            self.set_current_flow_mapping_started(true);
3316        }
3317
3318        self.remove_simple_key()?;
3319
3320        if self.flow_level == 0 {
3321            self.allow_simple_key();
3322        } else {
3323            self.disallow_simple_key();
3324        }
3325
3326        self.skip_non_blank();
3327        self.skip_yaml_whitespace()?;
3328        if self.input.peek() == '\t' {
3329            return Err(ScanError::new_str(
3330                self.mark(),
3331                "tabs disallowed in this context",
3332            ));
3333        }
3334        self.tokens
3335            .push_back(Token(Span::new(start_mark, self.mark), TokenType::Key));
3336        Ok(())
3337    }
3338
3339    /// Fetch a value in a mapping inside of a flow collection.
3340    ///
3341    /// This must not be called if [`self.flow_level`] is 0. This ensures the rules surrounding
3342    /// values in flow collections are respected prior to calling [`fetch_value`].
3343    ///
3344    /// [`self.flow_level`]: Self::flow_level
3345    /// [`fetch_value`]: Self::fetch_value
3346    fn fetch_flow_value(&mut self) -> ScanResult {
3347        let nc = self.input.peek_nth(1);
3348
3349        // If we encounter a ':' inside a flow collection and it is not immediately
3350        // followed by a blank or breakz:
3351        //   - We must check whether an adjacent value is allowed
3352        //     `["a":[]]` is valid. If the key is double-quoted, no need for a space. This
3353        //     is needed for JSON compatibility.
3354        //   - If not, we must ensure there is a space after the ':' and before its value.
3355        //     `[a: []]` is valid while `[a:[]]` isn't. `[a:b]` is treated as `["a:b"]`.
3356        //   - But if the value is empty (null), then it's okay.
3357        // The last line is for YAMLs like `[a:]`. The ':' is followed by a ']' (which is a
3358        // flow character), but the ']' is not the value. The value is an invisible empty
3359        // space which is represented as null ('~').
3360        if self.mark.index() != self.adjacent_value_allowed_at && (nc == '[' || nc == '{') {
3361            return Err(ScanError::new_str(
3362                self.mark,
3363                "':' may not precede any of `[{` in flow mapping",
3364            ));
3365        }
3366
3367        self.fetch_value()
3368    }
3369
3370    /// Fetch a value from a mapping (after a `:`).
3371    fn fetch_value(&mut self) -> ScanResult {
3372        let sk = self.simple_keys.last().unwrap().clone();
3373        let start_mark = self.mark;
3374        let is_implicit_flow_mapping = self.current_flow_collection_is_sequence()
3375            && !self.current_flow_mapping_started()
3376            && !self.implicit_flow_mapping_states.is_empty();
3377        if is_implicit_flow_mapping {
3378            *self.implicit_flow_mapping_states.last_mut().unwrap() =
3379                ImplicitMappingState::Inside(self.flow_level);
3380        }
3381
3382        // Skip over ':'.
3383        self.skip_non_blank();
3384        // Error detection: if ':' is followed by tab(s) without any space, and then what looks
3385        // like a value, emit a helpful error. The check for '-' or alphanumeric is an intentional
3386        // heuristic that catches common cases (e.g., `key:\tvalue`, `key:\t-item`) without
3387        // rejecting valid YAML like `key:\t|` (block scalar) or `key:\t"quoted"`.
3388        // Note: This heuristic won't catch Unicode value starters like `key:\täöü`, but such
3389        // cases will still fail to parse correctly (just with a less specific error message).
3390        if self.input.look_ch() == '\t'
3391            && !self.skip_ws_to_eol(SkipTabs::Yes)?.has_valid_yaml_ws()
3392            && (self.input.peek() == '-' || self.input.next_is_alpha())
3393        {
3394            return Err(ScanError::new_str(
3395                self.mark,
3396                "':' must be followed by a valid YAML whitespace",
3397            ));
3398        }
3399
3400        if sk.possible {
3401            // insert simple key
3402            let tok = Token(Span::empty(sk.mark), TokenType::Key);
3403            self.insert_token(sk.token_number - self.tokens_parsed, tok);
3404            if is_implicit_flow_mapping {
3405                if sk.mark.line < start_mark.line {
3406                    return Err(ScanError::new_str(
3407                        start_mark,
3408                        "illegal placement of ':' indicator",
3409                    ));
3410                }
3411                self.insert_token(
3412                    sk.token_number - self.tokens_parsed,
3413                    Token(Span::empty(sk.mark), TokenType::FlowMappingStart),
3414                );
3415            }
3416
3417            // Add the BLOCK-MAPPING-START token if needed.
3418            self.roll_indent(
3419                sk.mark.col,
3420                Some(sk.token_number),
3421                TokenType::BlockMappingStart,
3422                sk.mark,
3423            );
3424            self.roll_one_col_indent();
3425
3426            self.simple_keys.last_mut().unwrap().possible = false;
3427            self.disallow_simple_key();
3428        } else {
3429            if is_implicit_flow_mapping {
3430                self.tokens
3431                    .push_back(Token(Span::empty(start_mark), TokenType::FlowMappingStart));
3432            }
3433            // The ':' indicator follows a complex key.
3434            if self.flow_level == 0 {
3435                if !self.simple_key_allowed {
3436                    return Err(ScanError::new_str(
3437                        start_mark,
3438                        "mapping values are not allowed in this context",
3439                    ));
3440                }
3441
3442                self.roll_indent(
3443                    start_mark.col,
3444                    None,
3445                    TokenType::BlockMappingStart,
3446                    start_mark,
3447                );
3448            }
3449            self.roll_one_col_indent();
3450
3451            if self.flow_level == 0 {
3452                self.allow_simple_key();
3453            } else {
3454                self.disallow_simple_key();
3455            }
3456        }
3457        self.tokens
3458            .push_back(Token(Span::empty(start_mark), TokenType::Value));
3459
3460        Ok(())
3461    }
3462
3463    /// Add an indentation level to the stack with the given block token, if needed.
3464    ///
3465    /// An indentation level is added only if:
3466    ///   - We are not in a flow-style construct (which don't have indentation per-se).
3467    ///   - The current column is further indented than the last indent we have registered.
3468    fn roll_indent(
3469        &mut self,
3470        col: usize,
3471        number: Option<usize>,
3472        tok: TokenType<'input>,
3473        mark: Marker,
3474    ) {
3475        if self.flow_level > 0 {
3476            return;
3477        }
3478
3479        // If the last indent was a non-block indent, remove it.
3480        // This means that we prepared an indent that we thought we wouldn't use, but realized just
3481        // now that it is a block indent.
3482        if self.indent <= col as isize {
3483            if let Some(indent) = self.indents.last() {
3484                if !indent.needs_block_end {
3485                    self.indent = indent.indent;
3486                    self.indents.pop();
3487                }
3488            }
3489        }
3490
3491        if self.indent < col as isize {
3492            self.indents.push(Indent {
3493                indent: self.indent,
3494                needs_block_end: true,
3495            });
3496            self.indent = col as isize;
3497            let tokens_parsed = self.tokens_parsed;
3498            match number {
3499                Some(n) => self.insert_token(n - tokens_parsed, Token(Span::empty(mark), tok)),
3500                None => self.tokens.push_back(Token(Span::empty(mark), tok)),
3501            }
3502        }
3503    }
3504
3505    /// Pop indentation levels from the stack as much as needed.
3506    ///
3507    /// Indentation levels are popped from the stack while they are further indented than `col`.
3508    /// If we are in a flow-style construct (which don't have indentation per-se), this function
3509    /// does nothing.
3510    fn unroll_indent(&mut self, col: isize) {
3511        if self.flow_level > 0 {
3512            return;
3513        }
3514        while self.indent > col {
3515            let indent = self.indents.pop().unwrap();
3516            self.indent = indent.indent;
3517            if indent.needs_block_end {
3518                self.tokens
3519                    .push_back(Token(Span::empty(self.mark), TokenType::BlockEnd));
3520            }
3521        }
3522    }
3523
3524    /// Add an indentation level of 1 column that does not start a block.
3525    ///
3526    /// See the documentation of [`Indent::needs_block_end`] for more details.
3527    /// An indentation is not added if we are inside a flow level or if the last indent is already
3528    /// a non-block indent.
3529    fn roll_one_col_indent(&mut self) {
3530        if self.flow_level == 0 && self.indents.last().is_some_and(|x| x.needs_block_end) {
3531            self.indents.push(Indent {
3532                indent: self.indent,
3533                needs_block_end: false,
3534            });
3535            self.indent += 1;
3536        }
3537    }
3538
3539    /// Unroll all last indents created with [`Self::roll_one_col_indent`].
3540    fn unroll_non_block_indents(&mut self) {
3541        while let Some(indent) = self.indents.last() {
3542            if indent.needs_block_end {
3543                break;
3544            }
3545            self.indent = indent.indent;
3546            self.indents.pop();
3547        }
3548    }
3549
3550    /// Mark the next token to be inserted as a potential simple key.
3551    fn save_simple_key(&mut self) {
3552        if self.simple_key_allowed {
3553            let required = self.flow_level == 0
3554                && self.indent == (self.mark.col as isize)
3555                && self.indents.last().unwrap().needs_block_end;
3556
3557            if let Some(last) = self.simple_keys.last_mut() {
3558                *last = SimpleKey {
3559                    mark: self.mark,
3560                    possible: true,
3561                    required,
3562                    token_number: self.tokens_parsed + self.tokens.len(),
3563                };
3564            }
3565        }
3566    }
3567
3568    fn remove_simple_key(&mut self) -> ScanResult {
3569        let last = self.simple_keys.last_mut().unwrap();
3570        if last.possible && last.required {
3571            return Err(self.simple_key_expected());
3572        }
3573
3574        last.possible = false;
3575        Ok(())
3576    }
3577
3578    /// Return whether the scanner is inside a block but outside of a flow sequence.
3579    fn is_within_block(&self) -> bool {
3580        !self.indents.is_empty()
3581    }
3582
3583    /// If an implicit mapping had started, end it.
3584    ///
3585    /// This function does not pop the state in [`implicit_flow_mapping_states`].
3586    ///
3587    /// [`implicit_flow_mapping_states`]: Self::implicit_flow_mapping_states
3588    fn end_implicit_mapping(&mut self, mark: Marker, flow_level: u8) {
3589        if self
3590            .implicit_flow_mapping_states
3591            .last()
3592            .is_some_and(|state| *state == ImplicitMappingState::Inside(flow_level))
3593        {
3594            *self.implicit_flow_mapping_states.last_mut().unwrap() = ImplicitMappingState::Possible;
3595            self.set_current_flow_mapping_started(false);
3596            self.tokens
3597                .push_back(Token(Span::empty(mark), TokenType::FlowMappingEnd));
3598        }
3599    }
3600
3601    fn current_flow_collection_is_sequence(&self) -> bool {
3602        self.flow_markers
3603            .last()
3604            .is_some_and(|(_, bracket)| *bracket == '[')
3605    }
3606
3607    fn current_flow_mapping_started(&self) -> bool {
3608        self.flow_mapping_started.last().copied().unwrap_or(false)
3609    }
3610
3611    fn set_current_flow_mapping_started(&mut self, started: bool) {
3612        if let Some(current) = self.flow_mapping_started.last_mut() {
3613            *current = started;
3614        }
3615    }
3616}
3617
/// Chomping, how final line breaks and trailing empty lines are interpreted.
///
/// The enum is a plain, field-less value type, so it is `Copy` and `Debug` like the other
/// public enums of this module.
///
/// See YAML spec 8.1.1.2.
#[derive(Clone, Copy, PartialEq, Debug, Eq)]
pub enum Chomping {
    /// The final line break and any trailing empty lines are excluded.
    Strip,
    /// The final line break is preserved, but trailing empty lines are excluded.
    Clip,
    /// The final line break and trailing empty lines are included.
    Keep,
}
3630
3631#[cfg(test)]
3632mod test {
3633    use alloc::{borrow::Cow, rc::Rc, string::String, vec::Vec};
3634    use core::cell::Cell;
3635
3636    use crate::{
3637        input::{str::StrInput, BufferedInput},
3638        scanner::{Scanner, Token, TokenType},
3639    };
3640
3641    struct CountingChars {
3642        chars: alloc::vec::IntoIter<char>,
3643        read: Rc<Cell<usize>>,
3644    }
3645
3646    impl Iterator for CountingChars {
3647        type Item = char;
3648
3649        fn next(&mut self) -> Option<Self::Item> {
3650            let next = self.chars.next();
3651            if next.is_some() {
3652                self.read.set(self.read.get() + 1);
3653            }
3654            next
3655        }
3656    }
3657
3658    #[test]
3659    fn test_is_anchor_char() {
3660        use super::is_anchor_char;
3661        assert!(is_anchor_char('x'));
3662    }
3663
3664    #[test]
3665    fn flow_simple_key_length_limit_bounds_buffering() {
3666        let mut yaml = String::from("[\n\"start\"\n");
3667        for _ in 0..600 {
3668            yaml.push_str("\"x\"\n");
3669        }
3670        let total_chars = yaml.chars().count();
3671        let read = Rc::new(Cell::new(0));
3672        let chars = yaml.chars().collect::<Vec<_>>().into_iter();
3673        let mut scanner = Scanner::new(BufferedInput::new(CountingChars {
3674            chars,
3675            read: Rc::clone(&read),
3676        }));
3677
3678        assert!(matches!(
3679            scanner.next_token().unwrap().unwrap().1,
3680            TokenType::StreamStart(_)
3681        ));
3682
3683        let token = scanner.next_token().unwrap().unwrap();
3684        assert!(matches!(token.1, TokenType::FlowSequenceStart));
3685
3686        let token = scanner.next_token().unwrap().unwrap();
3687        assert!(matches!(
3688            token.1,
3689            TokenType::Scalar(_, ref value) if value == "start"
3690        ));
3691        assert!(
3692            read.get() < total_chars,
3693            "scanner consumed all {total_chars} chars before yielding the first flow scalar"
3694        );
3695        assert!(
3696            read.get() <= super::SIMPLE_KEY_MAX_LOOKAHEAD + 128,
3697            "scanner read {} chars before yielding the first flow scalar",
3698            read.get()
3699        );
3700    }
3701
3702    /// Ensure anchors scanned from `StrInput` are returned as `Cow::Borrowed`.
3703    #[test]
3704    fn anchor_name_is_borrowed_for_str_input() {
3705        let mut scanner = Scanner::new(StrInput::new("&anch\n"));
3706
3707        loop {
3708            let tok = scanner
3709                .next_token()
3710                .expect("valid YAML must scan without errors")
3711                .expect("scanner must eventually produce a token");
3712            if let TokenType::Anchor(name) = tok.1 {
3713                assert!(matches!(name, Cow::Borrowed("anch")));
3714                break;
3715            }
3716        }
3717    }
3718
3719    /// Ensure aliases scanned from `StrInput` are returned as `Cow::Borrowed`.
3720    #[test]
3721    fn anchor_name_rejects_non_printable_control_chars() {
3722        let mut scanner = Scanner::new(StrInput::new("&foo\u{0001}\n"));
3723
3724        loop {
3725            let tok = scanner
3726                .next_token()
3727                .expect("scanning should not fail")
3728                .expect("scanner must eventually produce a token");
3729            if let TokenType::Anchor(name) = tok.1 {
3730                assert!(matches!(name, Cow::Borrowed("foo")));
3731                let next = scanner.next_token().expect("scanning should not fail");
3732                if let Some(Token(_, TokenType::Scalar(_, rest))) = next {
3733                    assert!(rest.starts_with('\u{0001}'));
3734                }
3735                break;
3736            }
3737        }
3738    }
3739
3740    #[test]
3741    fn alias_name_rejects_non_printable_control_chars() {
3742        let mut scanner = Scanner::new(StrInput::new("*foo\u{0001}\n"));
3743
3744        loop {
3745            let tok = scanner
3746                .next_token()
3747                .expect("scanning should not fail")
3748                .expect("scanner must eventually produce a token");
3749            if let TokenType::Alias(name) = tok.1 {
3750                assert!(matches!(name, Cow::Borrowed("foo")));
3751                let next = scanner.next_token().expect("scanning should not fail");
3752                if let Some(Token(_, TokenType::Scalar(_, rest))) = next {
3753                    assert!(rest.starts_with('\u{0001}'));
3754                }
3755                break;
3756            }
3757        }
3758    }
3759
3760    #[test]
3761    fn alias_name_is_borrowed_for_str_input() {
3762        let mut scanner = Scanner::new(StrInput::new("*anch\n"));
3763
3764        loop {
3765            let tok = scanner
3766                .next_token()
3767                .expect("valid YAML must scan without errors")
3768                .expect("scanner must eventually produce a token");
3769            if let TokenType::Alias(name) = tok.1 {
3770                assert!(matches!(name, Cow::Borrowed("anch")));
3771                break;
3772            }
3773        }
3774    }
3775
3776    /// Ensure `%TAG` directive handle and prefix are borrowed when they are verbatim (no escapes).
3777    #[test]
3778    fn tag_directive_parts_are_borrowed_for_str_input() {
3779        let mut scanner = Scanner::new(StrInput::new("%TAG !e! tag:example.com,2000:app/\n"));
3780
3781        loop {
3782            let tok = scanner
3783                .next_token()
3784                .expect("valid YAML must scan without errors")
3785                .expect("scanner must eventually produce a token");
3786            if let TokenType::TagDirective(handle, prefix) = tok.1 {
3787                assert!(matches!(handle, Cow::Borrowed("!e!")));
3788                assert!(matches!(prefix, Cow::Borrowed("tag:example.com,2000:app/")));
3789                break;
3790            }
3791        }
3792    }
3793
3794    #[test]
3795    fn plain_scalar_is_borrowed_when_whitespace_free_for_str_input() {
3796        let mut scanner = Scanner::new(StrInput::new("foo\n"));
3797
3798        loop {
3799            let tok = scanner
3800                .next_token()
3801                .expect("valid YAML must scan without errors")
3802                .expect("scanner must eventually produce a token");
3803            if let TokenType::Scalar(_, value) = tok.1 {
3804                assert!(matches!(value, Cow::Borrowed("foo")));
3805                break;
3806            }
3807        }
3808    }
3809
3810    #[test]
3811    fn plain_scalar_is_borrowed_when_whitespace_present_for_str_input() {
3812        let mut scanner = Scanner::new(StrInput::new("foo bar\n"));
3813
3814        loop {
3815            let tok = scanner
3816                .next_token()
3817                .expect("valid YAML must scan without errors")
3818                .expect("scanner must eventually produce a token");
3819            if let TokenType::Scalar(_, value) = tok.1 {
3820                assert!(matches!(value, Cow::Borrowed("foo bar")));
3821                break;
3822            }
3823        }
3824    }
3825
3826    #[test]
3827    fn single_quoted_scalar_is_borrowed_when_verbatim_for_str_input() {
3828        let mut scanner = Scanner::new(StrInput::new("'foo bar'\n"));
3829
3830        loop {
3831            let tok = scanner
3832                .next_token()
3833                .expect("valid YAML must scan without errors")
3834                .expect("scanner must eventually produce a token");
3835            if let TokenType::Scalar(_, value) = tok.1 {
3836                assert!(matches!(value, Cow::Borrowed("foo bar")));
3837                break;
3838            }
3839        }
3840    }
3841
3842    #[test]
3843    fn single_quoted_scalar_is_owned_when_quote_is_escaped_for_str_input() {
3844        let mut scanner = Scanner::new(StrInput::new("'foo''bar'\n"));
3845
3846        loop {
3847            let tok = scanner
3848                .next_token()
3849                .expect("valid YAML must scan without errors")
3850                .expect("scanner must eventually produce a token");
3851            if let TokenType::Scalar(_, value) = tok.1 {
3852                assert!(matches!(value, Cow::Owned(_)));
3853                assert_eq!(&*value, "foo'bar");
3854                break;
3855            }
3856        }
3857    }
3858
3859    #[test]
3860    fn double_quoted_scalar_is_borrowed_when_verbatim_for_str_input() {
3861        let mut scanner = Scanner::new(StrInput::new("\"foo bar\"\n"));
3862
3863        loop {
3864            let tok = scanner
3865                .next_token()
3866                .expect("valid YAML must scan without errors")
3867                .expect("scanner must eventually produce a token");
3868            if let TokenType::Scalar(_, value) = tok.1 {
3869                assert!(matches!(value, Cow::Borrowed("foo bar")));
3870                break;
3871            }
3872        }
3873    }
3874
3875    #[test]
3876    fn double_quoted_scalar_is_owned_when_escape_sequence_present_for_str_input() {
3877        let mut scanner = Scanner::new(StrInput::new("\"foo\\nbar\"\n"));
3878
3879        loop {
3880            let tok = scanner
3881                .next_token()
3882                .expect("valid YAML must scan without errors")
3883                .expect("scanner must eventually produce a token");
3884            if let TokenType::Scalar(_, value) = tok.1 {
3885                assert!(matches!(value, Cow::Owned(_)));
3886                assert_eq!(&*value, "foo\nbar");
3887                break;
3888            }
3889        }
3890    }
3891
3892    #[test]
3893    fn plain_key_is_borrowed_for_str_input() {
3894        // Keys are just scalars in a key position; they should also be borrowed.
3895        let mut scanner = Scanner::new(StrInput::new("mykey: value\n"));
3896
3897        let mut found_key = false;
3898        let mut key_value: Option<Cow<'_, str>> = None;
3899
3900        loop {
3901            let tok = scanner
3902                .next_token()
3903                .expect("valid YAML must scan without errors");
3904            let Some(tok) = tok else { break };
3905
3906            if matches!(tok.1, TokenType::Key) {
3907                found_key = true;
3908            } else if found_key {
3909                if let TokenType::Scalar(_, value) = tok.1 {
3910                    key_value = Some(value);
3911                    break;
3912                }
3913            }
3914        }
3915
3916        assert!(found_key, "expected to find a Key token");
3917        let key_value = key_value.expect("expected to find a scalar after Key token");
3918        assert!(
3919            matches!(key_value, Cow::Borrowed("mykey")),
3920            "key should be borrowed, got: {key_value:?}"
3921        );
3922    }
3923
3924    #[test]
3925    fn quoted_key_is_borrowed_when_verbatim_for_str_input() {
3926        let mut scanner = Scanner::new(StrInput::new("\"mykey\": value\n"));
3927
3928        let mut found_key = false;
3929        let mut key_value: Option<Cow<'_, str>> = None;
3930
3931        loop {
3932            let tok = scanner
3933                .next_token()
3934                .expect("valid YAML must scan without errors");
3935            let Some(tok) = tok else { break };
3936
3937            if matches!(tok.1, TokenType::Key) {
3938                found_key = true;
3939            } else if found_key {
3940                if let TokenType::Scalar(_, value) = tok.1 {
3941                    key_value = Some(value);
3942                    break;
3943                }
3944            }
3945        }
3946
3947        assert!(found_key, "expected to find a Key token");
3948        let key_value = key_value.expect("expected to find a scalar after Key token");
3949        assert!(
3950            matches!(key_value, Cow::Borrowed("mykey")),
3951            "quoted key should be borrowed when verbatim, got: {key_value:?}"
3952        );
3953    }
3954
3955    #[test]
3956    fn tag_handle_and_suffix_are_borrowed_for_str_input() {
3957        // Test a tag like !!str which should have handle="!!" and suffix="str"
3958        let mut scanner = Scanner::new(StrInput::new("!!str foo\n"));
3959
3960        loop {
3961            let tok = scanner
3962                .next_token()
3963                .expect("valid YAML must scan without errors")
3964                .expect("scanner must eventually produce a token");
3965            if let TokenType::Tag(handle, suffix) = tok.1 {
3966                assert!(
3967                    matches!(handle, Cow::Borrowed("!!")),
3968                    "tag handle should be borrowed, got: {handle:?}"
3969                );
3970                assert!(
3971                    matches!(suffix, Cow::Borrowed("str")),
3972                    "tag suffix should be borrowed, got: {suffix:?}"
3973                );
3974                break;
3975            }
3976        }
3977    }
3978
3979    #[test]
3980    fn local_tag_suffix_is_borrowed_for_str_input() {
3981        // Test a local tag like !mytag which should have handle="!" and suffix="mytag"
3982        let mut scanner = Scanner::new(StrInput::new("!mytag foo\n"));
3983
3984        loop {
3985            let tok = scanner
3986                .next_token()
3987                .expect("valid YAML must scan without errors")
3988                .expect("scanner must eventually produce a token");
3989            if let TokenType::Tag(handle, suffix) = tok.1 {
3990                assert!(
3991                    matches!(handle, Cow::Borrowed("!")),
3992                    "local tag handle should be '!', got: {handle:?}"
3993                );
3994                assert!(
3995                    matches!(suffix, Cow::Borrowed("mytag")),
3996                    "local tag suffix should be borrowed, got: {suffix:?}"
3997                );
3998                break;
3999            }
4000        }
4001    }
4002
4003    #[test]
4004    fn tag_with_uri_escape_is_owned_for_str_input() {
4005        // Test a tag with URI escape like !my%20tag - suffix must be owned due to decoding
4006        let mut scanner = Scanner::new(StrInput::new("!!my%20tag foo\n"));
4007
4008        loop {
4009            let tok = scanner
4010                .next_token()
4011                .expect("valid YAML must scan without errors")
4012                .expect("scanner must eventually produce a token");
4013            if let TokenType::Tag(handle, suffix) = tok.1 {
4014                assert!(
4015                    matches!(handle, Cow::Borrowed("!!")),
4016                    "tag handle should still be borrowed, got: {handle:?}"
4017                );
4018                assert!(
4019                    matches!(suffix, Cow::Owned(_)),
4020                    "tag suffix with URI escape should be owned, got: {suffix:?}"
4021                );
4022                assert_eq!(&*suffix, "my tag");
4023                break;
4024            }
4025        }
4026    }
4027}