saphyr_parser_bw/
scanner.rs

1//! Home to the YAML Scanner.
2//!
3//! The scanner is the lowest-level parsing utility. It is the lexer / tokenizer, reading input a
4//! character at a time and emitting tokens that can later be interpreted by the [`crate::parser`]
5//! to check for more context and validity.
6//!
7//! Due to the grammar of YAML, the scanner has to have some context and is not error-free.
8
9#![allow(clippy::cast_possible_wrap)]
10#![allow(clippy::cast_sign_loss)]
11
12use alloc::{
13    borrow::{Cow, ToOwned},
14    collections::VecDeque,
15    string::String,
16    vec::Vec,
17};
18use core::{char, fmt};
19
20use crate::{
21    char_traits::{
22        as_hex, is_anchor_char, is_blank_or_breakz, is_break, is_breakz, is_flow, is_hex,
23        is_tag_char, is_uri_char,
24    },
25    input::{BorrowedInput, SkipTabs},
26};
27
/// Character encoding of the scanned input.
///
/// UTF-8 is the only encoding supported at the moment.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TEncoding {
    /// The input is encoded as UTF-8.
    Utf8,
}
34
/// How a scalar was written in the YAML document.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum ScalarStyle {
    /// A plain (unquoted) YAML scalar.
    Plain,
    /// A YAML scalar enclosed in single quotes.
    SingleQuoted,
    /// A YAML scalar enclosed in double quotes.
    DoubleQuoted,

    /// A YAML literal block scalar (`|` block).
    ///
    /// See [8.1.2](https://yaml.org/spec/1.2.2/#812-literal-style).
    /// Every indented character of a literal block is content, white space included.
    /// Characters cannot be escaped and long lines cannot be broken.
    Literal,
    /// A YAML folded block scalar (`>` block).
    ///
    /// See [8.1.3](https://yaml.org/spec/1.2.2/#813-folded-style).
    /// Every indented character of a folded block is content, white space included.
    /// Characters cannot be escaped. The content goes through line folding, which makes it
    /// possible to break long lines.
    Folded,
}
59
/// Offset information for a [`Marker`].
///
/// A YAML input is either a complete `&str` (stable backing storage) or a streaming source of
/// characters. With a stable input, both a character index and a byte offset can be tracked.
/// With a streaming input, byte offsets are generally not useful (they may not map to any
/// meaningful underlying file/source), which is why they are optional.
#[derive(Debug, Default, Clone, Copy)]
pub struct MarkerOffsets {
    /// Index in the source, counted in characters.
    chars: usize,
    /// Offset in the source, counted in bytes, when the input provides one.
    bytes: Option<usize>,
}

impl PartialEq for MarkerOffsets {
    /// Compare two offsets by character position only.
    ///
    /// The byte offset is an optional diagnostic enhancement which may differ from one input
    /// backend to another (e.g., `&str` vs streaming), so it takes no part in the comparison.
    fn eq(&self, other: &Self) -> bool {
        let (lhs, rhs) = (self.chars, other.chars);
        lhs == rhs
    }
}

impl Eq for MarkerOffsets {}
84
85/// A location in a yaml document.
86#[derive(Clone, Copy, PartialEq, Debug, Eq, Default)]
87pub struct Marker {
88    /// Offsets in the source.
89    offsets: MarkerOffsets,
90    /// The line (1-indexed).
91    line: usize,
92    /// The column (0-indexed).
93    col: usize,
94}
95
96impl Marker {
97    /// Create a new [`Marker`] at the given position.
98    #[must_use]
99    pub fn new(index: usize, line: usize, col: usize) -> Marker {
100        Marker {
101            offsets: MarkerOffsets {
102                chars: index,
103                bytes: None,
104            },
105            line,
106            col,
107        }
108    }
109
110    /// Return a copy of the marker with the given optional byte offset.
111    #[must_use]
112    pub fn with_byte_offset(mut self, byte_offset: Option<usize>) -> Marker {
113        self.offsets.bytes = byte_offset;
114        self
115    }
116
117    /// Return the index (in characters) of the marker in the source.
118    #[must_use]
119    pub fn index(&self) -> usize {
120        self.offsets.chars
121    }
122
123    /// Return the byte offset of the marker in the source, if available.
124    #[must_use]
125    pub fn byte_offset(&self) -> Option<usize> {
126        self.offsets.bytes
127    }
128
129    /// Return the line of the marker in the source.
130    #[must_use]
131    pub fn line(&self) -> usize {
132        self.line
133    }
134
135    /// Return the column of the marker in the source.
136    #[must_use]
137    pub fn col(&self) -> usize {
138        self.col
139    }
140}
141
142/// A range of locations in a Yaml document.
143#[derive(Clone, Copy, PartialEq, Debug, Eq, Default)]
144pub struct Span {
145    /// The start (inclusive) of the range.
146    pub start: Marker,
147    /// The end (exclusive) of the range.
148    pub end: Marker,
149
150    /// Optional indentation hint associated with this span.
151    ///
152    /// This is only meaningful for certain parser-emitted events (notably: block mapping keys).
153    /// When indentation is not meaningful or cannot be provided, it must be `None`.
154    pub indent: Option<usize>,
155}
156
157impl Span {
158    /// Create a new [`Span`] for the given range.
159    #[must_use]
160    pub fn new(start: Marker, end: Marker) -> Span {
161        Span {
162            start,
163            end,
164            indent: None,
165        }
166    }
167
168    /// Create a empty [`Span`] at a given location.
169    ///
170    /// An empty span doesn't contain any characters, but its position may still be meaningful.
171    /// For example, for an indented sequence [`SequenceEnd`] has a location but an empty span.
172    ///
173    /// [`SequenceEnd`]: crate::Event::SequenceEnd
174    #[must_use]
175    pub fn empty(mark: Marker) -> Span {
176        Span {
177            start: mark,
178            end: mark,
179            indent: None,
180        }
181    }
182
183    /// Return a copy of this [`Span`] with the given indentation hint.
184    #[must_use]
185    pub fn with_indent(mut self, indent: Option<usize>) -> Span {
186        self.indent = indent;
187        self
188    }
189
190    /// Return the length of the span (in characters).
191    #[must_use]
192    pub fn len(&self) -> usize {
193        self.end.index() - self.start.index()
194    }
195
196    /// Return whether the [`Span`] has a length of zero.
197    #[must_use]
198    pub fn is_empty(&self) -> bool {
199        self.len() == 0
200    }
201
202    /// Return the byte range of the span, if available.
203    #[must_use]
204    pub fn byte_range(&self) -> Option<core::ops::Range<usize>> {
205        let start = self.start.byte_offset()?;
206        let end = self.end.byte_offset()?;
207        Some(start..end)
208    }
209}
210
211/// An error that occurred while scanning.
212#[derive(Clone, PartialEq, Debug, Eq)]
213pub struct ScanError {
214    /// The position at which the error happened in the source.
215    mark: Marker,
216    /// Human-readable details about the error.
217    info: String,
218}
219
220impl ScanError {
221    /// Create a new error from a location and an error string.
222    #[must_use]
223    #[cold]
224    pub fn new(loc: Marker, info: String) -> ScanError {
225        ScanError { mark: loc, info }
226    }
227
228    /// Convenience alias for string slices.
229    #[must_use]
230    #[cold]
231    pub fn new_str(loc: Marker, info: &str) -> ScanError {
232        ScanError {
233            mark: loc,
234            info: info.to_owned(),
235        }
236    }
237
238    /// Return the marker pointing to the error in the source.
239    #[must_use]
240    pub fn marker(&self) -> &Marker {
241        &self.mark
242    }
243
244    /// Return the information string describing the error that happened.
245    #[must_use]
246    pub fn info(&self) -> &str {
247        self.info.as_ref()
248    }
249}
250
251impl fmt::Display for ScanError {
252    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
253        write!(
254            f,
255            "{} at char {} line {} column {}",
256            self.info,
257            self.mark.index(),
258            self.mark.line(),
259            self.mark.col() + 1
260        )
261    }
262}
263
264impl core::error::Error for ScanError {}
265
266/// The contents of a scanner token.
267#[derive(Clone, PartialEq, Debug, Eq)]
268pub enum TokenType<'input> {
269    /// The start of the stream. Sent first, before even [`TokenType::DocumentStart`].
270    StreamStart(TEncoding),
271    /// The end of the stream, EOF.
272    StreamEnd,
273    /// A YAML version directive.
274    VersionDirective(
275        /// Major
276        u32,
277        /// Minor
278        u32,
279    ),
280    /// A YAML tag directive (e.g.: `!!str`, `!foo!bar`, ...).
281    TagDirective(
282        /// Handle
283        Cow<'input, str>,
284        /// Prefix
285        Cow<'input, str>,
286    ),
287    /// The start of a YAML document (`---`).
288    DocumentStart,
289    /// The end of a YAML document (`...`).
290    DocumentEnd,
291    /// The start of a sequence block.
292    ///
293    /// Sequence blocks are arrays starting with a `-`.
294    BlockSequenceStart,
295    /// The start of a sequence mapping.
296    ///
297    /// Sequence mappings are "dictionaries" with "key: value" entries.
298    BlockMappingStart,
299    /// End of the corresponding `BlockSequenceStart` or `BlockMappingStart`.
300    BlockEnd,
301    /// Start of an inline sequence (`[ a, b ]`).
302    FlowSequenceStart,
303    /// End of an inline sequence.
304    FlowSequenceEnd,
305    /// Start of an inline mapping (`{ a: b, c: d }`).
306    FlowMappingStart,
307    /// End of an inline mapping.
308    FlowMappingEnd,
309    /// An entry in a block sequence (c.f.: [`TokenType::BlockSequenceStart`]).
310    BlockEntry,
311    /// An entry in a flow sequence (c.f.: [`TokenType::FlowSequenceStart`]).
312    FlowEntry,
313    /// A key in a mapping.
314    Key,
315    /// A value in a mapping.
316    Value,
317    /// A reference to an anchor.
318    Alias(Cow<'input, str>),
319    /// A YAML anchor (`&`/`*`).
320    Anchor(Cow<'input, str>),
321    /// A YAML tag (starting with bangs `!`).
322    Tag(
323        /// The handle of the tag.
324        Cow<'input, str>,
325        /// The suffix of the tag.
326        Cow<'input, str>,
327    ),
328    /// A regular YAML scalar.
329    Scalar(ScalarStyle, Cow<'input, str>),
330    /// A reserved YAML directive.
331    ReservedDirective(
332        /// Name
333        String,
334        /// Parameters
335        Vec<String>,
336    ),
337}
338
339/// A scanner token.
340#[derive(Clone, PartialEq, Debug, Eq)]
341pub struct Token<'input>(pub Span, pub TokenType<'input>);
342
343/// A scalar that was parsed and may correspond to a simple key.
344///
345/// Upon scanning the following yaml:
346/// ```yaml
347/// a: b
348/// ```
349/// We do not know that `a` is a key for a map until we have reached the following `:`. For this
350/// YAML, we would store `a` as a scalar token in the [`Scanner`], but not emit it yet. It would be
351/// kept inside the scanner until more context is fetched and we are able to know whether it is a
352/// plain scalar or a key.
353///
354/// For example, see the following 2 yaml documents:
355/// ```yaml
356/// ---
357/// a: b # Here, `a` is a key.
358/// ...
359/// ---
360/// a # Here, `a` is a plain scalar.
361/// ...
362/// ```
363/// An instance of [`SimpleKey`] is created in the [`Scanner`] when such ambiguity occurs.
364///
365/// In both documents, scanning `a` would lead to the creation of a [`SimpleKey`] with
366/// [`Self::possible`] set to `true`. The token for `a` would be pushed in the [`Scanner`] but not
367/// yet emitted. Instead, more context would be fetched (through [`Scanner::fetch_more_tokens`]).
368///
369/// In the first document, upon reaching the `:`, the [`SimpleKey`] would be inspected and our
370/// scalar `a` since it is a possible key, would be "turned" into a key. This is done by prepending
371/// a [`TokenType::Key`] to our scalar token in the [`Scanner`]. This way, the
372/// [`crate::parser::Parser`] would read the [`TokenType::Key`] token before the
373/// [`TokenType::Scalar`] token.
374///
375/// In the second document however, reaching the EOF would stale the [`SimpleKey`] and no
376/// [`TokenType::Key`] would be emitted by the scanner.
377#[derive(Clone, PartialEq, Debug, Eq)]
378struct SimpleKey {
379    /// Whether the token this [`SimpleKey`] refers to may still be a key.
380    ///
381    /// Sometimes, when we have more context, we notice that what we thought could be a key no
382    /// longer can be. In that case, [`Self::possible`] is set to `false`.
383    ///
384    /// For instance, let us consider the following invalid YAML:
385    /// ```yaml
386    /// key
387    ///   : value
388    /// ```
389    /// Upon reading the `\n` after `key`, the [`SimpleKey`] that was created for `key` is staled
390    /// and [`Self::possible`] set to `false`.
391    possible: bool,
392    /// Whether the token this [`SimpleKey`] refers to is required to be a key.
393    ///
394    /// With more context, we may know for sure that the token must be a key. If the YAML is
395    /// invalid, it may happen that the token be deemed not a key. In such event, an error has to
396    /// be raised. This boolean helps us know when to raise such error.
397    ///
398    /// TODO(ethiraric, 30/12/2023): Example of when this happens.
399    required: bool,
400    /// The index of the token referred to by the [`SimpleKey`].
401    ///
402    /// This is the index in the scanner, which takes into account both the tokens that have been
403    /// emitted and those about to be emitted. See [`Scanner::tokens_parsed`] and
404    /// [`Scanner::tokens`] for more details.
405    token_number: usize,
406    /// The position at which the token the [`SimpleKey`] refers to is.
407    mark: Marker,
408}
409
410impl SimpleKey {
411    /// Create a new [`SimpleKey`] at the given `Marker` and with the given flow level.
412    fn new(mark: Marker) -> SimpleKey {
413        SimpleKey {
414            possible: false,
415            required: false,
416            token_number: 0,
417            mark,
418        }
419    }
420}
421
/// One level of the indentation stack.
#[derive(Debug, Default, Clone)]
struct Indent {
    /// The former indentation level.
    indent: isize,
    /// Whether closing this indent produces a `BlockEnd` token.
    ///
    /// Some indentation levels do not open a block. For instance:
    /// ```yaml
    /// -
    ///   foo # ok
    /// -
    /// bar # ko, bar needs to be indented further than the `-`.
    /// - [
    ///  baz, # ok
    /// quux # ko, quux needs to be indented further than the '-'.
    /// ] # ko, the closing bracket needs to be indented further than the `-`.
    /// ```
    ///
    /// The `-` creates an indentation level for one single entry of the sequence. If a
    /// `BlockEnd` were emitted each time such a level ends, there would be one `BlockEnd` per
    /// entry, whereas exactly one is needed to end the sequence.
    needs_block_end: bool,
}
446
/// What we know about an implicit mapping.
///
/// An implicit mapping is a mapping inside a flow sequence whose opening `{` has been left
/// out:
/// ```yaml
/// [ a: b, c: d ]
/// # Equivalent to
/// [ { a: b }, { c: d } ]
/// # Equivalent to
/// - a: b
/// - c: d
/// ```
///
/// This state has to be tracked with care for every nested flow sequence. In the example
/// above, a [`FlowMappingStart`] event must be emitted upon `a` and `c` although no character
/// hints at it, and a [`FlowMappingEnd`] event must be emitted upon the `,` or the `]`. With
/// improper tracking, these events may be left out or emitted out-of-order.
///
/// [`FlowMappingStart`]: TokenType::FlowMappingStart
/// [`FlowMappingEnd`]: TokenType::FlowMappingEnd
#[derive(Debug, PartialEq)]
enum ImplicitMappingState {
    /// An implicit mapping may follow.
    ///
    /// This is the state right after the opening `[`. More context is required to tell whether
    /// an implicit mapping actually follows.
    Possible,
    /// We are inside the implicit mapping.
    ///
    /// This state is not set right away: the `:` must have been encountered first.
    // NOTE(review): the meaning of the `u8` payload is not visible from this definition.
    Inside(u8),
}
480
481/// The YAML scanner.
482///
483/// This corresponds to the low-level interface when reading YAML. The scanner emits token as they
484/// are read (akin to a lexer), but it also holds sufficient context to be able to disambiguate
485/// some of the constructs. It has understanding of indentation and whitespace and is able to
486/// generate error messages for some invalid YAML constructs.
487///
488/// It is however not a full parser and needs [`crate::parser::Parser`] to fully detect invalid
489/// YAML documents.
490#[derive(Debug)]
491#[allow(clippy::struct_excessive_bools)]
492pub struct Scanner<'input, T> {
493    /// The input source.
494    ///
495    /// This must implement [`Input`].
496    input: T,
497    /// The position of the cursor within the reader.
498    mark: Marker,
499    /// Buffer for tokens to be returned.
500    ///
501    /// This buffer can hold some temporary tokens that are not yet ready to be returned. For
502    /// instance, if we just read a scalar, it can be a value or a key if an implicit mapping
503    /// follows. In this case, the token stays in the `VecDeque` but cannot be returned from
504    /// [`Self::next`] until we have more context.
505    tokens: VecDeque<Token<'input>>,
506    /// The last error that happened.
507    error: Option<ScanError>,
508
509    /// Whether we have already emitted the `StreamStart` token.
510    stream_start_produced: bool,
511    /// Whether we have already emitted the `StreamEnd` token.
512    stream_end_produced: bool,
513    /// In some flow contexts, the value of a mapping is allowed to be adjacent to the `:`. When it
514    /// is, the index at which the `:` may be must be stored in `adjacent_value_allowed_at`.
515    adjacent_value_allowed_at: usize,
516    /// Whether a simple key could potentially start at the current position.
517    ///
518    /// Simple keys are the opposite of complex keys which are keys starting with `?`.
519    simple_key_allowed: bool,
520    /// A stack of potential simple keys.
521    ///
522    /// Refer to the documentation of [`SimpleKey`] for a more in-depth explanation of what they
523    /// are.
524    simple_keys: smallvec::SmallVec<[SimpleKey; 8]>,
525    /// The current indentation level.
526    indent: isize,
527    /// List of all block indentation levels we are in (except the current one).
528    indents: smallvec::SmallVec<[Indent; 8]>,
529    /// Level of nesting of flow sequences.
530    flow_level: u8,
531    /// The number of tokens that have been returned from the scanner.
532    ///
533    /// This excludes the tokens from [`Self::tokens`].
534    tokens_parsed: usize,
535    /// Whether a token is ready to be taken from [`Self::tokens`].
536    token_available: bool,
537    /// Whether all characters encountered since the last newline were whitespace.
538    leading_whitespace: bool,
539    /// Whether we started a flow mapping.
540    ///
541    /// This is used to detect implicit flow mapping starts such as:
542    /// ```yaml
543    /// [ : foo ] # { null: "foo" }
544    /// ```
545    flow_mapping_started: bool,
546    /// An array of states, representing whether flow sequences have implicit mappings.
547    ///
548    /// When a flow mapping is possible (when encountering the first `[` or a `,` in a sequence),
549    /// the state is set to [`Possible`].
550    /// When we encounter the `:`, we know we are in an implicit mapping and can set the state to
551    /// [`Inside`].
552    ///
553    /// There is one entry in this [`Vec`] for each nested flow sequence that we are in.
554    /// The entries are created with the opening `]` and popped with the closing `]`.
555    ///
556    /// [`Possible`]: ImplicitMappingState::Possible
557    /// [`Inside`]: ImplicitMappingState::Inside
558    implicit_flow_mapping_states: smallvec::SmallVec<[ImplicitMappingState; 8]>,
559    /// If a plain scalar was terminated by a `#` comment on its line, we set this
560    /// to detect an illegal multiline continuation on the following line.
561    interrupted_plain_by_comment: Option<Marker>,
562    /// A stack of markers for opening brackets `[` and `{`.
563    flow_markers: smallvec::SmallVec<[(Marker, char); 8]>,
564    buf_leading_break: String,
565    buf_trailing_breaks: String,
566    buf_whitespaces: String,
567}
568
569impl<'input, T: BorrowedInput<'input>> Iterator for Scanner<'input, T> {
570    type Item = Token<'input>;
571
572    fn next(&mut self) -> Option<Self::Item> {
573        if self.error.is_some() {
574            return None;
575        }
576        match self.next_token() {
577            Ok(Some(tok)) => {
578                debug_print!(
579                    "    \x1B[;32m\u{21B3} {:?} \x1B[;36m{:?}\x1B[;m",
580                    tok.1,
581                    tok.0
582                );
583                Some(tok)
584            }
585            Ok(tok) => tok,
586            Err(e) => {
587                self.error = Some(e);
588                None
589            }
590        }
591    }
592}
593
594/// A convenience alias for scanner functions that may fail without returning a value.
595pub type ScanResult = Result<(), ScanError>;
596
/// Storage for the content of a flow scalar: either a range that may still be borrowed, or an
/// owned string.
#[derive(Debug)]
enum FlowScalarBuf {
    /// Candidate for `Cow::Borrowed`.
    ///
    /// `start..end` is the verbatim range committed so far.
    /// `pending_ws_start..pending_ws_end` is a run of blanks that has been seen but is not
    /// committed yet (it has to be dropped if a line break follows).
    Borrowed {
        start: usize,
        end: usize,
        pending_ws_start: Option<usize>,
        pending_ws_end: usize,
    },
    /// Content that had to be copied into an owned string.
    Owned(String),
}

impl FlowScalarBuf {
    /// Create a borrowed buffer whose committed range is empty and begins at `start`.
    #[inline]
    fn new_borrowed(start: usize) -> Self {
        let (end, pending_ws_end) = (start, start);
        Self::Borrowed {
            start,
            end,
            pending_ws_start: None,
            pending_ws_end,
        }
    }

    /// Create an owned buffer holding an empty string.
    #[inline]
    fn new_owned() -> Self {
        Self::Owned(String::new())
    }

    /// Give mutable access to the string of an owned buffer; `None` for a borrowed buffer.
    #[inline]
    fn as_owned_mut(&mut self) -> Option<&mut String> {
        if let Self::Owned(owned) = self {
            Some(owned)
        } else {
            None
        }
    }

    /// Extend the committed range over the pending blanks, if any. No-op on an owned buffer.
    #[inline]
    fn commit_pending_ws(&mut self) {
        match self {
            Self::Borrowed {
                end,
                pending_ws_start,
                pending_ws_end,
                ..
            } => {
                // `take()` clears the pending marker and tells us whether there was one.
                if pending_ws_start.take().is_some() {
                    *end = *pending_ws_end;
                }
            }
            Self::Owned(_) => {}
        }
    }

    /// Record a run of blanks `ws_start..ws_end` as pending. No-op on an owned buffer.
    ///
    /// The start of an already pending run is kept; only its end is moved to `ws_end`.
    #[inline]
    fn note_pending_ws(&mut self, ws_start: usize, ws_end: usize) {
        match self {
            Self::Borrowed {
                pending_ws_start,
                pending_ws_end,
                ..
            } => {
                *pending_ws_start = pending_ws_start.or(Some(ws_start));
                *pending_ws_end = ws_end;
            }
            Self::Owned(_) => {}
        }
    }

    /// Forget the pending blanks without committing them. No-op on an owned buffer.
    #[inline]
    fn discard_pending_ws(&mut self) {
        match self {
            Self::Borrowed {
                end,
                pending_ws_start,
                pending_ws_end,
                ..
            } => {
                *pending_ws_end = *end;
                *pending_ws_start = None;
            }
            Self::Owned(_) => {}
        }
    }
}
682
683impl<'input, T: BorrowedInput<'input>> Scanner<'input, T> {
684    #[inline]
685    fn promote_flow_scalar_buf_to_owned(
686        &self,
687        start_mark: &Marker,
688        buf: &mut FlowScalarBuf,
689    ) -> Result<(), ScanError> {
690        let FlowScalarBuf::Borrowed {
691            start,
692            end,
693            pending_ws_start: _,
694            pending_ws_end: _,
695        } = *buf
696        else {
697            return Ok(());
698        };
699
700        let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
701            ScanError::new_str(
702                *start_mark,
703                "internal error: input advertised offsets but did not provide a slice",
704            )
705        })?;
706        *buf = FlowScalarBuf::Owned(slice.to_owned());
707        Ok(())
708    }
709    /// Try to borrow a slice from the underlying input.
710    ///
711    /// This method uses the [`BorrowedInput`] trait to safely obtain a slice with the `'input`
712    /// lifetime. For inputs that support zero-copy slicing (like `StrInput`), this returns
713    /// `Some(&'input str)`. For streaming inputs, this returns `None`.
714    #[inline]
715    fn try_borrow_slice(&self, start: usize, end: usize) -> Option<&'input str> {
716        self.input.slice_borrowed(start, end)
717    }
718
719    /// Scan a tag handle for a `%TAG` directive as a `Cow<str>`.
720    ///
721    /// For `StrInput`, this will borrow from the input when possible. For other inputs, or if
722    /// borrowing is not possible, it falls back to allocating.
723    fn scan_tag_handle_directive_cow(
724        &mut self,
725        mark: &Marker,
726    ) -> Result<Cow<'input, str>, ScanError> {
727        let Some(start) = self.input.byte_offset() else {
728            return Ok(Cow::Owned(self.scan_tag_handle(true, mark)?));
729        };
730
731        if self.input.look_ch() != '!' {
732            return Err(ScanError::new_str(
733                *mark,
734                "while scanning a tag, did not find expected '!'",
735            ));
736        }
737
738        // Consume the leading '!'.
739        self.skip_non_blank();
740
741        // Consume ns-word-char (ASCII alphanumeric, '_' or '-') characters.
742        // This mirrors `StrInput::fetch_while_is_alpha` but avoids allocation.
743        self.input.lookahead(1);
744        while self.input.next_is_alpha() {
745            self.skip_non_blank();
746            self.input.lookahead(1);
747        }
748
749        // Optional trailing '!'.
750        if self.input.peek() == '!' {
751            self.skip_non_blank();
752        }
753
754        let Some(end) = self.input.byte_offset() else {
755            // Should be impossible if `byte_offset()` was `Some` above, but keep safe fallback.
756            return Ok(Cow::Owned(self.scan_tag_handle(true, mark)?));
757        };
758
759        let Some(slice) = self.try_borrow_slice(start, end) else {
760            // Fall back to allocating if zero-copy borrow is not available.
761            let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
762                ScanError::new_str(
763                    *mark,
764                    "internal error: input advertised slicing but did not provide a slice",
765                )
766            })?;
767            if !slice.ends_with('!') && slice != "!" {
768                return Err(ScanError::new_str(
769                    *mark,
770                    "while parsing a tag directive, did not find expected '!'",
771                ));
772            }
773            return Ok(Cow::Owned(slice.to_owned()));
774        };
775
776        if !slice.ends_with('!') && slice != "!" {
777            return Err(ScanError::new_str(
778                *mark,
779                "while parsing a tag directive, did not find expected '!'",
780            ));
781        }
782
783        Ok(Cow::Borrowed(slice))
784    }
785
786    /// Scan a tag prefix for a `%TAG` directive as a `Cow<str>`.
787    ///
788    /// This borrows from `StrInput` only when no URI escape sequences are encountered. If a `%`
789    /// escape is present, the prefix must be decoded and therefore allocated.
790    fn scan_tag_prefix_directive_cow(
791        &mut self,
792        start_mark: &Marker,
793    ) -> Result<Cow<'input, str>, ScanError> {
794        let Some(start) = self.input.byte_offset() else {
795            return Ok(Cow::Owned(self.scan_tag_prefix(start_mark)?));
796        };
797
798        // The prefix must start with either '!' (local) or a valid global tag char.
799        if self.input.look_ch() == '!' {
800            self.skip_non_blank();
801        } else if !is_tag_char(self.input.peek()) {
802            return Err(ScanError::new_str(
803                *start_mark,
804                "invalid global tag character",
805            ));
806        } else if self.input.peek() == '%' {
807            // Needs decoding. Fall back to allocating path below.
808        } else {
809            self.skip_non_blank();
810        }
811
812        // Consume URI chars while we can stay in the borrowed path.
813        while is_uri_char(self.input.look_ch()) {
814            if self.input.peek() == '%' {
815                break;
816            }
817            self.skip_non_blank();
818        }
819
820        // If we encountered an escape sequence, we must decode, therefore allocate.
821        if self.input.peek() == '%' {
822            let current = self
823                .input
824                .byte_offset()
825                .expect("byte_offset() must remain available once enabled");
826            let mut out = if let Some(slice) = self.input.slice_bytes(start, current) {
827                slice.to_owned()
828            } else {
829                String::new()
830            };
831
832            while is_uri_char(self.input.look_ch()) {
833                if self.input.peek() == '%' {
834                    out.push(self.scan_uri_escapes(start_mark)?);
835                } else {
836                    out.push(self.input.peek());
837                    self.skip_non_blank();
838                }
839            }
840            return Ok(Cow::Owned(out));
841        }
842
843        let Some(end) = self.input.byte_offset() else {
844            return Ok(Cow::Owned(self.scan_tag_prefix(start_mark)?));
845        };
846
847        let Some(slice) = self.try_borrow_slice(start, end) else {
848            // Fall back to allocating if zero-copy borrow is not available.
849            let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
850                ScanError::new_str(
851                    *start_mark,
852                    "internal error: input advertised slicing but did not provide a slice",
853                )
854            })?;
855            return Ok(Cow::Owned(slice.to_owned()));
856        };
857
858        Ok(Cow::Borrowed(slice))
859    }
860    /// Creates the YAML tokenizer.
861    pub fn new(input: T) -> Self {
862        let initial_byte_offset = input.byte_offset();
863        Scanner {
864            input,
865            mark: Marker::new(0, 1, 0).with_byte_offset(initial_byte_offset),
866            tokens: VecDeque::with_capacity(64),
867            error: None,
868
869            stream_start_produced: false,
870            stream_end_produced: false,
871            adjacent_value_allowed_at: 0,
872            simple_key_allowed: true,
873            simple_keys: smallvec::SmallVec::new(),
874            indent: -1,
875            indents: smallvec::SmallVec::new(),
876            flow_level: 0,
877            tokens_parsed: 0,
878            token_available: false,
879            leading_whitespace: true,
880            flow_mapping_started: false,
881            implicit_flow_mapping_states: smallvec::SmallVec::new(),
882            flow_markers: smallvec::SmallVec::new(),
883            interrupted_plain_by_comment: None,
884
885            buf_leading_break: String::with_capacity(128),
886            buf_trailing_breaks: String::with_capacity(128),
887            buf_whitespaces: String::with_capacity(128),
888        }
889    }
890
891    /// Get a copy of the last error that was encountered, if any.
892    ///
893    /// This does not clear the error state and further calls to [`Self::get_error`] will return (a
894    /// clone of) the same error.
895    #[inline]
896    pub fn get_error(&self) -> Option<ScanError> {
897        self.error.clone()
898    }
899
900    #[cold]
901    fn simple_key_expected(&self) -> ScanError {
902        ScanError::new_str(self.mark, "simple key expected")
903    }
904
905    #[cold]
906    fn unclosed_bracket(mark: Marker, bracket: char) -> ScanError {
907        ScanError::new(mark, format!("unclosed bracket '{bracket}'"))
908    }
909
910    /// Consume the next character. It is assumed the next character is a blank.
911    #[inline]
912    fn skip_blank(&mut self) {
913        self.input.skip();
914
915        self.mark.offsets.chars += 1;
916        self.mark.col += 1;
917        self.mark.offsets.bytes = self.input.byte_offset();
918    }
919
920    /// Consume the next character. It is assumed the next character is not a blank.
921    #[inline]
922    fn skip_non_blank(&mut self) {
923        self.input.skip();
924
925        self.mark.offsets.chars += 1;
926        self.mark.col += 1;
927        self.mark.offsets.bytes = self.input.byte_offset();
928        self.leading_whitespace = false;
929    }
930
931    /// Consume the next characters. It is assumed none of the next characters are blanks.
932    #[inline]
933    fn skip_n_non_blank(&mut self, count: usize) {
934        for _ in 0..count {
935            self.input.skip();
936            self.mark.offsets.chars += 1;
937            self.mark.col += 1;
938        }
939        self.mark.offsets.bytes = self.input.byte_offset();
940        self.leading_whitespace = false;
941    }
942
943    /// Consume the next character. It is assumed the next character is a newline.
944    #[inline]
945    fn skip_nl(&mut self) {
946        self.input.skip();
947
948        self.mark.offsets.chars += 1;
949        self.mark.col = 0;
950        self.mark.line += 1;
951        self.mark.offsets.bytes = self.input.byte_offset();
952        self.leading_whitespace = true;
953    }
954
955    /// Consume a linebreak (either CR, LF or CRLF), if any. Do nothing if there's none.
956    #[inline]
957    fn skip_linebreak(&mut self) {
958        if self.input.next_2_are('\r', '\n') {
959            // While technically not a blank, this does not matter as `self.leading_whitespace`
960            // will be reset by `skip_nl`.
961            self.skip_blank();
962            self.skip_nl();
963        } else if self.input.next_is_break() {
964            self.skip_nl();
965        }
966    }
967
968    /// Return whether the [`TokenType::StreamStart`] event has been emitted.
969    #[inline]
970    pub fn stream_started(&self) -> bool {
971        self.stream_start_produced
972    }
973
974    /// Return whether the [`TokenType::StreamEnd`] event has been emitted.
975    #[inline]
976    pub fn stream_ended(&self) -> bool {
977        self.stream_end_produced
978    }
979
980    /// Get the current position in the input stream.
981    #[inline]
982    pub fn mark(&self) -> Marker {
983        self.mark
984    }
985
986    // Read and consume a line break (either `\r`, `\n` or `\r\n`).
987    //
988    // A `\n` is pushed into `s`.
989    //
990    // # Panics (in debug)
991    // If the next characters do not correspond to a line break.
992    #[inline]
993    fn read_break(&mut self, s: &mut String) {
994        self.skip_break();
995        s.push('\n');
996    }
997
998    // Read and consume a line break (either `\r`, `\n` or `\r\n`).
999    //
1000    // # Panics (in debug)
1001    // If the next characters do not correspond to a line break.
1002    #[inline]
1003    fn skip_break(&mut self) {
1004        let c = self.input.peek();
1005        let nc = self.input.peek_nth(1);
1006        debug_assert!(is_break(c));
1007        if c == '\r' && nc == '\n' {
1008            self.skip_blank();
1009        }
1010        self.skip_nl();
1011    }
1012
1013    /// Insert a token at the given position.
1014    fn insert_token(&mut self, pos: usize, tok: Token<'input>) {
1015        let old_len = self.tokens.len();
1016        assert!(pos <= old_len);
1017        self.tokens.insert(pos, tok);
1018    }
1019
1020    #[inline]
1021    fn allow_simple_key(&mut self) {
1022        self.simple_key_allowed = true;
1023    }
1024
1025    #[inline]
1026    fn disallow_simple_key(&mut self) {
1027        self.simple_key_allowed = false;
1028    }
1029
    /// Fetch the next token in the stream.
    ///
    /// On the very first call this only produces the stream-start token. Afterwards it skips
    /// whitespace and comments, stales simple keys that can no longer be keys, unrolls the
    /// indentation to the current column, and then dispatches on the next character(s) to the
    /// matching `fetch_*` routine, which queues the token(s).
    ///
    /// # Errors
    /// Returns `ScanError` when the scanner does not find the next expected token.
    pub fn fetch_next_token(&mut self) -> ScanResult {
        self.input.lookahead(1);

        // The stream-start token is emitted before any input is examined.
        if !self.stream_start_produced {
            self.fetch_stream_start();
            return Ok(());
        }
        self.skip_to_next_token()?;

        debug_print!(
            "  \x1B[38;5;244m\u{2192} fetch_next_token after whitespace {:?} {:?}\x1B[m",
            self.mark,
            self.input.peek()
        );

        self.stale_simple_keys()?;

        // Close every block level the current column no longer belongs to.
        let mark = self.mark;
        self.unroll_indent(mark.col as isize);

        // Document indicators are the longest construct inspected below.
        self.input.lookahead(4);

        if self.input.next_is_z() {
            self.fetch_stream_end()?;
            return Ok(());
        }

        // Directives and document indicators are only looked for at the start of a line.
        if self.mark.col == 0 {
            if self.input.next_char_is('%') {
                return self.fetch_directive();
            } else if self.input.next_is_document_start() {
                return self.fetch_document_indicator(TokenType::DocumentStart);
            } else if self.input.next_is_document_end() {
                self.fetch_document_indicator(TokenType::DocumentEnd)?;
                // Only blanks, tabs, or whatever else `skip_ws_to_eol` accepts may follow the
                // document end marker on its line.
                self.skip_ws_to_eol(SkipTabs::Yes)?;
                if !self.input.next_is_breakz() {
                    return Err(ScanError::new_str(
                        self.mark,
                        "invalid content after document end marker",
                    ));
                }
                return Ok(());
            }
        }

        // Content to the left of the current indentation is rejected, except for the flow
        // punctuation `]`, `}` and `,` while inside a flow collection.
        if (self.mark.col as isize) < self.indent {
            self.input.lookahead(1);
            let c = self.input.peek();
            if self.flow_level == 0 || !matches!(c, ']' | '}' | ',') {
                return Err(ScanError::new_str(self.mark, "invalid indentation"));
            }
        }

        // Dispatch on the next character `c`, using the one after it (`nc`) to disambiguate
        // indicators from the start of a plain scalar. Arm order matters: guarded arms are tried
        // top to bottom.
        let c = self.input.peek();
        let nc = self.input.peek_nth(1);
        match c {
            '[' => self.fetch_flow_collection_start(TokenType::FlowSequenceStart),
            '{' => self.fetch_flow_collection_start(TokenType::FlowMappingStart),
            ']' => self.fetch_flow_collection_end(TokenType::FlowSequenceEnd),
            '}' => self.fetch_flow_collection_end(TokenType::FlowMappingEnd),
            ',' => self.fetch_flow_entry(),
            // `-`, `?` and `:` are indicators only when followed by a blank, a break or the end
            // of input.
            '-' if is_blank_or_breakz(nc) => self.fetch_block_entry(),
            '?' if is_blank_or_breakz(nc) => self.fetch_key(),
            ':' if is_blank_or_breakz(nc) => self.fetch_value(),
            // Inside a flow collection, `:` is also a value indicator when followed by a flow
            // character or when sitting exactly at `adjacent_value_allowed_at`.
            ':' if self.flow_level > 0
                && (is_flow(nc) || self.mark.index() == self.adjacent_value_allowed_at) =>
            {
                self.fetch_flow_value()
            }
            // Is it an alias?
            '*' => self.fetch_anchor(true),
            // Is it an anchor?
            '&' => self.fetch_anchor(false),
            '!' => self.fetch_tag(),
            // Is it a literal scalar?
            '|' if self.flow_level == 0 => self.fetch_block_scalar(true),
            // Is it a folded scalar?
            '>' if self.flow_level == 0 => self.fetch_block_scalar(false),
            '\'' => self.fetch_flow_scalar(true),
            '"' => self.fetch_flow_scalar(false),
            // plain scalar
            '-' if !is_blank_or_breakz(nc) => self.fetch_plain_scalar(),
            // Outside flow context, `:` and `?` glued to a following character start a plain
            // scalar.
            ':' | '?' if !is_blank_or_breakz(nc) && self.flow_level == 0 => {
                self.fetch_plain_scalar()
            }
            // These characters are rejected as the first character of a token.
            '%' | '@' | '`' => Err(ScanError::new(
                self.mark,
                format!("unexpected character: `{c}'"),
            )),
            // Anything else (including `|` / `>` inside a flow collection) is scanned as a plain
            // scalar.
            _ => self.fetch_plain_scalar(),
        }
    }
1126
1127    /// Return the next token in the stream.
1128    /// # Errors
1129    /// Returns `ScanError` when scanning fails to find an expected next token.
1130    pub fn next_token(&mut self) -> Result<Option<Token<'input>>, ScanError> {
1131        if self.stream_end_produced {
1132            return Ok(None);
1133        }
1134
1135        if !self.token_available {
1136            self.fetch_more_tokens()?;
1137        }
1138        let Some(t) = self.tokens.pop_front() else {
1139            return Err(ScanError::new_str(
1140                self.mark,
1141                "did not find expected next token",
1142            ));
1143        };
1144        self.token_available = false;
1145        self.tokens_parsed += 1;
1146
1147        if let TokenType::StreamEnd = t.1 {
1148            self.stream_end_produced = true;
1149        }
1150        Ok(Some(t))
1151    }
1152
1153    /// Fetch tokens from the token stream.
1154    /// # Errors
1155    /// Returns `ScanError` when loading fails.
1156    pub fn fetch_more_tokens(&mut self) -> ScanResult {
1157        let mut need_more;
1158        loop {
1159            if self.tokens.is_empty() {
1160                need_more = true;
1161            } else {
1162                need_more = false;
1163                // Stale potential keys that we know won't be keys.
1164                self.stale_simple_keys()?;
1165                // If our next token to be emitted may be a key, fetch more context.
1166                for sk in &self.simple_keys {
1167                    if sk.possible && sk.token_number == self.tokens_parsed {
1168                        need_more = true;
1169                        break;
1170                    }
1171                }
1172            }
1173
1174            // Stop fetching immediately after document end/start markers
1175            // to allow the parser to emit the event before reading more content.
1176            if let Some(token) = self.tokens.back() {
1177                if matches!(token.1, TokenType::DocumentEnd | TokenType::DocumentStart) {
1178                    break;
1179                }
1180            }
1181
1182            if !need_more {
1183                break;
1184            }
1185            self.fetch_next_token()?;
1186        }
1187        self.token_available = true;
1188
1189        Ok(())
1190    }
1191
1192    /// Mark simple keys that can no longer be keys as such.
1193    ///
1194    /// This function sets `possible` to `false` to each key that, now we have more context, we
1195    /// know will not be keys.
1196    ///
1197    /// # Errors
1198    /// This function returns an error if one of the key we would stale was required to be a key.
1199    fn stale_simple_keys(&mut self) -> ScanResult {
1200        for sk in &mut self.simple_keys {
1201            if sk.possible
1202                // If not in a flow construct, simple keys cannot span multiple lines.
1203                && self.flow_level == 0
1204                    && (sk.mark.line < self.mark.line
1205                        || sk.mark.index() + 1024 < self.mark.index())
1206            {
1207                if sk.required {
1208                    return Err(ScanError::new_str(self.mark, "simple key expect ':'"));
1209                }
1210                sk.possible = false;
1211            }
1212        }
1213        Ok(())
1214    }
1215
    /// Skip over all whitespace (`\t`, ` `, `\n`, `\r`) and comments until the next token.
    ///
    /// Each line break consumed outside of a flow collection re-allows simple keys. After the
    /// skipping loop, a pending "plain scalar interrupted by a comment" marker (if any) is
    /// consumed and checked: it becomes an error when the very next line could continue that
    /// scalar in block context.
    ///
    /// # Errors
    /// This function returns an error if a tabulation is encountered where there should not be
    /// one, or if a comment interrupted a multi-line plain scalar.
    fn skip_to_next_token(&mut self) -> ScanResult {
        // Hot-path helper: consume a single logical linebreak and apply simple-key rules.
        // (Kept local to ensure the compiler can inline it easily.)
        let consume_linebreak = |this: &mut Self| {
            // Two characters of lookahead are needed to recognize a CRLF pair.
            this.input.lookahead(2);
            this.skip_linebreak();
            // In block context, a new line means a simple key may start again.
            if this.flow_level == 0 {
                this.allow_simple_key();
            }
        };

        loop {
            match self.input.look_ch() {
                // Tabs may not be used as indentation (block context only).
                '\t' => {
                    if self.is_within_block()
                        && self.leading_whitespace
                        && (self.mark.col as isize) < self.indent
                    {
                        // The tab sits in the indentation area: this is only tolerated when the
                        // rest of the line holds no content.
                        self.skip_ws_to_eol(SkipTabs::Yes)?;

                        // If we have content on that line with a tab, return an error.
                        if !self.input.next_is_breakz() {
                            return Err(ScanError::new_str(
                                self.mark,
                                "tabs disallowed within this context (block indentation)",
                            ));
                        }

                        // Micro-opt: if we stopped on a linebreak, consume it now (avoids another loop trip).
                        if matches!(self.input.look_ch(), '\n' | '\r') {
                            consume_linebreak(self);
                        }
                    } else {
                        // Non-indentation tab behaves like blank.
                        self.skip_blank();
                    }
                }

                ' ' => self.skip_blank(),

                '\n' | '\r' => consume_linebreak(self),

                '#' => {
                    // Skip the whole comment payload in one go.
                    let n = self.input.skip_while_non_breakz();
                    // The skipped characters all stay on the current line.
                    self.mark.offsets.chars += n;
                    self.mark.col += n;
                    self.mark.offsets.bytes = self.input.byte_offset();

                    // Micro-opt: comment-only lines are common; consume the following linebreak here.
                    if matches!(self.input.look_ch(), '\n' | '\r') {
                        consume_linebreak(self);
                    }
                }

                // Anything else is the start of the next token.
                _ => break,
            }
        }

        // If a plain scalar was interrupted by a comment, and the next line could
        // continue the scalar in block context, this is invalid.
        // `take()` clears the marker, so the check is performed at most once per interruption.
        if let Some(err_mark) = self.interrupted_plain_by_comment.take() {
            // BS4K should only trigger when the continuation would start on the immediate next
            // line (no intervening empty/comment-only lines). A blank line resets the folding
            // opportunity and thus should not error.
            let is_immediate_next_line = self.mark.line == err_mark.line + 1;

            // Optimization: do the cheap checks first; only then request extra lookahead / do deeper checks.
            if self.flow_level == 0
                && is_immediate_next_line
                && (self.mark.col as isize) > self.indent
            {
                // Ensure enough lookahead for:
                // - the checks below (peek/peek_nth)
                // - document indicator detection which needs 4 chars.
                self.input.lookahead(4);

                // End of input and document indicators never continue a scalar; anything that
                // could start a plain scalar would.
                if !self.input.next_is_z()
                    && !self.input.next_is_document_indicator()
                    && self.input.next_can_be_plain_scalar(false)
                {
                    // The error is reported at the position recorded for the interruption.
                    return Err(ScanError::new_str(
                        err_mark,
                        "comment intercepting the multiline text",
                    ));
                }
            }
        }

        Ok(())
    }
1313
1314    /// Skip over YAML whitespace (` `, `\n`, `\r`).
1315    ///
1316    /// # Errors
1317    /// This function returns an error if no whitespace was found.
1318    fn skip_yaml_whitespace(&mut self) -> ScanResult {
1319        let mut need_whitespace = true;
1320        loop {
1321            match self.input.look_ch() {
1322                ' ' => {
1323                    self.skip_blank();
1324
1325                    need_whitespace = false;
1326                }
1327                '\n' | '\r' => {
1328                    self.input.lookahead(2);
1329                    self.skip_linebreak();
1330                    if self.flow_level == 0 {
1331                        self.allow_simple_key();
1332                    }
1333                    need_whitespace = false;
1334                }
1335                '#' => {
1336                    let comment_length = self.input.skip_while_non_breakz();
1337                    self.mark.offsets.chars += comment_length;
1338                    self.mark.col += comment_length;
1339                    self.mark.offsets.bytes = self.input.byte_offset();
1340                }
1341                _ => break,
1342            }
1343        }
1344
1345        if need_whitespace {
1346            Err(ScanError::new_str(self.mark(), "expected whitespace"))
1347        } else {
1348            Ok(())
1349        }
1350    }
1351
1352    fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> Result<SkipTabs, ScanError> {
1353        let (n_bytes, result) = self.input.skip_ws_to_eol(skip_tabs);
1354        self.mark.col += n_bytes;
1355        self.mark.offsets.chars += n_bytes;
1356        self.mark.offsets.bytes = self.input.byte_offset();
1357        result.map_err(|msg| ScanError::new_str(self.mark, msg))
1358    }
1359
1360    fn fetch_stream_start(&mut self) {
1361        let mark = self.mark;
1362        self.indent = -1;
1363        self.stream_start_produced = true;
1364        self.allow_simple_key();
1365        self.tokens.push_back(Token(
1366            Span::empty(mark),
1367            TokenType::StreamStart(TEncoding::Utf8),
1368        ));
1369        self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0)));
1370    }
1371
1372    fn fetch_stream_end(&mut self) -> ScanResult {
1373        // force new line
1374        if self.mark.col != 0 {
1375            self.mark.col = 0;
1376            self.mark.line += 1;
1377        }
1378
1379        if let Some((mark, bracket)) = self.flow_markers.pop() {
1380            return Err(Self::unclosed_bracket(mark, bracket));
1381        }
1382
1383        // If the stream ended, we won't have more context. We can stall all the simple keys we
1384        // had. If one was required, however, that was an error and we must propagate it.
1385        for sk in &mut self.simple_keys {
1386            if sk.required && sk.possible {
1387                return Err(self.simple_key_expected());
1388            }
1389            sk.possible = false;
1390        }
1391
1392        self.unroll_indent(-1);
1393        self.remove_simple_key()?;
1394        self.disallow_simple_key();
1395
1396        self.tokens
1397            .push_back(Token(Span::empty(self.mark), TokenType::StreamEnd));
1398        Ok(())
1399    }
1400
1401    fn fetch_directive(&mut self) -> ScanResult {
1402        self.unroll_indent(-1);
1403        self.remove_simple_key()?;
1404
1405        self.disallow_simple_key();
1406
1407        let tok = self.scan_directive()?;
1408        self.tokens.push_back(tok);
1409
1410        Ok(())
1411    }
1412
1413    fn scan_directive(&mut self) -> Result<Token<'input>, ScanError> {
1414        let start_mark = self.mark;
1415        self.skip_non_blank();
1416
1417        let name = self.scan_directive_name()?;
1418        let tok = match name.as_ref() {
1419            "YAML" => self.scan_version_directive_value(&start_mark)?,
1420            "TAG" => self.scan_tag_directive_value(&start_mark)?,
1421            _ => {
1422                let mut params = Vec::new();
1423                while self.input.next_is_blank() {
1424                    let n_blanks = self.input.skip_while_blank();
1425                    self.mark.offsets.chars += n_blanks;
1426                    self.mark.col += n_blanks;
1427                    self.mark.offsets.bytes = self.input.byte_offset();
1428
1429                    if !is_blank_or_breakz(self.input.peek()) {
1430                        let mut param = String::new();
1431                        let n_chars = self.input.fetch_while_is_yaml_non_space(&mut param);
1432                        self.mark.offsets.chars += n_chars;
1433                        self.mark.col += n_chars;
1434                        self.mark.offsets.bytes = self.input.byte_offset();
1435                        params.push(param);
1436                    }
1437                }
1438
1439                Token(
1440                    Span::new(start_mark, self.mark),
1441                    TokenType::ReservedDirective(name, params),
1442                )
1443            }
1444        };
1445
1446        self.skip_ws_to_eol(SkipTabs::Yes)?;
1447
1448        if self.input.next_is_breakz() {
1449            self.input.lookahead(2);
1450            self.skip_linebreak();
1451            Ok(tok)
1452        } else {
1453            Err(ScanError::new_str(
1454                start_mark,
1455                "while scanning a directive, did not find expected comment or line break",
1456            ))
1457        }
1458    }
1459
1460    fn scan_version_directive_value(&mut self, mark: &Marker) -> Result<Token<'input>, ScanError> {
1461        let n_blanks = self.input.skip_while_blank();
1462        self.mark.offsets.chars += n_blanks;
1463        self.mark.col += n_blanks;
1464        self.mark.offsets.bytes = self.input.byte_offset();
1465
1466        let major = self.scan_version_directive_number(mark)?;
1467
1468        if self.input.peek() != '.' {
1469            return Err(ScanError::new_str(
1470                *mark,
1471                "while scanning a YAML directive, did not find expected digit or '.' character",
1472            ));
1473        }
1474        self.skip_non_blank();
1475
1476        let minor = self.scan_version_directive_number(mark)?;
1477
1478        Ok(Token(
1479            Span::new(*mark, self.mark),
1480            TokenType::VersionDirective(major, minor),
1481        ))
1482    }
1483
1484    fn scan_directive_name(&mut self) -> Result<String, ScanError> {
1485        let start_mark = self.mark;
1486        let mut string = String::new();
1487
1488        let n_chars = self.input.fetch_while_is_yaml_non_space(&mut string);
1489        self.mark.offsets.chars += n_chars;
1490        self.mark.col += n_chars;
1491        self.mark.offsets.bytes = self.input.byte_offset();
1492
1493        if string.is_empty() {
1494            return Err(ScanError::new_str(
1495                start_mark,
1496                "while scanning a directive, could not find expected directive name",
1497            ));
1498        }
1499
1500        if !is_blank_or_breakz(self.input.peek()) {
1501            return Err(ScanError::new_str(
1502                start_mark,
1503                "while scanning a directive, found unexpected non-alphabetical character",
1504            ));
1505        }
1506
1507        Ok(string)
1508    }
1509
1510    fn scan_version_directive_number(&mut self, mark: &Marker) -> Result<u32, ScanError> {
1511        let mut val = 0u32;
1512        let mut length = 0usize;
1513        while let Some(digit) = self.input.look_ch().to_digit(10) {
1514            if length + 1 > 9 {
1515                return Err(ScanError::new_str(
1516                    *mark,
1517                    "while scanning a YAML directive, found extremely long version number",
1518                ));
1519            }
1520            length += 1;
1521            val = val * 10 + digit;
1522            self.skip_non_blank();
1523        }
1524
1525        if length == 0 {
1526            return Err(ScanError::new_str(
1527                *mark,
1528                "while scanning a YAML directive, did not find expected version number",
1529            ));
1530        }
1531
1532        Ok(val)
1533    }
1534
1535    fn scan_tag_directive_value(&mut self, mark: &Marker) -> Result<Token<'input>, ScanError> {
1536        let n_blanks = self.input.skip_while_blank();
1537        self.mark.offsets.chars += n_blanks;
1538        self.mark.col += n_blanks;
1539        self.mark.offsets.bytes = self.input.byte_offset();
1540
1541        let handle = self.scan_tag_handle_directive_cow(mark)?;
1542
1543        let n_blanks = self.input.skip_while_blank();
1544        self.mark.offsets.chars += n_blanks;
1545        self.mark.col += n_blanks;
1546        self.mark.offsets.bytes = self.input.byte_offset();
1547
1548        let prefix = self.scan_tag_prefix_directive_cow(mark)?;
1549
1550        self.input.lookahead(1);
1551
1552        if self.input.next_is_blank_or_breakz() {
1553            Ok(Token(
1554                Span::new(*mark, self.mark),
1555                TokenType::TagDirective(handle, prefix),
1556            ))
1557        } else {
1558            Err(ScanError::new_str(
1559                *mark,
1560                "while scanning TAG, did not find expected whitespace or line break",
1561            ))
1562        }
1563    }
1564
1565    fn fetch_tag(&mut self) -> ScanResult {
1566        self.save_simple_key();
1567        self.disallow_simple_key();
1568
1569        let tok = self.scan_tag()?;
1570        self.tokens.push_back(tok);
1571        Ok(())
1572    }
1573
    /// Scan a tag and return it as a `TokenType::Tag(handle, suffix)` token.
    ///
    /// Three forms are handled: a verbatim tag (second character is `<`), a `!handle!suffix`
    /// shorthand, and a `!suffix` shorthand. A lone `!` is reported with an empty handle and
    /// `!` as suffix. Handle and suffix are borrowed from the input when possible.
    ///
    /// When the input cannot report a byte offset, the work is delegated to
    /// [`Self::scan_tag_owned`].
    ///
    /// # Errors
    /// Returns an error if any part of the tag fails to scan, or if the tag is not followed by
    /// a blank, a line break, the end of the input or (inside a flow collection) a flow
    /// character.
    fn scan_tag(&mut self) -> Result<Token<'input>, ScanError> {
        let start_mark = self.mark;

        // Check if the tag is in the canonical form (verbatim).
        self.input.lookahead(2);

        // If byte_offset is not available, use the original owned-only path.
        if self.input.byte_offset().is_none() {
            return self.scan_tag_owned(&start_mark);
        }

        let (handle, suffix): (Cow<'input, str>, Cow<'input, str>) =
            if self.input.nth_char_is(1, '<') {
                // Verbatim tags always need owned strings (URI escapes).
                let suffix = self.scan_verbatim_tag(&start_mark)?;
                (Cow::Owned(String::new()), Cow::Owned(suffix))
            } else {
                // The tag has either the '!suffix' or the '!handle!suffix'
                let handle = self.scan_tag_handle_cow(&start_mark)?;
                // Check if it is, indeed, handle.
                // (A real handle is at least two bytes long and both starts and ends with '!'.)
                if handle.len() >= 2 && handle.starts_with('!') && handle.ends_with('!') {
                    // A tag handle starting with "!!" is a secondary tag handle.
                    let suffix = self.scan_tag_shorthand_suffix_cow(&start_mark)?;
                    (handle, suffix)
                } else {
                    // Not a real handle, it's part of the suffix.
                    // E.g., "!foo" -> handle="!", suffix="foo"
                    // The "handle" we scanned is actually "!" + suffix_part1.
                    // We need to also scan any remaining suffix characters.
                    let remaining_suffix = self.scan_tag_shorthand_suffix_cow(&start_mark)?;

                    // Extract suffix from handle (skip leading '!') and combine with remaining.
                    let suffix = if handle.len() > 1 {
                        if remaining_suffix.is_empty() {
                            // The suffix is just what's in handle after '!'
                            // (a borrowed handle stays borrowed; an owned one is re-allocated).
                            match handle {
                                Cow::Borrowed(s) => Cow::Borrowed(&s[1..]),
                                Cow::Owned(s) => Cow::Owned(s[1..].to_owned()),
                            }
                        } else {
                            // Combine handle (minus leading '!') with remaining suffix.
                            // Joining two pieces always requires an allocation.
                            let mut combined = handle[1..].to_owned();
                            combined.push_str(&remaining_suffix);
                            Cow::Owned(combined)
                        }
                    } else {
                        // handle is just "!", suffix is whatever we scanned after
                        remaining_suffix
                    };

                    // A special case: the '!' tag.  Set the handle to '' and the
                    // suffix to '!'.
                    if suffix.is_empty() {
                        (Cow::Borrowed(""), Cow::Borrowed("!"))
                    } else {
                        (Cow::Borrowed("!"), suffix)
                    }
                }
            };

        // The tag must be terminated by a blank, a break, the end of input or, in flow context,
        // a flow character.
        if is_blank_or_breakz(self.input.look_ch())
            || (self.flow_level > 0 && self.input.next_is_flow())
        {
            // XXX: ex 7.2, an empty scalar can follow a secondary tag
            Ok(Token(
                Span::new(start_mark, self.mark),
                TokenType::Tag(handle, suffix),
            ))
        } else {
            Err(ScanError::new_str(
                start_mark,
                "while scanning a tag, did not find expected whitespace or line break",
            ))
        }
    }
1649
1650    /// Original owned-only tag scanning path for inputs without `byte_offset` support.
1651    fn scan_tag_owned(&mut self, start_mark: &Marker) -> Result<Token<'input>, ScanError> {
1652        let mut handle = String::new();
1653        let mut suffix;
1654
1655        if self.input.nth_char_is(1, '<') {
1656            suffix = self.scan_verbatim_tag(start_mark)?;
1657        } else {
1658            // The tag has either the '!suffix' or the '!handle!suffix'
1659            handle = self.scan_tag_handle(false, start_mark)?;
1660            // Check if it is, indeed, handle.
1661            if handle.len() >= 2 && handle.starts_with('!') && handle.ends_with('!') {
1662                // A tag handle starting with "!!" is a secondary tag handle.
1663                let is_secondary_handle = handle == "!!";
1664                suffix =
1665                    self.scan_tag_shorthand_suffix(false, is_secondary_handle, "", start_mark)?;
1666            } else {
1667                suffix = self.scan_tag_shorthand_suffix(false, false, &handle, start_mark)?;
1668                "!".clone_into(&mut handle);
1669                // A special case: the '!' tag.  Set the handle to '' and the
1670                // suffix to '!'.
1671                if suffix.is_empty() {
1672                    handle.clear();
1673                    "!".clone_into(&mut suffix);
1674                }
1675            }
1676        }
1677
1678        if is_blank_or_breakz(self.input.look_ch())
1679            || (self.flow_level > 0 && self.input.next_is_flow())
1680        {
1681            // XXX: ex 7.2, an empty scalar can follow a secondary tag
1682            Ok(Token(
1683                Span::new(*start_mark, self.mark),
1684                TokenType::Tag(handle.into(), suffix.into()),
1685            ))
1686        } else {
1687            Err(ScanError::new_str(
1688                *start_mark,
1689                "while scanning a tag, did not find expected whitespace or line break",
1690            ))
1691        }
1692    }
1693
1694    /// Scan a tag handle as a `Cow<str>`, borrowing when possible.
1695    ///
1696    /// Tag handles are of the form `!`, `!!`, or `!name!` where name is ASCII alphanumeric.
1697    /// Since they contain no escape sequences, they can always be borrowed from `StrInput`.
1698    fn scan_tag_handle_cow(&mut self, mark: &Marker) -> Result<Cow<'input, str>, ScanError> {
1699        let Some(start) = self.input.byte_offset() else {
1700            return Ok(Cow::Owned(self.scan_tag_handle(false, mark)?));
1701        };
1702
1703        if self.input.look_ch() != '!' {
1704            return Err(ScanError::new_str(
1705                *mark,
1706                "while scanning a tag, did not find expected '!'",
1707            ));
1708        }
1709
1710        // Consume the leading '!'.
1711        self.skip_non_blank();
1712
1713        // Consume ns-word-char (ASCII alphanumeric, '_' or '-') characters.
1714        self.input.lookahead(1);
1715        while self.input.next_is_alpha() {
1716            self.skip_non_blank();
1717            self.input.lookahead(1);
1718        }
1719
1720        // Optional trailing '!'.
1721        if self.input.peek() == '!' {
1722            self.skip_non_blank();
1723        }
1724
1725        let Some(end) = self.input.byte_offset() else {
1726            return Ok(Cow::Owned(self.scan_tag_handle(false, mark)?));
1727        };
1728
1729        if let Some(slice) = self.try_borrow_slice(start, end) {
1730            Ok(Cow::Borrowed(slice))
1731        } else {
1732            let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
1733                ScanError::new_str(
1734                    *mark,
1735                    "internal error: input advertised slicing but did not provide a slice",
1736                )
1737            })?;
1738            Ok(Cow::Owned(slice.to_owned()))
1739        }
1740    }
1741
1742    /// Scan a tag shorthand suffix as a `Cow<str>`, borrowing when possible.
1743    ///
1744    /// The suffix can be borrowed only if no `%` URI escape sequences are present.
1745    fn scan_tag_shorthand_suffix_cow(
1746        &mut self,
1747        mark: &Marker,
1748    ) -> Result<Cow<'input, str>, ScanError> {
1749        let Some(start) = self.input.byte_offset() else {
1750            return Ok(Cow::Owned(
1751                self.scan_tag_shorthand_suffix(false, false, "", mark)?,
1752            ));
1753        };
1754
1755        // Scan tag characters, checking for URI escapes.
1756        while is_tag_char(self.input.look_ch()) {
1757            if self.input.peek() == '%' {
1758                // URI escape found - must decode, so fall back to owned path.
1759                let current = self
1760                    .input
1761                    .byte_offset()
1762                    .expect("byte_offset() must remain available once enabled");
1763                let mut out = if let Some(slice) = self.input.slice_bytes(start, current) {
1764                    slice.to_owned()
1765                } else {
1766                    String::new()
1767                };
1768
1769                // Continue scanning with owned buffer.
1770                while is_tag_char(self.input.look_ch()) {
1771                    if self.input.peek() == '%' {
1772                        out.push(self.scan_uri_escapes(mark)?);
1773                    } else {
1774                        out.push(self.input.peek());
1775                        self.skip_non_blank();
1776                    }
1777                }
1778                return Ok(Cow::Owned(out));
1779            }
1780            self.skip_non_blank();
1781        }
1782
1783        let Some(end) = self.input.byte_offset() else {
1784            return Ok(Cow::Owned(
1785                self.scan_tag_shorthand_suffix(false, false, "", mark)?,
1786            ));
1787        };
1788
1789        if let Some(slice) = self.try_borrow_slice(start, end) {
1790            Ok(Cow::Borrowed(slice))
1791        } else {
1792            let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
1793                ScanError::new_str(
1794                    *mark,
1795                    "internal error: input advertised slicing but did not provide a slice",
1796                )
1797            })?;
1798            Ok(Cow::Owned(slice.to_owned()))
1799        }
1800    }
1801
1802    fn scan_tag_handle(&mut self, directive: bool, mark: &Marker) -> Result<String, ScanError> {
1803        let mut string = String::new();
1804        if self.input.look_ch() != '!' {
1805            return Err(ScanError::new_str(
1806                *mark,
1807                "while scanning a tag, did not find expected '!'",
1808            ));
1809        }
1810
1811        string.push(self.input.peek());
1812        self.skip_non_blank();
1813
1814        let n_chars = self.input.fetch_while_is_alpha(&mut string);
1815        self.mark.offsets.chars += n_chars;
1816        self.mark.col += n_chars;
1817        self.mark.offsets.bytes = self.input.byte_offset();
1818
1819        // Check if the trailing character is '!' and copy it.
1820        if self.input.peek() == '!' {
1821            string.push(self.input.peek());
1822            self.skip_non_blank();
1823        } else if directive && string != "!" {
1824            // It's either the '!' tag or not really a tag handle.  If it's a %TAG
1825            // directive, it's an error.  If it's a tag token, it must be a part of
1826            // URI.
1827            return Err(ScanError::new_str(
1828                *mark,
1829                "while parsing a tag directive, did not find expected '!'",
1830            ));
1831        }
1832        Ok(string)
1833    }
1834
1835    /// Scan for a tag prefix (6.8.2.2).
1836    ///
1837    /// There are 2 kinds of tag prefixes:
1838    ///   - Local: Starts with a `!`, contains only URI chars (`!foo`)
1839    ///   - Global: Starts with a tag char, contains then URI chars (`!foo,2000:app/`)
1840    fn scan_tag_prefix(&mut self, start_mark: &Marker) -> Result<String, ScanError> {
1841        let mut string = String::new();
1842
1843        if self.input.look_ch() == '!' {
1844            // If we have a local tag, insert and skip `!`.
1845            string.push(self.input.peek());
1846            self.skip_non_blank();
1847        } else if !is_tag_char(self.input.peek()) {
1848            // Otherwise, check if the first global tag character is valid.
1849            return Err(ScanError::new_str(
1850                *start_mark,
1851                "invalid global tag character",
1852            ));
1853        } else if self.input.peek() == '%' {
1854            // If it is valid and an escape sequence, escape it.
1855            string.push(self.scan_uri_escapes(start_mark)?);
1856        } else {
1857            // Otherwise, push the first character.
1858            string.push(self.input.peek());
1859            self.skip_non_blank();
1860        }
1861
1862        while is_uri_char(self.input.look_ch()) {
1863            if self.input.peek() == '%' {
1864                string.push(self.scan_uri_escapes(start_mark)?);
1865            } else {
1866                string.push(self.input.peek());
1867                self.skip_non_blank();
1868            }
1869        }
1870
1871        Ok(string)
1872    }
1873
1874    /// Scan for a verbatim tag.
1875    ///
1876    /// The prefixing `!<` must _not_ have been skipped.
1877    fn scan_verbatim_tag(&mut self, start_mark: &Marker) -> Result<String, ScanError> {
1878        // Eat `!<`
1879        self.skip_non_blank();
1880        self.skip_non_blank();
1881
1882        let mut string = String::new();
1883        while is_uri_char(self.input.look_ch()) {
1884            if self.input.peek() == '%' {
1885                string.push(self.scan_uri_escapes(start_mark)?);
1886            } else {
1887                string.push(self.input.peek());
1888                self.skip_non_blank();
1889            }
1890        }
1891
1892        if self.input.peek() != '>' {
1893            return Err(ScanError::new_str(
1894                *start_mark,
1895                "while scanning a verbatim tag, did not find the expected '>'",
1896            ));
1897        }
1898        self.skip_non_blank();
1899
1900        Ok(string)
1901    }
1902
1903    fn scan_tag_shorthand_suffix(
1904        &mut self,
1905        _directive: bool,
1906        _is_secondary: bool,
1907        head: &str,
1908        mark: &Marker,
1909    ) -> Result<String, ScanError> {
1910        let mut length = head.len();
1911        let mut string = String::new();
1912
1913        // Copy the head if needed.
1914        // Note that we don't copy the leading '!' character.
1915        if length > 1 {
1916            string.extend(head.chars().skip(1));
1917        }
1918
1919        while is_tag_char(self.input.look_ch()) {
1920            // Check if it is a URI-escape sequence.
1921            if self.input.peek() == '%' {
1922                string.push(self.scan_uri_escapes(mark)?);
1923            } else {
1924                string.push(self.input.peek());
1925                self.skip_non_blank();
1926            }
1927
1928            length += 1;
1929        }
1930
1931        if length == 0 {
1932            return Err(ScanError::new_str(
1933                *mark,
1934                "while parsing a tag, did not find expected tag URI",
1935            ));
1936        }
1937
1938        Ok(string)
1939    }
1940
1941    fn scan_uri_escapes(&mut self, mark: &Marker) -> Result<char, ScanError> {
1942        let mut width = 0usize;
1943        let mut code = 0u32;
1944        loop {
1945            self.input.lookahead(3);
1946
1947            let c = self.input.peek_nth(1);
1948            let nc = self.input.peek_nth(2);
1949
1950            if !(self.input.peek() == '%' && is_hex(c) && is_hex(nc)) {
1951                return Err(ScanError::new_str(
1952                    *mark,
1953                    "while parsing a tag, found an invalid escape sequence",
1954                ));
1955            }
1956
1957            let byte = (as_hex(c) << 4) + as_hex(nc);
1958            if width == 0 {
1959                width = match byte {
1960                    _ if byte & 0x80 == 0x00 => 1,
1961                    _ if byte & 0xE0 == 0xC0 => 2,
1962                    _ if byte & 0xF0 == 0xE0 => 3,
1963                    _ if byte & 0xF8 == 0xF0 => 4,
1964                    _ => {
1965                        return Err(ScanError::new_str(
1966                            *mark,
1967                            "while parsing a tag, found an incorrect leading UTF-8 byte",
1968                        ));
1969                    }
1970                };
1971                code = byte;
1972            } else {
1973                if byte & 0xc0 != 0x80 {
1974                    return Err(ScanError::new_str(
1975                        *mark,
1976                        "while parsing a tag, found an incorrect trailing UTF-8 byte",
1977                    ));
1978                }
1979                code = (code << 8) + byte;
1980            }
1981
1982            self.skip_n_non_blank(3);
1983
1984            width -= 1;
1985            if width == 0 {
1986                break;
1987            }
1988        }
1989
1990        match char::from_u32(code) {
1991            Some(ch) => Ok(ch),
1992            None => Err(ScanError::new_str(
1993                *mark,
1994                "while parsing a tag, found an invalid UTF-8 codepoint",
1995            )),
1996        }
1997    }
1998
1999    fn fetch_anchor(&mut self, alias: bool) -> ScanResult {
2000        self.save_simple_key();
2001        self.disallow_simple_key();
2002
2003        let tok = self.scan_anchor(alias)?;
2004
2005        self.tokens.push_back(tok);
2006
2007        Ok(())
2008    }
2009
2010    fn scan_anchor(&mut self, alias: bool) -> Result<Token<'input>, ScanError> {
2011        let start_mark = self.mark;
2012
2013        // Skip `&` / `*`.
2014        self.skip_non_blank();
2015
2016        // Borrow from input when possible.
2017        if let Some(start) = self.input.byte_offset() {
2018            while is_anchor_char(self.input.look_ch()) {
2019                self.skip_non_blank();
2020            }
2021
2022            let end = self
2023                .input
2024                .byte_offset()
2025                .expect("byte_offset() must remain available once enabled");
2026
2027            if start == end {
2028                return Err(ScanError::new_str(start_mark, "while scanning an anchor or alias, did not find expected alphabetic or numeric character"));
2029            }
2030
2031            let cow = if let Some(slice) = self.try_borrow_slice(start, end) {
2032                Cow::Borrowed(slice)
2033            } else if let Some(slice) = self.input.slice_bytes(start, end) {
2034                Cow::Owned(slice.to_owned())
2035            } else {
2036                return Err(ScanError::new_str(
2037                    start_mark,
2038                    "internal error: input advertised slicing but did not provide a slice",
2039                ));
2040            };
2041
2042            let tok = if alias {
2043                TokenType::Alias(cow)
2044            } else {
2045                TokenType::Anchor(cow)
2046            };
2047            return Ok(Token(Span::new(start_mark, self.mark), tok));
2048        }
2049
2050        let mut string = String::new();
2051        while is_anchor_char(self.input.look_ch()) {
2052            string.push(self.input.peek());
2053            self.skip_non_blank();
2054        }
2055
2056        if string.is_empty() {
2057            return Err(ScanError::new_str(start_mark, "while scanning an anchor or alias, did not find expected alphabetic or numeric character"));
2058        }
2059
2060        let tok = if alias {
2061            TokenType::Alias(string.into())
2062        } else {
2063            TokenType::Anchor(string.into())
2064        };
2065        Ok(Token(Span::new(start_mark, self.mark), tok))
2066    }
2067
2068    fn fetch_flow_collection_start(&mut self, tok: TokenType<'input>) -> ScanResult {
2069        // The indicators '[' and '{' may start a simple key.
2070        self.save_simple_key();
2071
2072        let start_mark = self.mark;
2073        let indicator = self.input.peek();
2074        self.flow_markers.push((start_mark, indicator));
2075
2076        self.roll_one_col_indent();
2077        self.increase_flow_level()?;
2078
2079        self.allow_simple_key();
2080
2081        self.skip_non_blank();
2082
2083        if tok == TokenType::FlowMappingStart {
2084            self.flow_mapping_started = true;
2085        } else {
2086            self.implicit_flow_mapping_states
2087                .push(ImplicitMappingState::Possible);
2088        }
2089
2090        self.skip_ws_to_eol(SkipTabs::Yes)?;
2091
2092        self.tokens
2093            .push_back(Token(Span::new(start_mark, self.mark), tok));
2094        Ok(())
2095    }
2096
2097    fn fetch_flow_collection_end(&mut self, tok: TokenType<'input>) -> ScanResult {
2098        // A closing bracket without a corresponding opening is invalid YAML.
2099        if self.flow_level == 0 {
2100            return Err(ScanError::new_str(self.mark, "misplaced bracket"));
2101        }
2102
2103        let flow_level = self.flow_level;
2104
2105        self.flow_markers.pop();
2106        self.remove_simple_key()?;
2107
2108        if matches!(tok, TokenType::FlowSequenceEnd) {
2109            self.end_implicit_mapping(self.mark, flow_level);
2110            // We are out exiting the flow sequence, nesting goes down 1 level.
2111            self.implicit_flow_mapping_states.pop();
2112        }
2113
2114        self.decrease_flow_level();
2115
2116        self.disallow_simple_key();
2117
2118        let start_mark = self.mark;
2119        self.skip_non_blank();
2120        self.skip_ws_to_eol(SkipTabs::Yes)?;
2121
2122        // A flow collection within a flow mapping can be a key. In that case, the value may be
2123        // adjacent to the `:`.
2124        // ```yaml
2125        // - [ {a: b}:value ]
2126        // ```
2127        if self.flow_level > 0 {
2128            self.adjacent_value_allowed_at = self.mark.index();
2129        }
2130
2131        self.tokens
2132            .push_back(Token(Span::new(start_mark, self.mark), tok));
2133        Ok(())
2134    }
2135
2136    /// Push the `FlowEntry` token and skip over the `,`.
2137    fn fetch_flow_entry(&mut self) -> ScanResult {
2138        self.remove_simple_key()?;
2139        self.allow_simple_key();
2140
2141        self.end_implicit_mapping(self.mark, self.flow_level);
2142
2143        let start_mark = self.mark;
2144        self.skip_non_blank();
2145        self.skip_ws_to_eol(SkipTabs::Yes)?;
2146
2147        self.tokens.push_back(Token(
2148            Span::new(start_mark, self.mark),
2149            TokenType::FlowEntry,
2150        ));
2151        Ok(())
2152    }
2153
2154    fn increase_flow_level(&mut self) -> ScanResult {
2155        self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0)));
2156        self.flow_level = self
2157            .flow_level
2158            .checked_add(1)
2159            .ok_or_else(|| ScanError::new_str(self.mark, "recursion limit exceeded"))?;
2160        Ok(())
2161    }
2162
2163    fn decrease_flow_level(&mut self) {
2164        if self.flow_level > 0 {
2165            self.flow_level -= 1;
2166            self.simple_keys.pop().unwrap();
2167        }
2168    }
2169
2170    /// Push the `Block*` token(s) and skip over the `-`.
2171    ///
2172    /// Add an indentation level and push a `BlockSequenceStart` token if needed, then push a
2173    /// `BlockEntry` token.
2174    /// This function only skips over the `-` and does not fetch the entry value.
2175    fn fetch_block_entry(&mut self) -> ScanResult {
2176        if self.flow_level > 0 {
2177            // - * only allowed in block
2178            return Err(ScanError::new_str(
2179                self.mark,
2180                r#""-" is only valid inside a block"#,
2181            ));
2182        }
2183        // Check if we are allowed to start a new entry.
2184        if !self.simple_key_allowed {
2185            return Err(ScanError::new_str(
2186                self.mark,
2187                "block sequence entries are not allowed in this context",
2188            ));
2189        }
2190
2191        // ???, fixes test G9HC.
2192        if let Some(Token(span, TokenType::Anchor(..) | TokenType::Tag(..))) = self.tokens.back() {
2193            if self.mark.col == 0 && span.start.col == 0 && self.indent > -1 {
2194                return Err(ScanError::new_str(
2195                    span.start,
2196                    "invalid indentation for anchor",
2197                ));
2198            }
2199        }
2200
2201        // Skip over the `-`.
2202        let mark = self.mark;
2203        self.skip_non_blank();
2204
2205        // generate BLOCK-SEQUENCE-START if indented
2206        self.roll_indent(mark.col, None, TokenType::BlockSequenceStart, mark);
2207        let found_tabs = self.skip_ws_to_eol(SkipTabs::Yes)?.found_tabs();
2208        self.input.lookahead(2);
2209        if found_tabs && self.input.next_char_is('-') && is_blank_or_breakz(self.input.peek_nth(1))
2210        {
2211            return Err(ScanError::new_str(
2212                self.mark,
2213                "'-' must be followed by a valid YAML whitespace",
2214            ));
2215        }
2216
2217        self.skip_ws_to_eol(SkipTabs::No)?;
2218        self.input.lookahead(1);
2219        if self.input.next_is_break() || self.input.next_is_flow() {
2220            self.roll_one_col_indent();
2221        }
2222
2223        self.remove_simple_key()?;
2224        self.allow_simple_key();
2225
2226        self.tokens
2227            .push_back(Token(Span::empty(self.mark), TokenType::BlockEntry));
2228
2229        Ok(())
2230    }
2231
2232    fn fetch_document_indicator(&mut self, t: TokenType<'input>) -> ScanResult {
2233        if let Some((mark, bracket)) = self.flow_markers.pop() {
2234            return Err(ScanError::new(
2235                mark,
2236                format!("unclosed bracket '{bracket}'"),
2237            ));
2238        }
2239
2240        self.unroll_indent(-1);
2241        self.remove_simple_key()?;
2242        self.disallow_simple_key();
2243
2244        let mark = self.mark;
2245
2246        self.skip_n_non_blank(3);
2247
2248        self.tokens.push_back(Token(Span::new(mark, self.mark), t));
2249        Ok(())
2250    }
2251
2252    fn fetch_block_scalar(&mut self, literal: bool) -> ScanResult {
2253        self.save_simple_key();
2254        self.allow_simple_key();
2255        let tok = self.scan_block_scalar(literal)?;
2256
2257        self.tokens.push_back(tok);
2258        Ok(())
2259    }
2260
2261    #[allow(clippy::too_many_lines)]
2262    fn scan_block_scalar(&mut self, literal: bool) -> Result<Token<'input>, ScanError> {
2263        let start_mark = self.mark;
2264        let mut chomping = Chomping::Clip;
2265        let mut increment: usize = 0;
2266        let mut indent: usize = 0;
2267        let mut trailing_blank: bool;
2268        let mut leading_blank: bool = false;
2269        let style = if literal {
2270            ScalarStyle::Literal
2271        } else {
2272            ScalarStyle::Folded
2273        };
2274
2275        let mut string = String::new();
2276        let mut leading_break = String::new();
2277        let mut trailing_breaks = String::new();
2278        let mut chomping_break = String::new();
2279
2280        // skip '|' or '>'
2281        self.skip_non_blank();
2282        self.unroll_non_block_indents();
2283
2284        if self.input.look_ch() == '+' || self.input.peek() == '-' {
2285            if self.input.peek() == '+' {
2286                chomping = Chomping::Keep;
2287            } else {
2288                chomping = Chomping::Strip;
2289            }
2290            self.skip_non_blank();
2291            self.input.lookahead(1);
2292            if self.input.next_is_digit() {
2293                if self.input.peek() == '0' {
2294                    return Err(ScanError::new_str(
2295                        start_mark,
2296                        "while scanning a block scalar, found an indentation indicator equal to 0",
2297                    ));
2298                }
2299                increment = (self.input.peek() as usize) - ('0' as usize);
2300                self.skip_non_blank();
2301            }
2302        } else if self.input.next_is_digit() {
2303            if self.input.peek() == '0' {
2304                return Err(ScanError::new_str(
2305                    start_mark,
2306                    "while scanning a block scalar, found an indentation indicator equal to 0",
2307                ));
2308            }
2309
2310            increment = (self.input.peek() as usize) - ('0' as usize);
2311            self.skip_non_blank();
2312            self.input.lookahead(1);
2313            if self.input.peek() == '+' || self.input.peek() == '-' {
2314                if self.input.peek() == '+' {
2315                    chomping = Chomping::Keep;
2316                } else {
2317                    chomping = Chomping::Strip;
2318                }
2319                self.skip_non_blank();
2320            }
2321        }
2322
2323        self.skip_ws_to_eol(SkipTabs::Yes)?;
2324
2325        // Check if we are at the end of the line.
2326        self.input.lookahead(1);
2327        if !self.input.next_is_breakz() {
2328            return Err(ScanError::new_str(
2329                start_mark,
2330                "while scanning a block scalar, did not find expected comment or line break",
2331            ));
2332        }
2333
2334        if self.input.next_is_break() {
2335            self.input.lookahead(2);
2336            self.read_break(&mut chomping_break);
2337        }
2338
2339        if self.input.look_ch() == '\t' {
2340            return Err(ScanError::new_str(
2341                start_mark,
2342                "a block scalar content cannot start with a tab",
2343            ));
2344        }
2345
2346        if increment > 0 {
2347            indent = if self.indent >= 0 {
2348                (self.indent + increment as isize) as usize
2349            } else {
2350                increment
2351            }
2352        }
2353
2354        // Scan the leading line breaks and determine the indentation level if needed.
2355        if indent == 0 {
2356            self.skip_block_scalar_first_line_indent(&mut indent, &mut trailing_breaks);
2357        } else {
2358            self.skip_block_scalar_indent(indent, &mut trailing_breaks);
2359        }
2360
2361        // We have an end-of-stream with no content, e.g.:
2362        // ```yaml
2363        // - |+
2364        // ```
2365        if self.input.next_is_z() {
2366            let contents = match chomping {
2367                // We strip trailing linebreaks. Nothing remain.
2368                Chomping::Strip => String::new(),
2369                // There was no newline after the chomping indicator.
2370                _ if self.mark.line == start_mark.line() => String::new(),
2371                // We clip lines, and there was a newline after the chomping indicator.
2372                // All other breaks are ignored.
2373                Chomping::Clip => chomping_break,
2374                // We keep lines. There was a newline after the chomping indicator but nothing
2375                // else.
2376                Chomping::Keep if trailing_breaks.is_empty() => chomping_break,
2377                // Otherwise, the newline after chomping is ignored.
2378                Chomping::Keep => trailing_breaks,
2379            };
2380            return Ok(Token(
2381                Span::new(start_mark, self.mark),
2382                TokenType::Scalar(style, contents.into()),
2383            ));
2384        }
2385
2386        if self.mark.col < indent && (self.mark.col as isize) > self.indent {
2387            return Err(ScanError::new_str(
2388                self.mark,
2389                "wrongly indented line in block scalar",
2390            ));
2391        }
2392
2393        let mut line_buffer = String::with_capacity(100);
2394        let start_mark = self.mark;
2395        while self.mark.col == indent && !self.input.next_is_z() {
2396            if indent == 0 {
2397                self.input.lookahead(4);
2398                if self.input.next_is_document_end() {
2399                    break;
2400                }
2401            }
2402
2403            // We are at the first content character of a content line.
2404            trailing_blank = self.input.next_is_blank();
2405            if !literal && !leading_break.is_empty() && !leading_blank && !trailing_blank {
2406                string.push_str(&trailing_breaks);
2407                if trailing_breaks.is_empty() {
2408                    string.push(' ');
2409                }
2410            } else {
2411                string.push_str(&leading_break);
2412                string.push_str(&trailing_breaks);
2413            }
2414
2415            leading_break.clear();
2416            trailing_breaks.clear();
2417
2418            leading_blank = self.input.next_is_blank();
2419
2420            self.scan_block_scalar_content_line(&mut string, &mut line_buffer);
2421
2422            // break on EOF
2423            self.input.lookahead(2);
2424            if self.input.next_is_z() {
2425                break;
2426            }
2427
2428            self.read_break(&mut leading_break);
2429
2430            // Eat the following indentation spaces and line breaks.
2431            self.skip_block_scalar_indent(indent, &mut trailing_breaks);
2432        }
2433
2434        // Chomp the tail.
2435        if chomping != Chomping::Strip {
2436            string.push_str(&leading_break);
2437            // If we had reached an eof but the last character wasn't an end-of-line, check if the
2438            // last line was indented at least as the rest of the scalar, then we need to consider
2439            // there is a newline.
2440            if self.input.next_is_z() && self.mark.col >= indent.max(1) {
2441                string.push('\n');
2442            }
2443        }
2444
2445        if chomping == Chomping::Keep {
2446            string.push_str(&trailing_breaks);
2447        }
2448
2449        Ok(Token(
2450            Span::new(start_mark, self.mark),
2451            TokenType::Scalar(style, string.into()),
2452        ))
2453    }
2454
2455    /// Retrieve the contents of the line, parsing it as a block scalar.
2456    ///
2457    /// The contents will be appended to `string`. `line_buffer` is used as a temporary buffer to
2458    /// store bytes before pushing them to `string` and thus avoiding reallocating more than
2459    /// necessary. `line_buffer` is assumed to be empty upon calling this function. It will be
2460    /// `clear`ed before the end of the function.
2461    ///
2462    /// This function assumed the first character to read is the first content character in the
2463    /// line. This function does not consume the line break character(s) after the line.
2464    fn scan_block_scalar_content_line(&mut self, string: &mut String, line_buffer: &mut String) {
2465        // Start by evaluating characters in the buffer.
2466        while !self.input.buf_is_empty() && !self.input.next_is_breakz() {
2467            string.push(self.input.peek());
2468            // We may technically skip non-blank characters. However, the only distinction is
2469            // to determine what is leading whitespace and what is not. Here, we read the
2470            // contents of the line until either eof or a linebreak. We know we will not read
2471            // `self.leading_whitespace` until the end of the line, where it will be reset.
2472            // This allows us to call a slightly less expensive function.
2473            self.skip_blank();
2474        }
2475
2476        // All characters that were in the buffer were consumed. We need to check if more
2477        // follow.
2478        if self.input.buf_is_empty() {
2479            // We will read all consecutive non-breakz characters. We push them into a
2480            // temporary buffer. The main difference with going through `self.buffer` is that
2481            // characters are appended here as their real size (1B for ascii, or up to 4 bytes for
2482            // UTF-8). We can then use the internal `line_buffer` `Vec` to push data into `string`
2483            // (using `String::push_str`).
2484
2485            // line_buffer is empty at this point so we can compute n_chars here as well
2486            let mut n_chars = 0;
2487            debug_assert!(line_buffer.is_empty());
2488            while let Some(c) = self.input.raw_read_non_breakz_ch() {
2489                line_buffer.push(c);
2490                n_chars += 1;
2491            }
2492
2493            // We need to manually update our position; we haven't called a `skip` function.
2494            self.mark.col += n_chars;
2495            self.mark.offsets.chars += n_chars;
2496            self.mark.offsets.bytes = self.input.byte_offset();
2497
2498            // We can now append our bytes to our `string`.
2499            string.reserve(line_buffer.len());
2500            string.push_str(line_buffer);
2501            // This clears the _contents_ without touching the _capacity_.
2502            line_buffer.clear();
2503        }
2504    }
2505
2506    /// Skip the block scalar indentation and empty lines.
2507    fn skip_block_scalar_indent(&mut self, indent: usize, breaks: &mut String) {
2508        loop {
2509            // Consume all spaces. Tabs cannot be used as indentation.
2510            if indent < self.input.bufmaxlen() - 2 {
2511                self.input.lookahead(self.input.bufmaxlen());
2512                while self.mark.col < indent && self.input.peek() == ' ' {
2513                    self.skip_blank();
2514                }
2515            } else {
2516                loop {
2517                    self.input.lookahead(self.input.bufmaxlen());
2518                    while !self.input.buf_is_empty()
2519                        && self.mark.col < indent
2520                        && self.input.peek() == ' '
2521                    {
2522                        self.skip_blank();
2523                    }
2524                    // If we reached our indent, we can break. We must also break if we have
2525                    // reached content or EOF; that is, the buffer is not empty and the next
2526                    // character is not a space.
2527                    if self.mark.col == indent
2528                        || (!self.input.buf_is_empty() && self.input.peek() != ' ')
2529                    {
2530                        break;
2531                    }
2532                }
2533                self.input.lookahead(2);
2534            }
2535
2536            // If our current line is empty, skip over the break and continue looping.
2537            if self.input.next_is_break() {
2538                self.read_break(breaks);
2539            } else {
2540                // Otherwise, we have a content line. Return control.
2541                break;
2542            }
2543        }
2544    }
2545
2546    /// Determine the indentation level for a block scalar from the first line of its contents.
2547    ///
2548    /// The function skips over whitespace-only lines and sets `indent` to the the longest
2549    /// whitespace line that was encountered.
2550    fn skip_block_scalar_first_line_indent(&mut self, indent: &mut usize, breaks: &mut String) {
2551        let mut max_indent = 0;
2552        loop {
2553            // Consume all spaces. Tabs cannot be used as indentation.
2554            while self.input.look_ch() == ' ' {
2555                self.skip_blank();
2556            }
2557
2558            if self.mark.col > max_indent {
2559                max_indent = self.mark.col;
2560            }
2561
2562            if self.input.next_is_break() {
2563                // If our current line is empty, skip over the break and continue looping.
2564                self.input.lookahead(2);
2565                self.read_break(breaks);
2566            } else {
2567                // Otherwise, we have a content line. Return control.
2568                break;
2569            }
2570        }
2571
2572        // In case a yaml looks like:
2573        // ```yaml
2574        // |
2575        // foo
2576        // bar
2577        // ```
2578        // We need to set the indent to 0 and not 1. In all other cases, the indent must be at
2579        // least 1. When in the above example, `self.indent` will be set to -1.
2580        *indent = max_indent.max((self.indent + 1) as usize);
2581        if self.indent > 0 {
2582            *indent = (*indent).max(1);
2583        }
2584    }
2585
2586    fn fetch_flow_scalar(&mut self, single: bool) -> ScanResult {
2587        self.save_simple_key();
2588        self.disallow_simple_key();
2589
2590        let tok = self.scan_flow_scalar(single)?;
2591
2592        // From spec: To ensure JSON compatibility, if a key inside a flow mapping is JSON-like,
2593        // YAML allows the following value to be specified adjacent to the “:”.
2594        self.skip_to_next_token()?;
2595        self.adjacent_value_allowed_at = self.mark.index();
2596
2597        self.tokens.push_back(tok);
2598        Ok(())
2599    }
2600
2601    #[allow(clippy::too_many_lines)]
2602    fn scan_flow_scalar(&mut self, single: bool) -> Result<Token<'input>, ScanError> {
2603        let start_mark = self.mark;
2604
2605        // Output scalar contents.
2606        let mut buf = match self.input.byte_offset() {
2607            Some(off) => FlowScalarBuf::new_borrowed(off + self.input.peek().len_utf8()),
2608            None => FlowScalarBuf::new_owned(),
2609        };
2610
2611        // Scratch used to consume the *first* line break in a break run without emitting it.
2612        // (The first break folds to ' ' or to nothing depending on escaping rules.)
2613        let mut break_scratch = String::new();
2614
2615        /* Eat the left quote. */
2616        self.skip_non_blank();
2617
2618        loop {
2619            /* Check for a document indicator. */
2620            self.input.lookahead(4);
2621
2622            if self.mark.col == 0 && self.input.next_is_document_indicator() {
2623                return Err(ScanError::new_str(
2624                    start_mark,
2625                    "while scanning a quoted scalar, found unexpected document indicator",
2626                ));
2627            }
2628
2629            if self.input.next_is_z() {
2630                return Err(ScanError::new_str(start_mark, "unclosed quote"));
2631            }
2632
2633            // Do not enforce block indentation inside quoted (flow) scalars.
2634            // YAML allows line breaks within quoted scalars.
2635            let mut leading_blanks = false;
2636            self.consume_flow_scalar_non_whitespace_chars(
2637                single,
2638                &mut buf,
2639                &mut leading_blanks,
2640                &start_mark,
2641            )?;
2642
2643            match self.input.look_ch() {
2644                '\'' if single => break,
2645                '"' if !single => break,
2646                _ => {}
2647            }
2648
2649            // --- Faster whitespace / line break handling (no temporary Strings) ---
2650            //
2651            // Instead of:
2652            //   - collecting blanks into `whitespaces` and then copying
2653            //   - collecting breaks into `leading_break` / `trailing_breaks` and then copying
2654            //
2655            // We do:
2656            //   - append trailing blanks directly to `string`, remember where they started,
2657            //     and truncate them if a line break follows.
2658            //   - for line breaks: consume the first break into a scratch (discarded),
2659            //     append subsequent breaks directly to `string`.
2660            //
2661            // These flags mirror the old "is_empty()" checks:
2662            //   has_leading_break  <=> !leading_break.is_empty()
2663            //   has_trailing_breaks <=> !trailing_breaks.is_empty()
2664            let mut trailing_ws_start: Option<usize> = None;
2665            let mut has_leading_break = false;
2666            let mut has_trailing_breaks = false;
2667
2668            // For the borrowed path: track the (byte) start of a pending whitespace run.
2669            let mut pending_ws_start: Option<usize> = None;
2670
2671            // Consume blank characters.
2672            while self.input.next_is_blank() || self.input.next_is_break() {
2673                if self.input.next_is_blank() {
2674                    // Consume a space or a tab character.
2675                    if leading_blanks {
2676                        if self.input.peek() == '\t' && (self.mark.col as isize) < self.indent {
2677                            return Err(ScanError::new_str(
2678                                self.mark,
2679                                "tab cannot be used as indentation",
2680                            ));
2681                        }
2682                        self.skip_blank();
2683                    } else {
2684                        // Append to output immediately; if a break appears next, we'll truncate.
2685                        match buf {
2686                            FlowScalarBuf::Owned(ref mut string) => {
2687                                if trailing_ws_start.is_none() {
2688                                    trailing_ws_start = Some(string.len());
2689                                }
2690                                string.push(self.input.peek());
2691                            }
2692                            FlowScalarBuf::Borrowed { .. } => {
2693                                if pending_ws_start.is_none() {
2694                                    pending_ws_start = self.input.byte_offset();
2695                                }
2696                            }
2697                        }
2698                        self.skip_blank();
2699
2700                        if let (FlowScalarBuf::Borrowed { .. }, Some(ws_start), Some(ws_end)) =
2701                            (&mut buf, pending_ws_start, self.input.byte_offset())
2702                        {
2703                            buf.note_pending_ws(ws_start, ws_end);
2704                        }
2705                    }
2706                } else {
2707                    self.input.lookahead(2);
2708
2709                    // Check if it is a first line break.
2710                    if leading_blanks {
2711                        // Second+ line break in a run: preserve it.
2712                        match buf {
2713                            FlowScalarBuf::Owned(ref mut string) => self.read_break(string),
2714                            FlowScalarBuf::Borrowed { .. } => {
2715                                self.promote_flow_scalar_buf_to_owned(&start_mark, &mut buf)?;
2716                                let Some(string) = buf.as_owned_mut() else {
2717                                    unreachable!()
2718                                };
2719                                self.read_break(string);
2720                            }
2721                        }
2722                        has_trailing_breaks = true;
2723                    } else {
2724                        // First break: drop any trailing blanks we appended, then consume the break.
2725                        if let Some(pos) = trailing_ws_start.take() {
2726                            if let FlowScalarBuf::Owned(ref mut string) = buf {
2727                                string.truncate(pos);
2728                            }
2729                        }
2730
2731                        if pending_ws_start.take().is_some() {
2732                            // Trailing blanks before a break are discarded => transformation.
2733                            if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
2734                                self.promote_flow_scalar_buf_to_owned(&start_mark, &mut buf)?;
2735                            }
2736                            buf.discard_pending_ws();
2737                        } else {
2738                            buf.commit_pending_ws();
2739                        }
2740
2741                        break_scratch.clear();
2742                        self.read_break(&mut break_scratch);
2743                        // Keep `break_scratch` content (ignored) until next clear; no need to clear twice.
2744
2745                        has_leading_break = true;
2746                        leading_blanks = true;
2747                    }
2748                }
2749
2750                self.input.lookahead(1);
2751            }
2752
2753            // If we had a line break inside a quoted (flow) scalar, validate indentation
2754            // of the continuation line in block context.
2755            if leading_blanks && has_leading_break && self.flow_level == 0 {
2756                let next_ch = self.input.peek();
2757                let is_closing_quote = (single && next_ch == '\'') || (!single && next_ch == '"');
2758                if !is_closing_quote && (self.mark.col as isize) <= self.indent {
2759                    return Err(ScanError::new_str(
2760                        self.mark,
2761                        "invalid indentation in multiline quoted scalar",
2762                    ));
2763                }
2764            }
2765
2766            // Join the whitespaces or fold line breaks.
2767            if leading_blanks {
2768                // Old logic:
2769                //   if leading_break empty => emit trailing_breaks (already emitted now)
2770                //   else if trailing_breaks empty => emit ' '
2771                //   else emit trailing_breaks (already emitted now)
2772                if has_leading_break && !has_trailing_breaks {
2773                    match buf {
2774                        FlowScalarBuf::Owned(ref mut string) => string.push(' '),
2775                        FlowScalarBuf::Borrowed { .. } => {
2776                            self.promote_flow_scalar_buf_to_owned(&start_mark, &mut buf)?;
2777                            let Some(string) = buf.as_owned_mut() else {
2778                                unreachable!()
2779                            };
2780                            string.push(' ');
2781                        }
2782                    }
2783                }
2784            }
2785            // else: trailing blanks are already appended to `string`
2786        } // loop
2787
2788        // Eat the right quote.
2789        self.skip_non_blank();
2790
2791        // Ensure there is no invalid trailing content.
2792        self.skip_ws_to_eol(SkipTabs::Yes)?;
2793        match self.input.peek() {
2794            // These can be encountered in flow sequences or mappings.
2795            ',' | '}' | ']' if self.flow_level > 0 => {}
2796            // An end-of-line / end-of-stream is fine. No trailing content.
2797            c if is_breakz(c) => {}
2798            // ':' can be encountered if our scalar is a key.
2799            // Outside of flow contexts, keys cannot span multiple lines
2800            ':' if self.flow_level == 0 && start_mark.line == self.mark.line => {}
2801            // Inside a flow context, this is allowed.
2802            ':' if self.flow_level > 0 => {}
2803            _ => {
2804                return Err(ScanError::new_str(
2805                    self.mark,
2806                    "invalid trailing content after double-quoted scalar",
2807                ));
2808            }
2809        }
2810
2811        let style = if single {
2812            ScalarStyle::SingleQuoted
2813        } else {
2814            ScalarStyle::DoubleQuoted
2815        };
2816
2817        let contents = match buf {
2818            FlowScalarBuf::Owned(string) => Cow::Owned(string),
2819            FlowScalarBuf::Borrowed {
2820                start,
2821                mut end,
2822                pending_ws_start,
2823                pending_ws_end,
2824            } => {
2825                // If we ended after a whitespace run, it is part of the output (no break followed).
2826                if pending_ws_start.is_some() {
2827                    end = pending_ws_end;
2828                }
2829                if let Some(slice) = self.try_borrow_slice(start, end) {
2830                    Cow::Borrowed(slice)
2831                } else {
2832                    let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
2833                        ScanError::new_str(
2834                            start_mark,
2835                            "internal error: input advertised offsets but did not provide a slice",
2836                        )
2837                    })?;
2838                    Cow::Owned(slice.to_owned())
2839                }
2840            }
2841        };
2842
2843        Ok(Token(
2844            Span::new(start_mark, self.mark),
2845            TokenType::Scalar(style, contents),
2846        ))
2847    }
2848
2849    /// Consume successive non-whitespace characters from a flow scalar.
2850    ///
2851    /// This function resolves escape sequences and stops upon encountering a whitespace, the end
2852    /// of the stream or the closing character for the scalar (`'` for single quoted scalars, `"`
2853    /// for double quoted scalars).
2854    ///
2855    /// # Errors
2856    /// Return an error if an invalid escape sequence is found.
2857    fn consume_flow_scalar_non_whitespace_chars(
2858        &mut self,
2859        single: bool,
2860        buf: &mut FlowScalarBuf,
2861        leading_blanks: &mut bool,
2862        start_mark: &Marker,
2863    ) -> Result<(), ScanError> {
2864        self.input.lookahead(2);
2865        while !is_blank_or_breakz(self.input.peek()) {
2866            match self.input.peek() {
2867                // Check for an escaped single quote.
2868                '\'' if self.input.peek_nth(1) == '\'' && single => {
2869                    if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
2870                        buf.commit_pending_ws();
2871                        self.promote_flow_scalar_buf_to_owned(start_mark, buf)?;
2872                    }
2873                    let Some(string) = buf.as_owned_mut() else {
2874                        unreachable!()
2875                    };
2876                    string.push('\'');
2877                    self.skip_n_non_blank(2);
2878                }
2879                // Check for the right quote.
2880                '\'' if single => break,
2881                '"' if !single => break,
2882                // Check for an escaped line break.
2883                '\\' if !single && is_break(self.input.peek_nth(1)) => {
2884                    self.input.lookahead(3);
2885                    if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
2886                        buf.commit_pending_ws();
2887                        self.promote_flow_scalar_buf_to_owned(start_mark, buf)?;
2888                    }
2889                    self.skip_non_blank();
2890                    self.skip_linebreak();
2891                    *leading_blanks = true;
2892                    break;
2893                }
2894                // Check for an escape sequence.
2895                '\\' if !single => {
2896                    if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
2897                        buf.commit_pending_ws();
2898                        self.promote_flow_scalar_buf_to_owned(start_mark, buf)?;
2899                    }
2900                    let Some(string) = buf.as_owned_mut() else {
2901                        unreachable!()
2902                    };
2903                    string.push(self.resolve_flow_scalar_escape_sequence(start_mark)?);
2904                }
2905                c => {
2906                    match buf {
2907                        FlowScalarBuf::Owned(ref mut string) => {
2908                            string.push(c);
2909                        }
2910                        FlowScalarBuf::Borrowed { .. } => {
2911                            buf.commit_pending_ws();
2912                        }
2913                    }
2914                    self.skip_non_blank();
2915
2916                    if let Some(new_end) = self.input.byte_offset() {
2917                        if let FlowScalarBuf::Borrowed { end, .. } = buf {
2918                            *end = new_end;
2919                        }
2920                    }
2921                }
2922            }
2923            self.input.lookahead(2);
2924        }
2925        Ok(())
2926    }
2927
2928    /// Escape the sequence we encounter in a flow scalar.
2929    ///
2930    /// `self.input.peek()` must point to the `\` starting the escape sequence.
2931    ///
2932    /// # Errors
2933    /// Return an error if an invalid escape sequence is found.
2934    fn resolve_flow_scalar_escape_sequence(
2935        &mut self,
2936        start_mark: &Marker,
2937    ) -> Result<char, ScanError> {
2938        let mut code_length = 0usize;
2939        let mut ret = '\0';
2940
2941        match self.input.peek_nth(1) {
2942            '0' => ret = '\0',
2943            'a' => ret = '\x07',
2944            'b' => ret = '\x08',
2945            't' | '\t' => ret = '\t',
2946            'n' => ret = '\n',
2947            'v' => ret = '\x0b',
2948            'f' => ret = '\x0c',
2949            'r' => ret = '\x0d',
2950            'e' => ret = '\x1b',
2951            ' ' => ret = '\x20',
2952            '"' => ret = '"',
2953            '/' => ret = '/',
2954            '\\' => ret = '\\',
2955            // Unicode next line (#x85)
2956            'N' => ret = char::from_u32(0x85).unwrap(),
2957            // Unicode non-breaking space (#xA0)
2958            '_' => ret = char::from_u32(0xA0).unwrap(),
2959            // Unicode line separator (#x2028)
2960            'L' => ret = char::from_u32(0x2028).unwrap(),
2961            // Unicode paragraph separator (#x2029)
2962            'P' => ret = char::from_u32(0x2029).unwrap(),
2963            'x' => code_length = 2,
2964            'u' => code_length = 4,
2965            'U' => code_length = 8,
2966            _ => {
2967                return Err(ScanError::new_str(
2968                    *start_mark,
2969                    "while parsing a quoted scalar, found unknown escape character",
2970                ))
2971            }
2972        }
2973        self.skip_n_non_blank(2);
2974
2975        // Consume an arbitrary escape code.
2976        if code_length > 0 {
2977            self.input.lookahead(code_length);
2978            let mut value = 0u32;
2979            for i in 0..code_length {
2980                let c = self.input.peek_nth(i);
2981                if !is_hex(c) {
2982                    return Err(ScanError::new_str(
2983                        *start_mark,
2984                        "while parsing a quoted scalar, did not find expected hexadecimal number",
2985                    ));
2986                }
2987                value = (value << 4) + as_hex(c);
2988            }
2989
2990            self.skip_n_non_blank(code_length);
2991
2992            // Handle JSON surrogate pairs: high surrogate followed by low surrogate
2993            if code_length == 4 && (0xD800..=0xDBFF).contains(&value) {
2994                self.input.lookahead(2);
2995                if self.input.peek() == '\\' && self.input.peek_nth(1) == 'u' {
2996                    self.skip_n_non_blank(2);
2997                    self.input.lookahead(4);
2998                    let mut low_value = 0u32;
2999                    for i in 0..4 {
3000                        let c = self.input.peek_nth(i);
3001                        if !is_hex(c) {
3002                            return Err(ScanError::new_str(
3003                                *start_mark,
3004                                "while parsing a quoted scalar, did not find expected hexadecimal number for low surrogate",
3005                            ));
3006                        }
3007                        low_value = (low_value << 4) + as_hex(c);
3008                    }
3009                    if (0xDC00..=0xDFFF).contains(&low_value) {
3010                        value = 0x10000 + (((value - 0xD800) << 10) | (low_value - 0xDC00));
3011                        self.skip_n_non_blank(4);
3012                    } else {
3013                        return Err(ScanError::new_str(
3014                            *start_mark,
3015                            "while parsing a quoted scalar, found invalid low surrogate",
3016                        ));
3017                    }
3018                } else {
3019                    return Err(ScanError::new_str(
3020                        *start_mark,
3021                        "while parsing a quoted scalar, found high surrogate without following low surrogate",
3022                    ));
3023                }
3024            } else if code_length == 4 && (0xDC00..=0xDFFF).contains(&value) {
3025                return Err(ScanError::new_str(
3026                    *start_mark,
3027                    "while parsing a quoted scalar, found unpaired low surrogate",
3028                ));
3029            }
3030
3031            let Some(ch) = char::from_u32(value) else {
3032                return Err(ScanError::new_str(
3033                    *start_mark,
3034                    "while parsing a quoted scalar, found invalid Unicode character escape code",
3035                ));
3036            };
3037            ret = ch;
3038        }
3039        Ok(ret)
3040    }
3041
3042    fn fetch_plain_scalar(&mut self) -> ScanResult {
3043        self.save_simple_key();
3044        self.disallow_simple_key();
3045
3046        let tok = self.scan_plain_scalar()?;
3047
3048        self.tokens.push_back(tok);
3049        Ok(())
3050    }
3051
3052    /// Scan for a plain scalar.
3053    ///
3054    /// Plain scalars are the most readable but restricted style. They may span multiple lines in
3055    /// some contexts.
3056    #[allow(clippy::too_many_lines)]
3057    fn scan_plain_scalar(&mut self) -> Result<Token<'input>, ScanError> {
3058        self.unroll_non_block_indents();
3059        let indent = self.indent + 1;
3060        let start_mark = self.mark;
3061
3062        if self.flow_level > 0 && (start_mark.col as isize) < indent {
3063            return Err(ScanError::new_str(
3064                start_mark,
3065                "invalid indentation in flow construct",
3066            ));
3067        }
3068
3069        let mut string = String::with_capacity(32);
3070        self.buf_whitespaces.clear();
3071        self.buf_leading_break.clear();
3072        self.buf_trailing_breaks.clear();
3073        let mut end_mark = self.mark;
3074
3075        loop {
3076            self.input.lookahead(4);
3077            if (self.mark.col == 0 && self.input.next_is_document_indicator())
3078                || self.input.peek() == '#'
3079            {
3080                // BS4K: If a `#` starts a comment after some separation spaces following content
3081                // of a plain scalar in block context, and there is potential continuation on the
3082                // next line, this is invalid. We cannot decide yet if there will be continuation,
3083                // so record that a comment interrupted a plain scalar.
3084                if self.input.peek() == '#'
3085                    && !string.is_empty()
3086                    && !self.buf_whitespaces.is_empty()
3087                    && self.flow_level == 0
3088                {
3089                    self.interrupted_plain_by_comment = Some(self.mark);
3090                }
3091                break;
3092            }
3093
3094            if self.flow_level > 0 && self.input.peek() == '-' && is_flow(self.input.peek_nth(1)) {
3095                return Err(ScanError::new_str(
3096                    self.mark,
3097                    "plain scalar cannot start with '-' followed by ,[]{}",
3098                ));
3099            }
3100
3101            if !self.input.next_is_blank_or_breakz()
3102                && self.input.next_can_be_plain_scalar(self.flow_level > 0)
3103            {
3104                if self.leading_whitespace {
3105                    if self.buf_leading_break.is_empty() {
3106                        string.push_str(&self.buf_leading_break);
3107                        string.push_str(&self.buf_trailing_breaks);
3108                        self.buf_trailing_breaks.clear();
3109                        self.buf_leading_break.clear();
3110                    } else {
3111                        if self.buf_trailing_breaks.is_empty() {
3112                            string.push(' ');
3113                        } else {
3114                            string.push_str(&self.buf_trailing_breaks);
3115                            self.buf_trailing_breaks.clear();
3116                        }
3117                        self.buf_leading_break.clear();
3118                    }
3119                    self.leading_whitespace = false;
3120                } else if !self.buf_whitespaces.is_empty() {
3121                    string.push_str(&self.buf_whitespaces);
3122                    self.buf_whitespaces.clear();
3123                }
3124
3125                // We can unroll the first iteration of the loop.
3126                string.push(self.input.peek());
3127                self.skip_non_blank();
3128                string.reserve(self.input.bufmaxlen());
3129
3130                // Add content non-blank characters to the scalar.
3131                let mut end = false;
3132                while !end {
3133                    // Fill the buffer once and process all characters in the buffer until the next
3134                    // fetch. Note that `next_can_be_plain_scalar` needs 2 lookahead characters,
3135                    // hence the `for` loop looping `self.input.bufmaxlen() - 1` times.
3136                    self.input.lookahead(self.input.bufmaxlen());
3137                    let (stop, chars_consumed) = self.input.fetch_plain_scalar_chunk(
3138                        &mut string,
3139                        self.input.bufmaxlen() - 1,
3140                        self.flow_level > 0,
3141                    );
3142                    end = stop;
3143                    self.mark.offsets.chars += chars_consumed;
3144                    self.mark.col += chars_consumed;
3145                    self.mark.offsets.bytes = self.input.byte_offset();
3146                }
3147                end_mark = self.mark;
3148            }
3149
3150            // We may reach the end of a plain scalar if:
3151            //  - We reach eof
3152            //  - We reach ": "
3153            //  - We find a flow character in a flow context
3154            if !(self.input.next_is_blank() || self.input.next_is_break()) {
3155                break;
3156            }
3157
3158            // Process blank characters.
3159            self.input.lookahead(2);
3160            while self.input.next_is_blank_or_break() {
3161                if self.input.next_is_blank() {
3162                    if !self.leading_whitespace {
3163                        self.buf_whitespaces.push(self.input.peek());
3164                        self.skip_blank();
3165                    } else if (self.mark.col as isize) < indent && self.input.peek() == '\t' {
3166                        // Tabs in an indentation columns are allowed if and only if the line is
3167                        // empty. Skip to the end of the line.
3168                        self.skip_ws_to_eol(SkipTabs::Yes)?;
3169                        if !self.input.next_is_breakz() {
3170                            return Err(ScanError::new_str(
3171                                start_mark,
3172                                "while scanning a plain scalar, found a tab",
3173                            ));
3174                        }
3175                    } else {
3176                        self.skip_blank();
3177                    }
3178                } else {
3179                    // Check if it is a first line break
3180                    if self.leading_whitespace {
3181                        self.skip_break();
3182                        self.buf_trailing_breaks.push('\n');
3183                    } else {
3184                        self.buf_whitespaces.clear();
3185                        self.skip_break();
3186                        self.buf_leading_break.push('\n');
3187                        self.leading_whitespace = true;
3188                    }
3189                }
3190                self.input.lookahead(2);
3191            }
3192
3193            // check indentation level
3194            if self.flow_level == 0 && (self.mark.col as isize) < indent {
3195                break;
3196            }
3197        }
3198
3199        if self.leading_whitespace {
3200            self.allow_simple_key();
3201        }
3202
3203        if string.is_empty() {
3204            // `fetch_plain_scalar` must absolutely consume at least one byte. Otherwise,
3205            // `fetch_next_token` will never stop calling it. An empty plain scalar may happen with
3206            // erroneous inputs such as "{...".
3207            Err(ScanError::new_str(
3208                start_mark,
3209                "unexpected end of plain scalar",
3210            ))
3211        } else {
3212            let contents = if let (Some(start), Some(end)) =
3213                (start_mark.byte_offset(), end_mark.byte_offset())
3214            {
3215                match self.try_borrow_slice(start, end) {
3216                    Some(slice) if slice == string => Cow::Borrowed(slice),
3217                    _ => Cow::Owned(string),
3218                }
3219            } else {
3220                Cow::Owned(string)
3221            };
3222
3223            Ok(Token(
3224                Span::new(start_mark, end_mark),
3225                TokenType::Scalar(ScalarStyle::Plain, contents),
3226            ))
3227        }
3228    }
3229
3230    fn fetch_key(&mut self) -> ScanResult {
3231        let start_mark = self.mark;
3232        if self.flow_level == 0 {
3233            // Check if we are allowed to start a new key (not necessarily simple).
3234            if !self.simple_key_allowed {
3235                return Err(ScanError::new_str(
3236                    self.mark,
3237                    "mapping keys are not allowed in this context",
3238                ));
3239            }
3240            self.roll_indent(
3241                start_mark.col,
3242                None,
3243                TokenType::BlockMappingStart,
3244                start_mark,
3245            );
3246        } else {
3247            // The scanner, upon emitting a `Key`, will prepend a `MappingStart` event.
3248            self.flow_mapping_started = true;
3249        }
3250
3251        self.remove_simple_key()?;
3252
3253        if self.flow_level == 0 {
3254            self.allow_simple_key();
3255        } else {
3256            self.disallow_simple_key();
3257        }
3258
3259        self.skip_non_blank();
3260        self.skip_yaml_whitespace()?;
3261        if self.input.peek() == '\t' {
3262            return Err(ScanError::new_str(
3263                self.mark(),
3264                "tabs disallowed in this context",
3265            ));
3266        }
3267        self.tokens
3268            .push_back(Token(Span::new(start_mark, self.mark), TokenType::Key));
3269        Ok(())
3270    }
3271
3272    /// Fetch a value in a mapping inside of a flow collection.
3273    ///
3274    /// This must not be called if [`self.flow_level`] is 0. This ensures the rules surrounding
3275    /// values in flow collections are respected prior to calling [`fetch_value`].
3276    ///
3277    /// [`self.flow_level`]: Self::flow_level
3278    /// [`fetch_value`]: Self::fetch_value
3279    fn fetch_flow_value(&mut self) -> ScanResult {
3280        let nc = self.input.peek_nth(1);
3281
3282        // If we encounter a ':' inside a flow collection and it is not immediately
3283        // followed by a blank or breakz:
3284        //   - We must check whether an adjacent value is allowed
3285        //     `["a":[]]` is valid. If the key is double-quoted, no need for a space. This
3286        //     is needed for JSON compatibility.
3287        //   - If not, we must ensure there is a space after the ':' and before its value.
3288        //     `[a: []]` is valid while `[a:[]]` isn't. `[a:b]` is treated as `["a:b"]`.
3289        //   - But if the value is empty (null), then it's okay.
3290        // The last line is for YAMLs like `[a:]`. The ':' is followed by a ']' (which is a
3291        // flow character), but the ']' is not the value. The value is an invisible empty
3292        // space which is represented as null ('~').
3293        if self.mark.index() != self.adjacent_value_allowed_at && (nc == '[' || nc == '{') {
3294            return Err(ScanError::new_str(
3295                self.mark,
3296                "':' may not precede any of `[{` in flow mapping",
3297            ));
3298        }
3299
3300        self.fetch_value()
3301    }
3302
3303    /// Fetch a value from a mapping (after a `:`).
3304    fn fetch_value(&mut self) -> ScanResult {
3305        let sk = self.simple_keys.last().unwrap().clone();
3306        let start_mark = self.mark;
3307        let is_implicit_flow_mapping =
3308            !self.implicit_flow_mapping_states.is_empty() && !self.flow_mapping_started;
3309        if is_implicit_flow_mapping {
3310            *self.implicit_flow_mapping_states.last_mut().unwrap() =
3311                ImplicitMappingState::Inside(self.flow_level);
3312        }
3313
3314        // Skip over ':'.
3315        self.skip_non_blank();
3316        // Error detection: if ':' is followed by tab(s) without any space, and then what looks
3317        // like a value, emit a helpful error. The check for '-' or alphanumeric is an intentional
3318        // heuristic that catches common cases (e.g., `key:\tvalue`, `key:\t-item`) without
3319        // rejecting valid YAML like `key:\t|` (block scalar) or `key:\t"quoted"`.
3320        // Note: This heuristic won't catch Unicode value starters like `key:\täöü`, but such
3321        // cases will still fail to parse correctly (just with a less specific error message).
3322        if self.input.look_ch() == '\t'
3323            && !self.skip_ws_to_eol(SkipTabs::Yes)?.has_valid_yaml_ws()
3324            && (self.input.peek() == '-' || self.input.next_is_alpha())
3325        {
3326            return Err(ScanError::new_str(
3327                self.mark,
3328                "':' must be followed by a valid YAML whitespace",
3329            ));
3330        }
3331
3332        if sk.possible {
3333            // insert simple key
3334            let tok = Token(Span::empty(sk.mark), TokenType::Key);
3335            self.insert_token(sk.token_number - self.tokens_parsed, tok);
3336            if is_implicit_flow_mapping {
3337                if sk.mark.line < start_mark.line {
3338                    return Err(ScanError::new_str(
3339                        start_mark,
3340                        "illegal placement of ':' indicator",
3341                    ));
3342                }
3343                self.insert_token(
3344                    sk.token_number - self.tokens_parsed,
3345                    Token(Span::empty(sk.mark), TokenType::FlowMappingStart),
3346                );
3347            }
3348
3349            // Add the BLOCK-MAPPING-START token if needed.
3350            self.roll_indent(
3351                sk.mark.col,
3352                Some(sk.token_number),
3353                TokenType::BlockMappingStart,
3354                sk.mark,
3355            );
3356            self.roll_one_col_indent();
3357
3358            self.simple_keys.last_mut().unwrap().possible = false;
3359            self.disallow_simple_key();
3360        } else {
3361            if is_implicit_flow_mapping {
3362                self.tokens
3363                    .push_back(Token(Span::empty(start_mark), TokenType::FlowMappingStart));
3364            }
3365            // The ':' indicator follows a complex key.
3366            if self.flow_level == 0 {
3367                if !self.simple_key_allowed {
3368                    return Err(ScanError::new_str(
3369                        start_mark,
3370                        "mapping values are not allowed in this context",
3371                    ));
3372                }
3373
3374                self.roll_indent(
3375                    start_mark.col,
3376                    None,
3377                    TokenType::BlockMappingStart,
3378                    start_mark,
3379                );
3380            }
3381            self.roll_one_col_indent();
3382
3383            if self.flow_level == 0 {
3384                self.allow_simple_key();
3385            } else {
3386                self.disallow_simple_key();
3387            }
3388        }
3389        self.tokens
3390            .push_back(Token(Span::empty(start_mark), TokenType::Value));
3391
3392        Ok(())
3393    }
3394
3395    /// Add an indentation level to the stack with the given block token, if needed.
3396    ///
3397    /// An indentation level is added only if:
3398    ///   - We are not in a flow-style construct (which don't have indentation per-se).
3399    ///   - The current column is further indented than the last indent we have registered.
3400    fn roll_indent(
3401        &mut self,
3402        col: usize,
3403        number: Option<usize>,
3404        tok: TokenType<'input>,
3405        mark: Marker,
3406    ) {
3407        if self.flow_level > 0 {
3408            return;
3409        }
3410
3411        // If the last indent was a non-block indent, remove it.
3412        // This means that we prepared an indent that we thought we wouldn't use, but realized just
3413        // now that it is a block indent.
3414        if self.indent <= col as isize {
3415            if let Some(indent) = self.indents.last() {
3416                if !indent.needs_block_end {
3417                    self.indent = indent.indent;
3418                    self.indents.pop();
3419                }
3420            }
3421        }
3422
3423        if self.indent < col as isize {
3424            self.indents.push(Indent {
3425                indent: self.indent,
3426                needs_block_end: true,
3427            });
3428            self.indent = col as isize;
3429            let tokens_parsed = self.tokens_parsed;
3430            match number {
3431                Some(n) => self.insert_token(n - tokens_parsed, Token(Span::empty(mark), tok)),
3432                None => self.tokens.push_back(Token(Span::empty(mark), tok)),
3433            }
3434        }
3435    }
3436
3437    /// Pop indentation levels from the stack as much as needed.
3438    ///
3439    /// Indentation levels are popped from the stack while they are further indented than `col`.
3440    /// If we are in a flow-style construct (which don't have indentation per-se), this function
3441    /// does nothing.
3442    fn unroll_indent(&mut self, col: isize) {
3443        if self.flow_level > 0 {
3444            return;
3445        }
3446        while self.indent > col {
3447            let indent = self.indents.pop().unwrap();
3448            self.indent = indent.indent;
3449            if indent.needs_block_end {
3450                self.tokens
3451                    .push_back(Token(Span::empty(self.mark), TokenType::BlockEnd));
3452            }
3453        }
3454    }
3455
3456    /// Add an indentation level of 1 column that does not start a block.
3457    ///
3458    /// See the documentation of [`Indent::needs_block_end`] for more details.
3459    /// An indentation is not added if we are inside a flow level or if the last indent is already
3460    /// a non-block indent.
3461    fn roll_one_col_indent(&mut self) {
3462        if self.flow_level == 0 && self.indents.last().is_some_and(|x| x.needs_block_end) {
3463            self.indents.push(Indent {
3464                indent: self.indent,
3465                needs_block_end: false,
3466            });
3467            self.indent += 1;
3468        }
3469    }
3470
3471    /// Unroll all last indents created with [`Self::roll_one_col_indent`].
3472    fn unroll_non_block_indents(&mut self) {
3473        while let Some(indent) = self.indents.last() {
3474            if indent.needs_block_end {
3475                break;
3476            }
3477            self.indent = indent.indent;
3478            self.indents.pop();
3479        }
3480    }
3481
3482    /// Mark the next token to be inserted as a potential simple key.
3483    fn save_simple_key(&mut self) {
3484        if self.simple_key_allowed {
3485            let required = self.flow_level == 0
3486                && self.indent == (self.mark.col as isize)
3487                && self.indents.last().unwrap().needs_block_end;
3488
3489            if let Some(last) = self.simple_keys.last_mut() {
3490                *last = SimpleKey {
3491                    mark: self.mark,
3492                    possible: true,
3493                    required,
3494                    token_number: self.tokens_parsed + self.tokens.len(),
3495                };
3496            }
3497        }
3498    }
3499
3500    fn remove_simple_key(&mut self) -> ScanResult {
3501        let last = self.simple_keys.last_mut().unwrap();
3502        if last.possible && last.required {
3503            return Err(self.simple_key_expected());
3504        }
3505
3506        last.possible = false;
3507        Ok(())
3508    }
3509
3510    /// Return whether the scanner is inside a block but outside of a flow sequence.
3511    fn is_within_block(&self) -> bool {
3512        !self.indents.is_empty()
3513    }
3514
3515    /// If an implicit mapping had started, end it.
3516    ///
3517    /// This function does not pop the state in [`implicit_flow_mapping_states`].
3518    ///
3519    /// [`implicit_flow_mapping_states`]: Self::implicit_flow_mapping_states
3520    fn end_implicit_mapping(&mut self, mark: Marker, flow_level: u8) {
3521        if let Some(implicit_mapping) = self.implicit_flow_mapping_states.last_mut() {
3522            if *implicit_mapping == ImplicitMappingState::Inside(flow_level) {
3523                self.flow_mapping_started = false;
3524                *implicit_mapping = ImplicitMappingState::Possible;
3525                self.tokens
3526                    .push_back(Token(Span::empty(mark), TokenType::FlowMappingEnd));
3527            }
3528        }
3529    }
3530}
3531
/// Chomping, how final line breaks and trailing empty lines are interpreted.
///
/// See YAML spec 8.1.1.2.
///
/// Like the other public enums of this module, this type is a plain field-less enum, so it
/// derives `Clone`, `Copy` and `Debug` in addition to the equality traits. This lets callers
/// log it, copy it freely and use it in `assert_eq!`.
#[derive(Clone, Copy, PartialEq, Debug, Eq)]
pub enum Chomping {
    /// The final line break and any trailing empty lines are excluded.
    Strip,
    /// The final line break is preserved, but trailing empty lines are excluded.
    Clip,
    /// The final line break and trailing empty lines are included.
    Keep,
}
3544
#[cfg(test)]
mod test {
    use alloc::borrow::Cow;

    use crate::{
        input::str::StrInput,
        scanner::{Scanner, TokenType},
    };

    /// Scan `source` and return the first value that `pick` extracts from a token.
    ///
    /// Panics if scanning fails or if the token stream ends before `pick` selects anything.
    fn first_match<R>(
        source: &'static str,
        mut pick: impl FnMut(TokenType<'static>) -> Option<R>,
    ) -> R {
        let mut scanner = Scanner::new(StrInput::new(source));

        loop {
            let tok = scanner
                .next_token()
                .expect("valid YAML must scan without errors")
                .expect("scanner must eventually produce a token");
            if let Some(found) = pick(tok.1) {
                return found;
            }
        }
    }

    /// Return the contents of the first scalar token scanned from `source`.
    fn first_scalar(source: &'static str) -> Cow<'static, str> {
        first_match(source, |token| match token {
            TokenType::Scalar(_, value) => Some(value),
            _ => None,
        })
    }

    /// Return the handle and suffix of the first tag token scanned from `source`.
    fn first_tag(source: &'static str) -> (Cow<'static, str>, Cow<'static, str>) {
        first_match(source, |token| match token {
            TokenType::Tag(handle, suffix) => Some((handle, suffix)),
            _ => None,
        })
    }

    /// Scan `source` until the first scalar that comes after a `Key` token.
    ///
    /// Returns whether a `Key` token was seen, along with the contents of that scalar if the
    /// token stream did not end before one was found.
    fn first_scalar_after_key(source: &'static str) -> (bool, Option<Cow<'static, str>>) {
        let mut scanner = Scanner::new(StrInput::new(source));
        let mut key_seen = false;

        while let Some(tok) = scanner
            .next_token()
            .expect("valid YAML must scan without errors")
        {
            match tok.1 {
                TokenType::Key => key_seen = true,
                TokenType::Scalar(_, value) if key_seen => return (key_seen, Some(value)),
                _ => {}
            }
        }

        (key_seen, None)
    }

    #[test]
    fn test_is_anchor_char() {
        use super::is_anchor_char;
        assert!(is_anchor_char('x'));
    }

    /// Anchors scanned from `StrInput` must come back as `Cow::Borrowed`.
    #[test]
    fn anchor_name_is_borrowed_for_str_input() {
        let name = first_match("&anch\n", |token| match token {
            TokenType::Anchor(name) => Some(name),
            _ => None,
        });
        assert!(matches!(name, Cow::Borrowed("anch")));
    }

    /// Aliases scanned from `StrInput` must come back as `Cow::Borrowed`.
    #[test]
    fn alias_name_is_borrowed_for_str_input() {
        let name = first_match("*anch\n", |token| match token {
            TokenType::Alias(name) => Some(name),
            _ => None,
        });
        assert!(matches!(name, Cow::Borrowed("anch")));
    }

    /// The handle and prefix of a `%TAG` directive are borrowed when verbatim (no escapes).
    #[test]
    fn tag_directive_parts_are_borrowed_for_str_input() {
        let (handle, prefix) = first_match(
            "%TAG !e! tag:example.com,2000:app/\n",
            |token| match token {
                TokenType::TagDirective(handle, prefix) => Some((handle, prefix)),
                _ => None,
            },
        );
        assert!(matches!(handle, Cow::Borrowed("!e!")));
        assert!(matches!(prefix, Cow::Borrowed("tag:example.com,2000:app/")));
    }

    #[test]
    fn plain_scalar_is_borrowed_when_whitespace_free_for_str_input() {
        let value = first_scalar("foo\n");
        assert!(matches!(value, Cow::Borrowed("foo")));
    }

    #[test]
    fn plain_scalar_is_borrowed_when_whitespace_present_for_str_input() {
        let value = first_scalar("foo bar\n");
        assert!(matches!(value, Cow::Borrowed("foo bar")));
    }

    #[test]
    fn single_quoted_scalar_is_borrowed_when_verbatim_for_str_input() {
        let value = first_scalar("'foo bar'\n");
        assert!(matches!(value, Cow::Borrowed("foo bar")));
    }

    #[test]
    fn single_quoted_scalar_is_owned_when_quote_is_escaped_for_str_input() {
        let value = first_scalar("'foo''bar'\n");
        assert!(matches!(value, Cow::Owned(_)));
        assert_eq!(&*value, "foo'bar");
    }

    #[test]
    fn double_quoted_scalar_is_borrowed_when_verbatim_for_str_input() {
        let value = first_scalar("\"foo bar\"\n");
        assert!(matches!(value, Cow::Borrowed("foo bar")));
    }

    #[test]
    fn double_quoted_scalar_is_owned_when_escape_sequence_present_for_str_input() {
        let value = first_scalar("\"foo\\nbar\"\n");
        assert!(matches!(value, Cow::Owned(_)));
        assert_eq!(&*value, "foo\nbar");
    }

    #[test]
    fn plain_key_is_borrowed_for_str_input() {
        // Keys are just scalars in a key position; they should also be borrowed.
        let (found_key, key_value) = first_scalar_after_key("mykey: value\n");

        assert!(found_key, "expected to find a Key token");
        let key_value = key_value.expect("expected to find a scalar after Key token");
        assert!(
            matches!(key_value, Cow::Borrowed("mykey")),
            "key should be borrowed, got: {key_value:?}"
        );
    }

    #[test]
    fn quoted_key_is_borrowed_when_verbatim_for_str_input() {
        let (found_key, key_value) = first_scalar_after_key("\"mykey\": value\n");

        assert!(found_key, "expected to find a Key token");
        let key_value = key_value.expect("expected to find a scalar after Key token");
        assert!(
            matches!(key_value, Cow::Borrowed("mykey")),
            "quoted key should be borrowed when verbatim, got: {key_value:?}"
        );
    }

    #[test]
    fn tag_handle_and_suffix_are_borrowed_for_str_input() {
        // A tag like !!str should have handle="!!" and suffix="str".
        let (handle, suffix) = first_tag("!!str foo\n");
        assert!(
            matches!(handle, Cow::Borrowed("!!")),
            "tag handle should be borrowed, got: {handle:?}"
        );
        assert!(
            matches!(suffix, Cow::Borrowed("str")),
            "tag suffix should be borrowed, got: {suffix:?}"
        );
    }

    #[test]
    fn local_tag_suffix_is_borrowed_for_str_input() {
        // A local tag like !mytag should have handle="!" and suffix="mytag".
        let (handle, suffix) = first_tag("!mytag foo\n");
        assert!(
            matches!(handle, Cow::Borrowed("!")),
            "local tag handle should be '!', got: {handle:?}"
        );
        assert!(
            matches!(suffix, Cow::Borrowed("mytag")),
            "local tag suffix should be borrowed, got: {suffix:?}"
        );
    }

    #[test]
    fn tag_with_uri_escape_is_owned_for_str_input() {
        // A tag with a URI escape like !!my%20tag: the suffix must be owned due to decoding.
        let (handle, suffix) = first_tag("!!my%20tag foo\n");
        assert!(
            matches!(handle, Cow::Borrowed("!!")),
            "tag handle should still be borrowed, got: {handle:?}"
        );
        assert!(
            matches!(suffix, Cow::Owned(_)),
            "tag suffix with URI escape should be owned, got: {suffix:?}"
        );
        assert_eq!(&*suffix, "my tag");
    }
}
saphyr_parser_bw/scanner.rs

saphyr_parser_bw/
scanner.rs