saphyr_parser_bw/
scanner.rs

1//! Home to the YAML Scanner.
2//!
3//! The scanner is the lowest-level parsing utility. It is the lexer / tokenizer, reading input a
4//! character at a time and emitting tokens that can later be interpreted by the [`crate::parser`]
5//! to check for more context and validity.
6//!
7//! Due to the grammar of YAML, the scanner has to have some context and is not error-free.
8
9#![allow(clippy::cast_possible_wrap)]
10#![allow(clippy::cast_sign_loss)]
11
12use alloc::{
13    borrow::{Cow, ToOwned},
14    collections::VecDeque,
15    string::String,
16    vec::Vec,
17};
18use core::char;
19
20use thiserror::Error;
21
22use crate::{
23    char_traits::{
24        as_hex, is_anchor_char, is_blank_or_breakz, is_break, is_breakz, is_flow, is_hex,
25        is_tag_char, is_uri_char,
26    },
27    input::{BorrowedInput, SkipTabs},
28};
29
/// The encoding of the input. Currently, only UTF-8 is supported.
///
/// This value is carried by [`TokenType::StreamStart`].
#[derive(Clone, Copy, PartialEq, Debug, Eq)]
pub enum TEncoding {
    /// UTF-8 encoding.
    Utf8,
}
36
/// The style in which a scalar was written in the YAML document.
///
/// This value is carried alongside the scalar text by [`TokenType::Scalar`].
#[derive(Clone, Copy, PartialEq, Debug, Eq, Hash, PartialOrd, Ord)]
pub enum ScalarStyle {
    /// A YAML plain (unquoted) scalar.
    Plain,
    /// A YAML single quoted scalar.
    SingleQuoted,
    /// A YAML double quoted scalar.
    DoubleQuoted,

    /// A YAML literal block (`|` block).
    ///
    /// See [8.1.2](https://yaml.org/spec/1.2.2/#812-literal-style).
    /// In literal blocks, any indented character is content, including white space characters.
    /// There is no way to escape characters, nor to break a long line.
    Literal,
    /// A YAML folded block (`>` block).
    ///
    /// See [8.1.3](https://yaml.org/spec/1.2.2/#813-folded-style).
    /// In folded blocks, any indented character is content, including white space characters.
    /// There is no way to escape characters. Content is subject to line folding, which allows
    /// long lines to be broken.
    Folded,
}
61
/// Position counters backing a [`Marker`].
///
/// A YAML input is either a complete `&str` (stable backing storage) or a streaming character
/// source. A stable input lets us track a character index as well as a byte offset. A streaming
/// input generally has no useful byte offset (it may not map to any meaningful underlying
/// file/source), which is why the byte offset is optional.
#[derive(Clone, Copy, Debug, Default)]
pub struct MarkerOffsets {
    /// Character index into the source.
    chars: usize,
    /// Byte offset into the source, when the input is able to report one.
    bytes: Option<usize>,
}

impl PartialEq for MarkerOffsets {
    /// Compare two offsets by character position only.
    ///
    /// The byte offset is an optional diagnostic extra that may legitimately differ from one
    /// input backend to another (e.g., `&str` vs streaming), so it takes no part in equality.
    fn eq(&self, rhs: &Self) -> bool {
        // Destructure both sides so that `bytes` is visibly (and deliberately) left out.
        let (
            Self {
                chars: lhs_chars,
                bytes: _,
            },
            Self {
                chars: rhs_chars,
                bytes: _,
            },
        ) = (self, rhs);
        lhs_chars == rhs_chars
    }
}

impl Eq for MarkerOffsets {}
86
/// A location in a YAML document.
///
/// Equality is derived field by field; since [`MarkerOffsets`] compares by character index
/// only, two markers that differ solely in their byte offset are considered equal.
#[derive(Clone, Copy, PartialEq, Debug, Eq, Default)]
pub struct Marker {
    /// Offsets (character index and optional byte offset) in the source.
    offsets: MarkerOffsets,
    /// The line (1-indexed).
    line: usize,
    /// The column (0-indexed).
    col: usize,
}
97
98impl Marker {
99    /// Create a new [`Marker`] at the given position.
100    #[must_use]
101    pub fn new(index: usize, line: usize, col: usize) -> Marker {
102        Marker {
103            offsets: MarkerOffsets {
104                chars: index,
105                bytes: None,
106            },
107            line,
108            col,
109        }
110    }
111
112    /// Return a copy of the marker with the given optional byte offset.
113    #[must_use]
114    pub fn with_byte_offset(mut self, byte_offset: Option<usize>) -> Marker {
115        self.offsets.bytes = byte_offset;
116        self
117    }
118
119    /// Return the index (in characters) of the marker in the source.
120    #[must_use]
121    pub fn index(&self) -> usize {
122        self.offsets.chars
123    }
124
125    /// Return the byte offset of the marker in the source, if available.
126    #[must_use]
127    pub fn byte_offset(&self) -> Option<usize> {
128        self.offsets.bytes
129    }
130
131    /// Return the line of the marker in the source.
132    #[must_use]
133    pub fn line(&self) -> usize {
134        self.line
135    }
136
137    /// Return the column of the marker in the source.
138    #[must_use]
139    pub fn col(&self) -> usize {
140        self.col
141    }
142}
143
/// A range of locations in a YAML document.
///
/// Both bounds are public fields, so a span can be assembled or adjusted freely by callers.
#[derive(Clone, Copy, PartialEq, Debug, Eq, Default)]
pub struct Span {
    /// The start (inclusive) of the range.
    pub start: Marker,
    /// The end (exclusive) of the range.
    pub end: Marker,

    /// Optional indentation hint associated with this span.
    ///
    /// This is only meaningful for certain parser-emitted events (notably: block mapping keys).
    /// When indentation is not meaningful or cannot be provided, it must be `None`.
    pub indent: Option<usize>,
}
158
159impl Span {
160    /// Create a new [`Span`] for the given range.
161    #[must_use]
162    pub fn new(start: Marker, end: Marker) -> Span {
163        Span {
164            start,
165            end,
166            indent: None,
167        }
168    }
169
170    /// Create a empty [`Span`] at a given location.
171    ///
172    /// An empty span doesn't contain any characters, but its position may still be meaningful.
173    /// For example, for an indented sequence [`SequenceEnd`] has a location but an empty span.
174    ///
175    /// [`SequenceEnd`]: crate::Event::SequenceEnd
176    #[must_use]
177    pub fn empty(mark: Marker) -> Span {
178        Span {
179            start: mark,
180            end: mark,
181            indent: None,
182        }
183    }
184
185    /// Return a copy of this [`Span`] with the given indentation hint.
186    #[must_use]
187    pub fn with_indent(mut self, indent: Option<usize>) -> Span {
188        self.indent = indent;
189        self
190    }
191
192    /// Return the length of the span (in characters).
193    #[must_use]
194    pub fn len(&self) -> usize {
195        self.end.index() - self.start.index()
196    }
197
198    /// Return whether the [`Span`] has a length of zero.
199    #[must_use]
200    pub fn is_empty(&self) -> bool {
201        self.len() == 0
202    }
203
204    /// Return the byte range of the span, if available.
205    #[must_use]
206    pub fn byte_range(&self) -> Option<core::ops::Range<usize>> {
207        let start = self.start.byte_offset()?;
208        let end = self.end.byte_offset()?;
209        Some(start..end)
210    }
211}
212
/// An error that occurred while scanning.
///
/// The `Display` output has the shape `<info> at char <index> line <line> column <col + 1>`:
/// the column is printed shifted by one relative to [`Marker::col`], while the character index
/// and the line are printed as stored in the marker.
#[derive(Clone, PartialEq, Debug, Eq, Error)]
#[error(
    "{} at char {} line {} column {}",
    .info,
    .mark.index(),
    .mark.line(),
    .mark.col() + 1,
)]
pub struct ScanError {
    /// The position at which the error happened in the source.
    mark: Marker,
    /// Human-readable details about the error.
    info: String,
}
228
229impl ScanError {
230    /// Create a new error from a location and an error string.
231    #[must_use]
232    #[cold]
233    pub fn new(loc: Marker, info: String) -> ScanError {
234        ScanError { mark: loc, info }
235    }
236
237    /// Convenience alias for string slices.
238    #[must_use]
239    #[cold]
240    pub fn new_str(loc: Marker, info: &str) -> ScanError {
241        ScanError {
242            mark: loc,
243            info: info.to_owned(),
244        }
245    }
246
247    /// Return the marker pointing to the error in the source.
248    #[must_use]
249    pub fn marker(&self) -> &Marker {
250        &self.mark
251    }
252
253    /// Return the information string describing the error that happened.
254    #[must_use]
255    pub fn info(&self) -> &str {
256        self.info.as_ref()
257    }
258}
259
/// The contents of a scanner token.
///
/// String payloads are `Cow<'input, str>` so that they may either borrow from the input or own
/// their text.
#[derive(Clone, PartialEq, Debug, Eq)]
pub enum TokenType<'input> {
    /// The start of the stream. Sent first, before even [`TokenType::DocumentStart`].
    StreamStart(TEncoding),
    /// The end of the stream, EOF.
    StreamEnd,
    /// A YAML version directive (`%YAML <major>.<minor>`).
    VersionDirective(
        /// Major
        u32,
        /// Minor
        u32,
    ),
    /// A YAML tag directive (`%TAG <handle> <prefix>`, e.g.: `%TAG !e! tag:example.com,2000:`).
    TagDirective(
        /// Handle
        Cow<'input, str>,
        /// Prefix
        Cow<'input, str>,
    ),
    /// The start of a YAML document (`---`).
    DocumentStart,
    /// The end of a YAML document (`...`).
    DocumentEnd,
    /// The start of a block sequence.
    ///
    /// Block sequences are arrays whose entries start with a `-`.
    BlockSequenceStart,
    /// The start of a block mapping.
    ///
    /// Block mappings are "dictionaries" with "key: value" entries.
    BlockMappingStart,
    /// End of the corresponding `BlockSequenceStart` or `BlockMappingStart`.
    BlockEnd,
    /// Start of an inline sequence (`[ a, b ]`).
    FlowSequenceStart,
    /// End of an inline sequence.
    FlowSequenceEnd,
    /// Start of an inline mapping (`{ a: b, c: d }`).
    FlowMappingStart,
    /// End of an inline mapping.
    FlowMappingEnd,
    /// An entry in a block sequence (c.f.: [`TokenType::BlockSequenceStart`]).
    BlockEntry,
    /// An entry in a flow sequence (c.f.: [`TokenType::FlowSequenceStart`]).
    FlowEntry,
    /// A key in a mapping.
    Key,
    /// A value in a mapping.
    Value,
    /// A reference to an anchor (`*name`).
    Alias(Cow<'input, str>),
    /// A YAML anchor (`&name`).
    Anchor(Cow<'input, str>),
    /// A YAML tag (starting with bangs `!`, e.g.: `!!str`, `!foo!bar`, ...).
    Tag(
        /// The handle of the tag.
        Cow<'input, str>,
        /// The suffix of the tag.
        Cow<'input, str>,
    ),
    /// A regular YAML scalar, along with the style it was written in.
    Scalar(ScalarStyle, Cow<'input, str>),
    /// A reserved YAML directive (a directive that is neither `%YAML` nor `%TAG`).
    ReservedDirective(
        /// Name
        String,
        /// Parameters
        Vec<String>,
    ),
}
332
/// A scanner token.
///
/// The first field is the [`Span`] the token covers in the source, the second one is the
/// token's contents.
#[derive(Clone, PartialEq, Debug, Eq)]
pub struct Token<'input>(pub Span, pub TokenType<'input>);
336
/// A scalar that was parsed and may correspond to a simple key.
///
/// Upon scanning the following yaml:
/// ```yaml
/// a: b
/// ```
/// We do not know that `a` is a key for a map until we have reached the following `:`. For this
/// YAML, we would store `a` as a scalar token in the [`Scanner`], but not emit it yet. It would be
/// kept inside the scanner until more context is fetched and we are able to know whether it is a
/// plain scalar or a key.
///
/// For example, see the following 2 yaml documents:
/// ```yaml
/// ---
/// a: b # Here, `a` is a key.
/// ...
/// ---
/// a # Here, `a` is a plain scalar.
/// ...
/// ```
/// An instance of [`SimpleKey`] is created in the [`Scanner`] when such ambiguity occurs.
///
/// In both documents, scanning `a` would lead to the creation of a [`SimpleKey`] with
/// [`Self::possible`] set to `true`. The token for `a` would be pushed in the [`Scanner`] but not
/// yet emitted. Instead, more context would be fetched (through [`Scanner::fetch_more_tokens`]).
///
/// In the first document, upon reaching the `:`, the [`SimpleKey`] would be inspected and our
/// scalar `a`, since it is a possible key, would be "turned" into a key. This is done by
/// prepending a [`TokenType::Key`] to our scalar token in the [`Scanner`]. This way, the
/// [`crate::parser::Parser`] would read the [`TokenType::Key`] token before the
/// [`TokenType::Scalar`] token.
///
/// In the second document however, reaching the EOF would invalidate the [`SimpleKey`] and no
/// [`TokenType::Key`] would be emitted by the scanner.
#[derive(Clone, PartialEq, Debug, Eq)]
struct SimpleKey {
    /// Whether the token this [`SimpleKey`] refers to may still be a key.
    ///
    /// Sometimes, when we have more context, we notice that what we thought could be a key no
    /// longer can be. In that case, [`Self::possible`] is set to `false`.
    ///
    /// For instance, let us consider the following invalid YAML:
    /// ```yaml
    /// key
    ///   : value
    /// ```
    /// Upon reading the `\n` after `key`, the [`SimpleKey`] that was created for `key` is
    /// invalidated and [`Self::possible`] set to `false`.
    possible: bool,
    /// Whether the token this [`SimpleKey`] refers to is required to be a key.
    ///
    /// With more context, we may know for sure that the token must be a key. If the YAML is
    /// invalid, it may happen that the token is deemed not to be a key. In such an event, an
    /// error has to be raised. This boolean helps us know when to raise such an error.
    ///
    /// TODO(ethiraric, 30/12/2023): Example of when this happens.
    required: bool,
    /// The index of the token referred to by the [`SimpleKey`].
    ///
    /// This is the index in the scanner, which takes into account both the tokens that have been
    /// emitted and those about to be emitted. See [`Scanner::tokens_parsed`] and
    /// [`Scanner::tokens`] for more details.
    token_number: usize,
    /// The position at which the token the [`SimpleKey`] refers to is.
    mark: Marker,
}
403
404impl SimpleKey {
405    /// Create a new [`SimpleKey`] at the given `Marker` and with the given flow level.
406    fn new(mark: Marker) -> SimpleKey {
407        SimpleKey {
408            possible: false,
409            required: false,
410            token_number: 0,
411            mark,
412        }
413    }
414}
415
/// An indentation level on the stack of indentations.
#[derive(Clone, Debug, Default)]
struct Indent {
    /// The former indentation level.
    ///
    /// This is signed; the scanner itself starts out with an indentation of `-1`.
    indent: isize,
    /// Whether, upon closing, this indent generates a `BlockEnd` token.
    ///
    /// There are levels of indentation which do not start a block. Examples of this would be:
    /// ```yaml
    /// -
    ///   foo # ok
    /// -
    /// bar # ko, bar needs to be indented further than the `-`.
    /// - [
    ///  baz, # ok
    /// quux # ko, quux needs to be indented further than the '-'.
    /// ] # ko, the closing bracket needs to be indented further than the `-`.
    /// ```
    ///
    /// The indentation level created by the `-` is for a single entry in the sequence. Emitting a
    /// `BlockEnd` when this indentation block ends would generate one `BlockEnd` per entry in the
    /// sequence, although we must have exactly one to end the sequence.
    needs_block_end: bool,
}
440
/// The knowledge we have about an implicit mapping.
///
/// Implicit mappings occur in flow sequences where the opening `{` for a mapping in a flow
/// sequence is omitted:
/// ```yaml
/// [ a: b, c: d ]
/// # Equivalent to
/// [ { a: b }, { c: d } ]
/// # Equivalent to
/// - a: b
/// - c: d
/// ```
///
/// The state must be carefully tracked for each nested flow sequence since we must emit a
/// [`FlowMappingStart`] event when encountering `a` and `c` in our previous example without a
/// character hinting us. Similarly, we must emit a [`FlowMappingEnd`] event when we reach the `,`
/// or the `]`. If the state is not properly tracked, we may fail to emit these events or emit
/// them out-of-order.
///
/// [`FlowMappingStart`]: TokenType::FlowMappingStart
/// [`FlowMappingEnd`]: TokenType::FlowMappingEnd
#[derive(Debug, PartialEq)]
enum ImplicitMappingState {
    /// It is possible there is an implicit mapping.
    ///
    /// This state is the one when we have just encountered the opening `[`. We need more context
    /// to know whether an implicit mapping follows.
    Possible,
    /// We are inside the implicit mapping.
    ///
    /// Note that this state is not set immediately (we need to have encountered the `:` to know).
    ///
    /// NOTE(review): the meaning of the `u8` payload is not visible from this definition —
    /// presumably a flow nesting level (the scanner's `flow_level` is also a `u8`); confirm
    /// against the code that constructs this variant.
    Inside(u8),
}
474
/// The YAML scanner.
///
/// This corresponds to the low-level interface when reading YAML. The scanner emits tokens as
/// they are read (akin to a lexer), but it also holds sufficient context to be able to
/// disambiguate some of the constructs. It has understanding of indentation and whitespace and is
/// able to generate error messages for some invalid YAML constructs.
///
/// It is however not a full parser and needs [`crate::parser::Parser`] to fully detect invalid
/// YAML documents.
#[derive(Debug)]
#[allow(clippy::struct_excessive_bools)]
pub struct Scanner<'input, T> {
    /// The input source.
    ///
    /// The scanner's methods and its `Iterator` implementation require
    /// `T: BorrowedInput<'input>`.
    input: T,
    /// The position of the cursor within the reader.
    mark: Marker,
    /// Buffer for tokens to be returned.
    ///
    /// This buffer can hold some temporary tokens that are not yet ready to be returned. For
    /// instance, if we just read a scalar, it can be a value or a key if an implicit mapping
    /// follows. In this case, the token stays in the `VecDeque` but cannot be returned from
    /// [`Self::next`] until we have more context.
    tokens: VecDeque<Token<'input>>,
    /// The last error that happened.
    ///
    /// Once this is set, [`Self::next`] only ever returns `None`.
    error: Option<ScanError>,

    /// Whether we have already emitted the `StreamStart` token.
    stream_start_produced: bool,
    /// Whether we have already emitted the `StreamEnd` token.
    stream_end_produced: bool,
    /// In some flow contexts, the value of a mapping is allowed to be adjacent to the `:`. When it
    /// is, the index at which the `:` may be must be stored in `adjacent_value_allowed_at`.
    adjacent_value_allowed_at: usize,
    /// Whether a simple key could potentially start at the current position.
    ///
    /// Simple keys are the opposite of complex keys which are keys starting with `?`.
    simple_key_allowed: bool,
    /// A stack of potential simple keys.
    ///
    /// Refer to the documentation of [`SimpleKey`] for a more in-depth explanation of what they
    /// are.
    simple_keys: smallvec::SmallVec<[SimpleKey; 8]>,
    /// The current indentation level.
    ///
    /// A freshly created scanner starts at `-1`.
    indent: isize,
    /// List of all block indentation levels we are in (except the current one).
    indents: smallvec::SmallVec<[Indent; 8]>,
    /// Level of nesting of flow sequences.
    flow_level: u8,
    /// The number of tokens that have been returned from the scanner.
    ///
    /// This excludes the tokens from [`Self::tokens`].
    tokens_parsed: usize,
    /// Whether a token is ready to be taken from [`Self::tokens`].
    token_available: bool,
    /// Whether all characters encountered since the last newline were whitespace.
    leading_whitespace: bool,
    /// Whether we started a flow mapping.
    ///
    /// This is used to detect implicit flow mapping starts such as:
    /// ```yaml
    /// [ : foo ] # { null: "foo" }
    /// ```
    flow_mapping_started: bool,
    /// An array of states, representing whether flow sequences have implicit mappings.
    ///
    /// When a flow mapping is possible (when encountering the first `[` or a `,` in a sequence),
    /// the state is set to [`Possible`].
    /// When we encounter the `:`, we know we are in an implicit mapping and can set the state to
    /// [`Inside`].
    ///
    /// There is one entry in this vector for each nested flow sequence that we are in.
    /// The entries are created with the opening `[` and popped with the closing `]`.
    ///
    /// [`Possible`]: ImplicitMappingState::Possible
    /// [`Inside`]: ImplicitMappingState::Inside
    implicit_flow_mapping_states: smallvec::SmallVec<[ImplicitMappingState; 8]>,
    /// If a plain scalar was terminated by a `#` comment on its line, we set this
    /// to detect an illegal multiline continuation on the following line.
    interrupted_plain_by_comment: Option<Marker>,
    /// A stack of markers for opening brackets `[` and `{`.
    ///
    /// Each entry pairs the position of the bracket with the bracket character itself.
    flow_markers: smallvec::SmallVec<[(Marker, char); 8]>,
    // NOTE(review): the three buffers below are pre-allocated by `Scanner::new`, but their use
    // is not visible in this part of the file — presumably scratch buffers reused while scanning
    // scalars (leading line break, trailing line breaks, whitespace run); confirm against the
    // scalar-scanning methods.
    buf_leading_break: String,
    buf_trailing_breaks: String,
    buf_whitespaces: String,
}
562
563impl<'input, T: BorrowedInput<'input>> Iterator for Scanner<'input, T> {
564    type Item = Token<'input>;
565
566    fn next(&mut self) -> Option<Self::Item> {
567        if self.error.is_some() {
568            return None;
569        }
570        match self.next_token() {
571            Ok(Some(tok)) => {
572                debug_print!(
573                    "    \x1B[;32m\u{21B3} {:?} \x1B[;36m{:?}\x1B[;m",
574                    tok.1,
575                    tok.0
576                );
577                Some(tok)
578            }
579            Ok(tok) => tok,
580            Err(e) => {
581                self.error = Some(e);
582                None
583            }
584        }
585    }
586}
587
/// A convenience alias for scanner functions that may fail without returning a value.
///
/// On success there is nothing to return (`()`); on failure a [`ScanError`] describes what went
/// wrong and where.
pub type ScanResult = Result<(), ScanError>;
590
/// Accumulator for the text of a flow scalar, either as offsets into the input or as an owned
/// string.
#[derive(Debug)]
enum FlowScalarBuf {
    /// The scalar is still a candidate for `Cow::Borrowed`.
    ///
    /// `start..end` is the verbatim range committed so far.
    /// `pending_ws_start..pending_ws_end` is a run of blanks that has been seen but is not part
    /// of the committed range yet (it has to be dropped if a line break follows it).
    Borrowed {
        start: usize,
        end: usize,
        pending_ws_start: Option<usize>,
        pending_ws_end: usize,
    },
    /// The scalar text had to be copied.
    Owned(String),
}

impl FlowScalarBuf {
    /// Start a borrowed candidate whose committed range is empty and begins at `start`.
    #[inline]
    fn new_borrowed(start: usize) -> Self {
        let (end, pending_ws_end) = (start, start);
        Self::Borrowed {
            start,
            end,
            pending_ws_start: None,
            pending_ws_end,
        }
    }

    /// Start an owned buffer holding no text.
    #[inline]
    fn new_owned() -> Self {
        Self::Owned(String::new())
    }

    /// Give mutable access to the owned string, or `None` while the buffer is still borrowed.
    #[inline]
    fn as_owned_mut(&mut self) -> Option<&mut String> {
        if let Self::Owned(text) = self {
            Some(text)
        } else {
            None
        }
    }

    /// Extend the committed range over the pending blanks, if there are any.
    ///
    /// Does nothing on an owned buffer.
    #[inline]
    fn commit_pending_ws(&mut self) {
        let Self::Borrowed {
            end,
            pending_ws_start,
            pending_ws_end,
            ..
        } = self
        else {
            return;
        };

        // `take()` both tests for a pending run and clears it.
        if pending_ws_start.take().is_some() {
            *end = *pending_ws_end;
        }
    }

    /// Record that the blanks in `ws_start..ws_end` were seen.
    ///
    /// The start of the pending run is only set by the first call after a commit / discard;
    /// the end of the run is updated by every call. Does nothing on an owned buffer.
    #[inline]
    fn note_pending_ws(&mut self, ws_start: usize, ws_end: usize) {
        let Self::Borrowed {
            pending_ws_start,
            pending_ws_end,
            ..
        } = self
        else {
            return;
        };

        pending_ws_start.get_or_insert(ws_start);
        *pending_ws_end = ws_end;
    }

    /// Forget the pending blanks, leaving the committed range untouched.
    ///
    /// Does nothing on an owned buffer.
    #[inline]
    fn discard_pending_ws(&mut self) {
        match self {
            Self::Owned(_) => {}
            Self::Borrowed {
                end,
                pending_ws_start,
                pending_ws_end,
                ..
            } => {
                *pending_ws_end = *end;
                *pending_ws_start = None;
            }
        }
    }
}
676
677impl<'input, T: BorrowedInput<'input>> Scanner<'input, T> {
678    #[inline]
679    fn promote_flow_scalar_buf_to_owned(
680        &self,
681        start_mark: &Marker,
682        buf: &mut FlowScalarBuf,
683    ) -> Result<(), ScanError> {
684        let FlowScalarBuf::Borrowed {
685            start,
686            end,
687            pending_ws_start: _,
688            pending_ws_end: _,
689        } = *buf
690        else {
691            return Ok(());
692        };
693
694        let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
695            ScanError::new_str(
696                *start_mark,
697                "internal error: input advertised offsets but did not provide a slice",
698            )
699        })?;
700        *buf = FlowScalarBuf::Owned(slice.to_owned());
701        Ok(())
702    }
    /// Try to borrow a slice from the underlying input.
    ///
    /// This method uses the [`BorrowedInput`] trait to safely obtain a slice with the `'input`
    /// lifetime. For inputs that support zero-copy slicing (like `StrInput`), this returns
    /// `Some(&'input str)`. For streaming inputs, this returns `None`.
    ///
    /// `start` and `end` are forwarded unchanged to the input's `slice_borrowed`; callers in
    /// this file pass values obtained from the input's `byte_offset()`.
    #[inline]
    fn try_borrow_slice(&self, start: usize, end: usize) -> Option<&'input str> {
        self.input.slice_borrowed(start, end)
    }
712
713    /// Scan a tag handle for a `%TAG` directive as a `Cow<str>`.
714    ///
715    /// For `StrInput`, this will borrow from the input when possible. For other inputs, or if
716    /// borrowing is not possible, it falls back to allocating.
717    fn scan_tag_handle_directive_cow(
718        &mut self,
719        mark: &Marker,
720    ) -> Result<Cow<'input, str>, ScanError> {
721        let Some(start) = self.input.byte_offset() else {
722            return Ok(Cow::Owned(self.scan_tag_handle(true, mark)?));
723        };
724
725        if self.input.look_ch() != '!' {
726            return Err(ScanError::new_str(
727                *mark,
728                "while scanning a tag, did not find expected '!'",
729            ));
730        }
731
732        // Consume the leading '!'.
733        self.skip_non_blank();
734
735        // Consume ns-word-char (ASCII alphanumeric, '_' or '-') characters.
736        // This mirrors `StrInput::fetch_while_is_alpha` but avoids allocation.
737        self.input.lookahead(1);
738        while self.input.next_is_alpha() {
739            self.skip_non_blank();
740            self.input.lookahead(1);
741        }
742
743        // Optional trailing '!'.
744        if self.input.peek() == '!' {
745            self.skip_non_blank();
746        }
747
748        let Some(end) = self.input.byte_offset() else {
749            // Should be impossible if `byte_offset()` was `Some` above, but keep safe fallback.
750            return Ok(Cow::Owned(self.scan_tag_handle(true, mark)?));
751        };
752
753        let Some(slice) = self.try_borrow_slice(start, end) else {
754            // Fall back to allocating if zero-copy borrow is not available.
755            let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
756                ScanError::new_str(
757                    *mark,
758                    "internal error: input advertised slicing but did not provide a slice",
759                )
760            })?;
761            if !slice.ends_with('!') && slice != "!" {
762                return Err(ScanError::new_str(
763                    *mark,
764                    "while parsing a tag directive, did not find expected '!'",
765                ));
766            }
767            return Ok(Cow::Owned(slice.to_owned()));
768        };
769
770        if !slice.ends_with('!') && slice != "!" {
771            return Err(ScanError::new_str(
772                *mark,
773                "while parsing a tag directive, did not find expected '!'",
774            ));
775        }
776
777        Ok(Cow::Borrowed(slice))
778    }
779
780    /// Scan a tag prefix for a `%TAG` directive as a `Cow<str>`.
781    ///
782    /// This borrows from `StrInput` only when no URI escape sequences are encountered. If a `%`
783    /// escape is present, the prefix must be decoded and therefore allocated.
784    fn scan_tag_prefix_directive_cow(
785        &mut self,
786        start_mark: &Marker,
787    ) -> Result<Cow<'input, str>, ScanError> {
788        let Some(start) = self.input.byte_offset() else {
789            return Ok(Cow::Owned(self.scan_tag_prefix(start_mark)?));
790        };
791
792        // The prefix must start with either '!' (local) or a valid global tag char.
793        if self.input.look_ch() == '!' {
794            self.skip_non_blank();
795        } else if !is_tag_char(self.input.peek()) {
796            return Err(ScanError::new_str(
797                *start_mark,
798                "invalid global tag character",
799            ));
800        } else if self.input.peek() == '%' {
801            // Needs decoding. Fall back to allocating path below.
802        } else {
803            self.skip_non_blank();
804        }
805
806        // Consume URI chars while we can stay in the borrowed path.
807        while is_uri_char(self.input.look_ch()) {
808            if self.input.peek() == '%' {
809                break;
810            }
811            self.skip_non_blank();
812        }
813
814        // If we encountered an escape sequence, we must decode, therefore allocate.
815        if self.input.peek() == '%' {
816            let current = self
817                .input
818                .byte_offset()
819                .expect("byte_offset() must remain available once enabled");
820            let mut out = if let Some(slice) = self.input.slice_bytes(start, current) {
821                slice.to_owned()
822            } else {
823                String::new()
824            };
825
826            while is_uri_char(self.input.look_ch()) {
827                if self.input.peek() == '%' {
828                    out.push(self.scan_uri_escapes(start_mark)?);
829                } else {
830                    out.push(self.input.peek());
831                    self.skip_non_blank();
832                }
833            }
834            return Ok(Cow::Owned(out));
835        }
836
837        let Some(end) = self.input.byte_offset() else {
838            return Ok(Cow::Owned(self.scan_tag_prefix(start_mark)?));
839        };
840
841        let Some(slice) = self.try_borrow_slice(start, end) else {
842            // Fall back to allocating if zero-copy borrow is not available.
843            let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
844                ScanError::new_str(
845                    *start_mark,
846                    "internal error: input advertised slicing but did not provide a slice",
847                )
848            })?;
849            return Ok(Cow::Owned(slice.to_owned()));
850        };
851
852        Ok(Cow::Borrowed(slice))
853    }
854    /// Creates the YAML tokenizer.
855    pub fn new(input: T) -> Self {
856        let initial_byte_offset = input.byte_offset();
857        Scanner {
858            input,
859            mark: Marker::new(0, 1, 0).with_byte_offset(initial_byte_offset),
860            tokens: VecDeque::with_capacity(64),
861            error: None,
862
863            stream_start_produced: false,
864            stream_end_produced: false,
865            adjacent_value_allowed_at: 0,
866            simple_key_allowed: true,
867            simple_keys: smallvec::SmallVec::new(),
868            indent: -1,
869            indents: smallvec::SmallVec::new(),
870            flow_level: 0,
871            tokens_parsed: 0,
872            token_available: false,
873            leading_whitespace: true,
874            flow_mapping_started: false,
875            implicit_flow_mapping_states: smallvec::SmallVec::new(),
876            flow_markers: smallvec::SmallVec::new(),
877            interrupted_plain_by_comment: None,
878
879            buf_leading_break: String::with_capacity(128),
880            buf_trailing_breaks: String::with_capacity(128),
881            buf_whitespaces: String::with_capacity(128),
882        }
883    }
884
885    /// Get a copy of the last error that was encountered, if any.
886    ///
887    /// This does not clear the error state and further calls to [`Self::get_error`] will return (a
888    /// clone of) the same error.
889    #[inline]
890    pub fn get_error(&self) -> Option<ScanError> {
891        self.error.clone()
892    }
893
894    #[cold]
895    fn simple_key_expected(&self) -> ScanError {
896        ScanError::new_str(self.mark, "simple key expected")
897    }
898
899    #[cold]
900    fn unclosed_bracket(mark: Marker, bracket: char) -> ScanError {
901        ScanError::new(mark, format!("unclosed bracket '{bracket}'"))
902    }
903
904    /// Consume the next character. It is assumed the next character is a blank.
905    #[inline]
906    fn skip_blank(&mut self) {
907        self.input.skip();
908
909        self.mark.offsets.chars += 1;
910        self.mark.col += 1;
911        self.mark.offsets.bytes = self.input.byte_offset();
912    }
913
914    /// Consume the next character. It is assumed the next character is not a blank.
915    #[inline]
916    fn skip_non_blank(&mut self) {
917        self.input.skip();
918
919        self.mark.offsets.chars += 1;
920        self.mark.col += 1;
921        self.mark.offsets.bytes = self.input.byte_offset();
922        self.leading_whitespace = false;
923    }
924
925    /// Consume the next characters. It is assumed none of the next characters are blanks.
926    #[inline]
927    fn skip_n_non_blank(&mut self, count: usize) {
928        for _ in 0..count {
929            self.input.skip();
930            self.mark.offsets.chars += 1;
931            self.mark.col += 1;
932        }
933        self.mark.offsets.bytes = self.input.byte_offset();
934        self.leading_whitespace = false;
935    }
936
937    /// Consume the next character. It is assumed the next character is a newline.
938    #[inline]
939    fn skip_nl(&mut self) {
940        self.input.skip();
941
942        self.mark.offsets.chars += 1;
943        self.mark.col = 0;
944        self.mark.line += 1;
945        self.mark.offsets.bytes = self.input.byte_offset();
946        self.leading_whitespace = true;
947    }
948
949    /// Consume a linebreak (either CR, LF or CRLF), if any. Do nothing if there's none.
950    #[inline]
951    fn skip_linebreak(&mut self) {
952        if self.input.next_2_are('\r', '\n') {
953            // While technically not a blank, this does not matter as `self.leading_whitespace`
954            // will be reset by `skip_nl`.
955            self.skip_blank();
956            self.skip_nl();
957        } else if self.input.next_is_break() {
958            self.skip_nl();
959        }
960    }
961
962    /// Return whether the [`TokenType::StreamStart`] event has been emitted.
963    #[inline]
964    pub fn stream_started(&self) -> bool {
965        self.stream_start_produced
966    }
967
968    /// Return whether the [`TokenType::StreamEnd`] event has been emitted.
969    #[inline]
970    pub fn stream_ended(&self) -> bool {
971        self.stream_end_produced
972    }
973
974    /// Get the current position in the input stream.
975    #[inline]
976    pub fn mark(&self) -> Marker {
977        self.mark
978    }
979
980    // Read and consume a line break (either `\r`, `\n` or `\r\n`).
981    //
982    // A `\n` is pushed into `s`.
983    //
984    // # Panics (in debug)
985    // If the next characters do not correspond to a line break.
986    #[inline]
987    fn read_break(&mut self, s: &mut String) {
988        self.skip_break();
989        s.push('\n');
990    }
991
992    // Read and consume a line break (either `\r`, `\n` or `\r\n`).
993    //
994    // # Panics (in debug)
995    // If the next characters do not correspond to a line break.
996    #[inline]
997    fn skip_break(&mut self) {
998        let c = self.input.peek();
999        let nc = self.input.peek_nth(1);
1000        debug_assert!(is_break(c));
1001        if c == '\r' && nc == '\n' {
1002            self.skip_blank();
1003        }
1004        self.skip_nl();
1005    }
1006
1007    /// Insert a token at the given position.
1008    fn insert_token(&mut self, pos: usize, tok: Token<'input>) {
1009        let old_len = self.tokens.len();
1010        assert!(pos <= old_len);
1011        self.tokens.insert(pos, tok);
1012    }
1013
1014    #[inline]
1015    fn allow_simple_key(&mut self) {
1016        self.simple_key_allowed = true;
1017    }
1018
1019    #[inline]
1020    fn disallow_simple_key(&mut self) {
1021        self.simple_key_allowed = false;
1022    }
1023
1024    /// Fetch the next token in the stream.
1025    ///
1026    /// # Errors
1027    /// Returns `ScanError` when the scanner does not find the next expected token.
1028    pub fn fetch_next_token(&mut self) -> ScanResult {
1029        self.input.lookahead(1);
1030
1031        if !self.stream_start_produced {
1032            self.fetch_stream_start();
1033            return Ok(());
1034        }
1035        self.skip_to_next_token()?;
1036
1037        debug_print!(
1038            "  \x1B[38;5;244m\u{2192} fetch_next_token after whitespace {:?} {:?}\x1B[m",
1039            self.mark,
1040            self.input.peek()
1041        );
1042
1043        self.stale_simple_keys()?;
1044
1045        let mark = self.mark;
1046        self.unroll_indent(mark.col as isize);
1047
1048        self.input.lookahead(4);
1049
1050        if self.input.next_is_z() {
1051            self.fetch_stream_end()?;
1052            return Ok(());
1053        }
1054
1055        if self.mark.col == 0 {
1056            if self.input.next_char_is('%') {
1057                return self.fetch_directive();
1058            } else if self.input.next_is_document_start() {
1059                return self.fetch_document_indicator(TokenType::DocumentStart);
1060            } else if self.input.next_is_document_end() {
1061                self.fetch_document_indicator(TokenType::DocumentEnd)?;
1062                self.skip_ws_to_eol(SkipTabs::Yes)?;
1063                if !self.input.next_is_breakz() {
1064                    return Err(ScanError::new_str(
1065                        self.mark,
1066                        "invalid content after document end marker",
1067                    ));
1068                }
1069                return Ok(());
1070            }
1071        }
1072
1073        if (self.mark.col as isize) < self.indent {
1074            self.input.lookahead(1);
1075            let c = self.input.peek();
1076            if self.flow_level == 0 || !matches!(c, ']' | '}' | ',') {
1077                return Err(ScanError::new_str(self.mark, "invalid indentation"));
1078            }
1079        }
1080
1081        let c = self.input.peek();
1082        let nc = self.input.peek_nth(1);
1083        match c {
1084            '[' => self.fetch_flow_collection_start(TokenType::FlowSequenceStart),
1085            '{' => self.fetch_flow_collection_start(TokenType::FlowMappingStart),
1086            ']' => self.fetch_flow_collection_end(TokenType::FlowSequenceEnd),
1087            '}' => self.fetch_flow_collection_end(TokenType::FlowMappingEnd),
1088            ',' => self.fetch_flow_entry(),
1089            '-' if is_blank_or_breakz(nc) => self.fetch_block_entry(),
1090            '?' if is_blank_or_breakz(nc) => self.fetch_key(),
1091            ':' if is_blank_or_breakz(nc) => self.fetch_value(),
1092            ':' if self.flow_level > 0
1093                && (is_flow(nc) || self.mark.index() == self.adjacent_value_allowed_at) =>
1094            {
1095                self.fetch_flow_value()
1096            }
1097            // Is it an alias?
1098            '*' => self.fetch_anchor(true),
1099            // Is it an anchor?
1100            '&' => self.fetch_anchor(false),
1101            '!' => self.fetch_tag(),
1102            // Is it a literal scalar?
1103            '|' if self.flow_level == 0 => self.fetch_block_scalar(true),
1104            // Is it a folded scalar?
1105            '>' if self.flow_level == 0 => self.fetch_block_scalar(false),
1106            '\'' => self.fetch_flow_scalar(true),
1107            '"' => self.fetch_flow_scalar(false),
1108            // plain scalar
1109            '-' if !is_blank_or_breakz(nc) => self.fetch_plain_scalar(),
1110            ':' | '?' if !is_blank_or_breakz(nc) && self.flow_level == 0 => {
1111                self.fetch_plain_scalar()
1112            }
1113            '%' | '@' | '`' => Err(ScanError::new(
1114                self.mark,
1115                format!("unexpected character: `{c}'"),
1116            )),
1117            _ => self.fetch_plain_scalar(),
1118        }
1119    }
1120
1121    /// Return the next token in the stream.
1122    /// # Errors
1123    /// Returns `ScanError` when scanning fails to find an expected next token.
1124    pub fn next_token(&mut self) -> Result<Option<Token<'input>>, ScanError> {
1125        if self.stream_end_produced {
1126            return Ok(None);
1127        }
1128
1129        if !self.token_available {
1130            self.fetch_more_tokens()?;
1131        }
1132        let Some(t) = self.tokens.pop_front() else {
1133            return Err(ScanError::new_str(
1134                self.mark,
1135                "did not find expected next token",
1136            ));
1137        };
1138        self.token_available = false;
1139        self.tokens_parsed += 1;
1140
1141        if let TokenType::StreamEnd = t.1 {
1142            self.stream_end_produced = true;
1143        }
1144        Ok(Some(t))
1145    }
1146
1147    /// Fetch tokens from the token stream.
1148    /// # Errors
1149    /// Returns `ScanError` when loading fails.
1150    pub fn fetch_more_tokens(&mut self) -> ScanResult {
1151        let mut need_more;
1152        loop {
1153            if self.tokens.is_empty() {
1154                need_more = true;
1155            } else {
1156                need_more = false;
1157                // Stale potential keys that we know won't be keys.
1158                self.stale_simple_keys()?;
1159                // If our next token to be emitted may be a key, fetch more context.
1160                for sk in &self.simple_keys {
1161                    if sk.possible && sk.token_number == self.tokens_parsed {
1162                        need_more = true;
1163                        break;
1164                    }
1165                }
1166            }
1167
1168            // Stop fetching immediately after document end/start markers
1169            // to allow the parser to emit the event before reading more content.
1170            if let Some(token) = self.tokens.back() {
1171                if matches!(token.1, TokenType::DocumentEnd | TokenType::DocumentStart) {
1172                    break;
1173                }
1174            }
1175
1176            if !need_more {
1177                break;
1178            }
1179            self.fetch_next_token()?;
1180        }
1181        self.token_available = true;
1182
1183        Ok(())
1184    }
1185
1186    /// Mark simple keys that can no longer be keys as such.
1187    ///
1188    /// This function sets `possible` to `false` to each key that, now we have more context, we
1189    /// know will not be keys.
1190    ///
1191    /// # Errors
1192    /// This function returns an error if one of the key we would stale was required to be a key.
1193    fn stale_simple_keys(&mut self) -> ScanResult {
1194        for sk in &mut self.simple_keys {
1195            if sk.possible
1196                // If not in a flow construct, simple keys cannot span multiple lines.
1197                && self.flow_level == 0
1198                    && (sk.mark.line < self.mark.line
1199                        || sk.mark.index() + 1024 < self.mark.index())
1200            {
1201                if sk.required {
1202                    return Err(ScanError::new_str(self.mark, "simple key expect ':'"));
1203                }
1204                sk.possible = false;
1205            }
1206        }
1207        Ok(())
1208    }
1209
1210    /// Skip over all whitespace (`\t`, ` `, `\n`, `\r`) and comments until the next token.
1211    ///
1212    /// # Errors
1213    /// This function returns an error if a tabulation is encountered where there should not be
1214    /// one.
1215    fn skip_to_next_token(&mut self) -> ScanResult {
1216        // Hot-path helper: consume a single logical linebreak and apply simple-key rules.
1217        // (Kept local to ensure the compiler can inline it easily.)
1218        let consume_linebreak = |this: &mut Self| {
1219            this.input.lookahead(2);
1220            this.skip_linebreak();
1221            if this.flow_level == 0 {
1222                this.allow_simple_key();
1223            }
1224        };
1225
1226        loop {
1227            match self.input.look_ch() {
1228                // Tabs may not be used as indentation (block context only).
1229                '\t' => {
1230                    if self.is_within_block()
1231                        && self.leading_whitespace
1232                        && (self.mark.col as isize) < self.indent
1233                    {
1234                        self.skip_ws_to_eol(SkipTabs::Yes)?;
1235
1236                        // If we have content on that line with a tab, return an error.
1237                        if !self.input.next_is_breakz() {
1238                            return Err(ScanError::new_str(
1239                                self.mark,
1240                                "tabs disallowed within this context (block indentation)",
1241                            ));
1242                        }
1243
1244                        // Micro-opt: if we stopped on a linebreak, consume it now (avoids another loop trip).
1245                        if matches!(self.input.look_ch(), '\n' | '\r') {
1246                            consume_linebreak(self);
1247                        }
1248                    } else {
1249                        // Non-indentation tab behaves like blank.
1250                        self.skip_blank();
1251                    }
1252                }
1253
1254                ' ' => self.skip_blank(),
1255
1256                '\n' | '\r' => consume_linebreak(self),
1257
1258                '#' => {
1259                    // Skip the whole comment payload in one go.
1260                    let n = self.input.skip_while_non_breakz();
1261                    self.mark.offsets.chars += n;
1262                    self.mark.col += n;
1263                    self.mark.offsets.bytes = self.input.byte_offset();
1264
1265                    // Micro-opt: comment-only lines are common; consume the following linebreak here.
1266                    if matches!(self.input.look_ch(), '\n' | '\r') {
1267                        consume_linebreak(self);
1268                    }
1269                }
1270
1271                _ => break,
1272            }
1273        }
1274
1275        // If a plain scalar was interrupted by a comment, and the next line could
1276        // continue the scalar in block context, this is invalid.
1277        if let Some(err_mark) = self.interrupted_plain_by_comment.take() {
1278            // BS4K should only trigger when the continuation would start on the immediate next
1279            // line (no intervening empty/comment-only lines). A blank line resets the folding
1280            // opportunity and thus should not error.
1281            let is_immediate_next_line = self.mark.line == err_mark.line + 1;
1282
1283            // Optimization: do the cheap checks first; only then request extra lookahead / do deeper checks.
1284            if self.flow_level == 0
1285                && is_immediate_next_line
1286                && (self.mark.col as isize) > self.indent
1287            {
1288                // Ensure enough lookahead for:
1289                // - the checks below (peek/peek_nth)
1290                // - document indicator detection which needs 4 chars.
1291                self.input.lookahead(4);
1292
1293                if !self.input.next_is_z()
1294                    && !self.input.next_is_document_indicator()
1295                    && self.input.next_can_be_plain_scalar(false)
1296                {
1297                    return Err(ScanError::new_str(
1298                        err_mark,
1299                        "comment intercepting the multiline text",
1300                    ));
1301                }
1302            }
1303        }
1304
1305        Ok(())
1306    }
1307
1308    /// Skip over YAML whitespace (` `, `\n`, `\r`).
1309    ///
1310    /// # Errors
1311    /// This function returns an error if no whitespace was found.
1312    fn skip_yaml_whitespace(&mut self) -> ScanResult {
1313        let mut need_whitespace = true;
1314        loop {
1315            match self.input.look_ch() {
1316                ' ' => {
1317                    self.skip_blank();
1318
1319                    need_whitespace = false;
1320                }
1321                '\n' | '\r' => {
1322                    self.input.lookahead(2);
1323                    self.skip_linebreak();
1324                    if self.flow_level == 0 {
1325                        self.allow_simple_key();
1326                    }
1327                    need_whitespace = false;
1328                }
1329                '#' => {
1330                    let comment_length = self.input.skip_while_non_breakz();
1331                    self.mark.offsets.chars += comment_length;
1332                    self.mark.col += comment_length;
1333                    self.mark.offsets.bytes = self.input.byte_offset();
1334                }
1335                _ => break,
1336            }
1337        }
1338
1339        if need_whitespace {
1340            Err(ScanError::new_str(self.mark(), "expected whitespace"))
1341        } else {
1342            Ok(())
1343        }
1344    }
1345
1346    fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> Result<SkipTabs, ScanError> {
1347        let (n_bytes, result) = self.input.skip_ws_to_eol(skip_tabs);
1348        self.mark.col += n_bytes;
1349        self.mark.offsets.chars += n_bytes;
1350        self.mark.offsets.bytes = self.input.byte_offset();
1351        result.map_err(|msg| ScanError::new_str(self.mark, msg))
1352    }
1353
1354    fn fetch_stream_start(&mut self) {
1355        let mark = self.mark;
1356        self.indent = -1;
1357        self.stream_start_produced = true;
1358        self.allow_simple_key();
1359        self.tokens.push_back(Token(
1360            Span::empty(mark),
1361            TokenType::StreamStart(TEncoding::Utf8),
1362        ));
1363        self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0)));
1364    }
1365
1366    fn fetch_stream_end(&mut self) -> ScanResult {
1367        // force new line
1368        if self.mark.col != 0 {
1369            self.mark.col = 0;
1370            self.mark.line += 1;
1371        }
1372
1373        if let Some((mark, bracket)) = self.flow_markers.pop() {
1374            return Err(Self::unclosed_bracket(mark, bracket));
1375        }
1376
1377        // If the stream ended, we won't have more context. We can stall all the simple keys we
1378        // had. If one was required, however, that was an error and we must propagate it.
1379        for sk in &mut self.simple_keys {
1380            if sk.required && sk.possible {
1381                return Err(self.simple_key_expected());
1382            }
1383            sk.possible = false;
1384        }
1385
1386        self.unroll_indent(-1);
1387        self.remove_simple_key()?;
1388        self.disallow_simple_key();
1389
1390        self.tokens
1391            .push_back(Token(Span::empty(self.mark), TokenType::StreamEnd));
1392        Ok(())
1393    }
1394
1395    fn fetch_directive(&mut self) -> ScanResult {
1396        self.unroll_indent(-1);
1397        self.remove_simple_key()?;
1398
1399        self.disallow_simple_key();
1400
1401        let tok = self.scan_directive()?;
1402        self.tokens.push_back(tok);
1403
1404        Ok(())
1405    }
1406
1407    fn scan_directive(&mut self) -> Result<Token<'input>, ScanError> {
1408        let start_mark = self.mark;
1409        self.skip_non_blank();
1410
1411        let name = self.scan_directive_name()?;
1412        let tok = match name.as_ref() {
1413            "YAML" => self.scan_version_directive_value(&start_mark)?,
1414            "TAG" => self.scan_tag_directive_value(&start_mark)?,
1415            _ => {
1416                let mut params = Vec::new();
1417                while self.input.next_is_blank() {
1418                    let n_blanks = self.input.skip_while_blank();
1419                    self.mark.offsets.chars += n_blanks;
1420                    self.mark.col += n_blanks;
1421                    self.mark.offsets.bytes = self.input.byte_offset();
1422
1423                    if !is_blank_or_breakz(self.input.peek()) {
1424                        let mut param = String::new();
1425                        let n_chars = self.input.fetch_while_is_yaml_non_space(&mut param);
1426                        self.mark.offsets.chars += n_chars;
1427                        self.mark.col += n_chars;
1428                        self.mark.offsets.bytes = self.input.byte_offset();
1429                        params.push(param);
1430                    }
1431                }
1432
1433                Token(
1434                    Span::new(start_mark, self.mark),
1435                    TokenType::ReservedDirective(name, params),
1436                )
1437            }
1438        };
1439
1440        self.skip_ws_to_eol(SkipTabs::Yes)?;
1441
1442        if self.input.next_is_breakz() {
1443            self.input.lookahead(2);
1444            self.skip_linebreak();
1445            Ok(tok)
1446        } else {
1447            Err(ScanError::new_str(
1448                start_mark,
1449                "while scanning a directive, did not find expected comment or line break",
1450            ))
1451        }
1452    }
1453
1454    fn scan_version_directive_value(&mut self, mark: &Marker) -> Result<Token<'input>, ScanError> {
1455        let n_blanks = self.input.skip_while_blank();
1456        self.mark.offsets.chars += n_blanks;
1457        self.mark.col += n_blanks;
1458        self.mark.offsets.bytes = self.input.byte_offset();
1459
1460        let major = self.scan_version_directive_number(mark)?;
1461
1462        if self.input.peek() != '.' {
1463            return Err(ScanError::new_str(
1464                *mark,
1465                "while scanning a YAML directive, did not find expected digit or '.' character",
1466            ));
1467        }
1468        self.skip_non_blank();
1469
1470        let minor = self.scan_version_directive_number(mark)?;
1471
1472        Ok(Token(
1473            Span::new(*mark, self.mark),
1474            TokenType::VersionDirective(major, minor),
1475        ))
1476    }
1477
1478    fn scan_directive_name(&mut self) -> Result<String, ScanError> {
1479        let start_mark = self.mark;
1480        let mut string = String::new();
1481
1482        let n_chars = self.input.fetch_while_is_yaml_non_space(&mut string);
1483        self.mark.offsets.chars += n_chars;
1484        self.mark.col += n_chars;
1485        self.mark.offsets.bytes = self.input.byte_offset();
1486
1487        if string.is_empty() {
1488            return Err(ScanError::new_str(
1489                start_mark,
1490                "while scanning a directive, could not find expected directive name",
1491            ));
1492        }
1493
1494        if !is_blank_or_breakz(self.input.peek()) {
1495            return Err(ScanError::new_str(
1496                start_mark,
1497                "while scanning a directive, found unexpected non-alphabetical character",
1498            ));
1499        }
1500
1501        Ok(string)
1502    }
1503
1504    fn scan_version_directive_number(&mut self, mark: &Marker) -> Result<u32, ScanError> {
1505        let mut val = 0u32;
1506        let mut length = 0usize;
1507        while let Some(digit) = self.input.look_ch().to_digit(10) {
1508            if length + 1 > 9 {
1509                return Err(ScanError::new_str(
1510                    *mark,
1511                    "while scanning a YAML directive, found extremely long version number",
1512                ));
1513            }
1514            length += 1;
1515            val = val * 10 + digit;
1516            self.skip_non_blank();
1517        }
1518
1519        if length == 0 {
1520            return Err(ScanError::new_str(
1521                *mark,
1522                "while scanning a YAML directive, did not find expected version number",
1523            ));
1524        }
1525
1526        Ok(val)
1527    }
1528
1529    fn scan_tag_directive_value(&mut self, mark: &Marker) -> Result<Token<'input>, ScanError> {
1530        let n_blanks = self.input.skip_while_blank();
1531        self.mark.offsets.chars += n_blanks;
1532        self.mark.col += n_blanks;
1533        self.mark.offsets.bytes = self.input.byte_offset();
1534
1535        let handle = self.scan_tag_handle_directive_cow(mark)?;
1536
1537        let n_blanks = self.input.skip_while_blank();
1538        self.mark.offsets.chars += n_blanks;
1539        self.mark.col += n_blanks;
1540        self.mark.offsets.bytes = self.input.byte_offset();
1541
1542        let prefix = self.scan_tag_prefix_directive_cow(mark)?;
1543
1544        self.input.lookahead(1);
1545
1546        if self.input.next_is_blank_or_breakz() {
1547            Ok(Token(
1548                Span::new(*mark, self.mark),
1549                TokenType::TagDirective(handle, prefix),
1550            ))
1551        } else {
1552            Err(ScanError::new_str(
1553                *mark,
1554                "while scanning TAG, did not find expected whitespace or line break",
1555            ))
1556        }
1557    }
1558
1559    fn fetch_tag(&mut self) -> ScanResult {
1560        self.save_simple_key();
1561        self.disallow_simple_key();
1562
1563        let tok = self.scan_tag()?;
1564        self.tokens.push_back(tok);
1565        Ok(())
1566    }
1567
1568    fn scan_tag(&mut self) -> Result<Token<'input>, ScanError> {
1569        let start_mark = self.mark;
1570
1571        // Check if the tag is in the canonical form (verbatim).
1572        self.input.lookahead(2);
1573
1574        // If byte_offset is not available, use the original owned-only path.
1575        if self.input.byte_offset().is_none() {
1576            return self.scan_tag_owned(&start_mark);
1577        }
1578
1579        let (handle, suffix): (Cow<'input, str>, Cow<'input, str>) =
1580            if self.input.nth_char_is(1, '<') {
1581                // Verbatim tags always need owned strings (URI escapes).
1582                let suffix = self.scan_verbatim_tag(&start_mark)?;
1583                (Cow::Owned(String::new()), Cow::Owned(suffix))
1584            } else {
1585                // The tag has either the '!suffix' or the '!handle!suffix'
1586                let handle = self.scan_tag_handle_cow(&start_mark)?;
1587                // Check if it is, indeed, handle.
1588                if handle.len() >= 2 && handle.starts_with('!') && handle.ends_with('!') {
1589                    // A tag handle starting with "!!" is a secondary tag handle.
1590                    let suffix = self.scan_tag_shorthand_suffix_cow(&start_mark)?;
1591                    (handle, suffix)
1592                } else {
1593                    // Not a real handle, it's part of the suffix.
1594                    // E.g., "!foo" -> handle="!", suffix="foo"
1595                    // The "handle" we scanned is actually "!" + suffix_part1.
1596                    // We need to also scan any remaining suffix characters.
1597                    let remaining_suffix = self.scan_tag_shorthand_suffix_cow(&start_mark)?;
1598
1599                    // Extract suffix from handle (skip leading '!') and combine with remaining.
1600                    let suffix = if handle.len() > 1 {
1601                        if remaining_suffix.is_empty() {
1602                            // The suffix is just what's in handle after '!'
1603                            match handle {
1604                                Cow::Borrowed(s) => Cow::Borrowed(&s[1..]),
1605                                Cow::Owned(s) => Cow::Owned(s[1..].to_owned()),
1606                            }
1607                        } else {
1608                            // Combine handle (minus leading '!') with remaining suffix.
1609                            let mut combined = handle[1..].to_owned();
1610                            combined.push_str(&remaining_suffix);
1611                            Cow::Owned(combined)
1612                        }
1613                    } else {
1614                        // handle is just "!", suffix is whatever we scanned after
1615                        remaining_suffix
1616                    };
1617
1618                    // A special case: the '!' tag.  Set the handle to '' and the
1619                    // suffix to '!'.
1620                    if suffix.is_empty() {
1621                        (Cow::Borrowed(""), Cow::Borrowed("!"))
1622                    } else {
1623                        (Cow::Borrowed("!"), suffix)
1624                    }
1625                }
1626            };
1627
1628        if is_blank_or_breakz(self.input.look_ch())
1629            || (self.flow_level > 0 && self.input.next_is_flow())
1630        {
1631            // XXX: ex 7.2, an empty scalar can follow a secondary tag
1632            Ok(Token(
1633                Span::new(start_mark, self.mark),
1634                TokenType::Tag(handle, suffix),
1635            ))
1636        } else {
1637            Err(ScanError::new_str(
1638                start_mark,
1639                "while scanning a tag, did not find expected whitespace or line break",
1640            ))
1641        }
1642    }
1643
1644    /// Original owned-only tag scanning path for inputs without `byte_offset` support.
1645    fn scan_tag_owned(&mut self, start_mark: &Marker) -> Result<Token<'input>, ScanError> {
1646        let mut handle = String::new();
1647        let mut suffix;
1648
1649        if self.input.nth_char_is(1, '<') {
1650            suffix = self.scan_verbatim_tag(start_mark)?;
1651        } else {
1652            // The tag has either the '!suffix' or the '!handle!suffix'
1653            handle = self.scan_tag_handle(false, start_mark)?;
1654            // Check if it is, indeed, handle.
1655            if handle.len() >= 2 && handle.starts_with('!') && handle.ends_with('!') {
1656                // A tag handle starting with "!!" is a secondary tag handle.
1657                let is_secondary_handle = handle == "!!";
1658                suffix =
1659                    self.scan_tag_shorthand_suffix(false, is_secondary_handle, "", start_mark)?;
1660            } else {
1661                suffix = self.scan_tag_shorthand_suffix(false, false, &handle, start_mark)?;
1662                "!".clone_into(&mut handle);
1663                // A special case: the '!' tag.  Set the handle to '' and the
1664                // suffix to '!'.
1665                if suffix.is_empty() {
1666                    handle.clear();
1667                    "!".clone_into(&mut suffix);
1668                }
1669            }
1670        }
1671
1672        if is_blank_or_breakz(self.input.look_ch())
1673            || (self.flow_level > 0 && self.input.next_is_flow())
1674        {
1675            // XXX: ex 7.2, an empty scalar can follow a secondary tag
1676            Ok(Token(
1677                Span::new(*start_mark, self.mark),
1678                TokenType::Tag(handle.into(), suffix.into()),
1679            ))
1680        } else {
1681            Err(ScanError::new_str(
1682                *start_mark,
1683                "while scanning a tag, did not find expected whitespace or line break",
1684            ))
1685        }
1686    }
1687
1688    /// Scan a tag handle as a `Cow<str>`, borrowing when possible.
1689    ///
1690    /// Tag handles are of the form `!`, `!!`, or `!name!` where name is ASCII alphanumeric.
1691    /// Since they contain no escape sequences, they can always be borrowed from `StrInput`.
1692    fn scan_tag_handle_cow(&mut self, mark: &Marker) -> Result<Cow<'input, str>, ScanError> {
1693        let Some(start) = self.input.byte_offset() else {
1694            return Ok(Cow::Owned(self.scan_tag_handle(false, mark)?));
1695        };
1696
1697        if self.input.look_ch() != '!' {
1698            return Err(ScanError::new_str(
1699                *mark,
1700                "while scanning a tag, did not find expected '!'",
1701            ));
1702        }
1703
1704        // Consume the leading '!'.
1705        self.skip_non_blank();
1706
1707        // Consume ns-word-char (ASCII alphanumeric, '_' or '-') characters.
1708        self.input.lookahead(1);
1709        while self.input.next_is_alpha() {
1710            self.skip_non_blank();
1711            self.input.lookahead(1);
1712        }
1713
1714        // Optional trailing '!'.
1715        if self.input.peek() == '!' {
1716            self.skip_non_blank();
1717        }
1718
1719        let Some(end) = self.input.byte_offset() else {
1720            return Ok(Cow::Owned(self.scan_tag_handle(false, mark)?));
1721        };
1722
1723        if let Some(slice) = self.try_borrow_slice(start, end) {
1724            Ok(Cow::Borrowed(slice))
1725        } else {
1726            let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
1727                ScanError::new_str(
1728                    *mark,
1729                    "internal error: input advertised slicing but did not provide a slice",
1730                )
1731            })?;
1732            Ok(Cow::Owned(slice.to_owned()))
1733        }
1734    }
1735
1736    /// Scan a tag shorthand suffix as a `Cow<str>`, borrowing when possible.
1737    ///
1738    /// The suffix can be borrowed only if no `%` URI escape sequences are present.
1739    fn scan_tag_shorthand_suffix_cow(
1740        &mut self,
1741        mark: &Marker,
1742    ) -> Result<Cow<'input, str>, ScanError> {
1743        let Some(start) = self.input.byte_offset() else {
1744            return Ok(Cow::Owned(
1745                self.scan_tag_shorthand_suffix(false, false, "", mark)?,
1746            ));
1747        };
1748
1749        // Scan tag characters, checking for URI escapes.
1750        while is_tag_char(self.input.look_ch()) {
1751            if self.input.peek() == '%' {
1752                // URI escape found - must decode, so fall back to owned path.
1753                let current = self
1754                    .input
1755                    .byte_offset()
1756                    .expect("byte_offset() must remain available once enabled");
1757                let mut out = if let Some(slice) = self.input.slice_bytes(start, current) {
1758                    slice.to_owned()
1759                } else {
1760                    String::new()
1761                };
1762
1763                // Continue scanning with owned buffer.
1764                while is_tag_char(self.input.look_ch()) {
1765                    if self.input.peek() == '%' {
1766                        out.push(self.scan_uri_escapes(mark)?);
1767                    } else {
1768                        out.push(self.input.peek());
1769                        self.skip_non_blank();
1770                    }
1771                }
1772                return Ok(Cow::Owned(out));
1773            }
1774            self.skip_non_blank();
1775        }
1776
1777        let Some(end) = self.input.byte_offset() else {
1778            return Ok(Cow::Owned(
1779                self.scan_tag_shorthand_suffix(false, false, "", mark)?,
1780            ));
1781        };
1782
1783        if let Some(slice) = self.try_borrow_slice(start, end) {
1784            Ok(Cow::Borrowed(slice))
1785        } else {
1786            let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
1787                ScanError::new_str(
1788                    *mark,
1789                    "internal error: input advertised slicing but did not provide a slice",
1790                )
1791            })?;
1792            Ok(Cow::Owned(slice.to_owned()))
1793        }
1794    }
1795
1796    fn scan_tag_handle(&mut self, directive: bool, mark: &Marker) -> Result<String, ScanError> {
1797        let mut string = String::new();
1798        if self.input.look_ch() != '!' {
1799            return Err(ScanError::new_str(
1800                *mark,
1801                "while scanning a tag, did not find expected '!'",
1802            ));
1803        }
1804
1805        string.push(self.input.peek());
1806        self.skip_non_blank();
1807
1808        let n_chars = self.input.fetch_while_is_alpha(&mut string);
1809        self.mark.offsets.chars += n_chars;
1810        self.mark.col += n_chars;
1811        self.mark.offsets.bytes = self.input.byte_offset();
1812
1813        // Check if the trailing character is '!' and copy it.
1814        if self.input.peek() == '!' {
1815            string.push(self.input.peek());
1816            self.skip_non_blank();
1817        } else if directive && string != "!" {
1818            // It's either the '!' tag or not really a tag handle.  If it's a %TAG
1819            // directive, it's an error.  If it's a tag token, it must be a part of
1820            // URI.
1821            return Err(ScanError::new_str(
1822                *mark,
1823                "while parsing a tag directive, did not find expected '!'",
1824            ));
1825        }
1826        Ok(string)
1827    }
1828
1829    /// Scan for a tag prefix (6.8.2.2).
1830    ///
1831    /// There are 2 kinds of tag prefixes:
1832    ///   - Local: Starts with a `!`, contains only URI chars (`!foo`)
1833    ///   - Global: Starts with a tag char, contains then URI chars (`!foo,2000:app/`)
1834    fn scan_tag_prefix(&mut self, start_mark: &Marker) -> Result<String, ScanError> {
1835        let mut string = String::new();
1836
1837        if self.input.look_ch() == '!' {
1838            // If we have a local tag, insert and skip `!`.
1839            string.push(self.input.peek());
1840            self.skip_non_blank();
1841        } else if !is_tag_char(self.input.peek()) {
1842            // Otherwise, check if the first global tag character is valid.
1843            return Err(ScanError::new_str(
1844                *start_mark,
1845                "invalid global tag character",
1846            ));
1847        } else if self.input.peek() == '%' {
1848            // If it is valid and an escape sequence, escape it.
1849            string.push(self.scan_uri_escapes(start_mark)?);
1850        } else {
1851            // Otherwise, push the first character.
1852            string.push(self.input.peek());
1853            self.skip_non_blank();
1854        }
1855
1856        while is_uri_char(self.input.look_ch()) {
1857            if self.input.peek() == '%' {
1858                string.push(self.scan_uri_escapes(start_mark)?);
1859            } else {
1860                string.push(self.input.peek());
1861                self.skip_non_blank();
1862            }
1863        }
1864
1865        Ok(string)
1866    }
1867
1868    /// Scan for a verbatim tag.
1869    ///
1870    /// The prefixing `!<` must _not_ have been skipped.
1871    fn scan_verbatim_tag(&mut self, start_mark: &Marker) -> Result<String, ScanError> {
1872        // Eat `!<`
1873        self.skip_non_blank();
1874        self.skip_non_blank();
1875
1876        let mut string = String::new();
1877        while is_uri_char(self.input.look_ch()) {
1878            if self.input.peek() == '%' {
1879                string.push(self.scan_uri_escapes(start_mark)?);
1880            } else {
1881                string.push(self.input.peek());
1882                self.skip_non_blank();
1883            }
1884        }
1885
1886        if self.input.peek() != '>' {
1887            return Err(ScanError::new_str(
1888                *start_mark,
1889                "while scanning a verbatim tag, did not find the expected '>'",
1890            ));
1891        }
1892        self.skip_non_blank();
1893
1894        Ok(string)
1895    }
1896
1897    fn scan_tag_shorthand_suffix(
1898        &mut self,
1899        _directive: bool,
1900        _is_secondary: bool,
1901        head: &str,
1902        mark: &Marker,
1903    ) -> Result<String, ScanError> {
1904        let mut length = head.len();
1905        let mut string = String::new();
1906
1907        // Copy the head if needed.
1908        // Note that we don't copy the leading '!' character.
1909        if length > 1 {
1910            string.extend(head.chars().skip(1));
1911        }
1912
1913        while is_tag_char(self.input.look_ch()) {
1914            // Check if it is a URI-escape sequence.
1915            if self.input.peek() == '%' {
1916                string.push(self.scan_uri_escapes(mark)?);
1917            } else {
1918                string.push(self.input.peek());
1919                self.skip_non_blank();
1920            }
1921
1922            length += 1;
1923        }
1924
1925        if length == 0 {
1926            return Err(ScanError::new_str(
1927                *mark,
1928                "while parsing a tag, did not find expected tag URI",
1929            ));
1930        }
1931
1932        Ok(string)
1933    }
1934
1935    fn scan_uri_escapes(&mut self, mark: &Marker) -> Result<char, ScanError> {
1936        let mut width = 0usize;
1937        let mut code = 0u32;
1938        loop {
1939            self.input.lookahead(3);
1940
1941            let c = self.input.peek_nth(1);
1942            let nc = self.input.peek_nth(2);
1943
1944            if !(self.input.peek() == '%' && is_hex(c) && is_hex(nc)) {
1945                return Err(ScanError::new_str(
1946                    *mark,
1947                    "while parsing a tag, found an invalid escape sequence",
1948                ));
1949            }
1950
1951            let byte = (as_hex(c) << 4) + as_hex(nc);
1952            if width == 0 {
1953                width = match byte {
1954                    _ if byte & 0x80 == 0x00 => 1,
1955                    _ if byte & 0xE0 == 0xC0 => 2,
1956                    _ if byte & 0xF0 == 0xE0 => 3,
1957                    _ if byte & 0xF8 == 0xF0 => 4,
1958                    _ => {
1959                        return Err(ScanError::new_str(
1960                            *mark,
1961                            "while parsing a tag, found an incorrect leading UTF-8 byte",
1962                        ));
1963                    }
1964                };
1965                code = byte;
1966            } else {
1967                if byte & 0xc0 != 0x80 {
1968                    return Err(ScanError::new_str(
1969                        *mark,
1970                        "while parsing a tag, found an incorrect trailing UTF-8 byte",
1971                    ));
1972                }
1973                code = (code << 8) + byte;
1974            }
1975
1976            self.skip_n_non_blank(3);
1977
1978            width -= 1;
1979            if width == 0 {
1980                break;
1981            }
1982        }
1983
1984        match char::from_u32(code) {
1985            Some(ch) => Ok(ch),
1986            None => Err(ScanError::new_str(
1987                *mark,
1988                "while parsing a tag, found an invalid UTF-8 codepoint",
1989            )),
1990        }
1991    }
1992
1993    fn fetch_anchor(&mut self, alias: bool) -> ScanResult {
1994        self.save_simple_key();
1995        self.disallow_simple_key();
1996
1997        let tok = self.scan_anchor(alias)?;
1998
1999        self.tokens.push_back(tok);
2000
2001        Ok(())
2002    }
2003
2004    fn scan_anchor(&mut self, alias: bool) -> Result<Token<'input>, ScanError> {
2005        let start_mark = self.mark;
2006
2007        // Skip `&` / `*`.
2008        self.skip_non_blank();
2009
2010        // Borrow from input when possible.
2011        if let Some(start) = self.input.byte_offset() {
2012            while is_anchor_char(self.input.look_ch()) {
2013                self.skip_non_blank();
2014            }
2015
2016            let end = self
2017                .input
2018                .byte_offset()
2019                .expect("byte_offset() must remain available once enabled");
2020
2021            if start == end {
2022                return Err(ScanError::new_str(start_mark, "while scanning an anchor or alias, did not find expected alphabetic or numeric character"));
2023            }
2024
2025            let cow = if let Some(slice) = self.try_borrow_slice(start, end) {
2026                Cow::Borrowed(slice)
2027            } else if let Some(slice) = self.input.slice_bytes(start, end) {
2028                Cow::Owned(slice.to_owned())
2029            } else {
2030                return Err(ScanError::new_str(
2031                    start_mark,
2032                    "internal error: input advertised slicing but did not provide a slice",
2033                ));
2034            };
2035
2036            let tok = if alias {
2037                TokenType::Alias(cow)
2038            } else {
2039                TokenType::Anchor(cow)
2040            };
2041            return Ok(Token(Span::new(start_mark, self.mark), tok));
2042        }
2043
2044        let mut string = String::new();
2045        while is_anchor_char(self.input.look_ch()) {
2046            string.push(self.input.peek());
2047            self.skip_non_blank();
2048        }
2049
2050        if string.is_empty() {
2051            return Err(ScanError::new_str(start_mark, "while scanning an anchor or alias, did not find expected alphabetic or numeric character"));
2052        }
2053
2054        let tok = if alias {
2055            TokenType::Alias(string.into())
2056        } else {
2057            TokenType::Anchor(string.into())
2058        };
2059        Ok(Token(Span::new(start_mark, self.mark), tok))
2060    }
2061
2062    fn fetch_flow_collection_start(&mut self, tok: TokenType<'input>) -> ScanResult {
2063        // The indicators '[' and '{' may start a simple key.
2064        self.save_simple_key();
2065
2066        let start_mark = self.mark;
2067        let indicator = self.input.peek();
2068        self.flow_markers.push((start_mark, indicator));
2069
2070        self.roll_one_col_indent();
2071        self.increase_flow_level()?;
2072
2073        self.allow_simple_key();
2074
2075        self.skip_non_blank();
2076
2077        if tok == TokenType::FlowMappingStart {
2078            self.flow_mapping_started = true;
2079        } else {
2080            self.implicit_flow_mapping_states
2081                .push(ImplicitMappingState::Possible);
2082        }
2083
2084        self.skip_ws_to_eol(SkipTabs::Yes)?;
2085
2086        self.tokens
2087            .push_back(Token(Span::new(start_mark, self.mark), tok));
2088        Ok(())
2089    }
2090
2091    fn fetch_flow_collection_end(&mut self, tok: TokenType<'input>) -> ScanResult {
2092        // A closing bracket without a corresponding opening is invalid YAML.
2093        if self.flow_level == 0 {
2094            return Err(ScanError::new_str(self.mark, "misplaced bracket"));
2095        }
2096
2097        let flow_level = self.flow_level;
2098
2099        self.flow_markers.pop();
2100        self.remove_simple_key()?;
2101
2102        if matches!(tok, TokenType::FlowSequenceEnd) {
2103            self.end_implicit_mapping(self.mark, flow_level);
2104            // We are out exiting the flow sequence, nesting goes down 1 level.
2105            self.implicit_flow_mapping_states.pop();
2106        }
2107
2108        self.decrease_flow_level();
2109
2110        self.disallow_simple_key();
2111
2112        let start_mark = self.mark;
2113        self.skip_non_blank();
2114        self.skip_ws_to_eol(SkipTabs::Yes)?;
2115
2116        // A flow collection within a flow mapping can be a key. In that case, the value may be
2117        // adjacent to the `:`.
2118        // ```yaml
2119        // - [ {a: b}:value ]
2120        // ```
2121        if self.flow_level > 0 {
2122            self.adjacent_value_allowed_at = self.mark.index();
2123        }
2124
2125        self.tokens
2126            .push_back(Token(Span::new(start_mark, self.mark), tok));
2127        Ok(())
2128    }
2129
2130    /// Push the `FlowEntry` token and skip over the `,`.
2131    fn fetch_flow_entry(&mut self) -> ScanResult {
2132        self.remove_simple_key()?;
2133        self.allow_simple_key();
2134
2135        self.end_implicit_mapping(self.mark, self.flow_level);
2136
2137        let start_mark = self.mark;
2138        self.skip_non_blank();
2139        self.skip_ws_to_eol(SkipTabs::Yes)?;
2140
2141        self.tokens.push_back(Token(
2142            Span::new(start_mark, self.mark),
2143            TokenType::FlowEntry,
2144        ));
2145        Ok(())
2146    }
2147
2148    fn increase_flow_level(&mut self) -> ScanResult {
2149        self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0)));
2150        self.flow_level = self
2151            .flow_level
2152            .checked_add(1)
2153            .ok_or_else(|| ScanError::new_str(self.mark, "recursion limit exceeded"))?;
2154        Ok(())
2155    }
2156
2157    fn decrease_flow_level(&mut self) {
2158        if self.flow_level > 0 {
2159            self.flow_level -= 1;
2160            self.simple_keys.pop().unwrap();
2161        }
2162    }
2163
2164    /// Push the `Block*` token(s) and skip over the `-`.
2165    ///
2166    /// Add an indentation level and push a `BlockSequenceStart` token if needed, then push a
2167    /// `BlockEntry` token.
2168    /// This function only skips over the `-` and does not fetch the entry value.
2169    fn fetch_block_entry(&mut self) -> ScanResult {
2170        if self.flow_level > 0 {
2171            // - * only allowed in block
2172            return Err(ScanError::new_str(
2173                self.mark,
2174                r#""-" is only valid inside a block"#,
2175            ));
2176        }
2177        // Check if we are allowed to start a new entry.
2178        if !self.simple_key_allowed {
2179            return Err(ScanError::new_str(
2180                self.mark,
2181                "block sequence entries are not allowed in this context",
2182            ));
2183        }
2184
2185        // ???, fixes test G9HC.
2186        if let Some(Token(span, TokenType::Anchor(..) | TokenType::Tag(..))) = self.tokens.back() {
2187            if self.mark.col == 0 && span.start.col == 0 && self.indent > -1 {
2188                return Err(ScanError::new_str(
2189                    span.start,
2190                    "invalid indentation for anchor",
2191                ));
2192            }
2193        }
2194
2195        // Skip over the `-`.
2196        let mark = self.mark;
2197        self.skip_non_blank();
2198
2199        // generate BLOCK-SEQUENCE-START if indented
2200        self.roll_indent(mark.col, None, TokenType::BlockSequenceStart, mark);
2201        let found_tabs = self.skip_ws_to_eol(SkipTabs::Yes)?.found_tabs();
2202        self.input.lookahead(2);
2203        if found_tabs && self.input.next_char_is('-') && is_blank_or_breakz(self.input.peek_nth(1))
2204        {
2205            return Err(ScanError::new_str(
2206                self.mark,
2207                "'-' must be followed by a valid YAML whitespace",
2208            ));
2209        }
2210
2211        self.skip_ws_to_eol(SkipTabs::No)?;
2212        self.input.lookahead(1);
2213        if self.input.next_is_break() || self.input.next_is_flow() {
2214            self.roll_one_col_indent();
2215        }
2216
2217        self.remove_simple_key()?;
2218        self.allow_simple_key();
2219
2220        self.tokens
2221            .push_back(Token(Span::empty(self.mark), TokenType::BlockEntry));
2222
2223        Ok(())
2224    }
2225
2226    fn fetch_document_indicator(&mut self, t: TokenType<'input>) -> ScanResult {
2227        if let Some((mark, bracket)) = self.flow_markers.pop() {
2228            return Err(ScanError::new(
2229                mark,
2230                format!("unclosed bracket '{bracket}'"),
2231            ));
2232        }
2233
2234        self.unroll_indent(-1);
2235        self.remove_simple_key()?;
2236        self.disallow_simple_key();
2237
2238        let mark = self.mark;
2239
2240        self.skip_n_non_blank(3);
2241
2242        self.tokens.push_back(Token(Span::new(mark, self.mark), t));
2243        Ok(())
2244    }
2245
2246    fn fetch_block_scalar(&mut self, literal: bool) -> ScanResult {
2247        self.save_simple_key();
2248        self.allow_simple_key();
2249        let tok = self.scan_block_scalar(literal)?;
2250
2251        self.tokens.push_back(tok);
2252        Ok(())
2253    }
2254
2255    #[allow(clippy::too_many_lines)]
2256    fn scan_block_scalar(&mut self, literal: bool) -> Result<Token<'input>, ScanError> {
2257        let start_mark = self.mark;
2258        let mut chomping = Chomping::Clip;
2259        let mut increment: usize = 0;
2260        let mut indent: usize = 0;
2261        let mut trailing_blank: bool;
2262        let mut leading_blank: bool = false;
2263        let style = if literal {
2264            ScalarStyle::Literal
2265        } else {
2266            ScalarStyle::Folded
2267        };
2268
2269        let mut string = String::new();
2270        let mut leading_break = String::new();
2271        let mut trailing_breaks = String::new();
2272        let mut chomping_break = String::new();
2273
2274        // skip '|' or '>'
2275        self.skip_non_blank();
2276        self.unroll_non_block_indents();
2277
2278        if self.input.look_ch() == '+' || self.input.peek() == '-' {
2279            if self.input.peek() == '+' {
2280                chomping = Chomping::Keep;
2281            } else {
2282                chomping = Chomping::Strip;
2283            }
2284            self.skip_non_blank();
2285            self.input.lookahead(1);
2286            if self.input.next_is_digit() {
2287                if self.input.peek() == '0' {
2288                    return Err(ScanError::new_str(
2289                        start_mark,
2290                        "while scanning a block scalar, found an indentation indicator equal to 0",
2291                    ));
2292                }
2293                increment = (self.input.peek() as usize) - ('0' as usize);
2294                self.skip_non_blank();
2295            }
2296        } else if self.input.next_is_digit() {
2297            if self.input.peek() == '0' {
2298                return Err(ScanError::new_str(
2299                    start_mark,
2300                    "while scanning a block scalar, found an indentation indicator equal to 0",
2301                ));
2302            }
2303
2304            increment = (self.input.peek() as usize) - ('0' as usize);
2305            self.skip_non_blank();
2306            self.input.lookahead(1);
2307            if self.input.peek() == '+' || self.input.peek() == '-' {
2308                if self.input.peek() == '+' {
2309                    chomping = Chomping::Keep;
2310                } else {
2311                    chomping = Chomping::Strip;
2312                }
2313                self.skip_non_blank();
2314            }
2315        }
2316
2317        self.skip_ws_to_eol(SkipTabs::Yes)?;
2318
2319        // Check if we are at the end of the line.
2320        self.input.lookahead(1);
2321        if !self.input.next_is_breakz() {
2322            return Err(ScanError::new_str(
2323                start_mark,
2324                "while scanning a block scalar, did not find expected comment or line break",
2325            ));
2326        }
2327
2328        if self.input.next_is_break() {
2329            self.input.lookahead(2);
2330            self.read_break(&mut chomping_break);
2331        }
2332
2333        if self.input.look_ch() == '\t' {
2334            return Err(ScanError::new_str(
2335                start_mark,
2336                "a block scalar content cannot start with a tab",
2337            ));
2338        }
2339
2340        if increment > 0 {
2341            indent = if self.indent >= 0 {
2342                (self.indent + increment as isize) as usize
2343            } else {
2344                increment
2345            }
2346        }
2347
2348        // Scan the leading line breaks and determine the indentation level if needed.
2349        if indent == 0 {
2350            self.skip_block_scalar_first_line_indent(&mut indent, &mut trailing_breaks);
2351        } else {
2352            self.skip_block_scalar_indent(indent, &mut trailing_breaks);
2353        }
2354
2355        // We have an end-of-stream with no content, e.g.:
2356        // ```yaml
2357        // - |+
2358        // ```
2359        if self.input.next_is_z() {
2360            let contents = match chomping {
2361                // We strip trailing linebreaks. Nothing remain.
2362                Chomping::Strip => String::new(),
2363                // There was no newline after the chomping indicator.
2364                _ if self.mark.line == start_mark.line() => String::new(),
2365                // We clip lines, and there was a newline after the chomping indicator.
2366                // All other breaks are ignored.
2367                Chomping::Clip => chomping_break,
2368                // We keep lines. There was a newline after the chomping indicator but nothing
2369                // else.
2370                Chomping::Keep if trailing_breaks.is_empty() => chomping_break,
2371                // Otherwise, the newline after chomping is ignored.
2372                Chomping::Keep => trailing_breaks,
2373            };
2374            return Ok(Token(
2375                Span::new(start_mark, self.mark),
2376                TokenType::Scalar(style, contents.into()),
2377            ));
2378        }
2379
2380        if self.mark.col < indent && (self.mark.col as isize) > self.indent {
2381            return Err(ScanError::new_str(
2382                self.mark,
2383                "wrongly indented line in block scalar",
2384            ));
2385        }
2386
2387        let mut line_buffer = String::with_capacity(100);
2388        let start_mark = self.mark;
2389        while self.mark.col == indent && !self.input.next_is_z() {
2390            if indent == 0 {
2391                self.input.lookahead(4);
2392                if self.input.next_is_document_end() {
2393                    break;
2394                }
2395            }
2396
2397            // We are at the first content character of a content line.
2398            trailing_blank = self.input.next_is_blank();
2399            if !literal && !leading_break.is_empty() && !leading_blank && !trailing_blank {
2400                string.push_str(&trailing_breaks);
2401                if trailing_breaks.is_empty() {
2402                    string.push(' ');
2403                }
2404            } else {
2405                string.push_str(&leading_break);
2406                string.push_str(&trailing_breaks);
2407            }
2408
2409            leading_break.clear();
2410            trailing_breaks.clear();
2411
2412            leading_blank = self.input.next_is_blank();
2413
2414            self.scan_block_scalar_content_line(&mut string, &mut line_buffer);
2415
2416            // break on EOF
2417            self.input.lookahead(2);
2418            if self.input.next_is_z() {
2419                break;
2420            }
2421
2422            self.read_break(&mut leading_break);
2423
2424            // Eat the following indentation spaces and line breaks.
2425            self.skip_block_scalar_indent(indent, &mut trailing_breaks);
2426        }
2427
2428        // Chomp the tail.
2429        if chomping != Chomping::Strip {
2430            string.push_str(&leading_break);
2431            // If we had reached an eof but the last character wasn't an end-of-line, check if the
2432            // last line was indented at least as the rest of the scalar, then we need to consider
2433            // there is a newline.
2434            if self.input.next_is_z() && self.mark.col >= indent.max(1) {
2435                string.push('\n');
2436            }
2437        }
2438
2439        if chomping == Chomping::Keep {
2440            string.push_str(&trailing_breaks);
2441        }
2442
2443        Ok(Token(
2444            Span::new(start_mark, self.mark),
2445            TokenType::Scalar(style, string.into()),
2446        ))
2447    }
2448
2449    /// Retrieve the contents of the line, parsing it as a block scalar.
2450    ///
2451    /// The contents will be appended to `string`. `line_buffer` is used as a temporary buffer to
2452    /// store bytes before pushing them to `string` and thus avoiding reallocating more than
2453    /// necessary. `line_buffer` is assumed to be empty upon calling this function. It will be
2454    /// `clear`ed before the end of the function.
2455    ///
2456    /// This function assumed the first character to read is the first content character in the
2457    /// line. This function does not consume the line break character(s) after the line.
2458    fn scan_block_scalar_content_line(&mut self, string: &mut String, line_buffer: &mut String) {
2459        // Start by evaluating characters in the buffer.
2460        while !self.input.buf_is_empty() && !self.input.next_is_breakz() {
2461            string.push(self.input.peek());
2462            // We may technically skip non-blank characters. However, the only distinction is
2463            // to determine what is leading whitespace and what is not. Here, we read the
2464            // contents of the line until either eof or a linebreak. We know we will not read
2465            // `self.leading_whitespace` until the end of the line, where it will be reset.
2466            // This allows us to call a slightly less expensive function.
2467            self.skip_blank();
2468        }
2469
2470        // All characters that were in the buffer were consumed. We need to check if more
2471        // follow.
2472        if self.input.buf_is_empty() {
2473            // We will read all consecutive non-breakz characters. We push them into a
2474            // temporary buffer. The main difference with going through `self.buffer` is that
2475            // characters are appended here as their real size (1B for ascii, or up to 4 bytes for
2476            // UTF-8). We can then use the internal `line_buffer` `Vec` to push data into `string`
2477            // (using `String::push_str`).
2478
2479            // line_buffer is empty at this point so we can compute n_chars here as well
2480            let mut n_chars = 0;
2481            debug_assert!(line_buffer.is_empty());
2482            while let Some(c) = self.input.raw_read_non_breakz_ch() {
2483                line_buffer.push(c);
2484                n_chars += 1;
2485            }
2486
2487            // We need to manually update our position; we haven't called a `skip` function.
2488            self.mark.col += n_chars;
2489            self.mark.offsets.chars += n_chars;
2490            self.mark.offsets.bytes = self.input.byte_offset();
2491
2492            // We can now append our bytes to our `string`.
2493            string.reserve(line_buffer.len());
2494            string.push_str(line_buffer);
2495            // This clears the _contents_ without touching the _capacity_.
2496            line_buffer.clear();
2497        }
2498    }
2499
2500    /// Skip the block scalar indentation and empty lines.
2501    fn skip_block_scalar_indent(&mut self, indent: usize, breaks: &mut String) {
2502        loop {
2503            // Consume all spaces. Tabs cannot be used as indentation.
2504            if indent < self.input.bufmaxlen() - 2 {
2505                self.input.lookahead(self.input.bufmaxlen());
2506                while self.mark.col < indent && self.input.peek() == ' ' {
2507                    self.skip_blank();
2508                }
2509            } else {
2510                loop {
2511                    self.input.lookahead(self.input.bufmaxlen());
2512                    while !self.input.buf_is_empty()
2513                        && self.mark.col < indent
2514                        && self.input.peek() == ' '
2515                    {
2516                        self.skip_blank();
2517                    }
2518                    // If we reached our indent, we can break. We must also break if we have
2519                    // reached content or EOF; that is, the buffer is not empty and the next
2520                    // character is not a space.
2521                    if self.mark.col == indent
2522                        || (!self.input.buf_is_empty() && self.input.peek() != ' ')
2523                    {
2524                        break;
2525                    }
2526                }
2527                self.input.lookahead(2);
2528            }
2529
2530            // If our current line is empty, skip over the break and continue looping.
2531            if self.input.next_is_break() {
2532                self.read_break(breaks);
2533            } else {
2534                // Otherwise, we have a content line. Return control.
2535                break;
2536            }
2537        }
2538    }
2539
2540    /// Determine the indentation level for a block scalar from the first line of its contents.
2541    ///
2542    /// The function skips over whitespace-only lines and sets `indent` to the the longest
2543    /// whitespace line that was encountered.
2544    fn skip_block_scalar_first_line_indent(&mut self, indent: &mut usize, breaks: &mut String) {
2545        let mut max_indent = 0;
2546        loop {
2547            // Consume all spaces. Tabs cannot be used as indentation.
2548            while self.input.look_ch() == ' ' {
2549                self.skip_blank();
2550            }
2551
2552            if self.mark.col > max_indent {
2553                max_indent = self.mark.col;
2554            }
2555
2556            if self.input.next_is_break() {
2557                // If our current line is empty, skip over the break and continue looping.
2558                self.input.lookahead(2);
2559                self.read_break(breaks);
2560            } else {
2561                // Otherwise, we have a content line. Return control.
2562                break;
2563            }
2564        }
2565
2566        // In case a yaml looks like:
2567        // ```yaml
2568        // |
2569        // foo
2570        // bar
2571        // ```
2572        // We need to set the indent to 0 and not 1. In all other cases, the indent must be at
2573        // least 1. When in the above example, `self.indent` will be set to -1.
2574        *indent = max_indent.max((self.indent + 1) as usize);
2575        if self.indent > 0 {
2576            *indent = (*indent).max(1);
2577        }
2578    }
2579
2580    fn fetch_flow_scalar(&mut self, single: bool) -> ScanResult {
2581        self.save_simple_key();
2582        self.disallow_simple_key();
2583
2584        let tok = self.scan_flow_scalar(single)?;
2585
2586        // From spec: To ensure JSON compatibility, if a key inside a flow mapping is JSON-like,
2587        // YAML allows the following value to be specified adjacent to the “:”.
2588        self.skip_to_next_token()?;
2589        self.adjacent_value_allowed_at = self.mark.index();
2590
2591        self.tokens.push_back(tok);
2592        Ok(())
2593    }
2594
2595    #[allow(clippy::too_many_lines)]
2596    fn scan_flow_scalar(&mut self, single: bool) -> Result<Token<'input>, ScanError> {
2597        let start_mark = self.mark;
2598
2599        // Output scalar contents.
2600        let mut buf = match self.input.byte_offset() {
2601            Some(off) => FlowScalarBuf::new_borrowed(off + self.input.peek().len_utf8()),
2602            None => FlowScalarBuf::new_owned(),
2603        };
2604
2605        // Scratch used to consume the *first* line break in a break run without emitting it.
2606        // (The first break folds to ' ' or to nothing depending on escaping rules.)
2607        let mut break_scratch = String::new();
2608
2609        /* Eat the left quote. */
2610        self.skip_non_blank();
2611
2612        loop {
2613            /* Check for a document indicator. */
2614            self.input.lookahead(4);
2615
2616            if self.mark.col == 0 && self.input.next_is_document_indicator() {
2617                return Err(ScanError::new_str(
2618                    start_mark,
2619                    "while scanning a quoted scalar, found unexpected document indicator",
2620                ));
2621            }
2622
2623            if self.input.next_is_z() {
2624                return Err(ScanError::new_str(start_mark, "unclosed quote"));
2625            }
2626
2627            // Do not enforce block indentation inside quoted (flow) scalars.
2628            // YAML allows line breaks within quoted scalars.
2629            let mut leading_blanks = false;
2630            self.consume_flow_scalar_non_whitespace_chars(
2631                single,
2632                &mut buf,
2633                &mut leading_blanks,
2634                &start_mark,
2635            )?;
2636
2637            match self.input.look_ch() {
2638                '\'' if single => break,
2639                '"' if !single => break,
2640                _ => {}
2641            }
2642
2643            // --- Faster whitespace / line break handling (no temporary Strings) ---
2644            //
2645            // Instead of:
2646            //   - collecting blanks into `whitespaces` and then copying
2647            //   - collecting breaks into `leading_break` / `trailing_breaks` and then copying
2648            //
2649            // We do:
2650            //   - append trailing blanks directly to `string`, remember where they started,
2651            //     and truncate them if a line break follows.
2652            //   - for line breaks: consume the first break into a scratch (discarded),
2653            //     append subsequent breaks directly to `string`.
2654            //
2655            // These flags mirror the old "is_empty()" checks:
2656            //   has_leading_break  <=> !leading_break.is_empty()
2657            //   has_trailing_breaks <=> !trailing_breaks.is_empty()
2658            let mut trailing_ws_start: Option<usize> = None;
2659            let mut has_leading_break = false;
2660            let mut has_trailing_breaks = false;
2661
2662            // For the borrowed path: track the (byte) start of a pending whitespace run.
2663            let mut pending_ws_start: Option<usize> = None;
2664
2665            // Consume blank characters.
2666            while self.input.next_is_blank() || self.input.next_is_break() {
2667                if self.input.next_is_blank() {
2668                    // Consume a space or a tab character.
2669                    if leading_blanks {
2670                        if self.input.peek() == '\t' && (self.mark.col as isize) < self.indent {
2671                            return Err(ScanError::new_str(
2672                                self.mark,
2673                                "tab cannot be used as indentation",
2674                            ));
2675                        }
2676                        self.skip_blank();
2677                    } else {
2678                        // Append to output immediately; if a break appears next, we'll truncate.
2679                        match buf {
2680                            FlowScalarBuf::Owned(ref mut string) => {
2681                                if trailing_ws_start.is_none() {
2682                                    trailing_ws_start = Some(string.len());
2683                                }
2684                                string.push(self.input.peek());
2685                            }
2686                            FlowScalarBuf::Borrowed { .. } => {
2687                                if pending_ws_start.is_none() {
2688                                    pending_ws_start = self.input.byte_offset();
2689                                }
2690                            }
2691                        }
2692                        self.skip_blank();
2693
2694                        if let (FlowScalarBuf::Borrowed { .. }, Some(ws_start), Some(ws_end)) =
2695                            (&mut buf, pending_ws_start, self.input.byte_offset())
2696                        {
2697                            buf.note_pending_ws(ws_start, ws_end);
2698                        }
2699                    }
2700                } else {
2701                    self.input.lookahead(2);
2702
2703                    // Check if it is a first line break.
2704                    if leading_blanks {
2705                        // Second+ line break in a run: preserve it.
2706                        match buf {
2707                            FlowScalarBuf::Owned(ref mut string) => self.read_break(string),
2708                            FlowScalarBuf::Borrowed { .. } => {
2709                                self.promote_flow_scalar_buf_to_owned(&start_mark, &mut buf)?;
2710                                let Some(string) = buf.as_owned_mut() else {
2711                                    unreachable!()
2712                                };
2713                                self.read_break(string);
2714                            }
2715                        }
2716                        has_trailing_breaks = true;
2717                    } else {
2718                        // First break: drop any trailing blanks we appended, then consume the break.
2719                        if let Some(pos) = trailing_ws_start.take() {
2720                            if let FlowScalarBuf::Owned(ref mut string) = buf {
2721                                string.truncate(pos);
2722                            }
2723                        }
2724
2725                        if pending_ws_start.take().is_some() {
2726                            // Trailing blanks before a break are discarded => transformation.
2727                            if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
2728                                self.promote_flow_scalar_buf_to_owned(&start_mark, &mut buf)?;
2729                            }
2730                            buf.discard_pending_ws();
2731                        } else {
2732                            buf.commit_pending_ws();
2733                        }
2734
2735                        break_scratch.clear();
2736                        self.read_break(&mut break_scratch);
2737                        // Keep `break_scratch` content (ignored) until next clear; no need to clear twice.
2738
2739                        has_leading_break = true;
2740                        leading_blanks = true;
2741                    }
2742                }
2743
2744                self.input.lookahead(1);
2745            }
2746
2747            // If we had a line break inside a quoted (flow) scalar, validate indentation
2748            // of the continuation line in block context.
2749            if leading_blanks && has_leading_break && self.flow_level == 0 {
2750                let next_ch = self.input.peek();
2751                let is_closing_quote = (single && next_ch == '\'') || (!single && next_ch == '"');
2752                if !is_closing_quote && (self.mark.col as isize) <= self.indent {
2753                    return Err(ScanError::new_str(
2754                        self.mark,
2755                        "invalid indentation in multiline quoted scalar",
2756                    ));
2757                }
2758            }
2759
2760            // Join the whitespaces or fold line breaks.
2761            if leading_blanks {
2762                // Old logic:
2763                //   if leading_break empty => emit trailing_breaks (already emitted now)
2764                //   else if trailing_breaks empty => emit ' '
2765                //   else emit trailing_breaks (already emitted now)
2766                if has_leading_break && !has_trailing_breaks {
2767                    match buf {
2768                        FlowScalarBuf::Owned(ref mut string) => string.push(' '),
2769                        FlowScalarBuf::Borrowed { .. } => {
2770                            self.promote_flow_scalar_buf_to_owned(&start_mark, &mut buf)?;
2771                            let Some(string) = buf.as_owned_mut() else {
2772                                unreachable!()
2773                            };
2774                            string.push(' ');
2775                        }
2776                    }
2777                }
2778            }
2779            // else: trailing blanks are already appended to `string`
2780        } // loop
2781
2782        // Eat the right quote.
2783        self.skip_non_blank();
2784
2785        // Ensure there is no invalid trailing content.
2786        self.skip_ws_to_eol(SkipTabs::Yes)?;
2787        match self.input.peek() {
2788            // These can be encountered in flow sequences or mappings.
2789            ',' | '}' | ']' if self.flow_level > 0 => {}
2790            // An end-of-line / end-of-stream is fine. No trailing content.
2791            c if is_breakz(c) => {}
2792            // ':' can be encountered if our scalar is a key.
2793            // Outside of flow contexts, keys cannot span multiple lines
2794            ':' if self.flow_level == 0 && start_mark.line == self.mark.line => {}
2795            // Inside a flow context, this is allowed.
2796            ':' if self.flow_level > 0 => {}
2797            _ => {
2798                return Err(ScanError::new_str(
2799                    self.mark,
2800                    "invalid trailing content after double-quoted scalar",
2801                ));
2802            }
2803        }
2804
2805        let style = if single {
2806            ScalarStyle::SingleQuoted
2807        } else {
2808            ScalarStyle::DoubleQuoted
2809        };
2810
2811        let contents = match buf {
2812            FlowScalarBuf::Owned(string) => Cow::Owned(string),
2813            FlowScalarBuf::Borrowed {
2814                start,
2815                mut end,
2816                pending_ws_start,
2817                pending_ws_end,
2818            } => {
2819                // If we ended after a whitespace run, it is part of the output (no break followed).
2820                if pending_ws_start.is_some() {
2821                    end = pending_ws_end;
2822                }
2823                if let Some(slice) = self.try_borrow_slice(start, end) {
2824                    Cow::Borrowed(slice)
2825                } else {
2826                    let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
2827                        ScanError::new_str(
2828                            start_mark,
2829                            "internal error: input advertised offsets but did not provide a slice",
2830                        )
2831                    })?;
2832                    Cow::Owned(slice.to_owned())
2833                }
2834            }
2835        };
2836
2837        Ok(Token(
2838            Span::new(start_mark, self.mark),
2839            TokenType::Scalar(style, contents),
2840        ))
2841    }
2842
2843    /// Consume successive non-whitespace characters from a flow scalar.
2844    ///
2845    /// This function resolves escape sequences and stops upon encountering a whitespace, the end
2846    /// of the stream or the closing character for the scalar (`'` for single quoted scalars, `"`
2847    /// for double quoted scalars).
2848    ///
2849    /// # Errors
2850    /// Return an error if an invalid escape sequence is found.
2851    fn consume_flow_scalar_non_whitespace_chars(
2852        &mut self,
2853        single: bool,
2854        buf: &mut FlowScalarBuf,
2855        leading_blanks: &mut bool,
2856        start_mark: &Marker,
2857    ) -> Result<(), ScanError> {
2858        self.input.lookahead(2);
2859        while !is_blank_or_breakz(self.input.peek()) {
2860            match self.input.peek() {
2861                // Check for an escaped single quote.
2862                '\'' if self.input.peek_nth(1) == '\'' && single => {
2863                    if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
2864                        buf.commit_pending_ws();
2865                        self.promote_flow_scalar_buf_to_owned(start_mark, buf)?;
2866                    }
2867                    let Some(string) = buf.as_owned_mut() else {
2868                        unreachable!()
2869                    };
2870                    string.push('\'');
2871                    self.skip_n_non_blank(2);
2872                }
2873                // Check for the right quote.
2874                '\'' if single => break,
2875                '"' if !single => break,
2876                // Check for an escaped line break.
2877                '\\' if !single && is_break(self.input.peek_nth(1)) => {
2878                    self.input.lookahead(3);
2879                    if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
2880                        buf.commit_pending_ws();
2881                        self.promote_flow_scalar_buf_to_owned(start_mark, buf)?;
2882                    }
2883                    self.skip_non_blank();
2884                    self.skip_linebreak();
2885                    *leading_blanks = true;
2886                    break;
2887                }
2888                // Check for an escape sequence.
2889                '\\' if !single => {
2890                    if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
2891                        buf.commit_pending_ws();
2892                        self.promote_flow_scalar_buf_to_owned(start_mark, buf)?;
2893                    }
2894                    let Some(string) = buf.as_owned_mut() else {
2895                        unreachable!()
2896                    };
2897                    string.push(self.resolve_flow_scalar_escape_sequence(start_mark)?);
2898                }
2899                c => {
2900                    match buf {
2901                        FlowScalarBuf::Owned(ref mut string) => {
2902                            string.push(c);
2903                        }
2904                        FlowScalarBuf::Borrowed { .. } => {
2905                            buf.commit_pending_ws();
2906                        }
2907                    }
2908                    self.skip_non_blank();
2909
2910                    if let Some(new_end) = self.input.byte_offset() {
2911                        if let FlowScalarBuf::Borrowed { end, .. } = buf {
2912                            *end = new_end;
2913                        }
2914                    }
2915                }
2916            }
2917            self.input.lookahead(2);
2918        }
2919        Ok(())
2920    }
2921
2922    /// Escape the sequence we encounter in a flow scalar.
2923    ///
2924    /// `self.input.peek()` must point to the `\` starting the escape sequence.
2925    ///
2926    /// # Errors
2927    /// Return an error if an invalid escape sequence is found.
2928    fn resolve_flow_scalar_escape_sequence(
2929        &mut self,
2930        start_mark: &Marker,
2931    ) -> Result<char, ScanError> {
2932        let mut code_length = 0usize;
2933        let mut ret = '\0';
2934
2935        match self.input.peek_nth(1) {
2936            '0' => ret = '\0',
2937            'a' => ret = '\x07',
2938            'b' => ret = '\x08',
2939            't' | '\t' => ret = '\t',
2940            'n' => ret = '\n',
2941            'v' => ret = '\x0b',
2942            'f' => ret = '\x0c',
2943            'r' => ret = '\x0d',
2944            'e' => ret = '\x1b',
2945            ' ' => ret = '\x20',
2946            '"' => ret = '"',
2947            '/' => ret = '/',
2948            '\\' => ret = '\\',
2949            // Unicode next line (#x85)
2950            'N' => ret = char::from_u32(0x85).unwrap(),
2951            // Unicode non-breaking space (#xA0)
2952            '_' => ret = char::from_u32(0xA0).unwrap(),
2953            // Unicode line separator (#x2028)
2954            'L' => ret = char::from_u32(0x2028).unwrap(),
2955            // Unicode paragraph separator (#x2029)
2956            'P' => ret = char::from_u32(0x2029).unwrap(),
2957            'x' => code_length = 2,
2958            'u' => code_length = 4,
2959            'U' => code_length = 8,
2960            _ => {
2961                return Err(ScanError::new_str(
2962                    *start_mark,
2963                    "while parsing a quoted scalar, found unknown escape character",
2964                ))
2965            }
2966        }
2967        self.skip_n_non_blank(2);
2968
2969        // Consume an arbitrary escape code.
2970        if code_length > 0 {
2971            self.input.lookahead(code_length);
2972            let mut value = 0u32;
2973            for i in 0..code_length {
2974                let c = self.input.peek_nth(i);
2975                if !is_hex(c) {
2976                    return Err(ScanError::new_str(
2977                        *start_mark,
2978                        "while parsing a quoted scalar, did not find expected hexadecimal number",
2979                    ));
2980                }
2981                value = (value << 4) + as_hex(c);
2982            }
2983
2984            self.skip_n_non_blank(code_length);
2985
2986            // Handle JSON surrogate pairs: high surrogate followed by low surrogate
2987            if code_length == 4 && (0xD800..=0xDBFF).contains(&value) {
2988                self.input.lookahead(2);
2989                if self.input.peek() == '\\' && self.input.peek_nth(1) == 'u' {
2990                    self.skip_n_non_blank(2);
2991                    self.input.lookahead(4);
2992                    let mut low_value = 0u32;
2993                    for i in 0..4 {
2994                        let c = self.input.peek_nth(i);
2995                        if !is_hex(c) {
2996                            return Err(ScanError::new_str(
2997                                *start_mark,
2998                                "while parsing a quoted scalar, did not find expected hexadecimal number for low surrogate",
2999                            ));
3000                        }
3001                        low_value = (low_value << 4) + as_hex(c);
3002                    }
3003                    if (0xDC00..=0xDFFF).contains(&low_value) {
3004                        value = 0x10000 + (((value - 0xD800) << 10) | (low_value - 0xDC00));
3005                        self.skip_n_non_blank(4);
3006                    } else {
3007                        return Err(ScanError::new_str(
3008                            *start_mark,
3009                            "while parsing a quoted scalar, found invalid low surrogate",
3010                        ));
3011                    }
3012                } else {
3013                    return Err(ScanError::new_str(
3014                        *start_mark,
3015                        "while parsing a quoted scalar, found high surrogate without following low surrogate",
3016                    ));
3017                }
3018            } else if code_length == 4 && (0xDC00..=0xDFFF).contains(&value) {
3019                return Err(ScanError::new_str(
3020                    *start_mark,
3021                    "while parsing a quoted scalar, found unpaired low surrogate",
3022                ));
3023            }
3024
3025            let Some(ch) = char::from_u32(value) else {
3026                return Err(ScanError::new_str(
3027                    *start_mark,
3028                    "while parsing a quoted scalar, found invalid Unicode character escape code",
3029                ));
3030            };
3031            ret = ch;
3032        }
3033        Ok(ret)
3034    }
3035
3036    fn fetch_plain_scalar(&mut self) -> ScanResult {
3037        self.save_simple_key();
3038        self.disallow_simple_key();
3039
3040        let tok = self.scan_plain_scalar()?;
3041
3042        self.tokens.push_back(tok);
3043        Ok(())
3044    }
3045
3046    /// Scan for a plain scalar.
3047    ///
3048    /// Plain scalars are the most readable but restricted style. They may span multiple lines in
3049    /// some contexts.
3050    #[allow(clippy::too_many_lines)]
3051    fn scan_plain_scalar(&mut self) -> Result<Token<'input>, ScanError> {
3052        self.unroll_non_block_indents();
3053        let indent = self.indent + 1;
3054        let start_mark = self.mark;
3055
3056        if self.flow_level > 0 && (start_mark.col as isize) < indent {
3057            return Err(ScanError::new_str(
3058                start_mark,
3059                "invalid indentation in flow construct",
3060            ));
3061        }
3062
3063        let mut string = String::with_capacity(32);
3064        self.buf_whitespaces.clear();
3065        self.buf_leading_break.clear();
3066        self.buf_trailing_breaks.clear();
3067        let mut end_mark = self.mark;
3068
3069        loop {
3070            self.input.lookahead(4);
3071            if (self.mark.col == 0 && self.input.next_is_document_indicator())
3072                || self.input.peek() == '#'
3073            {
3074                // BS4K: If a `#` starts a comment after some separation spaces following content
3075                // of a plain scalar in block context, and there is potential continuation on the
3076                // next line, this is invalid. We cannot decide yet if there will be continuation,
3077                // so record that a comment interrupted a plain scalar.
3078                if self.input.peek() == '#'
3079                    && !string.is_empty()
3080                    && !self.buf_whitespaces.is_empty()
3081                    && self.flow_level == 0
3082                {
3083                    self.interrupted_plain_by_comment = Some(self.mark);
3084                }
3085                break;
3086            }
3087
3088            if self.flow_level > 0 && self.input.peek() == '-' && is_flow(self.input.peek_nth(1)) {
3089                return Err(ScanError::new_str(
3090                    self.mark,
3091                    "plain scalar cannot start with '-' followed by ,[]{}",
3092                ));
3093            }
3094
3095            if !self.input.next_is_blank_or_breakz()
3096                && self.input.next_can_be_plain_scalar(self.flow_level > 0)
3097            {
3098                if self.leading_whitespace {
3099                    if self.buf_leading_break.is_empty() {
3100                        string.push_str(&self.buf_leading_break);
3101                        string.push_str(&self.buf_trailing_breaks);
3102                        self.buf_trailing_breaks.clear();
3103                        self.buf_leading_break.clear();
3104                    } else {
3105                        if self.buf_trailing_breaks.is_empty() {
3106                            string.push(' ');
3107                        } else {
3108                            string.push_str(&self.buf_trailing_breaks);
3109                            self.buf_trailing_breaks.clear();
3110                        }
3111                        self.buf_leading_break.clear();
3112                    }
3113                    self.leading_whitespace = false;
3114                } else if !self.buf_whitespaces.is_empty() {
3115                    string.push_str(&self.buf_whitespaces);
3116                    self.buf_whitespaces.clear();
3117                }
3118
3119                // We can unroll the first iteration of the loop.
3120                string.push(self.input.peek());
3121                self.skip_non_blank();
3122                string.reserve(self.input.bufmaxlen());
3123
3124                // Add content non-blank characters to the scalar.
3125                let mut end = false;
3126                while !end {
3127                    // Fill the buffer once and process all characters in the buffer until the next
3128                    // fetch. Note that `next_can_be_plain_scalar` needs 2 lookahead characters,
3129                    // hence the `for` loop looping `self.input.bufmaxlen() - 1` times.
3130                    self.input.lookahead(self.input.bufmaxlen());
3131                    let (stop, chars_consumed) = self.input.fetch_plain_scalar_chunk(
3132                        &mut string,
3133                        self.input.bufmaxlen() - 1,
3134                        self.flow_level > 0,
3135                    );
3136                    end = stop;
3137                    self.mark.offsets.chars += chars_consumed;
3138                    self.mark.col += chars_consumed;
3139                    self.mark.offsets.bytes = self.input.byte_offset();
3140                }
3141                end_mark = self.mark;
3142            }
3143
3144            // We may reach the end of a plain scalar if:
3145            //  - We reach eof
3146            //  - We reach ": "
3147            //  - We find a flow character in a flow context
3148            if !(self.input.next_is_blank() || self.input.next_is_break()) {
3149                break;
3150            }
3151
3152            // Process blank characters.
3153            self.input.lookahead(2);
3154            while self.input.next_is_blank_or_break() {
3155                if self.input.next_is_blank() {
3156                    if !self.leading_whitespace {
3157                        self.buf_whitespaces.push(self.input.peek());
3158                        self.skip_blank();
3159                    } else if (self.mark.col as isize) < indent && self.input.peek() == '\t' {
3160                        // Tabs in an indentation columns are allowed if and only if the line is
3161                        // empty. Skip to the end of the line.
3162                        self.skip_ws_to_eol(SkipTabs::Yes)?;
3163                        if !self.input.next_is_breakz() {
3164                            return Err(ScanError::new_str(
3165                                start_mark,
3166                                "while scanning a plain scalar, found a tab",
3167                            ));
3168                        }
3169                    } else {
3170                        self.skip_blank();
3171                    }
3172                } else {
3173                    // Check if it is a first line break
3174                    if self.leading_whitespace {
3175                        self.skip_break();
3176                        self.buf_trailing_breaks.push('\n');
3177                    } else {
3178                        self.buf_whitespaces.clear();
3179                        self.skip_break();
3180                        self.buf_leading_break.push('\n');
3181                        self.leading_whitespace = true;
3182                    }
3183                }
3184                self.input.lookahead(2);
3185            }
3186
3187            // check indentation level
3188            if self.flow_level == 0 && (self.mark.col as isize) < indent {
3189                break;
3190            }
3191        }
3192
3193        if self.leading_whitespace {
3194            self.allow_simple_key();
3195        }
3196
3197        if string.is_empty() {
3198            // `fetch_plain_scalar` must absolutely consume at least one byte. Otherwise,
3199            // `fetch_next_token` will never stop calling it. An empty plain scalar may happen with
3200            // erroneous inputs such as "{...".
3201            Err(ScanError::new_str(
3202                start_mark,
3203                "unexpected end of plain scalar",
3204            ))
3205        } else {
3206            let contents = if let (Some(start), Some(end)) =
3207                (start_mark.byte_offset(), end_mark.byte_offset())
3208            {
3209                match self.try_borrow_slice(start, end) {
3210                    Some(slice) if slice == string => Cow::Borrowed(slice),
3211                    _ => Cow::Owned(string),
3212                }
3213            } else {
3214                Cow::Owned(string)
3215            };
3216
3217            Ok(Token(
3218                Span::new(start_mark, end_mark),
3219                TokenType::Scalar(ScalarStyle::Plain, contents),
3220            ))
3221        }
3222    }
3223
3224    fn fetch_key(&mut self) -> ScanResult {
3225        let start_mark = self.mark;
3226        if self.flow_level == 0 {
3227            // Check if we are allowed to start a new key (not necessarily simple).
3228            if !self.simple_key_allowed {
3229                return Err(ScanError::new_str(
3230                    self.mark,
3231                    "mapping keys are not allowed in this context",
3232                ));
3233            }
3234            self.roll_indent(
3235                start_mark.col,
3236                None,
3237                TokenType::BlockMappingStart,
3238                start_mark,
3239            );
3240        } else {
3241            // The scanner, upon emitting a `Key`, will prepend a `MappingStart` event.
3242            self.flow_mapping_started = true;
3243        }
3244
3245        self.remove_simple_key()?;
3246
3247        if self.flow_level == 0 {
3248            self.allow_simple_key();
3249        } else {
3250            self.disallow_simple_key();
3251        }
3252
3253        self.skip_non_blank();
3254        self.skip_yaml_whitespace()?;
3255        if self.input.peek() == '\t' {
3256            return Err(ScanError::new_str(
3257                self.mark(),
3258                "tabs disallowed in this context",
3259            ));
3260        }
3261        self.tokens
3262            .push_back(Token(Span::new(start_mark, self.mark), TokenType::Key));
3263        Ok(())
3264    }
3265
3266    /// Fetch a value in a mapping inside of a flow collection.
3267    ///
3268    /// This must not be called if [`self.flow_level`] is 0. This ensures the rules surrounding
3269    /// values in flow collections are respected prior to calling [`fetch_value`].
3270    ///
3271    /// [`self.flow_level`]: Self::flow_level
3272    /// [`fetch_value`]: Self::fetch_value
3273    fn fetch_flow_value(&mut self) -> ScanResult {
3274        let nc = self.input.peek_nth(1);
3275
3276        // If we encounter a ':' inside a flow collection and it is not immediately
3277        // followed by a blank or breakz:
3278        //   - We must check whether an adjacent value is allowed
3279        //     `["a":[]]` is valid. If the key is double-quoted, no need for a space. This
3280        //     is needed for JSON compatibility.
3281        //   - If not, we must ensure there is a space after the ':' and before its value.
3282        //     `[a: []]` is valid while `[a:[]]` isn't. `[a:b]` is treated as `["a:b"]`.
3283        //   - But if the value is empty (null), then it's okay.
3284        // The last line is for YAMLs like `[a:]`. The ':' is followed by a ']' (which is a
3285        // flow character), but the ']' is not the value. The value is an invisible empty
3286        // space which is represented as null ('~').
3287        if self.mark.index() != self.adjacent_value_allowed_at && (nc == '[' || nc == '{') {
3288            return Err(ScanError::new_str(
3289                self.mark,
3290                "':' may not precede any of `[{` in flow mapping",
3291            ));
3292        }
3293
3294        self.fetch_value()
3295    }
3296
3297    /// Fetch a value from a mapping (after a `:`).
3298    fn fetch_value(&mut self) -> ScanResult {
3299        let sk = self.simple_keys.last().unwrap().clone();
3300        let start_mark = self.mark;
3301        let is_implicit_flow_mapping =
3302            !self.implicit_flow_mapping_states.is_empty() && !self.flow_mapping_started;
3303        if is_implicit_flow_mapping {
3304            *self.implicit_flow_mapping_states.last_mut().unwrap() =
3305                ImplicitMappingState::Inside(self.flow_level);
3306        }
3307
3308        // Skip over ':'.
3309        self.skip_non_blank();
3310        // Error detection: if ':' is followed by tab(s) without any space, and then what looks
3311        // like a value, emit a helpful error. The check for '-' or alphanumeric is an intentional
3312        // heuristic that catches common cases (e.g., `key:\tvalue`, `key:\t-item`) without
3313        // rejecting valid YAML like `key:\t|` (block scalar) or `key:\t"quoted"`.
3314        // Note: This heuristic won't catch Unicode value starters like `key:\täöü`, but such
3315        // cases will still fail to parse correctly (just with a less specific error message).
3316        if self.input.look_ch() == '\t'
3317            && !self.skip_ws_to_eol(SkipTabs::Yes)?.has_valid_yaml_ws()
3318            && (self.input.peek() == '-' || self.input.next_is_alpha())
3319        {
3320            return Err(ScanError::new_str(
3321                self.mark,
3322                "':' must be followed by a valid YAML whitespace",
3323            ));
3324        }
3325
3326        if sk.possible {
3327            // insert simple key
3328            let tok = Token(Span::empty(sk.mark), TokenType::Key);
3329            self.insert_token(sk.token_number - self.tokens_parsed, tok);
3330            if is_implicit_flow_mapping {
3331                if sk.mark.line < start_mark.line {
3332                    return Err(ScanError::new_str(
3333                        start_mark,
3334                        "illegal placement of ':' indicator",
3335                    ));
3336                }
3337                self.insert_token(
3338                    sk.token_number - self.tokens_parsed,
3339                    Token(Span::empty(sk.mark), TokenType::FlowMappingStart),
3340                );
3341            }
3342
3343            // Add the BLOCK-MAPPING-START token if needed.
3344            self.roll_indent(
3345                sk.mark.col,
3346                Some(sk.token_number),
3347                TokenType::BlockMappingStart,
3348                sk.mark,
3349            );
3350            self.roll_one_col_indent();
3351
3352            self.simple_keys.last_mut().unwrap().possible = false;
3353            self.disallow_simple_key();
3354        } else {
3355            if is_implicit_flow_mapping {
3356                self.tokens
3357                    .push_back(Token(Span::empty(start_mark), TokenType::FlowMappingStart));
3358            }
3359            // The ':' indicator follows a complex key.
3360            if self.flow_level == 0 {
3361                if !self.simple_key_allowed {
3362                    return Err(ScanError::new_str(
3363                        start_mark,
3364                        "mapping values are not allowed in this context",
3365                    ));
3366                }
3367
3368                self.roll_indent(
3369                    start_mark.col,
3370                    None,
3371                    TokenType::BlockMappingStart,
3372                    start_mark,
3373                );
3374            }
3375            self.roll_one_col_indent();
3376
3377            if self.flow_level == 0 {
3378                self.allow_simple_key();
3379            } else {
3380                self.disallow_simple_key();
3381            }
3382        }
3383        self.tokens
3384            .push_back(Token(Span::empty(start_mark), TokenType::Value));
3385
3386        Ok(())
3387    }
3388
3389    /// Add an indentation level to the stack with the given block token, if needed.
3390    ///
3391    /// An indentation level is added only if:
3392    ///   - We are not in a flow-style construct (which don't have indentation per-se).
3393    ///   - The current column is further indented than the last indent we have registered.
3394    fn roll_indent(
3395        &mut self,
3396        col: usize,
3397        number: Option<usize>,
3398        tok: TokenType<'input>,
3399        mark: Marker,
3400    ) {
3401        if self.flow_level > 0 {
3402            return;
3403        }
3404
3405        // If the last indent was a non-block indent, remove it.
3406        // This means that we prepared an indent that we thought we wouldn't use, but realized just
3407        // now that it is a block indent.
3408        if self.indent <= col as isize {
3409            if let Some(indent) = self.indents.last() {
3410                if !indent.needs_block_end {
3411                    self.indent = indent.indent;
3412                    self.indents.pop();
3413                }
3414            }
3415        }
3416
3417        if self.indent < col as isize {
3418            self.indents.push(Indent {
3419                indent: self.indent,
3420                needs_block_end: true,
3421            });
3422            self.indent = col as isize;
3423            let tokens_parsed = self.tokens_parsed;
3424            match number {
3425                Some(n) => self.insert_token(n - tokens_parsed, Token(Span::empty(mark), tok)),
3426                None => self.tokens.push_back(Token(Span::empty(mark), tok)),
3427            }
3428        }
3429    }
3430
3431    /// Pop indentation levels from the stack as much as needed.
3432    ///
3433    /// Indentation levels are popped from the stack while they are further indented than `col`.
3434    /// If we are in a flow-style construct (which don't have indentation per-se), this function
3435    /// does nothing.
3436    fn unroll_indent(&mut self, col: isize) {
3437        if self.flow_level > 0 {
3438            return;
3439        }
3440        while self.indent > col {
3441            let indent = self.indents.pop().unwrap();
3442            self.indent = indent.indent;
3443            if indent.needs_block_end {
3444                self.tokens
3445                    .push_back(Token(Span::empty(self.mark), TokenType::BlockEnd));
3446            }
3447        }
3448    }
3449
3450    /// Add an indentation level of 1 column that does not start a block.
3451    ///
3452    /// See the documentation of [`Indent::needs_block_end`] for more details.
3453    /// An indentation is not added if we are inside a flow level or if the last indent is already
3454    /// a non-block indent.
3455    fn roll_one_col_indent(&mut self) {
3456        if self.flow_level == 0 && self.indents.last().is_some_and(|x| x.needs_block_end) {
3457            self.indents.push(Indent {
3458                indent: self.indent,
3459                needs_block_end: false,
3460            });
3461            self.indent += 1;
3462        }
3463    }
3464
3465    /// Unroll all last indents created with [`Self::roll_one_col_indent`].
3466    fn unroll_non_block_indents(&mut self) {
3467        while let Some(indent) = self.indents.last() {
3468            if indent.needs_block_end {
3469                break;
3470            }
3471            self.indent = indent.indent;
3472            self.indents.pop();
3473        }
3474    }
3475
3476    /// Mark the next token to be inserted as a potential simple key.
3477    fn save_simple_key(&mut self) {
3478        if self.simple_key_allowed {
3479            let required = self.flow_level == 0
3480                && self.indent == (self.mark.col as isize)
3481                && self.indents.last().unwrap().needs_block_end;
3482
3483            if let Some(last) = self.simple_keys.last_mut() {
3484                *last = SimpleKey {
3485                    mark: self.mark,
3486                    possible: true,
3487                    required,
3488                    token_number: self.tokens_parsed + self.tokens.len(),
3489                };
3490            }
3491        }
3492    }
3493
3494    fn remove_simple_key(&mut self) -> ScanResult {
3495        let last = self.simple_keys.last_mut().unwrap();
3496        if last.possible && last.required {
3497            return Err(self.simple_key_expected());
3498        }
3499
3500        last.possible = false;
3501        Ok(())
3502    }
3503
3504    /// Return whether the scanner is inside a block but outside of a flow sequence.
3505    fn is_within_block(&self) -> bool {
3506        !self.indents.is_empty()
3507    }
3508
3509    /// If an implicit mapping had started, end it.
3510    ///
3511    /// This function does not pop the state in [`implicit_flow_mapping_states`].
3512    ///
3513    /// [`implicit_flow_mapping_states`]: Self::implicit_flow_mapping_states
3514    fn end_implicit_mapping(&mut self, mark: Marker, flow_level: u8) {
3515        if let Some(implicit_mapping) = self.implicit_flow_mapping_states.last_mut() {
3516            if *implicit_mapping == ImplicitMappingState::Inside(flow_level) {
3517                self.flow_mapping_started = false;
3518                *implicit_mapping = ImplicitMappingState::Possible;
3519                self.tokens
3520                    .push_back(Token(Span::empty(mark), TokenType::FlowMappingEnd));
3521            }
3522        }
3523    }
3524}
3525
/// Chomping, how final line breaks and trailing empty lines are interpreted.
///
/// See YAML spec 8.1.1.2.
///
/// Like the other public enums of this module, this type is a plain fieldless enum and
/// implements the common traits (`Clone`, `Copy`, `Debug`, `PartialEq`, `Eq`) so that it can be
/// copied around, compared and printed in diagnostics or test assertions.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Chomping {
    /// The final line break and any trailing empty lines are excluded.
    Strip,
    /// The final line break is preserved, but trailing empty lines are excluded.
    Clip,
    /// The final line break and trailing empty lines are included.
    Keep,
}
3538
#[cfg(test)]
mod test {
    use alloc::borrow::Cow;

    use crate::{
        input::str::StrInput,
        scanner::{Scanner, TokenType},
    };

    /// Sanity check: an ASCII letter is accepted as an anchor character.
    #[test]
    fn test_is_anchor_char() {
        use super::is_anchor_char;
        assert!(is_anchor_char('x'));
    }

    /// Ensure anchors scanned from `StrInput` are returned as `Cow::Borrowed`.
    #[test]
    fn anchor_name_is_borrowed_for_str_input() {
        let mut scanner = Scanner::new(StrInput::new("&anch\n"));

        // Pull tokens until the anchor shows up; running out of tokens fails the test.
        let name = loop {
            let tok = scanner
                .next_token()
                .expect("valid YAML must scan without errors")
                .expect("scanner must eventually produce a token");
            if let TokenType::Anchor(name) = tok.1 {
                break name;
            }
        };

        assert!(matches!(name, Cow::Borrowed("anch")));
    }

    /// Ensure aliases scanned from `StrInput` are returned as `Cow::Borrowed`.
    #[test]
    fn alias_name_is_borrowed_for_str_input() {
        let mut scanner = Scanner::new(StrInput::new("*anch\n"));

        let name = loop {
            let tok = scanner
                .next_token()
                .expect("valid YAML must scan without errors")
                .expect("scanner must eventually produce a token");
            if let TokenType::Alias(name) = tok.1 {
                break name;
            }
        };

        assert!(matches!(name, Cow::Borrowed("anch")));
    }

    /// Ensure `%TAG` directive handle and prefix are borrowed when they are verbatim (no escapes).
    #[test]
    fn tag_directive_parts_are_borrowed_for_str_input() {
        let mut scanner = Scanner::new(StrInput::new("%TAG !e! tag:example.com,2000:app/\n"));

        let (handle, prefix) = loop {
            let tok = scanner
                .next_token()
                .expect("valid YAML must scan without errors")
                .expect("scanner must eventually produce a token");
            if let TokenType::TagDirective(handle, prefix) = tok.1 {
                break (handle, prefix);
            }
        };

        assert!(matches!(handle, Cow::Borrowed("!e!")));
        assert!(matches!(prefix, Cow::Borrowed("tag:example.com,2000:app/")));
    }

    /// Ensure a plain scalar without any whitespace is borrowed from `StrInput`.
    #[test]
    fn plain_scalar_is_borrowed_when_whitespace_free_for_str_input() {
        let mut scanner = Scanner::new(StrInput::new("foo\n"));

        let value = loop {
            let tok = scanner
                .next_token()
                .expect("valid YAML must scan without errors")
                .expect("scanner must eventually produce a token");
            if let TokenType::Scalar(_, value) = tok.1 {
                break value;
            }
        };

        assert!(matches!(value, Cow::Borrowed("foo")));
    }

    /// Ensure a single-line plain scalar containing a space is still borrowed from `StrInput`.
    #[test]
    fn plain_scalar_is_borrowed_when_whitespace_present_for_str_input() {
        let mut scanner = Scanner::new(StrInput::new("foo bar\n"));

        let value = loop {
            let tok = scanner
                .next_token()
                .expect("valid YAML must scan without errors")
                .expect("scanner must eventually produce a token");
            if let TokenType::Scalar(_, value) = tok.1 {
                break value;
            }
        };

        assert!(matches!(value, Cow::Borrowed("foo bar")));
    }

    /// Ensure a single-quoted scalar without escaped quotes is borrowed from `StrInput`.
    #[test]
    fn single_quoted_scalar_is_borrowed_when_verbatim_for_str_input() {
        let mut scanner = Scanner::new(StrInput::new("'foo bar'\n"));

        let value = loop {
            let tok = scanner
                .next_token()
                .expect("valid YAML must scan without errors")
                .expect("scanner must eventually produce a token");
            if let TokenType::Scalar(_, value) = tok.1 {
                break value;
            }
        };

        assert!(matches!(value, Cow::Borrowed("foo bar")));
    }

    /// Ensure a single-quoted scalar with an escaped quote (`''`) is owned and unescaped.
    #[test]
    fn single_quoted_scalar_is_owned_when_quote_is_escaped_for_str_input() {
        let mut scanner = Scanner::new(StrInput::new("'foo''bar'\n"));

        let value = loop {
            let tok = scanner
                .next_token()
                .expect("valid YAML must scan without errors")
                .expect("scanner must eventually produce a token");
            if let TokenType::Scalar(_, value) = tok.1 {
                break value;
            }
        };

        assert!(matches!(value, Cow::Owned(_)));
        assert_eq!(&*value, "foo'bar");
    }

    /// Ensure a double-quoted scalar without escape sequences is borrowed from `StrInput`.
    #[test]
    fn double_quoted_scalar_is_borrowed_when_verbatim_for_str_input() {
        let mut scanner = Scanner::new(StrInput::new("\"foo bar\"\n"));

        let value = loop {
            let tok = scanner
                .next_token()
                .expect("valid YAML must scan without errors")
                .expect("scanner must eventually produce a token");
            if let TokenType::Scalar(_, value) = tok.1 {
                break value;
            }
        };

        assert!(matches!(value, Cow::Borrowed("foo bar")));
    }

    /// Ensure a double-quoted scalar with an escape sequence is owned and decoded.
    #[test]
    fn double_quoted_scalar_is_owned_when_escape_sequence_present_for_str_input() {
        let mut scanner = Scanner::new(StrInput::new("\"foo\\nbar\"\n"));

        let value = loop {
            let tok = scanner
                .next_token()
                .expect("valid YAML must scan without errors")
                .expect("scanner must eventually produce a token");
            if let TokenType::Scalar(_, value) = tok.1 {
                break value;
            }
        };

        assert!(matches!(value, Cow::Owned(_)));
        assert_eq!(&*value, "foo\nbar");
    }

    /// Ensure a plain scalar in key position is borrowed from `StrInput`.
    #[test]
    fn plain_key_is_borrowed_for_str_input() {
        // Keys are just scalars in a key position; they should also be borrowed.
        let mut scanner = Scanner::new(StrInput::new("mykey: value\n"));

        let mut found_key = false;
        let mut key_value: Option<Cow<'_, str>> = None;

        // Stop at the first scalar that comes after a `Key` token, or at the end of the stream.
        while let Some(tok) = scanner
            .next_token()
            .expect("valid YAML must scan without errors")
        {
            match tok.1 {
                TokenType::Key => found_key = true,
                TokenType::Scalar(_, value) if found_key => {
                    key_value = Some(value);
                    break;
                }
                _ => {}
            }
        }

        assert!(found_key, "expected to find a Key token");
        let key_value = key_value.expect("expected to find a scalar after Key token");
        assert!(
            matches!(key_value, Cow::Borrowed("mykey")),
            "key should be borrowed, got: {key_value:?}"
        );
    }

    /// Ensure a double-quoted scalar in key position is borrowed when it has no escapes.
    #[test]
    fn quoted_key_is_borrowed_when_verbatim_for_str_input() {
        let mut scanner = Scanner::new(StrInput::new("\"mykey\": value\n"));

        let mut found_key = false;
        let mut key_value: Option<Cow<'_, str>> = None;

        // Stop at the first scalar that comes after a `Key` token, or at the end of the stream.
        while let Some(tok) = scanner
            .next_token()
            .expect("valid YAML must scan without errors")
        {
            match tok.1 {
                TokenType::Key => found_key = true,
                TokenType::Scalar(_, value) if found_key => {
                    key_value = Some(value);
                    break;
                }
                _ => {}
            }
        }

        assert!(found_key, "expected to find a Key token");
        let key_value = key_value.expect("expected to find a scalar after Key token");
        assert!(
            matches!(key_value, Cow::Borrowed("mykey")),
            "quoted key should be borrowed when verbatim, got: {key_value:?}"
        );
    }

    /// Ensure both parts of a `!!` tag are borrowed from `StrInput`.
    #[test]
    fn tag_handle_and_suffix_are_borrowed_for_str_input() {
        // Test a tag like !!str which should have handle="!!" and suffix="str"
        let mut scanner = Scanner::new(StrInput::new("!!str foo\n"));

        let (handle, suffix) = loop {
            let tok = scanner
                .next_token()
                .expect("valid YAML must scan without errors")
                .expect("scanner must eventually produce a token");
            if let TokenType::Tag(handle, suffix) = tok.1 {
                break (handle, suffix);
            }
        };

        assert!(
            matches!(handle, Cow::Borrowed("!!")),
            "tag handle should be borrowed, got: {handle:?}"
        );
        assert!(
            matches!(suffix, Cow::Borrowed("str")),
            "tag suffix should be borrowed, got: {suffix:?}"
        );
    }

    /// Ensure both parts of a local (`!`) tag are borrowed from `StrInput`.
    #[test]
    fn local_tag_suffix_is_borrowed_for_str_input() {
        // Test a local tag like !mytag which should have handle="!" and suffix="mytag"
        let mut scanner = Scanner::new(StrInput::new("!mytag foo\n"));

        let (handle, suffix) = loop {
            let tok = scanner
                .next_token()
                .expect("valid YAML must scan without errors")
                .expect("scanner must eventually produce a token");
            if let TokenType::Tag(handle, suffix) = tok.1 {
                break (handle, suffix);
            }
        };

        assert!(
            matches!(handle, Cow::Borrowed("!")),
            "local tag handle should be '!', got: {handle:?}"
        );
        assert!(
            matches!(suffix, Cow::Borrowed("mytag")),
            "local tag suffix should be borrowed, got: {suffix:?}"
        );
    }

    /// Ensure a tag suffix containing a URI escape is owned and decoded.
    #[test]
    fn tag_with_uri_escape_is_owned_for_str_input() {
        // Test a tag with URI escape like !my%20tag - suffix must be owned due to decoding
        let mut scanner = Scanner::new(StrInput::new("!!my%20tag foo\n"));

        let (handle, suffix) = loop {
            let tok = scanner
                .next_token()
                .expect("valid YAML must scan without errors")
                .expect("scanner must eventually produce a token");
            if let TokenType::Tag(handle, suffix) = tok.1 {
                break (handle, suffix);
            }
        };

        assert!(
            matches!(handle, Cow::Borrowed("!!")),
            "tag handle should still be borrowed, got: {handle:?}"
        );
        assert!(
            matches!(suffix, Cow::Owned(_)),
            "tag suffix with URI escape should be owned, got: {suffix:?}"
        );
        assert_eq!(&*suffix, "my tag");
    }
}
saphyr_parser_bw/scanner.rs

saphyr_parser_bw/
scanner.rs