saphyr_parser/
scanner.rs

1//! Home to the YAML Scanner.
2//!
3//! The scanner is the lowest-level parsing utility. It is the lexer / tokenizer, reading input a
4//! character at a time and emitting tokens that can later be interpreted by the [`crate::parser`]
5//! to check for more context and validity.
6//!
7//! Due to the grammar of YAML, the scanner has to have some context and is not error-free.
8
9#![allow(clippy::cast_possible_wrap)]
10#![allow(clippy::cast_sign_loss)]
11
12use std::{borrow::Cow, char, collections::VecDeque, error::Error, fmt};
13
14use crate::{
15    char_traits::{
16        as_hex, is_anchor_char, is_blank_or_breakz, is_break, is_breakz, is_flow, is_hex,
17        is_tag_char, is_uri_char,
18    },
19    input::{Input, SkipTabs},
20};
21
/// The encoding of the input. Currently, only UTF-8 is supported.
///
/// The value is carried by [`TokenType::StreamStart`].
#[derive(Clone, Copy, PartialEq, Debug, Eq)]
pub enum TEncoding {
    /// UTF-8 encoding.
    Utf8,
}
28
/// The style in which a scalar was written in the YAML document.
///
/// The style travels with the scalar's text in [`TokenType::Scalar`].
#[derive(Clone, Copy, PartialEq, Debug, Eq, Hash, PartialOrd, Ord)]
pub enum ScalarStyle {
    /// A YAML plain (unquoted) scalar.
    Plain,
    /// A YAML single quoted scalar.
    SingleQuoted,
    /// A YAML double quoted scalar.
    DoubleQuoted,

    /// A YAML literal block (`|` block).
    ///
    /// See [8.1.2](https://yaml.org/spec/1.2.2/#812-literal-style).
    /// In literal blocks, any indented character is content, including white space characters.
    /// There is no way to escape characters, nor to break a long line.
    Literal,
    /// A YAML folded block (`>` block).
    ///
    /// See [8.1.3](https://yaml.org/spec/1.2.2/#813-folded-style).
    /// In folded blocks, any indented character is content, including white space characters.
    /// There is no way to escape characters. Content is subject to line folding, allowing breaking
    /// long lines.
    Folded,
}
53
/// A location in a YAML document.
///
/// A marker is a plain triple of counters; it performs no validation of its own. The
/// scanner in this file moves `index` and `col` forward by one for each character it
/// consumes, and on a line break increments `line` and resets `col` to 0.
#[derive(Clone, Copy, PartialEq, Debug, Eq, Default)]
pub struct Marker {
    /// The index (in chars) in the input string.
    index: usize,
    /// The line (1-indexed: the scanner starts at line 1).
    line: usize,
    /// The column.
    ///
    /// The scanner counts columns from 0; [`ScanError`]'s `Display` adds 1 when printing.
    col: usize,
}

impl Marker {
    /// Build a [`Marker`] from its three coordinates, which are stored exactly as given.
    #[must_use]
    pub fn new(index: usize, line: usize, col: usize) -> Self {
        Self { index, line, col }
    }

    /// Return the index of the marker in the source.
    ///
    /// The scanner bumps this counter once per consumed character, so it is a character
    /// index rather than a byte offset.
    #[must_use]
    pub fn index(&self) -> usize {
        let Self { index, .. } = *self;
        index
    }

    /// Return the line of the marker in the source.
    #[must_use]
    pub fn line(&self) -> usize {
        let Self { line, .. } = *self;
        line
    }

    /// Return the column of the marker in the source, as stored (0-based in scanner use).
    #[must_use]
    pub fn col(&self) -> usize {
        let Self { col, .. } = *self;
        col
    }
}
90
/// A range of locations in a YAML document.
///
/// Every [`Token`] carries the span of source text it was scanned from.
#[derive(Clone, Copy, PartialEq, Debug, Eq, Default)]
pub struct Span {
    /// The start (inclusive) of the range.
    pub start: Marker,
    /// The end (exclusive) of the range.
    pub end: Marker,
}
99
100impl Span {
101    /// Create a new [`Span`] for the given range.
102    #[must_use]
103    pub fn new(start: Marker, end: Marker) -> Span {
104        Span { start, end }
105    }
106
107    /// Create a empty [`Span`] at a given location.
108    ///
109    /// An empty span doesn't contain any characters, but its position may still be meaningful.
110    /// For example, for an indented sequence [`SequenceEnd`] has a location but an empty span.
111    ///
112    /// [`SequenceEnd`]: crate::Event::SequenceEnd
113    #[must_use]
114    pub fn empty(mark: Marker) -> Span {
115        Span {
116            start: mark,
117            end: mark,
118        }
119    }
120}
121
/// An error that occurred while scanning.
///
/// It pairs a position in the source with a human-readable message. Its `Display`
/// implementation renders both.
#[derive(Clone, PartialEq, Debug, Eq)]
pub struct ScanError {
    /// The position at which the error happened in the source.
    mark: Marker,
    /// Human-readable details about the error.
    info: String,
}
130
131impl ScanError {
132    /// Create a new error from a location and an error string.
133    #[must_use]
134    pub fn new(loc: Marker, info: String) -> ScanError {
135        ScanError { mark: loc, info }
136    }
137
138    /// Convenience alias for string slices.
139    #[must_use]
140    pub fn new_str(loc: Marker, info: &str) -> ScanError {
141        ScanError {
142            mark: loc,
143            info: info.to_owned(),
144        }
145    }
146
147    /// Return the marker pointing to the error in the source.
148    #[must_use]
149    pub fn marker(&self) -> &Marker {
150        &self.mark
151    }
152
153    /// Return the information string describing the error that happened.
154    #[must_use]
155    pub fn info(&self) -> &str {
156        self.info.as_ref()
157    }
158}
159
160impl Error for ScanError {
161    fn source(&self) -> Option<&(dyn Error + 'static)> {
162        None
163    }
164}
165
166impl fmt::Display for ScanError {
167    fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
168        write!(
169            formatter,
170            "{} at byte {} line {} column {}",
171            self.info,
172            self.mark.index,
173            self.mark.line,
174            self.mark.col + 1,
175        )
176    }
177}
178
/// The contents of a scanner token.
///
/// String payloads are `Cow<'input, str>` or owned `String`s, depending on the variant.
#[derive(Clone, PartialEq, Debug, Eq)]
pub enum TokenType<'input> {
    /// The start of the stream. Sent first, before even [`TokenType::DocumentStart`].
    StreamStart(TEncoding),
    /// The end of the stream, EOF.
    StreamEnd,
    /// A YAML version directive.
    VersionDirective(
        /// Major
        u32,
        /// Minor
        u32,
    ),
    /// A YAML tag directive (e.g.: `%TAG !e! tag:example.com,2000:app/`).
    TagDirective(
        /// Handle
        Cow<'input, str>,
        /// Prefix
        Cow<'input, str>,
    ),
    /// The start of a YAML document (`---`).
    DocumentStart,
    /// The end of a YAML document (`...`).
    DocumentEnd,
    /// The start of a sequence block.
    ///
    /// Sequence blocks are arrays starting with a `-`.
    BlockSequenceStart,
    /// The start of a block mapping.
    ///
    /// Block mappings are "dictionaries" with "key: value" entries.
    BlockMappingStart,
    /// End of the corresponding `BlockSequenceStart` or `BlockMappingStart`.
    BlockEnd,
    /// Start of an inline sequence (`[ a, b ]`).
    FlowSequenceStart,
    /// End of an inline sequence.
    FlowSequenceEnd,
    /// Start of an inline mapping (`{ a: b, c: d }`).
    FlowMappingStart,
    /// End of an inline mapping.
    FlowMappingEnd,
    /// An entry in a block sequence (c.f.: [`TokenType::BlockSequenceStart`]).
    BlockEntry,
    /// An entry in a flow sequence (c.f.: [`TokenType::FlowSequenceStart`]).
    FlowEntry,
    /// A key in a mapping.
    Key,
    /// A value in a mapping.
    Value,
    /// A reference to an anchor (introduced by `*`).
    Alias(Cow<'input, str>),
    /// A YAML anchor (introduced by `&`).
    Anchor(Cow<'input, str>),
    /// A YAML tag (starting with bangs `!`).
    Tag(
        /// The handle of the tag.
        String,
        /// The suffix of the tag.
        String,
    ),
    /// A regular YAML scalar, along with the style it was written in.
    Scalar(ScalarStyle, Cow<'input, str>),
}
244
/// A scanner token.
///
/// Field `0` is the [`Span`] of source the token covers; field `1` is its [`TokenType`].
#[derive(Clone, PartialEq, Debug, Eq)]
pub struct Token<'input>(pub Span, pub TokenType<'input>);
248
/// A scalar that was parsed and may correspond to a simple key.
///
/// Upon scanning the following yaml:
/// ```yaml
/// a: b
/// ```
/// We do not know that `a` is a key for a map until we have reached the following `:`. For this
/// YAML, we would store `a` as a scalar token in the [`Scanner`], but not emit it yet. It would be
/// kept inside the scanner until more context is fetched and we are able to know whether it is a
/// plain scalar or a key.
///
/// For example, see the following 2 yaml documents:
/// ```yaml
/// ---
/// a: b # Here, `a` is a key.
/// ...
/// ---
/// a # Here, `a` is a plain scalar.
/// ...
/// ```
/// An instance of [`SimpleKey`] is created in the [`Scanner`] when such ambiguity occurs.
///
/// In both documents, scanning `a` would lead to the creation of a [`SimpleKey`] with
/// [`Self::possible`] set to `true`. The token for `a` would be pushed in the [`Scanner`] but not
/// yet emitted. Instead, more context would be fetched (through [`Scanner::fetch_more_tokens`]).
///
/// In the first document, upon reaching the `:`, the [`SimpleKey`] would be inspected and our
/// scalar `a`, since it is a possible key, would be "turned" into a key. This is done by
/// prepending a [`TokenType::Key`] to our scalar token in the [`Scanner`]. This way, the
/// [`crate::parser::Parser`] would read the [`TokenType::Key`] token before the
/// [`TokenType::Scalar`] token.
///
/// In the second document however, reaching the EOF would stale the [`SimpleKey`] and no
/// [`TokenType::Key`] would be emitted by the scanner.
#[derive(Clone, PartialEq, Debug, Eq)]
struct SimpleKey {
    /// Whether the token this [`SimpleKey`] refers to may still be a key.
    ///
    /// Sometimes, when we have more context, we notice that what we thought could be a key no
    /// longer can be. In that case, [`Self::possible`] is set to `false`.
    ///
    /// For instance, let us consider the following invalid YAML:
    /// ```yaml
    /// key
    ///   : value
    /// ```
    /// Upon reading the `\n` after `key`, the [`SimpleKey`] that was created for `key` is staled
    /// and [`Self::possible`] set to `false`.
    possible: bool,
    /// Whether the token this [`SimpleKey`] refers to is required to be a key.
    ///
    /// With more context, we may know for sure that the token must be a key. If the YAML is
    /// invalid, it may happen that the token be deemed not a key. In such event, an error has to
    /// be raised. This boolean helps us know when to raise such error.
    ///
    /// TODO(ethiraric, 30/12/2023): Example of when this happens.
    required: bool,
    /// The index of the token referred to by the [`SimpleKey`].
    ///
    /// This is the index in the scanner, which takes into account both the tokens that have been
    /// emitted and those about to be emitted. See [`Scanner::tokens_parsed`] and
    /// [`Scanner::tokens`] for more details.
    token_number: usize,
    /// The position at which the token the [`SimpleKey`] refers to is.
    ///
    /// The scanner compares this against its own position to decide when the key has gone
    /// stale (see `Scanner::stale_simple_keys`).
    mark: Marker,
}
315
316impl SimpleKey {
317    /// Create a new [`SimpleKey`] at the given `Marker` and with the given flow level.
318    fn new(mark: Marker) -> SimpleKey {
319        SimpleKey {
320            possible: false,
321            required: false,
322            token_number: 0,
323            mark,
324        }
325    }
326}
327
/// An indentation level on the stack of indentations.
#[derive(Clone, Debug, Default)]
struct Indent {
    /// The former indentation level.
    indent: isize,
    /// Whether, upon closing, this indent generates a `BlockEnd` token.
    ///
    /// There are levels of indentation which do not start a block. Examples of this would be:
    /// ```yaml
    /// -
    ///   foo # ok
    /// -
    /// bar # ko, bar needs to be indented further than the `-`.
    /// - [
    ///  baz, # ok
    /// quux # ko, quux needs to be indented further than the '-'.
    /// ] # ko, the closing bracket needs to be indented further than the `-`.
    /// ```
    ///
    /// The indentation level created by the `-` is for a single entry in the sequence. Emitting a
    /// `BlockEnd` when this indentation block ends would generate one `BlockEnd` per entry in the
    /// sequence, although we must have exactly one to end the sequence.
    needs_block_end: bool,
}
352
/// The knowledge we have about an implicit mapping.
///
/// Implicit mappings occur in flow sequences where the opening `{` for a mapping in a flow
/// sequence is omitted:
/// ```yaml
/// [ a: b, c: d ]
/// # Equivalent to
/// [ { a: b }, { c: d } ]
/// # Equivalent to
/// - a: b
/// - c: d
/// ```
///
/// The state must be carefully tracked for each nested flow sequence since we must emit a
/// [`FlowMappingStart`] event when encountering `a` and `c` in our previous example without a
/// character hinting us. Similarly, we must emit a [`FlowMappingEnd`] event when we reach the `,`
/// or the `]`. If the state is not properly tracked, we may omit to emit these events or emit them
/// out-of-order.
///
/// [`FlowMappingStart`]: TokenType::FlowMappingStart
/// [`FlowMappingEnd`]: TokenType::FlowMappingEnd
#[derive(Debug, PartialEq)]
enum ImplicitMappingState {
    /// It is possible there is an implicit mapping.
    ///
    /// This state is the one when we have just encountered the opening `[`. We need more context
    /// to know whether an implicit mapping follows.
    Possible,
    /// We are inside the implicit mapping.
    ///
    /// Note that this state is not set immediately (we need to have encountered the `:` to know).
    Inside,
}
386
/// The YAML scanner.
///
/// This corresponds to the low-level interface when reading YAML. The scanner emits tokens as
/// they are read (akin to a lexer), but it also holds sufficient context to be able to
/// disambiguate some of the constructs. It has understanding of indentation and whitespace and is
/// able to generate error messages for some invalid YAML constructs.
///
/// It is however not a full parser and needs [`crate::parser::Parser`] to fully detect invalid
/// YAML documents.
#[derive(Debug)]
#[allow(clippy::struct_excessive_bools)]
pub struct Scanner<'input, T> {
    /// The input source.
    ///
    /// This must implement [`Input`].
    input: T,
    /// The position of the cursor within the reader.
    mark: Marker,
    /// Buffer for tokens to be returned.
    ///
    /// This buffer can hold some temporary tokens that are not yet ready to be returned. For
    /// instance, if we just read a scalar, it can be a value or a key if an implicit mapping
    /// follows. In this case, the token stays in the `VecDeque` but cannot be returned from
    /// [`Self::next`] until we have more context.
    tokens: VecDeque<Token<'input>>,
    /// The last error that happened.
    ///
    /// Once set, the `Iterator` implementation stops yielding tokens.
    error: Option<ScanError>,

    /// Whether we have already emitted the `StreamStart` token.
    stream_start_produced: bool,
    /// Whether we have already emitted the `StreamEnd` token.
    stream_end_produced: bool,
    /// In some flow contexts, the value of a mapping is allowed to be adjacent to the `:`. When it
    /// is, the index at which the `:` may be must be stored in `adjacent_value_allowed_at`.
    adjacent_value_allowed_at: usize,
    /// Whether a simple key could potentially start at the current position.
    ///
    /// Simple keys are the opposite of complex keys which are keys starting with `?`.
    simple_key_allowed: bool,
    /// A stack of potential simple keys.
    ///
    /// Refer to the documentation of [`SimpleKey`] for a more in-depth explanation of what they
    /// are.
    simple_keys: Vec<SimpleKey>,
    /// The current indentation level (`-1` before any block has been opened).
    indent: isize,
    /// List of all block indentation levels we are in (except the current one).
    indents: Vec<Indent>,
    /// Level of nesting of flow sequences.
    flow_level: u8,
    /// The number of tokens that have been returned from the scanner.
    ///
    /// This excludes the tokens from [`Self::tokens`].
    tokens_parsed: usize,
    /// Whether a token is ready to be taken from [`Self::tokens`].
    token_available: bool,
    /// Whether all characters encountered since the last newline were whitespace.
    leading_whitespace: bool,
    /// Whether we started a flow mapping.
    ///
    /// This is used to detect implicit flow mapping starts such as:
    /// ```yaml
    /// [ : foo ] # { null: "foo" }
    /// ```
    flow_mapping_started: bool,
    /// An array of states, representing whether flow sequences have implicit mappings.
    ///
    /// When a flow mapping is possible (when encountering the first `[` or a `,` in a sequence),
    /// the state is set to [`Possible`].
    /// When we encounter the `:`, we know we are in an implicit mapping and can set the state to
    /// [`Inside`].
    ///
    /// There is one entry in this [`Vec`] for each nested flow sequence that we are in.
    /// The entries are created with the opening `[` and popped with the closing `]`.
    ///
    /// [`Possible`]: ImplicitMappingState::Possible
    /// [`Inside`]: ImplicitMappingState::Inside
    implicit_flow_mapping_states: Vec<ImplicitMappingState>,
    /// Scratch string buffer, created empty by [`Self::new`].
    ///
    /// NOTE(review): its users are outside this part of the file — presumably it holds the first
    /// line break seen while scanning a scalar; confirm against the scalar-scanning code.
    buf_leading_break: String,
    /// Scratch string buffer, created empty by [`Self::new`].
    ///
    /// NOTE(review): presumably holds line breaks following the leading one while scanning a
    /// scalar; confirm against the scalar-scanning code.
    buf_trailing_breaks: String,
    /// Scratch string buffer, created empty by [`Self::new`].
    ///
    /// NOTE(review): presumably holds pending blanks while scanning a scalar; confirm against
    /// the scalar-scanning code.
    buf_whitespaces: String,
}
469
470impl<'input, T: Input> Iterator for Scanner<'input, T> {
471    type Item = Token<'input>;
472
473    fn next(&mut self) -> Option<Self::Item> {
474        if self.error.is_some() {
475            return None;
476        }
477        match self.next_token() {
478            Ok(Some(tok)) => {
479                debug_print!(
480                    "    \x1B[;32m\u{21B3} {:?} \x1B[;36m{:?}\x1B[;m",
481                    tok.1,
482                    tok.0
483                );
484                Some(tok)
485            }
486            Ok(tok) => tok,
487            Err(e) => {
488                self.error = Some(e);
489                None
490            }
491        }
492    }
493}
494
/// A convenience alias for scanner functions that may fail without returning a value.
///
/// On failure, the [`ScanError`] carries the position and a description of the problem.
pub type ScanResult = Result<(), ScanError>;
497
498impl<'input, T: Input> Scanner<'input, T> {
499    /// Creates the YAML tokenizer.
500    pub fn new(input: T) -> Self {
501        Scanner {
502            input,
503            mark: Marker::new(0, 1, 0),
504            tokens: VecDeque::new(),
505            error: None,
506
507            stream_start_produced: false,
508            stream_end_produced: false,
509            adjacent_value_allowed_at: 0,
510            simple_key_allowed: true,
511            simple_keys: Vec::new(),
512            indent: -1,
513            indents: Vec::new(),
514            flow_level: 0,
515            tokens_parsed: 0,
516            token_available: false,
517            leading_whitespace: true,
518            flow_mapping_started: false,
519            implicit_flow_mapping_states: vec![],
520
521            buf_leading_break: String::new(),
522            buf_trailing_breaks: String::new(),
523            buf_whitespaces: String::new(),
524        }
525    }
526
    /// Get a copy of the last error that was encountered, if any.
    ///
    /// This does not clear the error state and further calls to [`Self::get_error`] will return (a
    /// clone of) the same error. Returns `None` while no error has been recorded.
    #[inline]
    pub fn get_error(&self) -> Option<ScanError> {
        self.error.clone()
    }
535
    /// Consume the next character. It is assumed the next character is a blank.
    ///
    /// Advances the marker by one character and one column. Unlike [`Self::skip_non_blank`],
    /// `leading_whitespace` is left untouched.
    #[inline]
    fn skip_blank(&mut self) {
        self.input.skip();

        self.mark.index += 1;
        self.mark.col += 1;
    }
544
    /// Consume the next character. It is assumed the next character is not a blank.
    ///
    /// Advances the marker by one character and one column, and records that the current line
    /// no longer consists only of whitespace.
    #[inline]
    fn skip_non_blank(&mut self) {
        self.input.skip();

        self.mark.index += 1;
        self.mark.col += 1;
        self.leading_whitespace = false;
    }
554
    /// Consume the next characters. It is assumed none of the next characters are blanks.
    ///
    /// Advances the marker by `count` characters and columns, and records that the current line
    /// no longer consists only of whitespace (even when `count` is 0).
    #[inline]
    fn skip_n_non_blank(&mut self, count: usize) {
        self.input.skip_n(count);

        self.mark.index += count;
        self.mark.col += count;
        self.leading_whitespace = false;
    }
564
    /// Consume the next character. It is assumed the next character is a newline.
    ///
    /// Moves the marker to column 0 of the following line and marks that line as having seen
    /// only whitespace so far.
    #[inline]
    fn skip_nl(&mut self) {
        self.input.skip();

        self.mark.index += 1;
        self.mark.col = 0;
        self.mark.line += 1;
        self.leading_whitespace = true;
    }
575
576    /// Consume a linebreak (either CR, LF or CRLF), if any. Do nothing if there's none.
577    #[inline]
578    fn skip_linebreak(&mut self) {
579        if self.input.next_2_are('\r', '\n') {
580            // While technically not a blank, this does not matter as `self.leading_whitespace`
581            // will be reset by `skip_nl`.
582            self.skip_blank();
583            self.skip_nl();
584        } else if self.input.next_is_break() {
585            self.skip_nl();
586        }
587    }
588
    /// Return whether the [`TokenType::StreamStart`] event has been emitted.
    ///
    /// This becomes `true` as soon as the scanner has fetched its very first token.
    #[inline]
    pub fn stream_started(&self) -> bool {
        self.stream_start_produced
    }
594
    /// Return whether the [`TokenType::StreamEnd`] event has been emitted.
    ///
    /// Once this is `true`, [`Self::next_token`] only returns `Ok(None)`.
    #[inline]
    pub fn stream_ended(&self) -> bool {
        self.stream_end_produced
    }
600
    /// Get the current position in the input stream.
    ///
    /// The returned [`Marker`] is a copy; it does not follow the scanner as it advances.
    #[inline]
    pub fn mark(&self) -> Marker {
        self.mark
    }
606
    /// Read and consume a line break (either `\r`, `\n` or `\r\n`).
    ///
    /// Whatever form the break took in the input, a single `\n` is pushed into `s`.
    ///
    /// # Panics (in debug)
    /// If the next characters do not correspond to a line break.
    #[inline]
    fn read_break(&mut self, s: &mut String) {
        self.skip_break();
        s.push('\n');
    }
618
619    // Read and consume a line break (either `\r`, `\n` or `\r\n`).
620    //
621    // # Panics (in debug)
622    // If the next characters do not correspond to a line break.
623    #[inline]
624    fn skip_break(&mut self) {
625        let c = self.input.peek();
626        let nc = self.input.peek_nth(1);
627        debug_assert!(is_break(c));
628        if c == '\r' && nc == '\n' {
629            self.skip_blank();
630        }
631        self.skip_nl();
632    }
633
634    /// Insert a token at the given position.
635    fn insert_token(&mut self, pos: usize, tok: Token<'input>) {
636        let old_len = self.tokens.len();
637        assert!(pos <= old_len);
638        self.tokens.insert(pos, tok);
639    }
640
    /// Record that a simple key may start at the current position.
    fn allow_simple_key(&mut self) {
        self.simple_key_allowed = true;
    }
644
    /// Record that a simple key may not start at the current position.
    fn disallow_simple_key(&mut self) {
        self.simple_key_allowed = false;
    }
648
    /// Fetch the next token in the stream.
    ///
    /// The very first call only produces the stream start. Every later call skips whitespace
    /// and comments, stales simple keys that can no longer be keys, unrolls indentation to the
    /// current column, and then dispatches on the next one or two characters to the matching
    /// `fetch_*` routine.
    ///
    /// # Errors
    /// Returns `ScanError` when the scanner does not find the next expected token.
    pub fn fetch_next_token(&mut self) -> ScanResult {
        self.input.lookahead(1);

        // Nothing has been emitted yet: the stream start comes before anything else.
        if !self.stream_start_produced {
            self.fetch_stream_start();
            return Ok(());
        }
        self.skip_to_next_token()?;

        debug_print!(
            "  \x1B[38;5;244m\u{2192} fetch_next_token after whitespace {:?} {:?}\x1B[m",
            self.mark,
            self.input.peek()
        );

        self.stale_simple_keys()?;

        let mark = self.mark;
        self.unroll_indent(mark.col as isize);

        // Document markers are 3 characters long and must be followed by a blank or a break.
        self.input.lookahead(4);

        if self.input.next_is_z() {
            self.fetch_stream_end()?;
            return Ok(());
        }

        // Directives and document markers are only recognized at the very start of a line.
        if self.mark.col == 0 {
            if self.input.next_char_is('%') {
                return self.fetch_directive();
            } else if self.input.next_is_document_start() {
                return self.fetch_document_indicator(TokenType::DocumentStart);
            } else if self.input.next_is_document_end() {
                self.fetch_document_indicator(TokenType::DocumentEnd)?;
                // Only blanks may follow `...` on its line.
                self.skip_ws_to_eol(SkipTabs::Yes)?;
                if !self.input.next_is_breakz() {
                    return Err(ScanError::new_str(
                        self.mark,
                        "invalid content after document end marker",
                    ));
                }
                return Ok(());
            }
        }

        if (self.mark.col as isize) < self.indent {
            return Err(ScanError::new_str(self.mark, "invalid indentation"));
        }

        // Arms are tried top to bottom; the order of the guarded arms matters.
        let c = self.input.peek();
        let nc = self.input.peek_nth(1);
        match c {
            '[' => self.fetch_flow_collection_start(TokenType::FlowSequenceStart),
            '{' => self.fetch_flow_collection_start(TokenType::FlowMappingStart),
            ']' => self.fetch_flow_collection_end(TokenType::FlowSequenceEnd),
            '}' => self.fetch_flow_collection_end(TokenType::FlowMappingEnd),
            ',' => self.fetch_flow_entry(),
            // `-`, `?` and `:` are indicators only when followed by a blank, a break or EOF.
            '-' if is_blank_or_breakz(nc) => self.fetch_block_entry(),
            '?' if is_blank_or_breakz(nc) => self.fetch_key(),
            ':' if is_blank_or_breakz(nc) => self.fetch_value(),
            // Inside flow constructs, `:` may also be directly followed by a flow indicator or
            // sit at the position recorded in `adjacent_value_allowed_at`.
            ':' if self.flow_level > 0
                && (is_flow(nc) || self.mark.index == self.adjacent_value_allowed_at) =>
            {
                self.fetch_flow_value()
            }
            // Is it an alias?
            '*' => self.fetch_anchor(true),
            // Is it an anchor?
            '&' => self.fetch_anchor(false),
            '!' => self.fetch_tag(),
            // Is it a literal scalar?
            '|' if self.flow_level == 0 => self.fetch_block_scalar(true),
            // Is it a folded scalar?
            '>' if self.flow_level == 0 => self.fetch_block_scalar(false),
            '\'' => self.fetch_flow_scalar(true),
            '"' => self.fetch_flow_scalar(false),
            // plain scalar
            '-' if !is_blank_or_breakz(nc) => self.fetch_plain_scalar(),
            ':' | '?' if !is_blank_or_breakz(nc) && self.flow_level == 0 => {
                self.fetch_plain_scalar()
            }
            // These characters cannot start a token here (`%` is only handled in column 0 above).
            '%' | '@' | '`' => Err(ScanError::new(
                self.mark,
                format!("unexpected character: `{c}'"),
            )),
            // Anything else, including `|` and `>` inside flow constructs, starts a plain scalar.
            _ => self.fetch_plain_scalar(),
        }
    }
741
742    /// Return the next token in the stream.
743    /// # Errors
744    /// Returns `ScanError` when scanning fails to find an expected next token.
745    pub fn next_token(&mut self) -> Result<Option<Token<'input>>, ScanError> {
746        if self.stream_end_produced {
747            return Ok(None);
748        }
749
750        if !self.token_available {
751            self.fetch_more_tokens()?;
752        }
753        let Some(t) = self.tokens.pop_front() else {
754            return Err(ScanError::new_str(
755                self.mark,
756                "did not find expected next token",
757            ));
758        };
759        self.token_available = false;
760        self.tokens_parsed += 1;
761
762        if let TokenType::StreamEnd = t.1 {
763            self.stream_end_produced = true;
764        }
765        Ok(Some(t))
766    }
767
768    /// Fetch tokens from the token stream.
769    /// # Errors
770    /// Returns `ScanError` when loading fails.
771    pub fn fetch_more_tokens(&mut self) -> ScanResult {
772        let mut need_more;
773        loop {
774            if self.tokens.is_empty() {
775                need_more = true;
776            } else {
777                need_more = false;
778                // Stale potential keys that we know won't be keys.
779                self.stale_simple_keys()?;
780                // If our next token to be emitted may be a key, fetch more context.
781                for sk in &self.simple_keys {
782                    if sk.possible && sk.token_number == self.tokens_parsed {
783                        need_more = true;
784                        break;
785                    }
786                }
787            }
788
789            if !need_more {
790                break;
791            }
792            self.fetch_next_token()?;
793        }
794        self.token_available = true;
795
796        Ok(())
797    }
798
799    /// Mark simple keys that can no longer be keys as such.
800    ///
801    /// This function sets `possible` to `false` to each key that, now we have more context, we
802    /// know will not be keys.
803    ///
804    /// # Errors
805    /// This function returns an error if one of the key we would stale was required to be a key.
806    fn stale_simple_keys(&mut self) -> ScanResult {
807        for sk in &mut self.simple_keys {
808            if sk.possible
809                // If not in a flow construct, simple keys cannot span multiple lines.
810                && self.flow_level == 0
811                    && (sk.mark.line < self.mark.line || sk.mark.index + 1024 < self.mark.index)
812            {
813                if sk.required {
814                    return Err(ScanError::new_str(self.mark, "simple key expect ':'"));
815                }
816                sk.possible = false;
817            }
818        }
819        Ok(())
820    }
821
    /// Skip over all whitespace (`\t`, ` `, `\n`, `\r`) and comments until the next token.
    ///
    /// Crossing a line break outside of flow constructs re-allows simple keys. The marker is
    /// kept in sync with every character skipped.
    ///
    /// # Errors
    /// This function returns an error if a tabulation is encountered where there should not be
    /// one.
    fn skip_to_next_token(&mut self) -> ScanResult {
        loop {
            // TODO(chenyh) BOM
            match self.input.look_ch() {
                // Tabs may not be used as indentation.
                // "Indentation" only exists as long as a block is started, but does not exist
                // inside of flow-style constructs. Tabs are allowed as part of leading
                // whitespaces outside of indentation.
                // If a flow-style construct is in an indented block, its contents must still be
                // indented. Also, tabs are allowed anywhere in it if it has no content.
                '\t' if self.is_within_block()
                    && self.leading_whitespace
                    && (self.mark.col as isize) < self.indent =>
                {
                    self.skip_ws_to_eol(SkipTabs::Yes)?;
                    // If we have content on that line with a tab, return an error.
                    if !self.input.next_is_breakz() {
                        return Err(ScanError::new_str(
                            self.mark,
                            "tabs disallowed within this context (block indentation)",
                        ));
                    }
                }
                // Any other tab (the guarded arm above did not match) or a space is a blank.
                '\t' | ' ' => self.skip_blank(),
                '\n' | '\r' => {
                    // Two characters are needed to recognize a CRLF pair.
                    self.input.lookahead(2);
                    self.skip_linebreak();
                    if self.flow_level == 0 {
                        self.allow_simple_key();
                    }
                }
                '#' => {
                    // Skip the comment up to (but not including) the line break or end of input.
                    let comment_length = self.input.skip_while_non_breakz();
                    self.mark.index += comment_length;
                    self.mark.col += comment_length;
                }
                // Anything else is the start of the next token.
                _ => break,
            }
        }
        Ok(())
    }
868
869    /// Skip over YAML whitespace (` `, `\n`, `\r`).
870    ///
871    /// # Errors
872    /// This function returns an error if no whitespace was found.
873    fn skip_yaml_whitespace(&mut self) -> ScanResult {
874        let mut need_whitespace = true;
875        loop {
876            match self.input.look_ch() {
877                ' ' => {
878                    self.skip_blank();
879
880                    need_whitespace = false;
881                }
882                '\n' | '\r' => {
883                    self.input.lookahead(2);
884                    self.skip_linebreak();
885                    if self.flow_level == 0 {
886                        self.allow_simple_key();
887                    }
888                    need_whitespace = false;
889                }
890                '#' => {
891                    let comment_length = self.input.skip_while_non_breakz();
892                    self.mark.index += comment_length;
893                    self.mark.col += comment_length;
894                }
895                _ => break,
896            }
897        }
898
899        if need_whitespace {
900            Err(ScanError::new_str(self.mark(), "expected whitespace"))
901        } else {
902            Ok(())
903        }
904    }
905
906    fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> Result<SkipTabs, ScanError> {
907        let (n_bytes, result) = self.input.skip_ws_to_eol(skip_tabs);
908        self.mark.col += n_bytes;
909        self.mark.index += n_bytes;
910        result.map_err(|msg| ScanError::new_str(self.mark, msg))
911    }
912
913    fn fetch_stream_start(&mut self) {
914        let mark = self.mark;
915        self.indent = -1;
916        self.stream_start_produced = true;
917        self.allow_simple_key();
918        self.tokens.push_back(Token(
919            Span::empty(mark),
920            TokenType::StreamStart(TEncoding::Utf8),
921        ));
922        self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0)));
923    }
924
925    fn fetch_stream_end(&mut self) -> ScanResult {
926        // force new line
927        if self.mark.col != 0 {
928            self.mark.col = 0;
929            self.mark.line += 1;
930        }
931
932        // If the stream ended, we won't have more context. We can stall all the simple keys we
933        // had. If one was required, however, that was an error and we must propagate it.
934        for sk in &mut self.simple_keys {
935            if sk.required && sk.possible {
936                return Err(ScanError::new_str(self.mark, "simple key expected"));
937            }
938            sk.possible = false;
939        }
940
941        self.unroll_indent(-1);
942        self.remove_simple_key()?;
943        self.disallow_simple_key();
944
945        self.tokens
946            .push_back(Token(Span::empty(self.mark), TokenType::StreamEnd));
947        Ok(())
948    }
949
950    fn fetch_directive(&mut self) -> ScanResult {
951        self.unroll_indent(-1);
952        self.remove_simple_key()?;
953
954        self.disallow_simple_key();
955
956        let tok = self.scan_directive()?;
957        self.tokens.push_back(tok);
958
959        Ok(())
960    }
961
962    fn scan_directive(&mut self) -> Result<Token<'input>, ScanError> {
963        let start_mark = self.mark;
964        self.skip_non_blank();
965
966        let name = self.scan_directive_name()?;
967        let tok = match name.as_ref() {
968            "YAML" => self.scan_version_directive_value(&start_mark)?,
969            "TAG" => self.scan_tag_directive_value(&start_mark)?,
970            // XXX This should be a warning instead of an error
971            _ => {
972                // skip current line
973                let line_len = self.input.skip_while_non_breakz();
974                self.mark.index += line_len;
975                self.mark.col += line_len;
976                // XXX return an empty TagDirective token
977                Token(
978                    Span::new(start_mark, self.mark),
979                    TokenType::TagDirective(Cow::default(), Cow::default()),
980                )
981                // return Err(ScanError::new_str(start_mark,
982                //     "while scanning a directive, found unknown directive name"))
983            }
984        };
985
986        self.skip_ws_to_eol(SkipTabs::Yes)?;
987
988        if self.input.next_is_breakz() {
989            self.input.lookahead(2);
990            self.skip_linebreak();
991            Ok(tok)
992        } else {
993            Err(ScanError::new_str(
994                start_mark,
995                "while scanning a directive, did not find expected comment or line break",
996            ))
997        }
998    }
999
1000    fn scan_version_directive_value(&mut self, mark: &Marker) -> Result<Token<'input>, ScanError> {
1001        let n_blanks = self.input.skip_while_blank();
1002        self.mark.index += n_blanks;
1003        self.mark.col += n_blanks;
1004
1005        let major = self.scan_version_directive_number(mark)?;
1006
1007        if self.input.peek() != '.' {
1008            return Err(ScanError::new_str(
1009                *mark,
1010                "while scanning a YAML directive, did not find expected digit or '.' character",
1011            ));
1012        }
1013        self.skip_non_blank();
1014
1015        let minor = self.scan_version_directive_number(mark)?;
1016
1017        Ok(Token(
1018            Span::new(*mark, self.mark),
1019            TokenType::VersionDirective(major, minor),
1020        ))
1021    }
1022
1023    fn scan_directive_name(&mut self) -> Result<String, ScanError> {
1024        let start_mark = self.mark;
1025        let mut string = String::new();
1026
1027        let n_chars = self.input.fetch_while_is_alpha(&mut string);
1028        self.mark.index += n_chars;
1029        self.mark.col += n_chars;
1030
1031        if string.is_empty() {
1032            return Err(ScanError::new_str(
1033                start_mark,
1034                "while scanning a directive, could not find expected directive name",
1035            ));
1036        }
1037
1038        if !is_blank_or_breakz(self.input.peek()) {
1039            return Err(ScanError::new_str(
1040                start_mark,
1041                "while scanning a directive, found unexpected non-alphabetical character",
1042            ));
1043        }
1044
1045        Ok(string)
1046    }
1047
1048    fn scan_version_directive_number(&mut self, mark: &Marker) -> Result<u32, ScanError> {
1049        let mut val = 0u32;
1050        let mut length = 0usize;
1051        while let Some(digit) = self.input.look_ch().to_digit(10) {
1052            if length + 1 > 9 {
1053                return Err(ScanError::new_str(
1054                    *mark,
1055                    "while scanning a YAML directive, found extremely long version number",
1056                ));
1057            }
1058            length += 1;
1059            val = val * 10 + digit;
1060            self.skip_non_blank();
1061        }
1062
1063        if length == 0 {
1064            return Err(ScanError::new_str(
1065                *mark,
1066                "while scanning a YAML directive, did not find expected version number",
1067            ));
1068        }
1069
1070        Ok(val)
1071    }
1072
1073    fn scan_tag_directive_value(&mut self, mark: &Marker) -> Result<Token<'input>, ScanError> {
1074        let n_blanks = self.input.skip_while_blank();
1075        self.mark.index += n_blanks;
1076        self.mark.col += n_blanks;
1077
1078        let handle = self.scan_tag_handle(true, mark)?;
1079
1080        let n_blanks = self.input.skip_while_blank();
1081        self.mark.index += n_blanks;
1082        self.mark.col += n_blanks;
1083
1084        let prefix = self.scan_tag_prefix(mark)?;
1085
1086        self.input.lookahead(1);
1087
1088        if self.input.next_is_blank_or_breakz() {
1089            Ok(Token(
1090                Span::new(*mark, self.mark),
1091                TokenType::TagDirective(handle.into(), prefix.into()),
1092            ))
1093        } else {
1094            Err(ScanError::new_str(
1095                *mark,
1096                "while scanning TAG, did not find expected whitespace or line break",
1097            ))
1098        }
1099    }
1100
1101    fn fetch_tag(&mut self) -> ScanResult {
1102        self.save_simple_key();
1103        self.disallow_simple_key();
1104
1105        let tok = self.scan_tag()?;
1106        self.tokens.push_back(tok);
1107        Ok(())
1108    }
1109
1110    fn scan_tag(&mut self) -> Result<Token<'input>, ScanError> {
1111        let start_mark = self.mark;
1112        let mut handle = String::new();
1113        let mut suffix;
1114
1115        // Check if the tag is in the canonical form (verbatim).
1116        self.input.lookahead(2);
1117
1118        if self.input.nth_char_is(1, '<') {
1119            suffix = self.scan_verbatim_tag(&start_mark)?;
1120        } else {
1121            // The tag has either the '!suffix' or the '!handle!suffix'
1122            handle = self.scan_tag_handle(false, &start_mark)?;
1123            // Check if it is, indeed, handle.
1124            if handle.len() >= 2 && handle.starts_with('!') && handle.ends_with('!') {
1125                // A tag handle starting with "!!" is a secondary tag handle.
1126                let is_secondary_handle = handle == "!!";
1127                suffix =
1128                    self.scan_tag_shorthand_suffix(false, is_secondary_handle, "", &start_mark)?;
1129            } else {
1130                suffix = self.scan_tag_shorthand_suffix(false, false, &handle, &start_mark)?;
1131                "!".clone_into(&mut handle);
1132                // A special case: the '!' tag.  Set the handle to '' and the
1133                // suffix to '!'.
1134                if suffix.is_empty() {
1135                    handle.clear();
1136                    "!".clone_into(&mut suffix);
1137                }
1138            }
1139        }
1140
1141        if is_blank_or_breakz(self.input.look_ch())
1142            || (self.flow_level > 0 && self.input.next_is_flow())
1143        {
1144            // XXX: ex 7.2, an empty scalar can follow a secondary tag
1145            Ok(Token(
1146                Span::new(start_mark, self.mark),
1147                TokenType::Tag(handle, suffix),
1148            ))
1149        } else {
1150            Err(ScanError::new_str(
1151                start_mark,
1152                "while scanning a tag, did not find expected whitespace or line break",
1153            ))
1154        }
1155    }
1156
1157    fn scan_tag_handle(&mut self, directive: bool, mark: &Marker) -> Result<String, ScanError> {
1158        let mut string = String::new();
1159        if self.input.look_ch() != '!' {
1160            return Err(ScanError::new_str(
1161                *mark,
1162                "while scanning a tag, did not find expected '!'",
1163            ));
1164        }
1165
1166        string.push(self.input.peek());
1167        self.skip_non_blank();
1168
1169        let n_chars = self.input.fetch_while_is_alpha(&mut string);
1170        self.mark.index += n_chars;
1171        self.mark.col += n_chars;
1172
1173        // Check if the trailing character is '!' and copy it.
1174        if self.input.peek() == '!' {
1175            string.push(self.input.peek());
1176            self.skip_non_blank();
1177        } else if directive && string != "!" {
1178            // It's either the '!' tag or not really a tag handle.  If it's a %TAG
1179            // directive, it's an error.  If it's a tag token, it must be a part of
1180            // URI.
1181            return Err(ScanError::new_str(
1182                *mark,
1183                "while parsing a tag directive, did not find expected '!'",
1184            ));
1185        }
1186        Ok(string)
1187    }
1188
1189    /// Scan for a tag prefix (6.8.2.2).
1190    ///
1191    /// There are 2 kinds of tag prefixes:
1192    ///   - Local: Starts with a `!`, contains only URI chars (`!foo`)
1193    ///   - Global: Starts with a tag char, contains then URI chars (`!foo,2000:app/`)
1194    fn scan_tag_prefix(&mut self, start_mark: &Marker) -> Result<String, ScanError> {
1195        let mut string = String::new();
1196
1197        if self.input.look_ch() == '!' {
1198            // If we have a local tag, insert and skip `!`.
1199            string.push(self.input.peek());
1200            self.skip_non_blank();
1201        } else if !is_tag_char(self.input.peek()) {
1202            // Otherwise, check if the first global tag character is valid.
1203            return Err(ScanError::new_str(
1204                *start_mark,
1205                "invalid global tag character",
1206            ));
1207        } else if self.input.peek() == '%' {
1208            // If it is valid and an escape sequence, escape it.
1209            string.push(self.scan_uri_escapes(start_mark)?);
1210        } else {
1211            // Otherwise, push the first character.
1212            string.push(self.input.peek());
1213            self.skip_non_blank();
1214        }
1215
1216        while is_uri_char(self.input.look_ch()) {
1217            if self.input.peek() == '%' {
1218                string.push(self.scan_uri_escapes(start_mark)?);
1219            } else {
1220                string.push(self.input.peek());
1221                self.skip_non_blank();
1222            }
1223        }
1224
1225        Ok(string)
1226    }
1227
1228    /// Scan for a verbatim tag.
1229    ///
1230    /// The prefixing `!<` must _not_ have been skipped.
1231    fn scan_verbatim_tag(&mut self, start_mark: &Marker) -> Result<String, ScanError> {
1232        // Eat `!<`
1233        self.skip_non_blank();
1234        self.skip_non_blank();
1235
1236        let mut string = String::new();
1237        while is_uri_char(self.input.look_ch()) {
1238            if self.input.peek() == '%' {
1239                string.push(self.scan_uri_escapes(start_mark)?);
1240            } else {
1241                string.push(self.input.peek());
1242                self.skip_non_blank();
1243            }
1244        }
1245
1246        if self.input.peek() != '>' {
1247            return Err(ScanError::new_str(
1248                *start_mark,
1249                "while scanning a verbatim tag, did not find the expected '>'",
1250            ));
1251        }
1252        self.skip_non_blank();
1253
1254        Ok(string)
1255    }
1256
1257    fn scan_tag_shorthand_suffix(
1258        &mut self,
1259        _directive: bool,
1260        _is_secondary: bool,
1261        head: &str,
1262        mark: &Marker,
1263    ) -> Result<String, ScanError> {
1264        let mut length = head.len();
1265        let mut string = String::new();
1266
1267        // Copy the head if needed.
1268        // Note that we don't copy the leading '!' character.
1269        if length > 1 {
1270            string.extend(head.chars().skip(1));
1271        }
1272
1273        while is_tag_char(self.input.look_ch()) {
1274            // Check if it is a URI-escape sequence.
1275            if self.input.peek() == '%' {
1276                string.push(self.scan_uri_escapes(mark)?);
1277            } else {
1278                string.push(self.input.peek());
1279                self.skip_non_blank();
1280            }
1281
1282            length += 1;
1283        }
1284
1285        if length == 0 {
1286            return Err(ScanError::new_str(
1287                *mark,
1288                "while parsing a tag, did not find expected tag URI",
1289            ));
1290        }
1291
1292        Ok(string)
1293    }
1294
1295    fn scan_uri_escapes(&mut self, mark: &Marker) -> Result<char, ScanError> {
1296        let mut width = 0usize;
1297        let mut code = 0u32;
1298        loop {
1299            self.input.lookahead(3);
1300
1301            let c = self.input.peek_nth(1);
1302            let nc = self.input.peek_nth(2);
1303
1304            if !(self.input.peek() == '%' && is_hex(c) && is_hex(nc)) {
1305                return Err(ScanError::new_str(
1306                    *mark,
1307                    "while parsing a tag, found an invalid escape sequence",
1308                ));
1309            }
1310
1311            let byte = (as_hex(c) << 4) + as_hex(nc);
1312            if width == 0 {
1313                width = match byte {
1314                    _ if byte & 0x80 == 0x00 => 1,
1315                    _ if byte & 0xE0 == 0xC0 => 2,
1316                    _ if byte & 0xF0 == 0xE0 => 3,
1317                    _ if byte & 0xF8 == 0xF0 => 4,
1318                    _ => {
1319                        return Err(ScanError::new_str(
1320                            *mark,
1321                            "while parsing a tag, found an incorrect leading UTF-8 byte",
1322                        ));
1323                    }
1324                };
1325                code = byte;
1326            } else {
1327                if byte & 0xc0 != 0x80 {
1328                    return Err(ScanError::new_str(
1329                        *mark,
1330                        "while parsing a tag, found an incorrect trailing UTF-8 byte",
1331                    ));
1332                }
1333                code = (code << 8) + byte;
1334            }
1335
1336            self.skip_n_non_blank(3);
1337
1338            width -= 1;
1339            if width == 0 {
1340                break;
1341            }
1342        }
1343
1344        match char::from_u32(code) {
1345            Some(ch) => Ok(ch),
1346            None => Err(ScanError::new_str(
1347                *mark,
1348                "while parsing a tag, found an invalid UTF-8 codepoint",
1349            )),
1350        }
1351    }
1352
1353    fn fetch_anchor(&mut self, alias: bool) -> ScanResult {
1354        self.save_simple_key();
1355        self.disallow_simple_key();
1356
1357        let tok = self.scan_anchor(alias)?;
1358
1359        self.tokens.push_back(tok);
1360
1361        Ok(())
1362    }
1363
1364    fn scan_anchor(&mut self, alias: bool) -> Result<Token<'input>, ScanError> {
1365        let mut string = String::new();
1366        let start_mark = self.mark;
1367
1368        self.skip_non_blank();
1369        while is_anchor_char(self.input.look_ch()) {
1370            string.push(self.input.peek());
1371            self.skip_non_blank();
1372        }
1373
1374        if string.is_empty() {
1375            return Err(ScanError::new_str(start_mark, "while scanning an anchor or alias, did not find expected alphabetic or numeric character"));
1376        }
1377
1378        let tok = if alias {
1379            TokenType::Alias(string.into())
1380        } else {
1381            TokenType::Anchor(string.into())
1382        };
1383        Ok(Token(Span::new(start_mark, self.mark), tok))
1384    }
1385
1386    fn fetch_flow_collection_start(&mut self, tok: TokenType<'input>) -> ScanResult {
1387        // The indicators '[' and '{' may start a simple key.
1388        self.save_simple_key();
1389
1390        self.roll_one_col_indent();
1391        self.increase_flow_level()?;
1392
1393        self.allow_simple_key();
1394
1395        let start_mark = self.mark;
1396        self.skip_non_blank();
1397
1398        if tok == TokenType::FlowMappingStart {
1399            self.flow_mapping_started = true;
1400        } else {
1401            self.implicit_flow_mapping_states
1402                .push(ImplicitMappingState::Possible);
1403        }
1404
1405        self.skip_ws_to_eol(SkipTabs::Yes)?;
1406
1407        self.tokens
1408            .push_back(Token(Span::new(start_mark, self.mark), tok));
1409        Ok(())
1410    }
1411
1412    fn fetch_flow_collection_end(&mut self, tok: TokenType<'input>) -> ScanResult {
1413        self.remove_simple_key()?;
1414        self.decrease_flow_level();
1415
1416        self.disallow_simple_key();
1417
1418        if matches!(tok, TokenType::FlowSequenceEnd) {
1419            self.end_implicit_mapping(self.mark);
1420            // We are out exiting the flow sequence, nesting goes down 1 level.
1421            self.implicit_flow_mapping_states.pop();
1422        }
1423
1424        let start_mark = self.mark;
1425        self.skip_non_blank();
1426        self.skip_ws_to_eol(SkipTabs::Yes)?;
1427
1428        // A flow collection within a flow mapping can be a key. In that case, the value may be
1429        // adjacent to the `:`.
1430        // ```yaml
1431        // - [ {a: b}:value ]
1432        // ```
1433        if self.flow_level > 0 {
1434            self.adjacent_value_allowed_at = self.mark.index;
1435        }
1436
1437        self.tokens
1438            .push_back(Token(Span::new(start_mark, self.mark), tok));
1439        Ok(())
1440    }
1441
1442    /// Push the `FlowEntry` token and skip over the `,`.
1443    fn fetch_flow_entry(&mut self) -> ScanResult {
1444        self.remove_simple_key()?;
1445        self.allow_simple_key();
1446
1447        self.end_implicit_mapping(self.mark);
1448
1449        let start_mark = self.mark;
1450        self.skip_non_blank();
1451        self.skip_ws_to_eol(SkipTabs::Yes)?;
1452
1453        self.tokens.push_back(Token(
1454            Span::new(start_mark, self.mark),
1455            TokenType::FlowEntry,
1456        ));
1457        Ok(())
1458    }
1459
1460    fn increase_flow_level(&mut self) -> ScanResult {
1461        self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0)));
1462        self.flow_level = self
1463            .flow_level
1464            .checked_add(1)
1465            .ok_or_else(|| ScanError::new_str(self.mark, "recursion limit exceeded"))?;
1466        Ok(())
1467    }
1468
1469    fn decrease_flow_level(&mut self) {
1470        if self.flow_level > 0 {
1471            self.flow_level -= 1;
1472            self.simple_keys.pop().unwrap();
1473        }
1474    }
1475
    /// Push the `Block*` token(s) and skip over the `-`.
    ///
    /// Add an indentation level and push a `BlockSequenceStart` token if needed, then push a
    /// `BlockEntry` token.
    /// This function only skips over the `-` and does not fetch the entry value.
    ///
    /// # Errors
    /// This function returns an error if:
    ///   - The scanner is inside a flow collection.
    ///   - Simple keys (and thus new block entries) are not allowed at this position.
    ///   - The entry is at column 0, directly after an anchor or a tag that is also at column 0,
    ///     while an indentation level is open.
    ///   - The `-` is separated by tabs from another `-` that is itself followed by a blank, a
    ///     line break or the end of the stream.
    ///   - Skipping the whitespace after the `-` or removing the current simple key fails.
    fn fetch_block_entry(&mut self) -> ScanResult {
        if self.flow_level > 0 {
            // - * only allowed in block
            return Err(ScanError::new_str(
                self.mark,
                r#""-" is only valid inside a block"#,
            ));
        }
        // Check if we are allowed to start a new entry.
        if !self.simple_key_allowed {
            return Err(ScanError::new_str(
                self.mark,
                "block sequence entries are not allowed in this context",
            ));
        }

        // ???, fixes test G9HC.
        // Reject an entry at column 0 when the last token pushed is an anchor or a tag that also
        // starts at column 0 and an indentation level is open. Note that the error message
        // mentions an anchor even when the offending token is a tag.
        if let Some(Token(span, TokenType::Anchor(..) | TokenType::Tag(..))) = self.tokens.back() {
            if self.mark.col == 0 && span.start.col == 0 && self.indent > -1 {
                return Err(ScanError::new_str(
                    span.start,
                    "invalid indentation for anchor",
                ));
            }
        }

        // Skip over the `-`.
        let mark = self.mark;
        self.skip_non_blank();

        // generate BLOCK-SEQUENCE-START if indented
        self.roll_indent(mark.col, None, TokenType::BlockSequenceStart, mark);
        // Skip the whitespace after the `-`, remembering whether it contained tabs.
        let found_tabs = self.skip_ws_to_eol(SkipTabs::Yes)?.found_tabs();
        self.input.lookahead(2);
        // Tabs may not separate this `-` from a nested block entry indicator.
        if found_tabs && self.input.next_char_is('-') && is_blank_or_breakz(self.input.peek_nth(1))
        {
            return Err(ScanError::new_str(
                self.mark,
                "'-' must be followed by a valid YAML whitespace",
            ));
        }

        self.skip_ws_to_eol(SkipTabs::No)?;
        self.input.lookahead(1);
        // If the entry is followed by a line break or a flow indicator, roll a one-column
        // indent.
        if self.input.next_is_break() || self.input.next_is_flow() {
            self.roll_one_col_indent();
        }

        self.remove_simple_key()?;
        self.allow_simple_key();

        // The token is located after the `-` and the whitespace that follows it.
        self.tokens
            .push_back(Token(Span::empty(self.mark), TokenType::BlockEntry));

        Ok(())
    }
1537
1538    fn fetch_document_indicator(&mut self, t: TokenType<'input>) -> ScanResult {
1539        self.unroll_indent(-1);
1540        self.remove_simple_key()?;
1541        self.disallow_simple_key();
1542
1543        let mark = self.mark;
1544
1545        self.skip_n_non_blank(3);
1546
1547        self.tokens.push_back(Token(Span::new(mark, self.mark), t));
1548        Ok(())
1549    }
1550
1551    fn fetch_block_scalar(&mut self, literal: bool) -> ScanResult {
1552        self.save_simple_key();
1553        self.allow_simple_key();
1554        let tok = self.scan_block_scalar(literal)?;
1555
1556        self.tokens.push_back(tok);
1557        Ok(())
1558    }
1559
    /// Scan a block scalar (`|` or `>`), starting at its indicator.
    ///
    /// `literal` selects the style of the scalar: [`ScalarStyle::Literal`] if `true`,
    /// [`ScalarStyle::Folded`] otherwise. In the folded style, a single line break between two
    /// lines that do not start with a blank is replaced by a space.
    ///
    /// The scan goes through the following steps:
    ///   1. Parse the header: an optional chomping indicator (`+` or `-`) and an optional
    ///      indentation indicator (`1` to `9`), in any order, followed by whitespace up to the
    ///      end of the line.
    ///   2. Determine the indentation of the content, either from the indentation indicator or
    ///      from the first lines of the scalar.
    ///   3. Read every content line that sits at that indentation.
    ///   4. Apply the chomping to the trailing line breaks.
    ///
    /// # Errors
    /// This function returns an error if:
    ///   - The indentation indicator is `0`.
    ///   - The header is followed by something other than whitespace and a line break (or the
    ///     end of the stream).
    ///   - The line following the header starts with a tab.
    ///   - The first line after the leading line breaks is less indented than the content
    ///     indentation but more indented than the enclosing block.
    #[allow(clippy::too_many_lines)]
    fn scan_block_scalar(&mut self, literal: bool) -> Result<Token<'input>, ScanError> {
        let start_mark = self.mark;
        let mut chomping = Chomping::Clip;
        // Value of the indentation indicator, 0 if there is none.
        let mut increment: usize = 0;
        // Column at which the content of the scalar starts, 0 until it is known.
        let mut indent: usize = 0;
        // Whether the line being read / the previous line read starts with a blank.
        let mut trailing_blank: bool;
        let mut leading_blank: bool = false;
        let style = if literal {
            ScalarStyle::Literal
        } else {
            ScalarStyle::Folded
        };

        // Contents of the scalar.
        let mut string = String::new();
        // Line break that ended the last content line read.
        let mut leading_break = String::new();
        // Line breaks collected while skipping indentation and empty lines.
        let mut trailing_breaks = String::new();
        // Line break that ended the header line.
        let mut chomping_break = String::new();

        // skip '|' or '>'
        self.skip_non_blank();
        self.unroll_non_block_indents();

        // Parse the header. The chomping indicator may come before or after the indentation
        // indicator.
        if self.input.look_ch() == '+' || self.input.peek() == '-' {
            if self.input.peek() == '+' {
                chomping = Chomping::Keep;
            } else {
                chomping = Chomping::Strip;
            }
            self.skip_non_blank();
            self.input.lookahead(1);
            if self.input.next_is_digit() {
                if self.input.peek() == '0' {
                    return Err(ScanError::new_str(
                        start_mark,
                        "while scanning a block scalar, found an indentation indicator equal to 0",
                    ));
                }
                increment = (self.input.peek() as usize) - ('0' as usize);
                self.skip_non_blank();
            }
        } else if self.input.next_is_digit() {
            if self.input.peek() == '0' {
                return Err(ScanError::new_str(
                    start_mark,
                    "while scanning a block scalar, found an indentation indicator equal to 0",
                ));
            }

            increment = (self.input.peek() as usize) - ('0' as usize);
            self.skip_non_blank();
            self.input.lookahead(1);
            if self.input.peek() == '+' || self.input.peek() == '-' {
                if self.input.peek() == '+' {
                    chomping = Chomping::Keep;
                } else {
                    chomping = Chomping::Strip;
                }
                self.skip_non_blank();
            }
        }

        self.skip_ws_to_eol(SkipTabs::Yes)?;

        // Check if we are at the end of the line.
        self.input.lookahead(1);
        if !self.input.next_is_breakz() {
            return Err(ScanError::new_str(
                start_mark,
                "while scanning a block scalar, did not find expected comment or line break",
            ));
        }

        // Consume the line break ending the header. It is kept aside since it may end up being
        // the whole contents of the scalar if the stream ends right after.
        if self.input.next_is_break() {
            self.input.lookahead(2);
            self.read_break(&mut chomping_break);
        }

        if self.input.look_ch() == '\t' {
            return Err(ScanError::new_str(
                start_mark,
                "a block scalar content cannot start with a tab",
            ));
        }

        // An explicit indentation indicator is relative to the current indentation level, if
        // there is one.
        if increment > 0 {
            indent = if self.indent >= 0 {
                (self.indent + increment as isize) as usize
            } else {
                increment
            }
        }

        // Scan the leading line breaks and determine the indentation level if needed.
        if indent == 0 {
            self.skip_block_scalar_first_line_indent(&mut indent, &mut trailing_breaks);
        } else {
            self.skip_block_scalar_indent(indent, &mut trailing_breaks);
        }

        // We have an end-of-stream with no content, e.g.:
        // ```yaml
        // - |+
        // ```
        if self.input.next_is_z() {
            let contents = match chomping {
                // We strip trailing linebreaks. Nothing remain.
                Chomping::Strip => String::new(),
                // There was no newline after the chomping indicator.
                _ if self.mark.line == start_mark.line() => String::new(),
                // We clip lines, and there was a newline after the chomping indicator.
                // All other breaks are ignored.
                Chomping::Clip => chomping_break,
                // We keep lines. There was a newline after the chomping indicator but nothing
                // else.
                Chomping::Keep if trailing_breaks.is_empty() => chomping_break,
                // Otherwise, the newline after chomping is ignored.
                Chomping::Keep => trailing_breaks,
            };
            return Ok(Token(
                Span::new(start_mark, self.mark),
                TokenType::Scalar(style, contents.into()),
            ));
        }

        if self.mark.col < indent && (self.mark.col as isize) > self.indent {
            return Err(ScanError::new_str(
                self.mark,
                "wrongly indented line in block scalar",
            ));
        }

        // Buffer handed to `scan_block_scalar_content_line` for every line, allocated once.
        let mut line_buffer = String::with_capacity(100);
        // NOTE: This shadows the mark taken at the indicator. From here on, the span of the
        // returned token starts at the current position rather than at the `|` or `>`.
        let start_mark = self.mark;
        // Read the content lines, as long as they sit at the content indentation.
        while self.mark.col == indent && !self.input.next_is_z() {
            // Without indentation, a document end marker terminates the scalar.
            if indent == 0 {
                self.input.lookahead(4);
                if self.input.next_is_document_end() {
                    break;
                }
            }

            // We are at the first content character of a content line.
            trailing_blank = self.input.next_is_blank();
            if !literal && !leading_break.is_empty() && !leading_blank && !trailing_blank {
                // Folded style, between two lines that do not start with a blank: the line break
                // is folded into a space, unless empty lines were found in between.
                string.push_str(&trailing_breaks);
                if trailing_breaks.is_empty() {
                    string.push(' ');
                }
            } else {
                // Otherwise, every line break is kept as is.
                string.push_str(&leading_break);
                string.push_str(&trailing_breaks);
            }

            leading_break.clear();
            trailing_breaks.clear();

            leading_blank = self.input.next_is_blank();

            self.scan_block_scalar_content_line(&mut string, &mut line_buffer);

            // break on EOF
            self.input.lookahead(2);
            if self.input.next_is_z() {
                break;
            }

            self.read_break(&mut leading_break);

            // Eat the following indentation spaces and line breaks.
            self.skip_block_scalar_indent(indent, &mut trailing_breaks);
        }

        // Chomp the tail.
        if chomping != Chomping::Strip {
            string.push_str(&leading_break);
            // If we had reached an eof but the last character wasn't an end-of-line, check if the
            // last line was indented at least as the rest of the scalar, then we need to consider
            // there is a newline.
            if self.input.next_is_z() && self.mark.col >= indent.max(1) {
                string.push('\n');
            }
        }

        // Only the "keep" chomping retains the trailing empty lines.
        if chomping == Chomping::Keep {
            string.push_str(&trailing_breaks);
        }

        Ok(Token(
            Span::new(start_mark, self.mark),
            TokenType::Scalar(style, string.into()),
        ))
    }
1753
1754    /// Retrieve the contents of the line, parsing it as a block scalar.
1755    ///
1756    /// The contents will be appended to `string`. `line_buffer` is used as a temporary buffer to
1757    /// store bytes before pushing them to `string` and thus avoiding reallocating more than
1758    /// necessary. `line_buffer` is assumed to be empty upon calling this function. It will be
1759    /// `clear`ed before the end of the function.
1760    ///
1761    /// This function assumed the first character to read is the first content character in the
1762    /// line. This function does not consume the line break character(s) after the line.
1763    fn scan_block_scalar_content_line(&mut self, string: &mut String, line_buffer: &mut String) {
1764        // Start by evaluating characters in the buffer.
1765        while !self.input.buf_is_empty() && !self.input.next_is_breakz() {
1766            string.push(self.input.peek());
1767            // We may technically skip non-blank characters. However, the only distinction is
1768            // to determine what is leading whitespace and what is not. Here, we read the
1769            // contents of the line until either eof or a linebreak. We know we will not read
1770            // `self.leading_whitespace` until the end of the line, where it will be reset.
1771            // This allows us to call a slightly less expensive function.
1772            self.skip_blank();
1773        }
1774
1775        // All characters that were in the buffer were consumed. We need to check if more
1776        // follow.
1777        if self.input.buf_is_empty() {
1778            // We will read all consecutive non-breakz characters. We push them into a
1779            // temporary buffer. The main difference with going through `self.buffer` is that
1780            // characters are appended here as their real size (1B for ascii, or up to 4 bytes for
1781            // UTF-8). We can then use the internal `line_buffer` `Vec` to push data into `string`
1782            // (using `String::push_str`).
1783            while let Some(c) = self.input.raw_read_non_breakz_ch() {
1784                line_buffer.push(c);
1785            }
1786
1787            // We need to manually update our position; we haven't called a `skip` function.
1788            let n_chars = line_buffer.chars().count();
1789            self.mark.col += n_chars;
1790            self.mark.index += n_chars;
1791
1792            // We can now append our bytes to our `string`.
1793            string.reserve(line_buffer.len());
1794            string.push_str(line_buffer);
1795            // This clears the _contents_ without touching the _capacity_.
1796            line_buffer.clear();
1797        }
1798    }
1799
1800    /// Skip the block scalar indentation and empty lines.
1801    fn skip_block_scalar_indent(&mut self, indent: usize, breaks: &mut String) {
1802        loop {
1803            // Consume all spaces. Tabs cannot be used as indentation.
1804            if indent < self.input.bufmaxlen() - 2 {
1805                self.input.lookahead(self.input.bufmaxlen());
1806                while self.mark.col < indent && self.input.peek() == ' ' {
1807                    self.skip_blank();
1808                }
1809            } else {
1810                loop {
1811                    self.input.lookahead(self.input.bufmaxlen());
1812                    while !self.input.buf_is_empty()
1813                        && self.mark.col < indent
1814                        && self.input.peek() == ' '
1815                    {
1816                        self.skip_blank();
1817                    }
1818                    // If we reached our indent, we can break. We must also break if we have
1819                    // reached content or EOF; that is, the buffer is not empty and the next
1820                    // character is not a space.
1821                    if self.mark.col == indent
1822                        || (!self.input.buf_is_empty() && self.input.peek() != ' ')
1823                    {
1824                        break;
1825                    }
1826                }
1827                self.input.lookahead(2);
1828            }
1829
1830            // If our current line is empty, skip over the break and continue looping.
1831            if self.input.next_is_break() {
1832                self.read_break(breaks);
1833            } else {
1834                // Otherwise, we have a content line. Return control.
1835                break;
1836            }
1837        }
1838    }
1839
1840    /// Determine the indentation level for a block scalar from the first line of its contents.
1841    ///
1842    /// The function skips over whitespace-only lines and sets `indent` to the the longest
1843    /// whitespace line that was encountered.
1844    fn skip_block_scalar_first_line_indent(&mut self, indent: &mut usize, breaks: &mut String) {
1845        let mut max_indent = 0;
1846        loop {
1847            // Consume all spaces. Tabs cannot be used as indentation.
1848            while self.input.look_ch() == ' ' {
1849                self.skip_blank();
1850            }
1851
1852            if self.mark.col > max_indent {
1853                max_indent = self.mark.col;
1854            }
1855
1856            if self.input.next_is_break() {
1857                // If our current line is empty, skip over the break and continue looping.
1858                self.input.lookahead(2);
1859                self.read_break(breaks);
1860            } else {
1861                // Otherwise, we have a content line. Return control.
1862                break;
1863            }
1864        }
1865
1866        // In case a yaml looks like:
1867        // ```yaml
1868        // |
1869        // foo
1870        // bar
1871        // ```
1872        // We need to set the indent to 0 and not 1. In all other cases, the indent must be at
1873        // least 1. When in the above example, `self.indent` will be set to -1.
1874        *indent = max_indent.max((self.indent + 1) as usize);
1875        if self.indent > 0 {
1876            *indent = (*indent).max(1);
1877        }
1878    }
1879
1880    fn fetch_flow_scalar(&mut self, single: bool) -> ScanResult {
1881        self.save_simple_key();
1882        self.disallow_simple_key();
1883
1884        let tok = self.scan_flow_scalar(single)?;
1885
1886        // From spec: To ensure JSON compatibility, if a key inside a flow mapping is JSON-like,
1887        // YAML allows the following value to be specified adjacent to the “:”.
1888        self.skip_to_next_token()?;
1889        self.adjacent_value_allowed_at = self.mark.index;
1890
1891        self.tokens.push_back(tok);
1892        Ok(())
1893    }
1894
    /// Scan a quoted (single- or double-quoted) scalar and build its token.
    ///
    /// The character under the cursor is taken to be the opening quote; it is skipped without
    /// being inspected. `single` selects single-quoted rules (closing quote is `'`) over
    /// double-quoted rules (closing quote is `"`).
    ///
    /// The scalar is read in rounds: a run of non-whitespace characters (escape handling is
    /// delegated to `consume_flow_scalar_non_whitespace_chars`), followed by a run of blanks and
    /// line breaks which is either kept verbatim (blanks inside a line) or folded (line breaks).
    ///
    /// The returned token is a `TokenType::Scalar` with style `ScalarStyle::SingleQuoted` or
    /// `ScalarStyle::DoubleQuoted`. Its span starts at the opening quote and ends at the scanner
    /// position after the closing quote and whatever `skip_ws_to_eol` skipped behind it.
    ///
    /// # Errors
    /// Returns an error if:
    ///   - a document indicator is found at column 0 inside the scalar,
    ///   - the stream ends before the closing quote,
    ///   - a line of the scalar starts at a column lower than `self.indent`,
    ///   - a tab is found in a continuation line at a column lower than `self.indent`,
    ///   - an escape sequence is invalid,
    ///   - unexpected content follows the closing quote.
    #[allow(clippy::too_many_lines)]
    fn scan_flow_scalar(&mut self, single: bool) -> Result<Token<'input>, ScanError> {
        let start_mark = self.mark;

        // Accumulated contents of the scalar.
        let mut string = String::new();
        // First line break of the current run of whitespace.
        let mut leading_break = String::new();
        // Line breaks following the first one in the current run of whitespace.
        let mut trailing_breaks = String::new();
        // Blanks seen on the current line, kept only if more content follows on that line.
        let mut whitespaces = String::new();
        // Whether a line break (plain or escaped) was seen in the current run of whitespace.
        let mut leading_blanks;

        /* Eat the left quote. */
        self.skip_non_blank();

        loop {
            /* Check for a document indicator. */
            self.input.lookahead(4);

            if self.mark.col == 0 && self.input.next_is_document_indicator() {
                return Err(ScanError::new_str(
                    start_mark,
                    "while scanning a quoted scalar, found unexpected document indicator",
                ));
            }

            if self.input.next_is_z() {
                return Err(ScanError::new_str(
                    start_mark,
                    "while scanning a quoted scalar, found unexpected end of stream",
                ));
            }

            if (self.mark.col as isize) < self.indent {
                return Err(ScanError::new_str(
                    start_mark,
                    "invalid indentation in quoted scalar",
                ));
            }

            // Read content up to the next whitespace, closing quote or end of stream. The callee
            // sets `leading_blanks` when it consumes an escaped line break.
            leading_blanks = false;
            self.consume_flow_scalar_non_whitespace_chars(
                single,
                &mut string,
                &mut leading_blanks,
                &start_mark,
            )?;

            // Stop on the closing quote matching the style of the scalar.
            match self.input.look_ch() {
                '\'' if single => break,
                '"' if !single => break,
                _ => {}
            }

            // Consume blank characters.
            while self.input.next_is_blank() || self.input.next_is_break() {
                if self.input.next_is_blank() {
                    // Consume a space or a tab character.
                    if leading_blanks {
                        // We are past a line break: blanks here are indentation and are dropped.
                        if self.input.peek() == '\t' && (self.mark.col as isize) < self.indent {
                            return Err(ScanError::new_str(
                                self.mark,
                                "tab cannot be used as indentation",
                            ));
                        }
                        self.skip_blank();
                    } else {
                        // Still on the content line: remember the blank, it may be content.
                        whitespaces.push(self.input.peek());
                        self.skip_blank();
                    }
                } else {
                    self.input.lookahead(2);
                    // Check if it is a first line break.
                    if leading_blanks {
                        self.read_break(&mut trailing_breaks);
                    } else {
                        // Blanks before a line break are trailing whitespace: discard them.
                        whitespaces.clear();
                        self.read_break(&mut leading_break);
                        leading_blanks = true;
                    }
                }
                self.input.lookahead(1);
            }

            // Join the whitespaces or fold line breaks.
            if leading_blanks {
                if leading_break.is_empty() {
                    // `leading_blanks` is set but no first break was recorded here. This happens
                    // after an escaped line break, which the callee consumes itself. Pushing the
                    // (empty) `leading_break` is a no-op; the later breaks are kept verbatim.
                    string.push_str(&leading_break);
                    string.push_str(&trailing_breaks);
                    trailing_breaks.clear();
                    leading_break.clear();
                } else {
                    // Regular folding: a lone line break becomes a single space, otherwise the
                    // first break is dropped and the remaining ones are kept.
                    if trailing_breaks.is_empty() {
                        string.push(' ');
                    } else {
                        string.push_str(&trailing_breaks);
                        trailing_breaks.clear();
                    }
                    leading_break.clear();
                }
            } else {
                // No line break was seen: the blanks are part of the content.
                string.push_str(&whitespaces);
                whitespaces.clear();
            }
        } // loop

        // Eat the right quote.
        self.skip_non_blank();
        // Ensure there is no invalid trailing content.
        self.skip_ws_to_eol(SkipTabs::Yes)?;
        match self.input.peek() {
            // These can be encountered in flow sequences or mappings.
            ',' | '}' | ']' if self.flow_level > 0 => {}
            // An end-of-line / end-of-stream is fine. No trailing content.
            c if is_breakz(c) => {}
            // ':' can be encountered if our scalar is a key.
            // Outside of flow contexts, keys cannot span multiple lines
            ':' if self.flow_level == 0 && start_mark.line == self.mark.line => {}
            // Inside a flow context, this is allowed.
            ':' if self.flow_level > 0 => {}
            _ => {
                // NOTE(review): this message says "double-quoted" but the branch is also reached
                // for single-quoted scalars. Left untouched since callers may match on the text.
                return Err(ScanError::new_str(
                    self.mark,
                    "invalid trailing content after double-quoted scalar",
                ));
            }
        }

        let style = if single {
            ScalarStyle::SingleQuoted
        } else {
            ScalarStyle::DoubleQuoted
        };
        Ok(Token(
            Span::new(start_mark, self.mark),
            TokenType::Scalar(style, string.into()),
        ))
    }
2031
2032    /// Consume successive non-whitespace characters from a flow scalar.
2033    ///
2034    /// This function resolves escape sequences and stops upon encountering a whitespace, the end
2035    /// of the stream or the closing character for the scalar (`'` for single quoted scalars, `"`
2036    /// for double quoted scalars).
2037    ///
2038    /// # Errors
2039    /// Return an error if an invalid escape sequence is found.
2040    fn consume_flow_scalar_non_whitespace_chars(
2041        &mut self,
2042        single: bool,
2043        string: &mut String,
2044        leading_blanks: &mut bool,
2045        start_mark: &Marker,
2046    ) -> Result<(), ScanError> {
2047        self.input.lookahead(2);
2048        while !is_blank_or_breakz(self.input.peek()) {
2049            match self.input.peek() {
2050                // Check for an escaped single quote.
2051                '\'' if self.input.peek_nth(1) == '\'' && single => {
2052                    string.push('\'');
2053                    self.skip_n_non_blank(2);
2054                }
2055                // Check for the right quote.
2056                '\'' if single => break,
2057                '"' if !single => break,
2058                // Check for an escaped line break.
2059                '\\' if !single && is_break(self.input.peek_nth(1)) => {
2060                    self.input.lookahead(3);
2061                    self.skip_non_blank();
2062                    self.skip_linebreak();
2063                    *leading_blanks = true;
2064                    break;
2065                }
2066                // Check for an escape sequence.
2067                '\\' if !single => {
2068                    string.push(self.resolve_flow_scalar_escape_sequence(start_mark)?);
2069                }
2070                c => {
2071                    string.push(c);
2072                    self.skip_non_blank();
2073                }
2074            }
2075            self.input.lookahead(2);
2076        }
2077        Ok(())
2078    }
2079
2080    /// Escape the sequence we encounter in a flow scalar.
2081    ///
2082    /// `self.input.peek()` must point to the `\` starting the escape sequence.
2083    ///
2084    /// # Errors
2085    /// Return an error if an invalid escape sequence is found.
2086    fn resolve_flow_scalar_escape_sequence(
2087        &mut self,
2088        start_mark: &Marker,
2089    ) -> Result<char, ScanError> {
2090        let mut code_length = 0usize;
2091        let mut ret = '\0';
2092
2093        match self.input.peek_nth(1) {
2094            '0' => ret = '\0',
2095            'a' => ret = '\x07',
2096            'b' => ret = '\x08',
2097            't' | '\t' => ret = '\t',
2098            'n' => ret = '\n',
2099            'v' => ret = '\x0b',
2100            'f' => ret = '\x0c',
2101            'r' => ret = '\x0d',
2102            'e' => ret = '\x1b',
2103            ' ' => ret = '\x20',
2104            '"' => ret = '"',
2105            '/' => ret = '/',
2106            '\\' => ret = '\\',
2107            // Unicode next line (#x85)
2108            'N' => ret = char::from_u32(0x85).unwrap(),
2109            // Unicode non-breaking space (#xA0)
2110            '_' => ret = char::from_u32(0xA0).unwrap(),
2111            // Unicode line separator (#x2028)
2112            'L' => ret = char::from_u32(0x2028).unwrap(),
2113            // Unicode paragraph separator (#x2029)
2114            'P' => ret = char::from_u32(0x2029).unwrap(),
2115            'x' => code_length = 2,
2116            'u' => code_length = 4,
2117            'U' => code_length = 8,
2118            _ => {
2119                return Err(ScanError::new_str(
2120                    *start_mark,
2121                    "while parsing a quoted scalar, found unknown escape character",
2122                ))
2123            }
2124        }
2125        self.skip_n_non_blank(2);
2126
2127        // Consume an arbitrary escape code.
2128        if code_length > 0 {
2129            self.input.lookahead(code_length);
2130            let mut value = 0u32;
2131            for i in 0..code_length {
2132                let c = self.input.peek_nth(i);
2133                if !is_hex(c) {
2134                    return Err(ScanError::new_str(
2135                        *start_mark,
2136                        "while parsing a quoted scalar, did not find expected hexadecimal number",
2137                    ));
2138                }
2139                value = (value << 4) + as_hex(c);
2140            }
2141
2142            let Some(ch) = char::from_u32(value) else {
2143                return Err(ScanError::new_str(
2144                    *start_mark,
2145                    "while parsing a quoted scalar, found invalid Unicode character escape code",
2146                ));
2147            };
2148            ret = ch;
2149
2150            self.skip_n_non_blank(code_length);
2151        }
2152        Ok(ret)
2153    }
2154
2155    fn fetch_plain_scalar(&mut self) -> ScanResult {
2156        self.save_simple_key();
2157        self.disallow_simple_key();
2158
2159        let tok = self.scan_plain_scalar()?;
2160
2161        self.tokens.push_back(tok);
2162        Ok(())
2163    }
2164
    /// Scan for a plain scalar.
    ///
    /// Plain scalars are the most readable but restricted style. They may span multiple lines in
    /// some contexts.
    ///
    /// The scalar is read in rounds: a run of content characters, followed by a run of blanks
    /// and line breaks. Blanks between content on the same line are kept; line breaks are
    /// folded. Scanning stops on a `#`, on a document indicator seen while
    /// `self.leading_whitespace` is set, on a character that cannot continue a plain scalar, or,
    /// outside of flow context, when a line is indented less than `self.indent + 1`.
    ///
    /// The span of the returned token ends right after the last content character; trailing
    /// whitespace that was consumed is not part of it.
    ///
    /// # Errors
    /// Returns an error if:
    ///   - in flow context, the scalar starts at a column lower than `self.indent + 1`,
    ///   - in flow context, a `-` is directly followed by a flow indicator,
    ///   - a tab is found in the indentation of a line that has further content,
    ///   - no character could be read for the scalar.
    #[allow(clippy::too_many_lines)]
    fn scan_plain_scalar(&mut self) -> Result<Token<'input>, ScanError> {
        self.unroll_non_block_indents();
        // Minimum column a continuation line must reach to still belong to this scalar.
        let indent = self.indent + 1;
        let start_mark = self.mark;

        if self.flow_level > 0 && (start_mark.col as isize) < indent {
            return Err(ScanError::new_str(
                start_mark,
                "invalid indentation in flow construct",
            ));
        }

        let mut string = String::with_capacity(32);
        // The whitespace buffers live on the scanner; start from a clean state.
        self.buf_whitespaces.clear();
        self.buf_leading_break.clear();
        self.buf_trailing_breaks.clear();
        // Position right after the last content character read so far.
        let mut end_mark = self.mark;

        loop {
            self.input.lookahead(4);
            // A document indicator after leading whitespace, or a comment, ends the scalar.
            if (self.leading_whitespace && self.input.next_is_document_indicator())
                || self.input.peek() == '#'
            {
                break;
            }

            if self.flow_level > 0 && self.input.peek() == '-' && is_flow(self.input.peek_nth(1)) {
                return Err(ScanError::new_str(
                    self.mark,
                    "plain scalar cannot start with '-' followed by ,[]{}",
                ));
            }

            if !self.input.next_is_blank_or_breakz()
                && self.input.next_can_be_plain_scalar(self.flow_level > 0)
            {
                // More content follows: first flush the whitespace seen since the last content.
                if self.leading_whitespace {
                    if self.buf_leading_break.is_empty() {
                        // No first line break was recorded by this call. Pushing the (empty)
                        // `buf_leading_break` is a no-op; any later breaks are kept verbatim.
                        string.push_str(&self.buf_leading_break);
                        string.push_str(&self.buf_trailing_breaks);
                        self.buf_trailing_breaks.clear();
                        self.buf_leading_break.clear();
                    } else {
                        // Regular folding: a lone line break becomes a single space, otherwise
                        // the first break is dropped and the remaining ones are kept.
                        if self.buf_trailing_breaks.is_empty() {
                            string.push(' ');
                        } else {
                            string.push_str(&self.buf_trailing_breaks);
                            self.buf_trailing_breaks.clear();
                        }
                        self.buf_leading_break.clear();
                    }
                    self.leading_whitespace = false;
                } else if !self.buf_whitespaces.is_empty() {
                    // Blanks between two pieces of content on the same line are content.
                    string.push_str(&self.buf_whitespaces);
                    self.buf_whitespaces.clear();
                }

                // We can unroll the first iteration of the loop.
                string.push(self.input.peek());
                self.skip_non_blank();
                string.reserve(self.input.bufmaxlen());

                // Add content non-blank characters to the scalar.
                let mut end = false;
                while !end {
                    // Fill the buffer once and process all characters in the buffer until the next
                    // fetch. Note that `next_can_be_plain_scalar` needs 2 lookahead characters,
                    // hence the `for` loop looping `self.input.bufmaxlen() - 1` times.
                    self.input.lookahead(self.input.bufmaxlen());
                    for _ in 0..self.input.bufmaxlen() - 1 {
                        if self.input.next_is_blank_or_breakz()
                            || !self.input.next_can_be_plain_scalar(self.flow_level > 0)
                        {
                            end = true;
                            break;
                        }
                        string.push(self.input.peek());
                        self.skip_non_blank();
                    }
                }
                // Only content moves the end of the span; whitespace consumed below does not.
                end_mark = self.mark;
            }

            // We may reach the end of a plain scalar if:
            //  - We reach eof
            //  - We reach ": "
            //  - We find a flow character in a flow context
            if !(self.input.next_is_blank() || self.input.next_is_break()) {
                break;
            }

            // Process blank characters.
            self.input.lookahead(2);
            while self.input.next_is_blank_or_break() {
                if self.input.next_is_blank() {
                    if !self.leading_whitespace {
                        // Still on a content line: remember the blank, it may be content.
                        self.buf_whitespaces.push(self.input.peek());
                        self.skip_blank();
                    } else if (self.mark.col as isize) < indent && self.input.peek() == '\t' {
                        // Tabs in an indentation columns are allowed if and only if the line is
                        // empty. Skip to the end of the line.
                        self.skip_ws_to_eol(SkipTabs::Yes)?;
                        if !self.input.next_is_breakz() {
                            return Err(ScanError::new_str(
                                start_mark,
                                "while scanning a plain scalar, found a tab",
                            ));
                        }
                    } else {
                        // Blanks at the start of a continuation line are dropped.
                        self.skip_blank();
                    }
                } else {
                    // Check if it is a first line break
                    if self.leading_whitespace {
                        self.skip_break();
                        self.buf_trailing_breaks.push('\n');
                    } else {
                        // Blanks before a line break are trailing whitespace: discard them.
                        self.buf_whitespaces.clear();
                        self.skip_break();
                        self.buf_leading_break.push('\n');
                        self.leading_whitespace = true;
                    }
                }
                self.input.lookahead(2);
            }

            // check indentation level
            if self.flow_level == 0 && (self.mark.col as isize) < indent {
                break;
            }
        }

        // The scan ended after a line break: a simple key may be started from here.
        if self.leading_whitespace {
            self.allow_simple_key();
        }

        if string.is_empty() {
            // `fetch_plain_scalar` must absolutely consume at least one byte. Otherwise,
            // `fetch_next_token` will never stop calling it. An empty plain scalar may happen with
            // erroneous inputs such as "{...".
            Err(ScanError::new_str(
                start_mark,
                "unexpected end of plain scalar",
            ))
        } else {
            Ok(Token(
                Span::new(start_mark, end_mark),
                TokenType::Scalar(ScalarStyle::Plain, string.into()),
            ))
        }
    }
2321
2322    fn fetch_key(&mut self) -> ScanResult {
2323        let start_mark = self.mark;
2324        if self.flow_level == 0 {
2325            // Check if we are allowed to start a new key (not necessarily simple).
2326            if !self.simple_key_allowed {
2327                return Err(ScanError::new_str(
2328                    self.mark,
2329                    "mapping keys are not allowed in this context",
2330                ));
2331            }
2332            self.roll_indent(
2333                start_mark.col,
2334                None,
2335                TokenType::BlockMappingStart,
2336                start_mark,
2337            );
2338        } else {
2339            // The scanner, upon emitting a `Key`, will prepend a `MappingStart` event.
2340            self.flow_mapping_started = true;
2341        }
2342
2343        self.remove_simple_key()?;
2344
2345        if self.flow_level == 0 {
2346            self.allow_simple_key();
2347        } else {
2348            self.disallow_simple_key();
2349        }
2350
2351        self.skip_non_blank();
2352        self.skip_yaml_whitespace()?;
2353        if self.input.peek() == '\t' {
2354            return Err(ScanError::new_str(
2355                self.mark(),
2356                "tabs disallowed in this context",
2357            ));
2358        }
2359        self.tokens
2360            .push_back(Token(Span::new(start_mark, self.mark), TokenType::Key));
2361        Ok(())
2362    }
2363
2364    /// Fetch a value in a mapping inside of a flow collection.
2365    ///
2366    /// This must not be called if [`self.flow_level`] is 0. This ensures the rules surrounding
2367    /// values in flow collections are respected prior to calling [`fetch_value`].
2368    ///
2369    /// [`self.flow_level`]: Self::flow_level
2370    /// [`fetch_value`]: Self::fetch_value
2371    fn fetch_flow_value(&mut self) -> ScanResult {
2372        let nc = self.input.peek_nth(1);
2373
2374        // If we encounter a ':' inside a flow collection and it is not immediately
2375        // followed by a blank or breakz:
2376        //   - We must check whether an adjacent value is allowed
2377        //     `["a":[]]` is valid. If the key is double-quoted, no need for a space. This
2378        //     is needed for JSON compatibility.
2379        //   - If not, we must ensure there is a space after the ':' and before its value.
2380        //     `[a: []]` is valid while `[a:[]]` isn't. `[a:b]` is treated as `["a:b"]`.
2381        //   - But if the value is empty (null), then it's okay.
2382        // The last line is for YAMLs like `[a:]`. The ':' is followed by a ']' (which is a
2383        // flow character), but the ']' is not the value. The value is an invisible empty
2384        // space which is represented as null ('~').
2385        if self.mark.index != self.adjacent_value_allowed_at && (nc == '[' || nc == '{') {
2386            return Err(ScanError::new_str(
2387                self.mark,
2388                "':' may not precede any of `[{` in flow mapping",
2389            ));
2390        }
2391
2392        self.fetch_value()
2393    }
2394
2395    /// Fetch a value from a mapping (after a `:`).
2396    fn fetch_value(&mut self) -> ScanResult {
2397        let sk = self.simple_keys.last().unwrap().clone();
2398        let start_mark = self.mark;
2399        let is_implicit_flow_mapping =
2400            !self.implicit_flow_mapping_states.is_empty() && !self.flow_mapping_started;
2401        if is_implicit_flow_mapping {
2402            *self.implicit_flow_mapping_states.last_mut().unwrap() = ImplicitMappingState::Inside;
2403        }
2404
2405        // Skip over ':'.
2406        self.skip_non_blank();
2407        if self.input.look_ch() == '\t'
2408            && !self.skip_ws_to_eol(SkipTabs::Yes)?.has_valid_yaml_ws()
2409            && (self.input.peek() == '-' || self.input.next_is_alpha())
2410        {
2411            return Err(ScanError::new_str(
2412                self.mark,
2413                "':' must be followed by a valid YAML whitespace",
2414            ));
2415        }
2416
2417        if sk.possible {
2418            // insert simple key
2419            let tok = Token(Span::empty(sk.mark), TokenType::Key);
2420            self.insert_token(sk.token_number - self.tokens_parsed, tok);
2421            if is_implicit_flow_mapping {
2422                if sk.mark.line < start_mark.line {
2423                    return Err(ScanError::new_str(
2424                        start_mark,
2425                        "illegal placement of ':' indicator",
2426                    ));
2427                }
2428                self.insert_token(
2429                    sk.token_number - self.tokens_parsed,
2430                    Token(Span::empty(sk.mark), TokenType::FlowMappingStart),
2431                );
2432            }
2433
2434            // Add the BLOCK-MAPPING-START token if needed.
2435            self.roll_indent(
2436                sk.mark.col,
2437                Some(sk.token_number),
2438                TokenType::BlockMappingStart,
2439                sk.mark,
2440            );
2441            self.roll_one_col_indent();
2442
2443            self.simple_keys.last_mut().unwrap().possible = false;
2444            self.disallow_simple_key();
2445        } else {
2446            if is_implicit_flow_mapping {
2447                self.tokens
2448                    .push_back(Token(Span::empty(start_mark), TokenType::FlowMappingStart));
2449            }
2450            // The ':' indicator follows a complex key.
2451            if self.flow_level == 0 {
2452                if !self.simple_key_allowed {
2453                    return Err(ScanError::new_str(
2454                        start_mark,
2455                        "mapping values are not allowed in this context",
2456                    ));
2457                }
2458
2459                self.roll_indent(
2460                    start_mark.col,
2461                    None,
2462                    TokenType::BlockMappingStart,
2463                    start_mark,
2464                );
2465            }
2466            self.roll_one_col_indent();
2467
2468            if self.flow_level == 0 {
2469                self.allow_simple_key();
2470            } else {
2471                self.disallow_simple_key();
2472            }
2473        }
2474        self.tokens
2475            .push_back(Token(Span::empty(start_mark), TokenType::Value));
2476
2477        Ok(())
2478    }
2479
2480    /// Add an indentation level to the stack with the given block token, if needed.
2481    ///
2482    /// An indentation level is added only if:
2483    ///   - We are not in a flow-style construct (which don't have indentation per-se).
2484    ///   - The current column is further indented than the last indent we have registered.
2485    fn roll_indent(
2486        &mut self,
2487        col: usize,
2488        number: Option<usize>,
2489        tok: TokenType<'input>,
2490        mark: Marker,
2491    ) {
2492        if self.flow_level > 0 {
2493            return;
2494        }
2495
2496        // If the last indent was a non-block indent, remove it.
2497        // This means that we prepared an indent that we thought we wouldn't use, but realized just
2498        // now that it is a block indent.
2499        if self.indent <= col as isize {
2500            if let Some(indent) = self.indents.last() {
2501                if !indent.needs_block_end {
2502                    self.indent = indent.indent;
2503                    self.indents.pop();
2504                }
2505            }
2506        }
2507
2508        if self.indent < col as isize {
2509            self.indents.push(Indent {
2510                indent: self.indent,
2511                needs_block_end: true,
2512            });
2513            self.indent = col as isize;
2514            let tokens_parsed = self.tokens_parsed;
2515            match number {
2516                Some(n) => self.insert_token(n - tokens_parsed, Token(Span::empty(mark), tok)),
2517                None => self.tokens.push_back(Token(Span::empty(mark), tok)),
2518            }
2519        }
2520    }
2521
2522    /// Pop indentation levels from the stack as much as needed.
2523    ///
2524    /// Indentation levels are popped from the stack while they are further indented than `col`.
2525    /// If we are in a flow-style construct (which don't have indentation per-se), this function
2526    /// does nothing.
2527    fn unroll_indent(&mut self, col: isize) {
2528        if self.flow_level > 0 {
2529            return;
2530        }
2531        while self.indent > col {
2532            let indent = self.indents.pop().unwrap();
2533            self.indent = indent.indent;
2534            if indent.needs_block_end {
2535                self.tokens
2536                    .push_back(Token(Span::empty(self.mark), TokenType::BlockEnd));
2537            }
2538        }
2539    }
2540
2541    /// Add an indentation level of 1 column that does not start a block.
2542    ///
2543    /// See the documentation of [`Indent::needs_block_end`] for more details.
2544    /// An indentation is not added if we are inside a flow level or if the last indent is already
2545    /// a non-block indent.
2546    fn roll_one_col_indent(&mut self) {
2547        if self.flow_level == 0 && self.indents.last().is_some_and(|x| x.needs_block_end) {
2548            self.indents.push(Indent {
2549                indent: self.indent,
2550                needs_block_end: false,
2551            });
2552            self.indent += 1;
2553        }
2554    }
2555
2556    /// Unroll all last indents created with [`Self::roll_one_col_indent`].
2557    fn unroll_non_block_indents(&mut self) {
2558        while let Some(indent) = self.indents.last() {
2559            if indent.needs_block_end {
2560                break;
2561            }
2562            self.indent = indent.indent;
2563            self.indents.pop();
2564        }
2565    }
2566
2567    /// Mark the next token to be inserted as a potential simple key.
2568    fn save_simple_key(&mut self) {
2569        if self.simple_key_allowed {
2570            let required = self.flow_level == 0
2571                && self.indent == (self.mark.col as isize)
2572                && self.indents.last().unwrap().needs_block_end;
2573            let mut sk = SimpleKey::new(self.mark);
2574            sk.possible = true;
2575            sk.required = required;
2576            sk.token_number = self.tokens_parsed + self.tokens.len();
2577
2578            self.simple_keys.pop();
2579            self.simple_keys.push(sk);
2580        }
2581    }
2582
2583    fn remove_simple_key(&mut self) -> ScanResult {
2584        let last = self.simple_keys.last_mut().unwrap();
2585        if last.possible && last.required {
2586            return Err(ScanError::new_str(self.mark, "simple key expected"));
2587        }
2588
2589        last.possible = false;
2590        Ok(())
2591    }
2592
2593    /// Return whether the scanner is inside a block but outside of a flow sequence.
2594    fn is_within_block(&self) -> bool {
2595        !self.indents.is_empty()
2596    }
2597
2598    /// If an implicit mapping had started, end it.
2599    ///
2600    /// This function does not pop the state in [`implicit_flow_mapping_states`].
2601    ///
2602    /// [`implicit_flow_mapping_states`]: Self::implicit_flow_mapping_states
2603    fn end_implicit_mapping(&mut self, mark: Marker) {
2604        if let Some(implicit_mapping) = self.implicit_flow_mapping_states.last_mut() {
2605            if *implicit_mapping == ImplicitMappingState::Inside {
2606                self.flow_mapping_started = false;
2607                *implicit_mapping = ImplicitMappingState::Possible;
2608                self.tokens
2609                    .push_back(Token(Span::empty(mark), TokenType::FlowMappingEnd));
2610            }
2611        }
2612    }
2613}
2614
/// Chomping, how final line breaks and trailing empty lines are interpreted.
///
/// See YAML spec 8.1.1.2.
///
/// This is a plain, field-less enum: it is cheap to copy and can be printed for diagnostics.
#[derive(Clone, Copy, PartialEq, Debug, Eq)]
pub enum Chomping {
    /// The final line break and any trailing empty lines are excluded.
    Strip,
    /// The final line break is preserved, but trailing empty lines are excluded.
    Clip,
    /// The final line break and trailing empty lines are included.
    Keep,
}
2627
#[cfg(test)]
mod test {
    use super::is_anchor_char;

    /// A plain ASCII letter must be accepted as an anchor character.
    #[test]
    fn test_is_anchor_char() {
        let accepted = is_anchor_char('x');
        assert!(accepted);
    }
}