saphyr_parser/
scanner.rs

1//! Home to the YAML Scanner.
2//!
3//! The scanner is the lowest-level parsing utility. It is the lexer / tokenizer, reading input a
4//! character at a time and emitting tokens that can later be interpreted by the [`crate::parser`]
5//! to check for more context and validity.
6//!
7//! Due to the grammar of YAML, the scanner has to have some context and is not error-free.
8
9#![allow(clippy::cast_possible_wrap)]
10#![allow(clippy::cast_sign_loss)]
11
12use std::{char, collections::VecDeque, error::Error, fmt};
13
14use crate::{
15    char_traits::{
16        as_hex, is_anchor_char, is_blank_or_breakz, is_break, is_breakz, is_flow, is_hex,
17        is_tag_char, is_uri_char,
18    },
19    input::{Input, SkipTabs},
20};
21
/// Character encoding of the scanned input.
///
/// UTF-8 is the only variant: no other encoding is supported at the moment.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TEncoding {
    /// The input is UTF-8 encoded.
    Utf8,
}
28
/// How a scalar was spelled out in the YAML document.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TScalarStyle {
    /// An unquoted (plain) scalar.
    Plain,
    /// A scalar enclosed in single quotes.
    SingleQuoted,
    /// A scalar enclosed in double quotes.
    DoubleQuoted,

    /// A literal block scalar (introduced by `|`).
    Literal,
    /// A folded block scalar (introduced by `>`).
    Folded,
}
44
/// A location in a YAML document.
///
/// The scanner starts at index 0, line 1, column 0: lines are 1-indexed whereas columns are
/// 0-indexed. [`ScanError`]'s `Display` implementation adds 1 to the column when printing.
#[derive(Clone, Copy, PartialEq, Debug, Eq, Default)]
pub struct Marker {
    /// The index in the input, advanced by one for each character the scanner consumes.
    index: usize,
    /// The line (1-indexed).
    line: usize,
    /// The column (0-indexed).
    col: usize,
}

impl Marker {
    /// Create a new [`Marker`] at the given position.
    ///
    /// `index` and `col` are 0-indexed, `line` is 1-indexed. No validation is performed on the
    /// values.
    #[must_use]
    pub fn new(index: usize, line: usize, col: usize) -> Marker {
        Marker { index, line, col }
    }

    /// Return the index of the marker in the source.
    ///
    /// The scanner bumps the index by one for every character it skips, which makes this a
    /// character count. It only coincides with a byte offset for ASCII input.
    ///
    /// NOTE(review): comments and trailing whitespace are skipped in bulk by the `Input`
    /// implementation and the index is advanced by the count it returns; confirm that count is in
    /// characters too.
    #[must_use]
    pub fn index(&self) -> usize {
        self.index
    }

    /// Return the line (1-indexed) of the marker in the source.
    #[must_use]
    pub fn line(&self) -> usize {
        self.line
    }

    /// Return the column (0-indexed) of the marker in the source.
    #[must_use]
    pub fn col(&self) -> usize {
        self.col
    }
}
81
82/// A range of locations in a Yaml document.
83#[derive(Clone, Copy, PartialEq, Debug, Eq, Default)]
84pub struct Span {
85    /// The start (inclusive) of the range.
86    pub start: Marker,
87    /// The end (exclusive) of the range.
88    pub end: Marker,
89}
90
91impl Span {
92    /// Create a new [`Span`] for the given range.
93    #[must_use]
94    pub fn new(start: Marker, end: Marker) -> Span {
95        Span { start, end }
96    }
97
98    /// Create a empty [`Span`] at a given location.
99    ///
100    /// An empty span doesn't contain any characters, but its position may still be meaningful.
101    /// For example, for an indented sequence [`SequenceEnd`] has a location but an empty span.
102    ///
103    /// [`SequenceEnd`]: crate::Event::SequenceEnd
104    #[must_use]
105    pub fn empty(mark: Marker) -> Span {
106        Span {
107            start: mark,
108            end: mark,
109        }
110    }
111}
112
113/// An error that occurred while scanning.
114#[derive(Clone, PartialEq, Debug, Eq)]
115pub struct ScanError {
116    /// The position at which the error happened in the source.
117    mark: Marker,
118    /// Human-readable details about the error.
119    info: String,
120}
121
122impl ScanError {
123    /// Create a new error from a location and an error string.
124    #[must_use]
125    pub fn new(loc: Marker, info: String) -> ScanError {
126        ScanError { mark: loc, info }
127    }
128
129    /// Convenience alias for string slices.
130    #[must_use]
131    pub fn new_str(loc: Marker, info: &str) -> ScanError {
132        ScanError {
133            mark: loc,
134            info: info.to_owned(),
135        }
136    }
137
138    /// Return the marker pointing to the error in the source.
139    #[must_use]
140    pub fn marker(&self) -> &Marker {
141        &self.mark
142    }
143
144    /// Return the information string describing the error that happened.
145    #[must_use]
146    pub fn info(&self) -> &str {
147        self.info.as_ref()
148    }
149}
150
151impl Error for ScanError {
152    fn source(&self) -> Option<&(dyn Error + 'static)> {
153        None
154    }
155}
156
157impl fmt::Display for ScanError {
158    fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
159        write!(
160            formatter,
161            "{} at byte {} line {} column {}",
162            self.info,
163            self.mark.index,
164            self.mark.line,
165            self.mark.col + 1,
166        )
167    }
168}
169
170/// The contents of a scanner token.
171#[derive(Clone, PartialEq, Debug, Eq)]
172pub enum TokenType {
173    /// The start of the stream. Sent first, before even [`TokenType::DocumentStart`].
174    StreamStart(TEncoding),
175    /// The end of the stream, EOF.
176    StreamEnd,
177    /// A YAML version directive.
178    VersionDirective(
179        /// Major
180        u32,
181        /// Minor
182        u32,
183    ),
184    /// A YAML tag directive (e.g.: `!!str`, `!foo!bar`, ...).
185    TagDirective(
186        /// Handle
187        String,
188        /// Prefix
189        String,
190    ),
191    /// The start of a YAML document (`---`).
192    DocumentStart,
193    /// The end of a YAML document (`...`).
194    DocumentEnd,
195    /// The start of a sequence block.
196    ///
197    /// Sequence blocks are arrays starting with a `-`.
198    BlockSequenceStart,
199    /// The start of a sequence mapping.
200    ///
201    /// Sequence mappings are "dictionaries" with "key: value" entries.
202    BlockMappingStart,
203    /// End of the corresponding `BlockSequenceStart` or `BlockMappingStart`.
204    BlockEnd,
205    /// Start of an inline array (`[ a, b ]`).
206    FlowSequenceStart,
207    /// End of an inline array.
208    FlowSequenceEnd,
209    /// Start of an inline mapping (`{ a: b, c: d }`).
210    FlowMappingStart,
211    /// End of an inline mapping.
212    FlowMappingEnd,
213    /// An entry in a block sequence (c.f.: [`TokenType::BlockSequenceStart`]).
214    BlockEntry,
215    /// An entry in a flow sequence (c.f.: [`TokenType::FlowSequenceStart`]).
216    FlowEntry,
217    /// A key in a mapping.
218    Key,
219    /// A value in a mapping.
220    Value,
221    /// A reference to an anchor.
222    Alias(String),
223    /// A YAML anchor (`&`/`*`).
224    Anchor(String),
225    /// A YAML tag (starting with bangs `!`).
226    Tag(
227        /// The handle of the tag.
228        String,
229        /// The suffix of the tag.
230        String,
231    ),
232    /// A regular YAML scalar.
233    Scalar(TScalarStyle, String),
234}
235
236/// A scanner token.
237#[derive(Clone, PartialEq, Debug, Eq)]
238pub struct Token(pub Span, pub TokenType);
239
240/// A scalar that was parsed and may correspond to a simple key.
241///
242/// Upon scanning the following yaml:
243/// ```yaml
244/// a: b
245/// ```
246/// We do not know that `a` is a key for a map until we have reached the following `:`. For this
247/// YAML, we would store `a` as a scalar token in the [`Scanner`], but not emit it yet. It would be
248/// kept inside the scanner until more context is fetched and we are able to know whether it is a
249/// plain scalar or a key.
250///
251/// For example, see the following 2 yaml documents:
252/// ```yaml
253/// ---
254/// a: b # Here, `a` is a key.
255/// ...
256/// ---
257/// a # Here, `a` is a plain scalar.
258/// ...
259/// ```
260/// An instance of [`SimpleKey`] is created in the [`Scanner`] when such ambiguity occurs.
261///
262/// In both documents, scanning `a` would lead to the creation of a [`SimpleKey`] with
263/// [`Self::possible`] set to `true`. The token for `a` would be pushed in the [`Scanner`] but not
264/// yet emitted. Instead, more context would be fetched (through [`Scanner::fetch_more_tokens`]).
265///
266/// In the first document, upon reaching the `:`, the [`SimpleKey`] would be inspected and our
267/// scalar `a` since it is a possible key, would be "turned" into a key. This is done by prepending
268/// a [`TokenType::Key`] to our scalar token in the [`Scanner`]. This way, the
269/// [`crate::parser::Parser`] would read the [`TokenType::Key`] token before the
270/// [`TokenType::Scalar`] token.
271///
272/// In the second document however, reaching the EOF would stale the [`SimpleKey`] and no
273/// [`TokenType::Key`] would be emitted by the scanner.
274#[derive(Clone, PartialEq, Debug, Eq)]
275struct SimpleKey {
276    /// Whether the token this [`SimpleKey`] refers to may still be a key.
277    ///
278    /// Sometimes, when we have more context, we notice that what we thought could be a key no
279    /// longer can be. In that case, [`Self::possible`] is set to `false`.
280    ///
281    /// For instance, let us consider the following invalid YAML:
282    /// ```yaml
283    /// key
284    ///   : value
285    /// ```
286    /// Upon reading the `\n` after `key`, the [`SimpleKey`] that was created for `key` is staled
287    /// and [`Self::possible`] set to `false`.
288    possible: bool,
289    /// Whether the token this [`SimpleKey`] refers to is required to be a key.
290    ///
291    /// With more context, we may know for sure that the token must be a key. If the YAML is
292    /// invalid, it may happen that the token be deemed not a key. In such event, an error has to
293    /// be raised. This boolean helps us know when to raise such error.
294    ///
295    /// TODO(ethiraric, 30/12/2023): Example of when this happens.
296    required: bool,
297    /// The index of the token referred to by the [`SimpleKey`].
298    ///
299    /// This is the index in the scanner, which takes into account both the tokens that have been
300    /// emitted and those about to be emitted. See [`Scanner::tokens_parsed`] and
301    /// [`Scanner::tokens`] for more details.
302    token_number: usize,
303    /// The position at which the token the [`SimpleKey`] refers to is.
304    mark: Marker,
305}
306
307impl SimpleKey {
308    /// Create a new [`SimpleKey`] at the given `Marker` and with the given flow level.
309    fn new(mark: Marker) -> SimpleKey {
310        SimpleKey {
311            possible: false,
312            required: false,
313            token_number: 0,
314            mark,
315        }
316    }
317}
318
/// One entry of the indentation stack.
#[derive(Debug, Default, Clone)]
struct Indent {
    /// The indentation level that was in effect before this one.
    indent: isize,
    /// Whether closing this indentation level produces a `BlockEnd` token.
    ///
    /// Not every indentation level opens a block. For instance:
    /// ```yaml
    /// -
    ///   foo # ok
    /// -
    /// bar # ko, bar needs to be indented further than the `-`.
    /// - [
    ///  baz, # ok
    /// quux # ko, quux needs to be indented further than the '-'.
    /// ] # ko, the closing bracket needs to be indented further than the `-`.
    /// ```
    ///
    /// The level introduced by a `-` only spans that one entry of the sequence. If leaving it
    /// emitted a `BlockEnd`, there would be one `BlockEnd` per entry, whereas the sequence must be
    /// closed by exactly one.
    needs_block_end: bool,
}
343
/// The knowledge we have about an implicit mapping.
///
/// Implicit mappings occur in flow sequences when the opening `{` of a mapping is omitted:
/// ```yaml
/// [ a: b, c: d ]
/// # Equivalent to
/// [ { a: b }, { c: d } ]
/// # Equivalent to
/// - a: b
/// - c: d
/// ```
///
/// The state must be carefully tracked for each nested flow sequence since we must emit a
/// [`FlowMappingStart`] event when encountering `a` and `c` in our previous example without a
/// character hinting us. Similarly, we must emit a [`FlowMappingEnd`] event when we reach the `,`
/// or the `]`. If the state is not properly tracked, we may omit these events or emit them
/// out-of-order.
///
/// [`FlowMappingStart`]: TokenType::FlowMappingStart
/// [`FlowMappingEnd`]: TokenType::FlowMappingEnd
#[derive(Debug, PartialEq)]
enum ImplicitMappingState {
    /// It is possible there is an implicit mapping.
    ///
    /// This state is the one when we have just encountered the opening `[`. We need more context
    /// to know whether an implicit mapping follows.
    Possible,
    /// We are inside the implicit mapping.
    ///
    /// Note that this state is not set immediately (we need to have encountered the `:` to know).
    Inside,
}
377
378/// The YAML scanner.
379///
380/// This corresponds to the low-level interface when reading YAML. The scanner emits token as they
381/// are read (akin to a lexer), but it also holds sufficient context to be able to disambiguate
382/// some of the constructs. It has understanding of indentation and whitespace and is able to
383/// generate error messages for some invalid YAML constructs.
384///
385/// It is however not a full parser and needs [`crate::parser::Parser`] to fully detect invalid
386/// YAML documents.
387#[derive(Debug)]
388#[allow(clippy::struct_excessive_bools)]
389pub struct Scanner<T> {
390    /// The input source.
391    ///
392    /// This must implement [`Input`].
393    input: T,
394    /// The position of the cursor within the reader.
395    mark: Marker,
396    /// Buffer for tokens to be returned.
397    ///
398    /// This buffer can hold some temporary tokens that are not yet ready to be returned. For
399    /// instance, if we just read a scalar, it can be a value or a key if an implicit mapping
400    /// follows. In this case, the token stays in the `VecDeque` but cannot be returned from
401    /// [`Self::next`] until we have more context.
402    tokens: VecDeque<Token>,
403    /// The last error that happened.
404    error: Option<ScanError>,
405
406    /// Whether we have already emitted the `StreamStart` token.
407    stream_start_produced: bool,
408    /// Whether we have already emitted the `StreamEnd` token.
409    stream_end_produced: bool,
410    /// In some flow contexts, the value of a mapping is allowed to be adjacent to the `:`. When it
411    /// is, the index at which the `:` may be must be stored in `adjacent_value_allowed_at`.
412    adjacent_value_allowed_at: usize,
413    /// Whether a simple key could potentially start at the current position.
414    ///
415    /// Simple keys are the opposite of complex keys which are keys starting with `?`.
416    simple_key_allowed: bool,
417    /// A stack of potential simple keys.
418    ///
419    /// Refer to the documentation of [`SimpleKey`] for a more in-depth explanation of what they
420    /// are.
421    simple_keys: Vec<SimpleKey>,
422    /// The current indentation level.
423    indent: isize,
424    /// List of all block indentation levels we are in (except the current one).
425    indents: Vec<Indent>,
426    /// Level of nesting of flow sequences.
427    flow_level: u8,
428    /// The number of tokens that have been returned from the scanner.
429    ///
430    /// This excludes the tokens from [`Self::tokens`].
431    tokens_parsed: usize,
432    /// Whether a token is ready to be taken from [`Self::tokens`].
433    token_available: bool,
434    /// Whether all characters encountered since the last newline were whitespace.
435    leading_whitespace: bool,
436    /// Whether we started a flow mapping.
437    ///
438    /// This is used to detect implicit flow mapping starts such as:
439    /// ```yaml
440    /// [ : foo ] # { null: "foo" }
441    /// ```
442    flow_mapping_started: bool,
443    /// An array of states, representing whether flow sequences have implicit mappings.
444    ///
445    /// When a flow mapping is possible (when encountering the first `[` or a `,` in a sequence),
446    /// the state is set to [`Possible`].
447    /// When we encounter the `:`, we know we are in an implicit mapping and can set the state to
448    /// [`Inside`].
449    ///
450    /// There is one entry in this [`Vec`] for each nested flow sequence that we are in.
451    /// The entries are created with the opening `]` and popped with the closing `]`.
452    ///
453    /// [`Possible`]: ImplicitMappingState::Possible
454    /// [`Inside`]: ImplicitMappingState::Inside
455    implicit_flow_mapping_states: Vec<ImplicitMappingState>,
456    buf_leading_break: String,
457    buf_trailing_breaks: String,
458    buf_whitespaces: String,
459}
460
461impl<T: Input> Iterator for Scanner<T> {
462    type Item = Token;
463    fn next(&mut self) -> Option<Token> {
464        if self.error.is_some() {
465            return None;
466        }
467        match self.next_token() {
468            Ok(Some(tok)) => {
469                debug_print!(
470                    "    \x1B[;32m\u{21B3} {:?} \x1B[;36m{:?}\x1B[;m",
471                    tok.1,
472                    tok.0
473                );
474                Some(tok)
475            }
476            Ok(tok) => tok,
477            Err(e) => {
478                self.error = Some(e);
479                None
480            }
481        }
482    }
483}
484
485/// A convenience alias for scanner functions that may fail without returning a value.
486pub type ScanResult = Result<(), ScanError>;
487
488impl<T: Input> Scanner<T> {
489    /// Creates the YAML tokenizer.
490    pub fn new(input: T) -> Scanner<T> {
491        Scanner {
492            input,
493            mark: Marker::new(0, 1, 0),
494            tokens: VecDeque::new(),
495            error: None,
496
497            stream_start_produced: false,
498            stream_end_produced: false,
499            adjacent_value_allowed_at: 0,
500            simple_key_allowed: true,
501            simple_keys: Vec::new(),
502            indent: -1,
503            indents: Vec::new(),
504            flow_level: 0,
505            tokens_parsed: 0,
506            token_available: false,
507            leading_whitespace: true,
508            flow_mapping_started: false,
509            implicit_flow_mapping_states: vec![],
510
511            buf_leading_break: String::new(),
512            buf_trailing_breaks: String::new(),
513            buf_whitespaces: String::new(),
514        }
515    }
516
517    /// Get a copy of the last error that was encountered, if any.
518    ///
519    /// This does not clear the error state and further calls to [`Self::get_error`] will return (a
520    /// clone of) the same error.
521    #[inline]
522    pub fn get_error(&self) -> Option<ScanError> {
523        self.error.clone()
524    }
525
526    /// Consume the next character. It is assumed the next character is a blank.
527    #[inline]
528    fn skip_blank(&mut self) {
529        self.input.skip();
530
531        self.mark.index += 1;
532        self.mark.col += 1;
533    }
534
535    /// Consume the next character. It is assumed the next character is not a blank.
536    #[inline]
537    fn skip_non_blank(&mut self) {
538        self.input.skip();
539
540        self.mark.index += 1;
541        self.mark.col += 1;
542        self.leading_whitespace = false;
543    }
544
545    /// Consume the next characters. It is assumed none of the next characters are blanks.
546    #[inline]
547    fn skip_n_non_blank(&mut self, count: usize) {
548        self.input.skip_n(count);
549
550        self.mark.index += count;
551        self.mark.col += count;
552        self.leading_whitespace = false;
553    }
554
555    /// Consume the next character. It is assumed the next character is a newline.
556    #[inline]
557    fn skip_nl(&mut self) {
558        self.input.skip();
559
560        self.mark.index += 1;
561        self.mark.col = 0;
562        self.mark.line += 1;
563        self.leading_whitespace = true;
564    }
565
566    /// Consume a linebreak (either CR, LF or CRLF), if any. Do nothing if there's none.
567    #[inline]
568    fn skip_linebreak(&mut self) {
569        if self.input.next_2_are('\r', '\n') {
570            // While technically not a blank, this does not matter as `self.leading_whitespace`
571            // will be reset by `skip_nl`.
572            self.skip_blank();
573            self.skip_nl();
574        } else if self.input.next_is_break() {
575            self.skip_nl();
576        }
577    }
578
579    /// Return whether the [`TokenType::StreamStart`] event has been emitted.
580    #[inline]
581    pub fn stream_started(&self) -> bool {
582        self.stream_start_produced
583    }
584
585    /// Return whether the [`TokenType::StreamEnd`] event has been emitted.
586    #[inline]
587    pub fn stream_ended(&self) -> bool {
588        self.stream_end_produced
589    }
590
591    /// Get the current position in the input stream.
592    #[inline]
593    pub fn mark(&self) -> Marker {
594        self.mark
595    }
596
597    // Read and consume a line break (either `\r`, `\n` or `\r\n`).
598    //
599    // A `\n` is pushed into `s`.
600    //
601    // # Panics (in debug)
602    // If the next characters do not correspond to a line break.
603    #[inline]
604    fn read_break(&mut self, s: &mut String) {
605        self.skip_break();
606        s.push('\n');
607    }
608
609    // Read and consume a line break (either `\r`, `\n` or `\r\n`).
610    //
611    // # Panics (in debug)
612    // If the next characters do not correspond to a line break.
613    #[inline]
614    fn skip_break(&mut self) {
615        let c = self.input.peek();
616        let nc = self.input.peek_nth(1);
617        debug_assert!(is_break(c));
618        if c == '\r' && nc == '\n' {
619            self.skip_blank();
620        }
621        self.skip_nl();
622    }
623
624    /// Insert a token at the given position.
625    fn insert_token(&mut self, pos: usize, tok: Token) {
626        let old_len = self.tokens.len();
627        assert!(pos <= old_len);
628        self.tokens.insert(pos, tok);
629    }
630
631    fn allow_simple_key(&mut self) {
632        self.simple_key_allowed = true;
633    }
634
635    fn disallow_simple_key(&mut self) {
636        self.simple_key_allowed = false;
637    }
638
639    /// Fetch the next token in the stream.
640    ///
641    /// # Errors
642    /// Returns `ScanError` when the scanner does not find the next expected token.
643    pub fn fetch_next_token(&mut self) -> ScanResult {
644        self.input.lookahead(1);
645
646        if !self.stream_start_produced {
647            self.fetch_stream_start();
648            return Ok(());
649        }
650        self.skip_to_next_token()?;
651
652        debug_print!(
653            "  \x1B[38;5;244m\u{2192} fetch_next_token after whitespace {:?} {:?}\x1B[m",
654            self.mark,
655            self.input.peek()
656        );
657
658        self.stale_simple_keys()?;
659
660        let mark = self.mark;
661        self.unroll_indent(mark.col as isize);
662
663        self.input.lookahead(4);
664
665        if self.input.next_is_z() {
666            self.fetch_stream_end()?;
667            return Ok(());
668        }
669
670        if self.mark.col == 0 {
671            if self.input.next_char_is('%') {
672                return self.fetch_directive();
673            } else if self.input.next_is_document_start() {
674                return self.fetch_document_indicator(TokenType::DocumentStart);
675            } else if self.input.next_is_document_end() {
676                self.fetch_document_indicator(TokenType::DocumentEnd)?;
677                self.skip_ws_to_eol(SkipTabs::Yes)?;
678                if !self.input.next_is_breakz() {
679                    return Err(ScanError::new_str(
680                        self.mark,
681                        "invalid content after document end marker",
682                    ));
683                }
684                return Ok(());
685            }
686        }
687
688        if (self.mark.col as isize) < self.indent {
689            return Err(ScanError::new_str(self.mark, "invalid indentation"));
690        }
691
692        let c = self.input.peek();
693        let nc = self.input.peek_nth(1);
694        match c {
695            '[' => self.fetch_flow_collection_start(TokenType::FlowSequenceStart),
696            '{' => self.fetch_flow_collection_start(TokenType::FlowMappingStart),
697            ']' => self.fetch_flow_collection_end(TokenType::FlowSequenceEnd),
698            '}' => self.fetch_flow_collection_end(TokenType::FlowMappingEnd),
699            ',' => self.fetch_flow_entry(),
700            '-' if is_blank_or_breakz(nc) => self.fetch_block_entry(),
701            '?' if is_blank_or_breakz(nc) => self.fetch_key(),
702            ':' if is_blank_or_breakz(nc) => self.fetch_value(),
703            ':' if self.flow_level > 0
704                && (is_flow(nc) || self.mark.index == self.adjacent_value_allowed_at) =>
705            {
706                self.fetch_flow_value()
707            }
708            // Is it an alias?
709            '*' => self.fetch_anchor(true),
710            // Is it an anchor?
711            '&' => self.fetch_anchor(false),
712            '!' => self.fetch_tag(),
713            // Is it a literal scalar?
714            '|' if self.flow_level == 0 => self.fetch_block_scalar(true),
715            // Is it a folded scalar?
716            '>' if self.flow_level == 0 => self.fetch_block_scalar(false),
717            '\'' => self.fetch_flow_scalar(true),
718            '"' => self.fetch_flow_scalar(false),
719            // plain scalar
720            '-' if !is_blank_or_breakz(nc) => self.fetch_plain_scalar(),
721            ':' | '?' if !is_blank_or_breakz(nc) && self.flow_level == 0 => {
722                self.fetch_plain_scalar()
723            }
724            '%' | '@' | '`' => Err(ScanError::new(
725                self.mark,
726                format!("unexpected character: `{c}'"),
727            )),
728            _ => self.fetch_plain_scalar(),
729        }
730    }
731
732    /// Return the next token in the stream.
733    /// # Errors
734    /// Returns `ScanError` when scanning fails to find an expected next token.
735    pub fn next_token(&mut self) -> Result<Option<Token>, ScanError> {
736        if self.stream_end_produced {
737            return Ok(None);
738        }
739
740        if !self.token_available {
741            self.fetch_more_tokens()?;
742        }
743        let Some(t) = self.tokens.pop_front() else {
744            return Err(ScanError::new_str(
745                self.mark,
746                "did not find expected next token",
747            ));
748        };
749        self.token_available = false;
750        self.tokens_parsed += 1;
751
752        if let TokenType::StreamEnd = t.1 {
753            self.stream_end_produced = true;
754        }
755        Ok(Some(t))
756    }
757
758    /// Fetch tokens from the token stream.
759    /// # Errors
760    /// Returns `ScanError` when loading fails.
761    pub fn fetch_more_tokens(&mut self) -> ScanResult {
762        let mut need_more;
763        loop {
764            if self.tokens.is_empty() {
765                need_more = true;
766            } else {
767                need_more = false;
768                // Stale potential keys that we know won't be keys.
769                self.stale_simple_keys()?;
770                // If our next token to be emitted may be a key, fetch more context.
771                for sk in &self.simple_keys {
772                    if sk.possible && sk.token_number == self.tokens_parsed {
773                        need_more = true;
774                        break;
775                    }
776                }
777            }
778
779            if !need_more {
780                break;
781            }
782            self.fetch_next_token()?;
783        }
784        self.token_available = true;
785
786        Ok(())
787    }
788
789    /// Mark simple keys that can no longer be keys as such.
790    ///
791    /// This function sets `possible` to `false` to each key that, now we have more context, we
792    /// know will not be keys.
793    ///
794    /// # Errors
795    /// This function returns an error if one of the key we would stale was required to be a key.
796    fn stale_simple_keys(&mut self) -> ScanResult {
797        for sk in &mut self.simple_keys {
798            if sk.possible
799                // If not in a flow construct, simple keys cannot span multiple lines.
800                && self.flow_level == 0
801                    && (sk.mark.line < self.mark.line || sk.mark.index + 1024 < self.mark.index)
802            {
803                if sk.required {
804                    return Err(ScanError::new_str(self.mark, "simple key expect ':'"));
805                }
806                sk.possible = false;
807            }
808        }
809        Ok(())
810    }
811
812    /// Skip over all whitespace (`\t`, ` `, `\n`, `\r`) and comments until the next token.
813    ///
814    /// # Errors
815    /// This function returns an error if a tabulation is encountered where there should not be
816    /// one.
817    fn skip_to_next_token(&mut self) -> ScanResult {
818        loop {
819            // TODO(chenyh) BOM
820            match self.input.look_ch() {
821                // Tabs may not be used as indentation.
822                // "Indentation" only exists as long as a block is started, but does not exist
823                // inside of flow-style constructs. Tabs are allowed as part of leading
824                // whitespaces outside of indentation.
825                // If a flow-style construct is in an indented block, its contents must still be
826                // indented. Also, tabs are allowed anywhere in it if it has no content.
827                '\t' if self.is_within_block()
828                    && self.leading_whitespace
829                    && (self.mark.col as isize) < self.indent =>
830                {
831                    self.skip_ws_to_eol(SkipTabs::Yes)?;
832                    // If we have content on that line with a tab, return an error.
833                    if !self.input.next_is_breakz() {
834                        return Err(ScanError::new_str(
835                            self.mark,
836                            "tabs disallowed within this context (block indentation)",
837                        ));
838                    }
839                }
840                '\t' | ' ' => self.skip_blank(),
841                '\n' | '\r' => {
842                    self.input.lookahead(2);
843                    self.skip_linebreak();
844                    if self.flow_level == 0 {
845                        self.allow_simple_key();
846                    }
847                }
848                '#' => {
849                    let comment_length = self.input.skip_while_non_breakz();
850                    self.mark.index += comment_length;
851                    self.mark.col += comment_length;
852                }
853                _ => break,
854            }
855        }
856        Ok(())
857    }
858
859    /// Skip over YAML whitespace (` `, `\n`, `\r`).
860    ///
861    /// # Errors
862    /// This function returns an error if no whitespace was found.
863    fn skip_yaml_whitespace(&mut self) -> ScanResult {
864        let mut need_whitespace = true;
865        loop {
866            match self.input.look_ch() {
867                ' ' => {
868                    self.skip_blank();
869
870                    need_whitespace = false;
871                }
872                '\n' | '\r' => {
873                    self.input.lookahead(2);
874                    self.skip_linebreak();
875                    if self.flow_level == 0 {
876                        self.allow_simple_key();
877                    }
878                    need_whitespace = false;
879                }
880                '#' => {
881                    let comment_length = self.input.skip_while_non_breakz();
882                    self.mark.index += comment_length;
883                    self.mark.col += comment_length;
884                }
885                _ => break,
886            }
887        }
888
889        if need_whitespace {
890            Err(ScanError::new_str(self.mark(), "expected whitespace"))
891        } else {
892            Ok(())
893        }
894    }
895
896    fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> Result<SkipTabs, ScanError> {
897        let (n_bytes, result) = self.input.skip_ws_to_eol(skip_tabs);
898        self.mark.col += n_bytes;
899        self.mark.index += n_bytes;
900        result.map_err(|msg| ScanError::new_str(self.mark, msg))
901    }
902
903    fn fetch_stream_start(&mut self) {
904        let mark = self.mark;
905        self.indent = -1;
906        self.stream_start_produced = true;
907        self.allow_simple_key();
908        self.tokens.push_back(Token(
909            Span::empty(mark),
910            TokenType::StreamStart(TEncoding::Utf8),
911        ));
912        self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0)));
913    }
914
915    fn fetch_stream_end(&mut self) -> ScanResult {
916        // force new line
917        if self.mark.col != 0 {
918            self.mark.col = 0;
919            self.mark.line += 1;
920        }
921
922        // If the stream ended, we won't have more context. We can stall all the simple keys we
923        // had. If one was required, however, that was an error and we must propagate it.
924        for sk in &mut self.simple_keys {
925            if sk.required && sk.possible {
926                return Err(ScanError::new_str(self.mark, "simple key expected"));
927            }
928            sk.possible = false;
929        }
930
931        self.unroll_indent(-1);
932        self.remove_simple_key()?;
933        self.disallow_simple_key();
934
935        self.tokens
936            .push_back(Token(Span::empty(self.mark), TokenType::StreamEnd));
937        Ok(())
938    }
939
940    fn fetch_directive(&mut self) -> ScanResult {
941        self.unroll_indent(-1);
942        self.remove_simple_key()?;
943
944        self.disallow_simple_key();
945
946        let tok = self.scan_directive()?;
947        self.tokens.push_back(tok);
948
949        Ok(())
950    }
951
952    fn scan_directive(&mut self) -> Result<Token, ScanError> {
953        let start_mark = self.mark;
954        self.skip_non_blank();
955
956        let name = self.scan_directive_name()?;
957        let tok = match name.as_ref() {
958            "YAML" => self.scan_version_directive_value(&start_mark)?,
959            "TAG" => self.scan_tag_directive_value(&start_mark)?,
960            // XXX This should be a warning instead of an error
961            _ => {
962                // skip current line
963                let line_len = self.input.skip_while_non_breakz();
964                self.mark.index += line_len;
965                self.mark.col += line_len;
966                // XXX return an empty TagDirective token
967                Token(
968                    Span::new(start_mark, self.mark),
969                    TokenType::TagDirective(String::new(), String::new()),
970                )
971                // return Err(ScanError::new_str(start_mark,
972                //     "while scanning a directive, found unknown directive name"))
973            }
974        };
975
976        self.skip_ws_to_eol(SkipTabs::Yes)?;
977
978        if self.input.next_is_breakz() {
979            self.input.lookahead(2);
980            self.skip_linebreak();
981            Ok(tok)
982        } else {
983            Err(ScanError::new_str(
984                start_mark,
985                "while scanning a directive, did not find expected comment or line break",
986            ))
987        }
988    }
989
990    fn scan_version_directive_value(&mut self, mark: &Marker) -> Result<Token, ScanError> {
991        let n_blanks = self.input.skip_while_blank();
992        self.mark.index += n_blanks;
993        self.mark.col += n_blanks;
994
995        let major = self.scan_version_directive_number(mark)?;
996
997        if self.input.peek() != '.' {
998            return Err(ScanError::new_str(
999                *mark,
1000                "while scanning a YAML directive, did not find expected digit or '.' character",
1001            ));
1002        }
1003        self.skip_non_blank();
1004
1005        let minor = self.scan_version_directive_number(mark)?;
1006
1007        Ok(Token(
1008            Span::new(*mark, self.mark),
1009            TokenType::VersionDirective(major, minor),
1010        ))
1011    }
1012
1013    fn scan_directive_name(&mut self) -> Result<String, ScanError> {
1014        let start_mark = self.mark;
1015        let mut string = String::new();
1016
1017        let n_chars = self.input.fetch_while_is_alpha(&mut string);
1018        self.mark.index += n_chars;
1019        self.mark.col += n_chars;
1020
1021        if string.is_empty() {
1022            return Err(ScanError::new_str(
1023                start_mark,
1024                "while scanning a directive, could not find expected directive name",
1025            ));
1026        }
1027
1028        if !is_blank_or_breakz(self.input.peek()) {
1029            return Err(ScanError::new_str(
1030                start_mark,
1031                "while scanning a directive, found unexpected non-alphabetical character",
1032            ));
1033        }
1034
1035        Ok(string)
1036    }
1037
1038    fn scan_version_directive_number(&mut self, mark: &Marker) -> Result<u32, ScanError> {
1039        let mut val = 0u32;
1040        let mut length = 0usize;
1041        while let Some(digit) = self.input.look_ch().to_digit(10) {
1042            if length + 1 > 9 {
1043                return Err(ScanError::new_str(
1044                    *mark,
1045                    "while scanning a YAML directive, found extremely long version number",
1046                ));
1047            }
1048            length += 1;
1049            val = val * 10 + digit;
1050            self.skip_non_blank();
1051        }
1052
1053        if length == 0 {
1054            return Err(ScanError::new_str(
1055                *mark,
1056                "while scanning a YAML directive, did not find expected version number",
1057            ));
1058        }
1059
1060        Ok(val)
1061    }
1062
1063    fn scan_tag_directive_value(&mut self, mark: &Marker) -> Result<Token, ScanError> {
1064        let n_blanks = self.input.skip_while_blank();
1065        self.mark.index += n_blanks;
1066        self.mark.col += n_blanks;
1067
1068        let handle = self.scan_tag_handle(true, mark)?;
1069
1070        let n_blanks = self.input.skip_while_blank();
1071        self.mark.index += n_blanks;
1072        self.mark.col += n_blanks;
1073
1074        let prefix = self.scan_tag_prefix(mark)?;
1075
1076        self.input.lookahead(1);
1077
1078        if self.input.next_is_blank_or_breakz() {
1079            Ok(Token(
1080                Span::new(*mark, self.mark),
1081                TokenType::TagDirective(handle, prefix),
1082            ))
1083        } else {
1084            Err(ScanError::new_str(
1085                *mark,
1086                "while scanning TAG, did not find expected whitespace or line break",
1087            ))
1088        }
1089    }
1090
1091    fn fetch_tag(&mut self) -> ScanResult {
1092        self.save_simple_key();
1093        self.disallow_simple_key();
1094
1095        let tok = self.scan_tag()?;
1096        self.tokens.push_back(tok);
1097        Ok(())
1098    }
1099
1100    fn scan_tag(&mut self) -> Result<Token, ScanError> {
1101        let start_mark = self.mark;
1102        let mut handle = String::new();
1103        let mut suffix;
1104
1105        // Check if the tag is in the canonical form (verbatim).
1106        self.input.lookahead(2);
1107
1108        if self.input.nth_char_is(1, '<') {
1109            suffix = self.scan_verbatim_tag(&start_mark)?;
1110        } else {
1111            // The tag has either the '!suffix' or the '!handle!suffix'
1112            handle = self.scan_tag_handle(false, &start_mark)?;
1113            // Check if it is, indeed, handle.
1114            if handle.len() >= 2 && handle.starts_with('!') && handle.ends_with('!') {
1115                // A tag handle starting with "!!" is a secondary tag handle.
1116                let is_secondary_handle = handle == "!!";
1117                suffix =
1118                    self.scan_tag_shorthand_suffix(false, is_secondary_handle, "", &start_mark)?;
1119            } else {
1120                suffix = self.scan_tag_shorthand_suffix(false, false, &handle, &start_mark)?;
1121                "!".clone_into(&mut handle);
1122                // A special case: the '!' tag.  Set the handle to '' and the
1123                // suffix to '!'.
1124                if suffix.is_empty() {
1125                    handle.clear();
1126                    "!".clone_into(&mut suffix);
1127                }
1128            }
1129        }
1130
1131        if is_blank_or_breakz(self.input.look_ch())
1132            || (self.flow_level > 0 && self.input.next_is_flow())
1133        {
1134            // XXX: ex 7.2, an empty scalar can follow a secondary tag
1135            Ok(Token(
1136                Span::new(start_mark, self.mark),
1137                TokenType::Tag(handle, suffix),
1138            ))
1139        } else {
1140            Err(ScanError::new_str(
1141                start_mark,
1142                "while scanning a tag, did not find expected whitespace or line break",
1143            ))
1144        }
1145    }
1146
1147    fn scan_tag_handle(&mut self, directive: bool, mark: &Marker) -> Result<String, ScanError> {
1148        let mut string = String::new();
1149        if self.input.look_ch() != '!' {
1150            return Err(ScanError::new_str(
1151                *mark,
1152                "while scanning a tag, did not find expected '!'",
1153            ));
1154        }
1155
1156        string.push(self.input.peek());
1157        self.skip_non_blank();
1158
1159        let n_chars = self.input.fetch_while_is_alpha(&mut string);
1160        self.mark.index += n_chars;
1161        self.mark.col += n_chars;
1162
1163        // Check if the trailing character is '!' and copy it.
1164        if self.input.peek() == '!' {
1165            string.push(self.input.peek());
1166            self.skip_non_blank();
1167        } else if directive && string != "!" {
1168            // It's either the '!' tag or not really a tag handle.  If it's a %TAG
1169            // directive, it's an error.  If it's a tag token, it must be a part of
1170            // URI.
1171            return Err(ScanError::new_str(
1172                *mark,
1173                "while parsing a tag directive, did not find expected '!'",
1174            ));
1175        }
1176        Ok(string)
1177    }
1178
1179    /// Scan for a tag prefix (6.8.2.2).
1180    ///
1181    /// There are 2 kinds of tag prefixes:
1182    ///   - Local: Starts with a `!`, contains only URI chars (`!foo`)
1183    ///   - Global: Starts with a tag char, contains then URI chars (`!foo,2000:app/`)
1184    fn scan_tag_prefix(&mut self, start_mark: &Marker) -> Result<String, ScanError> {
1185        let mut string = String::new();
1186
1187        if self.input.look_ch() == '!' {
1188            // If we have a local tag, insert and skip `!`.
1189            string.push(self.input.peek());
1190            self.skip_non_blank();
1191        } else if !is_tag_char(self.input.peek()) {
1192            // Otherwise, check if the first global tag character is valid.
1193            return Err(ScanError::new_str(
1194                *start_mark,
1195                "invalid global tag character",
1196            ));
1197        } else if self.input.peek() == '%' {
1198            // If it is valid and an escape sequence, escape it.
1199            string.push(self.scan_uri_escapes(start_mark)?);
1200        } else {
1201            // Otherwise, push the first character.
1202            string.push(self.input.peek());
1203            self.skip_non_blank();
1204        }
1205
1206        while is_uri_char(self.input.look_ch()) {
1207            if self.input.peek() == '%' {
1208                string.push(self.scan_uri_escapes(start_mark)?);
1209            } else {
1210                string.push(self.input.peek());
1211                self.skip_non_blank();
1212            }
1213        }
1214
1215        Ok(string)
1216    }
1217
1218    /// Scan for a verbatim tag.
1219    ///
1220    /// The prefixing `!<` must _not_ have been skipped.
1221    fn scan_verbatim_tag(&mut self, start_mark: &Marker) -> Result<String, ScanError> {
1222        // Eat `!<`
1223        self.skip_non_blank();
1224        self.skip_non_blank();
1225
1226        let mut string = String::new();
1227        while is_uri_char(self.input.look_ch()) {
1228            if self.input.peek() == '%' {
1229                string.push(self.scan_uri_escapes(start_mark)?);
1230            } else {
1231                string.push(self.input.peek());
1232                self.skip_non_blank();
1233            }
1234        }
1235
1236        if self.input.peek() != '>' {
1237            return Err(ScanError::new_str(
1238                *start_mark,
1239                "while scanning a verbatim tag, did not find the expected '>'",
1240            ));
1241        }
1242        self.skip_non_blank();
1243
1244        Ok(string)
1245    }
1246
1247    fn scan_tag_shorthand_suffix(
1248        &mut self,
1249        _directive: bool,
1250        _is_secondary: bool,
1251        head: &str,
1252        mark: &Marker,
1253    ) -> Result<String, ScanError> {
1254        let mut length = head.len();
1255        let mut string = String::new();
1256
1257        // Copy the head if needed.
1258        // Note that we don't copy the leading '!' character.
1259        if length > 1 {
1260            string.extend(head.chars().skip(1));
1261        }
1262
1263        while is_tag_char(self.input.look_ch()) {
1264            // Check if it is a URI-escape sequence.
1265            if self.input.peek() == '%' {
1266                string.push(self.scan_uri_escapes(mark)?);
1267            } else {
1268                string.push(self.input.peek());
1269                self.skip_non_blank();
1270            }
1271
1272            length += 1;
1273        }
1274
1275        if length == 0 {
1276            return Err(ScanError::new_str(
1277                *mark,
1278                "while parsing a tag, did not find expected tag URI",
1279            ));
1280        }
1281
1282        Ok(string)
1283    }
1284
1285    fn scan_uri_escapes(&mut self, mark: &Marker) -> Result<char, ScanError> {
1286        let mut width = 0usize;
1287        let mut code = 0u32;
1288        loop {
1289            self.input.lookahead(3);
1290
1291            let c = self.input.peek_nth(1);
1292            let nc = self.input.peek_nth(2);
1293
1294            if !(self.input.peek() == '%' && is_hex(c) && is_hex(nc)) {
1295                return Err(ScanError::new_str(
1296                    *mark,
1297                    "while parsing a tag, found an invalid escape sequence",
1298                ));
1299            }
1300
1301            let byte = (as_hex(c) << 4) + as_hex(nc);
1302            if width == 0 {
1303                width = match byte {
1304                    _ if byte & 0x80 == 0x00 => 1,
1305                    _ if byte & 0xE0 == 0xC0 => 2,
1306                    _ if byte & 0xF0 == 0xE0 => 3,
1307                    _ if byte & 0xF8 == 0xF0 => 4,
1308                    _ => {
1309                        return Err(ScanError::new_str(
1310                            *mark,
1311                            "while parsing a tag, found an incorrect leading UTF-8 byte",
1312                        ));
1313                    }
1314                };
1315                code = byte;
1316            } else {
1317                if byte & 0xc0 != 0x80 {
1318                    return Err(ScanError::new_str(
1319                        *mark,
1320                        "while parsing a tag, found an incorrect trailing UTF-8 byte",
1321                    ));
1322                }
1323                code = (code << 8) + byte;
1324            }
1325
1326            self.skip_n_non_blank(3);
1327
1328            width -= 1;
1329            if width == 0 {
1330                break;
1331            }
1332        }
1333
1334        match char::from_u32(code) {
1335            Some(ch) => Ok(ch),
1336            None => Err(ScanError::new_str(
1337                *mark,
1338                "while parsing a tag, found an invalid UTF-8 codepoint",
1339            )),
1340        }
1341    }
1342
1343    fn fetch_anchor(&mut self, alias: bool) -> ScanResult {
1344        self.save_simple_key();
1345        self.disallow_simple_key();
1346
1347        let tok = self.scan_anchor(alias)?;
1348
1349        self.tokens.push_back(tok);
1350
1351        Ok(())
1352    }
1353
1354    fn scan_anchor(&mut self, alias: bool) -> Result<Token, ScanError> {
1355        let mut string = String::new();
1356        let start_mark = self.mark;
1357
1358        self.skip_non_blank();
1359        while is_anchor_char(self.input.look_ch()) {
1360            string.push(self.input.peek());
1361            self.skip_non_blank();
1362        }
1363
1364        if string.is_empty() {
1365            return Err(ScanError::new_str(start_mark, "while scanning an anchor or alias, did not find expected alphabetic or numeric character"));
1366        }
1367
1368        let tok = if alias {
1369            TokenType::Alias(string)
1370        } else {
1371            TokenType::Anchor(string)
1372        };
1373        Ok(Token(Span::new(start_mark, self.mark), tok))
1374    }
1375
1376    fn fetch_flow_collection_start(&mut self, tok: TokenType) -> ScanResult {
1377        // The indicators '[' and '{' may start a simple key.
1378        self.save_simple_key();
1379
1380        self.roll_one_col_indent();
1381        self.increase_flow_level()?;
1382
1383        self.allow_simple_key();
1384
1385        let start_mark = self.mark;
1386        self.skip_non_blank();
1387
1388        if tok == TokenType::FlowMappingStart {
1389            self.flow_mapping_started = true;
1390        } else {
1391            self.implicit_flow_mapping_states
1392                .push(ImplicitMappingState::Possible);
1393        }
1394
1395        self.skip_ws_to_eol(SkipTabs::Yes)?;
1396
1397        self.tokens
1398            .push_back(Token(Span::new(start_mark, self.mark), tok));
1399        Ok(())
1400    }
1401
1402    fn fetch_flow_collection_end(&mut self, tok: TokenType) -> ScanResult {
1403        self.remove_simple_key()?;
1404        self.decrease_flow_level();
1405
1406        self.disallow_simple_key();
1407
1408        if matches!(tok, TokenType::FlowSequenceEnd) {
1409            self.end_implicit_mapping(self.mark);
1410            // We are out exiting the flow sequence, nesting goes down 1 level.
1411            self.implicit_flow_mapping_states.pop();
1412        }
1413
1414        let start_mark = self.mark;
1415        self.skip_non_blank();
1416        self.skip_ws_to_eol(SkipTabs::Yes)?;
1417
1418        // A flow collection within a flow mapping can be a key. In that case, the value may be
1419        // adjacent to the `:`.
1420        // ```yaml
1421        // - [ {a: b}:value ]
1422        // ```
1423        if self.flow_level > 0 {
1424            self.adjacent_value_allowed_at = self.mark.index;
1425        }
1426
1427        self.tokens
1428            .push_back(Token(Span::new(start_mark, self.mark), tok));
1429        Ok(())
1430    }
1431
1432    /// Push the `FlowEntry` token and skip over the `,`.
1433    fn fetch_flow_entry(&mut self) -> ScanResult {
1434        self.remove_simple_key()?;
1435        self.allow_simple_key();
1436
1437        self.end_implicit_mapping(self.mark);
1438
1439        let start_mark = self.mark;
1440        self.skip_non_blank();
1441        self.skip_ws_to_eol(SkipTabs::Yes)?;
1442
1443        self.tokens.push_back(Token(
1444            Span::new(start_mark, self.mark),
1445            TokenType::FlowEntry,
1446        ));
1447        Ok(())
1448    }
1449
1450    fn increase_flow_level(&mut self) -> ScanResult {
1451        self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0)));
1452        self.flow_level = self
1453            .flow_level
1454            .checked_add(1)
1455            .ok_or_else(|| ScanError::new_str(self.mark, "recursion limit exceeded"))?;
1456        Ok(())
1457    }
1458
1459    fn decrease_flow_level(&mut self) {
1460        if self.flow_level > 0 {
1461            self.flow_level -= 1;
1462            self.simple_keys.pop().unwrap();
1463        }
1464    }
1465
1466    /// Push the `Block*` token(s) and skip over the `-`.
1467    ///
1468    /// Add an indentation level and push a `BlockSequenceStart` token if needed, then push a
1469    /// `BlockEntry` token.
1470    /// This function only skips over the `-` and does not fetch the entry value.
1471    fn fetch_block_entry(&mut self) -> ScanResult {
1472        if self.flow_level > 0 {
1473            // - * only allowed in block
1474            return Err(ScanError::new_str(
1475                self.mark,
1476                r#""-" is only valid inside a block"#,
1477            ));
1478        }
1479        // Check if we are allowed to start a new entry.
1480        if !self.simple_key_allowed {
1481            return Err(ScanError::new_str(
1482                self.mark,
1483                "block sequence entries are not allowed in this context",
1484            ));
1485        }
1486
1487        // ???, fixes test G9HC.
1488        if let Some(Token(span, TokenType::Anchor(..) | TokenType::Tag(..))) = self.tokens.back() {
1489            if self.mark.col == 0 && span.start.col == 0 && self.indent > -1 {
1490                return Err(ScanError::new_str(
1491                    span.start,
1492                    "invalid indentation for anchor",
1493                ));
1494            }
1495        }
1496
1497        // Skip over the `-`.
1498        let mark = self.mark;
1499        self.skip_non_blank();
1500
1501        // generate BLOCK-SEQUENCE-START if indented
1502        self.roll_indent(mark.col, None, TokenType::BlockSequenceStart, mark);
1503        let found_tabs = self.skip_ws_to_eol(SkipTabs::Yes)?.found_tabs();
1504        self.input.lookahead(2);
1505        if found_tabs && self.input.next_char_is('-') && is_blank_or_breakz(self.input.peek_nth(1))
1506        {
1507            return Err(ScanError::new_str(
1508                self.mark,
1509                "'-' must be followed by a valid YAML whitespace",
1510            ));
1511        }
1512
1513        self.skip_ws_to_eol(SkipTabs::No)?;
1514        self.input.lookahead(1);
1515        if self.input.next_is_break() || self.input.next_is_flow() {
1516            self.roll_one_col_indent();
1517        }
1518
1519        self.remove_simple_key()?;
1520        self.allow_simple_key();
1521
1522        self.tokens
1523            .push_back(Token(Span::empty(self.mark), TokenType::BlockEntry));
1524
1525        Ok(())
1526    }
1527
1528    fn fetch_document_indicator(&mut self, t: TokenType) -> ScanResult {
1529        self.unroll_indent(-1);
1530        self.remove_simple_key()?;
1531        self.disallow_simple_key();
1532
1533        let mark = self.mark;
1534
1535        self.skip_n_non_blank(3);
1536
1537        self.tokens.push_back(Token(Span::new(mark, self.mark), t));
1538        Ok(())
1539    }
1540
1541    fn fetch_block_scalar(&mut self, literal: bool) -> ScanResult {
1542        self.save_simple_key();
1543        self.allow_simple_key();
1544        let tok = self.scan_block_scalar(literal)?;
1545
1546        self.tokens.push_back(tok);
1547        Ok(())
1548    }
1549
    /// Scan a block scalar and return it as a `Scalar` token.
    ///
    /// `literal` selects the style of the resulting scalar: `TScalarStyle::Literal` if true,
    /// `TScalarStyle::Folded` otherwise. When folding, a single line break between two lines
    /// that do not start with a blank is replaced by a space.
    ///
    /// The scan proceeds as follows:
    ///   1. Skip the indicator character and read the optional header: a chomping indicator
    ///      (`+` for keep, `-` for strip; clip otherwise) and an indentation indicator digit, in
    ///      either order.
    ///   2. Skip the rest of the header line, which must hold nothing but whitespace or a
    ///      comment.
    ///   3. Determine the content indentation: from the indentation indicator if given,
    ///      otherwise from the first lines of the scalar.
    ///   4. Read content lines for as long as they start at the content indentation.
    ///   5. Apply the chomping rule to the trailing line breaks.
    ///
    /// # Errors
    /// Returns an error if:
    ///   - the indentation indicator is `0`;
    ///   - the header is followed by something other than whitespace, a comment, a line break
    ///     or the end of the input;
    ///   - the character right after the header line is a tab;
    ///   - a line is indented less than the content but more than the enclosing block.
    #[allow(clippy::too_many_lines)]
    fn scan_block_scalar(&mut self, literal: bool) -> Result<Token, ScanError> {
        let start_mark = self.mark;
        let mut chomping = Chomping::Clip;
        // Value of the explicit indentation indicator (0 if there is none).
        let mut increment: usize = 0;
        // Column at which content lines must start (0 while still undetermined).
        let mut indent: usize = 0;
        let mut trailing_blank: bool;
        let mut leading_blank: bool = false;
        let style = if literal {
            TScalarStyle::Literal
        } else {
            TScalarStyle::Folded
        };

        // The scalar contents accumulated so far.
        let mut string = String::new();
        // The line break that ended the last content line read.
        let mut leading_break = String::new();
        // The line breaks of the empty lines that followed the last content line read.
        let mut trailing_breaks = String::new();
        // The line break that ends the header line.
        let mut chomping_break = String::new();

        // skip '|' or '>'
        self.skip_non_blank();
        self.unroll_non_block_indents();

        // Header: chomping indicator followed by an optional indentation indicator...
        if self.input.look_ch() == '+' || self.input.peek() == '-' {
            if self.input.peek() == '+' {
                chomping = Chomping::Keep;
            } else {
                chomping = Chomping::Strip;
            }
            self.skip_non_blank();
            self.input.lookahead(1);
            if self.input.next_is_digit() {
                if self.input.peek() == '0' {
                    return Err(ScanError::new_str(
                        start_mark,
                        "while scanning a block scalar, found an indentation indicator equal to 0",
                    ));
                }
                increment = (self.input.peek() as usize) - ('0' as usize);
                self.skip_non_blank();
            }
        // ... or indentation indicator followed by an optional chomping indicator.
        } else if self.input.next_is_digit() {
            if self.input.peek() == '0' {
                return Err(ScanError::new_str(
                    start_mark,
                    "while scanning a block scalar, found an indentation indicator equal to 0",
                ));
            }

            increment = (self.input.peek() as usize) - ('0' as usize);
            self.skip_non_blank();
            self.input.lookahead(1);
            if self.input.peek() == '+' || self.input.peek() == '-' {
                if self.input.peek() == '+' {
                    chomping = Chomping::Keep;
                } else {
                    chomping = Chomping::Strip;
                }
                self.skip_non_blank();
            }
        }

        // Skip what remains of the header line.
        self.skip_ws_to_eol(SkipTabs::Yes)?;

        // Check if we are at the end of the line.
        self.input.lookahead(1);
        if !self.input.next_is_breakz() {
            return Err(ScanError::new_str(
                start_mark,
                "while scanning a block scalar, did not find expected comment or line break",
            ));
        }

        // Consume the line break ending the header line, remembering it for chomping.
        if self.input.next_is_break() {
            self.input.lookahead(2);
            self.read_break(&mut chomping_break);
        }

        if self.input.look_ch() == '\t' {
            return Err(ScanError::new_str(
                start_mark,
                "a block scalar content cannot start with a tab",
            ));
        }

        // With an explicit indentation indicator, the content indentation is relative to the
        // indentation of the enclosing block (if any).
        if increment > 0 {
            indent = if self.indent >= 0 {
                (self.indent + increment as isize) as usize
            } else {
                increment
            }
        }

        // Scan the leading line breaks and determine the indentation level if needed.
        if indent == 0 {
            self.skip_block_scalar_first_line_indent(&mut indent, &mut trailing_breaks);
        } else {
            self.skip_block_scalar_indent(indent, &mut trailing_breaks);
        }

        // We have an end-of-stream with no content, e.g.:
        // ```yaml
        // - |+
        // ```
        if self.input.next_is_z() {
            let contents = match chomping {
                // We strip trailing linebreaks. Nothing remain.
                Chomping::Strip => String::new(),
                // There was no newline after the chomping indicator.
                _ if self.mark.line == start_mark.line() => String::new(),
                // We clip lines, and there was a newline after the chomping indicator.
                // All other breaks are ignored.
                Chomping::Clip => chomping_break,
                // We keep lines. There was a newline after the chomping indicator but nothing
                // else.
                Chomping::Keep if trailing_breaks.is_empty() => chomping_break,
                // Otherwise, the newline after chomping is ignored.
                Chomping::Keep => trailing_breaks,
            };
            return Ok(Token(
                Span::new(start_mark, self.mark),
                TokenType::Scalar(style, contents),
            ));
        }

        // A line indented less than the content, yet more than the enclosing block, belongs
        // neither to the scalar nor to what follows it.
        if self.mark.col < indent && (self.mark.col as isize) > self.indent {
            return Err(ScanError::new_str(
                self.mark,
                "wrongly indented line in block scalar",
            ));
        }

        let mut line_buffer = String::with_capacity(100);
        // NOTE(review): this shadows the `start_mark` taken at the indicator, so the span of the
        // token returned below starts at the first content line. Confirm this is intended.
        let start_mark = self.mark;
        // Read content lines for as long as they start at the content indentation.
        while self.mark.col == indent && !self.input.next_is_z() {
            // Without indentation, a document end marker terminates the scalar.
            if indent == 0 {
                self.input.lookahead(4);
                if self.input.next_is_document_end() {
                    break;
                }
            }

            // We are at the first content character of a content line.
            trailing_blank = self.input.next_is_blank();
            // Folding: join this line to the previous one, unless either of them starts with
            // a blank.
            if !literal && !leading_break.is_empty() && !leading_blank && !trailing_blank {
                string.push_str(&trailing_breaks);
                // A lone line break is folded into a space.
                if trailing_breaks.is_empty() {
                    string.push(' ');
                }
            } else {
                string.push_str(&leading_break);
                string.push_str(&trailing_breaks);
            }

            leading_break.clear();
            trailing_breaks.clear();

            leading_blank = self.input.next_is_blank();

            self.scan_block_scalar_content_line(&mut string, &mut line_buffer);

            // break on EOF
            self.input.lookahead(2);
            if self.input.next_is_z() {
                break;
            }

            self.read_break(&mut leading_break);

            // Eat the following indentation spaces and line breaks.
            self.skip_block_scalar_indent(indent, &mut trailing_breaks);
        }

        // Chomp the tail.
        if chomping != Chomping::Strip {
            string.push_str(&leading_break);
            // If we had reached an eof but the last character wasn't an end-of-line, check if the
            // last line was indented at least as the rest of the scalar, then we need to consider
            // there is a newline.
            if self.input.next_is_z() && self.mark.col >= indent.max(1) {
                string.push('\n');
            }
        }

        // Only the "keep" chomping retains the trailing empty lines.
        if chomping == Chomping::Keep {
            string.push_str(&trailing_breaks);
        }

        Ok(Token(
            Span::new(start_mark, self.mark),
            TokenType::Scalar(style, string),
        ))
    }
1743
1744    /// Retrieve the contents of the line, parsing it as a block scalar.
1745    ///
1746    /// The contents will be appended to `string`. `line_buffer` is used as a temporary buffer to
1747    /// store bytes before pushing them to `string` and thus avoiding reallocating more than
1748    /// necessary. `line_buffer` is assumed to be empty upon calling this function. It will be
1749    /// `clear`ed before the end of the function.
1750    ///
1751    /// This function assumed the first character to read is the first content character in the
1752    /// line. This function does not consume the line break character(s) after the line.
1753    fn scan_block_scalar_content_line(&mut self, string: &mut String, line_buffer: &mut String) {
1754        // Start by evaluating characters in the buffer.
1755        while !self.input.buf_is_empty() && !self.input.next_is_breakz() {
1756            string.push(self.input.peek());
1757            // We may technically skip non-blank characters. However, the only distinction is
1758            // to determine what is leading whitespace and what is not. Here, we read the
1759            // contents of the line until either eof or a linebreak. We know we will not read
1760            // `self.leading_whitespace` until the end of the line, where it will be reset.
1761            // This allows us to call a slightly less expensive function.
1762            self.skip_blank();
1763        }
1764
1765        // All characters that were in the buffer were consumed. We need to check if more
1766        // follow.
1767        if self.input.buf_is_empty() {
1768            // We will read all consecutive non-breakz characters. We push them into a
1769            // temporary buffer. The main difference with going through `self.buffer` is that
1770            // characters are appended here as their real size (1B for ascii, or up to 4 bytes for
1771            // UTF-8). We can then use the internal `line_buffer` `Vec` to push data into `string`
1772            // (using `String::push_str`).
1773            while let Some(c) = self.input.raw_read_non_breakz_ch() {
1774                line_buffer.push(c);
1775            }
1776
1777            // We need to manually update our position; we haven't called a `skip` function.
1778            let n_chars = line_buffer.chars().count();
1779            self.mark.col += n_chars;
1780            self.mark.index += n_chars;
1781
1782            // We can now append our bytes to our `string`.
1783            string.reserve(line_buffer.as_bytes().len());
1784            string.push_str(line_buffer);
1785            // This clears the _contents_ without touching the _capacity_.
1786            line_buffer.clear();
1787        }
1788    }
1789
1790    /// Skip the block scalar indentation and empty lines.
1791    fn skip_block_scalar_indent(&mut self, indent: usize, breaks: &mut String) {
1792        loop {
1793            // Consume all spaces. Tabs cannot be used as indentation.
1794            if indent < self.input.bufmaxlen() - 2 {
1795                self.input.lookahead(self.input.bufmaxlen());
1796                while self.mark.col < indent && self.input.peek() == ' ' {
1797                    self.skip_blank();
1798                }
1799            } else {
1800                loop {
1801                    self.input.lookahead(self.input.bufmaxlen());
1802                    while !self.input.buf_is_empty()
1803                        && self.mark.col < indent
1804                        && self.input.peek() == ' '
1805                    {
1806                        self.skip_blank();
1807                    }
1808                    // If we reached our indent, we can break. We must also break if we have
1809                    // reached content or EOF; that is, the buffer is not empty and the next
1810                    // character is not a space.
1811                    if self.mark.col == indent
1812                        || (!self.input.buf_is_empty() && self.input.peek() != ' ')
1813                    {
1814                        break;
1815                    }
1816                }
1817                self.input.lookahead(2);
1818            }
1819
1820            // If our current line is empty, skip over the break and continue looping.
1821            if self.input.next_is_break() {
1822                self.read_break(breaks);
1823            } else {
1824                // Otherwise, we have a content line. Return control.
1825                break;
1826            }
1827        }
1828    }
1829
1830    /// Determine the indentation level for a block scalar from the first line of its contents.
1831    ///
1832    /// The function skips over whitespace-only lines and sets `indent` to the the longest
1833    /// whitespace line that was encountered.
1834    fn skip_block_scalar_first_line_indent(&mut self, indent: &mut usize, breaks: &mut String) {
1835        let mut max_indent = 0;
1836        loop {
1837            // Consume all spaces. Tabs cannot be used as indentation.
1838            while self.input.look_ch() == ' ' {
1839                self.skip_blank();
1840            }
1841
1842            if self.mark.col > max_indent {
1843                max_indent = self.mark.col;
1844            }
1845
1846            if self.input.next_is_break() {
1847                // If our current line is empty, skip over the break and continue looping.
1848                self.input.lookahead(2);
1849                self.read_break(breaks);
1850            } else {
1851                // Otherwise, we have a content line. Return control.
1852                break;
1853            }
1854        }
1855
1856        // In case a yaml looks like:
1857        // ```yaml
1858        // |
1859        // foo
1860        // bar
1861        // ```
1862        // We need to set the indent to 0 and not 1. In all other cases, the indent must be at
1863        // least 1. When in the above example, `self.indent` will be set to -1.
1864        *indent = max_indent.max((self.indent + 1) as usize);
1865        if self.indent > 0 {
1866            *indent = (*indent).max(1);
1867        }
1868    }
1869
1870    fn fetch_flow_scalar(&mut self, single: bool) -> ScanResult {
1871        self.save_simple_key();
1872        self.disallow_simple_key();
1873
1874        let tok = self.scan_flow_scalar(single)?;
1875
1876        // From spec: To ensure JSON compatibility, if a key inside a flow mapping is JSON-like,
1877        // YAML allows the following value to be specified adjacent to the “:”.
1878        self.skip_to_next_token()?;
1879        self.adjacent_value_allowed_at = self.mark.index;
1880
1881        self.tokens.push_back(tok);
1882        Ok(())
1883    }
1884
1885    #[allow(clippy::too_many_lines)]
1886    fn scan_flow_scalar(&mut self, single: bool) -> Result<Token, ScanError> {
1887        let start_mark = self.mark;
1888
1889        let mut string = String::new();
1890        let mut leading_break = String::new();
1891        let mut trailing_breaks = String::new();
1892        let mut whitespaces = String::new();
1893        let mut leading_blanks;
1894
1895        /* Eat the left quote. */
1896        self.skip_non_blank();
1897
1898        loop {
1899            /* Check for a document indicator. */
1900            self.input.lookahead(4);
1901
1902            if self.mark.col == 0 && self.input.next_is_document_indicator() {
1903                return Err(ScanError::new_str(
1904                    start_mark,
1905                    "while scanning a quoted scalar, found unexpected document indicator",
1906                ));
1907            }
1908
1909            if self.input.next_is_z() {
1910                return Err(ScanError::new_str(
1911                    start_mark,
1912                    "while scanning a quoted scalar, found unexpected end of stream",
1913                ));
1914            }
1915
1916            if (self.mark.col as isize) < self.indent {
1917                return Err(ScanError::new_str(
1918                    start_mark,
1919                    "invalid indentation in quoted scalar",
1920                ));
1921            }
1922
1923            leading_blanks = false;
1924            self.consume_flow_scalar_non_whitespace_chars(
1925                single,
1926                &mut string,
1927                &mut leading_blanks,
1928                &start_mark,
1929            )?;
1930
1931            match self.input.look_ch() {
1932                '\'' if single => break,
1933                '"' if !single => break,
1934                _ => {}
1935            }
1936
1937            // Consume blank characters.
1938            while self.input.next_is_blank() || self.input.next_is_break() {
1939                if self.input.next_is_blank() {
1940                    // Consume a space or a tab character.
1941                    if leading_blanks {
1942                        if self.input.peek() == '\t' && (self.mark.col as isize) < self.indent {
1943                            return Err(ScanError::new_str(
1944                                self.mark,
1945                                "tab cannot be used as indentation",
1946                            ));
1947                        }
1948                        self.skip_blank();
1949                    } else {
1950                        whitespaces.push(self.input.peek());
1951                        self.skip_blank();
1952                    }
1953                } else {
1954                    self.input.lookahead(2);
1955                    // Check if it is a first line break.
1956                    if leading_blanks {
1957                        self.read_break(&mut trailing_breaks);
1958                    } else {
1959                        whitespaces.clear();
1960                        self.read_break(&mut leading_break);
1961                        leading_blanks = true;
1962                    }
1963                }
1964                self.input.lookahead(1);
1965            }
1966
1967            // Join the whitespaces or fold line breaks.
1968            if leading_blanks {
1969                if leading_break.is_empty() {
1970                    string.push_str(&leading_break);
1971                    string.push_str(&trailing_breaks);
1972                    trailing_breaks.clear();
1973                    leading_break.clear();
1974                } else {
1975                    if trailing_breaks.is_empty() {
1976                        string.push(' ');
1977                    } else {
1978                        string.push_str(&trailing_breaks);
1979                        trailing_breaks.clear();
1980                    }
1981                    leading_break.clear();
1982                }
1983            } else {
1984                string.push_str(&whitespaces);
1985                whitespaces.clear();
1986            }
1987        } // loop
1988
1989        // Eat the right quote.
1990        self.skip_non_blank();
1991        // Ensure there is no invalid trailing content.
1992        self.skip_ws_to_eol(SkipTabs::Yes)?;
1993        match self.input.peek() {
1994            // These can be encountered in flow sequences or mappings.
1995            ',' | '}' | ']' if self.flow_level > 0 => {}
1996            // An end-of-line / end-of-stream is fine. No trailing content.
1997            c if is_breakz(c) => {}
1998            // ':' can be encountered if our scalar is a key.
1999            // Outside of flow contexts, keys cannot span multiple lines
2000            ':' if self.flow_level == 0 && start_mark.line == self.mark.line => {}
2001            // Inside a flow context, this is allowed.
2002            ':' if self.flow_level > 0 => {}
2003            _ => {
2004                return Err(ScanError::new_str(
2005                    self.mark,
2006                    "invalid trailing content after double-quoted scalar",
2007                ));
2008            }
2009        }
2010
2011        let style = if single {
2012            TScalarStyle::SingleQuoted
2013        } else {
2014            TScalarStyle::DoubleQuoted
2015        };
2016        Ok(Token(
2017            Span::new(start_mark, self.mark),
2018            TokenType::Scalar(style, string),
2019        ))
2020    }
2021
2022    /// Consume successive non-whitespace characters from a flow scalar.
2023    ///
2024    /// This function resolves escape sequences and stops upon encountering a whitespace, the end
2025    /// of the stream or the closing character for the scalar (`'` for single quoted scalars, `"`
2026    /// for double quoted scalars).
2027    ///
2028    /// # Errors
2029    /// Return an error if an invalid escape sequence is found.
2030    fn consume_flow_scalar_non_whitespace_chars(
2031        &mut self,
2032        single: bool,
2033        string: &mut String,
2034        leading_blanks: &mut bool,
2035        start_mark: &Marker,
2036    ) -> Result<(), ScanError> {
2037        self.input.lookahead(2);
2038        while !is_blank_or_breakz(self.input.peek()) {
2039            match self.input.peek() {
2040                // Check for an escaped single quote.
2041                '\'' if self.input.peek_nth(1) == '\'' && single => {
2042                    string.push('\'');
2043                    self.skip_n_non_blank(2);
2044                }
2045                // Check for the right quote.
2046                '\'' if single => break,
2047                '"' if !single => break,
2048                // Check for an escaped line break.
2049                '\\' if !single && is_break(self.input.peek_nth(1)) => {
2050                    self.input.lookahead(3);
2051                    self.skip_non_blank();
2052                    self.skip_linebreak();
2053                    *leading_blanks = true;
2054                    break;
2055                }
2056                // Check for an escape sequence.
2057                '\\' if !single => {
2058                    string.push(self.resolve_flow_scalar_escape_sequence(start_mark)?);
2059                }
2060                c => {
2061                    string.push(c);
2062                    self.skip_non_blank();
2063                }
2064            }
2065            self.input.lookahead(2);
2066        }
2067        Ok(())
2068    }
2069
2070    /// Escape the sequence we encounter in a flow scalar.
2071    ///
2072    /// `self.input.peek()` must point to the `\` starting the escape sequence.
2073    ///
2074    /// # Errors
2075    /// Return an error if an invalid escape sequence is found.
2076    fn resolve_flow_scalar_escape_sequence(
2077        &mut self,
2078        start_mark: &Marker,
2079    ) -> Result<char, ScanError> {
2080        let mut code_length = 0usize;
2081        let mut ret = '\0';
2082
2083        match self.input.peek_nth(1) {
2084            '0' => ret = '\0',
2085            'a' => ret = '\x07',
2086            'b' => ret = '\x08',
2087            't' | '\t' => ret = '\t',
2088            'n' => ret = '\n',
2089            'v' => ret = '\x0b',
2090            'f' => ret = '\x0c',
2091            'r' => ret = '\x0d',
2092            'e' => ret = '\x1b',
2093            ' ' => ret = '\x20',
2094            '"' => ret = '"',
2095            '/' => ret = '/',
2096            '\\' => ret = '\\',
2097            // Unicode next line (#x85)
2098            'N' => ret = char::from_u32(0x85).unwrap(),
2099            // Unicode non-breaking space (#xA0)
2100            '_' => ret = char::from_u32(0xA0).unwrap(),
2101            // Unicode line separator (#x2028)
2102            'L' => ret = char::from_u32(0x2028).unwrap(),
2103            // Unicode paragraph separator (#x2029)
2104            'P' => ret = char::from_u32(0x2029).unwrap(),
2105            'x' => code_length = 2,
2106            'u' => code_length = 4,
2107            'U' => code_length = 8,
2108            _ => {
2109                return Err(ScanError::new_str(
2110                    *start_mark,
2111                    "while parsing a quoted scalar, found unknown escape character",
2112                ))
2113            }
2114        }
2115        self.skip_n_non_blank(2);
2116
2117        // Consume an arbitrary escape code.
2118        if code_length > 0 {
2119            self.input.lookahead(code_length);
2120            let mut value = 0u32;
2121            for i in 0..code_length {
2122                let c = self.input.peek_nth(i);
2123                if !is_hex(c) {
2124                    return Err(ScanError::new_str(
2125                        *start_mark,
2126                        "while parsing a quoted scalar, did not find expected hexadecimal number",
2127                    ));
2128                }
2129                value = (value << 4) + as_hex(c);
2130            }
2131
2132            let Some(ch) = char::from_u32(value) else {
2133                return Err(ScanError::new_str(
2134                    *start_mark,
2135                    "while parsing a quoted scalar, found invalid Unicode character escape code",
2136                ));
2137            };
2138            ret = ch;
2139
2140            self.skip_n_non_blank(code_length);
2141        }
2142        Ok(ret)
2143    }
2144
2145    fn fetch_plain_scalar(&mut self) -> ScanResult {
2146        self.save_simple_key();
2147        self.disallow_simple_key();
2148
2149        let tok = self.scan_plain_scalar()?;
2150
2151        self.tokens.push_back(tok);
2152        Ok(())
2153    }
2154
2155    /// Scan for a plain scalar.
2156    ///
2157    /// Plain scalars are the most readable but restricted style. They may span multiple lines in
2158    /// some contexts.
2159    #[allow(clippy::too_many_lines)]
2160    fn scan_plain_scalar(&mut self) -> Result<Token, ScanError> {
2161        self.unroll_non_block_indents();
2162        let indent = self.indent + 1;
2163        let start_mark = self.mark;
2164
2165        if self.flow_level > 0 && (start_mark.col as isize) < indent {
2166            return Err(ScanError::new_str(
2167                start_mark,
2168                "invalid indentation in flow construct",
2169            ));
2170        }
2171
2172        let mut string = String::with_capacity(32);
2173        self.buf_whitespaces.clear();
2174        self.buf_leading_break.clear();
2175        self.buf_trailing_breaks.clear();
2176        let mut end_mark = self.mark;
2177
2178        loop {
2179            self.input.lookahead(4);
2180            if self.input.next_is_document_end()
2181                || (self.leading_whitespace && self.input.next_is_document_start())
2182                || self.input.peek() == '#'
2183            {
2184                break;
2185            }
2186
2187            if self.flow_level > 0 && self.input.peek() == '-' && is_flow(self.input.peek_nth(1)) {
2188                return Err(ScanError::new_str(
2189                    self.mark,
2190                    "plain scalar cannot start with '-' followed by ,[]{}",
2191                ));
2192            }
2193
2194            if !self.input.next_is_blank_or_breakz()
2195                && self.input.next_can_be_plain_scalar(self.flow_level > 0)
2196            {
2197                if self.leading_whitespace {
2198                    if self.buf_leading_break.is_empty() {
2199                        string.push_str(&self.buf_leading_break);
2200                        string.push_str(&self.buf_trailing_breaks);
2201                        self.buf_trailing_breaks.clear();
2202                        self.buf_leading_break.clear();
2203                    } else {
2204                        if self.buf_trailing_breaks.is_empty() {
2205                            string.push(' ');
2206                        } else {
2207                            string.push_str(&self.buf_trailing_breaks);
2208                            self.buf_trailing_breaks.clear();
2209                        }
2210                        self.buf_leading_break.clear();
2211                    }
2212                    self.leading_whitespace = false;
2213                } else if !self.buf_whitespaces.is_empty() {
2214                    string.push_str(&self.buf_whitespaces);
2215                    self.buf_whitespaces.clear();
2216                }
2217
2218                // We can unroll the first iteration of the loop.
2219                string.push(self.input.peek());
2220                self.skip_non_blank();
2221                string.reserve(self.input.bufmaxlen());
2222
2223                // Add content non-blank characters to the scalar.
2224                let mut end = false;
2225                while !end {
2226                    // Fill the buffer once and process all characters in the buffer until the next
2227                    // fetch. Note that `next_can_be_plain_scalar` needs 2 lookahead characters,
2228                    // hence the `for` loop looping `self.input.bufmaxlen() - 1` times.
2229                    self.input.lookahead(self.input.bufmaxlen());
2230                    for _ in 0..self.input.bufmaxlen() - 1 {
2231                        if self.input.next_is_blank_or_breakz()
2232                            || !self.input.next_can_be_plain_scalar(self.flow_level > 0)
2233                        {
2234                            end = true;
2235                            break;
2236                        }
2237                        string.push(self.input.peek());
2238                        self.skip_non_blank();
2239                    }
2240                }
2241                end_mark = self.mark;
2242            }
2243
2244            // We may reach the end of a plain scalar if:
2245            //  - We reach eof
2246            //  - We reach ": "
2247            //  - We find a flow character in a flow context
2248            if !(self.input.next_is_blank() || self.input.next_is_break()) {
2249                break;
2250            }
2251
2252            // Process blank characters.
2253            self.input.lookahead(2);
2254            while self.input.next_is_blank_or_break() {
2255                if self.input.next_is_blank() {
2256                    if !self.leading_whitespace {
2257                        self.buf_whitespaces.push(self.input.peek());
2258                        self.skip_blank();
2259                    } else if (self.mark.col as isize) < indent && self.input.peek() == '\t' {
2260                        // Tabs in an indentation columns are allowed if and only if the line is
2261                        // empty. Skip to the end of the line.
2262                        self.skip_ws_to_eol(SkipTabs::Yes)?;
2263                        if !self.input.next_is_breakz() {
2264                            return Err(ScanError::new_str(
2265                                start_mark,
2266                                "while scanning a plain scalar, found a tab",
2267                            ));
2268                        }
2269                    } else {
2270                        self.skip_blank();
2271                    }
2272                } else {
2273                    // Check if it is a first line break
2274                    if self.leading_whitespace {
2275                        self.skip_break();
2276                        self.buf_trailing_breaks.push('\n');
2277                    } else {
2278                        self.buf_whitespaces.clear();
2279                        self.skip_break();
2280                        self.buf_leading_break.push('\n');
2281                        self.leading_whitespace = true;
2282                    }
2283                }
2284                self.input.lookahead(2);
2285            }
2286
2287            // check indentation level
2288            if self.flow_level == 0 && (self.mark.col as isize) < indent {
2289                break;
2290            }
2291        }
2292
2293        if self.leading_whitespace {
2294            self.allow_simple_key();
2295        }
2296
2297        if string.is_empty() {
2298            // `fetch_plain_scalar` must absolutely consume at least one byte. Otherwise,
2299            // `fetch_next_token` will never stop calling it. An empty plain scalar may happen with
2300            // erroneous inputs such as "{...".
2301            Err(ScanError::new_str(
2302                start_mark,
2303                "unexpected end of plain scalar",
2304            ))
2305        } else {
2306            Ok(Token(
2307                Span::new(start_mark, end_mark),
2308                TokenType::Scalar(TScalarStyle::Plain, string),
2309            ))
2310        }
2311    }
2312
2313    fn fetch_key(&mut self) -> ScanResult {
2314        let start_mark = self.mark;
2315        if self.flow_level == 0 {
2316            // Check if we are allowed to start a new key (not necessarily simple).
2317            if !self.simple_key_allowed {
2318                return Err(ScanError::new_str(
2319                    self.mark,
2320                    "mapping keys are not allowed in this context",
2321                ));
2322            }
2323            self.roll_indent(
2324                start_mark.col,
2325                None,
2326                TokenType::BlockMappingStart,
2327                start_mark,
2328            );
2329        } else {
2330            // The scanner, upon emitting a `Key`, will prepend a `MappingStart` event.
2331            self.flow_mapping_started = true;
2332        }
2333
2334        self.remove_simple_key()?;
2335
2336        if self.flow_level == 0 {
2337            self.allow_simple_key();
2338        } else {
2339            self.disallow_simple_key();
2340        }
2341
2342        self.skip_non_blank();
2343        self.skip_yaml_whitespace()?;
2344        if self.input.peek() == '\t' {
2345            return Err(ScanError::new_str(
2346                self.mark(),
2347                "tabs disallowed in this context",
2348            ));
2349        }
2350        self.tokens
2351            .push_back(Token(Span::new(start_mark, self.mark), TokenType::Key));
2352        Ok(())
2353    }
2354
2355    /// Fetch a value in a mapping inside of a flow collection.
2356    ///
2357    /// This must not be called if [`self.flow_level`] is 0. This ensures the rules surrounding
2358    /// values in flow collections are respected prior to calling [`fetch_value`].
2359    ///
2360    /// [`self.flow_level`]: Self::flow_level
2361    /// [`fetch_value`]: Self::fetch_value
2362    fn fetch_flow_value(&mut self) -> ScanResult {
2363        let nc = self.input.peek_nth(1);
2364
2365        // If we encounter a ':' inside a flow collection and it is not immediately
2366        // followed by a blank or breakz:
2367        //   - We must check whether an adjacent value is allowed
2368        //     `["a":[]]` is valid. If the key is double-quoted, no need for a space. This
2369        //     is needed for JSON compatibility.
2370        //   - If not, we must ensure there is a space after the ':' and before its value.
2371        //     `[a: []]` is valid while `[a:[]]` isn't. `[a:b]` is treated as `["a:b"]`.
2372        //   - But if the value is empty (null), then it's okay.
2373        // The last line is for YAMLs like `[a:]`. The ':' is followed by a ']' (which is a
2374        // flow character), but the ']' is not the value. The value is an invisible empty
2375        // space which is represented as null ('~').
2376        if self.mark.index != self.adjacent_value_allowed_at && (nc == '[' || nc == '{') {
2377            return Err(ScanError::new_str(
2378                self.mark,
2379                "':' may not precede any of `[{` in flow mapping",
2380            ));
2381        }
2382
2383        self.fetch_value()
2384    }
2385
2386    /// Fetch a value from a mapping (after a `:`).
2387    fn fetch_value(&mut self) -> ScanResult {
2388        let sk = self.simple_keys.last().unwrap().clone();
2389        let start_mark = self.mark;
2390        let is_implicit_flow_mapping =
2391            !self.implicit_flow_mapping_states.is_empty() && !self.flow_mapping_started;
2392        if is_implicit_flow_mapping {
2393            *self.implicit_flow_mapping_states.last_mut().unwrap() = ImplicitMappingState::Inside;
2394        }
2395
2396        // Skip over ':'.
2397        self.skip_non_blank();
2398        if self.input.look_ch() == '\t'
2399            && !self.skip_ws_to_eol(SkipTabs::Yes)?.has_valid_yaml_ws()
2400            && (self.input.peek() == '-' || self.input.next_is_alpha())
2401        {
2402            return Err(ScanError::new_str(
2403                self.mark,
2404                "':' must be followed by a valid YAML whitespace",
2405            ));
2406        }
2407
2408        if sk.possible {
2409            // insert simple key
2410            let tok = Token(Span::empty(sk.mark), TokenType::Key);
2411            self.insert_token(sk.token_number - self.tokens_parsed, tok);
2412            if is_implicit_flow_mapping {
2413                if sk.mark.line < start_mark.line {
2414                    return Err(ScanError::new_str(
2415                        start_mark,
2416                        "illegal placement of ':' indicator",
2417                    ));
2418                }
2419                self.insert_token(
2420                    sk.token_number - self.tokens_parsed,
2421                    Token(Span::empty(sk.mark), TokenType::FlowMappingStart),
2422                );
2423            }
2424
2425            // Add the BLOCK-MAPPING-START token if needed.
2426            self.roll_indent(
2427                sk.mark.col,
2428                Some(sk.token_number),
2429                TokenType::BlockMappingStart,
2430                sk.mark,
2431            );
2432            self.roll_one_col_indent();
2433
2434            self.simple_keys.last_mut().unwrap().possible = false;
2435            self.disallow_simple_key();
2436        } else {
2437            if is_implicit_flow_mapping {
2438                self.tokens
2439                    .push_back(Token(Span::empty(start_mark), TokenType::FlowMappingStart));
2440            }
2441            // The ':' indicator follows a complex key.
2442            if self.flow_level == 0 {
2443                if !self.simple_key_allowed {
2444                    return Err(ScanError::new_str(
2445                        start_mark,
2446                        "mapping values are not allowed in this context",
2447                    ));
2448                }
2449
2450                self.roll_indent(
2451                    start_mark.col,
2452                    None,
2453                    TokenType::BlockMappingStart,
2454                    start_mark,
2455                );
2456            }
2457            self.roll_one_col_indent();
2458
2459            if self.flow_level == 0 {
2460                self.allow_simple_key();
2461            } else {
2462                self.disallow_simple_key();
2463            }
2464        }
2465        self.tokens
2466            .push_back(Token(Span::empty(start_mark), TokenType::Value));
2467
2468        Ok(())
2469    }
2470
2471    /// Add an indentation level to the stack with the given block token, if needed.
2472    ///
2473    /// An indentation level is added only if:
2474    ///   - We are not in a flow-style construct (which don't have indentation per-se).
2475    ///   - The current column is further indented than the last indent we have registered.
2476    fn roll_indent(&mut self, col: usize, number: Option<usize>, tok: TokenType, mark: Marker) {
2477        if self.flow_level > 0 {
2478            return;
2479        }
2480
2481        // If the last indent was a non-block indent, remove it.
2482        // This means that we prepared an indent that we thought we wouldn't use, but realized just
2483        // now that it is a block indent.
2484        if self.indent <= col as isize {
2485            if let Some(indent) = self.indents.last() {
2486                if !indent.needs_block_end {
2487                    self.indent = indent.indent;
2488                    self.indents.pop();
2489                }
2490            }
2491        }
2492
2493        if self.indent < col as isize {
2494            self.indents.push(Indent {
2495                indent: self.indent,
2496                needs_block_end: true,
2497            });
2498            self.indent = col as isize;
2499            let tokens_parsed = self.tokens_parsed;
2500            match number {
2501                Some(n) => self.insert_token(n - tokens_parsed, Token(Span::empty(mark), tok)),
2502                None => self.tokens.push_back(Token(Span::empty(mark), tok)),
2503            }
2504        }
2505    }
2506
2507    /// Pop indentation levels from the stack as much as needed.
2508    ///
2509    /// Indentation levels are popped from the stack while they are further indented than `col`.
2510    /// If we are in a flow-style construct (which don't have indentation per-se), this function
2511    /// does nothing.
2512    fn unroll_indent(&mut self, col: isize) {
2513        if self.flow_level > 0 {
2514            return;
2515        }
2516        while self.indent > col {
2517            let indent = self.indents.pop().unwrap();
2518            self.indent = indent.indent;
2519            if indent.needs_block_end {
2520                self.tokens
2521                    .push_back(Token(Span::empty(self.mark), TokenType::BlockEnd));
2522            }
2523        }
2524    }
2525
2526    /// Add an indentation level of 1 column that does not start a block.
2527    ///
2528    /// See the documentation of [`Indent::needs_block_end`] for more details.
2529    /// An indentation is not added if we are inside a flow level or if the last indent is already
2530    /// a non-block indent.
2531    fn roll_one_col_indent(&mut self) {
2532        if self.flow_level == 0 && self.indents.last().map_or(false, |x| x.needs_block_end) {
2533            self.indents.push(Indent {
2534                indent: self.indent,
2535                needs_block_end: false,
2536            });
2537            self.indent += 1;
2538        }
2539    }
2540
2541    /// Unroll all last indents created with [`Self::roll_one_col_indent`].
2542    fn unroll_non_block_indents(&mut self) {
2543        while let Some(indent) = self.indents.last() {
2544            if indent.needs_block_end {
2545                break;
2546            }
2547            self.indent = indent.indent;
2548            self.indents.pop();
2549        }
2550    }
2551
2552    /// Mark the next token to be inserted as a potential simple key.
2553    fn save_simple_key(&mut self) {
2554        if self.simple_key_allowed {
2555            let required = self.flow_level == 0
2556                && self.indent == (self.mark.col as isize)
2557                && self.indents.last().unwrap().needs_block_end;
2558            let mut sk = SimpleKey::new(self.mark);
2559            sk.possible = true;
2560            sk.required = required;
2561            sk.token_number = self.tokens_parsed + self.tokens.len();
2562
2563            self.simple_keys.pop();
2564            self.simple_keys.push(sk);
2565        }
2566    }
2567
2568    fn remove_simple_key(&mut self) -> ScanResult {
2569        let last = self.simple_keys.last_mut().unwrap();
2570        if last.possible && last.required {
2571            return Err(ScanError::new_str(self.mark, "simple key expected"));
2572        }
2573
2574        last.possible = false;
2575        Ok(())
2576    }
2577
2578    /// Return whether the scanner is inside a block but outside of a flow sequence.
2579    fn is_within_block(&self) -> bool {
2580        !self.indents.is_empty()
2581    }
2582
2583    /// If an implicit mapping had started, end it.
2584    ///
2585    /// This function does not pop the state in [`implicit_flow_mapping_states`].
2586    ///
2587    /// [`implicit_flow_mapping_states`]: Self::implicit_flow_mapping_states
2588    fn end_implicit_mapping(&mut self, mark: Marker) {
2589        if let Some(implicit_mapping) = self.implicit_flow_mapping_states.last_mut() {
2590            if *implicit_mapping == ImplicitMappingState::Inside {
2591                self.flow_mapping_started = false;
2592                *implicit_mapping = ImplicitMappingState::Possible;
2593                self.tokens
2594                    .push_back(Token(Span::empty(mark), TokenType::FlowMappingEnd));
2595            }
2596        }
2597    }
2598}
2599
/// Chomping, how final line breaks and trailing empty lines are interpreted.
///
/// See YAML spec 8.1.1.2.
///
/// This is a plain field-less enum: it is cheap to copy and can be printed for diagnostics.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Chomping {
    /// The final line break and any trailing empty lines are excluded.
    Strip,
    /// The final line break is preserved, but trailing empty lines are excluded.
    Clip,
    /// The final line break and trailing empty lines are included.
    Keep,
}
2612
#[cfg(test)]
mod test {
    use super::is_anchor_char;

    /// Smoke test: the ASCII letter `x` is accepted by `is_anchor_char`.
    #[test]
    fn test_is_anchor_char() {
        assert!(is_anchor_char('x'));
    }
}
saphyr_parser/scanner.rs

saphyr_parser/
scanner.rs