saphyr_parser/
scanner.rs

1//! Home to the YAML Scanner.
2//!
3//! The scanner is the lowest-level parsing utility. It is the lexer / tokenizer, reading input a
4//! character at a time and emitting tokens that can later be interpreted by the [`crate::parser`]
5//! to check for more context and validity.
6//!
7//! Due to the grammar of YAML, the scanner has to have some context and is not error-free.
8
9#![allow(clippy::cast_possible_wrap)]
10#![allow(clippy::cast_sign_loss)]
11
12use std::{borrow::Cow, char, collections::VecDeque, error::Error, fmt};
13
14use crate::{
15    char_traits::{
16        as_hex, is_anchor_char, is_blank_or_breakz, is_break, is_breakz, is_flow, is_hex,
17        is_tag_char, is_uri_char,
18    },
19    input::{Input, SkipTabs},
20};
21
/// The encoding of the input. Currently, only UTF-8 is supported.
///
/// The value is carried by [`TokenType::StreamStart`].
#[derive(Clone, Copy, PartialEq, Debug, Eq)]
pub enum TEncoding {
    /// UTF-8 encoding.
    Utf8,
}
28
/// The style in which a scalar was written in the YAML document.
#[derive(Clone, Copy, PartialEq, Debug, Eq, Hash, PartialOrd, Ord)]
pub enum ScalarStyle {
    /// A YAML plain scalar.
    Plain,
    /// A YAML single quoted scalar.
    SingleQuoted,
    /// A YAML double quoted scalar.
    DoubleQuoted,

    /// A YAML literal block (`|` block).
    ///
    /// See [8.1.2](https://yaml.org/spec/1.2.2/#812-literal-style).
    /// In literal blocks, any indented character is content, including white space characters.
    /// There is no way to escape characters, nor to break a long line.
    Literal,
    /// A YAML folded block (`>` block).
    ///
    /// See [8.1.3](https://yaml.org/spec/1.2.2/#813-folded-style).
    /// In folded blocks, any indented character is content, including white space characters.
    /// There is no way to escape characters. Content is subject to line folding, allowing breaking
    /// long lines.
    Folded,
}
53
/// A location in a YAML document.
///
/// A marker is a plain triple of counters. The scanner starts at `Marker::new(0, 1, 0)`, i.e.
/// index 0, line 1, column 0. Note that [`Marker::default`] yields all zeroes, which does not
/// correspond to a position the scanner itself would produce (lines start at 1).
#[derive(Clone, Copy, PartialEq, Debug, Eq, Default)]
pub struct Marker {
    /// The index (in characters, 0-indexed) in the input.
    ///
    /// The scanner advances this by one for every character it consumes.
    index: usize,
    /// The line (1-indexed).
    line: usize,
    /// The column (0-indexed).
    ///
    /// The scanner resets this to 0 after each line break; user-facing messages add 1 to it.
    col: usize,
}

impl Marker {
    /// Create a new [`Marker`] at the given position.
    ///
    /// `index` is the 0-based character offset, `line` the 1-based line number and `col` the
    /// 0-based column. The values are stored as given, without validation.
    #[must_use]
    pub fn new(index: usize, line: usize, col: usize) -> Self {
        Self { index, line, col }
    }

    /// Return the index of the marker in the source.
    ///
    /// This is the 0-based offset that the scanner advances once per consumed character.
    #[must_use]
    pub fn index(&self) -> usize {
        self.index
    }

    /// Return the (1-indexed) line of the marker in the source.
    #[must_use]
    pub fn line(&self) -> usize {
        self.line
    }

    /// Return the (0-indexed) column of the marker in the source.
    #[must_use]
    pub fn col(&self) -> usize {
        self.col
    }
}
90
91/// A range of locations in a Yaml document.
92#[derive(Clone, Copy, PartialEq, Debug, Eq, Default)]
93pub struct Span {
94    /// The start (inclusive) of the range.
95    pub start: Marker,
96    /// The end (exclusive) of the range.
97    pub end: Marker,
98}
99
100impl Span {
101    /// Create a new [`Span`] for the given range.
102    #[must_use]
103    pub fn new(start: Marker, end: Marker) -> Span {
104        Span { start, end }
105    }
106
107    /// Create a empty [`Span`] at a given location.
108    ///
109    /// An empty span doesn't contain any characters, but its position may still be meaningful.
110    /// For example, for an indented sequence [`SequenceEnd`] has a location but an empty span.
111    ///
112    /// [`SequenceEnd`]: crate::Event::SequenceEnd
113    #[must_use]
114    pub fn empty(mark: Marker) -> Span {
115        Span {
116            start: mark,
117            end: mark,
118        }
119    }
120
121    /// Return the length of the span (in characters).
122    #[must_use]
123    pub fn len(&self) -> usize {
124        self.end.index - self.start.index
125    }
126
127    /// Return whether the [`Span`] has a length of zero.
128    #[must_use]
129    pub fn is_empty(&self) -> bool {
130        self.len() == 0
131    }
132}
133
/// An error that occurred while scanning.
///
/// It pairs a human-readable message with the [`Marker`] at which the problem was detected.
#[derive(Clone, PartialEq, Debug, Eq)]
pub struct ScanError {
    /// The position at which the error happened in the source.
    mark: Marker,
    /// Human-readable details about the error.
    info: String,
}
142
143impl ScanError {
144    /// Create a new error from a location and an error string.
145    #[must_use]
146    pub fn new(loc: Marker, info: String) -> ScanError {
147        ScanError { mark: loc, info }
148    }
149
150    /// Convenience alias for string slices.
151    #[must_use]
152    pub fn new_str(loc: Marker, info: &str) -> ScanError {
153        ScanError {
154            mark: loc,
155            info: info.to_owned(),
156        }
157    }
158
159    /// Return the marker pointing to the error in the source.
160    #[must_use]
161    pub fn marker(&self) -> &Marker {
162        &self.mark
163    }
164
165    /// Return the information string describing the error that happened.
166    #[must_use]
167    pub fn info(&self) -> &str {
168        self.info.as_ref()
169    }
170}
171
172impl Error for ScanError {
173    fn source(&self) -> Option<&(dyn Error + 'static)> {
174        None
175    }
176}
177
178impl fmt::Display for ScanError {
179    fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
180        write!(
181            formatter,
182            "{} at byte {} line {} column {}",
183            self.info,
184            self.mark.index,
185            self.mark.line,
186            self.mark.col + 1,
187        )
188    }
189}
190
/// The contents of a scanner token.
#[derive(Clone, PartialEq, Debug, Eq)]
pub enum TokenType<'input> {
    /// The start of the stream. Sent first, before even [`TokenType::DocumentStart`].
    StreamStart(TEncoding),
    /// The end of the stream, EOF.
    StreamEnd,
    /// A YAML version directive.
    VersionDirective(
        /// Major
        u32,
        /// Minor
        u32,
    ),
    /// A YAML tag directive (`%TAG <handle> <prefix>`).
    TagDirective(
        /// Handle
        Cow<'input, str>,
        /// Prefix
        Cow<'input, str>,
    ),
    /// The start of a YAML document (`---`).
    DocumentStart,
    /// The end of a YAML document (`...`).
    DocumentEnd,
    /// The start of a sequence block.
    ///
    /// Sequence blocks are arrays starting with a `-`.
    BlockSequenceStart,
    /// The start of a block mapping.
    ///
    /// Block mappings are "dictionaries" with "key: value" entries.
    BlockMappingStart,
    /// End of the corresponding `BlockSequenceStart` or `BlockMappingStart`.
    BlockEnd,
    /// Start of an inline sequence (`[ a, b ]`).
    FlowSequenceStart,
    /// End of an inline sequence.
    FlowSequenceEnd,
    /// Start of an inline mapping (`{ a: b, c: d }`).
    FlowMappingStart,
    /// End of an inline mapping.
    FlowMappingEnd,
    /// An entry in a block sequence (c.f.: [`TokenType::BlockSequenceStart`]).
    BlockEntry,
    /// An entry in a flow sequence (c.f.: [`TokenType::FlowSequenceStart`]).
    FlowEntry,
    /// A key in a mapping.
    Key,
    /// A value in a mapping.
    Value,
    /// A reference to an anchor (introduced by `*`).
    Alias(Cow<'input, str>),
    /// A YAML anchor (introduced by `&`).
    Anchor(Cow<'input, str>),
    /// A YAML tag (starting with bangs `!`).
    Tag(
        /// The handle of the tag.
        String,
        /// The suffix of the tag.
        String,
    ),
    /// A regular YAML scalar.
    Scalar(ScalarStyle, Cow<'input, str>),
}
256
/// A scanner token.
///
/// The first field is the [`Span`] attached to the token, the second is its [`TokenType`].
#[derive(Clone, PartialEq, Debug, Eq)]
pub struct Token<'input>(pub Span, pub TokenType<'input>);
260
/// A scalar that was parsed and may correspond to a simple key.
///
/// Upon scanning the following yaml:
/// ```yaml
/// a: b
/// ```
/// We do not know that `a` is a key for a map until we have reached the following `:`. For this
/// YAML, we would store `a` as a scalar token in the [`Scanner`], but not emit it yet. It would be
/// kept inside the scanner until more context is fetched and we are able to know whether it is a
/// plain scalar or a key.
///
/// For example, see the following 2 yaml documents:
/// ```yaml
/// ---
/// a: b # Here, `a` is a key.
/// ...
/// ---
/// a # Here, `a` is a plain scalar.
/// ...
/// ```
/// An instance of [`SimpleKey`] is created in the [`Scanner`] when such ambiguity occurs.
///
/// In both documents, scanning `a` would lead to the creation of a [`SimpleKey`] with
/// [`Self::possible`] set to `true`. The token for `a` would be pushed in the [`Scanner`] but not
/// yet emitted. Instead, more context would be fetched (through [`Scanner::fetch_more_tokens`]).
///
/// In the first document, upon reaching the `:`, the [`SimpleKey`] would be inspected and our
/// scalar `a`, since it is a possible key, would be "turned" into a key. This is done by
/// prepending a [`TokenType::Key`] to our scalar token in the [`Scanner`]. This way, the
/// [`crate::parser::Parser`] would read the [`TokenType::Key`] token before the
/// [`TokenType::Scalar`] token.
///
/// In the second document however, reaching the EOF would stale the [`SimpleKey`] and no
/// [`TokenType::Key`] would be emitted by the scanner.
#[derive(Clone, PartialEq, Debug, Eq)]
struct SimpleKey {
    /// Whether the token this [`SimpleKey`] refers to may still be a key.
    ///
    /// Sometimes, when we have more context, we notice that what we thought could be a key no
    /// longer can be. In that case, [`Self::possible`] is set to `false`.
    ///
    /// For instance, let us consider the following invalid YAML:
    /// ```yaml
    /// key
    ///   : value
    /// ```
    /// Upon reading the `\n` after `key`, the [`SimpleKey`] that was created for `key` is staled
    /// and [`Self::possible`] set to `false`.
    possible: bool,
    /// Whether the token this [`SimpleKey`] refers to is required to be a key.
    ///
    /// With more context, we may know for sure that the token must be a key. If the YAML is
    /// invalid, it may happen that the token be deemed not a key. In such event, an error has to
    /// be raised. This boolean helps us know when to raise such error.
    ///
    /// TODO(ethiraric, 30/12/2023): Example of when this happens.
    required: bool,
    /// The index of the token referred to by the [`SimpleKey`].
    ///
    /// This is the index in the scanner, which takes into account both the tokens that have been
    /// emitted and those about to be emitted. See [`Scanner::tokens_parsed`] and
    /// [`Scanner::tokens`] for more details.
    token_number: usize,
    /// The position at which the token the [`SimpleKey`] refers to is.
    mark: Marker,
}
327
328impl SimpleKey {
329    /// Create a new [`SimpleKey`] at the given `Marker` and with the given flow level.
330    fn new(mark: Marker) -> SimpleKey {
331        SimpleKey {
332            possible: false,
333            required: false,
334            token_number: 0,
335            mark,
336        }
337    }
338}
339
/// An indentation level on the stack of indentations.
#[derive(Clone, Debug, Default)]
struct Indent {
    /// The former indentation level.
    indent: isize,
    /// Whether, upon closing, this indent generates a `BlockEnd` token.
    ///
    /// There are levels of indentation which do not start a block. Examples of this would be:
    /// ```yaml
    /// -
    ///   foo # ok
    /// -
    /// bar # ko, bar needs to be indented further than the `-`.
    /// - [
    ///  baz, # ok
    /// quux # ko, quux needs to be indented further than the '-'.
    /// ] # ko, the closing bracket needs to be indented further than the `-`.
    /// ```
    ///
    /// The indentation level created by the `-` is for a single entry in the sequence. Emitting a
    /// `BlockEnd` when this indentation block ends would generate one `BlockEnd` per entry in the
    /// sequence, although we must have exactly one to end the sequence.
    needs_block_end: bool,
}
364
/// The knowledge we have about an implicit mapping.
///
/// Implicit mappings occur in flow sequences where the opening `{` for a mapping in a flow
/// sequence is omitted:
/// ```yaml
/// [ a: b, c: d ]
/// # Equivalent to
/// [ { a: b }, { c: d } ]
/// # Equivalent to
/// - a: b
/// - c: d
/// ```
///
/// The state must be carefully tracked for each nested flow sequence since we must emit a
/// [`FlowMappingStart`] event when encountering `a` and `c` in our previous example without a
/// character hinting us. Similarly, we must emit a [`FlowMappingEnd`] event when we reach the `,`
/// or the `]`. If the state is not properly tracked, we may omit to emit these events or emit them
/// out-of-order.
///
/// [`FlowMappingStart`]: TokenType::FlowMappingStart
/// [`FlowMappingEnd`]: TokenType::FlowMappingEnd
#[derive(Debug, PartialEq)]
enum ImplicitMappingState {
    /// It is possible there is an implicit mapping.
    ///
    /// This state is the one when we have just encountered the opening `[`. We need more context
    /// to know whether an implicit mapping follows.
    Possible,
    /// We are inside the implicit mapping.
    ///
    /// Note that this state is not set immediately (we need to have encountered the `:` to know).
    Inside,
}
398
/// The YAML scanner.
///
/// This corresponds to the low-level interface when reading YAML. The scanner emits tokens as
/// they are read (akin to a lexer), but it also holds sufficient context to be able to
/// disambiguate some of the constructs. It has understanding of indentation and whitespace and is
/// able to generate error messages for some invalid YAML constructs.
///
/// It is however not a full parser and needs [`crate::parser::Parser`] to fully detect invalid
/// YAML documents.
#[derive(Debug)]
#[allow(clippy::struct_excessive_bools)]
pub struct Scanner<'input, T> {
    /// The input source.
    ///
    /// This must implement [`Input`].
    input: T,
    /// The position of the cursor within the reader.
    mark: Marker,
    /// Buffer for tokens to be returned.
    ///
    /// This buffer can hold some temporary tokens that are not yet ready to be returned. For
    /// instance, if we just read a scalar, it can be a value or a key if an implicit mapping
    /// follows. In this case, the token stays in the `VecDeque` but cannot be returned from
    /// [`Self::next`] until we have more context.
    tokens: VecDeque<Token<'input>>,
    /// The last error that happened.
    ///
    /// Once set, the [`Iterator`] implementation stops yielding tokens.
    error: Option<ScanError>,

    /// Whether we have already emitted the `StreamStart` token.
    stream_start_produced: bool,
    /// Whether we have already emitted the `StreamEnd` token.
    stream_end_produced: bool,
    /// In some flow contexts, the value of a mapping is allowed to be adjacent to the `:`. When it
    /// is, the index at which the `:` may be must be stored in `adjacent_value_allowed_at`.
    adjacent_value_allowed_at: usize,
    /// Whether a simple key could potentially start at the current position.
    ///
    /// Simple keys are the opposite of complex keys which are keys starting with `?`.
    simple_key_allowed: bool,
    /// A stack of potential simple keys.
    ///
    /// Refer to the documentation of [`SimpleKey`] for a more in-depth explanation of what they
    /// are.
    simple_keys: Vec<SimpleKey>,
    /// The current indentation level.
    ///
    /// Starts at `-1`, before any block has been opened.
    indent: isize,
    /// List of all block indentation levels we are in (except the current one).
    indents: Vec<Indent>,
    /// Level of nesting of flow sequences.
    flow_level: u8,
    /// The number of tokens that have been returned from the scanner.
    ///
    /// This excludes the tokens from [`Self::tokens`].
    tokens_parsed: usize,
    /// Whether a token is ready to be taken from [`Self::tokens`].
    token_available: bool,
    /// Whether all characters encountered since the last newline were whitespace.
    leading_whitespace: bool,
    /// Whether we started a flow mapping.
    ///
    /// This is used to detect implicit flow mapping starts such as:
    /// ```yaml
    /// [ : foo ] # { null: "foo" }
    /// ```
    flow_mapping_started: bool,
    /// An array of states, representing whether flow sequences have implicit mappings.
    ///
    /// When a flow mapping is possible (when encountering the first `[` or a `,` in a sequence),
    /// the state is set to [`Possible`].
    /// When we encounter the `:`, we know we are in an implicit mapping and can set the state to
    /// [`Inside`].
    ///
    /// There is one entry in this [`Vec`] for each nested flow sequence that we are in.
    /// The entries are created with the opening `[` and popped with the closing `]`.
    ///
    /// [`Possible`]: ImplicitMappingState::Possible
    /// [`Inside`]: ImplicitMappingState::Inside
    implicit_flow_mapping_states: Vec<ImplicitMappingState>,
    /// Scratch string, created empty.
    ///
    /// NOTE(review): not used in this part of the file; judging by the name it presumably holds
    /// the first line break seen while scanning a scalar -- confirm against the scalar code.
    buf_leading_break: String,
    /// Scratch string, created empty.
    ///
    /// NOTE(review): presumably holds the line breaks following the leading one while scanning
    /// a scalar -- confirm against the scalar code.
    buf_trailing_breaks: String,
    /// Scratch string, created empty.
    ///
    /// NOTE(review): presumably holds pending blanks while scanning a scalar -- confirm against
    /// the scalar code.
    buf_whitespaces: String,
}
481
482impl<'input, T: Input> Iterator for Scanner<'input, T> {
483    type Item = Token<'input>;
484
485    fn next(&mut self) -> Option<Self::Item> {
486        if self.error.is_some() {
487            return None;
488        }
489        match self.next_token() {
490            Ok(Some(tok)) => {
491                debug_print!(
492                    "    \x1B[;32m\u{21B3} {:?} \x1B[;36m{:?}\x1B[;m",
493                    tok.1,
494                    tok.0
495                );
496                Some(tok)
497            }
498            Ok(tok) => tok,
499            Err(e) => {
500                self.error = Some(e);
501                None
502            }
503        }
504    }
505}
506
/// A convenience alias for scanner functions that may fail without returning a value.
///
/// On failure, the [`ScanError`] carries the position and a description of the problem.
pub type ScanResult = Result<(), ScanError>;
509
510impl<'input, T: Input> Scanner<'input, T> {
511    /// Creates the YAML tokenizer.
512    pub fn new(input: T) -> Self {
513        Scanner {
514            input,
515            mark: Marker::new(0, 1, 0),
516            tokens: VecDeque::new(),
517            error: None,
518
519            stream_start_produced: false,
520            stream_end_produced: false,
521            adjacent_value_allowed_at: 0,
522            simple_key_allowed: true,
523            simple_keys: Vec::new(),
524            indent: -1,
525            indents: Vec::new(),
526            flow_level: 0,
527            tokens_parsed: 0,
528            token_available: false,
529            leading_whitespace: true,
530            flow_mapping_started: false,
531            implicit_flow_mapping_states: vec![],
532
533            buf_leading_break: String::new(),
534            buf_trailing_breaks: String::new(),
535            buf_whitespaces: String::new(),
536        }
537    }
538
539    /// Get a copy of the last error that was encountered, if any.
540    ///
541    /// This does not clear the error state and further calls to [`Self::get_error`] will return (a
542    /// clone of) the same error.
543    #[inline]
544    pub fn get_error(&self) -> Option<ScanError> {
545        self.error.clone()
546    }
547
548    /// Consume the next character. It is assumed the next character is a blank.
549    #[inline]
550    fn skip_blank(&mut self) {
551        self.input.skip();
552
553        self.mark.index += 1;
554        self.mark.col += 1;
555    }
556
557    /// Consume the next character. It is assumed the next character is not a blank.
558    #[inline]
559    fn skip_non_blank(&mut self) {
560        self.input.skip();
561
562        self.mark.index += 1;
563        self.mark.col += 1;
564        self.leading_whitespace = false;
565    }
566
567    /// Consume the next characters. It is assumed none of the next characters are blanks.
568    #[inline]
569    fn skip_n_non_blank(&mut self, count: usize) {
570        self.input.skip_n(count);
571
572        self.mark.index += count;
573        self.mark.col += count;
574        self.leading_whitespace = false;
575    }
576
577    /// Consume the next character. It is assumed the next character is a newline.
578    #[inline]
579    fn skip_nl(&mut self) {
580        self.input.skip();
581
582        self.mark.index += 1;
583        self.mark.col = 0;
584        self.mark.line += 1;
585        self.leading_whitespace = true;
586    }
587
588    /// Consume a linebreak (either CR, LF or CRLF), if any. Do nothing if there's none.
589    #[inline]
590    fn skip_linebreak(&mut self) {
591        if self.input.next_2_are('\r', '\n') {
592            // While technically not a blank, this does not matter as `self.leading_whitespace`
593            // will be reset by `skip_nl`.
594            self.skip_blank();
595            self.skip_nl();
596        } else if self.input.next_is_break() {
597            self.skip_nl();
598        }
599    }
600
    /// Return whether the [`TokenType::StreamStart`] event has been emitted.
    ///
    /// This is `false` on a freshly created scanner.
    #[inline]
    pub fn stream_started(&self) -> bool {
        self.stream_start_produced
    }
606
    /// Return whether the [`TokenType::StreamEnd`] event has been emitted.
    ///
    /// Once this is `true`, [`Self::next_token`] only returns `Ok(None)`.
    #[inline]
    pub fn stream_ended(&self) -> bool {
        self.stream_end_produced
    }
612
    /// Get the current position in the input stream.
    ///
    /// The returned [`Marker`] is a copy; it does not follow the scanner as it advances.
    #[inline]
    pub fn mark(&self) -> Marker {
        self.mark
    }
618
    /// Read and consume a line break (either `\r`, `\n` or `\r\n`).
    ///
    /// Whatever form the break takes in the input, a single `\n` is pushed into `s`.
    ///
    /// # Panics (in debug)
    /// If the next characters do not correspond to a line break.
    #[inline]
    fn read_break(&mut self, s: &mut String) {
        self.skip_break();
        s.push('\n');
    }
630
631    // Read and consume a line break (either `\r`, `\n` or `\r\n`).
632    //
633    // # Panics (in debug)
634    // If the next characters do not correspond to a line break.
635    #[inline]
636    fn skip_break(&mut self) {
637        let c = self.input.peek();
638        let nc = self.input.peek_nth(1);
639        debug_assert!(is_break(c));
640        if c == '\r' && nc == '\n' {
641            self.skip_blank();
642        }
643        self.skip_nl();
644    }
645
    /// Insert a token at the given position.
    ///
    /// `pos` is an index into the pending-token queue (`self.tokens`); `pos == len` appends.
    ///
    /// # Panics
    /// If `pos` is greater than the number of pending tokens.
    fn insert_token(&mut self, pos: usize, tok: Token<'input>) {
        let old_len = self.tokens.len();
        assert!(pos <= old_len);
        self.tokens.insert(pos, tok);
    }
652
    /// Record that a simple key may start at the current position.
    fn allow_simple_key(&mut self) {
        self.simple_key_allowed = true;
    }
656
    /// Record that a simple key may not start at the current position.
    fn disallow_simple_key(&mut self) {
        self.simple_key_allowed = false;
    }
660
    /// Fetch the next token in the stream.
    ///
    /// The very first call only produces the stream start. Subsequent calls skip whitespace and
    /// comments, stale outdated simple keys, unroll indentation to the current column and then
    /// dispatch on the next character(s) to the matching `fetch_*` routine.
    ///
    /// # Errors
    /// Returns `ScanError` when the scanner does not find the next expected token.
    pub fn fetch_next_token(&mut self) -> ScanResult {
        self.input.lookahead(1);

        // Nothing has been emitted yet: the stream start always comes first.
        if !self.stream_start_produced {
            self.fetch_stream_start();
            return Ok(());
        }
        self.skip_to_next_token()?;

        debug_print!(
            "  \x1B[38;5;244m\u{2192} fetch_next_token after whitespace {:?} {:?}\x1B[m",
            self.mark,
            self.input.peek()
        );

        self.stale_simple_keys()?;

        // NOTE(review): `unroll_indent` is defined outside this excerpt; presumably it closes
        // the blocks that are indented deeper than the current column -- confirm.
        let mark = self.mark;
        self.unroll_indent(mark.col as isize);

        self.input.lookahead(4);

        if self.input.next_is_z() {
            self.fetch_stream_end()?;
            return Ok(());
        }

        // Directives and document markers are only looked for at the very start of a line.
        if self.mark.col == 0 {
            if self.input.next_char_is('%') {
                return self.fetch_directive();
            } else if self.input.next_is_document_start() {
                return self.fetch_document_indicator(TokenType::DocumentStart);
            } else if self.input.next_is_document_end() {
                self.fetch_document_indicator(TokenType::DocumentEnd)?;
                // Only blanks may follow the document end marker on its line.
                self.skip_ws_to_eol(SkipTabs::Yes)?;
                if !self.input.next_is_breakz() {
                    return Err(ScanError::new_str(
                        self.mark,
                        "invalid content after document end marker",
                    ));
                }
                return Ok(());
            }
        }

        if (self.mark.col as isize) < self.indent {
            return Err(ScanError::new_str(self.mark, "invalid indentation"));
        }

        // Dispatch on the current character `c`, with `nc` (the one after it) used to
        // disambiguate indicators from plain scalars. Arm order matters: guarded arms are tried
        // top to bottom.
        let c = self.input.peek();
        let nc = self.input.peek_nth(1);
        match c {
            '[' => self.fetch_flow_collection_start(TokenType::FlowSequenceStart),
            '{' => self.fetch_flow_collection_start(TokenType::FlowMappingStart),
            ']' => self.fetch_flow_collection_end(TokenType::FlowSequenceEnd),
            '}' => self.fetch_flow_collection_end(TokenType::FlowMappingEnd),
            ',' => self.fetch_flow_entry(),
            '-' if is_blank_or_breakz(nc) => self.fetch_block_entry(),
            '?' if is_blank_or_breakz(nc) => self.fetch_key(),
            ':' if is_blank_or_breakz(nc) => self.fetch_value(),
            // Inside a flow collection, a `:` followed by a flow indicator, or sitting at the
            // position recorded in `adjacent_value_allowed_at`, also introduces a value.
            ':' if self.flow_level > 0
                && (is_flow(nc) || self.mark.index == self.adjacent_value_allowed_at) =>
            {
                self.fetch_flow_value()
            }
            // Is it an alias?
            '*' => self.fetch_anchor(true),
            // Is it an anchor?
            '&' => self.fetch_anchor(false),
            '!' => self.fetch_tag(),
            // Is it a literal scalar?
            '|' if self.flow_level == 0 => self.fetch_block_scalar(true),
            // Is it a folded scalar?
            '>' if self.flow_level == 0 => self.fetch_block_scalar(false),
            '\'' => self.fetch_flow_scalar(true),
            '"' => self.fetch_flow_scalar(false),
            // plain scalar
            // (This guard always holds here: the `-` + blank case was matched above.)
            '-' if !is_blank_or_breakz(nc) => self.fetch_plain_scalar(),
            // (Same outcome as the catch-all arm below; spelled out for `:` and `?`.)
            ':' | '?' if !is_blank_or_breakz(nc) && self.flow_level == 0 => {
                self.fetch_plain_scalar()
            }
            // These characters are rejected at the start of a token.
            '%' | '@' | '`' => Err(ScanError::new(
                self.mark,
                format!("unexpected character: `{c}'"),
            )),
            _ => self.fetch_plain_scalar(),
        }
    }
753
754    /// Return the next token in the stream.
755    /// # Errors
756    /// Returns `ScanError` when scanning fails to find an expected next token.
757    pub fn next_token(&mut self) -> Result<Option<Token<'input>>, ScanError> {
758        if self.stream_end_produced {
759            return Ok(None);
760        }
761
762        if !self.token_available {
763            self.fetch_more_tokens()?;
764        }
765        let Some(t) = self.tokens.pop_front() else {
766            return Err(ScanError::new_str(
767                self.mark,
768                "did not find expected next token",
769            ));
770        };
771        self.token_available = false;
772        self.tokens_parsed += 1;
773
774        if let TokenType::StreamEnd = t.1 {
775            self.stream_end_produced = true;
776        }
777        Ok(Some(t))
778    }
779
780    /// Fetch tokens from the token stream.
781    /// # Errors
782    /// Returns `ScanError` when loading fails.
783    pub fn fetch_more_tokens(&mut self) -> ScanResult {
784        let mut need_more;
785        loop {
786            if self.tokens.is_empty() {
787                need_more = true;
788            } else {
789                need_more = false;
790                // Stale potential keys that we know won't be keys.
791                self.stale_simple_keys()?;
792                // If our next token to be emitted may be a key, fetch more context.
793                for sk in &self.simple_keys {
794                    if sk.possible && sk.token_number == self.tokens_parsed {
795                        need_more = true;
796                        break;
797                    }
798                }
799            }
800
801            if !need_more {
802                break;
803            }
804            self.fetch_next_token()?;
805        }
806        self.token_available = true;
807
808        Ok(())
809    }
810
811    /// Mark simple keys that can no longer be keys as such.
812    ///
813    /// This function sets `possible` to `false` to each key that, now we have more context, we
814    /// know will not be keys.
815    ///
816    /// # Errors
817    /// This function returns an error if one of the key we would stale was required to be a key.
818    fn stale_simple_keys(&mut self) -> ScanResult {
819        for sk in &mut self.simple_keys {
820            if sk.possible
821                // If not in a flow construct, simple keys cannot span multiple lines.
822                && self.flow_level == 0
823                    && (sk.mark.line < self.mark.line || sk.mark.index + 1024 < self.mark.index)
824            {
825                if sk.required {
826                    return Err(ScanError::new_str(self.mark, "simple key expect ':'"));
827                }
828                sk.possible = false;
829            }
830        }
831        Ok(())
832    }
833
    /// Skip over all whitespace (`\t`, ` `, `\n`, `\r`) and comments until the next token.
    ///
    /// Outside of flow constructs, each skipped line break re-allows simple keys.
    ///
    /// # Errors
    /// This function returns an error if a tabulation is encountered where there should not be
    /// one.
    fn skip_to_next_token(&mut self) -> ScanResult {
        loop {
            // TODO(chenyh) BOM
            match self.input.look_ch() {
                // Tabs may not be used as indentation.
                // "Indentation" only exists as long as a block is started, but does not exist
                // inside of flow-style constructs. Tabs are allowed as part of leading
                // whitespaces outside of indentation.
                // If a flow-style construct is in an indented block, its contents must still be
                // indented. Also, tabs are allowed anywhere in it if it has no content.
                '\t' if self.is_within_block()
                    && self.leading_whitespace
                    && (self.mark.col as isize) < self.indent =>
                {
                    // Consume the rest of the blanks on this line to see what follows the tab.
                    self.skip_ws_to_eol(SkipTabs::Yes)?;
                    // If we have content on that line with a tab, return an error.
                    if !self.input.next_is_breakz() {
                        return Err(ScanError::new_str(
                            self.mark,
                            "tabs disallowed within this context (block indentation)",
                        ));
                    }
                }
                // Any other tab (the guarded arm above did not match) or a space.
                '\t' | ' ' => self.skip_blank(),
                '\n' | '\r' => {
                    // Two characters of lookahead so that a `\r\n` pair can be recognized.
                    self.input.lookahead(2);
                    self.skip_linebreak();
                    // In block context, a new line is a place where a simple key may start.
                    if self.flow_level == 0 {
                        self.allow_simple_key();
                    }
                }
                '#' => {
                    // Skip the comment up to (not including) the line break or end of input,
                    // and move the cursor by the reported length.
                    let comment_length = self.input.skip_while_non_breakz();
                    self.mark.index += comment_length;
                    self.mark.col += comment_length;
                }
                // Anything else starts the next token.
                _ => break,
            }
        }
        Ok(())
    }
880
881    /// Skip over YAML whitespace (` `, `\n`, `\r`).
882    ///
883    /// # Errors
884    /// This function returns an error if no whitespace was found.
885    fn skip_yaml_whitespace(&mut self) -> ScanResult {
886        let mut need_whitespace = true;
887        loop {
888            match self.input.look_ch() {
889                ' ' => {
890                    self.skip_blank();
891
892                    need_whitespace = false;
893                }
894                '\n' | '\r' => {
895                    self.input.lookahead(2);
896                    self.skip_linebreak();
897                    if self.flow_level == 0 {
898                        self.allow_simple_key();
899                    }
900                    need_whitespace = false;
901                }
902                '#' => {
903                    let comment_length = self.input.skip_while_non_breakz();
904                    self.mark.index += comment_length;
905                    self.mark.col += comment_length;
906                }
907                _ => break,
908            }
909        }
910
911        if need_whitespace {
912            Err(ScanError::new_str(self.mark(), "expected whitespace"))
913        } else {
914            Ok(())
915        }
916    }
917
918    fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> Result<SkipTabs, ScanError> {
919        let (n_bytes, result) = self.input.skip_ws_to_eol(skip_tabs);
920        self.mark.col += n_bytes;
921        self.mark.index += n_bytes;
922        result.map_err(|msg| ScanError::new_str(self.mark, msg))
923    }
924
925    fn fetch_stream_start(&mut self) {
926        let mark = self.mark;
927        self.indent = -1;
928        self.stream_start_produced = true;
929        self.allow_simple_key();
930        self.tokens.push_back(Token(
931            Span::empty(mark),
932            TokenType::StreamStart(TEncoding::Utf8),
933        ));
934        self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0)));
935    }
936
937    fn fetch_stream_end(&mut self) -> ScanResult {
938        // force new line
939        if self.mark.col != 0 {
940            self.mark.col = 0;
941            self.mark.line += 1;
942        }
943
944        // If the stream ended, we won't have more context. We can stall all the simple keys we
945        // had. If one was required, however, that was an error and we must propagate it.
946        for sk in &mut self.simple_keys {
947            if sk.required && sk.possible {
948                return Err(ScanError::new_str(self.mark, "simple key expected"));
949            }
950            sk.possible = false;
951        }
952
953        self.unroll_indent(-1);
954        self.remove_simple_key()?;
955        self.disallow_simple_key();
956
957        self.tokens
958            .push_back(Token(Span::empty(self.mark), TokenType::StreamEnd));
959        Ok(())
960    }
961
962    fn fetch_directive(&mut self) -> ScanResult {
963        self.unroll_indent(-1);
964        self.remove_simple_key()?;
965
966        self.disallow_simple_key();
967
968        let tok = self.scan_directive()?;
969        self.tokens.push_back(tok);
970
971        Ok(())
972    }
973
974    fn scan_directive(&mut self) -> Result<Token<'input>, ScanError> {
975        let start_mark = self.mark;
976        self.skip_non_blank();
977
978        let name = self.scan_directive_name()?;
979        let tok = match name.as_ref() {
980            "YAML" => self.scan_version_directive_value(&start_mark)?,
981            "TAG" => self.scan_tag_directive_value(&start_mark)?,
982            // XXX This should be a warning instead of an error
983            _ => {
984                // skip current line
985                let line_len = self.input.skip_while_non_breakz();
986                self.mark.index += line_len;
987                self.mark.col += line_len;
988                // XXX return an empty TagDirective token
989                Token(
990                    Span::new(start_mark, self.mark),
991                    TokenType::TagDirective(Cow::default(), Cow::default()),
992                )
993                // return Err(ScanError::new_str(start_mark,
994                //     "while scanning a directive, found unknown directive name"))
995            }
996        };
997
998        self.skip_ws_to_eol(SkipTabs::Yes)?;
999
1000        if self.input.next_is_breakz() {
1001            self.input.lookahead(2);
1002            self.skip_linebreak();
1003            Ok(tok)
1004        } else {
1005            Err(ScanError::new_str(
1006                start_mark,
1007                "while scanning a directive, did not find expected comment or line break",
1008            ))
1009        }
1010    }
1011
1012    fn scan_version_directive_value(&mut self, mark: &Marker) -> Result<Token<'input>, ScanError> {
1013        let n_blanks = self.input.skip_while_blank();
1014        self.mark.index += n_blanks;
1015        self.mark.col += n_blanks;
1016
1017        let major = self.scan_version_directive_number(mark)?;
1018
1019        if self.input.peek() != '.' {
1020            return Err(ScanError::new_str(
1021                *mark,
1022                "while scanning a YAML directive, did not find expected digit or '.' character",
1023            ));
1024        }
1025        self.skip_non_blank();
1026
1027        let minor = self.scan_version_directive_number(mark)?;
1028
1029        Ok(Token(
1030            Span::new(*mark, self.mark),
1031            TokenType::VersionDirective(major, minor),
1032        ))
1033    }
1034
1035    fn scan_directive_name(&mut self) -> Result<String, ScanError> {
1036        let start_mark = self.mark;
1037        let mut string = String::new();
1038
1039        let n_chars = self.input.fetch_while_is_alpha(&mut string);
1040        self.mark.index += n_chars;
1041        self.mark.col += n_chars;
1042
1043        if string.is_empty() {
1044            return Err(ScanError::new_str(
1045                start_mark,
1046                "while scanning a directive, could not find expected directive name",
1047            ));
1048        }
1049
1050        if !is_blank_or_breakz(self.input.peek()) {
1051            return Err(ScanError::new_str(
1052                start_mark,
1053                "while scanning a directive, found unexpected non-alphabetical character",
1054            ));
1055        }
1056
1057        Ok(string)
1058    }
1059
1060    fn scan_version_directive_number(&mut self, mark: &Marker) -> Result<u32, ScanError> {
1061        let mut val = 0u32;
1062        let mut length = 0usize;
1063        while let Some(digit) = self.input.look_ch().to_digit(10) {
1064            if length + 1 > 9 {
1065                return Err(ScanError::new_str(
1066                    *mark,
1067                    "while scanning a YAML directive, found extremely long version number",
1068                ));
1069            }
1070            length += 1;
1071            val = val * 10 + digit;
1072            self.skip_non_blank();
1073        }
1074
1075        if length == 0 {
1076            return Err(ScanError::new_str(
1077                *mark,
1078                "while scanning a YAML directive, did not find expected version number",
1079            ));
1080        }
1081
1082        Ok(val)
1083    }
1084
1085    fn scan_tag_directive_value(&mut self, mark: &Marker) -> Result<Token<'input>, ScanError> {
1086        let n_blanks = self.input.skip_while_blank();
1087        self.mark.index += n_blanks;
1088        self.mark.col += n_blanks;
1089
1090        let handle = self.scan_tag_handle(true, mark)?;
1091
1092        let n_blanks = self.input.skip_while_blank();
1093        self.mark.index += n_blanks;
1094        self.mark.col += n_blanks;
1095
1096        let prefix = self.scan_tag_prefix(mark)?;
1097
1098        self.input.lookahead(1);
1099
1100        if self.input.next_is_blank_or_breakz() {
1101            Ok(Token(
1102                Span::new(*mark, self.mark),
1103                TokenType::TagDirective(handle.into(), prefix.into()),
1104            ))
1105        } else {
1106            Err(ScanError::new_str(
1107                *mark,
1108                "while scanning TAG, did not find expected whitespace or line break",
1109            ))
1110        }
1111    }
1112
1113    fn fetch_tag(&mut self) -> ScanResult {
1114        self.save_simple_key();
1115        self.disallow_simple_key();
1116
1117        let tok = self.scan_tag()?;
1118        self.tokens.push_back(tok);
1119        Ok(())
1120    }
1121
1122    fn scan_tag(&mut self) -> Result<Token<'input>, ScanError> {
1123        let start_mark = self.mark;
1124        let mut handle = String::new();
1125        let mut suffix;
1126
1127        // Check if the tag is in the canonical form (verbatim).
1128        self.input.lookahead(2);
1129
1130        if self.input.nth_char_is(1, '<') {
1131            suffix = self.scan_verbatim_tag(&start_mark)?;
1132        } else {
1133            // The tag has either the '!suffix' or the '!handle!suffix'
1134            handle = self.scan_tag_handle(false, &start_mark)?;
1135            // Check if it is, indeed, handle.
1136            if handle.len() >= 2 && handle.starts_with('!') && handle.ends_with('!') {
1137                // A tag handle starting with "!!" is a secondary tag handle.
1138                let is_secondary_handle = handle == "!!";
1139                suffix =
1140                    self.scan_tag_shorthand_suffix(false, is_secondary_handle, "", &start_mark)?;
1141            } else {
1142                suffix = self.scan_tag_shorthand_suffix(false, false, &handle, &start_mark)?;
1143                "!".clone_into(&mut handle);
1144                // A special case: the '!' tag.  Set the handle to '' and the
1145                // suffix to '!'.
1146                if suffix.is_empty() {
1147                    handle.clear();
1148                    "!".clone_into(&mut suffix);
1149                }
1150            }
1151        }
1152
1153        if is_blank_or_breakz(self.input.look_ch())
1154            || (self.flow_level > 0 && self.input.next_is_flow())
1155        {
1156            // XXX: ex 7.2, an empty scalar can follow a secondary tag
1157            Ok(Token(
1158                Span::new(start_mark, self.mark),
1159                TokenType::Tag(handle, suffix),
1160            ))
1161        } else {
1162            Err(ScanError::new_str(
1163                start_mark,
1164                "while scanning a tag, did not find expected whitespace or line break",
1165            ))
1166        }
1167    }
1168
1169    fn scan_tag_handle(&mut self, directive: bool, mark: &Marker) -> Result<String, ScanError> {
1170        let mut string = String::new();
1171        if self.input.look_ch() != '!' {
1172            return Err(ScanError::new_str(
1173                *mark,
1174                "while scanning a tag, did not find expected '!'",
1175            ));
1176        }
1177
1178        string.push(self.input.peek());
1179        self.skip_non_blank();
1180
1181        let n_chars = self.input.fetch_while_is_alpha(&mut string);
1182        self.mark.index += n_chars;
1183        self.mark.col += n_chars;
1184
1185        // Check if the trailing character is '!' and copy it.
1186        if self.input.peek() == '!' {
1187            string.push(self.input.peek());
1188            self.skip_non_blank();
1189        } else if directive && string != "!" {
1190            // It's either the '!' tag or not really a tag handle.  If it's a %TAG
1191            // directive, it's an error.  If it's a tag token, it must be a part of
1192            // URI.
1193            return Err(ScanError::new_str(
1194                *mark,
1195                "while parsing a tag directive, did not find expected '!'",
1196            ));
1197        }
1198        Ok(string)
1199    }
1200
1201    /// Scan for a tag prefix (6.8.2.2).
1202    ///
1203    /// There are 2 kinds of tag prefixes:
1204    ///   - Local: Starts with a `!`, contains only URI chars (`!foo`)
1205    ///   - Global: Starts with a tag char, contains then URI chars (`!foo,2000:app/`)
1206    fn scan_tag_prefix(&mut self, start_mark: &Marker) -> Result<String, ScanError> {
1207        let mut string = String::new();
1208
1209        if self.input.look_ch() == '!' {
1210            // If we have a local tag, insert and skip `!`.
1211            string.push(self.input.peek());
1212            self.skip_non_blank();
1213        } else if !is_tag_char(self.input.peek()) {
1214            // Otherwise, check if the first global tag character is valid.
1215            return Err(ScanError::new_str(
1216                *start_mark,
1217                "invalid global tag character",
1218            ));
1219        } else if self.input.peek() == '%' {
1220            // If it is valid and an escape sequence, escape it.
1221            string.push(self.scan_uri_escapes(start_mark)?);
1222        } else {
1223            // Otherwise, push the first character.
1224            string.push(self.input.peek());
1225            self.skip_non_blank();
1226        }
1227
1228        while is_uri_char(self.input.look_ch()) {
1229            if self.input.peek() == '%' {
1230                string.push(self.scan_uri_escapes(start_mark)?);
1231            } else {
1232                string.push(self.input.peek());
1233                self.skip_non_blank();
1234            }
1235        }
1236
1237        Ok(string)
1238    }
1239
1240    /// Scan for a verbatim tag.
1241    ///
1242    /// The prefixing `!<` must _not_ have been skipped.
1243    fn scan_verbatim_tag(&mut self, start_mark: &Marker) -> Result<String, ScanError> {
1244        // Eat `!<`
1245        self.skip_non_blank();
1246        self.skip_non_blank();
1247
1248        let mut string = String::new();
1249        while is_uri_char(self.input.look_ch()) {
1250            if self.input.peek() == '%' {
1251                string.push(self.scan_uri_escapes(start_mark)?);
1252            } else {
1253                string.push(self.input.peek());
1254                self.skip_non_blank();
1255            }
1256        }
1257
1258        if self.input.peek() != '>' {
1259            return Err(ScanError::new_str(
1260                *start_mark,
1261                "while scanning a verbatim tag, did not find the expected '>'",
1262            ));
1263        }
1264        self.skip_non_blank();
1265
1266        Ok(string)
1267    }
1268
1269    fn scan_tag_shorthand_suffix(
1270        &mut self,
1271        _directive: bool,
1272        _is_secondary: bool,
1273        head: &str,
1274        mark: &Marker,
1275    ) -> Result<String, ScanError> {
1276        let mut length = head.len();
1277        let mut string = String::new();
1278
1279        // Copy the head if needed.
1280        // Note that we don't copy the leading '!' character.
1281        if length > 1 {
1282            string.extend(head.chars().skip(1));
1283        }
1284
1285        while is_tag_char(self.input.look_ch()) {
1286            // Check if it is a URI-escape sequence.
1287            if self.input.peek() == '%' {
1288                string.push(self.scan_uri_escapes(mark)?);
1289            } else {
1290                string.push(self.input.peek());
1291                self.skip_non_blank();
1292            }
1293
1294            length += 1;
1295        }
1296
1297        if length == 0 {
1298            return Err(ScanError::new_str(
1299                *mark,
1300                "while parsing a tag, did not find expected tag URI",
1301            ));
1302        }
1303
1304        Ok(string)
1305    }
1306
1307    fn scan_uri_escapes(&mut self, mark: &Marker) -> Result<char, ScanError> {
1308        let mut width = 0usize;
1309        let mut code = 0u32;
1310        loop {
1311            self.input.lookahead(3);
1312
1313            let c = self.input.peek_nth(1);
1314            let nc = self.input.peek_nth(2);
1315
1316            if !(self.input.peek() == '%' && is_hex(c) && is_hex(nc)) {
1317                return Err(ScanError::new_str(
1318                    *mark,
1319                    "while parsing a tag, found an invalid escape sequence",
1320                ));
1321            }
1322
1323            let byte = (as_hex(c) << 4) + as_hex(nc);
1324            if width == 0 {
1325                width = match byte {
1326                    _ if byte & 0x80 == 0x00 => 1,
1327                    _ if byte & 0xE0 == 0xC0 => 2,
1328                    _ if byte & 0xF0 == 0xE0 => 3,
1329                    _ if byte & 0xF8 == 0xF0 => 4,
1330                    _ => {
1331                        return Err(ScanError::new_str(
1332                            *mark,
1333                            "while parsing a tag, found an incorrect leading UTF-8 byte",
1334                        ));
1335                    }
1336                };
1337                code = byte;
1338            } else {
1339                if byte & 0xc0 != 0x80 {
1340                    return Err(ScanError::new_str(
1341                        *mark,
1342                        "while parsing a tag, found an incorrect trailing UTF-8 byte",
1343                    ));
1344                }
1345                code = (code << 8) + byte;
1346            }
1347
1348            self.skip_n_non_blank(3);
1349
1350            width -= 1;
1351            if width == 0 {
1352                break;
1353            }
1354        }
1355
1356        match char::from_u32(code) {
1357            Some(ch) => Ok(ch),
1358            None => Err(ScanError::new_str(
1359                *mark,
1360                "while parsing a tag, found an invalid UTF-8 codepoint",
1361            )),
1362        }
1363    }
1364
1365    fn fetch_anchor(&mut self, alias: bool) -> ScanResult {
1366        self.save_simple_key();
1367        self.disallow_simple_key();
1368
1369        let tok = self.scan_anchor(alias)?;
1370
1371        self.tokens.push_back(tok);
1372
1373        Ok(())
1374    }
1375
1376    fn scan_anchor(&mut self, alias: bool) -> Result<Token<'input>, ScanError> {
1377        let mut string = String::new();
1378        let start_mark = self.mark;
1379
1380        self.skip_non_blank();
1381        while is_anchor_char(self.input.look_ch()) {
1382            string.push(self.input.peek());
1383            self.skip_non_blank();
1384        }
1385
1386        if string.is_empty() {
1387            return Err(ScanError::new_str(start_mark, "while scanning an anchor or alias, did not find expected alphabetic or numeric character"));
1388        }
1389
1390        let tok = if alias {
1391            TokenType::Alias(string.into())
1392        } else {
1393            TokenType::Anchor(string.into())
1394        };
1395        Ok(Token(Span::new(start_mark, self.mark), tok))
1396    }
1397
1398    fn fetch_flow_collection_start(&mut self, tok: TokenType<'input>) -> ScanResult {
1399        // The indicators '[' and '{' may start a simple key.
1400        self.save_simple_key();
1401
1402        self.roll_one_col_indent();
1403        self.increase_flow_level()?;
1404
1405        self.allow_simple_key();
1406
1407        let start_mark = self.mark;
1408        self.skip_non_blank();
1409
1410        if tok == TokenType::FlowMappingStart {
1411            self.flow_mapping_started = true;
1412        } else {
1413            self.implicit_flow_mapping_states
1414                .push(ImplicitMappingState::Possible);
1415        }
1416
1417        self.skip_ws_to_eol(SkipTabs::Yes)?;
1418
1419        self.tokens
1420            .push_back(Token(Span::new(start_mark, self.mark), tok));
1421        Ok(())
1422    }
1423
1424    fn fetch_flow_collection_end(&mut self, tok: TokenType<'input>) -> ScanResult {
1425        self.remove_simple_key()?;
1426        self.decrease_flow_level();
1427
1428        self.disallow_simple_key();
1429
1430        if matches!(tok, TokenType::FlowSequenceEnd) {
1431            self.end_implicit_mapping(self.mark);
1432            // We are out exiting the flow sequence, nesting goes down 1 level.
1433            self.implicit_flow_mapping_states.pop();
1434        }
1435
1436        let start_mark = self.mark;
1437        self.skip_non_blank();
1438        self.skip_ws_to_eol(SkipTabs::Yes)?;
1439
1440        // A flow collection within a flow mapping can be a key. In that case, the value may be
1441        // adjacent to the `:`.
1442        // ```yaml
1443        // - [ {a: b}:value ]
1444        // ```
1445        if self.flow_level > 0 {
1446            self.adjacent_value_allowed_at = self.mark.index;
1447        }
1448
1449        self.tokens
1450            .push_back(Token(Span::new(start_mark, self.mark), tok));
1451        Ok(())
1452    }
1453
1454    /// Push the `FlowEntry` token and skip over the `,`.
1455    fn fetch_flow_entry(&mut self) -> ScanResult {
1456        self.remove_simple_key()?;
1457        self.allow_simple_key();
1458
1459        self.end_implicit_mapping(self.mark);
1460
1461        let start_mark = self.mark;
1462        self.skip_non_blank();
1463        self.skip_ws_to_eol(SkipTabs::Yes)?;
1464
1465        self.tokens.push_back(Token(
1466            Span::new(start_mark, self.mark),
1467            TokenType::FlowEntry,
1468        ));
1469        Ok(())
1470    }
1471
1472    fn increase_flow_level(&mut self) -> ScanResult {
1473        self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0)));
1474        self.flow_level = self
1475            .flow_level
1476            .checked_add(1)
1477            .ok_or_else(|| ScanError::new_str(self.mark, "recursion limit exceeded"))?;
1478        Ok(())
1479    }
1480
1481    fn decrease_flow_level(&mut self) {
1482        if self.flow_level > 0 {
1483            self.flow_level -= 1;
1484            self.simple_keys.pop().unwrap();
1485        }
1486    }
1487
    /// Push the `Block*` token(s) and skip over the `-`.
    ///
    /// Add an indentation level and push a `BlockSequenceStart` token if needed, then push a
    /// `BlockEntry` token.
    /// This function only skips over the `-` and does not fetch the entry value.
    ///
    /// # Errors
    /// This function returns an error if:
    ///   - the `-` is found inside a flow collection (`flow_level > 0`),
    ///   - simple keys are not currently allowed (a block entry cannot start here),
    ///   - the previous token is an anchor or tag at column 0, this `-` is at column 0 as well
    ///     and a block is already open,
    ///   - the `-` is separated by tabs from another `-` that itself looks like a block entry,
    ///   - skipping whitespace or removing the simple key fails.
    fn fetch_block_entry(&mut self) -> ScanResult {
        if self.flow_level > 0 {
            // - * only allowed in block
            return Err(ScanError::new_str(
                self.mark,
                r#""-" is only valid inside a block"#,
            ));
        }
        // Check if we are allowed to start a new entry.
        if !self.simple_key_allowed {
            return Err(ScanError::new_str(
                self.mark,
                "block sequence entries are not allowed in this context",
            ));
        }

        // ???, fixes test G9HC.
        // NOTE(review): rejects an anchor/tag at column 0 that is directly followed by a `-` at
        // column 0 while an indentation level is open; the exact rationale is not documented.
        if let Some(Token(span, TokenType::Anchor(..) | TokenType::Tag(..))) = self.tokens.back() {
            if self.mark.col == 0 && span.start.col == 0 && self.indent > -1 {
                return Err(ScanError::new_str(
                    span.start,
                    "invalid indentation for anchor",
                ));
            }
        }

        // Skip over the `-`.
        let mark = self.mark;
        self.skip_non_blank();

        // generate BLOCK-SEQUENCE-START if indented
        self.roll_indent(mark.col, None, TokenType::BlockSequenceStart, mark);
        // First pass: skip blanks, tabs included, and remember whether a tab was seen.
        let found_tabs = self.skip_ws_to_eol(SkipTabs::Yes)?.found_tabs();
        self.input.lookahead(2);
        // `-<tab>-<blank>`: a nested entry whose `-` is only separated by tabs is rejected.
        if found_tabs && self.input.next_char_is('-') && is_blank_or_breakz(self.input.peek_nth(1))
        {
            return Err(ScanError::new_str(
                self.mark,
                "'-' must be followed by a valid YAML whitespace",
            ));
        }

        // Second pass, this time without accepting tabs.
        self.skip_ws_to_eol(SkipTabs::No)?;
        self.input.lookahead(1);
        // Nothing but a line break, or a flow indicator, follows the `-` on this line.
        if self.input.next_is_break() || self.input.next_is_flow() {
            self.roll_one_col_indent();
        }

        self.remove_simple_key()?;
        self.allow_simple_key();

        // The token is located (with an empty span) after the `-` and the skipped whitespace.
        self.tokens
            .push_back(Token(Span::empty(self.mark), TokenType::BlockEntry));

        Ok(())
    }
1549
1550    fn fetch_document_indicator(&mut self, t: TokenType<'input>) -> ScanResult {
1551        self.unroll_indent(-1);
1552        self.remove_simple_key()?;
1553        self.disallow_simple_key();
1554
1555        let mark = self.mark;
1556
1557        self.skip_n_non_blank(3);
1558
1559        self.tokens.push_back(Token(Span::new(mark, self.mark), t));
1560        Ok(())
1561    }
1562
1563    fn fetch_block_scalar(&mut self, literal: bool) -> ScanResult {
1564        self.save_simple_key();
1565        self.allow_simple_key();
1566        let tok = self.scan_block_scalar(literal)?;
1567
1568        self.tokens.push_back(tok);
1569        Ok(())
1570    }
1571
    /// Scan a block scalar, starting at its indicator, and return a `Scalar` token.
    ///
    /// `literal` selects [`ScalarStyle::Literal`]; otherwise the scalar is
    /// [`ScalarStyle::Folded`] and line breaks between content lines are folded.
    ///
    /// The header may hold a chomping indicator (`+` to keep, `-` to strip) and an indentation
    /// indicator (a single non-zero digit), in either order. Without an indentation indicator,
    /// the indentation is determined by `skip_block_scalar_first_line_indent`.
    ///
    /// The span of the returned token starts at the first content line, except when the end of
    /// input is reached before any content; it then starts at the indicator.
    ///
    /// # Errors
    /// This function returns an error if:
    ///   - the indentation indicator is `0`,
    ///   - the header is followed by something other than blanks and a line break or the end of
    ///     input,
    ///   - the first character after the header line is a tab,
    ///   - the first content line is indented less than the scalar but more than the enclosing
    ///     block.
    #[allow(clippy::too_many_lines)]
    fn scan_block_scalar(&mut self, literal: bool) -> Result<Token<'input>, ScanError> {
        let start_mark = self.mark;
        let mut chomping = Chomping::Clip;
        // Value of the explicit indentation indicator; 0 means none was given.
        let mut increment: usize = 0;
        // Column at which content lines start; 0 means not determined yet.
        let mut indent: usize = 0;
        let mut trailing_blank: bool;
        let mut leading_blank: bool = false;
        let style = if literal {
            ScalarStyle::Literal
        } else {
            ScalarStyle::Folded
        };

        let mut string = String::new();
        // Line break that ended the previous content line.
        let mut leading_break = String::new();
        // Line breaks collected while skipping indentation.
        let mut trailing_breaks = String::new();
        // Line break that ended the header line.
        let mut chomping_break = String::new();

        // skip '|' or '>'
        self.skip_non_blank();
        self.unroll_non_block_indents();

        // Header: chomping indicator then optional digit, or digit then optional chomping
        // indicator.
        if self.input.look_ch() == '+' || self.input.peek() == '-' {
            if self.input.peek() == '+' {
                chomping = Chomping::Keep;
            } else {
                chomping = Chomping::Strip;
            }
            self.skip_non_blank();
            self.input.lookahead(1);
            if self.input.next_is_digit() {
                if self.input.peek() == '0' {
                    return Err(ScanError::new_str(
                        start_mark,
                        "while scanning a block scalar, found an indentation indicator equal to 0",
                    ));
                }
                increment = (self.input.peek() as usize) - ('0' as usize);
                self.skip_non_blank();
            }
        } else if self.input.next_is_digit() {
            if self.input.peek() == '0' {
                return Err(ScanError::new_str(
                    start_mark,
                    "while scanning a block scalar, found an indentation indicator equal to 0",
                ));
            }

            increment = (self.input.peek() as usize) - ('0' as usize);
            self.skip_non_blank();
            self.input.lookahead(1);
            if self.input.peek() == '+' || self.input.peek() == '-' {
                if self.input.peek() == '+' {
                    chomping = Chomping::Keep;
                } else {
                    chomping = Chomping::Strip;
                }
                self.skip_non_blank();
            }
        }

        self.skip_ws_to_eol(SkipTabs::Yes)?;

        // Check if we are at the end of the line.
        self.input.lookahead(1);
        if !self.input.next_is_breakz() {
            return Err(ScanError::new_str(
                start_mark,
                "while scanning a block scalar, did not find expected comment or line break",
            ));
        }

        // Consume the line break ending the header; it is kept aside for the chomping logic.
        if self.input.next_is_break() {
            self.input.lookahead(2);
            self.read_break(&mut chomping_break);
        }

        if self.input.look_ch() == '\t' {
            return Err(ScanError::new_str(
                start_mark,
                "a block scalar content cannot start with a tab",
            ));
        }

        // An explicit indicator is relative to the current block indentation, if there is one.
        if increment > 0 {
            indent = if self.indent >= 0 {
                (self.indent + increment as isize) as usize
            } else {
                increment
            }
        }

        // Scan the leading line breaks and determine the indentation level if needed.
        if indent == 0 {
            self.skip_block_scalar_first_line_indent(&mut indent, &mut trailing_breaks);
        } else {
            self.skip_block_scalar_indent(indent, &mut trailing_breaks);
        }

        // We have an end-of-stream with no content, e.g.:
        // ```yaml
        // - |+
        // ```
        if self.input.next_is_z() {
            let contents = match chomping {
                // We strip trailing linebreaks. Nothing remains.
                Chomping::Strip => String::new(),
                // There was no newline after the chomping indicator.
                _ if self.mark.line == start_mark.line() => String::new(),
                // We clip lines, and there was a newline after the chomping indicator.
                // All other breaks are ignored.
                Chomping::Clip => chomping_break,
                // We keep lines. There was a newline after the chomping indicator but nothing
                // else.
                Chomping::Keep if trailing_breaks.is_empty() => chomping_break,
                // Otherwise, the newline after chomping is ignored.
                Chomping::Keep => trailing_breaks,
            };
            return Ok(Token(
                Span::new(start_mark, self.mark),
                TokenType::Scalar(style, contents.into()),
            ));
        }

        // The line starts left of the scalar's indentation but right of the enclosing block's.
        if self.mark.col < indent && (self.mark.col as isize) > self.indent {
            return Err(ScanError::new_str(
                self.mark,
                "wrongly indented line in block scalar",
            ));
        }

        let mut line_buffer = String::with_capacity(100);
        // From here on, the token span starts at the first content line, not at the indicator.
        let start_mark = self.mark;
        // One iteration per content line, as long as lines start exactly at `indent`.
        while self.mark.col == indent && !self.input.next_is_z() {
            // At column 0, a document end marker terminates the scalar.
            if indent == 0 {
                self.input.lookahead(4);
                if self.input.next_is_document_end() {
                    break;
                }
            }

            // We are at the first content character of a content line.
            trailing_blank = self.input.next_is_blank();
            if !literal && !leading_break.is_empty() && !leading_blank && !trailing_blank {
                // Folding: a single break between two lines that do not start with a blank
                // becomes a space; if more breaks were collected, they are kept instead.
                string.push_str(&trailing_breaks);
                if trailing_breaks.is_empty() {
                    string.push(' ');
                }
            } else {
                // No folding: keep every line break as is.
                string.push_str(&leading_break);
                string.push_str(&trailing_breaks);
            }

            leading_break.clear();
            trailing_breaks.clear();

            // Remember whether this line starts with a blank, for the next iteration.
            leading_blank = self.input.next_is_blank();

            self.scan_block_scalar_content_line(&mut string, &mut line_buffer);

            // break on EOF
            self.input.lookahead(2);
            if self.input.next_is_z() {
                break;
            }

            self.read_break(&mut leading_break);

            // Eat the following indentation spaces and line breaks.
            self.skip_block_scalar_indent(indent, &mut trailing_breaks);
        }

        // Chomp the tail.
        if chomping != Chomping::Strip {
            string.push_str(&leading_break);
            // If we had reached an eof but the last character wasn't an end-of-line, check if the
            // last line was indented at least as the rest of the scalar, then we need to consider
            // there is a newline.
            if self.input.next_is_z() && self.mark.col >= indent.max(1) {
                string.push('\n');
            }
        }

        // Only the keep indicator preserves the trailing line breaks.
        if chomping == Chomping::Keep {
            string.push_str(&trailing_breaks);
        }

        Ok(Token(
            Span::new(start_mark, self.mark),
            TokenType::Scalar(style, string.into()),
        ))
    }
1765
1766    /// Retrieve the contents of the line, parsing it as a block scalar.
1767    ///
1768    /// The contents will be appended to `string`. `line_buffer` is used as a temporary buffer to
1769    /// store bytes before pushing them to `string` and thus avoiding reallocating more than
1770    /// necessary. `line_buffer` is assumed to be empty upon calling this function. It will be
1771    /// `clear`ed before the end of the function.
1772    ///
1773    /// This function assumed the first character to read is the first content character in the
1774    /// line. This function does not consume the line break character(s) after the line.
1775    fn scan_block_scalar_content_line(&mut self, string: &mut String, line_buffer: &mut String) {
1776        // Start by evaluating characters in the buffer.
1777        while !self.input.buf_is_empty() && !self.input.next_is_breakz() {
1778            string.push(self.input.peek());
1779            // We may technically skip non-blank characters. However, the only distinction is
1780            // to determine what is leading whitespace and what is not. Here, we read the
1781            // contents of the line until either eof or a linebreak. We know we will not read
1782            // `self.leading_whitespace` until the end of the line, where it will be reset.
1783            // This allows us to call a slightly less expensive function.
1784            self.skip_blank();
1785        }
1786
1787        // All characters that were in the buffer were consumed. We need to check if more
1788        // follow.
1789        if self.input.buf_is_empty() {
1790            // We will read all consecutive non-breakz characters. We push them into a
1791            // temporary buffer. The main difference with going through `self.buffer` is that
1792            // characters are appended here as their real size (1B for ascii, or up to 4 bytes for
1793            // UTF-8). We can then use the internal `line_buffer` `Vec` to push data into `string`
1794            // (using `String::push_str`).
1795            while let Some(c) = self.input.raw_read_non_breakz_ch() {
1796                line_buffer.push(c);
1797            }
1798
1799            // We need to manually update our position; we haven't called a `skip` function.
1800            let n_chars = line_buffer.chars().count();
1801            self.mark.col += n_chars;
1802            self.mark.index += n_chars;
1803
1804            // We can now append our bytes to our `string`.
1805            string.reserve(line_buffer.len());
1806            string.push_str(line_buffer);
1807            // This clears the _contents_ without touching the _capacity_.
1808            line_buffer.clear();
1809        }
1810    }
1811
1812    /// Skip the block scalar indentation and empty lines.
1813    fn skip_block_scalar_indent(&mut self, indent: usize, breaks: &mut String) {
1814        loop {
1815            // Consume all spaces. Tabs cannot be used as indentation.
1816            if indent < self.input.bufmaxlen() - 2 {
1817                self.input.lookahead(self.input.bufmaxlen());
1818                while self.mark.col < indent && self.input.peek() == ' ' {
1819                    self.skip_blank();
1820                }
1821            } else {
1822                loop {
1823                    self.input.lookahead(self.input.bufmaxlen());
1824                    while !self.input.buf_is_empty()
1825                        && self.mark.col < indent
1826                        && self.input.peek() == ' '
1827                    {
1828                        self.skip_blank();
1829                    }
1830                    // If we reached our indent, we can break. We must also break if we have
1831                    // reached content or EOF; that is, the buffer is not empty and the next
1832                    // character is not a space.
1833                    if self.mark.col == indent
1834                        || (!self.input.buf_is_empty() && self.input.peek() != ' ')
1835                    {
1836                        break;
1837                    }
1838                }
1839                self.input.lookahead(2);
1840            }
1841
1842            // If our current line is empty, skip over the break and continue looping.
1843            if self.input.next_is_break() {
1844                self.read_break(breaks);
1845            } else {
1846                // Otherwise, we have a content line. Return control.
1847                break;
1848            }
1849        }
1850    }
1851
1852    /// Determine the indentation level for a block scalar from the first line of its contents.
1853    ///
1854    /// The function skips over whitespace-only lines and sets `indent` to the the longest
1855    /// whitespace line that was encountered.
1856    fn skip_block_scalar_first_line_indent(&mut self, indent: &mut usize, breaks: &mut String) {
1857        let mut max_indent = 0;
1858        loop {
1859            // Consume all spaces. Tabs cannot be used as indentation.
1860            while self.input.look_ch() == ' ' {
1861                self.skip_blank();
1862            }
1863
1864            if self.mark.col > max_indent {
1865                max_indent = self.mark.col;
1866            }
1867
1868            if self.input.next_is_break() {
1869                // If our current line is empty, skip over the break and continue looping.
1870                self.input.lookahead(2);
1871                self.read_break(breaks);
1872            } else {
1873                // Otherwise, we have a content line. Return control.
1874                break;
1875            }
1876        }
1877
1878        // In case a yaml looks like:
1879        // ```yaml
1880        // |
1881        // foo
1882        // bar
1883        // ```
1884        // We need to set the indent to 0 and not 1. In all other cases, the indent must be at
1885        // least 1. When in the above example, `self.indent` will be set to -1.
1886        *indent = max_indent.max((self.indent + 1) as usize);
1887        if self.indent > 0 {
1888            *indent = (*indent).max(1);
1889        }
1890    }
1891
1892    fn fetch_flow_scalar(&mut self, single: bool) -> ScanResult {
1893        self.save_simple_key();
1894        self.disallow_simple_key();
1895
1896        let tok = self.scan_flow_scalar(single)?;
1897
1898        // From spec: To ensure JSON compatibility, if a key inside a flow mapping is JSON-like,
1899        // YAML allows the following value to be specified adjacent to the “:”.
1900        self.skip_to_next_token()?;
1901        self.adjacent_value_allowed_at = self.mark.index;
1902
1903        self.tokens.push_back(tok);
1904        Ok(())
1905    }
1906
1907    #[allow(clippy::too_many_lines)]
1908    fn scan_flow_scalar(&mut self, single: bool) -> Result<Token<'input>, ScanError> {
1909        let start_mark = self.mark;
1910
1911        let mut string = String::new();
1912        let mut leading_break = String::new();
1913        let mut trailing_breaks = String::new();
1914        let mut whitespaces = String::new();
1915        let mut leading_blanks;
1916
1917        /* Eat the left quote. */
1918        self.skip_non_blank();
1919
1920        loop {
1921            /* Check for a document indicator. */
1922            self.input.lookahead(4);
1923
1924            if self.mark.col == 0 && self.input.next_is_document_indicator() {
1925                return Err(ScanError::new_str(
1926                    start_mark,
1927                    "while scanning a quoted scalar, found unexpected document indicator",
1928                ));
1929            }
1930
1931            if self.input.next_is_z() {
1932                return Err(ScanError::new_str(
1933                    start_mark,
1934                    "while scanning a quoted scalar, found unexpected end of stream",
1935                ));
1936            }
1937
1938            if (self.mark.col as isize) < self.indent {
1939                return Err(ScanError::new_str(
1940                    start_mark,
1941                    "invalid indentation in quoted scalar",
1942                ));
1943            }
1944
1945            leading_blanks = false;
1946            self.consume_flow_scalar_non_whitespace_chars(
1947                single,
1948                &mut string,
1949                &mut leading_blanks,
1950                &start_mark,
1951            )?;
1952
1953            match self.input.look_ch() {
1954                '\'' if single => break,
1955                '"' if !single => break,
1956                _ => {}
1957            }
1958
1959            // Consume blank characters.
1960            while self.input.next_is_blank() || self.input.next_is_break() {
1961                if self.input.next_is_blank() {
1962                    // Consume a space or a tab character.
1963                    if leading_blanks {
1964                        if self.input.peek() == '\t' && (self.mark.col as isize) < self.indent {
1965                            return Err(ScanError::new_str(
1966                                self.mark,
1967                                "tab cannot be used as indentation",
1968                            ));
1969                        }
1970                        self.skip_blank();
1971                    } else {
1972                        whitespaces.push(self.input.peek());
1973                        self.skip_blank();
1974                    }
1975                } else {
1976                    self.input.lookahead(2);
1977                    // Check if it is a first line break.
1978                    if leading_blanks {
1979                        self.read_break(&mut trailing_breaks);
1980                    } else {
1981                        whitespaces.clear();
1982                        self.read_break(&mut leading_break);
1983                        leading_blanks = true;
1984                    }
1985                }
1986                self.input.lookahead(1);
1987            }
1988
1989            // Join the whitespaces or fold line breaks.
1990            if leading_blanks {
1991                if leading_break.is_empty() {
1992                    string.push_str(&leading_break);
1993                    string.push_str(&trailing_breaks);
1994                    trailing_breaks.clear();
1995                    leading_break.clear();
1996                } else {
1997                    if trailing_breaks.is_empty() {
1998                        string.push(' ');
1999                    } else {
2000                        string.push_str(&trailing_breaks);
2001                        trailing_breaks.clear();
2002                    }
2003                    leading_break.clear();
2004                }
2005            } else {
2006                string.push_str(&whitespaces);
2007                whitespaces.clear();
2008            }
2009        } // loop
2010
2011        // Eat the right quote.
2012        self.skip_non_blank();
2013        // Ensure there is no invalid trailing content.
2014        self.skip_ws_to_eol(SkipTabs::Yes)?;
2015        match self.input.peek() {
2016            // These can be encountered in flow sequences or mappings.
2017            ',' | '}' | ']' if self.flow_level > 0 => {}
2018            // An end-of-line / end-of-stream is fine. No trailing content.
2019            c if is_breakz(c) => {}
2020            // ':' can be encountered if our scalar is a key.
2021            // Outside of flow contexts, keys cannot span multiple lines
2022            ':' if self.flow_level == 0 && start_mark.line == self.mark.line => {}
2023            // Inside a flow context, this is allowed.
2024            ':' if self.flow_level > 0 => {}
2025            _ => {
2026                return Err(ScanError::new_str(
2027                    self.mark,
2028                    "invalid trailing content after double-quoted scalar",
2029                ));
2030            }
2031        }
2032
2033        let style = if single {
2034            ScalarStyle::SingleQuoted
2035        } else {
2036            ScalarStyle::DoubleQuoted
2037        };
2038        Ok(Token(
2039            Span::new(start_mark, self.mark),
2040            TokenType::Scalar(style, string.into()),
2041        ))
2042    }
2043
2044    /// Consume successive non-whitespace characters from a flow scalar.
2045    ///
2046    /// This function resolves escape sequences and stops upon encountering a whitespace, the end
2047    /// of the stream or the closing character for the scalar (`'` for single quoted scalars, `"`
2048    /// for double quoted scalars).
2049    ///
2050    /// # Errors
2051    /// Return an error if an invalid escape sequence is found.
2052    fn consume_flow_scalar_non_whitespace_chars(
2053        &mut self,
2054        single: bool,
2055        string: &mut String,
2056        leading_blanks: &mut bool,
2057        start_mark: &Marker,
2058    ) -> Result<(), ScanError> {
2059        self.input.lookahead(2);
2060        while !is_blank_or_breakz(self.input.peek()) {
2061            match self.input.peek() {
2062                // Check for an escaped single quote.
2063                '\'' if self.input.peek_nth(1) == '\'' && single => {
2064                    string.push('\'');
2065                    self.skip_n_non_blank(2);
2066                }
2067                // Check for the right quote.
2068                '\'' if single => break,
2069                '"' if !single => break,
2070                // Check for an escaped line break.
2071                '\\' if !single && is_break(self.input.peek_nth(1)) => {
2072                    self.input.lookahead(3);
2073                    self.skip_non_blank();
2074                    self.skip_linebreak();
2075                    *leading_blanks = true;
2076                    break;
2077                }
2078                // Check for an escape sequence.
2079                '\\' if !single => {
2080                    string.push(self.resolve_flow_scalar_escape_sequence(start_mark)?);
2081                }
2082                c => {
2083                    string.push(c);
2084                    self.skip_non_blank();
2085                }
2086            }
2087            self.input.lookahead(2);
2088        }
2089        Ok(())
2090    }
2091
2092    /// Escape the sequence we encounter in a flow scalar.
2093    ///
2094    /// `self.input.peek()` must point to the `\` starting the escape sequence.
2095    ///
2096    /// # Errors
2097    /// Return an error if an invalid escape sequence is found.
2098    fn resolve_flow_scalar_escape_sequence(
2099        &mut self,
2100        start_mark: &Marker,
2101    ) -> Result<char, ScanError> {
2102        let mut code_length = 0usize;
2103        let mut ret = '\0';
2104
2105        match self.input.peek_nth(1) {
2106            '0' => ret = '\0',
2107            'a' => ret = '\x07',
2108            'b' => ret = '\x08',
2109            't' | '\t' => ret = '\t',
2110            'n' => ret = '\n',
2111            'v' => ret = '\x0b',
2112            'f' => ret = '\x0c',
2113            'r' => ret = '\x0d',
2114            'e' => ret = '\x1b',
2115            ' ' => ret = '\x20',
2116            '"' => ret = '"',
2117            '/' => ret = '/',
2118            '\\' => ret = '\\',
2119            // Unicode next line (#x85)
2120            'N' => ret = char::from_u32(0x85).unwrap(),
2121            // Unicode non-breaking space (#xA0)
2122            '_' => ret = char::from_u32(0xA0).unwrap(),
2123            // Unicode line separator (#x2028)
2124            'L' => ret = char::from_u32(0x2028).unwrap(),
2125            // Unicode paragraph separator (#x2029)
2126            'P' => ret = char::from_u32(0x2029).unwrap(),
2127            'x' => code_length = 2,
2128            'u' => code_length = 4,
2129            'U' => code_length = 8,
2130            _ => {
2131                return Err(ScanError::new_str(
2132                    *start_mark,
2133                    "while parsing a quoted scalar, found unknown escape character",
2134                ))
2135            }
2136        }
2137        self.skip_n_non_blank(2);
2138
2139        // Consume an arbitrary escape code.
2140        if code_length > 0 {
2141            self.input.lookahead(code_length);
2142            let mut value = 0u32;
2143            for i in 0..code_length {
2144                let c = self.input.peek_nth(i);
2145                if !is_hex(c) {
2146                    return Err(ScanError::new_str(
2147                        *start_mark,
2148                        "while parsing a quoted scalar, did not find expected hexadecimal number",
2149                    ));
2150                }
2151                value = (value << 4) + as_hex(c);
2152            }
2153
2154            let Some(ch) = char::from_u32(value) else {
2155                return Err(ScanError::new_str(
2156                    *start_mark,
2157                    "while parsing a quoted scalar, found invalid Unicode character escape code",
2158                ));
2159            };
2160            ret = ch;
2161
2162            self.skip_n_non_blank(code_length);
2163        }
2164        Ok(ret)
2165    }
2166
2167    fn fetch_plain_scalar(&mut self) -> ScanResult {
2168        self.save_simple_key();
2169        self.disallow_simple_key();
2170
2171        let tok = self.scan_plain_scalar()?;
2172
2173        self.tokens.push_back(tok);
2174        Ok(())
2175    }
2176
    /// Scan for a plain scalar.
    ///
    /// Plain scalars are the most readable but restricted style. They may span multiple lines in
    /// some contexts.
    ///
    /// The scalar is read as alternating runs of content characters and runs of blanks / line
    /// breaks. Blanks between content on the same line are kept verbatim. A single line break
    /// between two content runs is folded into a space; when further breaks follow it, one `\n`
    /// per additional break is emitted instead. Trailing blanks and breaks are not part of the
    /// scalar: the returned span ends right after the last content character.
    ///
    /// # Errors
    /// Returns an error if:
    ///   - the scalar starts at a column below the current indentation inside a flow construct,
    ///   - inside a flow construct, a `-` is directly followed by one of `,[]{}`,
    ///   - a tab is used in the indentation columns of a line that is not otherwise empty,
    ///   - `skip_ws_to_eol` reports an error,
    ///   - no character could be consumed at all.
    #[allow(clippy::too_many_lines)]
    fn scan_plain_scalar(&mut self) -> Result<Token<'input>, ScanError> {
        self.unroll_non_block_indents();
        // Continuation lines must be indented by at least this many columns.
        let indent = self.indent + 1;
        let start_mark = self.mark;

        if self.flow_level > 0 && (start_mark.col as isize) < indent {
            return Err(ScanError::new_str(
                start_mark,
                "invalid indentation in flow construct",
            ));
        }

        let mut string = String::with_capacity(32);
        // These scanner-owned buffers are reused across calls; start from a clean state.
        self.buf_whitespaces.clear();
        self.buf_leading_break.clear();
        self.buf_trailing_breaks.clear();
        // Position right after the last content character read so far.
        let mut end_mark = self.mark;

        loop {
            self.input.lookahead(4);
            // A document indicator at the start of a line or a `#` ends the scalar.
            if (self.leading_whitespace && self.input.next_is_document_indicator())
                || self.input.peek() == '#'
            {
                break;
            }

            if self.flow_level > 0 && self.input.peek() == '-' && is_flow(self.input.peek_nth(1)) {
                return Err(ScanError::new_str(
                    self.mark,
                    "plain scalar cannot start with '-' followed by ,[]{}",
                ));
            }

            if !self.input.next_is_blank_or_breakz()
                && self.input.next_can_be_plain_scalar(self.flow_level > 0)
            {
                // We are about to read content. First flush the blanks / breaks that were
                // buffered since the previous content run.
                if self.leading_whitespace {
                    if self.buf_leading_break.is_empty() {
                        // No line break was buffered by this call. `buf_leading_break` is
                        // empty here, so pushing it adds nothing.
                        string.push_str(&self.buf_leading_break);
                        string.push_str(&self.buf_trailing_breaks);
                        self.buf_trailing_breaks.clear();
                        self.buf_leading_break.clear();
                    } else {
                        if self.buf_trailing_breaks.is_empty() {
                            // A lone line break folds into a single space.
                            string.push(' ');
                        } else {
                            // The first break is dropped; the following ones are kept.
                            string.push_str(&self.buf_trailing_breaks);
                            self.buf_trailing_breaks.clear();
                        }
                        self.buf_leading_break.clear();
                    }
                    self.leading_whitespace = false;
                } else if !self.buf_whitespaces.is_empty() {
                    // Blanks between two content runs on the same line are kept as-is.
                    string.push_str(&self.buf_whitespaces);
                    self.buf_whitespaces.clear();
                }

                // We can unroll the first iteration of the loop.
                string.push(self.input.peek());
                self.skip_non_blank();
                string.reserve(self.input.bufmaxlen());

                // Add content non-blank characters to the scalar.
                let mut end = false;
                while !end {
                    // Fill the buffer once and process all characters in the buffer until the next
                    // fetch. Note that `next_can_be_plain_scalar` needs 2 lookahead characters,
                    // hence the `for` loop looping `self.input.bufmaxlen() - 1` times.
                    self.input.lookahead(self.input.bufmaxlen());
                    for _ in 0..self.input.bufmaxlen() - 1 {
                        if self.input.next_is_blank_or_breakz()
                            || !self.input.next_can_be_plain_scalar(self.flow_level > 0)
                        {
                            end = true;
                            break;
                        }
                        string.push(self.input.peek());
                        self.skip_non_blank();
                    }
                }
                // Remember where the content ends so that blanks / breaks consumed afterwards
                // are not included in the token's span.
                end_mark = self.mark;
            }

            // We may reach the end of a plain scalar if:
            //  - We reach eof
            //  - We reach ": "
            //  - We find a flow character in a flow context
            if !(self.input.next_is_blank() || self.input.next_is_break()) {
                break;
            }

            // Process blank characters.
            self.input.lookahead(2);
            while self.input.next_is_blank_or_break() {
                if self.input.next_is_blank() {
                    if !self.leading_whitespace {
                        // Blanks after content on the same line: keep them in case more content
                        // follows on this line.
                        self.buf_whitespaces.push(self.input.peek());
                        self.skip_blank();
                    } else if (self.mark.col as isize) < indent && self.input.peek() == '\t' {
                        // Tabs in an indentation columns are allowed if and only if the line is
                        // empty. Skip to the end of the line.
                        self.skip_ws_to_eol(SkipTabs::Yes)?;
                        if !self.input.next_is_breakz() {
                            return Err(ScanError::new_str(
                                start_mark,
                                "while scanning a plain scalar, found a tab",
                            ));
                        }
                    } else {
                        // Blanks at the start of a line are discarded.
                        self.skip_blank();
                    }
                } else {
                    // Check if it is a first line break
                    if self.leading_whitespace {
                        // Every break is recorded as `\n`, whatever was actually consumed.
                        self.skip_break();
                        self.buf_trailing_breaks.push('\n');
                    } else {
                        // Blanks preceding a line break are not part of the scalar.
                        self.buf_whitespaces.clear();
                        self.skip_break();
                        self.buf_leading_break.push('\n');
                        self.leading_whitespace = true;
                    }
                }
                self.input.lookahead(2);
            }

            // check indentation level
            // In block context, a line indented less than `indent` ends the scalar.
            if self.flow_level == 0 && (self.mark.col as isize) < indent {
                break;
            }
        }

        // We stopped at the start of a line: a simple key may begin here.
        if self.leading_whitespace {
            self.allow_simple_key();
        }

        if string.is_empty() {
            // `fetch_plain_scalar` must absolutely consume at least one byte. Otherwise,
            // `fetch_next_token` will never stop calling it. An empty plain scalar may happen with
            // erroneous inputs such as "{...".
            Err(ScanError::new_str(
                start_mark,
                "unexpected end of plain scalar",
            ))
        } else {
            Ok(Token(
                Span::new(start_mark, end_mark),
                TokenType::Scalar(ScalarStyle::Plain, string.into()),
            ))
        }
    }
2333
2334    fn fetch_key(&mut self) -> ScanResult {
2335        let start_mark = self.mark;
2336        if self.flow_level == 0 {
2337            // Check if we are allowed to start a new key (not necessarily simple).
2338            if !self.simple_key_allowed {
2339                return Err(ScanError::new_str(
2340                    self.mark,
2341                    "mapping keys are not allowed in this context",
2342                ));
2343            }
2344            self.roll_indent(
2345                start_mark.col,
2346                None,
2347                TokenType::BlockMappingStart,
2348                start_mark,
2349            );
2350        } else {
2351            // The scanner, upon emitting a `Key`, will prepend a `MappingStart` event.
2352            self.flow_mapping_started = true;
2353        }
2354
2355        self.remove_simple_key()?;
2356
2357        if self.flow_level == 0 {
2358            self.allow_simple_key();
2359        } else {
2360            self.disallow_simple_key();
2361        }
2362
2363        self.skip_non_blank();
2364        self.skip_yaml_whitespace()?;
2365        if self.input.peek() == '\t' {
2366            return Err(ScanError::new_str(
2367                self.mark(),
2368                "tabs disallowed in this context",
2369            ));
2370        }
2371        self.tokens
2372            .push_back(Token(Span::new(start_mark, self.mark), TokenType::Key));
2373        Ok(())
2374    }
2375
2376    /// Fetch a value in a mapping inside of a flow collection.
2377    ///
2378    /// This must not be called if [`self.flow_level`] is 0. This ensures the rules surrounding
2379    /// values in flow collections are respected prior to calling [`fetch_value`].
2380    ///
2381    /// [`self.flow_level`]: Self::flow_level
2382    /// [`fetch_value`]: Self::fetch_value
2383    fn fetch_flow_value(&mut self) -> ScanResult {
2384        let nc = self.input.peek_nth(1);
2385
2386        // If we encounter a ':' inside a flow collection and it is not immediately
2387        // followed by a blank or breakz:
2388        //   - We must check whether an adjacent value is allowed
2389        //     `["a":[]]` is valid. If the key is double-quoted, no need for a space. This
2390        //     is needed for JSON compatibility.
2391        //   - If not, we must ensure there is a space after the ':' and before its value.
2392        //     `[a: []]` is valid while `[a:[]]` isn't. `[a:b]` is treated as `["a:b"]`.
2393        //   - But if the value is empty (null), then it's okay.
2394        // The last line is for YAMLs like `[a:]`. The ':' is followed by a ']' (which is a
2395        // flow character), but the ']' is not the value. The value is an invisible empty
2396        // space which is represented as null ('~').
2397        if self.mark.index != self.adjacent_value_allowed_at && (nc == '[' || nc == '{') {
2398            return Err(ScanError::new_str(
2399                self.mark,
2400                "':' may not precede any of `[{` in flow mapping",
2401            ));
2402        }
2403
2404        self.fetch_value()
2405    }
2406
2407    /// Fetch a value from a mapping (after a `:`).
2408    fn fetch_value(&mut self) -> ScanResult {
2409        let sk = self.simple_keys.last().unwrap().clone();
2410        let start_mark = self.mark;
2411        let is_implicit_flow_mapping =
2412            !self.implicit_flow_mapping_states.is_empty() && !self.flow_mapping_started;
2413        if is_implicit_flow_mapping {
2414            *self.implicit_flow_mapping_states.last_mut().unwrap() = ImplicitMappingState::Inside;
2415        }
2416
2417        // Skip over ':'.
2418        self.skip_non_blank();
2419        if self.input.look_ch() == '\t'
2420            && !self.skip_ws_to_eol(SkipTabs::Yes)?.has_valid_yaml_ws()
2421            && (self.input.peek() == '-' || self.input.next_is_alpha())
2422        {
2423            return Err(ScanError::new_str(
2424                self.mark,
2425                "':' must be followed by a valid YAML whitespace",
2426            ));
2427        }
2428
2429        if sk.possible {
2430            // insert simple key
2431            let tok = Token(Span::empty(sk.mark), TokenType::Key);
2432            self.insert_token(sk.token_number - self.tokens_parsed, tok);
2433            if is_implicit_flow_mapping {
2434                if sk.mark.line < start_mark.line {
2435                    return Err(ScanError::new_str(
2436                        start_mark,
2437                        "illegal placement of ':' indicator",
2438                    ));
2439                }
2440                self.insert_token(
2441                    sk.token_number - self.tokens_parsed,
2442                    Token(Span::empty(sk.mark), TokenType::FlowMappingStart),
2443                );
2444            }
2445
2446            // Add the BLOCK-MAPPING-START token if needed.
2447            self.roll_indent(
2448                sk.mark.col,
2449                Some(sk.token_number),
2450                TokenType::BlockMappingStart,
2451                sk.mark,
2452            );
2453            self.roll_one_col_indent();
2454
2455            self.simple_keys.last_mut().unwrap().possible = false;
2456            self.disallow_simple_key();
2457        } else {
2458            if is_implicit_flow_mapping {
2459                self.tokens
2460                    .push_back(Token(Span::empty(start_mark), TokenType::FlowMappingStart));
2461            }
2462            // The ':' indicator follows a complex key.
2463            if self.flow_level == 0 {
2464                if !self.simple_key_allowed {
2465                    return Err(ScanError::new_str(
2466                        start_mark,
2467                        "mapping values are not allowed in this context",
2468                    ));
2469                }
2470
2471                self.roll_indent(
2472                    start_mark.col,
2473                    None,
2474                    TokenType::BlockMappingStart,
2475                    start_mark,
2476                );
2477            }
2478            self.roll_one_col_indent();
2479
2480            if self.flow_level == 0 {
2481                self.allow_simple_key();
2482            } else {
2483                self.disallow_simple_key();
2484            }
2485        }
2486        self.tokens
2487            .push_back(Token(Span::empty(start_mark), TokenType::Value));
2488
2489        Ok(())
2490    }
2491
2492    /// Add an indentation level to the stack with the given block token, if needed.
2493    ///
2494    /// An indentation level is added only if:
2495    ///   - We are not in a flow-style construct (which don't have indentation per-se).
2496    ///   - The current column is further indented than the last indent we have registered.
2497    fn roll_indent(
2498        &mut self,
2499        col: usize,
2500        number: Option<usize>,
2501        tok: TokenType<'input>,
2502        mark: Marker,
2503    ) {
2504        if self.flow_level > 0 {
2505            return;
2506        }
2507
2508        // If the last indent was a non-block indent, remove it.
2509        // This means that we prepared an indent that we thought we wouldn't use, but realized just
2510        // now that it is a block indent.
2511        if self.indent <= col as isize {
2512            if let Some(indent) = self.indents.last() {
2513                if !indent.needs_block_end {
2514                    self.indent = indent.indent;
2515                    self.indents.pop();
2516                }
2517            }
2518        }
2519
2520        if self.indent < col as isize {
2521            self.indents.push(Indent {
2522                indent: self.indent,
2523                needs_block_end: true,
2524            });
2525            self.indent = col as isize;
2526            let tokens_parsed = self.tokens_parsed;
2527            match number {
2528                Some(n) => self.insert_token(n - tokens_parsed, Token(Span::empty(mark), tok)),
2529                None => self.tokens.push_back(Token(Span::empty(mark), tok)),
2530            }
2531        }
2532    }
2533
2534    /// Pop indentation levels from the stack as much as needed.
2535    ///
2536    /// Indentation levels are popped from the stack while they are further indented than `col`.
2537    /// If we are in a flow-style construct (which don't have indentation per-se), this function
2538    /// does nothing.
2539    fn unroll_indent(&mut self, col: isize) {
2540        if self.flow_level > 0 {
2541            return;
2542        }
2543        while self.indent > col {
2544            let indent = self.indents.pop().unwrap();
2545            self.indent = indent.indent;
2546            if indent.needs_block_end {
2547                self.tokens
2548                    .push_back(Token(Span::empty(self.mark), TokenType::BlockEnd));
2549            }
2550        }
2551    }
2552
2553    /// Add an indentation level of 1 column that does not start a block.
2554    ///
2555    /// See the documentation of [`Indent::needs_block_end`] for more details.
2556    /// An indentation is not added if we are inside a flow level or if the last indent is already
2557    /// a non-block indent.
2558    fn roll_one_col_indent(&mut self) {
2559        if self.flow_level == 0 && self.indents.last().map_or(false, |x| x.needs_block_end) {
2560            self.indents.push(Indent {
2561                indent: self.indent,
2562                needs_block_end: false,
2563            });
2564            self.indent += 1;
2565        }
2566    }
2567
2568    /// Unroll all last indents created with [`Self::roll_one_col_indent`].
2569    fn unroll_non_block_indents(&mut self) {
2570        while let Some(indent) = self.indents.last() {
2571            if indent.needs_block_end {
2572                break;
2573            }
2574            self.indent = indent.indent;
2575            self.indents.pop();
2576        }
2577    }
2578
2579    /// Mark the next token to be inserted as a potential simple key.
2580    fn save_simple_key(&mut self) {
2581        if self.simple_key_allowed {
2582            let required = self.flow_level == 0
2583                && self.indent == (self.mark.col as isize)
2584                && self.indents.last().unwrap().needs_block_end;
2585            let mut sk = SimpleKey::new(self.mark);
2586            sk.possible = true;
2587            sk.required = required;
2588            sk.token_number = self.tokens_parsed + self.tokens.len();
2589
2590            self.simple_keys.pop();
2591            self.simple_keys.push(sk);
2592        }
2593    }
2594
2595    fn remove_simple_key(&mut self) -> ScanResult {
2596        let last = self.simple_keys.last_mut().unwrap();
2597        if last.possible && last.required {
2598            return Err(ScanError::new_str(self.mark, "simple key expected"));
2599        }
2600
2601        last.possible = false;
2602        Ok(())
2603    }
2604
2605    /// Return whether the scanner is inside a block but outside of a flow sequence.
2606    fn is_within_block(&self) -> bool {
2607        !self.indents.is_empty()
2608    }
2609
2610    /// If an implicit mapping had started, end it.
2611    ///
2612    /// This function does not pop the state in [`implicit_flow_mapping_states`].
2613    ///
2614    /// [`implicit_flow_mapping_states`]: Self::implicit_flow_mapping_states
2615    fn end_implicit_mapping(&mut self, mark: Marker) {
2616        if let Some(implicit_mapping) = self.implicit_flow_mapping_states.last_mut() {
2617            if *implicit_mapping == ImplicitMappingState::Inside {
2618                self.flow_mapping_started = false;
2619                *implicit_mapping = ImplicitMappingState::Possible;
2620                self.tokens
2621                    .push_back(Token(Span::empty(mark), TokenType::FlowMappingEnd));
2622            }
2623        }
2624    }
2625}
2626
/// Chomping, how final line breaks and trailing empty lines are interpreted.
///
/// See YAML spec 8.1.1.2.
///
/// Like the other public field-less enums of this module, this type is `Copy` and `Debug` so
/// that it can be passed by value and used in assertions and diagnostics.
#[derive(Clone, Copy, PartialEq, Debug, Eq)]
pub enum Chomping {
    /// The final line break and any trailing empty lines are excluded.
    Strip,
    /// The final line break is preserved, but trailing empty lines are excluded.
    Clip,
    /// The final line break and trailing empty lines are included.
    Keep,
}
2639
#[cfg(test)]
mod test {
    use super::is_anchor_char;

    /// A plain ASCII letter is accepted as an anchor character.
    #[test]
    fn test_is_anchor_char() {
        assert!(is_anchor_char('x'));
    }
}
saphyr_parser/scanner.rs

saphyr_parser/
scanner.rs