#![allow(clippy::cast_possible_wrap)]
#![allow(clippy::cast_sign_loss)]
use alloc::{
borrow::{Cow, ToOwned},
collections::VecDeque,
string::String,
vec::Vec,
};
use core::{char, fmt};
use crate::{
char_traits::{
as_hex, is_anchor_char, is_blank_or_breakz, is_bom, is_break, is_breakz, is_flow, is_hex,
is_tag_char, is_uri_char,
},
input::{BorrowedInput, SkipTabs},
};
/// Maximum distance, in characters, between a candidate simple key and the
/// current position before `stale_simple_keys` stops considering the candidate.
const SIMPLE_KEY_MAX_LOOKAHEAD: usize = 1024;
/// Character encoding carried by [`TokenType::StreamStart`].
#[derive(Clone, Copy, PartialEq, Debug, Eq)]
pub enum TEncoding {
/// UTF-8; the only encoding this enum can express.
Utf8,
}
/// Presentation style of a scalar, carried by [`TokenType::Scalar`].
#[derive(Clone, Copy, PartialEq, Debug, Eq, Hash, PartialOrd, Ord)]
pub enum ScalarStyle {
/// Plain (unquoted) scalar.
Plain,
/// Scalar written between single quotes.
SingleQuoted,
/// Scalar written between double quotes.
DoubleQuoted,
/// Literal block scalar (presumably the `|` indicator — confirm against the block scalar scanner).
Literal,
/// Folded block scalar (presumably the `>` indicator — confirm against the block scalar scanner).
Folded,
}
/// Offsets of a position in the input, counted in characters and, when the
/// input can report it, in bytes.
#[derive(Clone, Copy, Debug, Default)]
pub struct MarkerOffsets {
    /// Offset in characters.
    chars: usize,
    /// Offset in bytes, or `None` when no byte offset is known.
    bytes: Option<usize>,
}

impl PartialEq for MarkerOffsets {
    /// Compares the character offsets only.
    ///
    /// The byte offset is ignored, so a marker with a byte offset compares
    /// equal to the same marker without one.
    fn eq(&self, other: &Self) -> bool {
        self.chars == other.chars
    }
}

impl Eq for MarkerOffsets {}

/// A position in the input: character index, line and column, plus an
/// optional byte offset.
#[derive(Clone, Copy, PartialEq, Debug, Eq, Default)]
pub struct Marker {
    /// Character (and optional byte) offsets of the position.
    offsets: MarkerOffsets,
    /// Line of the position.
    line: usize,
    /// Column of the position.
    col: usize,
}

impl Marker {
    /// Creates a marker at character `index`, on `line` and `col`, with no
    /// byte offset attached.
    #[must_use]
    pub fn new(index: usize, line: usize, col: usize) -> Marker {
        Marker {
            offsets: MarkerOffsets {
                chars: index,
                bytes: None,
            },
            line,
            col,
        }
    }

    /// Returns this marker with its byte offset replaced by `byte_offset`.
    #[must_use]
    pub fn with_byte_offset(mut self, byte_offset: Option<usize>) -> Marker {
        self.offsets.bytes = byte_offset;
        self
    }

    /// Character index of the position.
    #[must_use]
    pub fn index(&self) -> usize {
        self.offsets.chars
    }

    /// Byte offset of the position, if one was recorded.
    #[must_use]
    pub fn byte_offset(&self) -> Option<usize> {
        self.offsets.bytes
    }

    /// Line of the position.
    #[must_use]
    pub fn line(&self) -> usize {
        self.line
    }

    /// Column of the position.
    #[must_use]
    pub fn col(&self) -> usize {
        self.col
    }
}

/// A region of the input delimited by two markers, with an optional
/// indentation value attached.
#[derive(Clone, Copy, PartialEq, Debug, Eq, Default)]
pub struct Span {
    /// Where the region begins.
    pub start: Marker,
    /// Where the region ends.
    pub end: Marker,
    /// Indentation associated with the span, if any.
    pub indent: Option<usize>,
}

impl Span {
    /// Creates a span from `start` to `end` with no indentation attached.
    #[must_use]
    pub fn new(start: Marker, end: Marker) -> Span {
        Span {
            start,
            end,
            indent: None,
        }
    }

    /// Creates a zero-length span located at `mark`.
    #[must_use]
    pub fn empty(mark: Marker) -> Span {
        Span {
            start: mark,
            end: mark,
            indent: None,
        }
    }

    /// Returns this span with its indentation replaced by `indent`.
    #[must_use]
    pub fn with_indent(mut self, indent: Option<usize>) -> Span {
        self.indent = indent;
        self
    }

    /// Length of the span in characters.
    ///
    /// `start` and `end` are public fields, so nothing guarantees that `end`
    /// does not precede `start`. Such a span is reported as having length 0
    /// instead of underflowing (which would panic in debug builds and wrap
    /// to a huge value in release builds).
    #[must_use]
    pub fn len(&self) -> usize {
        self.end.index().saturating_sub(self.start.index())
    }

    /// Whether the span covers no characters.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Byte range covered by the span, or `None` if either end lacks a byte
    /// offset.
    #[must_use]
    pub fn byte_range(&self) -> Option<core::ops::Range<usize>> {
        let start = self.start.byte_offset()?;
        let end = self.end.byte_offset()?;
        Some(start..end)
    }

    /// The part of `source` covered by the span.
    ///
    /// Returns `None` when no byte range is available or when the range is
    /// not a valid slice of `source`.
    #[must_use]
    pub fn slice<'source>(&self, source: &'source str) -> Option<&'source str> {
        source.get(self.byte_range()?)
    }
}
/// Where a comment sits relative to the content around it.
#[derive(Clone, Copy, PartialEq, Debug, Eq, Default)]
pub enum Placement {
/// NOTE(review): not assigned in the visible scanner code; presumably a
/// comment placed above the item it describes — confirm.
Above,
/// Comment that follows other content on the same line (the scanner picks
/// this when the line already holds non-whitespace content).
Right,
/// Comment preceded only by whitespace on its line; also the default.
#[default]
Free,
/// NOTE(review): not assigned in the visible scanner code; presumably a
/// trailing comment — confirm.
Last,
}
/// A comment found in the input, with its location and placement.
#[derive(Clone, PartialEq, Debug, Eq)]
pub struct Comment<'input> {
/// Region of the input covered by the comment.
pub span: Span,
/// Text of the comment; borrowed from the input when possible.
pub text: Cow<'input, str>,
/// Where the comment sits relative to surrounding content.
pub placement: Placement,
}
impl<'input> Comment<'input> {
    /// Builds a comment covering `span` with the given `text`.
    ///
    /// The placement starts out as [`Placement::Free`]; use
    /// [`Comment::with_placement`] to change it.
    #[must_use]
    pub fn new(span: Span, text: impl Into<Cow<'input, str>>) -> Self {
        let text = text.into();
        Self {
            placement: Placement::Free,
            text,
            span,
        }
    }

    /// Returns this comment with its placement replaced by `placement`.
    #[must_use]
    pub fn with_placement(self, placement: Placement) -> Self {
        Self { placement, ..self }
    }

    /// The comment text with leading and trailing whitespace removed.
    #[must_use]
    pub fn trimmed_text(&self) -> &str {
        let raw: &str = &self.text;
        raw.trim()
    }
}
impl AsRef<str> for Comment<'_> {
    /// Exposes the untrimmed comment text.
    fn as_ref(&self) -> &str {
        &self.text
    }
}
/// An error reported by the scanner, with the position it refers to.
#[derive(Clone, PartialEq, Debug, Eq)]
pub struct ScanError {
/// Position the error refers to.
mark: Marker,
/// Human-readable description of the error.
info: String,
}
impl ScanError {
#[must_use]
#[cold]
pub fn new(loc: Marker, info: String) -> ScanError {
ScanError { mark: loc, info }
}
#[must_use]
#[cold]
pub fn new_str(loc: Marker, info: &str) -> ScanError {
ScanError {
mark: loc,
info: info.to_owned(),
}
}
#[cold]
pub(crate) fn into_result<T>(self) -> Result<T, ScanError> {
Err(self)
}
#[must_use]
pub fn marker(&self) -> &Marker {
&self.mark
}
#[must_use]
pub fn info(&self) -> &str {
self.info.as_ref()
}
}
impl fmt::Display for ScanError {
    /// Writes the message followed by the character index, line and column.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self { mark, info } = self;
        // The stored column is shown incremented by one.
        let shown_column = mark.col() + 1;
        write!(
            f,
            "{} at char {} line {} column {}",
            info,
            mark.index(),
            mark.line(),
            shown_column
        )
    }
}
// `ScanError` uses the default `Error` methods; it exposes no source error.
impl core::error::Error for ScanError {}
/// The kind of a token produced by the scanner, with its payload.
#[derive(Clone, PartialEq, Debug, Eq)]
pub enum TokenType<'input> {
/// Start of the stream, with its encoding.
StreamStart(TEncoding),
/// End of the stream; no token follows it.
StreamEnd,
/// A `%YAML` directive.
VersionDirective(
/// Major version number.
u32,
/// Minor version number.
u32,
),
/// A `%TAG` directive.
TagDirective(
/// Tag handle.
Cow<'input, str>,
/// Tag prefix.
Cow<'input, str>,
),
/// Document start indicator.
DocumentStart,
/// Document end indicator.
DocumentEnd,
/// Start of a block sequence.
BlockSequenceStart,
/// Start of a block mapping.
BlockMappingStart,
/// End of a block collection.
BlockEnd,
/// `[`
FlowSequenceStart,
/// `]`
FlowSequenceEnd,
/// `{`
FlowMappingStart,
/// `}`
FlowMappingEnd,
/// `-` introducing a block sequence entry.
BlockEntry,
/// `,` separating flow collection entries.
FlowEntry,
/// A mapping key indicator.
Key,
/// A mapping value indicator.
Value,
/// An alias (`*name`), carrying the name.
Alias(Cow<'input, str>),
/// An anchor (`&name`), carrying the name.
Anchor(Cow<'input, str>),
/// A tag.
Tag(
/// Tag handle.
Cow<'input, str>,
/// Tag suffix.
Cow<'input, str>,
),
/// A scalar with its style and value.
Scalar(ScalarStyle, Cow<'input, str>),
/// A comment.
Comment(
/// The comment, including its own span and placement.
Comment<'input>,
),
/// A directive other than `YAML` or `TAG`.
ReservedDirective(
/// Directive name.
String,
/// Directive parameters, in order of appearance.
Vec<String>,
),
}
/// A token handed out by the scanner: where it was found and what it is.
#[derive(Clone, PartialEq, Debug, Eq)]
pub struct Token<'input>(
/// Region of the input the token covers.
pub Span,
/// Kind and payload of the token.
pub TokenType<'input>,
);
/// Queue-internal form of [`Comment`] that omits the span; the span is taken
/// from the enclosing [`QueuedToken`] when converting back to the public type.
#[derive(Clone, PartialEq, Debug, Eq)]
pub(crate) struct QueuedComment<'input> {
/// Text of the comment.
pub(crate) text: Cow<'input, str>,
/// Placement of the comment.
pub(crate) placement: Placement,
}
impl<'input> QueuedComment<'input> {
fn into_public(self, span: Span) -> Comment<'input> {
Comment::new(span, self.text).with_placement(self.placement)
}
}
impl<'input> From<Comment<'input>> for QueuedComment<'input> {
fn from(comment: Comment<'input>) -> Self {
Self {
text: comment.text,
placement: comment.placement,
}
}
}
/// Queue-internal mirror of [`TokenType`].
///
/// Every variant matches the public variant of the same name; the only
/// difference is that `Comment` stores a [`QueuedComment`], which has no span.
#[derive(Clone, PartialEq, Debug, Eq)]
pub(crate) enum QueuedTokenType<'input> {
StreamStart(TEncoding),
StreamEnd,
// (major, minor)
VersionDirective(u32, u32),
// (handle, prefix)
TagDirective(Cow<'input, str>, Cow<'input, str>),
DocumentStart,
DocumentEnd,
BlockSequenceStart,
BlockMappingStart,
BlockEnd,
FlowSequenceStart,
FlowSequenceEnd,
FlowMappingStart,
FlowMappingEnd,
BlockEntry,
FlowEntry,
Key,
Value,
Alias(Cow<'input, str>),
Anchor(Cow<'input, str>),
// (handle, suffix)
Tag(Cow<'input, str>, Cow<'input, str>),
Scalar(ScalarStyle, Cow<'input, str>),
// Span-less comment; the span lives on the enclosing `QueuedToken`.
Comment(QueuedComment<'input>),
// (name, parameters)
ReservedDirective(String, Vec<String>),
}
impl<'input> QueuedTokenType<'input> {
    /// Converts to the public [`TokenType`].
    ///
    /// `span` is only used by the `Comment` variant, whose public form carries
    /// its own span.
    fn into_public(self, span: Span) -> TokenType<'input> {
        match self {
            // The one variant that needs the span.
            Self::Comment(queued) => TokenType::Comment(queued.into_public(span)),

            // Variants with a payload that is moved over unchanged.
            Self::StreamStart(enc) => TokenType::StreamStart(enc),
            Self::VersionDirective(maj, min) => TokenType::VersionDirective(maj, min),
            Self::TagDirective(h, p) => TokenType::TagDirective(h, p),
            Self::Alias(n) => TokenType::Alias(n),
            Self::Anchor(n) => TokenType::Anchor(n),
            Self::Tag(h, s) => TokenType::Tag(h, s),
            Self::Scalar(st, v) => TokenType::Scalar(st, v),
            Self::ReservedDirective(n, ps) => TokenType::ReservedDirective(n, ps),

            // Payload-free variants.
            Self::StreamEnd => TokenType::StreamEnd,
            Self::DocumentStart => TokenType::DocumentStart,
            Self::DocumentEnd => TokenType::DocumentEnd,
            Self::BlockSequenceStart => TokenType::BlockSequenceStart,
            Self::BlockMappingStart => TokenType::BlockMappingStart,
            Self::BlockEnd => TokenType::BlockEnd,
            Self::FlowSequenceStart => TokenType::FlowSequenceStart,
            Self::FlowSequenceEnd => TokenType::FlowSequenceEnd,
            Self::FlowMappingStart => TokenType::FlowMappingStart,
            Self::FlowMappingEnd => TokenType::FlowMappingEnd,
            Self::BlockEntry => TokenType::BlockEntry,
            Self::FlowEntry => TokenType::FlowEntry,
            Self::Key => TokenType::Key,
            Self::Value => TokenType::Value,
        }
    }
}
impl<'input> From<TokenType<'input>> for QueuedTokenType<'input> {
fn from(token: TokenType<'input>) -> Self {
match token {
TokenType::StreamStart(encoding) => Self::StreamStart(encoding),
TokenType::StreamEnd => Self::StreamEnd,
TokenType::VersionDirective(major, minor) => Self::VersionDirective(major, minor),
TokenType::TagDirective(handle, prefix) => Self::TagDirective(handle, prefix),
TokenType::DocumentStart => Self::DocumentStart,
TokenType::DocumentEnd => Self::DocumentEnd,
TokenType::BlockSequenceStart => Self::BlockSequenceStart,
TokenType::BlockMappingStart => Self::BlockMappingStart,
TokenType::BlockEnd => Self::BlockEnd,
TokenType::FlowSequenceStart => Self::FlowSequenceStart,
TokenType::FlowSequenceEnd => Self::FlowSequenceEnd,
TokenType::FlowMappingStart => Self::FlowMappingStart,
TokenType::FlowMappingEnd => Self::FlowMappingEnd,
TokenType::BlockEntry => Self::BlockEntry,
TokenType::FlowEntry => Self::FlowEntry,
TokenType::Key => Self::Key,
TokenType::Value => Self::Value,
TokenType::Alias(name) => Self::Alias(name),
TokenType::Anchor(name) => Self::Anchor(name),
TokenType::Tag(handle, suffix) => Self::Tag(handle, suffix),
TokenType::Scalar(style, value) => Self::Scalar(style, value),
TokenType::Comment(comment) => Self::Comment(comment.into()),
TokenType::ReservedDirective(name, params) => Self::ReservedDirective(name, params),
}
}
}
/// Queue-internal form of [`Token`]: the span plus a [`QueuedTokenType`].
#[derive(Clone, PartialEq, Debug, Eq)]
pub(crate) struct QueuedToken<'input>(pub(crate) Span, pub(crate) QueuedTokenType<'input>);
impl<'input> QueuedToken<'input> {
fn into_public(self) -> Token<'input> {
Token(self.0, self.1.into_public(self.0))
}
}
impl<'input> From<Token<'input>> for QueuedToken<'input> {
fn from(token: Token<'input>) -> Self {
Self(token.0, token.1.into())
}
}
/// Bookkeeping for a position that may turn out to start a simple key.
#[derive(Clone, PartialEq, Debug, Eq)]
struct SimpleKey {
/// Whether the candidate is still viable; cleared once it goes stale.
possible: bool,
/// Whether a key is mandatory here; a required candidate going stale is an error.
required: bool,
/// Compared against the scanner's `tokens_parsed` counter to decide
/// whether more tokens must be fetched.
token_number: usize,
/// Position of the candidate.
mark: Marker,
}
impl SimpleKey {
fn new(mark: Marker) -> SimpleKey {
SimpleKey {
possible: false,
required: false,
token_number: 0,
mark,
}
}
}
/// One entry of the scanner's `indents` stack.
#[derive(Clone, Debug, Default)]
struct Indent {
/// Indentation value (signed, like the scanner's `indent` field, which starts at -1).
indent: isize,
/// NOTE(review): presumably whether leaving this level must emit a
/// `BlockEnd` token — confirm against `unroll_indent`.
needs_block_end: bool,
}
/// State tracked per entry of the scanner's `implicit_flow_mapping_states` stack.
///
/// NOTE(review): the code that reads and writes this state is outside the
/// visible part of the file; the variant notes below are assumptions.
#[derive(Debug, PartialEq)]
enum ImplicitMappingState {
/// Presumably: an implicit mapping may still begin here — confirm.
Possible,
/// Presumably: currently inside an implicit mapping; the meaning of the
/// `u8` is not visible here — confirm.
Inside(u8),
}
/// Tokenizer that reads characters from `input` and produces [`Token`]s.
///
/// Tokens are buffered in an internal queue and handed out through
/// `next_token` or the `Iterator` implementation.
#[derive(Debug)]
#[allow(clippy::struct_excessive_bools)]
pub struct Scanner<'input, T> {
// Source of characters.
input: T,
// Current position in the input; advanced by the `skip_*` helpers.
mark: Marker,
// Tokens scanned but not yet handed out.
tokens: VecDeque<QueuedToken<'input>>,
// Error that stopped iteration; once set, `Iterator::next` returns `None`.
error: Option<ScanError>,
// Error held back while a comment token is still at the front of the queue.
deferred_error: Option<ScanError>,
// Cached `input.may_contain_comments()`; selects the path taken by `skip_ws_to_eol`.
comments_possible: bool,
// Set when the `StreamStart` token has been queued.
stream_start_produced: bool,
// Set when the `StreamEnd` token has been handed out.
stream_end_produced: bool,
// While true, a BOM at column 0 outside flow context is skipped; cleared by
// `fetch_next_token` once it proceeds past the directive / document-marker checks.
document_prefix_allowed: bool,
// Character index at which a ':' in flow context is accepted as a value
// indicator even without a following flow indicator.
adjacent_value_allowed_at: usize,
// Whether a simple key may start at the current position.
simple_key_allowed: bool,
// Simple key candidates (NOTE(review): presumably one per nesting level — confirm).
simple_keys: smallvec::SmallVec<[SimpleKey; 8]>,
// Current block indentation; -1 at stream start.
indent: isize,
// Stack of indentation entries (NOTE(review): presumably the enclosing levels — confirm).
indents: smallvec::SmallVec<[Indent; 8]>,
// Flow nesting depth; 0 is treated as block context.
flow_level: u8,
// Number of tokens handed out so far.
tokens_parsed: usize,
// Set after `fetch_more_tokens` succeeds; cleared when a token is popped.
token_available: bool,
// True from a line break until the first non-blank character is consumed.
leading_whitespace: bool,
// NOTE(review): not used in the visible part of the file.
flow_mapping_started: smallvec::SmallVec<[bool; 8]>,
// NOTE(review): not used in the visible part of the file.
implicit_flow_mapping_states: smallvec::SmallVec<[ImplicitMappingState; 8]>,
// Position used to report "comment intercepting the multiline text";
// consumed by `skip_to_next_token`.
interrupted_plain_by_comment: Option<Marker>,
// While set, a tab met by `skip_to_next_token` is an error; cleared at the
// first character that is not a space, line break or '#'.
explicit_key_tab_check_pending: bool,
// Stack of (position, bracket) pairs; a leftover entry at stream end is
// reported as an unclosed bracket.
flow_markers: smallvec::SmallVec<[(Marker, char); 8]>,
// Scratch buffers (their users are outside the visible part of the file).
buf_leading_break: String,
buf_trailing_breaks: String,
buf_whitespaces: String,
}
impl<'input, T: BorrowedInput<'input>> Iterator for Scanner<'input, T> {
    type Item = Token<'input>;

    /// Yields the next token.
    ///
    /// Returns `None` once the stream has ended or after an error; the error
    /// is stored and can be read back with `get_error`.
    fn next(&mut self) -> Option<Self::Item> {
        if self.error.is_some() {
            return None;
        }
        let tok = match self.next_token() {
            Err(e) => return self.stop_after_error(e),
            Ok(None) => return None,
            Ok(Some(tok)) => tok,
        };
        debug_print!(
            "    \x1B[;32m\u{21B3} {:?} \x1B[;36m{:?}\x1B[;m",
            tok.1,
            tok.0
        );
        Some(tok)
    }
}
/// Result of a scanning step that yields no value on success.
pub type ScanResult = Result<(), ScanError>;
/// Accumulator for a flow scalar: either a byte range of the input that can
/// still be borrowed, or an owned string.
#[derive(Debug)]
enum FlowScalarBuf {
    /// The scalar so far is the input range `start..end`. A run of
    /// whitespace after `end` may be pending: it becomes part of the range
    /// only if committed.
    Borrowed {
        start: usize,
        end: usize,
        pending_ws_start: Option<usize>,
        pending_ws_end: usize,
    },
    /// The scalar is held in an owned string.
    Owned(String),
}

impl FlowScalarBuf {
    /// Starts an empty borrowed range at `start` with no pending whitespace.
    #[inline]
    fn new_borrowed(start: usize) -> Self {
        let origin = start;
        Self::Borrowed {
            start: origin,
            end: origin,
            pending_ws_start: None,
            pending_ws_end: origin,
        }
    }

    /// Starts an empty owned buffer.
    #[inline]
    fn new_owned() -> Self {
        Self::Owned(String::new())
    }

    /// The owned string, or `None` while the buffer is still borrowed.
    #[inline]
    fn as_owned_mut(&mut self) -> Option<&mut String> {
        if let Self::Owned(text) = self {
            Some(text)
        } else {
            None
        }
    }

    /// Extends the borrowed range over the pending whitespace, if any.
    /// Does nothing for an owned buffer.
    #[inline]
    fn commit_pending_ws(&mut self) {
        let Self::Borrowed {
            end,
            pending_ws_start,
            pending_ws_end,
            ..
        } = self
        else {
            return;
        };
        // `take` clears the pending marker and tells us whether one was set.
        if pending_ws_start.take().is_some() {
            *end = *pending_ws_end;
        }
    }

    /// Records whitespace spanning `ws_start..ws_end` as pending. The start
    /// is kept from the first call of a run; the end always moves forward to
    /// `ws_end`. Does nothing for an owned buffer.
    #[inline]
    fn note_pending_ws(&mut self, ws_start: usize, ws_end: usize) {
        let Self::Borrowed {
            pending_ws_start,
            pending_ws_end,
            ..
        } = self
        else {
            return;
        };
        let _ = pending_ws_start.get_or_insert(ws_start);
        *pending_ws_end = ws_end;
    }

    /// Forgets any pending whitespace, resetting the pending end to the
    /// committed end. Does nothing for an owned buffer.
    #[inline]
    fn discard_pending_ws(&mut self) {
        match self {
            Self::Borrowed {
                end,
                pending_ws_start,
                pending_ws_end,
                ..
            } => {
                *pending_ws_end = *end;
                *pending_ws_start = None;
            }
            Self::Owned(_) => {}
        }
    }
}
impl<'input, T: BorrowedInput<'input>> Scanner<'input, T> {
/// Converts `buf` from a borrowed range into an owned string holding the
/// same text; an already-owned buffer is left alone.
///
/// # Errors
/// Returns an internal error located at `start_mark` when the input cannot
/// provide the slice for the recorded range.
#[inline]
fn promote_flow_scalar_buf_to_owned(
    &self,
    start_mark: &Marker,
    buf: &mut FlowScalarBuf,
) -> Result<(), ScanError> {
    let (from, to) = match *buf {
        FlowScalarBuf::Owned(_) => return Ok(()),
        // Pending whitespace is not part of the committed range.
        FlowScalarBuf::Borrowed { start, end, .. } => (start, end),
    };
    match self.input.slice_bytes(from, to) {
        Some(text) => {
            *buf = FlowScalarBuf::Owned(text.to_owned());
            Ok(())
        }
        None => Err(ScanError::new_str(
            *start_mark,
            "internal error: input advertised offsets but did not provide a slice",
        )),
    }
}
/// Asks the input for the byte range `start..end` as a slice borrowed for
/// `'input`; returns `None` when the input cannot lend it.
#[inline]
fn try_borrow_slice(&self, start: usize, end: usize) -> Option<&'input str> {
self.input.slice_borrowed(start, end)
}
/// Scans the handle of a `%TAG` directive, borrowing it from the input when
/// possible.
///
/// Falls back to `scan_tag_handle` when the input reports no byte offsets.
///
/// # Errors
/// Fails if the handle does not start with `!`, does not end with `!`, or
/// if the input reports offsets but cannot provide the slice.
fn scan_tag_handle_directive_cow(
    &mut self,
    mark: &Marker,
) -> Result<Cow<'input, str>, ScanError> {
    let start = match self.input.byte_offset() {
        Some(offset) => offset,
        None => return self.scan_tag_handle(true, mark).map(Cow::Owned),
    };

    if self.input.look_ch() != '!' {
        return Err(ScanError::new_str(
            *mark,
            "while scanning a tag, did not find expected '!'",
        ));
    }
    self.skip_non_blank();

    // Consume the alphabetic run that follows the leading '!'.
    loop {
        self.input.lookahead(1);
        if !self.input.next_is_alpha() {
            break;
        }
        self.skip_non_blank();
    }

    // Consume the closing '!' when there is one.
    if self.input.peek() == '!' {
        self.skip_non_blank();
    }

    let end = match self.input.byte_offset() {
        Some(offset) => offset,
        None => return self.scan_tag_handle(true, mark).map(Cow::Owned),
    };

    // Borrow when the input can lend the slice, copy otherwise.
    let handle: Cow<'input, str> = if let Some(borrowed) = self.try_borrow_slice(start, end) {
        Cow::Borrowed(borrowed)
    } else {
        let copied = self.input.slice_bytes(start, end).ok_or_else(|| {
            ScanError::new_str(
                *mark,
                "internal error: input advertised slicing but did not provide a slice",
            )
        })?;
        Cow::Owned(copied.to_owned())
    };

    // A lone "!" also ends with '!', so this single check covers it.
    if !handle.ends_with('!') {
        return Err(ScanError::new_str(
            *mark,
            "while parsing a tag directive, did not find expected '!'",
        ));
    }
    Ok(handle)
}
/// Scans the prefix of a `%TAG` directive, borrowing it from the input when
/// it contains no `%` escape.
///
/// Falls back to `scan_tag_prefix` when the input reports no byte offsets.
/// When a `%` is met, the text scanned so far is copied and the rest is
/// decoded character by character into an owned string.
///
/// # Errors
/// Fails on an invalid first character, on a bad URI escape, or when the
/// input reports offsets but cannot provide the slice.
fn scan_tag_prefix_directive_cow(
&mut self,
start_mark: &Marker,
) -> Result<Cow<'input, str>, ScanError> {
let Some(start) = self.input.byte_offset() else {
return Ok(Cow::Owned(self.scan_tag_prefix(start_mark)?));
};
// First character: '!' or any tag character; a leading '%' is left in
// place for the escape handling below.
if self.input.look_ch() == '!' {
self.skip_non_blank();
} else if !is_tag_char(self.input.peek()) {
return Err(ScanError::new_str(
*start_mark,
"invalid global tag character",
));
} else if self.input.peek() == '%' {
// Intentionally empty: the '%' is consumed by the escape path.
} else {
self.skip_non_blank();
}
// Consume URI characters up to the first '%' (or the end of the prefix).
while is_uri_char(self.input.look_ch()) {
if self.input.peek() == '%' {
break;
}
self.skip_non_blank();
}
if self.input.peek() == '%' {
// Escapes cannot be represented by a borrowed slice: switch to an owned
// string seeded with what has been scanned so far.
let current = self
.input
.byte_offset()
.expect("byte_offset() must remain available once enabled");
// NOTE(review): if `slice_bytes` returns `None` here the already-scanned
// characters are silently dropped, whereas the non-escape path below
// reports an internal error — confirm this is intended.
let mut out = if let Some(slice) = self.input.slice_bytes(start, current) {
slice.to_owned()
} else {
String::new()
};
while is_uri_char(self.input.look_ch()) {
if self.input.peek() == '%' {
out.push(self.scan_uri_escapes(start_mark)?);
} else {
out.push(self.input.peek());
self.skip_non_blank();
}
}
return Ok(Cow::Owned(out));
}
let Some(end) = self.input.byte_offset() else {
return Ok(Cow::Owned(self.scan_tag_prefix(start_mark)?));
};
// Borrow when the input can lend the slice, copy otherwise.
let Some(slice) = self.try_borrow_slice(start, end) else {
let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
ScanError::new_str(
*start_mark,
"internal error: input advertised slicing but did not provide a slice",
)
})?;
return Ok(Cow::Owned(slice.to_owned()));
};
Ok(Cow::Borrowed(slice))
}
/// Creates a scanner reading from `input`.
///
/// The scanner starts at character 0, line 1, column 0, with the byte
/// offset reported by the input, no tokens queued and simple keys allowed.
pub fn new(input: T) -> Self {
    let initial_byte_offset = input.byte_offset();
    let comments_possible = input.may_contain_comments();
    let origin = Marker::new(0, 1, 0).with_byte_offset(initial_byte_offset);

    Scanner {
        // Input and position.
        input,
        mark: origin,
        comments_possible,
        leading_whitespace: true,

        // Token queue and error state.
        tokens: VecDeque::with_capacity(64),
        tokens_parsed: 0,
        token_available: false,
        error: None,
        deferred_error: None,

        // Stream / document state.
        stream_start_produced: false,
        stream_end_produced: false,
        document_prefix_allowed: true,

        // Simple keys and indentation.
        simple_key_allowed: true,
        simple_keys: smallvec::SmallVec::new(),
        adjacent_value_allowed_at: 0,
        indent: -1,
        indents: smallvec::SmallVec::new(),

        // Flow context.
        flow_level: 0,
        flow_mapping_started: smallvec::SmallVec::new(),
        implicit_flow_mapping_states: smallvec::SmallVec::new(),
        flow_markers: smallvec::SmallVec::new(),

        // Pending checks.
        interrupted_plain_by_comment: None,
        explicit_key_tab_check_pending: false,

        // Scratch buffers.
        buf_leading_break: String::with_capacity(128),
        buf_trailing_breaks: String::with_capacity(128),
        buf_whitespaces: String::with_capacity(128),
    }
}
/// Returns a copy of the stored error: the one that stopped iteration if
/// there is one, otherwise the deferred error, otherwise `None`.
#[inline]
pub fn get_error(&self) -> Option<ScanError> {
    let stored = self.error.as_ref();
    stored.or(self.deferred_error.as_ref()).cloned()
}
/// Stores `error` so later `Iterator::next` calls return `None`, and returns
/// `None` for the current call.
#[cold]
fn stop_after_error(&mut self, error: ScanError) -> Option<Token<'input>> {
self.error = Some(error);
None
}
/// Builds the "simple key expected" error at the current position.
#[cold]
fn simple_key_expected(&self) -> ScanError {
ScanError::new_str(self.mark, "simple key expected")
}
/// Builds the error for a `bracket` opened at `mark` that was never closed.
#[cold]
fn unclosed_bracket(mark: Marker, bracket: char) -> ScanError {
ScanError::new(mark, format!("unclosed bracket '{bracket}'"))
}
/// Consumes one character and advances the position by one column, leaving
/// `leading_whitespace` untouched.
#[inline]
fn skip_blank(&mut self) {
    self.input.skip();
    let position = &mut self.mark;
    position.col += 1;
    position.offsets.chars += 1;
    position.offsets.bytes = self.input.byte_offset();
}
/// Consumes one character, advances the position by one column and records
/// that the current line now holds non-whitespace content.
#[inline]
fn skip_non_blank(&mut self) {
    self.input.skip();
    self.leading_whitespace = false;
    let position = &mut self.mark;
    position.col += 1;
    position.offsets.chars += 1;
    position.offsets.bytes = self.input.byte_offset();
}
/// Consumes one character (a BOM) without advancing the column: only the
/// character and byte offsets move.
#[inline]
fn skip_bom(&mut self) {
    self.input.skip();
    let offsets = &mut self.mark.offsets;
    offsets.chars += 1;
    offsets.bytes = self.input.byte_offset();
}
/// Consumes one character of a comment and advances the position by one
/// column, leaving `leading_whitespace` untouched.
#[inline]
fn skip_comment_char(&mut self) {
    self.input.skip();
    let position = &mut self.mark;
    position.col += 1;
    position.offsets.chars += 1;
    position.offsets.bytes = self.input.byte_offset();
}
/// Consumes `count` characters on the current line, advancing the position
/// by as many columns, and records that the line holds non-whitespace
/// content.
#[inline]
fn skip_n_non_blank(&mut self, count: usize) {
    for _ in 0..count {
        self.input.skip();
    }
    self.mark.col += count;
    self.mark.offsets.chars += count;
    self.mark.offsets.bytes = self.input.byte_offset();
    self.leading_whitespace = false;
}
/// Consumes one line-break character: moves to column 0 of the next line
/// and marks the new line as having seen only whitespace so far.
#[inline]
fn skip_nl(&mut self) {
    self.input.skip();
    self.leading_whitespace = true;
    let position = &mut self.mark;
    position.line += 1;
    position.col = 0;
    position.offsets.chars += 1;
    position.offsets.bytes = self.input.byte_offset();
}
/// Consumes a line break if one is next: `\r\n` counts as a single break.
/// Does nothing when the next character is not a break.
#[inline]
fn skip_linebreak(&mut self) {
    let is_crlf = self.input.next_2_are('\r', '\n');
    if !is_crlf && !self.input.next_is_break() {
        return;
    }
    if is_crlf {
        // The '\r' is consumed without starting a new line.
        self.skip_blank();
    }
    self.skip_nl();
}
/// Test-only helper: scans the comment at the current position and returns
/// it as a public token.
#[cfg(test)]
fn scan_comment_token(&mut self) -> Result<Token<'input>, ScanError> {
    self.scan_comment_queued_token()
        .map(QueuedToken::into_public)
}
/// Scans the comment starting at the current `#` up to (not including) the
/// next line break or end of input.
///
/// The comment text excludes the `#`. It is borrowed from the input when
/// possible and copied otherwise. Placement is `Free` when only whitespace
/// preceded the comment on its line, `Right` otherwise.
///
/// # Errors
/// Returns an internal error when the input reports byte offsets but
/// cannot provide the slice.
fn scan_comment_queued_token(&mut self) -> Result<QueuedToken<'input>, ScanError> {
    debug_assert_eq!(self.input.peek(), '#');
    let start_mark = self.mark;
    let placement = if !self.leading_whitespace {
        Placement::Right
    } else {
        Placement::Free
    };

    // Step over the '#'.
    self.skip_comment_char();

    let text: Cow<'input, str> = match self.input.byte_offset() {
        Some(start) => {
            // Offsets available: skip the whole line at once, then slice.
            let consumed = self.input.skip_while_non_breakz();
            self.mark.offsets.chars += consumed;
            self.mark.col += consumed;
            let byte_offset = self.input.byte_offset();
            self.mark.offsets.bytes = byte_offset;
            let end = byte_offset.expect("byte_offset must remain available once enabled");

            if let Some(borrowed) = self.try_borrow_slice(start, end) {
                Cow::Borrowed(borrowed)
            } else {
                let Some(copied) = self.input.slice_bytes(start, end) else {
                    return Err(ScanError::new_str(
                        start_mark,
                        "internal error: input advertised offsets but did not provide a slice",
                    ));
                };
                Cow::Owned(copied.to_owned())
            }
        }
        None => {
            // No offsets: collect the text one character at a time.
            let mut collected = String::new();
            while !is_breakz(self.input.look_ch()) {
                collected.push(self.input.peek());
                self.skip_comment_char();
            }
            Cow::Owned(collected)
        }
    };

    let span = Span::new(start_mark, self.mark);
    let comment = QueuedComment { text, placement };
    Ok(QueuedToken(span, QueuedTokenType::Comment(comment)))
}
/// Scans the comment at the current position and appends it to the token
/// queue.
fn push_comment_token(&mut self) -> ScanResult {
    match self.scan_comment_queued_token() {
        Ok(comment) => {
            self.tokens.push_back(comment);
            Ok(())
        }
        Err(error) => Err(error),
    }
}
/// Skips the comment starting at the current `#` up to (not including) the
/// next line break or end of input, without producing a token.
fn skip_comment(&mut self) {
    debug_assert_eq!(self.input.peek(), '#');
    self.skip_comment_char();
    let consumed = self.input.skip_while_non_breakz();
    let position = &mut self.mark;
    position.col += consumed;
    position.offsets.chars += consumed;
    position.offsets.bytes = self.input.byte_offset();
}
/// Whether the `StreamStart` token has been produced.
#[inline]
pub fn stream_started(&self) -> bool {
self.stream_start_produced
}
/// Whether the `StreamEnd` token has been handed out.
#[inline]
pub fn stream_ended(&self) -> bool {
self.stream_end_produced
}
/// Current position of the scanner in the input.
#[inline]
pub fn mark(&self) -> Marker {
self.mark
}
/// Whether the input reported, at construction time, that it may contain
/// comments.
#[inline]
pub(crate) fn comments_possible(&self) -> bool {
self.comments_possible
}
/// Consumes the line break at the current position and appends a single
/// `'\n'` to `s`, whichever break sequence was read.
#[inline]
fn read_break(&mut self, s: &mut String) {
self.skip_break();
s.push('\n');
}
/// Consumes the line break at the current position; `\r\n` is consumed as
/// one break. The caller must ensure the next character is a break.
#[inline]
fn skip_break(&mut self) {
    let first = self.input.peek();
    let second = self.input.peek_nth(1);
    debug_assert!(is_break(first));
    if matches!((first, second), ('\r', '\n')) {
        // The '\r' is consumed without starting a new line.
        self.skip_blank();
    }
    self.skip_nl();
}
fn insert_token(&mut self, pos: usize, tok: Token<'input>) {
let old_len = self.tokens.len();
assert!(pos <= old_len);
self.tokens.insert(pos, tok.into());
}
/// Marks the current position as one where a simple key may start.
#[inline]
fn allow_simple_key(&mut self) {
self.simple_key_allowed = true;
}
/// Marks the current position as one where a simple key may not start.
#[inline]
fn disallow_simple_key(&mut self) {
self.simple_key_allowed = false;
}
/// Scans the input for the next token(s) and appends them to the queue.
///
/// The first call only queues `StreamStart`. Later calls skip whitespace
/// and comments (returning early if a comment token was queued), retire
/// stale simple keys, unroll indentation to the current column, and then
/// dispatch on the next character(s).
///
/// # Errors
/// Returns an error for invalid indentation, a BOM inside a document,
/// content after a document end marker, a reserved leading character
/// (`%`, `@`, `` ` ``), or any error raised by the called `fetch_*` method.
pub fn fetch_next_token(&mut self) -> ScanResult {
self.input.lookahead(1);
if !self.stream_start_produced {
self.fetch_stream_start();
return Ok(());
}
// `true` means a comment token was queued: hand it out before going on.
if self.skip_to_next_token(true)? {
return Ok(());
}
debug_print!(
" \x1B[38;5;244m\u{2192} fetch_next_token after whitespace {:?} {:?}\x1B[m",
self.mark,
self.input.peek()
);
self.stale_simple_keys()?;
let mark = self.mark;
self.unroll_indent(mark.col as isize);
self.input.lookahead(4);
if self.input.next_is_z() {
self.fetch_stream_end()?;
return Ok(());
}
// Directives and document markers are only recognized at column 0.
if self.mark.col == 0 {
if self.input.next_char_is('%') {
return self.fetch_directive();
} else if self.input.next_is_document_start() {
return self.fetch_document_indicator(TokenType::DocumentStart);
} else if self.input.next_is_document_end() {
self.fetch_document_indicator(TokenType::DocumentEnd)?;
// Only whitespace (and possibly a comment) may follow the marker on its line.
self.skip_ws_to_eol(SkipTabs::Yes)?;
if !self.input.next_is_breakz() {
return Err(ScanError::new_str(
self.mark,
"invalid content after document end marker",
));
}
return Ok(());
}
}
// Past this point a BOM is no longer skipped by `skip_to_next_token`.
if self.document_prefix_allowed {
self.document_prefix_allowed = false;
}
// Content left of the current indentation is only tolerated in flow
// context, and only for a closing bracket or an entry separator.
if (self.mark.col as isize) < self.indent {
self.input.lookahead(1);
let c = self.input.peek();
if self.flow_level == 0 || !matches!(c, ']' | '}' | ',') {
return Err(ScanError::new_str(self.mark, "invalid indentation"));
}
}
let c = self.input.peek();
let nc = self.input.peek_nth(1);
// Arm order matters: guarded indicator arms come before the plain
// scalar fallbacks for the same character.
match c {
'[' => self.fetch_flow_collection_start(TokenType::FlowSequenceStart),
'{' => self.fetch_flow_collection_start(TokenType::FlowMappingStart),
']' => self.fetch_flow_collection_end(TokenType::FlowSequenceEnd),
'}' => self.fetch_flow_collection_end(TokenType::FlowMappingEnd),
',' => self.fetch_flow_entry(),
'-' if is_blank_or_breakz(nc) => self.fetch_block_entry(),
'?' if is_blank_or_breakz(nc) => self.fetch_key(),
':' if is_blank_or_breakz(nc) => self.fetch_value(),
// In flow context ':' is also a value indicator when followed by a flow
// indicator, or when it sits at the recorded adjacent-value position.
':' if self.flow_level > 0
&& (is_flow(nc) || self.mark.index() == self.adjacent_value_allowed_at) =>
{
self.fetch_flow_value()
}
'*' => self.fetch_anchor(true),
'&' => self.fetch_anchor(false),
'!' => self.fetch_tag(),
// Block scalars exist only outside flow context.
'|' if self.flow_level == 0 => self.fetch_block_scalar(true),
'>' if self.flow_level == 0 => self.fetch_block_scalar(false),
'\'' => self.fetch_flow_scalar(true),
'"' => self.fetch_flow_scalar(false),
'-' if !is_blank_or_breakz(nc) => self.fetch_plain_scalar(),
':' | '?' if !is_blank_or_breakz(nc) && self.flow_level == 0 => {
self.fetch_plain_scalar()
}
c if is_bom(c) => Err(ScanError::new_str(
self.mark,
"a BOM must not appear inside a document",
)),
'%' | '@' | '`' => Err(ScanError::new(
self.mark,
format!("unexpected character: `{c}'"),
)),
_ => self.fetch_plain_scalar(),
}
}
/// Returns the next token in its queued form, fetching more tokens when
/// needed; `Ok(None)` once the stream end has been handed out.
///
/// An error raised while a comment token is at the front of the queue is
/// deferred: the queued comment tokens are handed out first, and the error
/// is returned by the first later call that finds a non-comment (or
/// nothing) at the front.
///
/// # Errors
/// Returns scanning errors (possibly deferred as described), or "did not
/// find expected next token" when the queue is unexpectedly empty.
pub(crate) fn next_queued_token(&mut self) -> Result<Option<QueuedToken<'input>>, ScanError> {
if self.deferred_error.is_some() {
// Release the deferred error unless a comment is still waiting in front.
if !matches!(
self.tokens.front().map(|token| &token.1),
Some(QueuedTokenType::Comment(_))
) {
if let Some(error) = self.deferred_error.take() {
return error.into_result();
}
}
// A comment is waiting: hand it out without fetching anything new.
self.token_available = true;
}
if self.stream_end_produced {
return Ok(None);
}
if !self.token_available {
if let Err(error) = self.fetch_more_tokens() {
if matches!(
self.tokens.front().map(|token| &token.1),
Some(QueuedTokenType::Comment(_))
) {
// Keep the error for later so the queued comment is not lost.
self.deferred_error = Some(error);
} else {
return Err(error);
}
}
}
let Some(t) = self.tokens.pop_front() else {
return Err(ScanError::new_str(
self.mark,
"did not find expected next token",
));
};
self.token_available = false;
self.tokens_parsed += 1;
let is_stream_end = matches!(t.1, QueuedTokenType::StreamEnd);
if is_stream_end {
self.stream_end_produced = true;
}
Ok(Some(t))
}
/// Returns the next token, or `Ok(None)` once the stream end has been
/// handed out.
///
/// # Errors
/// Propagates any scanning error.
pub fn next_token(&mut self) -> Result<Option<Token<'input>>, ScanError> {
    let queued = self.next_queued_token()?;
    Ok(queued.map(|token| token.into_public()))
}
/// Fetches tokens until the front of the queue can safely be handed out.
///
/// More input is needed while the queue is empty, or while the front token
/// is not a comment and a possible simple key refers to the token about to
/// be handed out. Fetching also stops as soon as the most recently queued
/// token is a document start or end marker.
///
/// # Errors
/// Propagates errors from `stale_simple_keys` and `fetch_next_token`.
pub fn fetch_more_tokens(&mut self) -> ScanResult {
    loop {
        let need_more = if self.tokens.is_empty() {
            true
        } else {
            self.stale_simple_keys()?;
            let front_is_comment = matches!(
                self.tokens.front().map(|token| &token.1),
                Some(QueuedTokenType::Comment(_))
            );
            // A pending simple key may still require a `Key` token to be
            // inserted before the front token.
            !front_is_comment
                && self
                    .simple_keys
                    .iter()
                    .any(|sk| sk.possible && sk.token_number == self.tokens_parsed)
        };

        let at_document_marker = matches!(
            self.tokens.back().map(|token| &token.1),
            Some(QueuedTokenType::DocumentEnd | QueuedTokenType::DocumentStart)
        );
        if at_document_marker || !need_more {
            break;
        }
        self.fetch_next_token()?;
    }
    self.token_available = true;
    Ok(())
}
/// Retires simple key candidates that can no longer become keys.
///
/// A possible candidate is stale when, outside flow context, it started on
/// an earlier line, or when it lies more than `SIMPLE_KEY_MAX_LOOKAHEAD`
/// characters behind the current position.
///
/// # Errors
/// Fails if a stale candidate was required.
fn stale_simple_keys(&mut self) -> ScanResult {
    let here = self.mark;
    let in_block_context = self.flow_level == 0;

    for key in self.simple_keys.iter_mut().filter(|key| key.possible) {
        let line_stale = in_block_context && key.mark.line < here.line;
        let distance = here.index().saturating_sub(key.mark.index());
        let length_stale = distance > SIMPLE_KEY_MAX_LOOKAHEAD;
        if !(line_stale || length_stale) {
            continue;
        }
        if key.required {
            return Err(ScanError::new_str(here, "simple key expect ':'"));
        }
        key.possible = false;
    }
    Ok(())
}
/// Skips whitespace, line breaks, an allowed BOM and comments up to the
/// next token. Each comment met is queued as a token.
///
/// Returns `Ok(true)` when `stop_after_comment` is set and a comment was
/// queued (the scan stops right after it), `Ok(false)` otherwise.
///
/// # Errors
/// Fails on a tab where tabs are not allowed, or when a comment interrupted
/// a plain scalar that the following line would continue.
fn skip_to_next_token(&mut self, stop_after_comment: bool) -> Result<bool, ScanError> {
// Consumes a line break; outside flow context a new line allows a simple key.
let consume_linebreak = |this: &mut Self| {
this.input.lookahead(2);
this.skip_linebreak();
if this.flow_level == 0 {
this.allow_simple_key();
}
};
loop {
let ch = self.input.look_ch();
// While the explicit-key check is pending, a tab is rejected; the check
// ends at the first character that is not a space, break or '#'.
if self.explicit_key_tab_check_pending {
match ch {
'\t' => {
return Err(ScanError::new_str(
self.mark(),
"tabs disallowed in this context",
));
}
' ' | '\n' | '\r' | '#' => {}
_ => self.explicit_key_tab_check_pending = false,
}
}
match ch {
'\t' => {
// A tab in the indentation of a block is only accepted when the rest
// of the line is blank.
if self.is_within_block()
&& self.leading_whitespace
&& (self.mark.col as isize) < self.indent
{
self.skip_ws_to_eol(SkipTabs::Yes)?;
if !self.input.next_is_breakz() {
return Err(ScanError::new_str(
self.mark,
"tabs disallowed within this context (block indentation)",
));
}
if matches!(self.input.look_ch(), '\n' | '\r') {
consume_linebreak(self);
}
} else {
self.skip_blank();
}
}
' ' => self.skip_blank(),
'\n' | '\r' => consume_linebreak(self),
// A BOM is skipped only at column 0, outside flow context, while the
// document prefix is still allowed.
c if is_bom(c)
&& self.document_prefix_allowed
&& self.flow_level == 0
&& self.mark.col == 0 =>
{
self.skip_bom();
}
'#' => {
self.push_comment_token()?;
if matches!(self.input.look_ch(), '\n' | '\r') {
consume_linebreak(self);
}
if stop_after_comment {
return Ok(true);
}
}
_ => break,
}
}
// A comment that cut a plain scalar short is an error if the very next
// line, indented deeper than the current block, could continue the scalar.
if let Some(err_mark) = self.interrupted_plain_by_comment.take() {
let is_immediate_next_line = self.mark.line == err_mark.line + 1;
if self.flow_level == 0
&& is_immediate_next_line
&& (self.mark.col as isize) > self.indent
{
self.input.lookahead(4);
if !self.input.next_is_z()
&& !self.input.next_is_document_indicator()
&& self.input.next_can_be_plain_scalar(false)
{
return Err(ScanError::new_str(
err_mark,
"comment intercepting the multiline text",
));
}
}
}
Ok(false)
}
/// Skips spaces, line breaks and comments, requiring at least one space or
/// line break.
///
/// A comment met before any space or line break is skipped silently; one
/// met afterwards is queued as a token. Returns `Ok(true)` when
/// `stop_after_comment` is set and a comment was queued, `Ok(false)` when
/// the run ended normally.
///
/// # Errors
/// Fails with "expected whitespace" if no space or line break was found.
fn skip_yaml_whitespace(&mut self, stop_after_comment: bool) -> Result<bool, ScanError> {
    let mut separated = false;
    loop {
        match self.input.look_ch() {
            ' ' => {
                self.skip_blank();
                separated = true;
            }
            '\n' | '\r' => {
                self.input.lookahead(2);
                self.skip_linebreak();
                // Outside flow context a new line allows a simple key.
                if self.flow_level == 0 {
                    self.allow_simple_key();
                }
                separated = true;
            }
            '#' if separated => {
                self.push_comment_token()?;
                if stop_after_comment {
                    return Ok(true);
                }
            }
            '#' => self.skip_comment(),
            _ => break,
        }
    }
    if separated {
        Ok(false)
    } else {
        Err(ScanError::new_str(self.mark(), "expected whitespace"))
    }
}
/// Skips blanks up to the end of the line, queuing a trailing comment as a
/// token when the input may contain comments.
///
/// Returns what the input reported about the whitespace it skipped.
///
/// # Errors
/// Fails if a comment is not separated from the preceding content by
/// whitespace, or with the error reported by the input.
fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> Result<SkipTabs, ScanError> {
    debug_assert!(!matches!(skip_tabs, SkipTabs::Result(..)));

    if self.comments_possible {
        let (advanced, whitespace) = self.input.skip_ws_to_eol_blanks(skip_tabs);
        self.mark.col += advanced;
        self.mark.offsets.chars += advanced;
        self.mark.offsets.bytes = self.input.byte_offset();

        if self.input.look_ch() == '#' {
            let separated = whitespace.found_tabs() || whitespace.has_valid_yaml_ws();
            if !separated {
                return Err(ScanError::new_str(
                    self.mark,
                    "comments must be separated from other tokens by whitespace",
                ));
            }
            self.push_comment_token()?;
        }
        Ok(whitespace)
    } else {
        // No comment can occur: let the input do the whole skip.
        let (advanced, outcome) = self.input.skip_ws_to_eol(skip_tabs);
        self.mark.col += advanced;
        self.mark.offsets.chars += advanced;
        self.mark.offsets.bytes = self.input.byte_offset();
        outcome.map_err(|msg| ScanError::new_str(self.mark, msg))
    }
}
fn fetch_stream_start(&mut self) {
let mark = self.mark;
self.indent = -1;
self.stream_start_produced = true;
self.allow_simple_key();
self.tokens
.push_back(Token(Span::empty(mark), TokenType::StreamStart(TEncoding::Utf8)).into());
self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0)));
}
/// Queues the `StreamEnd` token after closing everything still open.
///
/// # Errors
/// Fails if a flow collection is still open, if a required simple key is
/// still pending, or if `remove_simple_key` fails.
fn fetch_stream_end(&mut self) -> ScanResult {
// Report the end of the stream at the start of a fresh line.
if self.mark.col != 0 {
self.mark.col = 0;
self.mark.line += 1;
}
if let Some((mark, bracket)) = self.flow_markers.pop() {
return Err(Self::unclosed_bracket(mark, bracket));
}
// No candidate can become a key any more; a required one is an error.
for sk in &mut self.simple_keys {
if sk.required && sk.possible {
return Err(self.simple_key_expected());
}
sk.possible = false;
}
self.unroll_indent(-1);
self.remove_simple_key()?;
self.disallow_simple_key();
self.tokens
.push_back(Token(Span::empty(self.mark), TokenType::StreamEnd).into());
Ok(())
}
/// Scans a directive and queues its token.
///
/// The token is inserted at the queue position recorded before scanning,
/// so tokens queued while scanning the directive (such as a trailing
/// comment) come after it.
fn fetch_directive(&mut self) -> ScanResult {
    self.unroll_indent(-1);
    self.remove_simple_key()?;
    self.disallow_simple_key();

    let slot = self.tokens.len();
    let directive = self.scan_directive()?;
    self.insert_token(slot, directive);
    Ok(())
}
/// Scans a directive line starting at `%` and returns its token.
///
/// `YAML` and `TAG` directives get dedicated tokens; any other name yields
/// a `ReservedDirective` with its whitespace-separated parameters. The rest
/// of the line, including the line break, is consumed.
///
/// # Errors
/// Fails if the directive is malformed or is followed by anything other
/// than whitespace, a comment, or a line break.
fn scan_directive(&mut self) -> Result<Token<'input>, ScanError> {
    let start_mark = self.mark;
    // Step over the '%'.
    self.skip_non_blank();
    let name = self.scan_directive_name()?;

    let tok = if name == "YAML" {
        self.scan_version_directive_value(&start_mark)?
    } else if name == "TAG" {
        self.scan_tag_directive_value(&start_mark)?
    } else {
        let mut params = Vec::new();
        while self.input.next_is_blank() {
            let blanks = self.input.skip_while_blank();
            self.mark.offsets.chars += blanks;
            self.mark.col += blanks;
            self.mark.offsets.bytes = self.input.byte_offset();

            // Blanks at the end of the line introduce no parameter.
            if is_blank_or_breakz(self.input.peek()) {
                continue;
            }
            let mut param = String::new();
            let taken = self.input.fetch_while_is_yaml_non_space(&mut param);
            self.mark.offsets.chars += taken;
            self.mark.col += taken;
            self.mark.offsets.bytes = self.input.byte_offset();
            params.push(param);
        }
        Token(
            Span::new(start_mark, self.mark),
            TokenType::ReservedDirective(name, params),
        )
    };

    self.skip_ws_to_eol(SkipTabs::Yes)?;
    if !self.input.next_is_breakz() {
        return Err(ScanError::new_str(
            start_mark,
            "while scanning a directive, did not find expected comment or line break",
        ));
    }
    self.input.lookahead(2);
    self.skip_linebreak();
    Ok(tok)
}
/// Scans the `major.minor` value of a `%YAML` directive and returns the
/// `VersionDirective` token, spanning from `mark` to the end of the value.
///
/// # Errors
/// Fails if either number is missing or too long, or if the `.` separator
/// is missing.
fn scan_version_directive_value(&mut self, mark: &Marker) -> Result<Token<'input>, ScanError> {
    // Skip the blanks between the directive name and its value.
    let blanks = self.input.skip_while_blank();
    self.mark.col += blanks;
    self.mark.offsets.chars += blanks;
    self.mark.offsets.bytes = self.input.byte_offset();

    let major = self.scan_version_directive_number(mark)?;
    if self.input.peek() != '.' {
        return Err(ScanError::new_str(
            *mark,
            "while scanning a YAML directive, did not find expected digit or '.' character",
        ));
    }
    self.skip_non_blank();
    let minor = self.scan_version_directive_number(mark)?;

    let span = Span::new(*mark, self.mark);
    Ok(Token(span, TokenType::VersionDirective(major, minor)))
}
/// Scans the name of a directive (the characters right after `%`).
///
/// # Errors
/// Fails if the name is empty or is not followed by a blank, a line break
/// or the end of input.
fn scan_directive_name(&mut self) -> Result<String, ScanError> {
    let start_mark = self.mark;

    let mut name = String::new();
    let taken = self.input.fetch_while_is_yaml_non_space(&mut name);
    self.mark.col += taken;
    self.mark.offsets.chars += taken;
    self.mark.offsets.bytes = self.input.byte_offset();

    let problem = if name.is_empty() {
        Some("while scanning a directive, could not find expected directive name")
    } else if !is_blank_or_breakz(self.input.peek()) {
        Some("while scanning a directive, found unexpected non-alphabetical character")
    } else {
        None
    };
    match problem {
        Some(message) => Err(ScanError::new_str(start_mark, message)),
        None => Ok(name),
    }
}
/// Scans one decimal component of a `%YAML` version number.
///
/// At most 9 digits are accepted, which keeps the value within `u32`.
///
/// # Errors
/// Fails if there is no digit or if there are more than 9 digits.
fn scan_version_directive_number(&mut self, mark: &Marker) -> Result<u32, ScanError> {
    let mut value: u32 = 0;
    let mut digits: usize = 0;
    loop {
        let Some(digit) = self.input.look_ch().to_digit(10) else {
            break;
        };
        if digits >= 9 {
            return Err(ScanError::new_str(
                *mark,
                "while scanning a YAML directive, found extremely long version number",
            ));
        }
        digits += 1;
        value = value * 10 + digit;
        self.skip_non_blank();
    }
    if digits == 0 {
        return Err(ScanError::new_str(
            *mark,
            "while scanning a YAML directive, did not find expected version number",
        ));
    }
    Ok(value)
}
/// Scan the value of a `%TAG` directive: a tag handle followed by a tag prefix.
///
/// Blanks before the handle and between the handle and the prefix are skipped. `mark` is used
/// for error reporting and as the start of the returned token's span.
///
/// # Errors
/// Propagates failures from the handle and prefix scanners, and fails if the prefix is not
/// followed by a blank, a line break or the end of the input.
fn scan_tag_directive_value(&mut self, mark: &Marker) -> Result<Token<'input>, ScanError> {
    // Blanks in front of the handle.
    let leading = self.input.skip_while_blank();
    self.mark.col += leading;
    self.mark.offsets.chars += leading;
    self.mark.offsets.bytes = self.input.byte_offset();
    let handle = self.scan_tag_handle_directive_cow(mark)?;

    // Blanks between the handle and the prefix.
    let separating = self.input.skip_while_blank();
    self.mark.col += separating;
    self.mark.offsets.chars += separating;
    self.mark.offsets.bytes = self.input.byte_offset();
    let prefix = self.scan_tag_prefix_directive_cow(mark)?;

    self.input.lookahead(1);
    if !self.input.next_is_blank_or_breakz() {
        return Err(ScanError::new_str(
            *mark,
            "while scanning TAG, did not find expected whitespace or line break",
        ));
    }

    let span = Span::new(*mark, self.mark);
    Ok(Token(span, TokenType::TagDirective(handle, prefix)))
}
/// Scan a tag at the current position and append it to the token queue.
///
/// # Errors
/// Propagates any error returned by `scan_tag`.
fn fetch_tag(&mut self) -> ScanResult {
    // Register a potential simple key here, then forbid simple keys past the tag.
    self.save_simple_key();
    self.disallow_simple_key();

    let queued = self.scan_tag()?.into();
    self.tokens.push_back(queued);
    Ok(())
}
/// Scan a tag token starting at the current position.
///
/// Handles verbatim tags (`!<...>`), tags whose handle has the `!...!` form, shorthand tags
/// using the primary handle (`!suffix`) and the lone `!` tag. The handle and the suffix are
/// produced as `Cow`s so that they can borrow from the input when the helpers manage to.
///
/// If the input does not report byte offsets, scanning is delegated to `scan_tag_owned`.
///
/// # Errors
/// Propagates errors from the helper scanners, and fails if the tag is not followed by a blank,
/// a line break or the end of input, or (inside a flow collection) one of `,`, `]`, `}`.
fn scan_tag(&mut self) -> Result<Token<'input>, ScanError> {
let start_mark = self.mark;
self.input.lookahead(2);
// Inputs without byte offsets go through the always-allocating implementation.
if self.input.byte_offset().is_none() {
return self.scan_tag_owned(&start_mark);
}
let (handle, suffix): (Cow<'input, str>, Cow<'input, str>) =
// A `<` in second position introduces a verbatim tag: empty handle, the URI is the suffix.
if self.input.nth_char_is(1, '<') {
let suffix = self.scan_verbatim_tag(&start_mark)?;
(Cow::Owned(String::new()), Cow::Owned(suffix))
} else {
let handle = self.scan_tag_handle_cow(&start_mark)?;
// A handle of the form `!...!` (including `!!`): a non-empty suffix must follow.
if handle.len() >= 2 && handle.starts_with('!') && handle.ends_with('!') {
let suffix = self.scan_tag_shorthand_suffix_cow(&start_mark, true)?;
(handle, suffix)
} else {
// No closing `!`: whatever was scanned after the leading `!` is really the beginning
// of the suffix, and the handle is the primary handle `!`.
let remaining_suffix =
self.scan_tag_shorthand_suffix_cow(&start_mark, false)?;
let suffix = if handle.len() > 1 {
if remaining_suffix.is_empty() {
// Nothing follows: the suffix is the scanned text minus its leading `!`.
match handle {
Cow::Borrowed(s) => Cow::Borrowed(&s[1..]),
Cow::Owned(s) => Cow::Owned(s[1..].to_owned()),
}
} else {
// Both parts contribute: concatenate them into a fresh owned string.
let mut combined = handle[1..].to_owned();
combined.push_str(&remaining_suffix);
Cow::Owned(combined)
}
} else {
remaining_suffix
};
// A lone `!` is reported with an empty handle and `!` as its suffix.
if suffix.is_empty() {
(Cow::Borrowed(""), Cow::Borrowed("!"))
} else {
(Cow::Borrowed("!"), suffix)
}
}
};
// Validate the character following the tag.
if is_blank_or_breakz(self.input.look_ch())
|| (self.flow_level > 0 && matches!(self.input.peek(), ',' | ']' | '}'))
{
Ok(Token(
Span::new(start_mark, self.mark),
TokenType::Tag(handle, suffix),
))
} else {
Err(ScanError::new_str(
start_mark,
"while scanning a tag, did not find expected whitespace or line break",
))
}
}
/// Scan a tag token, always producing owned strings for the handle and the suffix.
///
/// Used by `scan_tag` when the input does not report byte offsets. `start_mark` is the position
/// of the tag, used for error reporting and as the start of the token's span.
///
/// # Errors
/// Propagates errors from the helper scanners, and fails if the tag is not followed by a blank,
/// a line break or the end of input, or (inside a flow collection) one of `,`, `]`, `}`.
fn scan_tag_owned(&mut self, start_mark: &Marker) -> Result<Token<'input>, ScanError> {
    let (handle, suffix) = if self.input.nth_char_is(1, '<') {
        // Verbatim tag: empty handle, the URI is the suffix.
        (String::new(), self.scan_verbatim_tag(start_mark)?)
    } else {
        let scanned = self.scan_tag_handle(false, start_mark)?;
        let has_closing_bang =
            scanned.len() >= 2 && scanned.starts_with('!') && scanned.ends_with('!');
        if has_closing_bang {
            // `!...!` handle: the suffix is scanned on its own.
            let is_secondary_handle = scanned == "!!";
            let suffix =
                self.scan_tag_shorthand_suffix(false, is_secondary_handle, "", start_mark)?;
            (scanned, suffix)
        } else {
            // No closing `!`: the scanned text (minus its leading `!`) starts the suffix.
            let suffix = self.scan_tag_shorthand_suffix(false, false, &scanned, start_mark)?;
            if suffix.is_empty() {
                // A lone `!` is reported with an empty handle and `!` as its suffix.
                (String::new(), String::from("!"))
            } else {
                (String::from("!"), suffix)
            }
        }
    };

    let properly_terminated = is_blank_or_breakz(self.input.look_ch())
        || (self.flow_level > 0 && matches!(self.input.peek(), ',' | ']' | '}'));
    if !properly_terminated {
        return Err(ScanError::new_str(
            *start_mark,
            "while scanning a tag, did not find expected whitespace or line break",
        ));
    }

    let span = Span::new(*start_mark, self.mark);
    Ok(Token(span, TokenType::Tag(handle.into(), suffix.into())))
}
/// Scan a tag handle, borrowing it from the input when possible.
///
/// The handle is a `!`, followed by any number of characters accepted by `next_is_alpha`,
/// optionally closed by a second `!`. When the input reports no byte offset, the owned
/// `scan_tag_handle` implementation is used instead.
///
/// # Errors
/// Fails if the current character is not `!`, or if the input reported byte offsets but could
/// not provide the corresponding slice.
fn scan_tag_handle_cow(&mut self, mark: &Marker) -> Result<Cow<'input, str>, ScanError> {
    let Some(start) = self.input.byte_offset() else {
        return self.scan_tag_handle(false, mark).map(Cow::Owned);
    };

    if self.input.look_ch() != '!' {
        return Err(ScanError::new_str(
            *mark,
            "while scanning a tag, did not find expected '!'",
        ));
    }
    self.skip_non_blank();

    // Consume the body of the handle, one character at a time.
    loop {
        self.input.lookahead(1);
        if !self.input.next_is_alpha() {
            break;
        }
        self.skip_non_blank();
    }

    // Optional closing `!`.
    if self.input.peek() == '!' {
        self.skip_non_blank();
    }

    let Some(end) = self.input.byte_offset() else {
        return self.scan_tag_handle(false, mark).map(Cow::Owned);
    };

    // Prefer a zero-copy borrow; otherwise copy the bytes out of the input.
    if let Some(borrowed) = self.try_borrow_slice(start, end) {
        return Ok(Cow::Borrowed(borrowed));
    }
    self.input
        .slice_bytes(start, end)
        .map(|copied| Cow::Owned(copied.to_owned()))
        .ok_or_else(|| {
            ScanError::new_str(
                *mark,
                "internal error: input advertised slicing but did not provide a slice",
            )
        })
}
/// Scan a tag shorthand suffix, borrowing it from the input when possible.
///
/// Characters are consumed while `is_tag_char` accepts them. If no `%` escape is encountered,
/// the suffix is returned as a slice of the input (borrowed if `try_borrow_slice` succeeds,
/// copied otherwise). As soon as a `%` is seen, the text consumed so far is copied into an owned
/// buffer and the remaining characters (with escapes decoded by `scan_uri_escapes`) are appended.
///
/// * `mark` - position reported in errors.
/// * `require_non_empty` - when `true`, an empty suffix on the slicing path is an error.
///
/// NOTE(review): the two fallback paths used when the input reports no byte offset call
/// `scan_tag_shorthand_suffix` with an empty head, which rejects an empty suffix regardless of
/// `require_non_empty`.
///
/// # Errors
/// Fails on an invalid escape sequence, on an empty suffix when `require_non_empty` is set, or
/// if the input reported byte offsets but could not provide the corresponding slice.
fn scan_tag_shorthand_suffix_cow(
    &mut self,
    mark: &Marker,
    require_non_empty: bool,
) -> Result<Cow<'input, str>, ScanError> {
    let Some(start) = self.input.byte_offset() else {
        return Ok(Cow::Owned(
            self.scan_tag_shorthand_suffix(false, false, "", mark)?,
        ));
    };
    while is_tag_char(self.input.look_ch()) {
        if self.input.peek() == '%' {
            // An escape forces an owned result. Seed it with what was consumed so far.
            let current = self
                .input
                .byte_offset()
                .expect("byte_offset() must remain available once enabled");
            let mut out = if current == start {
                String::new()
            } else {
                // Characters were already consumed: failing to retrieve them must not silently
                // truncate the suffix, so report it like the non-escape path below does.
                self.input
                    .slice_bytes(start, current)
                    .ok_or_else(|| {
                        ScanError::new_str(
                            *mark,
                            "internal error: input advertised slicing but did not provide a slice",
                        )
                    })?
                    .to_owned()
            };
            while is_tag_char(self.input.look_ch()) {
                if self.input.peek() == '%' {
                    out.push(self.scan_uri_escapes(mark)?);
                } else {
                    out.push(self.input.peek());
                    self.skip_non_blank();
                }
            }
            return Ok(Cow::Owned(out));
        }
        self.skip_non_blank();
    }
    let Some(end) = self.input.byte_offset() else {
        return Ok(Cow::Owned(
            self.scan_tag_shorthand_suffix(false, false, "", mark)?,
        ));
    };
    if require_non_empty && start == end {
        return Err(ScanError::new_str(
            *mark,
            "while parsing a tag, did not find expected tag URI",
        ));
    }
    // Prefer a zero-copy borrow; otherwise copy the bytes out of the input.
    if let Some(slice) = self.try_borrow_slice(start, end) {
        Ok(Cow::Borrowed(slice))
    } else {
        let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
            ScanError::new_str(
                *mark,
                "internal error: input advertised slicing but did not provide a slice",
            )
        })?;
        Ok(Cow::Owned(slice.to_owned()))
    }
}
/// Scan a tag handle into an owned string.
///
/// The handle is a `!`, followed by the characters collected by `fetch_while_is_alpha`,
/// optionally closed by a second `!`.
///
/// * `directive` - when `true`, a handle other than the bare `!` must end with a closing `!`.
/// * `mark` - position reported in errors.
///
/// # Errors
/// Fails if the current character is not `!`, or if `directive` is set and the closing `!` is
/// missing from a handle longer than `!`.
fn scan_tag_handle(&mut self, directive: bool, mark: &Marker) -> Result<String, ScanError> {
    if self.input.look_ch() != '!' {
        return Err(ScanError::new_str(
            *mark,
            "while scanning a tag, did not find expected '!'",
        ));
    }

    // Leading `!`.
    let mut handle = String::new();
    handle.push(self.input.peek());
    self.skip_non_blank();

    // Body of the handle; the marker is advanced manually since no `skip_*` was called.
    let body_len = self.input.fetch_while_is_alpha(&mut handle);
    self.mark.col += body_len;
    self.mark.offsets.chars += body_len;
    self.mark.offsets.bytes = self.input.byte_offset();

    let closing = self.input.peek();
    if closing == '!' {
        handle.push(closing);
        self.skip_non_blank();
    } else if directive && handle != "!" {
        return Err(ScanError::new_str(
            *mark,
            "while parsing a tag directive, did not find expected '!'",
        ));
    }
    Ok(handle)
}
/// Scan a tag prefix into an owned string.
///
/// The first character is either a `!`, or a character accepted by `is_tag_char` (a `%` escape
/// being decoded by `scan_uri_escapes`). Further characters are consumed while `is_uri_char`
/// accepts them, decoding `%` escapes along the way.
///
/// # Errors
/// Fails (at `start_mark`) if the first character is neither `!` nor a tag character, or if an
/// escape sequence is invalid.
fn scan_tag_prefix(&mut self, start_mark: &Marker) -> Result<String, ScanError> {
    let mut prefix = String::new();

    // First character.
    match self.input.look_ch() {
        '!' => {
            prefix.push(self.input.peek());
            self.skip_non_blank();
        }
        _ if !is_tag_char(self.input.peek()) => {
            return Err(ScanError::new_str(
                *start_mark,
                "invalid global tag character",
            ));
        }
        _ if self.input.peek() == '%' => {
            prefix.push(self.scan_uri_escapes(start_mark)?);
        }
        _ => {
            prefix.push(self.input.peek());
            self.skip_non_blank();
        }
    }

    // Remaining characters.
    while is_uri_char(self.input.look_ch()) {
        let ch = if self.input.peek() == '%' {
            self.scan_uri_escapes(start_mark)?
        } else {
            let raw = self.input.peek();
            self.skip_non_blank();
            raw
        };
        prefix.push(ch);
    }
    Ok(prefix)
}
/// Scan a verbatim tag and return its URI.
///
/// The first two characters are skipped unconditionally (callers in this file check for a `<`
/// in second position before calling). The URI is then read while `is_uri_char` accepts the
/// characters, decoding `%` escapes, and must be closed by `>`.
///
/// # Errors
/// Fails (at `start_mark`) if the URI is empty, if the closing `>` is missing, or if an escape
/// sequence is invalid.
fn scan_verbatim_tag(&mut self, start_mark: &Marker) -> Result<String, ScanError> {
    // Skip the two-character opener.
    self.skip_non_blank();
    self.skip_non_blank();

    let mut uri = String::new();
    while is_uri_char(self.input.look_ch()) {
        let ch = if self.input.peek() == '%' {
            self.scan_uri_escapes(start_mark)?
        } else {
            let raw = self.input.peek();
            self.skip_non_blank();
            raw
        };
        uri.push(ch);
    }

    // The emptiness check takes precedence over the missing-`>` check.
    let failure = if uri.is_empty() {
        Some("while parsing a tag, did not find expected tag URI")
    } else if self.input.peek() != '>' {
        Some("while scanning a verbatim tag, did not find the expected '>'")
    } else {
        None
    };
    if let Some(message) = failure {
        return Err(ScanError::new_str(*start_mark, message));
    }

    // Consume the closing `>`.
    self.skip_non_blank();
    Ok(uri)
}
/// Scan a tag shorthand suffix into an owned string.
///
/// * `_directive`, `_is_secondary` - unused by this implementation.
/// * `head` - text already scanned by the caller; when it is longer than one byte, everything
///   after its first character is prepended to the suffix.
/// * `mark` - position reported in errors.
///
/// # Errors
/// Fails if `head` is empty and no tag character could be read, or if an escape sequence is
/// invalid.
fn scan_tag_shorthand_suffix(
    &mut self,
    _directive: bool,
    _is_secondary: bool,
    head: &str,
    mark: &Marker,
) -> Result<String, ScanError> {
    // Seed the suffix with `head` minus its first character.
    let mut suffix: String = if head.len() > 1 {
        head.chars().skip(1).collect()
    } else {
        String::new()
    };

    // Counts the bytes of `head` plus one per character (or escape) consumed below.
    let mut seen = head.len();
    while is_tag_char(self.input.look_ch()) {
        let ch = if self.input.peek() == '%' {
            self.scan_uri_escapes(mark)?
        } else {
            let raw = self.input.peek();
            self.skip_non_blank();
            raw
        };
        suffix.push(ch);
        seen += 1;
    }

    if seen == 0 {
        return Err(ScanError::new_str(
            *mark,
            "while parsing a tag, did not find expected tag URI",
        ));
    }
    Ok(suffix)
}
/// Decode one `%XX`-escaped character of a tag URI.
///
/// The first escaped byte determines, from its high bits, how many bytes the UTF-8 sequence
/// has (1 to 4); that many `%XX` escapes are consumed in total. The collected bytes are then
/// decoded as UTF-8 and must form exactly one character.
///
/// # Errors
/// Fails (at `mark`) on a malformed `%XX` escape, on an invalid leading or trailing UTF-8 byte,
/// or if the collected bytes do not decode to a single character.
fn scan_uri_escapes(&mut self, mark: &Marker) -> Result<char, ScanError> {
    let mut utf8 = [0u8; 4];
    let mut filled = 0usize;
    // Number of escaped bytes still expected; zero means "leading byte not seen yet".
    let mut remaining = 0usize;

    loop {
        self.input.lookahead(3);
        let hi = self.input.peek_nth(1);
        let lo = self.input.peek_nth(2);
        let well_formed = self.input.peek() == '%' && is_hex(hi) && is_hex(lo);
        if !well_formed {
            return Err(ScanError::new_str(
                *mark,
                "while parsing a tag, found an invalid escape sequence",
            ));
        }
        let byte = u8::try_from((as_hex(hi) << 4) + as_hex(lo))
            .expect("two hex nibbles always fit in a byte");

        if remaining == 0 {
            // Leading byte: its high bits give the length of the sequence.
            remaining = if byte & 0x80 == 0x00 {
                1
            } else if byte & 0xE0 == 0xC0 {
                2
            } else if byte & 0xF0 == 0xE0 {
                3
            } else if byte & 0xF8 == 0xF0 {
                4
            } else {
                return Err(ScanError::new_str(
                    *mark,
                    "while parsing a tag, found an incorrect leading UTF-8 byte",
                ));
            };
        } else if byte & 0xc0 != 0x80 {
            // Continuation bytes must look like `10xxxxxx`.
            return Err(ScanError::new_str(
                *mark,
                "while parsing a tag, found an incorrect trailing UTF-8 byte",
            ));
        }

        utf8[filled] = byte;
        filled += 1;
        self.skip_n_non_blank(3);

        remaining -= 1;
        if remaining == 0 {
            break;
        }
    }

    let invalid_codepoint = || {
        ScanError::new_str(
            *mark,
            "while parsing a tag, found an invalid UTF-8 codepoint",
        )
    };
    let decoded = core::str::from_utf8(&utf8[..filled]).map_err(|_| invalid_codepoint())?;

    // Exactly one character is expected.
    let mut chars = decoded.chars();
    let first = chars.next();
    if chars.next().is_some() {
        return Err(invalid_codepoint());
    }
    first.ok_or_else(invalid_codepoint)
}
/// Scan an anchor or an alias at the current position and append it to the token queue.
///
/// `alias` is forwarded to `scan_anchor` and selects which of the two token kinds is produced.
///
/// # Errors
/// Propagates any error returned by `scan_anchor`.
fn fetch_anchor(&mut self, alias: bool) -> ScanResult {
    // Register a potential simple key here, then forbid simple keys past the token.
    self.save_simple_key();
    self.disallow_simple_key();

    let queued = self.scan_anchor(alias)?.into();
    self.tokens.push_back(queued);
    Ok(())
}
fn scan_anchor(&mut self, alias: bool) -> Result<Token<'input>, ScanError> {
let start_mark = self.mark;
self.skip_non_blank();
if let Some(start) = self.input.byte_offset() {
while is_anchor_char(self.input.look_ch()) {
self.skip_non_blank();
}
let end = self
.input
.byte_offset()
.expect("byte_offset() must remain available once enabled");
if start == end {
return Err(ScanError::new_str(start_mark, "while scanning an anchor or alias, did not find expected alphabetic or numeric character"));
}
let cow = if let Some(slice) = self.try_borrow_slice(start, end) {
Cow::Borrowed(slice)
} else if let Some(slice) = self.input.slice_bytes(start, end) {
Cow::Owned(slice.to_owned())
} else {
return Err(ScanError::new_str(
start_mark,
"internal error: input advertised slicing but did not provide a slice",
));
};
let tok = if alias {
TokenType::Alias(cow)
} else {
TokenType::Anchor(cow)
};
return Ok(Token(Span::new(start_mark, self.mark), tok));
}
let mut string = String::new();
while is_anchor_char(self.input.look_ch()) {
string.push(self.input.peek());
self.skip_non_blank();
}
if string.is_empty() {
return Err(ScanError::new_str(start_mark, "while scanning an anchor or alias, did not find expected alphabetic or numeric character"));
}
let tok = if alias {
TokenType::Alias(string.into())
} else {
TokenType::Anchor(string.into())
};
Ok(Token(Span::new(start_mark, self.mark), tok))
}
/// Handle the opening indicator of a flow collection and queue `tok` for it.
///
/// The position and character of the opening indicator are recorded in `flow_markers` so that
/// the matching closing indicator can be validated later. The token is inserted at the queue
/// position observed before the trailing whitespace is skipped.
///
/// # Errors
/// Propagates errors from `increase_flow_level` and `skip_ws_to_eol`.
fn fetch_flow_collection_start(&mut self, tok: TokenType<'input>) -> ScanResult {
    self.save_simple_key();

    // Remember which bracket opened this collection, and where.
    let opening_mark = self.mark;
    let opening_ch = self.input.peek();
    self.flow_markers.push((opening_mark, opening_ch));

    self.roll_one_col_indent();
    self.increase_flow_level()?;
    self.allow_simple_key();
    self.skip_non_blank();

    // Track whether this level is a mapping; other collections additionally get an
    // implicit-mapping state.
    let starts_mapping = tok == TokenType::FlowMappingStart;
    self.flow_mapping_started.push(starts_mapping);
    if !starts_mapping {
        self.implicit_flow_mapping_states
            .push(ImplicitMappingState::Possible);
    }

    let slot = self.tokens.len();
    self.skip_ws_to_eol(SkipTabs::Yes)?;
    let span = Span::new(opening_mark, self.mark);
    self.insert_token(slot, Token(span, tok));
    Ok(())
}
/// Handle the closing indicator of a flow collection and queue `tok` for it.
///
/// The closing indicator is checked against the opening one recorded in `flow_markers`.
///
/// # Errors
/// Fails with "misplaced bracket" when no flow collection is open, and with a
/// "mismatched bracket" message when the closing indicator does not match the opening one.
/// Also propagates errors from `remove_simple_key` and `skip_ws_to_eol`.
///
/// # Panics
/// Panics if `tok` is neither `FlowSequenceEnd` nor `FlowMappingEnd`.
fn fetch_flow_collection_end(&mut self, tok: TokenType<'input>) -> ScanResult {
    // Both a non-zero flow level and a recorded opening bracket are required.
    let opening = if self.flow_level == 0 {
        None
    } else {
        self.flow_markers.pop()
    };
    let Some((open_mark, open_ch)) = opening else {
        return Err(ScanError::new_str(self.mark, "misplaced bracket"));
    };

    let closes_sequence = matches!(tok, TokenType::FlowSequenceEnd);
    let (expected_open, actual_close) = if closes_sequence {
        ('[', ']')
    } else if matches!(tok, TokenType::FlowMappingEnd) {
        ('{', '}')
    } else {
        unreachable!("flow collection end called with non-closing token")
    };
    if open_ch != expected_open {
        return Err(ScanError::new(
            open_mark,
            format!("mismatched bracket '{open_ch}' closed by '{actual_close}'"),
        ));
    }

    // The flow level must be captured before it is decreased below.
    let level_before = self.flow_level;
    self.remove_simple_key()?;
    if closes_sequence {
        let here = self.mark;
        self.end_implicit_mapping(here, level_before);
        self.implicit_flow_mapping_states.pop();
    }
    self.flow_mapping_started.pop();
    self.decrease_flow_level();
    self.disallow_simple_key();

    let start_mark = self.mark;
    self.skip_non_blank();
    let slot = self.tokens.len();
    self.skip_ws_to_eol(SkipTabs::Yes)?;

    // Still nested in a flow collection: record the current position in
    // `adjacent_value_allowed_at`.
    if self.flow_level > 0 {
        self.adjacent_value_allowed_at = self.mark.index();
    }

    let span = Span::new(start_mark, self.mark);
    self.insert_token(slot, Token(span, tok));
    Ok(())
}
/// Handle a flow entry indicator and queue a `FlowEntry` token for it.
///
/// The token is inserted at the queue position observed before the trailing whitespace is
/// skipped.
///
/// # Errors
/// Propagates errors from `remove_simple_key` and `skip_ws_to_eol`.
fn fetch_flow_entry(&mut self) -> ScanResult {
    self.remove_simple_key()?;
    self.allow_simple_key();

    let (here, level) = (self.mark, self.flow_level);
    self.end_implicit_mapping(here, level);

    // Inside a sequence, reset the "mapping started" flag of the current level.
    if self.current_flow_collection_is_sequence() {
        self.set_current_flow_mapping_started(false);
    }

    let entry_mark = self.mark;
    self.skip_non_blank();
    let slot = self.tokens.len();
    self.skip_ws_to_eol(SkipTabs::Yes)?;

    let entry = Token(Span::new(entry_mark, self.mark), TokenType::FlowEntry);
    self.insert_token(slot, entry);
    Ok(())
}
/// Enter one more level of flow nesting.
///
/// A fresh simple-key slot is pushed for the new level before the counter is incremented.
///
/// # Errors
/// Fails with "recursion limit exceeded" if the flow level counter would overflow.
fn increase_flow_level(&mut self) -> ScanResult {
    self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0)));
    match self.flow_level.checked_add(1) {
        Some(deeper) => {
            self.flow_level = deeper;
            Ok(())
        }
        None => Err(ScanError::new_str(self.mark, "recursion limit exceeded")),
    }
}
/// Leave one level of flow nesting; does nothing when the flow level is already zero.
///
/// # Panics
/// Panics if the flow level is non-zero while `simple_keys` is empty.
fn decrease_flow_level(&mut self) {
    if self.flow_level == 0 {
        return;
    }
    self.flow_level -= 1;
    // Drop the simple-key slot that belonged to the level being left.
    self.simple_keys.pop().unwrap();
}
/// Handle a `-` block sequence entry indicator and queue a `BlockEntry` token for it.
///
/// # Errors
/// Fails when called inside a flow collection, when simple keys (and therefore block entries)
/// are not allowed at this position, when the last queued token is an anchor or tag that
/// started at column 0 while this entry is also at column 0 and `self.indent` is above -1, or
/// when tabs were skipped after the `-` and another `-` entry follows. Also propagates errors
/// from `skip_ws_to_eol` and `remove_simple_key`.
fn fetch_block_entry(&mut self) -> ScanResult {
// Block sequence entries cannot appear inside a flow collection.
if self.flow_level > 0 {
return Err(ScanError::new_str(
self.mark,
r#""-" is only valid inside a block"#,
));
}
// A new entry may only start where a simple key is allowed.
if !self.simple_key_allowed {
return Err(ScanError::new_str(
self.mark,
"block sequence entries are not allowed in this context",
));
}
// Reject an entry at column 0 right after an anchor/tag that also started at column 0,
// unless `self.indent` is -1.
if let Some(QueuedToken(span, QueuedTokenType::Anchor(..) | QueuedTokenType::Tag(..))) =
self.tokens.back()
{
if self.mark.col == 0 && span.start.col == 0 && self.indent > -1 {
return Err(ScanError::new_str(
span.start,
"invalid indentation for anchor",
));
}
}
// Remember where the `-` is, then consume it.
let mark = self.mark;
self.skip_non_blank();
self.roll_indent(mark.col, None, TokenType::BlockSequenceStart, mark);
// Capture the queue position now: the BlockEntry token is inserted here, ahead of any token
// queued in the meantime.
let token_index = self.tokens.len();
let found_tabs = self.skip_ws_to_eol(SkipTabs::Yes)?.found_tabs();
self.input.lookahead(2);
// Tabs were skipped and what follows is another `-` followed by a blank, break or end of
// input: reject it.
if found_tabs && self.input.next_char_is('-') && is_blank_or_breakz(self.input.peek_nth(1))
{
return Err(ScanError::new_str(
self.mark,
"'-' must be followed by a valid YAML whitespace",
));
}
self.skip_ws_to_eol(SkipTabs::No)?;
self.input.lookahead(1);
// When a line break or a flow indicator follows, apply `roll_one_col_indent`.
if self.input.next_is_break() || self.input.next_is_flow() {
self.roll_one_col_indent();
}
self.remove_simple_key()?;
self.allow_simple_key();
// The token gets an empty span located at the position reached after skipping whitespace.
self.insert_token(
token_index,
Token(Span::empty(self.mark), TokenType::BlockEntry),
);
Ok(())
}
/// Handle a three-character document indicator and append `t` to the token queue.
///
/// All block indents are unrolled (down to -1) and the pending simple key is removed. After the
/// indicator, `document_prefix_allowed` is set to whether `t` is `TokenType::DocumentEnd`.
///
/// # Errors
/// Fails with an "unclosed bracket" message if a flow collection is still open, and propagates
/// errors from `remove_simple_key`.
fn fetch_document_indicator(&mut self, t: TokenType<'input>) -> ScanResult {
    // A document boundary is not allowed while a flow collection is still open.
    if let Some((mark, bracket)) = self.flow_markers.pop() {
        let message = format!("unclosed bracket '{bracket}'");
        return Err(ScanError::new(mark, message));
    }

    self.unroll_indent(-1);
    self.remove_simple_key()?;
    self.disallow_simple_key();

    let ends_document = matches!(t, TokenType::DocumentEnd);
    let indicator_start = self.mark;
    self.skip_n_non_blank(3);
    self.document_prefix_allowed = ends_document;

    let span = Span::new(indicator_start, self.mark);
    self.tokens.push_back(Token(span, t).into());
    Ok(())
}
/// Scan a block scalar at the current position and append it to the token queue.
///
/// `literal` is forwarded to `scan_block_scalar` and selects the literal (`true`) or folded
/// (`false`) style.
///
/// # Errors
/// Propagates any error returned by `scan_block_scalar`.
fn fetch_block_scalar(&mut self, literal: bool) -> ScanResult {
    // Register a potential simple key here; simple keys are allowed again afterwards.
    self.save_simple_key();
    self.allow_simple_key();

    let queued = self.scan_block_scalar(literal)?.into();
    self.tokens.push_back(queued);
    Ok(())
}
/// Scan a block scalar and return it as a `Scalar` token.
///
/// `literal` selects the style: `ScalarStyle::Literal` when `true`, `ScalarStyle::Folded`
/// otherwise. The header may carry a chomping indicator (`+` keeps, `-` strips) and a one-digit
/// indentation indicator (1 to 9), in either order.
///
/// # Errors
/// Fails on an indentation indicator of `0`, on unexpected content after the header, when the
/// content starts with a tab, or on a wrongly indented first content line. Also propagates
/// errors from `skip_ws_to_eol`.
#[allow(clippy::too_many_lines)]
fn scan_block_scalar(&mut self, literal: bool) -> Result<Token<'input>, ScanError> {
let start_mark = self.mark;
let mut chomping = Chomping::Clip;
// Value of the explicit indentation indicator; 0 means "not specified".
let mut increment: usize = 0;
// Column at which content lines start; 0 means "not determined yet".
let mut indent: usize = 0;
let mut trailing_blank: bool;
let mut leading_blank: bool = false;
let style = if literal {
ScalarStyle::Literal
} else {
ScalarStyle::Folded
};
let mut string = String::new();
let mut leading_break = String::new();
let mut trailing_breaks = String::new();
// Line break that terminates the header line, if any.
let mut chomping_break = String::new();
// Consume the block scalar indicator character.
self.skip_non_blank();
self.unroll_non_block_indents();
// Header form 1: chomping indicator, optionally followed by an indentation indicator.
if self.input.look_ch() == '+' || self.input.peek() == '-' {
if self.input.peek() == '+' {
chomping = Chomping::Keep;
} else {
chomping = Chomping::Strip;
}
self.skip_non_blank();
self.input.lookahead(1);
if self.input.next_is_digit() {
if self.input.peek() == '0' {
return Err(ScanError::new_str(
start_mark,
"while scanning a block scalar, found an indentation indicator equal to 0",
));
}
increment = (self.input.peek() as usize) - ('0' as usize);
self.skip_non_blank();
}
// Header form 2: indentation indicator, optionally followed by a chomping indicator.
} else if self.input.next_is_digit() {
if self.input.peek() == '0' {
return Err(ScanError::new_str(
start_mark,
"while scanning a block scalar, found an indentation indicator equal to 0",
));
}
increment = (self.input.peek() as usize) - ('0' as usize);
self.skip_non_blank();
self.input.lookahead(1);
if self.input.peek() == '+' || self.input.peek() == '-' {
if self.input.peek() == '+' {
chomping = Chomping::Keep;
} else {
chomping = Chomping::Strip;
}
self.skip_non_blank();
}
}
// Nothing but what `skip_ws_to_eol` consumes may follow the header on its line.
self.skip_ws_to_eol(SkipTabs::Yes)?;
self.input.lookahead(1);
if !self.input.next_is_breakz() {
return Err(ScanError::new_str(
start_mark,
"while scanning a block scalar, did not find expected comment or line break",
));
}
// Consume the header's line break, keeping it for the no-content case below.
if self.input.next_is_break() {
self.input.lookahead(2);
self.read_break(&mut chomping_break);
}
if self.input.look_ch() == '\t' {
return Err(ScanError::new_str(
start_mark,
"a block scalar content cannot start with a tab",
));
}
// An explicit indentation indicator is relative to the current block indent (when there is
// one).
if increment > 0 {
indent = if self.indent >= 0 {
(self.indent + increment as isize) as usize
} else {
increment
}
}
// Without an explicit indicator, detect the indent from the first lines; otherwise skip up to
// the known indent. Both collect the line breaks of leading empty lines.
if indent == 0 {
self.skip_block_scalar_first_line_indent(&mut indent, &mut trailing_breaks);
} else {
self.skip_block_scalar_indent(indent, &mut trailing_breaks);
}
// End of input reached before any content line: build the result from the breaks only.
if self.input.next_is_z() {
let contents = match chomping {
// Stripping removes every trailing break: nothing remains.
Chomping::Strip => String::new(),
// Still on the header's line: there was no line break at all.
_ if self.mark.line == start_mark.line() => String::new(),
// Clipping keeps only the break that ended the header line.
Chomping::Clip => chomping_break,
// Keeping, but no further breaks were collected: only the header's break remains.
Chomping::Keep if trailing_breaks.is_empty() => chomping_break,
// Keeping with further breaks collected: those are the content.
Chomping::Keep => trailing_breaks,
};
return Ok(Token(
Span::new(start_mark, self.mark),
TokenType::Scalar(style, contents.into()),
));
}
// The line is indented less than the scalar's content but more than the enclosing block.
if self.mark.col < indent && (self.mark.col as isize) > self.indent {
if self.indent < 0 && self.mark.col == 0 {
self.input.lookahead(4);
// At column 0 with no enclosing block indent, a document marker or a `#` is tolerated.
if self.input.next_is_document_start()
|| self.input.next_is_document_end()
|| self.input.peek() == '#'
{
} else {
return Err(ScanError::new_str(
self.mark,
"wrongly indented line in block scalar",
));
}
} else {
return Err(ScanError::new_str(
self.mark,
"wrongly indented line in block scalar",
));
}
}
// Scratch buffer handed to `scan_block_scalar_content_line` and reused for every line.
let mut line_buffer = String::with_capacity(100);
// `start_mark` is shadowed: the returned span starts at the first content line.
let start_mark = self.mark;
// One iteration per content line.
while self.mark.col == indent && !self.input.next_is_z() {
// With a zero indent, a document end marker terminates the scalar.
if indent == 0 {
self.input.lookahead(4);
if self.input.next_is_document_end() {
break;
}
}
// Does the current line start with a blank?
trailing_blank = self.input.next_is_blank();
// Folded style: a single line break between two lines that do not start with a blank is
// replaced by a space; if empty lines were in between, only their breaks are kept.
if !literal && !leading_break.is_empty() && !leading_blank && !trailing_blank {
string.push_str(&trailing_breaks);
if trailing_breaks.is_empty() {
string.push(' ');
}
} else {
// Otherwise all line breaks are kept as they are.
string.push_str(&leading_break);
string.push_str(&trailing_breaks);
}
leading_break.clear();
trailing_breaks.clear();
// Remembered for the folding decision of the next line.
leading_blank = self.input.next_is_blank();
self.scan_block_scalar_content_line(&mut string, &mut line_buffer);
self.input.lookahead(2);
// Stop at end of input.
if self.input.next_is_z() {
break;
}
self.read_break(&mut leading_break);
// Skip the indentation of the next line and any empty lines in between.
self.skip_block_scalar_indent(indent, &mut trailing_breaks);
}
// Apply chomping to the tail.
if chomping != Chomping::Strip {
string.push_str(&leading_break);
// End of input reached on a line at or beyond the content indent (and past column 0):
// append a line break.
if self.input.next_is_z() && self.mark.col >= indent.max(1) {
string.push('\n');
}
}
if chomping == Chomping::Keep {
string.push_str(&trailing_breaks);
}
Ok(Token(
Span::new(start_mark, self.mark),
TokenType::Scalar(style, string.into()),
))
}
/// Append the remainder of the current line (up to a line break or the end of input) to
/// `string`.
///
/// Characters already in the input's lookahead buffer are consumed one by one. If the buffer
/// runs empty, the rest of the line is read with `raw_read_non_breakz_ch` into `line_buffer`,
/// appended to `string` in one go, and `line_buffer` is cleared again.
///
/// `line_buffer` is scratch space and is expected to be empty on entry.
fn scan_block_scalar_content_line(&mut self, string: &mut String, line_buffer: &mut String) {
    // Phase 1: drain what is already buffered.
    while !self.input.buf_is_empty() && !self.input.next_is_breakz() {
        string.push(self.input.peek());
        self.skip_blank();
    }
    if !self.input.buf_is_empty() {
        return;
    }

    // Phase 2: the buffer is exhausted, read the rest of the line directly.
    debug_assert!(line_buffer.is_empty());
    let mut raw_count = 0usize;
    while let Some(ch) = self.input.raw_read_non_breakz_ch() {
        line_buffer.push(ch);
        raw_count += 1;
    }

    // No `skip_*` function was involved: the position marker is updated by hand.
    self.mark.col += raw_count;
    self.mark.offsets.chars += raw_count;
    self.mark.offsets.bytes = self.input.byte_offset();

    string.reserve(line_buffer.len());
    string.push_str(line_buffer);
    line_buffer.clear();
}
/// Skip the indentation of block scalar lines, up to column `indent`.
///
/// Spaces are consumed until column `indent` is reached or a non-space character is found. If
/// the line turns out to be empty (a line break follows), the break is appended to `breaks` and
/// the process repeats on the next line. Returns once a line that is not empty is reached.
fn skip_block_scalar_indent(&mut self, indent: usize, breaks: &mut String) {
loop {
// The indentation fits in a single lookahead of the input buffer: fill it once and consume.
if indent < self.input.bufmaxlen() - 2 {
self.input.lookahead(self.input.bufmaxlen());
while self.mark.col < indent && self.input.peek() == ' ' {
self.skip_blank();
}
} else {
// The indentation may exceed the buffer: refill repeatedly.
loop {
self.input.lookahead(self.input.bufmaxlen());
while !self.input.buf_is_empty()
&& self.mark.col < indent
&& self.input.peek() == ' '
{
self.skip_blank();
}
// Stop once the indent is reached, or when the buffer still holds a character that is
// not a space.
if self.mark.col == indent
|| (!self.input.buf_is_empty() && self.input.peek() != ' ')
{
break;
}
}
self.input.lookahead(2);
}
// Empty line: record its break and continue with the next line.
if self.input.next_is_break() {
self.read_break(breaks);
} else {
// Anything else ends the skipping.
break;
}
}
}
/// Determine the indentation of a block scalar that has no explicit indentation indicator.
///
/// Leading spaces are skipped line by line; the line breaks of lines containing only spaces are
/// appended to `breaks`. `indent` is set to the largest column reached on any of the visited
/// lines, but never less than `self.indent + 1`.
fn skip_block_scalar_first_line_indent(&mut self, indent: &mut usize, breaks: &mut String) {
    let mut widest = 0usize;
    loop {
        // Only spaces count as indentation here.
        while self.input.look_ch() == ' ' {
            self.skip_blank();
        }
        widest = widest.max(self.mark.col);

        if !self.input.next_is_break() {
            // The line has content (or the input ended): stop.
            break;
        }
        // Empty line: record its break and move on to the next line.
        self.input.lookahead(2);
        self.read_break(breaks);
    }

    // With `self.indent == -1` the floor is 0, which allows content at column 0.
    let floor = (self.indent + 1) as usize;
    *indent = widest.max(floor);
    if self.indent > 0 {
        *indent = (*indent).max(1);
    }
}
/// Scan a quoted scalar at the current position and insert it into the token queue.
///
/// `single` is forwarded to `scan_flow_scalar` and selects single-quoted (`true`) or
/// double-quoted (`false`) scanning. After the scalar, `skip_to_next_token(true)` is called and
/// `adjacent_value_allowed_at` is set to `usize::MAX` if it returned `true`, and to the current
/// character index otherwise. The token is inserted at the queue position observed before
/// scanning.
///
/// # Errors
/// Propagates errors from `scan_flow_scalar` and `skip_to_next_token`.
fn fetch_flow_scalar(&mut self, single: bool) -> ScanResult {
    self.save_simple_key();
    self.disallow_simple_key();

    let slot = self.tokens.len();
    let scalar = self.scan_flow_scalar(single)?;

    self.adjacent_value_allowed_at = if self.skip_to_next_token(true)? {
        usize::MAX
    } else {
        self.mark.index()
    };

    self.insert_token(slot, scalar);
    Ok(())
}
/// Scan a quoted scalar and return it as a `Scalar` token.
///
/// `single` selects single-quoted (`true`) or double-quoted (`false`) scanning. When the input
/// reports byte offsets, the content is tracked as a byte range of the input
/// (`FlowScalarBuf::Borrowed`) for as long as possible, and only converted to an owned string
/// when the content has to differ from the raw input (see the calls to
/// `promote_flow_scalar_buf_to_owned`). Otherwise an owned string is built from the start.
///
/// The returned span covers the opening quote through the closing quote.
///
/// # Errors
/// Fails on a document indicator at column 0 inside the scalar, on a missing closing quote, on
/// a tab used as indentation, on a badly indented continuation line in block context, and on
/// unexpected content after the closing quote. Also propagates errors from the helpers.
#[allow(clippy::too_many_lines)]
fn scan_flow_scalar(&mut self, single: bool) -> Result<Token<'input>, ScanError> {
let start_mark = self.mark;
// With byte offsets, start the borrowed range right after the opening quote character.
let mut buf = match self.input.byte_offset() {
Some(off) => FlowScalarBuf::new_borrowed(off + self.input.peek().len_utf8()),
None => FlowScalarBuf::new_owned(),
};
// Receives the first line break of a fold; its content is never read in this function.
let mut break_scratch = String::new();
// Consume the opening quote.
self.skip_non_blank();
loop {
self.input.lookahead(4);
// A document indicator at the start of a line cannot appear inside a quoted scalar.
if self.mark.col == 0 && self.input.next_is_document_indicator() {
return Err(ScanError::new_str(
start_mark,
"while scanning a quoted scalar, found unexpected document indicator",
));
}
if self.input.next_is_z() {
return Err(ScanError::new_str(start_mark, "unclosed quote"));
}
// Set once a line break has been consumed (here or by an escaped line break in the helper).
let mut leading_blanks = false;
// Consume a run of characters that are neither blanks nor breaks.
self.consume_flow_scalar_non_whitespace_chars(
single,
&mut buf,
&mut leading_blanks,
&start_mark,
)?;
// Stop at the closing quote.
match self.input.look_ch() {
'\'' if single => break,
'"' if !single => break,
_ => {}
}
// Owned buffer: length of the string before the current run of blanks was appended.
let mut trailing_ws_start: Option<usize> = None;
let mut has_leading_break = false;
let mut has_trailing_breaks = false;
// Borrowed buffer: byte offset at which the current run of blanks started.
let mut pending_ws_start: Option<usize> = None;
// Consume blanks and line breaks.
while self.input.next_is_blank() || self.input.next_is_break() {
if self.input.next_is_blank() {
if leading_blanks {
// Blanks at the start of a continuation line are dropped; a tab located before the
// current indent is rejected.
if self.input.peek() == '\t' && (self.mark.col as isize) < self.indent {
return Err(ScanError::new_str(
self.mark,
"tab cannot be used as indentation",
));
}
self.skip_blank();
} else {
// Blanks following content on the same line: keep them tentatively, they are
// dropped again below if a line break follows.
match buf {
FlowScalarBuf::Owned(ref mut string) => {
if trailing_ws_start.is_none() {
trailing_ws_start = Some(string.len());
}
string.push(self.input.peek());
}
FlowScalarBuf::Borrowed { .. } => {
if pending_ws_start.is_none() {
pending_ws_start = self.input.byte_offset();
}
}
}
self.skip_blank();
// Borrowed buffer: report the byte range of the blank run seen so far.
if let (FlowScalarBuf::Borrowed { .. }, Some(ws_start), Some(ws_end)) =
(&mut buf, pending_ws_start, self.input.byte_offset())
{
buf.note_pending_ws(ws_start, ws_end);
}
}
} else {
self.input.lookahead(2);
if leading_blanks {
// Not the first line break of this fold: it is kept in the content, which therefore
// needs an owned buffer.
match buf {
FlowScalarBuf::Owned(ref mut string) => self.read_break(string),
FlowScalarBuf::Borrowed { .. } => {
self.promote_flow_scalar_buf_to_owned(&start_mark, &mut buf)?;
let Some(string) = buf.as_owned_mut() else {
unreachable!()
};
self.read_break(string);
}
}
has_trailing_breaks = true;
} else {
// First line break of this fold: blanks that preceded it on the line are removed.
if let Some(pos) = trailing_ws_start.take() {
if let FlowScalarBuf::Owned(ref mut string) = buf {
string.truncate(pos);
}
}
if pending_ws_start.take().is_some() {
if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
self.promote_flow_scalar_buf_to_owned(&start_mark, &mut buf)?;
}
buf.discard_pending_ws();
} else {
buf.commit_pending_ws();
}
// The break itself is consumed but not kept.
break_scratch.clear();
self.read_break(&mut break_scratch);
has_leading_break = true;
leading_blanks = true;
}
}
self.input.lookahead(1);
}
// In block context, a continuation line must be indented more than the current block,
// unless it holds only the closing quote.
if leading_blanks && has_leading_break && self.flow_level == 0 {
let next_ch = self.input.peek();
let is_closing_quote = (single && next_ch == '\'') || (!single && next_ch == '"');
if !is_closing_quote && (self.mark.col as isize) <= self.indent {
return Err(ScanError::new_str(
self.mark,
"invalid indentation in multiline quoted scalar",
));
}
}
// A single line break folds into one space; this requires an owned buffer.
if leading_blanks {
if has_leading_break && !has_trailing_breaks {
match buf {
FlowScalarBuf::Owned(ref mut string) => string.push(' '),
FlowScalarBuf::Borrowed { .. } => {
self.promote_flow_scalar_buf_to_owned(&start_mark, &mut buf)?;
let Some(string) = buf.as_owned_mut() else {
unreachable!()
};
string.push(' ');
}
}
}
}
}
// Consume the closing quote; the token's span ends right after it.
self.skip_non_blank();
let end_mark = self.mark;
self.skip_ws_to_eol(SkipTabs::Yes)?;
// Validate what follows the scalar on the line.
match self.input.peek() {
',' | '}' | ']' if self.flow_level > 0 => {}
c if is_breakz(c) => {}
// In block context, a `:` is only accepted if the scalar started on the current line.
':' if self.flow_level == 0 && start_mark.line == self.mark.line => {}
':' if self.flow_level > 0 => {}
_ => {
// NOTE(review): this message says "double-quoted" even when `single` is true.
return Err(ScanError::new_str(
self.mark,
"invalid trailing content after double-quoted scalar",
));
}
}
let style = if single {
ScalarStyle::SingleQuoted
} else {
ScalarStyle::DoubleQuoted
};
let contents = match buf {
FlowScalarBuf::Owned(string) => Cow::Owned(string),
FlowScalarBuf::Borrowed {
start,
mut end,
pending_ws_start,
pending_ws_end,
} => {
// Blanks still pending at this point are part of the content: extend the range.
if pending_ws_start.is_some() {
end = pending_ws_end;
}
// Prefer a zero-copy borrow; otherwise copy the bytes out of the input.
if let Some(slice) = self.try_borrow_slice(start, end) {
Cow::Borrowed(slice)
} else {
let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
ScanError::new_str(
start_mark,
"internal error: input advertised offsets but did not provide a slice",
)
})?;
Cow::Owned(slice.to_owned())
}
}
};
Ok(Token(
Span::new(start_mark, end_mark),
TokenType::Scalar(style, contents),
))
}
/// Consume a run of quoted-scalar characters up to the next blank, line break, end of input or
/// closing quote.
///
/// * `single` - `true` when scanning a single-quoted scalar, `false` for double-quoted.
/// * `buf` - content buffer; converted to its owned form whenever the content has to differ
///   from the raw input (escaped quote, escape sequence, escaped line break).
/// * `leading_blanks` - set to `true` when an escaped line break has been consumed.
/// * `start_mark` - position reported in errors.
///
/// # Errors
/// Propagates errors from `promote_flow_scalar_buf_to_owned` and
/// `resolve_flow_scalar_escape_sequence`.
fn consume_flow_scalar_non_whitespace_chars(
&mut self,
single: bool,
buf: &mut FlowScalarBuf,
leading_blanks: &mut bool,
start_mark: &Marker,
) -> Result<(), ScanError> {
self.input.lookahead(2);
while !is_blank_or_breakz(self.input.peek()) {
match self.input.peek() {
// `''` inside a single-quoted scalar stands for one `'`.
'\'' if self.input.peek_nth(1) == '\'' && single => {
if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
buf.commit_pending_ws();
self.promote_flow_scalar_buf_to_owned(start_mark, buf)?;
}
let Some(string) = buf.as_owned_mut() else {
unreachable!()
};
string.push('\'');
self.skip_n_non_blank(2);
}
// Closing quote: leave it for the caller.
'\'' if single => break,
'"' if !single => break,
// Backslash followed by a line break (double-quoted only): both are consumed and nothing
// is added to the content.
'\\' if !single && is_break(self.input.peek_nth(1)) => {
self.input.lookahead(3);
if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
buf.commit_pending_ws();
self.promote_flow_scalar_buf_to_owned(start_mark, buf)?;
}
self.skip_non_blank();
self.skip_linebreak();
*leading_blanks = true;
break;
}
// Any other escape sequence (double-quoted only) is resolved to a single character.
'\\' if !single => {
if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
buf.commit_pending_ws();
self.promote_flow_scalar_buf_to_owned(start_mark, buf)?;
}
let Some(string) = buf.as_owned_mut() else {
unreachable!()
};
string.push(self.resolve_flow_scalar_escape_sequence(start_mark)?);
}
// Plain character.
c => {
match buf {
FlowScalarBuf::Owned(ref mut string) => {
string.push(c);
}
FlowScalarBuf::Borrowed { .. } => {
buf.commit_pending_ws();
}
}
self.skip_non_blank();
// Borrowed buffer: extend the tracked byte range to include the character.
if let Some(new_end) = self.input.byte_offset() {
if let FlowScalarBuf::Borrowed { end, .. } = buf {
*end = new_end;
}
}
}
}
self.input.lookahead(2);
}
Ok(())
}
/// Resolve the escape sequence at the current position of a double-quoted scalar.
///
/// The current character is expected to be the backslash; the character after it selects the
/// escape. Single-character escapes map directly to a character. `\x`, `\u` and `\U` are
/// followed by 2, 4 and 8 hexadecimal digits respectively. A `\u` escape holding a high
/// surrogate must be immediately followed by a second `\u` escape holding a low surrogate; the
/// pair is combined into one character.
///
/// # Errors
/// Fails (at `start_mark`) on an unknown escape character, on a missing hexadecimal digit, on
/// an unpaired or invalid surrogate, or if the resulting value is not a valid `char`.
fn resolve_flow_scalar_escape_sequence(
    &mut self,
    start_mark: &Marker,
) -> Result<char, ScanError> {
    let fail = |message: &'static str| ScanError::new_str(*start_mark, message);

    // Each escape yields either a final character (zero hex digits) or a hex digit count.
    let (simple, hex_digits) = match self.input.peek_nth(1) {
        '0' => ('\0', 0usize),
        'a' => ('\x07', 0),
        'b' => ('\x08', 0),
        't' | '\t' => ('\t', 0),
        'n' => ('\n', 0),
        'v' => ('\x0b', 0),
        'f' => ('\x0c', 0),
        'r' => ('\x0d', 0),
        'e' => ('\x1b', 0),
        ' ' => ('\x20', 0),
        '"' => ('"', 0),
        '/' => ('/', 0),
        '\\' => ('\\', 0),
        'N' => ('\u{85}', 0),
        '_' => ('\u{A0}', 0),
        'L' => ('\u{2028}', 0),
        'P' => ('\u{2029}', 0),
        'x' => ('\0', 2),
        'u' => ('\0', 4),
        'U' => ('\0', 8),
        _ => {
            return Err(fail(
                "while parsing a quoted scalar, found unknown escape character",
            ));
        }
    };
    // Consume the backslash and the escape selector.
    self.skip_n_non_blank(2);
    if hex_digits == 0 {
        return Ok(simple);
    }

    // Numeric escape: read the hexadecimal digits.
    self.input.lookahead(hex_digits);
    let mut code_point = 0u32;
    for offset in 0..hex_digits {
        let digit = self.input.peek_nth(offset);
        if !is_hex(digit) {
            return Err(fail(
                "while parsing a quoted scalar, did not find expected hexadecimal number",
            ));
        }
        code_point = (code_point << 4) + as_hex(digit);
    }
    self.skip_n_non_blank(hex_digits);

    // Surrogates are only handled for the 4-digit `\u` form.
    if hex_digits == 4 {
        if (0xD800..=0xDBFF).contains(&code_point) {
            // High surrogate: a `\u` escape with the low surrogate must follow.
            self.input.lookahead(2);
            let followed_by_u_escape =
                self.input.peek() == '\\' && self.input.peek_nth(1) == 'u';
            if !followed_by_u_escape {
                return Err(fail(
                    "while parsing a quoted scalar, found high surrogate without following low surrogate",
                ));
            }
            self.skip_n_non_blank(2);
            self.input.lookahead(4);
            let mut low_surrogate = 0u32;
            for offset in 0..4 {
                let digit = self.input.peek_nth(offset);
                if !is_hex(digit) {
                    return Err(fail(
                        "while parsing a quoted scalar, did not find expected hexadecimal number for low surrogate",
                    ));
                }
                low_surrogate = (low_surrogate << 4) + as_hex(digit);
            }
            if !(0xDC00..=0xDFFF).contains(&low_surrogate) {
                return Err(fail(
                    "while parsing a quoted scalar, found invalid low surrogate",
                ));
            }
            // Combine the pair into a single code point.
            code_point = 0x10000 + (((code_point - 0xD800) << 10) | (low_surrogate - 0xDC00));
            self.skip_n_non_blank(4);
        } else if (0xDC00..=0xDFFF).contains(&code_point) {
            return Err(fail(
                "while parsing a quoted scalar, found unpaired low surrogate",
            ));
        }
    }

    char::from_u32(code_point).ok_or_else(|| {
        fail("while parsing a quoted scalar, found invalid Unicode character escape code")
    })
}
/// Scan a plain scalar at the current position and insert it into the token queue.
///
/// The token is inserted at the queue position observed before scanning.
///
/// # Errors
/// Propagates any error returned by `scan_plain_scalar`.
fn fetch_plain_scalar(&mut self) -> ScanResult {
    // Register a potential simple key here, then forbid simple keys past the scalar.
    self.save_simple_key();
    self.disallow_simple_key();

    let slot = self.tokens.len();
    let scalar = self.scan_plain_scalar()?;
    self.insert_token(slot, scalar);
    Ok(())
}
/// Scan a plain (unquoted) scalar starting at the current mark.
///
/// On success, returns a `Scalar` token with `ScalarStyle::Plain` spanning from the
/// starting mark to the mark reached after the last content character. The content is
/// returned as `Cow::Borrowed` when byte offsets are available for both ends and the
/// borrowed slice is identical to the accumulated string; otherwise it is `Cow::Owned`.
///
/// # Errors
/// Returns a `ScanError` when:
/// - inside a flow construct, the scalar starts at a column below the required indent;
/// - inside a flow construct, a `-` is directly followed by a flow indicator;
/// - a tab is found in the indentation columns of a non-empty continuation line;
/// - no character at all was accumulated.
#[allow(clippy::too_many_lines)]
fn scan_plain_scalar(&mut self) -> Result<Token<'input>, ScanError> {
self.unroll_non_block_indents();
// Continuation lines must be indented by at least one column more than `self.indent`.
let indent = self.indent + 1;
let start_mark = self.mark;
if self.flow_level > 0 && (start_mark.col as isize) < indent {
return Err(ScanError::new_str(
start_mark,
"invalid indentation in flow construct",
));
}
let mut string = String::with_capacity(32);
self.buf_whitespaces.clear();
self.buf_leading_break.clear();
self.buf_trailing_breaks.clear();
let mut end_mark = self.mark;
loop {
self.input.lookahead(4);
// Stop at a document indicator in column 0 or at a `#`.
if (self.mark.col == 0 && self.input.next_is_document_indicator())
|| self.input.peek() == '#'
{
// Record where a `#` cut the scalar short, but only in block context, when some
// content was already read and whitespace is buffered in front of the `#`.
if self.input.peek() == '#'
&& !string.is_empty()
&& !self.buf_whitespaces.is_empty()
&& self.flow_level == 0
{
self.interrupted_plain_by_comment = Some(self.mark);
}
break;
}
if self.flow_level > 0 && self.input.peek() == '-' && is_flow(self.input.peek_nth(1)) {
return Err(ScanError::new_str(
self.mark,
"plain scalar cannot start with '-' followed by ,[]{}",
));
}
if !self.input.next_is_blank_or_breakz()
&& self.input.next_can_be_plain_scalar(self.flow_level > 0)
{
// Flush whatever separation was buffered before this content character.
if self.leading_whitespace {
if self.buf_leading_break.is_empty() {
// No leading break buffered: append both break buffers as they are.
string.push_str(&self.buf_leading_break);
string.push_str(&self.buf_trailing_breaks);
self.buf_trailing_breaks.clear();
self.buf_leading_break.clear();
} else {
// Line folding: a lone leading break becomes a single space; otherwise the
// trailing breaks are kept and the leading break is dropped.
if self.buf_trailing_breaks.is_empty() {
string.push(' ');
} else {
string.push_str(&self.buf_trailing_breaks);
self.buf_trailing_breaks.clear();
}
self.buf_leading_break.clear();
}
self.leading_whitespace = false;
} else if !self.buf_whitespaces.is_empty() {
string.push_str(&self.buf_whitespaces);
self.buf_whitespaces.clear();
}
string.push(self.input.peek());
self.skip_non_blank();
string.reserve(self.input.bufmaxlen());
// Consume the rest of the content in chunks until the input reports a stop.
let mut end = false;
while !end {
self.input.lookahead(self.input.bufmaxlen());
let (stop, chars_consumed) = self.input.fetch_plain_scalar_chunk(
&mut string,
self.input.bufmaxlen() - 1,
self.flow_level > 0,
);
end = stop;
// The chunk fetch bypasses the usual skip helpers, so the mark is advanced by hand.
self.mark.offsets.chars += chars_consumed;
self.mark.col += chars_consumed;
self.mark.offsets.bytes = self.input.byte_offset();
}
end_mark = self.mark;
}
// Anything other than a blank or a line break ends the scalar.
if !(self.input.next_is_blank() || self.input.next_is_break()) {
break;
}
self.input.lookahead(2);
// Buffer or skip the blanks and breaks that follow the content.
while self.input.next_is_blank_or_break() {
if self.input.next_is_blank() {
if !self.leading_whitespace {
self.buf_whitespaces.push(self.input.peek());
self.skip_blank();
} else if (self.mark.col as isize) < indent && self.input.peek() == '\t' {
// A tab inside the indentation columns is tolerated only when the rest of the
// line is empty.
self.skip_ws_to_eol(SkipTabs::Yes)?;
if !self.input.next_is_breakz() {
return Err(ScanError::new_str(
start_mark,
"while scanning a plain scalar, found a tab",
));
}
} else {
self.skip_blank();
}
} else {
if self.leading_whitespace {
// Second or later consecutive break.
self.skip_break();
self.buf_trailing_breaks.push('\n');
} else {
// First break after content: buffered inline whitespace is discarded.
self.buf_whitespaces.clear();
self.skip_break();
self.buf_leading_break.push('\n');
self.leading_whitespace = true;
}
}
self.input.lookahead(2);
}
// In block context, a line indented less than required ends the scalar.
if self.flow_level == 0 && (self.mark.col as isize) < indent {
break;
}
}
if self.leading_whitespace {
self.allow_simple_key();
}
if string.is_empty() {
Err(ScanError::new_str(
start_mark,
"unexpected end of plain scalar",
))
} else {
// Borrow from the input only when the raw slice is exactly the scanned content.
let contents = if let (Some(start), Some(end)) =
(start_mark.byte_offset(), end_mark.byte_offset())
{
match self.try_borrow_slice(start, end) {
Some(slice) if slice == string => Cow::Borrowed(slice),
_ => Cow::Owned(string),
}
} else {
Cow::Owned(string)
};
Ok(Token(
Span::new(start_mark, end_mark),
TokenType::Scalar(ScalarStyle::Plain, contents),
))
}
}
/// Handle an explicit key indicator and queue a `Key` token.
///
/// In block context a `BlockMappingStart` may be rolled in first; in flow context the
/// current flow mapping is flagged as started. The `Key` token spans from the indicator
/// to the position reached after skipping the whitespace that follows it.
///
/// # Errors
/// Fails when keys are not allowed here (block context), when a required simple key is
/// still pending, when whitespace skipping fails, or when a tab follows the skipped
/// whitespace.
fn fetch_key(&mut self) -> ScanResult {
    let start_mark = self.mark;
    let in_flow = self.flow_level > 0;

    if in_flow {
        self.set_current_flow_mapping_started(true);
    } else {
        if !self.simple_key_allowed {
            return Err(ScanError::new_str(
                self.mark,
                "mapping keys are not allowed in this context",
            ));
        }
        self.roll_indent(
            start_mark.col,
            None,
            TokenType::BlockMappingStart,
            start_mark,
        );
    }

    self.remove_simple_key()?;

    // A simple key may follow an explicit key indicator only in block context.
    if in_flow {
        self.disallow_simple_key();
    } else {
        self.allow_simple_key();
    }

    // Step over the indicator itself.
    self.skip_non_blank();

    // Whitespace skipping may queue tokens; the `Key` must land before them.
    let key_slot = self.tokens.len();
    self.explicit_key_tab_check_pending = false;
    let stopped_after_comment = self.skip_yaml_whitespace(true)?;
    if self.input.peek() == '\t' {
        return Err(ScanError::new_str(
            self.mark(),
            "tabs disallowed in this context",
        ));
    }
    self.explicit_key_tab_check_pending = stopped_after_comment;

    let key = Token(Span::new(start_mark, self.mark), TokenType::Key);
    self.insert_token(key_slot, key);
    Ok(())
}
/// Handle a `:` value indicator encountered through the flow-value path.
///
/// Rejects a `:` that is directly followed by `[` or `{` unless the current position is
/// the one recorded in `adjacent_value_allowed_at`; otherwise defers to `fetch_value`.
fn fetch_flow_value(&mut self) -> ScanResult {
    let following = self.input.peek_nth(1);
    let adjacent_allowed = self.mark.index() == self.adjacent_value_allowed_at;
    let opens_collection = matches!(following, '[' | '{');

    if !adjacent_allowed && opens_collection {
        return Err(ScanError::new_str(
            self.mark,
            "':' may not precede any of `[{` in flow mapping",
        ));
    }

    self.fetch_value()
}
/// Handle a `:` value indicator and queue the `Value` token.
///
/// When the innermost simple key is still possible, a `Key` token (and, where needed, a
/// `FlowMappingStart` or `BlockMappingStart`) is inserted retroactively at the key's
/// recorded token position. Otherwise the `:` follows a complex key and any mapping start
/// is placed at the `:` itself.
///
/// # Errors
/// Returns a `ScanError` when a tab after `:` is not followed by valid YAML whitespace
/// before `-` or an alphabetic character, when an implicit flow-mapping key lies on an
/// earlier line than the `:`, or when a value is not allowed here in block context.
fn fetch_value(&mut self) -> ScanResult {
let sk = self.simple_keys.last().unwrap().clone();
let start_mark = self.mark;
// An implicit flow mapping is a `key: value` pair directly inside a flow sequence whose
// mapping has not been started yet.
let is_implicit_flow_mapping = self.current_flow_collection_is_sequence()
&& !self.current_flow_mapping_started()
&& !self.implicit_flow_mapping_states.is_empty();
if is_implicit_flow_mapping {
*self.implicit_flow_mapping_states.last_mut().unwrap() =
ImplicitMappingState::Inside(self.flow_level);
}
// Step over the ':'.
self.skip_non_blank();
// Tokens queued while skipping to end of line are set aside and re-appended after `Value`.
let mut trailing_tokens = VecDeque::new();
if self.input.look_ch() == '\t' {
let trailing_token_index = self.tokens.len();
let whitespace = self.skip_ws_to_eol(SkipTabs::Yes)?;
trailing_tokens = self.tokens.split_off(trailing_token_index);
if !whitespace.has_valid_yaml_ws()
&& (self.input.peek() == '-' || self.input.next_is_alpha())
{
return Err(ScanError::new_str(
self.mark,
"':' must be followed by a valid YAML whitespace",
));
}
}
if sk.possible {
// Insert the `Key` at the queue position recorded for the simple key.
let tok = Token(Span::empty(sk.mark), TokenType::Key);
self.insert_token(sk.token_number - self.tokens_parsed, tok);
if is_implicit_flow_mapping {
if sk.mark.line < start_mark.line {
return Err(ScanError::new_str(
start_mark,
"illegal placement of ':' indicator",
));
}
// Inserted at the same index, so it ends up in front of the `Key` token.
self.insert_token(
sk.token_number - self.tokens_parsed,
Token(Span::empty(sk.mark), TokenType::FlowMappingStart),
);
}
self.roll_indent(
sk.mark.col,
Some(sk.token_number),
TokenType::BlockMappingStart,
sk.mark,
);
self.roll_one_col_indent();
self.simple_keys.last_mut().unwrap().possible = false;
self.disallow_simple_key();
} else {
// No simple key pending: the ':' follows a complex key.
if is_implicit_flow_mapping {
self.tokens
.push_back(Token(Span::empty(start_mark), TokenType::FlowMappingStart).into());
}
if self.flow_level == 0 {
if !self.simple_key_allowed {
return Err(ScanError::new_str(
start_mark,
"mapping values are not allowed in this context",
));
}
self.roll_indent(
start_mark.col,
None,
TokenType::BlockMappingStart,
start_mark,
);
}
self.roll_one_col_indent();
if self.flow_level == 0 {
self.allow_simple_key();
} else {
self.disallow_simple_key();
}
}
self.tokens
.push_back(Token(Span::empty(start_mark), TokenType::Value).into());
self.tokens.append(&mut trailing_tokens);
Ok(())
}
/// Increase the block indentation to `col` if it is deeper than the current level.
///
/// Does nothing inside a flow collection. A non-block indent on top of the stack is
/// discarded first when the current indent does not exceed `col`. When a new level is
/// opened, `tok` is queued at `mark`: inserted at `number - tokens_parsed` if `number` is
/// given, appended to the queue otherwise.
fn roll_indent(
    &mut self,
    col: usize,
    number: Option<usize>,
    tok: TokenType<'input>,
    mark: Marker,
) {
    if self.flow_level > 0 {
        return;
    }
    let column = col as isize;

    // Drop a pending non-block indent before comparing levels.
    if self.indent <= column && self.indents.last().is_some_and(|top| !top.needs_block_end) {
        if let Some(top) = self.indents.pop() {
            self.indent = top.indent;
        }
    }

    if self.indent >= column {
        return;
    }

    self.indents.push(Indent {
        indent: self.indent,
        needs_block_end: true,
    });
    self.indent = column;

    let token = Token(Span::empty(mark), tok);
    if let Some(n) = number {
        let offset = n - self.tokens_parsed;
        self.insert_token(offset, token);
    } else {
        self.tokens.push_back(token.into());
    }
}
fn unroll_indent(&mut self, col: isize) {
if self.flow_level > 0 {
return;
}
while self.indent > col {
let indent = self.indents.pop().unwrap();
self.indent = indent.indent;
if indent.needs_block_end {
self.tokens
.push_back(Token(Span::empty(self.mark), TokenType::BlockEnd).into());
}
}
}
/// Push a one-column, non-block indent on top of a block indent.
///
/// Only applies outside flow collections and when the top of the indent stack is a
/// level that needs a block end.
fn roll_one_col_indent(&mut self) {
    if self.flow_level != 0 {
        return;
    }
    let top_is_block = self.indents.last().is_some_and(|top| top.needs_block_end);
    if !top_is_block {
        return;
    }

    let previous = self.indent;
    self.indents.push(Indent {
        indent: previous,
        needs_block_end: false,
    });
    self.indent = previous + 1;
}
/// Pop every non-block indent from the top of the stack, restoring the indent each one
/// saved. Stops at the first level that needs a block end, or when the stack is empty.
fn unroll_non_block_indents(&mut self) {
    while self.indents.last().is_some_and(|top| !top.needs_block_end) {
        if let Some(top) = self.indents.pop() {
            self.indent = top.indent;
        }
    }
}
/// Record the current position as a simple-key candidate, if one is allowed here.
///
/// The candidate overwrites the innermost entry of `simple_keys`. It is marked
/// `required` when, in block context, the current column equals the current indent and
/// the innermost indent level is one that needs a block end.
fn save_simple_key(&mut self) {
    if !self.simple_key_allowed {
        return;
    }

    // `is_some_and` rather than `unwrap()`: an empty indent stack simply means the key is
    // not required, instead of panicking. This mirrors `roll_one_col_indent`.
    let required = self.flow_level == 0
        && self.indent == (self.mark.col as isize)
        && self
            .indents
            .last()
            .is_some_and(|indent| indent.needs_block_end);

    if let Some(last) = self.simple_keys.last_mut() {
        *last = SimpleKey {
            mark: self.mark,
            possible: true,
            required,
            // Absolute token index: tokens already handed out plus those still queued.
            token_number: self.tokens_parsed + self.tokens.len(),
        };
    }
}
/// Invalidate the innermost simple-key candidate.
///
/// # Errors
/// Returns the `simple_key_expected` error when the candidate is both possible and
/// required; in that case the candidate is left untouched.
fn remove_simple_key(&mut self) -> ScanResult {
    let key_is_mandatory = {
        let candidate = self.simple_keys.last().unwrap();
        candidate.possible && candidate.required
    };
    if key_is_mandatory {
        return Err(self.simple_key_expected());
    }

    self.simple_keys.last_mut().unwrap().possible = false;
    Ok(())
}
/// Returns `true` when the indent stack (`self.indents`) holds at least one entry.
fn is_within_block(&self) -> bool {
!self.indents.is_empty()
}
/// Close an implicit flow mapping that was opened at `flow_level`.
///
/// If the innermost implicit-mapping state is `Inside(flow_level)`, it is reset to
/// `Possible`, the current flow mapping is flagged as not started, and a
/// `FlowMappingEnd` token is queued at `mark`. Otherwise nothing happens.
fn end_implicit_mapping(&mut self, mark: Marker, flow_level: u8) {
    let Some(state) = self.implicit_flow_mapping_states.last_mut() else {
        return;
    };
    if *state != ImplicitMappingState::Inside(flow_level) {
        return;
    }

    *state = ImplicitMappingState::Possible;
    self.set_current_flow_mapping_started(false);
    let mapping_end = Token(Span::empty(mark), TokenType::FlowMappingEnd);
    self.tokens.push_back(mapping_end.into());
}
/// Returns `true` when the innermost flow marker was recorded with a `[` bracket.
/// Returns `false` when there is no flow marker at all.
fn current_flow_collection_is_sequence(&self) -> bool {
    matches!(self.flow_markers.last(), Some((_, '[')))
}
/// Returns the "mapping started" flag of the innermost flow level, or `false` when no
/// flag is tracked.
fn current_flow_mapping_started(&self) -> bool {
    self.flow_mapping_started
        .last()
        .is_some_and(|started| *started)
}
/// Overwrite the "mapping started" flag of the innermost flow level with `started`.
/// Does nothing when no flag is tracked.
fn set_current_flow_mapping_started(&mut self, started: bool) {
    let Some(flag) = self.flow_mapping_started.last_mut() else {
        return;
    };
    *flag = started;
}
}
/// Chomping mode of a block scalar: how the final line break and trailing empty lines
/// are interpreted (YAML 1.2, section 8.1.1.2).
///
/// Derives the same basic traits as the other public field-less enums in this module
/// (`TEncoding`, `ScalarStyle`), so values can be copied and printed in diagnostics.
#[derive(Clone, Copy, PartialEq, Debug, Eq)]
pub enum Chomping {
    /// "Strip" chomping: the final line break and trailing empty lines are excluded.
    Strip,
    /// "Clip" chomping: the final line break is kept, trailing empty lines are excluded.
    Clip,
    /// "Keep" chomping: the final line break and trailing empty lines are kept.
    Keep,
}
#[cfg(test)]
mod test {
use alloc::{
borrow::{Cow, ToOwned},
rc::Rc,
string::String,
vec::Vec,
};
use core::cell::Cell;
use crate::{
input::{str::StrInput, BorrowedInput, BufferedInput, Input},
scanner::{
Comment, Marker, Placement, QueuedToken, QueuedTokenType, ScalarStyle, Scanner, Span,
TEncoding, Token, TokenType,
},
};
/// Character iterator that counts, in a shared cell, how many items it has yielded.
struct CountingChars {
    /// Underlying buffered characters.
    chars: alloc::vec::IntoIter<char>,
    /// Shared counter, incremented once per yielded character.
    read: Rc<Cell<usize>>,
}

impl Iterator for CountingChars {
    type Item = char;

    fn next(&mut self) -> Option<Self::Item> {
        // Exhaustion is not counted: bail out before touching the counter.
        let ch = self.chars.next()?;
        self.read.set(self.read.get() + 1);
        Some(ch)
    }
}
/// Test input wrapping a `StrInput` that never lends borrowed slices and can optionally
/// refuse `slice_bytes` as well.
struct SlicingOnlyInput<'input> {
    /// The wrapped input every other operation is forwarded to.
    inner: StrInput<'input>,
    /// Whether `slice_bytes` forwards to `inner` (`true`) or yields `None` (`false`).
    expose_slice: bool,
}

impl<'input> SlicingOnlyInput<'input> {
    /// Wrap `source`, choosing whether `slice_bytes` is exposed.
    fn new(source: &'input str, expose_slice: bool) -> Self {
        let inner = StrInput::new(source);
        Self {
            inner,
            expose_slice,
        }
    }
}
/// `Input` implementation that forwards every call to the wrapped `StrInput`, except
/// `slice_bytes`, which is gated on `expose_slice`.
impl Input for SlicingOnlyInput<'_> {
fn lookahead(&mut self, count: usize) {
self.inner.lookahead(count);
}
fn buflen(&self) -> usize {
self.inner.buflen()
}
fn bufmaxlen(&self) -> usize {
self.inner.bufmaxlen()
}
fn raw_read_ch(&mut self) -> char {
self.inner.raw_read_ch()
}
fn raw_read_non_breakz_ch(&mut self) -> Option<char> {
self.inner.raw_read_non_breakz_ch()
}
fn skip(&mut self) {
self.inner.skip();
}
fn skip_n(&mut self, count: usize) {
self.inner.skip_n(count);
}
fn peek(&self) -> char {
self.inner.peek()
}
fn peek_nth(&self, n: usize) -> char {
self.inner.peek_nth(n)
}
// Byte offsets are always forwarded, even when slicing is refused below.
fn byte_offset(&self) -> Option<usize> {
self.inner.byte_offset()
}
// Forwards to the wrapped input only when `expose_slice` is set; otherwise `None`.
fn slice_bytes(&self, start: usize, end: usize) -> Option<&str> {
if self.expose_slice {
self.inner.slice_bytes(start, end)
} else {
None
}
}
}
/// `BorrowedInput` implementation that never hands out a borrowed slice.
impl<'input> BorrowedInput<'input> for SlicingOnlyInput<'input> {
// Always `None`, regardless of the requested range.
fn slice_borrowed(&self, _start: usize, _end: usize) -> Option<&'input str> {
None
}
}
/// Smoke test: `is_anchor_char` accepts the ASCII letter `x`.
#[test]
fn test_is_anchor_char() {
use super::is_anchor_char;
assert!(is_anchor_char('x'));
}
/// The scanner must yield the first scalar of a long flow sequence without reading the
/// whole input, staying within `SIMPLE_KEY_MAX_LOOKAHEAD + 128` characters.
#[test]
fn flow_simple_key_length_limit_bounds_buffering() {
    let mut yaml = String::from("[\n\"start\"\n");
    (0..600).for_each(|_| yaml.push_str("\"x\"\n"));
    let total_chars = yaml.chars().count();

    let read = Rc::new(Cell::new(0));
    let buffered: Vec<char> = yaml.chars().collect();
    let counting = CountingChars {
        chars: buffered.into_iter(),
        read: Rc::clone(&read),
    };
    let mut scanner = Scanner::new(BufferedInput::new(counting));

    let stream_start = scanner.next_token().unwrap().unwrap();
    assert!(matches!(stream_start.1, TokenType::StreamStart(_)));

    let sequence_start = scanner.next_token().unwrap().unwrap();
    assert!(matches!(sequence_start.1, TokenType::FlowSequenceStart));

    let first_scalar = scanner.next_token().unwrap().unwrap();
    assert!(matches!(
        first_scalar.1,
        TokenType::Scalar(_, ref value) if value == "start"
    ));

    assert!(
        read.get() < total_chars,
        "scanner consumed all {total_chars} chars before yielding the first flow scalar"
    );
    assert!(
        read.get() <= super::SIMPLE_KEY_MAX_LOOKAHEAD + 128,
        "scanner read {} chars before yielding the first flow scalar",
        read.get()
    );
}
/// Scanning a comment token leaves `leading_whitespace` set, for both a `StrInput` and a
/// `BufferedInput`.
#[test]
fn comment_capture_does_not_change_leading_whitespace() {
    // Slice-backed input.
    {
        let mut scanner = Scanner::new(StrInput::new("# comment\n"));
        let token = scanner.scan_comment_token().unwrap();
        assert!(scanner.leading_whitespace);
        assert!(
            matches!(token.1, TokenType::Comment(ref comment) if comment.text == " comment")
        );
    }

    // Streaming input; one character of lookahead is requested before scanning.
    {
        let mut scanner = Scanner::new(BufferedInput::new("# streaming\n".chars()));
        scanner.input.lookahead(1);
        let token = scanner.scan_comment_token().unwrap();
        assert!(scanner.leading_whitespace);
        assert!(
            matches!(token.1, TokenType::Comment(ref comment) if comment.text == " streaming")
        );
    }
}
/// When the input can slice but cannot lend borrowed data, the comment text is owned.
#[test]
fn comment_capture_falls_back_to_owned_slice_when_borrow_unavailable() {
    let input = SlicingOnlyInput::new("# sliced\n", true);
    let mut scanner = Scanner::new(input);
    scanner.input.lookahead(2);
    let second_char = scanner.input.peek_nth(1);
    assert_eq!(second_char, ' ');

    let token = scanner.scan_comment_token().unwrap();
    assert!(matches!(
        token.1,
        TokenType::Comment(ref comment)
            if matches!(comment.text, Cow::Owned(ref text) if text == " sliced")
    ));
}
/// An input that reports byte offsets but refuses to slice makes comment scanning fail
/// with an internal error.
#[test]
fn comment_capture_errors_when_offsets_have_no_slice() {
    let input = SlicingOnlyInput::new("# broken\n", false);
    let mut scanner = Scanner::new(input);
    let error = scanner.scan_comment_token().unwrap_err();
    let info = error.info();
    assert_eq!(
        info,
        "internal error: input advertised offsets but did not provide a slice"
    );
}
/// Converting a public token into a `QueuedToken` and back yields the same token, for
/// each token variant listed below.
#[test]
fn queued_token_roundtrips_public_token_variants() {
    let span = Span::new(Marker::new(0, 1, 0), Marker::new(7, 1, 7));
    // Every sample token shares the same span.
    let at_span = |kind: TokenType<'static>| Token(span, kind);

    let tokens = [
        at_span(TokenType::StreamStart(TEncoding::Utf8)),
        at_span(TokenType::StreamEnd),
        at_span(TokenType::VersionDirective(1, 2)),
        at_span(TokenType::TagDirective(
            Cow::Borrowed("!app!"),
            Cow::Borrowed("tag:app.example,"),
        )),
        at_span(TokenType::DocumentStart),
        at_span(TokenType::DocumentEnd),
        at_span(TokenType::BlockSequenceStart),
        at_span(TokenType::BlockMappingStart),
        at_span(TokenType::BlockEnd),
        at_span(TokenType::FlowSequenceStart),
        at_span(TokenType::FlowSequenceEnd),
        at_span(TokenType::FlowMappingStart),
        at_span(TokenType::FlowMappingEnd),
        at_span(TokenType::BlockEntry),
        at_span(TokenType::FlowEntry),
        at_span(TokenType::Key),
        at_span(TokenType::Value),
        at_span(TokenType::Alias(Cow::Borrowed("alias"))),
        at_span(TokenType::Anchor(Cow::Borrowed("anchor"))),
        at_span(TokenType::Tag(Cow::Borrowed("!"), Cow::Borrowed("tag"))),
        at_span(TokenType::Scalar(
            ScalarStyle::Literal,
            Cow::Borrowed("scalar"),
        )),
        at_span(TokenType::Comment(
            Comment::new(span, Cow::Borrowed(" comment")).with_placement(Placement::Right),
        )),
        at_span(TokenType::ReservedDirective(
            "reserved".to_owned(),
            vec!["one".to_owned(), "two".to_owned()],
        )),
    ];

    for token in &tokens {
        let queued: QueuedToken = token.clone().into();
        assert_eq!(&queued.into_public(), token);
    }
}
/// Skipping whitespace with comment capture disabled consumes the comment line without
/// queuing any token.
#[test]
fn comment_skipping_path_consumes_comment_without_tokenizing_it() {
    let source = "# skipped\nnext: value\n";
    let mut scanner = Scanner::new(StrInput::new(source));
    scanner.skip_yaml_whitespace(false).unwrap();

    assert!(scanner.tokens.is_empty());
    let (line, col) = (scanner.mark.line(), scanner.mark.col());
    assert_eq!(line, 2);
    assert_eq!(col, 0);
}
/// With comment capture enabled, whitespace skipping queues the first comment, reports
/// that it stopped, and leaves the mark at the end of that comment's line.
#[test]
fn yaml_whitespace_can_stop_after_queued_comment() {
    let mut scanner = Scanner::new(StrInput::new(" # queued\n# later\n"));
    let stopped = scanner.skip_yaml_whitespace(true).unwrap();
    assert!(stopped);

    assert_eq!(scanner.tokens.len(), 1);
    let queued = scanner.tokens.front().unwrap();
    assert!(matches!(
        queued.1,
        QueuedTokenType::Comment(ref comment) if comment.text == " queued"
    ));

    let (line, col) = (scanner.mark.line(), scanner.mark.col());
    assert_eq!(line, 1);
    assert_eq!(col, 9);
}
/// `skip_to_next_token` with comment capture enabled queues only the first comment and
/// stops at the start of the following line.
#[test]
fn token_skip_can_stop_after_queued_comment() {
    let mut scanner = Scanner::new(StrInput::new("# first\n# second\n"));
    let stopped = scanner.skip_to_next_token(true).unwrap();
    assert!(stopped);

    assert_eq!(scanner.tokens.len(), 1);
    let queued = scanner.tokens.front().unwrap();
    assert!(matches!(
        queued.1,
        QueuedTokenType::Comment(ref comment) if comment.text == " first"
    ));

    let (line, col) = (scanner.mark.line(), scanner.mark.col());
    assert_eq!(line, 2);
    assert_eq!(col, 0);
}
/// Leading comments are produced one at a time: after the first comment is returned the
/// token queue is empty, and the second comment is only scanned on the next call.
#[test]
fn scanner_emits_first_leading_comment_before_scanning_next_comment() {
    let mut scanner = Scanner::new(StrInput::new("# first\n# second\nkey: value\n"));

    let stream_start = scanner.next_token().unwrap().unwrap();
    assert!(matches!(stream_start.1, TokenType::StreamStart(_)));

    let first = scanner.next_token().unwrap().unwrap();
    assert!(matches!(
        first.1,
        TokenType::Comment(ref comment) if comment.text == " first"
    ));
    assert!(scanner.tokens.is_empty());

    let second = scanner.next_token().unwrap().unwrap();
    assert!(matches!(
        second.1,
        TokenType::Comment(ref comment) if comment.text == " second"
    ));
}
/// A comment trailing a double-quoted scalar is emitted right after that scalar.
#[test]
fn scanner_emits_quoted_scalar_comment_before_scanning_following_value() {
    let mut scanner = Scanner::new(StrInput::new("\"key\" # quoted\n: value\n"));

    let stream_start = scanner.next_token().unwrap().unwrap();
    assert!(matches!(stream_start.1, TokenType::StreamStart(_)));

    let scalar = scanner.next_token().unwrap().unwrap();
    assert!(matches!(
        scalar.1,
        TokenType::Scalar(ScalarStyle::DoubleQuoted, ref value) if value == "key"
    ));

    let comment = scanner.next_token().unwrap().unwrap();
    assert!(matches!(
        comment.1,
        TokenType::Comment(ref comment) if comment.text == " quoted"
    ));
}
/// When a comment follows a quoted scalar, `adjacent_value_allowed_at` is set to
/// `usize::MAX`; the scalar is queued first and the comment is queued as well.
#[test]
fn flow_scalar_comment_disables_adjacent_value_lookahead() {
    let mut scanner = Scanner::new(StrInput::new("\"key\"\n# quoted\n: value\n"));
    scanner.fetch_flow_scalar(false).unwrap();
    assert_eq!(scanner.adjacent_value_allowed_at, usize::MAX);

    let front = scanner.tokens.front().unwrap();
    assert!(matches!(
        front.1,
        QueuedTokenType::Scalar(ScalarStyle::DoubleQuoted, ref value) if value == "key"
    ));

    let has_comment = scanner.tokens.iter().any(|QueuedToken(_, token)| {
        matches!(
            token,
            QueuedTokenType::Comment(comment) if comment.text == " quoted"
        )
    });
    assert!(has_comment);
}
/// An error caused by content after two comments is only reported once both comment
/// tokens have been returned.
#[test]
fn deferred_error_waits_for_all_comment_tokens() {
    let mut scanner = Scanner::new(StrInput::new("# first\n# second\n@\n"));

    let stream_start = scanner.next_token().unwrap().unwrap();
    assert!(matches!(stream_start.1, TokenType::StreamStart(_)));

    let first = scanner.next_token().unwrap().unwrap();
    assert!(matches!(
        first.1,
        TokenType::Comment(ref comment) if comment.text == " first"
    ));

    let second = scanner.next_token().unwrap().unwrap();
    assert!(matches!(
        second.1,
        TokenType::Comment(ref comment) if comment.text == " second"
    ));

    let error = scanner.next_token().unwrap_err();
    assert!(error.info().contains("unexpected character"));
}
/// An anchor name scanned from a `StrInput` is borrowed from the source.
#[test]
fn anchor_name_is_borrowed_for_str_input() {
    let mut scanner = Scanner::new(StrInput::new("&anch\n"));
    let name = loop {
        let tok = scanner
            .next_token()
            .expect("valid YAML must scan without errors")
            .expect("scanner must eventually produce a token");
        if let TokenType::Anchor(name) = tok.1 {
            break name;
        }
    };
    assert!(matches!(name, Cow::Borrowed("anch")));
}
/// A control character ends the anchor name; if a scalar follows, it starts with that
/// control character.
#[test]
fn anchor_name_rejects_non_printable_control_chars() {
    let mut scanner = Scanner::new(StrInput::new("&foo\u{0001}\n"));
    let name = loop {
        let tok = scanner
            .next_token()
            .expect("scanning should not fail")
            .expect("scanner must eventually produce a token");
        if let TokenType::Anchor(name) = tok.1 {
            break name;
        }
    };
    assert!(matches!(name, Cow::Borrowed("foo")));

    let next = scanner.next_token().expect("scanning should not fail");
    if let Some(Token(_, TokenType::Scalar(_, rest))) = next {
        assert!(rest.starts_with('\u{0001}'));
    }
}
/// A control character ends the alias name; if a scalar follows, it starts with that
/// control character.
#[test]
fn alias_name_rejects_non_printable_control_chars() {
    let mut scanner = Scanner::new(StrInput::new("*foo\u{0001}\n"));
    let name = loop {
        let tok = scanner
            .next_token()
            .expect("scanning should not fail")
            .expect("scanner must eventually produce a token");
        if let TokenType::Alias(name) = tok.1 {
            break name;
        }
    };
    assert!(matches!(name, Cow::Borrowed("foo")));

    let next = scanner.next_token().expect("scanning should not fail");
    if let Some(Token(_, TokenType::Scalar(_, rest))) = next {
        assert!(rest.starts_with('\u{0001}'));
    }
}
/// An alias name scanned from a `StrInput` is borrowed from the source.
#[test]
fn alias_name_is_borrowed_for_str_input() {
    let mut scanner = Scanner::new(StrInput::new("*anch\n"));
    let name = loop {
        let tok = scanner
            .next_token()
            .expect("valid YAML must scan without errors")
            .expect("scanner must eventually produce a token");
        if let TokenType::Alias(name) = tok.1 {
            break name;
        }
    };
    assert!(matches!(name, Cow::Borrowed("anch")));
}
/// Both the handle and the prefix of a `%TAG` directive are borrowed for a `StrInput`.
#[test]
fn tag_directive_parts_are_borrowed_for_str_input() {
    let mut scanner = Scanner::new(StrInput::new("%TAG !e! tag:example.com,2000:app/\n"));
    let (handle, prefix) = loop {
        let tok = scanner
            .next_token()
            .expect("valid YAML must scan without errors")
            .expect("scanner must eventually produce a token");
        if let TokenType::TagDirective(handle, prefix) = tok.1 {
            break (handle, prefix);
        }
    };
    assert!(matches!(handle, Cow::Borrowed("!e!")));
    assert!(matches!(prefix, Cow::Borrowed("tag:example.com,2000:app/")));
}
/// A single-word plain scalar is borrowed from a `StrInput`.
#[test]
fn plain_scalar_is_borrowed_when_whitespace_free_for_str_input() {
    let mut scanner = Scanner::new(StrInput::new("foo\n"));
    let value = loop {
        let tok = scanner
            .next_token()
            .expect("valid YAML must scan without errors")
            .expect("scanner must eventually produce a token");
        if let TokenType::Scalar(_, value) = tok.1 {
            break value;
        }
    };
    assert!(matches!(value, Cow::Borrowed("foo")));
}
/// A plain scalar containing an inner space is still borrowed from a `StrInput`.
#[test]
fn plain_scalar_is_borrowed_when_whitespace_present_for_str_input() {
    let mut scanner = Scanner::new(StrInput::new("foo bar\n"));
    let value = loop {
        let tok = scanner
            .next_token()
            .expect("valid YAML must scan without errors")
            .expect("scanner must eventually produce a token");
        if let TokenType::Scalar(_, value) = tok.1 {
            break value;
        }
    };
    assert!(matches!(value, Cow::Borrowed("foo bar")));
}
/// A single-quoted scalar without escapes is borrowed from a `StrInput`.
#[test]
fn single_quoted_scalar_is_borrowed_when_verbatim_for_str_input() {
    let mut scanner = Scanner::new(StrInput::new("'foo bar'\n"));
    let value = loop {
        let tok = scanner
            .next_token()
            .expect("valid YAML must scan without errors")
            .expect("scanner must eventually produce a token");
        if let TokenType::Scalar(_, value) = tok.1 {
            break value;
        }
    };
    assert!(matches!(value, Cow::Borrowed("foo bar")));
}
/// A single-quoted scalar containing a doubled quote is owned and unescaped.
#[test]
fn single_quoted_scalar_is_owned_when_quote_is_escaped_for_str_input() {
    let mut scanner = Scanner::new(StrInput::new("'foo''bar'\n"));
    let value = loop {
        let tok = scanner
            .next_token()
            .expect("valid YAML must scan without errors")
            .expect("scanner must eventually produce a token");
        if let TokenType::Scalar(_, value) = tok.1 {
            break value;
        }
    };
    assert!(matches!(value, Cow::Owned(_)));
    assert_eq!(&*value, "foo'bar");
}
/// A double-quoted scalar without escapes is borrowed from a `StrInput`.
#[test]
fn double_quoted_scalar_is_borrowed_when_verbatim_for_str_input() {
    let mut scanner = Scanner::new(StrInput::new("\"foo bar\"\n"));
    let value = loop {
        let tok = scanner
            .next_token()
            .expect("valid YAML must scan without errors")
            .expect("scanner must eventually produce a token");
        if let TokenType::Scalar(_, value) = tok.1 {
            break value;
        }
    };
    assert!(matches!(value, Cow::Borrowed("foo bar")));
}
/// A double-quoted scalar containing an escape sequence is owned and decoded.
#[test]
fn double_quoted_scalar_is_owned_when_escape_sequence_present_for_str_input() {
    let mut scanner = Scanner::new(StrInput::new("\"foo\\nbar\"\n"));
    let value = loop {
        let tok = scanner
            .next_token()
            .expect("valid YAML must scan without errors")
            .expect("scanner must eventually produce a token");
        if let TokenType::Scalar(_, value) = tok.1 {
            break value;
        }
    };
    assert!(matches!(value, Cow::Owned(_)));
    assert_eq!(&*value, "foo\nbar");
}
/// The first scalar after a `Key` token is borrowed when the key is a plain scalar.
#[test]
fn plain_key_is_borrowed_for_str_input() {
    let mut scanner = Scanner::new(StrInput::new("mykey: value\n"));
    let mut found_key = false;
    let mut key_value: Option<Cow<'_, str>> = None;

    while let Some(tok) = scanner
        .next_token()
        .expect("valid YAML must scan without errors")
    {
        match tok.1 {
            TokenType::Key => found_key = true,
            // Only a scalar seen after the `Key` token counts.
            TokenType::Scalar(_, value) if found_key => {
                key_value = Some(value);
                break;
            }
            _ => {}
        }
    }

    assert!(found_key, "expected to find a Key token");
    let key_value = key_value.expect("expected to find a scalar after Key token");
    assert!(
        matches!(key_value, Cow::Borrowed("mykey")),
        "key should be borrowed, got: {key_value:?}"
    );
}
/// The first scalar after a `Key` token is borrowed when the key is a double-quoted
/// scalar without escapes.
#[test]
fn quoted_key_is_borrowed_when_verbatim_for_str_input() {
    let mut scanner = Scanner::new(StrInput::new("\"mykey\": value\n"));
    let mut found_key = false;
    let mut key_value: Option<Cow<'_, str>> = None;

    while let Some(tok) = scanner
        .next_token()
        .expect("valid YAML must scan without errors")
    {
        match tok.1 {
            TokenType::Key => found_key = true,
            // Only a scalar seen after the `Key` token counts.
            TokenType::Scalar(_, value) if found_key => {
                key_value = Some(value);
                break;
            }
            _ => {}
        }
    }

    assert!(found_key, "expected to find a Key token");
    let key_value = key_value.expect("expected to find a scalar after Key token");
    assert!(
        matches!(key_value, Cow::Borrowed("mykey")),
        "quoted key should be borrowed when verbatim, got: {key_value:?}"
    );
}
/// For `!!str`, both the `!!` handle and the `str` suffix are borrowed.
#[test]
fn tag_handle_and_suffix_are_borrowed_for_str_input() {
    let mut scanner = Scanner::new(StrInput::new("!!str foo\n"));
    let (handle, suffix) = loop {
        let tok = scanner
            .next_token()
            .expect("valid YAML must scan without errors")
            .expect("scanner must eventually produce a token");
        if let TokenType::Tag(handle, suffix) = tok.1 {
            break (handle, suffix);
        }
    };
    assert!(
        matches!(handle, Cow::Borrowed("!!")),
        "tag handle should be borrowed, got: {handle:?}"
    );
    assert!(
        matches!(suffix, Cow::Borrowed("str")),
        "tag suffix should be borrowed, got: {suffix:?}"
    );
}
/// For `!mytag`, the handle is `!` and the `mytag` suffix is borrowed.
#[test]
fn local_tag_suffix_is_borrowed_for_str_input() {
    let mut scanner = Scanner::new(StrInput::new("!mytag foo\n"));
    let (handle, suffix) = loop {
        let tok = scanner
            .next_token()
            .expect("valid YAML must scan without errors")
            .expect("scanner must eventually produce a token");
        if let TokenType::Tag(handle, suffix) = tok.1 {
            break (handle, suffix);
        }
    };
    assert!(
        matches!(handle, Cow::Borrowed("!")),
        "local tag handle should be '!', got: {handle:?}"
    );
    assert!(
        matches!(suffix, Cow::Borrowed("mytag")),
        "local tag suffix should be borrowed, got: {suffix:?}"
    );
}
/// A tag suffix containing a `%20` escape is decoded into an owned string, while the
/// handle stays borrowed.
#[test]
fn tag_with_uri_escape_is_owned_for_str_input() {
    let mut scanner = Scanner::new(StrInput::new("!!my%20tag foo\n"));
    let (handle, suffix) = loop {
        let tok = scanner
            .next_token()
            .expect("valid YAML must scan without errors")
            .expect("scanner must eventually produce a token");
        if let TokenType::Tag(handle, suffix) = tok.1 {
            break (handle, suffix);
        }
    };
    assert!(
        matches!(handle, Cow::Borrowed("!!")),
        "tag handle should still be borrowed, got: {handle:?}"
    );
    assert!(
        matches!(suffix, Cow::Owned(_)),
        "tag suffix with URI escape should be owned, got: {suffix:?}"
    );
    assert_eq!(&*suffix, "my tag");
}
/// Exercises the pending-whitespace bookkeeping of `FlowScalarBuf` in its borrowed form,
/// and the owned form's mutable string access.
#[test]
fn flow_scalar_buffer_tracks_pending_whitespace() {
    use super::FlowScalarBuf;

    let mut borrowed = FlowScalarBuf::new_borrowed(2);

    // Committing noted whitespace moves `end` to the end of that whitespace.
    borrowed.note_pending_ws(5, 8);
    borrowed.commit_pending_ws();
    assert!(matches!(
        borrowed,
        FlowScalarBuf::Borrowed {
            end: 8,
            pending_ws_start: None,
            pending_ws_end: 8,
            ..
        }
    ));

    // Discarding noted whitespace leaves the previously committed state in place.
    borrowed.note_pending_ws(9, 11);
    borrowed.discard_pending_ws();
    assert!(matches!(
        borrowed,
        FlowScalarBuf::Borrowed {
            end: 8,
            pending_ws_start: None,
            pending_ws_end: 8,
            ..
        }
    ));
    assert!(borrowed.as_owned_mut().is_none());

    let mut owned = FlowScalarBuf::new_owned();
    owned.as_owned_mut().unwrap().push_str("owned");
    assert!(matches!(owned, FlowScalarBuf::Owned(ref s) if s == "owned"));
}
/// Scan `input` until the scanner fails and return that error's info text.
///
/// # Panics
/// Panics if the scanner runs out of tokens without reporting an error.
fn first_scanner_error_info(input: &str) -> String {
    let mut scanner = Scanner::new(StrInput::new(input));
    loop {
        match scanner.next_token() {
            Err(error) => break error.info().to_owned(),
            Ok(None) => panic!("expected scanner error"),
            Ok(Some(_)) => {}
        }
    }
}
/// Scan `input` and return the contents of the first scalar token as an owned string.
///
/// # Panics
/// Panics if scanning fails or if the tokens run out before a scalar is seen.
fn first_scalar_value(input: &str) -> String {
    let mut scanner = Scanner::new(StrInput::new(input));
    loop {
        let token = scanner.next_token().expect("scanner should not error");
        match token {
            None => panic!("expected scalar token"),
            Some(Token(_, TokenType::Scalar(_, value))) => return value.into_owned(),
            Some(_) => {}
        }
    }
}
/// Driving the scanner through `next()` stores the scan error for `get_error()` and keeps
/// returning `None` afterwards.
#[test]
fn iterator_next_records_error_and_then_stays_empty() {
    let source = "\"unterminated";
    let mut scanner = Scanner::new(StrInput::new(source));

    // Drain until the first `None`.
    while scanner.next().is_some() {}

    let error = scanner
        .get_error()
        .expect("scanner should retain the error");
    let info = error.info();
    assert_eq!(info, "unclosed quote");
    assert!(scanner.next().is_none());
}
/// After `StreamEnd` has been returned, the scanner reports both stream flags as set and
/// `next_token` yields `None`.
#[test]
fn next_token_returns_none_after_stream_end() {
    let mut scanner = Scanner::new(StrInput::new(""));
    loop {
        match scanner.next_token().unwrap() {
            Some(Token(_, TokenType::StreamEnd)) | None => break,
            Some(_) => {}
        }
    }
    assert!(scanner.stream_started());
    assert!(scanner.stream_ended());
    assert!(scanner.next_token().unwrap().is_none());
}
/// A lone `%` is reported as a directive without a name.
#[test]
fn directive_name_must_be_present() {
    let info = first_scanner_error_info("%\n");
    assert_eq!(
        info,
        "while scanning a directive, could not find expected directive name"
    );
}
/// `%YAML 1` (no `.minor` part) is rejected.
#[test]
fn yaml_directive_requires_dot_between_version_numbers() {
    let info = first_scanner_error_info("%YAML 1\n");
    assert_eq!(
        info,
        "while scanning a YAML directive, did not find expected digit or '.' character"
    );
}
/// `%YAML .2` (no major number) is rejected.
#[test]
fn yaml_directive_requires_major_version_number() {
    let info = first_scanner_error_info("%YAML .2\n");
    assert_eq!(
        info,
        "while scanning a YAML directive, did not find expected version number"
    );
}
/// A ten-digit major version number in `%YAML` is rejected as too long.
#[test]
fn yaml_directive_rejects_extremely_long_version_number() {
    let info = first_scanner_error_info("%YAML 1234567890.2\n");
    assert_eq!(
        info,
        "while scanning a YAML directive, found extremely long version number"
    );
}
/// A `%TAG` handle such as `!bad` (no closing `!`) is rejected.
#[test]
fn tag_directive_handle_must_end_with_bang() {
    let info = first_scanner_error_info("%TAG !bad tag:example.com,2024:\n");
    assert_eq!(
        info,
        "while parsing a tag directive, did not find expected '!'"
    );
}
/// A `%TAG` handle such as `bad!` (no leading `!`) is rejected.
#[test]
fn tag_directive_handle_must_start_with_bang() {
    let info = first_scanner_error_info("%TAG bad! tag:example.com,2024:\n");
    assert_eq!(info, "while scanning a tag, did not find expected '!'");
}
/// A `%TAG` prefix starting with a backtick is rejected.
#[test]
fn tag_directive_prefix_must_start_with_tag_character() {
    let info = first_scanner_error_info("%TAG !e! `bad\n");
    assert_eq!(info, "invalid global tag character");
}
/// A `%TAG` prefix directly followed by `^suffix` is rejected.
#[test]
fn tag_directive_prefix_must_end_before_invalid_content() {
    let info = first_scanner_error_info("%TAG !e! tag:example.com^suffix\n");
    assert_eq!(
        info,
        "while scanning TAG, did not find expected whitespace or line break"
    );
}
/// A `%TAG` prefix containing `%20` is decoded into an owned string; the handle stays
/// borrowed.
#[test]
fn tag_directive_prefix_with_uri_escape_is_owned_and_decoded() {
    let source = "%TAG !e! tag:example.com,2024:some%20app/\n";
    let mut scanner = Scanner::new(StrInput::new(source));
    let (handle, prefix) = loop {
        let token = scanner
            .next_token()
            .expect("valid directive should scan")
            .expect("scanner must produce a directive token");
        if let TokenType::TagDirective(handle, prefix) = token.1 {
            break (handle, prefix);
        }
    };
    assert!(matches!(handle, Cow::Borrowed("!e!")));
    assert!(matches!(prefix, Cow::Owned(_)));
    assert_eq!(&*prefix, "tag:example.com,2024:some app/");
}
/// A bare `!` scans as a tag with an empty handle and `!` as its suffix.
#[test]
fn bare_bang_tag_scans_as_non_specific_tag() {
    let mut scanner = Scanner::new(StrInput::new("! foo\n"));
    let (handle, suffix) = loop {
        let token = scanner
            .next_token()
            .expect("valid tag should scan")
            .expect("scanner must produce a tag token");
        if let TokenType::Tag(handle, suffix) = token.1 {
            break (handle, suffix);
        }
    };
    assert_eq!(&*handle, "");
    assert_eq!(&*suffix, "!");
}
/// A tag directly followed by `,bar` is rejected.
#[test]
fn tag_requires_separation_after_suffix() {
    let info = first_scanner_error_info("!foo,bar\n");
    assert_eq!(
        info,
        "while scanning a tag, did not find expected whitespace or line break"
    );
}
/// An empty verbatim tag `!<>` is rejected.
#[test]
fn verbatim_tag_requires_uri() {
    let info = first_scanner_error_info("!<> foo\n");
    assert_eq!(info, "while parsing a tag, did not find expected tag URI");
}
/// A verbatim tag without its closing `>` is rejected.
#[test]
fn verbatim_tag_requires_closing_angle_bracket() {
    let info = first_scanner_error_info("!<tag:yaml.org,2002:str foo\n");
    assert_eq!(
        info,
        "while scanning a verbatim tag, did not find the expected '>'"
    );
}
/// A `%zz` escape in a tag is rejected as an invalid escape sequence.
#[test]
fn tag_uri_escape_requires_hex_digits() {
    let info = first_scanner_error_info("!!bad%zz foo\n");
    assert_eq!(
        info,
        "while parsing a tag, found an invalid escape sequence"
    );
}
/// A `%80` escape at the start of a sequence is rejected as a bad leading UTF-8 byte.
#[test]
fn tag_uri_escape_rejects_bad_leading_utf8_byte() {
    let info = first_scanner_error_info("!!bad%80 foo\n");
    assert_eq!(
        info,
        "while parsing a tag, found an incorrect leading UTF-8 byte"
    );
}
/// `%C2%41` is rejected because the second byte is not a valid trailing UTF-8 byte.
#[test]
fn tag_uri_escape_rejects_bad_trailing_utf8_byte() {
    let info = first_scanner_error_info("!!bad%C2%41 foo\n");
    assert_eq!(
        info,
        "while parsing a tag, found an incorrect trailing UTF-8 byte"
    );
}
/// `%F4%90%80%80` is rejected as an invalid UTF-8 codepoint.
#[test]
fn tag_uri_escape_rejects_invalid_utf8_codepoint() {
    let info = first_scanner_error_info("!!bad%F4%90%80%80 foo\n");
    assert_eq!(
        info,
        "while parsing a tag, found an invalid UTF-8 codepoint"
    );
}
/// Both `&` and `*` followed by a space report the same missing-name error.
#[test]
fn anchors_and_aliases_require_names() {
    let expected =
        "while scanning an anchor or alias, did not find expected alphabetic or numeric character";
    for input in ["& \n", "* \n"] {
        assert_eq!(first_scanner_error_info(input), expected);
    }
}
/// Content on the same line after `...` is rejected.
#[test]
fn document_end_marker_rejects_trailing_content() {
    let info = first_scanner_error_info("... trailing\n");
    assert_eq!(info, "invalid content after document end marker");
}
/// A `@` where a token is expected is reported as an unexpected character.
#[test]
fn reserved_indicators_are_rejected_outside_directives() {
    let info = first_scanner_error_info(" @\n");
    assert_eq!(info, "unexpected character: `@'");
}
/// A `- ` block entry indicator inside a flow sequence is rejected.
#[test]
fn flow_block_entry_indicator_is_rejected() {
    let info = first_scanner_error_info("[- ]\n");
    assert_eq!(info, r#""-" is only valid inside a block"#);
}
/// Pins the error text reported when a block entry `-` is separated from a
/// nested `-` entry by a tab character.
#[test]
fn block_entry_after_tabbed_separator_reports_specific_error() {
    let input = "-\t- value\n";
    let expected = "'-' must be followed by a valid YAML whitespace";
    assert_eq!(first_scanner_error_info(input), expected);
}
/// Pins the error text reported when a `---` line is reached while a `[`
/// flow sequence is still open.
#[test]
fn document_indicator_reports_unclosed_flow_collection() {
    let input = "[\n---\n";
    let expected = "unclosed bracket '['";
    assert_eq!(first_scanner_error_info(input), expected);
}
/// Pins the error text reported when a block scalar header (`|+`) is
/// followed on the same line by text that is not a comment.
#[test]
fn block_scalar_header_rejects_trailing_content() {
    let input = "|+ trailing\n";
    let expected = "while scanning a block scalar, did not find expected comment or line break";
    assert_eq!(first_scanner_error_info(input), expected);
}
/// Pins the error text reported for an indentation indicator of `0` in a
/// block scalar header, both on its own and after a chomping indicator.
#[test]
fn block_scalar_rejects_zero_indent_indicator() {
    let expected = "while scanning a block scalar, found an indentation indicator equal to 0";
    // Bare `0`, then `0` preceded by the `+` chomping indicator.
    for input in ["|0\n", "|+0\n"] {
        assert_eq!(first_scanner_error_info(input), expected);
    }
}
/// Checks the value `first_scalar_value` yields for a block scalar with no
/// content lines: `|-` gives an empty string, `|+` gives a single `\n`.
#[test]
fn empty_block_scalar_at_eof_honors_chomping() {
    // (input, expected scalar value)
    let cases = [("|-\n", ""), ("|+\n", "\n")];
    for (input, expected) in cases {
        assert_eq!(first_scalar_value(input), expected);
    }
}
/// Checks that a `|1` block scalar whose header is immediately followed by
/// a `...` line yields an empty value from `first_scalar_value`.
#[test]
fn explicit_indent_block_scalar_can_end_at_document_marker() {
    let input = "|1\n...\n";
    let expected = "";
    assert_eq!(first_scalar_value(input), expected);
}
/// Pins the error text reported when a top-level `|2` block scalar is
/// followed by a content line that starts at column zero.
#[test]
fn root_explicit_indent_block_scalar_rejects_underindented_content() {
    let input = "|2\nx\n";
    let expected = "wrongly indented line in block scalar";
    assert_eq!(first_scanner_error_info(input), expected);
}
/// Pins the error text reported when a `---` line appears inside a
/// multi-line double-quoted scalar.
#[test]
fn quoted_scalar_rejects_document_indicator_at_line_start() {
    let input = "\"one\n---\ntwo\"\n";
    let expected = "while scanning a quoted scalar, found unexpected document indicator";
    assert_eq!(first_scanner_error_info(input), expected);
}
/// Pins the error text reported when the continuation line of a
/// double-quoted mapping value begins with a tab.
#[test]
fn quoted_scalar_rejects_tab_indentation_after_line_break() {
    let input = "a: \"one\n\tbad\"\n";
    let expected = "tab cannot be used as indentation";
    assert_eq!(first_scanner_error_info(input), expected);
}
/// Pins the error text reported when the continuation line of a
/// double-quoted mapping value starts at column zero.
#[test]
fn quoted_scalar_rejects_underindented_continuation() {
    let input = "a: \"one\nbad\"\n";
    let expected = "invalid indentation in multiline quoted scalar";
    assert_eq!(first_scanner_error_info(input), expected);
}
/// Pins the error text reported when content inside an indented flow
/// sequence (opened under the key `a`) starts at column zero.
#[test]
fn indented_flow_scalar_reports_invalid_indentation() {
    let input = "a:\n [\nfoo]\n";
    let expected = "invalid indentation";
    assert_eq!(first_scanner_error_info(input), expected);
}
/// Pins the error text reported for an anchor `&b` on its own line under
/// the key `a`, followed by a block sequence entry.
#[test]
fn required_simple_key_requires_value_at_stream_end() {
    let input = "a:\n&b\n- c\n";
    let expected = "simple key expect ':'";
    assert_eq!(first_scanner_error_info(input), expected);
}
/// Pins the error text reported when `-` inside a flow sequence is
/// immediately followed by the closing `]`.
#[test]
fn plain_scalar_rejects_dash_before_flow_indicator() {
    let input = "[-]\n";
    let expected = "plain scalar cannot start with '-' followed by ,[]{}";
    assert_eq!(first_scanner_error_info(input), expected);
}
/// Pins the error text reported when the explicit key indicator `?` is
/// followed by a space and then a tab before the key text.
#[test]
fn explicit_key_rejects_tab_after_indicator() {
    let input = "? \tfoo\n";
    let expected = "tabs disallowed in this context";
    assert_eq!(first_scanner_error_info(input), expected);
}
/// Pins the error text reported when, inside a flow sequence, a plain key's
/// `:` is followed directly by `[` with no separating whitespace.
#[test]
fn flow_mapping_rejects_adjacent_collection_value_after_plain_key() {
    let input = "[a:[]]\n";
    let expected = "':' may not precede any of `[{` in flow mapping";
    assert_eq!(first_scanner_error_info(input), expected);
}
/// Pins the error text reported when, inside a flow sequence, the `:` of an
/// implicit key is placed on the line after the key text.
#[test]
fn implicit_flow_mapping_colon_cannot_move_to_next_line() {
    let input = "[foo\n: bar]\n";
    let expected = "illegal placement of ':' indicator";
    assert_eq!(first_scanner_error_info(input), expected);
}
}