use crate::lexical;
use super::{ErrorKind, Pos, Token};
/// Sub-state of the number recognizer: which part of a numeric literal was most recently
/// consumed.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum Num {
    /// A leading `-` has been read; a digit must follow.
    Minus,
    /// The integer part is a single `0`.
    Zero,
    /// Inside an integer part that started with `1`-`9`.
    Int,
    /// The `.` has been read; a fraction digit must follow.
    Dot,
    /// Inside the fraction digits.
    Frac,
    /// The exponent marker has been read; a sign or digit must follow.
    Exp,
    /// The exponent sign has been read; a digit must follow.
    ExpSign,
    /// Inside the exponent digits.
    ExpInt,
}
/// Sub-state of the string recognizer.
///
/// The `escaped` flag carried by several variants records whether an escape sequence has been
/// seen so far in the current string; it is reported in the final `State::End`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum Str {
    /// Between characters, ready for the next byte of string content.
    Ready { escaped: bool },
    /// A `\` has been read.
    Esc,
    /// `\u` has been read; four hex digits must follow.
    EscU,
    /// One hex digit of a `\u` escape read; the payload is the value accumulated so far.
    EscU1(u16),
    /// Two hex digits of a `\u` escape read.
    EscU2(u16),
    /// Three hex digits of a `\u` escape read.
    EscU3(u16),
    /// A complete high-surrogate escape was read (payload); a low-surrogate escape must follow.
    EscHi(u16),
    /// The `\` that should start the low-surrogate escape has been read.
    EscLoEsc(u16),
    /// `\u` of the low-surrogate escape has been read.
    EscLoEscU(u16),
    /// High surrogate plus one hex digit of the low surrogate (accumulated value).
    EscLoEscU1(u16, u16),
    /// High surrogate plus two hex digits of the low surrogate.
    EscLoEscU2(u16, u16),
    /// High surrogate plus three hex digits of the low surrogate.
    EscLoEscU3(u16, u16),
    /// Two-byte UTF-8 sequence, lead byte read.
    Utf821 { escaped: bool },
    /// Three-byte UTF-8 sequence, lead byte `b0` read.
    Utf831 { escaped: bool, b0: u8 },
    /// Three-byte UTF-8 sequence, bytes `b0` and `b1` read.
    Utf832 { escaped: bool, b0: u8, b1: u8 },
    /// Four-byte UTF-8 sequence, lead byte `b0` read.
    Utf841 { escaped: bool, b0: u8 },
    /// Four-byte UTF-8 sequence, bytes `b0` and `b1` read.
    Utf842 { escaped: bool, b0: u8, b1: u8 },
    /// Four-byte UTF-8 sequence, bytes `b0`, `b1` and `b2` read.
    Utf843 {
        escaped: bool,
        b0: u8,
        b1: u8,
        b2: u8,
    },
}
/// Internal state of the lexer between calls to `Machine::next`.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
enum InnerState {
    /// Between tokens; the next byte decides which token begins.
    #[default]
    Start,
    /// End of input has been seen.
    Eof,
    /// An error has been reported; the machine must not be advanced again.
    Err,

    // Progress through the literal `false`: each variant names the prefix read so far.
    F,
    Fa,
    Fal,
    Fals,
    False,

    // Progress through the literal `null`.
    N,
    Nu,
    Nul,
    Null,

    /// Inside a number; the payload says which part.
    Num(Num),
    /// Inside a string; the payload says which part.
    Str(Str),

    // Progress through the literal `true`.
    T,
    Tr,
    Tru,
    True,

    /// Inside a run of whitespace.
    White,
    /// Inside a run of whitespace, immediately after a carriage return.
    WhiteCr,
}
/// Result of feeding one byte (or end of input) to a [`Machine`].
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum State {
    /// The byte was consumed but the current token is not yet complete.
    Mid,
    /// A token has been recognized.
    End {
        /// The kind of token that ended.
        token: Token,
        /// For string tokens, whether at least one escape sequence was seen; `false` for every
        /// other token produced in this file.
        escaped: bool,
        /// `true` when the token was ended by a byte (or end of input) that was *not* consumed:
        /// the position is left unadvanced and the machine returns to its start state.
        /// Presumably the caller should pass the same input again — confirm against callers.
        repeat: bool,
    },
    /// The input is not lexically valid; the payload describes why.
    Err(ErrorKind),
}
/// Byte-at-a-time lexical state machine.
///
/// A default-constructed machine starts between tokens at the default [`Pos`].
#[derive(Clone, Debug, Default)]
pub struct Machine {
    /// What the machine is currently in the middle of.
    state: InnerState,
    /// Position of the next byte to be consumed.
    pos: Pos,
}
impl Machine {
/// Feeds the next input byte, or `None` for end of input, to the machine.
///
/// Dispatches on the current internal state and returns whether the token is still in
/// progress, has ended, or is invalid. Once end of input has been reported, every further
/// call reports `Token::Eof` again.
///
/// # Panics
///
/// Panics if called again after a call that returned `State::Err`.
pub fn next(&mut self, b: Option<u8>) -> State {
    match self.state {
        // Terminal states.
        InnerState::Err => panic!("already in error state"),
        InnerState::Eof => State::End {
            token: Token::Eof,
            escaped: false,
            repeat: false,
        },

        // States with their own handler.
        InnerState::Start => self.start(b),
        InnerState::White => self.white(b),
        InnerState::WhiteCr => self.white_cr(b),
        InnerState::Num(sub) => self.num(sub, b),
        InnerState::Str(sub) => self.str(sub, b),

        // `false`
        InnerState::F => self.expect_char(Token::LitFalse, b'a', b, InnerState::Fa),
        InnerState::Fa => self.expect_char(Token::LitFalse, b'l', b, InnerState::Fal),
        InnerState::Fal => self.expect_char(Token::LitFalse, b's', b, InnerState::Fals),
        InnerState::Fals => self.expect_char(Token::LitFalse, b'e', b, InnerState::False),
        InnerState::False => self.expect_boundary(Token::LitFalse, b),

        // `null`
        InnerState::N => self.expect_char(Token::LitNull, b'u', b, InnerState::Nu),
        InnerState::Nu => self.expect_char(Token::LitNull, b'l', b, InnerState::Nul),
        InnerState::Nul => self.expect_char(Token::LitNull, b'l', b, InnerState::Null),
        InnerState::Null => self.expect_boundary(Token::LitNull, b),

        // `true`
        InnerState::T => self.expect_char(Token::LitTrue, b'r', b, InnerState::Tr),
        InnerState::Tr => self.expect_char(Token::LitTrue, b'u', b, InnerState::Tru),
        InnerState::Tru => self.expect_char(Token::LitTrue, b'e', b, InnerState::True),
        InnerState::True => self.expect_boundary(Token::LitTrue, b),
    }
}
/// Returns the machine's current position in the input.
#[inline(always)]
pub fn pos(&self) -> &Pos {
    let current = &self.pos;
    current
}
fn start(&mut self, b: Option<u8>) -> State {
match b {
Some(b'{') => {
self.pos.advance_col();
State::End {
token: Token::ObjBegin,
escaped: false,
repeat: false,
}
}
Some(b'}') => {
self.pos.advance_col();
State::End {
token: Token::ObjEnd,
escaped: false,
repeat: false,
}
}
Some(b'[') => {
self.pos.advance_col();
State::End {
token: Token::ArrBegin,
escaped: false,
repeat: false,
}
}
Some(b']') => {
self.pos.advance_col();
State::End {
token: Token::ArrEnd,
escaped: false,
repeat: false,
}
}
Some(b':') => {
self.pos.advance_col();
State::End {
token: Token::NameSep,
escaped: false,
repeat: false,
}
}
Some(b',') => {
self.pos.advance_col();
State::End {
token: Token::ValueSep,
escaped: false,
repeat: false,
}
}
Some(b'f') => {
self.pos.advance_col();
self.state = InnerState::F;
State::Mid
}
Some(b'n') => {
self.pos.advance_col();
self.state = InnerState::N;
State::Mid
}
Some(b'-') => {
self.pos.advance_col();
self.state = InnerState::Num(Num::Minus);
State::Mid
}
Some(b'0') => {
self.pos.advance_col();
self.state = InnerState::Num(Num::Zero);
State::Mid
}
Some(b'1'..=b'9') => {
self.pos.advance_col();
self.state = InnerState::Num(Num::Int);
State::Mid
}
Some(b'"') => {
self.pos.advance_col();
self.state = InnerState::Str(Str::Ready { escaped: false });
State::Mid
}
Some(b't') => {
self.pos.advance_col();
self.state = InnerState::T;
State::Mid
}
Some(b' ') | Some(b'\t') => {
self.pos.advance_col();
self.state = InnerState::White;
State::Mid
}
Some(b'\r') => {
self.pos.advance_offset(1);
self.state = InnerState::WhiteCr;
State::Mid
}
Some(b'\n') => {
self.pos.advance_line();
self.state = InnerState::White;
State::Mid
}
None => {
self.state = InnerState::Eof;
State::End {
token: Token::Eof,
escaped: false,
repeat: false,
}
}
Some(c) => {
self.state = InnerState::Err;
State::Err(ErrorKind::expect_token_start_char(c))
}
}
}
/// Consumes one specific byte of a keyword literal.
///
/// If `actual` equals `expect`, the column is advanced and the machine moves to `next`.
/// Any other byte, or end of input, puts the machine into the error state; `tok` names the
/// literal being lexed for the error report.
#[inline(always)]
fn expect_char(
    &mut self,
    tok: Token,
    expect: u8,
    actual: Option<u8>,
    next: InnerState,
) -> State {
    let got = match actual {
        Some(got) => got,
        None => return self.unexpected_eof(tok),
    };
    if got != expect {
        self.state = InnerState::Err;
        return State::Err(ErrorKind::expect_char(tok, got, expect as char));
    }
    self.pos.advance_col();
    self.state = next;
    State::Mid
}
/// Puts the machine into the error state and reports that input ended inside `tok`.
#[inline(always)]
fn unexpected_eof(&mut self, tok: Token) -> State {
    let failure = State::Err(ErrorKind::UnexpectedEof(tok));
    self.state = InnerState::Err;
    failure
}
/// Returns `true` if `b` may directly follow a number or keyword literal: structural
/// punctuation, a double quote, or whitespace.
fn is_boundary_byte(b: u8) -> bool {
    matches!(
        b,
        b'{' | b'}' | b'[' | b']' | b':' | b',' | b'"' | b' ' | b'\t' | b'\n' | b'\r'
    )
}
/// Returns `true` if `b` is an ASCII hexadecimal digit (`0`-`9`, `a`-`f`, `A`-`F`).
fn is_hex_byte(b: u8) -> bool {
    b.is_ascii_hexdigit()
}
/// Finishes a keyword literal once all of its letters have been read.
///
/// The literal `tok` ends if the next input is end of input, punctuation, a double quote or
/// whitespace. That input is not consumed: the position is left alone, the machine returns
/// to the start state, and the result has `repeat: true`. Any other byte is an error.
fn expect_boundary(&mut self, tok: Token, b: Option<u8>) -> State {
    match b {
        Some(c)
            if !matches!(
                c,
                b'{' | b'}' | b'[' | b']' | b':' | b',' | b'"' | b' ' | b'\t' | b'\n' | b'\r'
            ) =>
        {
            self.state = InnerState::Err;
            State::Err(ErrorKind::expect_boundary(tok, c))
        }
        // End of input or a boundary byte.
        _ => {
            self.state = InnerState::Start;
            State::End {
                token: tok,
                escaped: false,
                repeat: true,
            }
        }
    }
}
/// Advances the number recognizer by one byte.
///
/// `num` is the part of the numeric literal most recently consumed and `b` is the next input
/// byte (`None` at end of input).
///
/// * A byte that continues the number is consumed (column advanced) and `State::Mid` is
///   returned.
/// * When the number is complete (`Zero`, `Int`, `Frac` or `ExpInt`) and the input is a
///   boundary byte or end of input, the number ends. That input is not consumed: the machine
///   returns to the start state and the result has `repeat: true`.
/// * Anything else puts the machine into the error state.
///
/// Both `e` and `E` are accepted as the exponent marker, as the JSON grammar allows.
fn num(&mut self, num: Num, b: Option<u8>) -> State {
    match (num, b) {
        (Num::Minus, Some(b'0')) => {
            self.pos.advance_col();
            self.state = InnerState::Num(Num::Zero);
            State::Mid
        }
        (Num::Minus, Some(b'1'..=b'9')) => {
            self.pos.advance_col();
            self.state = InnerState::Num(Num::Int);
            State::Mid
        }
        // More digits of the integer or fraction part: same state.
        (Num::Int, Some(b'0'..=b'9')) | (Num::Frac, Some(b'0'..=b'9')) => {
            self.pos.advance_col();
            State::Mid
        }
        (Num::Zero, Some(b'.')) | (Num::Int, Some(b'.')) => {
            self.pos.advance_col();
            self.state = InnerState::Num(Num::Dot);
            State::Mid
        }
        (Num::Dot, Some(b'0'..=b'9')) => {
            self.pos.advance_col();
            self.state = InnerState::Num(Num::Frac);
            State::Mid
        }
        // Exponent marker, in either case.
        (Num::Zero, Some(b'e' | b'E'))
        | (Num::Int, Some(b'e' | b'E'))
        | (Num::Frac, Some(b'e' | b'E')) => {
            self.pos.advance_col();
            self.state = InnerState::Num(Num::Exp);
            State::Mid
        }
        (Num::Exp, Some(b'-')) | (Num::Exp, Some(b'+')) => {
            self.pos.advance_col();
            self.state = InnerState::Num(Num::ExpSign);
            State::Mid
        }
        (Num::Exp, Some(b'0'..=b'9')) | (Num::ExpSign, Some(b'0'..=b'9')) => {
            self.pos.advance_col();
            self.state = InnerState::Num(Num::ExpInt);
            State::Mid
        }
        (Num::ExpInt, Some(b'0'..=b'9')) => {
            self.pos.advance_col();
            State::Mid
        }
        // A complete number ended by end of input.
        (Num::Zero, None) | (Num::Int, None) | (Num::Frac, None) | (Num::ExpInt, None) => {
            self.state = InnerState::Start;
            State::End {
                token: Token::Num,
                escaped: false,
                repeat: true,
            }
        }
        // A complete number ended by a boundary byte, which is left unconsumed.
        (Num::Zero, Some(c))
        | (Num::Int, Some(c))
        | (Num::Frac, Some(c))
        | (Num::ExpInt, Some(c))
            if Self::is_boundary_byte(c) =>
        {
            self.state = InnerState::Start;
            State::End {
                token: Token::Num,
                escaped: false,
                repeat: true,
            }
        }
        // Everything below is an error; the arms differ only in which error is reported.
        (Num::Zero, Some(c)) => {
            self.state = InnerState::Err;
            State::Err(ErrorKind::expect_dot_or_boundary(c))
        }
        (Num::Int, Some(c)) | (Num::Frac, Some(c)) | (Num::ExpInt, Some(c)) => {
            self.state = InnerState::Err;
            State::Err(ErrorKind::expect_digit_or_boundary(c))
        }
        (Num::Minus, Some(c)) | (Num::Dot, Some(c)) | (Num::ExpSign, Some(c)) => {
            self.state = InnerState::Err;
            State::Err(ErrorKind::expect_digit(c))
        }
        (Num::Exp, Some(c)) => {
            self.state = InnerState::Err;
            State::Err(ErrorKind::expect_exp_sign_or_digit(c))
        }
        // Input ended where at least one more digit was required.
        (Num::Minus, None) | (Num::Dot, None) | (Num::Exp, None) | (Num::ExpSign, None) => {
            self.unexpected_eof(Token::Num)
        }
    }
}
/// Advances the string recognizer by one byte.
///
/// `str` is the current string sub-state and `b` is the next input byte (`None` at end of
/// input, which is always an error inside a string).
///
/// Handles plain content bytes, backslash escapes (including `\uXXXX` and surrogate pairs
/// written as two `\u` escapes), and validation of multi-byte UTF-8 sequences. Three- and
/// four-byte sequences are validated as a whole when their final byte arrives.
///
/// Position tracking: a non-final byte of a multi-byte UTF-8 sequence advances only the
/// offset; every other successfully consumed byte advances the column. The position is not
/// advanced when an error is reported.
fn str(&mut self, str: Str, b: Option<u8>) -> State {
    // Set when the byte just consumed belongs to a multi-byte UTF-8 sequence that is not yet
    // complete, so the column must not advance for it.
    let mut in_utf8_seq = false;
    let s = match (str, b) {
        // ---- Plain content -------------------------------------------------------------
        (Str::Ready { escaped }, Some(b'"')) => {
            self.state = InnerState::Start;
            State::End {
                token: Token::Str,
                escaped,
                repeat: false,
            }
        }
        (Str::Ready { escaped: _ }, Some(b'\\')) => {
            self.state = InnerState::Str(Str::Esc);
            State::Mid
        }
        (Str::Ready { escaped: _ }, Some(b' '..=0x7f)) => State::Mid,
        (Str::Ready { escaped }, Some(0xc2..=0xdf)) => {
            in_utf8_seq = true;
            self.state = InnerState::Str(Str::Utf821 { escaped });
            State::Mid
        }
        (Str::Ready { escaped }, Some(b0)) if (0xe0..=0xef).contains(&b0) => {
            in_utf8_seq = true;
            self.state = InnerState::Str(Str::Utf831 { escaped, b0 });
            State::Mid
        }
        (Str::Ready { escaped }, Some(b0)) if (0xf0..=0xf4).contains(&b0) => {
            in_utf8_seq = true;
            self.state = InnerState::Str(Str::Utf841 { escaped, b0 });
            State::Mid
        }
        // Control bytes and bytes that cannot begin a UTF-8 sequence.
        (Str::Ready { escaped: _ }, Some(c)) => {
            self.state = InnerState::Err;
            State::Err(ErrorKind::expect_str_char(c))
        }

        // ---- Simple escapes ------------------------------------------------------------
        (Str::Esc, Some(c))
            if c == b'\\'
                || c == b'"'
                || c == b'n'
                || c == b't'
                || c == b'r'
                || c == b'f'
                || c == b'b'
                || c == b'/' =>
        {
            self.state = InnerState::Str(Str::Ready { escaped: true });
            State::Mid
        }
        (Str::Esc, Some(b'u')) => {
            self.state = InnerState::Str(Str::EscU);
            State::Mid
        }
        (Str::Esc, Some(c)) => {
            self.state = InnerState::Err;
            State::Err(ErrorKind::expect_esc_char(c))
        }

        // ---- `\uXXXX`: accumulate four hex digits --------------------------------------
        (Str::EscU, Some(x)) if Self::is_hex_byte(x) => {
            self.state = InnerState::Str(Str::EscU1(lexical::hex2u16(x)));
            State::Mid
        }
        (Str::EscU1(acc), Some(x)) if Self::is_hex_byte(x) => {
            self.state = InnerState::Str(Str::EscU2(acc << 4 | lexical::hex2u16(x)));
            State::Mid
        }
        (Str::EscU2(acc), Some(x)) if Self::is_hex_byte(x) => {
            self.state = InnerState::Str(Str::EscU3(acc << 4 | lexical::hex2u16(x)));
            State::Mid
        }
        (Str::EscU3(acc), Some(x)) if Self::is_hex_byte(x) => {
            let c = acc << 4 | lexical::hex2u16(x);
            match c {
                // Not a surrogate: the escape is complete.
                0x0000..=0xd7ff | 0xe000..=0xffff => {
                    self.state = InnerState::Str(Str::Ready { escaped: true });
                    State::Mid
                }
                // High surrogate: a low-surrogate escape must follow immediately.
                0xd800..=0xdbff => {
                    self.state = InnerState::Str(Str::EscHi(c));
                    State::Mid
                }
                // Low surrogate with no preceding high surrogate.
                0xdc00..=0xdfff => {
                    self.state = InnerState::Err;
                    State::Err(ErrorKind::BadSurrogate { first: c, second: None, offset: 5 })
                }
            }
        }

        // ---- Low half of a surrogate pair: `\uXXXX` again ------------------------------
        (Str::EscHi(hi), Some(b'\\')) => {
            self.state = InnerState::Str(Str::EscLoEsc(hi));
            State::Mid
        }
        (Str::EscHi(hi), Some(_)) => {
            self.state = InnerState::Err;
            State::Err(ErrorKind::BadSurrogate { first: hi, second: None, offset: 6 })
        }
        (Str::EscLoEsc(hi), Some(b'u')) => {
            self.state = InnerState::Str(Str::EscLoEscU(hi));
            State::Mid
        }
        (Str::EscLoEsc(hi), Some(_)) => {
            self.state = InnerState::Err;
            State::Err(ErrorKind::BadSurrogate { first: hi, second: None, offset: 7 })
        }
        (Str::EscLoEscU(hi), Some(x)) if Self::is_hex_byte(x) => {
            self.state = InnerState::Str(Str::EscLoEscU1(hi, lexical::hex2u16(x)));
            State::Mid
        }
        (Str::EscLoEscU1(hi, acc), Some(x)) if Self::is_hex_byte(x) => {
            self.state = InnerState::Str(Str::EscLoEscU2(hi, acc << 4 | lexical::hex2u16(x)));
            State::Mid
        }
        (Str::EscLoEscU2(hi, acc), Some(x)) if Self::is_hex_byte(x) => {
            self.state = InnerState::Str(Str::EscLoEscU3(hi, acc << 4 | lexical::hex2u16(x)));
            State::Mid
        }
        (Str::EscLoEscU3(hi, acc), Some(x)) if Self::is_hex_byte(x) => {
            let c = acc << 4 | lexical::hex2u16(x);
            match c {
                0xdc00..=0xdfff => {
                    self.state = InnerState::Str(Str::Ready { escaped: true });
                    State::Mid
                }
                // The second escape is not a low surrogate.
                _ => {
                    self.state = InnerState::Err;
                    State::Err(ErrorKind::BadSurrogate { first: hi, second: Some(c), offset: 5 })
                }
            }
        }
        // A non-hex byte anywhere inside the four digits of a `\u` escape.
        (Str::EscU, Some(c))
        | (Str::EscU1(_), Some(c))
        | (Str::EscU2(_), Some(c))
        | (Str::EscU3(_), Some(c))
        | (Str::EscLoEscU(_), Some(c))
        | (Str::EscLoEscU1(_, _), Some(c))
        | (Str::EscLoEscU2(_, _), Some(c))
        | (Str::EscLoEscU3(_, _), Some(c)) => {
            self.state = InnerState::Err;
            State::Err(ErrorKind::expect_unicode_esc_hex_digit(c))
        }

        // ---- Two-byte UTF-8 ------------------------------------------------------------
        (Str::Utf821 { escaped }, Some(b1)) => {
            if b1 & 0xc0 == 0x80 {
                self.state = InnerState::Str(Str::Ready { escaped });
                State::Mid
            } else {
                self.state = InnerState::Err;
                State::Err(ErrorKind::bad_utf8_cont_byte(2, 1, b1))
            }
        }

        // ---- Three-byte UTF-8 ----------------------------------------------------------
        (Str::Utf831 { escaped, b0 }, Some(b1)) => {
            in_utf8_seq = true;
            self.state = InnerState::Str(Str::Utf832 { escaped, b0, b1 });
            State::Mid
        }
        (Str::Utf832 { escaped, b0, b1 }, Some(b2)) => match (b0, b1) {
            // Lead bytes with a restricted second byte: E0 must not start an overlong
            // encoding, ED must not encode a surrogate code point.
            (0xe0, 0xa0..=0xbf) | (0xed, 0x80..=0x9f) if b2 & 0xc0 == 0x80 => {
                self.state = InnerState::Str(Str::Ready { escaped });
                State::Mid
            }
            // Every other lead byte takes any continuation byte second. The guard must
            // validate the *third* byte; the pattern already constrains the second.
            (_, 0x80..=0xbf) if b0 != 0xe0 && b0 != 0xed && b2 & 0xc0 == 0x80 => {
                self.state = InnerState::Str(Str::Ready { escaped });
                State::Mid
            }
            // Third byte is fine, so the second byte is the one at fault.
            (_, _) if b2 & 0xc0 == 0x80 => {
                self.state = InnerState::Err;
                State::Err(ErrorKind::bad_utf8_cont_byte(3, 1, b1))
            }
            _ => {
                self.state = InnerState::Err;
                State::Err(ErrorKind::bad_utf8_cont_byte(3, 2, b2))
            }
        },

        // ---- Four-byte UTF-8 -----------------------------------------------------------
        (Str::Utf841 { escaped, b0 }, Some(b1)) => {
            in_utf8_seq = true;
            self.state = InnerState::Str(Str::Utf842 { escaped, b0, b1 });
            State::Mid
        }
        (Str::Utf842 { escaped, b0, b1 }, Some(b2)) => {
            in_utf8_seq = true;
            self.state = InnerState::Str(Str::Utf843 {
                escaped,
                b0,
                b1,
                b2,
            });
            State::Mid
        }
        (
            Str::Utf843 {
                escaped,
                b0,
                b1,
                b2,
            },
            Some(b3),
        ) => match (b0, b1) {
            // F0 must not start an overlong encoding (second byte 90..=BF, inclusive);
            // F4 must not exceed U+10FFFF (second byte 80..=8F).
            (0xf0, 0x90..=0xbf) | (0xf4, 0x80..=0x8f)
                if b2 & 0xc0 == 0x80 && b3 & 0xc0 == 0x80 =>
            {
                self.state = InnerState::Str(Str::Ready { escaped });
                State::Mid
            }
            (_, 0x80..=0xbf)
                if b0 != 0xf0 && b0 != 0xf4 && b2 & 0xc0 == 0x80 && b3 & 0xc0 == 0x80 =>
            {
                self.state = InnerState::Str(Str::Ready { escaped });
                State::Mid
            }
            // Third and fourth bytes are fine, so the second byte is at fault.
            (_, _) if b2 & 0xc0 == 0x80 && b3 & 0xc0 == 0x80 => {
                self.state = InnerState::Err;
                State::Err(ErrorKind::bad_utf8_cont_byte(4, 1, b1))
            }
            (_, _) if b3 & 0xc0 == 0x80 => {
                self.state = InnerState::Err;
                State::Err(ErrorKind::bad_utf8_cont_byte(4, 2, b2))
            }
            _ => {
                self.state = InnerState::Err;
                State::Err(ErrorKind::bad_utf8_cont_byte(4, 3, b3))
            }
        },

        // ---- End of input inside a string ----------------------------------------------
        (_, None) => {
            self.state = InnerState::Err;
            State::Err(ErrorKind::UnexpectedEof(Token::Str))
        }
    };
    if self.state != InnerState::Err && !in_utf8_seq {
        self.pos.advance_col();
    } else if in_utf8_seq {
        // Inside a multi-byte character: count the byte but not a column.
        self.pos.advance_offset(1);
    }
    s
}
/// Continues a run of whitespace.
///
/// Spaces, tabs and line feeds are consumed in place; a carriage return is consumed and
/// switches to the carriage-return state. Any other input (including end of input) ends
/// the whitespace token without being consumed (`repeat: true`).
fn white(&mut self, b: Option<u8>) -> State {
    match b {
        Some(b'\n') => {
            self.pos.advance_line();
        }
        Some(b'\r') => {
            self.pos.advance_offset(1);
            self.state = InnerState::WhiteCr;
        }
        Some(b' ') | Some(b'\t') => {
            self.pos.advance_col();
        }
        _ => {
            self.state = InnerState::Start;
            return State::End {
                token: Token::White,
                escaped: false,
                repeat: true,
            };
        }
    }
    State::Mid
}
/// Continues a run of whitespace immediately after a carriage return.
///
/// The carriage return already advanced the offset but not the line, so this handler
/// accounts for the pending line break:
///
/// * line feed: consumed together with the pending line break as one line advance;
/// * another carriage return: the pending line break is applied while consuming it, and
///   the machine stays in this state;
/// * space or tab: the pending line break is applied without moving the offset, then the
///   byte is consumed as a column;
/// * anything else (including end of input): the pending line break is applied and the
///   whitespace token ends without consuming the input (`repeat: true`).
fn white_cr(&mut self, b: Option<u8>) -> State {
    match b {
        Some(b'\r') => {
            self.pos.advance_line();
        }
        Some(b'\n') => {
            self.pos.advance_line();
            self.state = InnerState::White;
        }
        Some(b' ') | Some(b'\t') => {
            self.pos.advance_line_no_offset();
            self.pos.advance_col();
            self.state = InnerState::White;
        }
        _ => {
            self.pos.advance_line_no_offset();
            self.state = InnerState::Start;
            return State::End {
                token: Token::White,
                escaped: false,
                repeat: true,
            };
        }
    }
    State::Mid
}
}
#[cfg(test)]
mod tests {
use super::*;
use rstest::rstest;
#[rstest]
#[case("", Token::Eof, true, false)]
#[case("{", Token::ObjBegin, true, false)]
#[case("}", Token::ObjEnd, true, false)]
#[case("[", Token::ArrBegin, true, false)]
#[case("]", Token::ArrEnd, true, false)]
#[case(":", Token::NameSep, true, false)]
#[case(",", Token::ValueSep, true, false)]
#[case("false", Token::LitFalse, false, false)]
#[case("null", Token::LitNull, false, false)]
#[case("true", Token::LitTrue, false, false)]
#[case("0", Token::Num, false, false)]
#[case("-0", Token::Num, false, false)]
#[case("1", Token::Num, false, false)]
#[case("-1", Token::Num, false, false)]
#[case("12", Token::Num, false, false)]
#[case("-12", Token::Num, false, false)]
#[case("0.0", Token::Num, false, false)]
#[case("-0.0", Token::Num, false, false)]
#[case("0.123456789", Token::Num, false, false)]
#[case("-123.456789", Token::Num, false, false)]
#[case("0e0", Token::Num, false, false)]
#[case("0e+0", Token::Num, false, false)]
#[case("0e-0", Token::Num, false, false)]
#[case("0.0e0", Token::Num, false, false)]
#[case("0.0e+0", Token::Num, false, false)]
#[case("0.0e0", Token::Num, false, false)]
#[case("0e0", Token::Num, false, false)]
#[case("-0e+0", Token::Num, false, false)]
#[case("-0e-0", Token::Num, false, false)]
#[case("-0.0e0", Token::Num, false, false)]
#[case("-0.0e+0", Token::Num, false, false)]
#[case("-0.0e0", Token::Num, false, false)]
#[case("123e456", Token::Num, false, false)]
#[case("123.456e+7", Token::Num, false, false)]
#[case("123.456e-89", Token::Num, false, false)]
#[case("-123e456", Token::Num, false, false)]
#[case("-123.456e+7", Token::Num, false, false)]
#[case("-123.456e-89", Token::Num, false, false)]
#[case(r#""""#, Token::Str, true, false)]
#[case(r#"" ""#, Token::Str, true, false)]
#[case(r#""foo""#, Token::Str, true, false)]
#[case(
r#""The quick brown fox jumped over the lazy dog!""#,
Token::Str,
true,
false
)]
#[case(r#""\\""#, Token::Str, true, true)]
#[case(r#""\/""#, Token::Str, true, true)]
#[case(r#""\t""#, Token::Str, true, true)]
#[case(r#""\r""#, Token::Str, true, true)]
#[case(r#""\n""#, Token::Str, true, true)]
#[case(r#""\f""#, Token::Str, true, true)]
#[case(r#""\b""#, Token::Str, true, true)]
#[case(r#""\u0000""#, Token::Str, true, true)]
#[case(r#""\u001f""#, Token::Str, true, true)]
#[case(r#""\u0020""#, Token::Str, true, true)]
#[case(r#""\u007E""#, Token::Str, true, true)]
#[case(r#""\u007F""#, Token::Str, true, true)]
#[case(r#""\u0080""#, Token::Str, true, true)]
#[case(r#""\u0100""#, Token::Str, true, true)]
#[case(r#""\uE000""#, Token::Str, true, true)]
#[case(r#""\ufDCf""#, Token::Str, true, true)]
#[case(r#""\uFdeF""#, Token::Str, true, true)]
#[case(r#""\ufffd""#, Token::Str, true, true)]
#[case(r#""\uFFFE""#, Token::Str, true, true)]
#[case(r#""\uFFFF""#, Token::Str, true, true)]
#[case(r#""\ud800\udc00""#, Token::Str, true, true)] #[case(r#""\uD800\uDFFF""#, Token::Str, true, true)] #[case(r#""\uDBFF\uDC00""#, Token::Str, true, true)] #[case(r#""\udbFf\udfff""#, Token::Str, true, true)] #[case(" ", Token::White, false, false)]
#[case("\t", Token::White, false, false)]
#[case(" ", Token::White, false, false)]
#[case("\t\t", Token::White, false, false)]
#[case(" \t \t \t \t\t", Token::White, false, false)]
fn test_single_token(
#[case] input: &str,
#[case] expect: Token,
#[case] self_terminating: bool,
#[case] escaped: bool,
) {
let mut mach = Machine::default();
assert_eq!(Pos::default(), *mach.pos());
for (i, b) in input.bytes().enumerate() {
assert_eq!(i, mach.pos().offset);
assert_eq!(1, mach.pos().line);
assert_eq!(i + 1, mach.pos().col);
let s = mach.next(Some(b));
if (i < input.len() - 1) || !self_terminating {
assert_eq!(State::Mid, s);
} else {
assert_eq!(
State::End {
token: expect,
escaped,
repeat: false
},
s
);
}
assert_eq!(i + 1, mach.pos().offset);
assert_eq!(1, mach.pos().line);
assert_eq!(i + 2, mach.pos().col);
}
let s = mach.next(None);
if !(self_terminating) {
assert_eq!(
State::End {
token: expect,
escaped,
repeat: true
},
s
);
} else {
assert_eq!(
State::End {
token: Token::Eof,
escaped: false,
repeat: false
},
s
);
}
assert_eq!(input.len(), mach.pos().offset);
assert_eq!(1, mach.pos().line);
assert_eq!(input.len() + 1, mach.pos().col);
let t = mach.next(None);
assert_eq!(
State::End {
token: Token::Eof,
escaped: false,
repeat: false
},
t
);
assert_eq!(input.len(), mach.pos().offset);
assert_eq!(1, mach.pos().line);
assert_eq!(input.len() + 1, mach.pos().col);
let u = mach.next(None);
assert_eq!(
State::End {
token: Token::Eof,
escaped: false,
repeat: false
},
u
);
assert_eq!(input.len(), mach.pos().offset);
assert_eq!(1, mach.pos().line);
assert_eq!(input.len() + 1, mach.pos().col);
}
// Lexes a string holding exactly one character, from 1 to 4 UTF-8 bytes long, and checks
// that the offset counts bytes while the column counts the character once.
#[rstest]
#[case("\"\u{007f}\"")]
#[case("\"\u{0080}\"")]
#[case("\"\u{07ff}\"")]
#[case("\"\u{0800}\"")]
#[case("\"\u{d7ff}\"")]
#[case("\"\u{e000}\"")]
#[case("\"\u{ffff}\"")]
#[case("\"\u{10000}\"")]
#[case("\"\u{10ffff}\"")]
fn test_utf8_seq(#[case] input: &str) {
    // All positions in this test are on line 1.
    let at = |offset: usize, col: usize| Pos {
        offset,
        line: 1,
        col,
    };
    let eof = State::End {
        token: Token::Eof,
        escaped: false,
        repeat: false,
    };

    let mut mach = Machine::default();
    assert_eq!(Pos::default(), *mach.pos());

    // Number of bytes in the single character between the quotes.
    let n = input.len() - 2;
    let mut bytes = input.bytes();

    // Opening quote.
    assert_eq!(State::Mid, mach.next(Some(bytes.next().unwrap())));
    assert_eq!(at(1, 2), *mach.pos());

    // All but the last byte of the character: offset moves, column does not.
    for i in 1..n {
        assert_eq!(State::Mid, mach.next(Some(bytes.next().unwrap())));
        assert_eq!(at(1 + i, 2), *mach.pos());
    }

    // Last byte of the character: the column finally advances.
    assert_eq!(State::Mid, mach.next(Some(bytes.next().unwrap())));
    assert_eq!(at(1 + n, 3), *mach.pos());

    // Closing quote ends the string.
    assert_eq!(
        State::End {
            token: Token::Str,
            escaped: false,
            repeat: false
        },
        mach.next(Some(bytes.next().unwrap()))
    );
    assert_eq!(at(2 + n, 4), *mach.pos());
    assert!(matches!(bytes.next(), None));

    // End of input, twice: Eof both times, position unchanged.
    assert_eq!(eof, mach.next(None));
    assert_eq!(at(2 + n, 4), *mach.pos());
    assert_eq!(eof, mach.next(None));
    assert_eq!(at(2 + n, 4), *mach.pos());
}
// Lexes whitespace containing line breaks. `line_col[i]` is the expected (line, column)
// after feeding byte `i`; the final entry is the expected position after end of input.
#[rstest]
#[case("\n", &[(2, 1), (2, 1)])]
#[case("\n\n", &[(2, 1), (3, 1), (3, 1)])]
#[case("\r", &[(1, 1), (2, 1)])]
#[case("\r\r", &[(1, 1), (2, 1), (3, 1)])]
#[case("\r\n", &[(1, 1), (2, 1), (2, 1)])]
#[case("\n\r", &[(2, 1), (2, 1), (3, 1)])]
#[case("\n\n\r\r", &[(2, 1), (3, 1), (3, 1), (4, 1), (5, 1)])]
#[case("\r\n\r", &[(1, 1), (2, 1), (2, 1), (3, 1)])]
#[case("\n\r\n", &[(2, 1), (2, 1), (3, 1), (3, 1)])]
#[case(" \n", &[(1, 2), (2, 1), (2, 1)])]
#[case("\n ", &[(2, 1), (2, 2), (2, 2)])]
#[case(" \r", &[(1, 2), (1, 2), (2, 1)])]
#[case("\r ", &[(1, 1), (2, 2), (2, 2)])]
#[case("\t\n", &[(1, 2), (2, 1), (2, 1)])]
#[case("\n\t", &[(2, 1), (2, 2), (2, 2)])]
#[case("\t\r", &[(1, 2), (1, 2), (2, 1)])]
#[case("\r\t", &[(1, 1), (2, 2), (2, 2)])]
fn test_whitespace_multiline(#[case] input: &str, #[case] line_col: &[(usize, usize)]) {
    assert_eq!(input.len() + 1, line_col.len());

    let mut mach = Machine::default();
    assert_eq!(Pos::default(), *mach.pos());

    for (i, b) in input.bytes().enumerate() {
        let s = mach.next(Some(b));
        assert_eq!(State::Mid, s, "i={i}");

        let (line, col) = line_col[i];
        assert_eq!(i + 1, mach.pos().offset, "i={i}");
        assert_eq!(line, mach.pos().line, "i={i}");
        assert_eq!(col, mach.pos().col, "i={i}");
    }

    // End of input closes the whitespace token; a trailing carriage return only takes
    // effect on the line number at this point.
    let s = mach.next(None);
    assert_eq!(
        State::End {
            token: Token::White,
            escaped: false,
            repeat: true
        },
        s
    );
    let (line, col) = line_col[input.len()];
    assert_eq!(input.len(), mach.pos().offset);
    assert_eq!(line, mach.pos().line);
    assert_eq!(col, mach.pos().col);

    let t = mach.next(None);
    assert_eq!(
        State::End {
            token: Token::Eof,
            escaped: false,
            repeat: false
        },
        t
    );
    assert_eq!(input.len(), mach.pos().offset);
    assert_eq!(line, mach.pos().line);
    assert_eq!(col, mach.pos().col);
}
// Feeds a string with a malformed surrogate escape and checks the reported error.
//
// `trigger_offset` is the index of the byte expected to produce the error; every byte
// before it must be accepted. `first`, `second` and `offset` are the expected fields of
// `ErrorKind::BadSurrogate`.
#[rstest]
#[case(r#""\uDC00""#, 0xdc00, None, 5, 6)]
#[case(r#""\udc00""#, 0xdc00, None, 5, 6)]
#[case(r#""\uDFFF""#, 0xdfff, None, 5, 6)]
#[case(r#""\udfff""#, 0xdfff, None, 5, 6)]
#[case(r#""\uD800""#, 0xd800, None, 6, 7)]
#[case(r#""\ud800""#, 0xd800, None, 6, 7)]
#[case(r#""\uDBFF""#, 0xdbff, None, 6, 7)]
#[case(r#""\udbff""#, 0xdbff, None, 6, 7)]
#[case(r#""\uD800x""#, 0xd800, None, 6, 7)]
#[case(r#""\ud800x""#, 0xd800, None, 6, 7)]
#[case(r#""\uDBFFx""#, 0xdbff, None, 6, 7)]
#[case(r#""\udbffx""#, 0xdbff, None, 6, 7)]
#[case(r#""\uD800\""#, 0xd800, None, 7, 8)]
#[case(r#""\ud800\""#, 0xd800, None, 7, 8)]
#[case(r#""\uDBFF\""#, 0xdbff, None, 7, 8)]
#[case(r#""\udbff\""#, 0xdbff, None, 7, 8)]
#[case(r#""\uD800\/""#, 0xd800, None, 7, 8)]
#[case(r#""\ud800\t""#, 0xd800, None, 7, 8)]
#[case(r#""\uDBFF\r""#, 0xdbff, None, 7, 8)]
#[case(r#""\udbff\n""#, 0xdbff, None, 7, 8)]
#[case(r#""\uD800\ud800""#, 0xd800, Some(0xd800), 5, 12)]
#[case(r#""\uD800\uDBFF""#, 0xd800, Some(0xdbff), 5, 12)]
#[case(r#""\udbff\ue000""#, 0xdbff, Some(0xe000), 5, 12)]
#[case(r#""\udbff\u0000""#, 0xdbff, Some(0x0000), 5, 12)]
fn test_single_error_bad_surrogate(
    #[case] input: &str,
    #[case] first: u16,
    #[case] second: Option<u16>,
    #[case] offset: u8,
    #[case] trigger_offset: usize,
) {
    let mut mach = Machine::default();
    assert_eq!(Pos::default(), *mach.pos());

    let bytes = input.as_bytes();

    // Everything before the trigger byte is accepted, one column per byte.
    for (i, &b) in bytes[..trigger_offset].iter().enumerate() {
        let s = mach.next(Some(b));
        assert_eq!(State::Mid, s, "i={i}");
        assert_eq!(i + 1, mach.pos().offset, "i={i}");
        assert_eq!(1, mach.pos().line, "i={i}");
        assert_eq!(i + 2, mach.pos().col, "i={i}");
    }

    // The trigger byte must produce exactly the expected surrogate error.
    let s = mach.next(Some(bytes[trigger_offset]));
    match s {
        State::Err(ErrorKind::BadSurrogate {
            first: got_first,
            second: got_second,
            offset: got_offset,
        }) if got_first == first && got_second == second && got_offset == offset => {}
        _ => panic!(
            "s = {s:?}, but first = {first:?}, second = {second:?}, and offset = {offset} at {}",
            mach.pos()
        ),
    }

    // The failing byte is not consumed, so the position still points at it.
    assert_eq!(
        Pos {
            offset: trigger_offset,
            line: 1,
            col: trigger_offset + 1
        },
        *mach.pos()
    );
}
}