use crate::Pos;
use std::fmt;
pub mod buf;
pub mod state;
/// Kind of lexical token recognized in JSON text.
///
/// `Ord` and `PartialOrd` are derived, so the declaration order of the variants below defines
/// how tokens compare with each other; reordering variants changes comparison results.
#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub enum Token {
/// Array begin punctuation, `[`.
ArrBegin,
/// Array end punctuation, `]`.
ArrEnd,
/// End of input; displayed as `EOF`.
Eof,
/// Error pseudo-token; displayed as `error`.
Err,
/// The literal `false`.
LitFalse,
/// The literal `null`.
LitNull,
/// The literal `true`.
LitTrue,
/// Name separator punctuation, `:`.
NameSep,
/// Number token; its text varies, so it has no static content.
Num,
/// Object begin punctuation, `{`.
ObjBegin,
/// Object end punctuation, `}`.
ObjEnd,
/// String token; its text varies, so it has no static content.
Str,
/// Value separator punctuation, `,`.
ValueSep,
/// Whitespace; its text varies, so it has no static content.
White,
}
impl Token {
    /// Returns `true` for tokens that are a complete value by themselves: the three literals
    /// (`false`, `null`, `true`), numbers, and strings.
    pub const fn is_value(&self) -> bool {
        matches!(
            self,
            Self::LitFalse | Self::LitNull | Self::LitTrue | Self::Num | Self::Str
        )
    }

    /// Returns `true` for the six punctuation tokens: `[`, `]`, `{`, `}`, `:`, and `,`.
    pub const fn is_punct(&self) -> bool {
        matches!(
            self,
            Self::ArrBegin
                | Self::ArrEnd
                | Self::NameSep
                | Self::ObjBegin
                | Self::ObjEnd
                | Self::ValueSep
        )
    }

    /// Returns the fixed text of the token, if it has one.
    ///
    /// Punctuation and literal tokens always have the same text, which is returned as
    /// `Some(..)`. Every other token kind returns `None`.
    ///
    /// Like the sibling predicates this is a `const fn`, so it can be used in constant
    /// contexts.
    pub const fn static_content(&self) -> Option<&'static str> {
        // Exhaustive on purpose: adding a `Token` variant must force a decision here instead
        // of silently falling into a wildcard arm.
        match self {
            Self::ArrBegin => Some("["),
            Self::ArrEnd => Some("]"),
            Self::LitFalse => Some("false"),
            Self::LitNull => Some("null"),
            Self::LitTrue => Some("true"),
            Self::NameSep => Some(":"),
            Self::ObjBegin => Some("{"),
            Self::ObjEnd => Some("}"),
            Self::ValueSep => Some(","),
            Self::Eof | Self::Err | Self::Num | Self::Str | Self::White => None,
        }
    }
}
impl fmt::Display for Token {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let s = match self {
Self::ArrBegin => "[",
Self::ArrEnd => "]",
Self::Eof => "EOF",
Self::Err => "error",
Self::LitFalse => "false",
Self::LitNull => "null",
Self::LitTrue => "true",
Self::NameSep => ":",
Self::Num => "number",
Self::ObjBegin => "{",
Self::ObjEnd => "}",
Self::Str => "string",
Self::ValueSep => ",",
Self::White => "whitespace",
};
f.write_str(s)
}
}
/// Access to the text content of a token.
///
/// NOTE(review): implementors are not visible from this file, so the per-method notes below are
/// inferred from the signatures and names only and should be confirmed against them.
pub trait Content {
/// Returns the literal text as a string slice — presumably the token text exactly as it
/// appears in the input; confirm.
fn literal(&self) -> &str;
/// Reports whether the content is escaped — presumably whether the literal contains escape
/// sequences that `unescaped` would expand; confirm.
fn is_escaped(&self) -> bool;
/// Returns the unescaped text. Takes `&mut self`, presumably so an implementation can compute
/// and store the unescaped form on demand — confirm.
fn unescaped(&mut self) -> &str;
}
/// What was expected at the point where an unexpected byte was found.
///
/// Used as the `expect` field of `ErrorKind::UnexpectedByte`; the `Display` implementation
/// turns each variant into the phrase used in error messages.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum Expect {
/// A boundary character or EOF.
Boundary,
/// One specific character.
Char(char),
/// A digit character `'0'..'9'`.
Digit,
/// An exponent sign character (`'+'` or `'-'`) or an exponent digit character.
DigitOrExpSign,
/// A digit character, a boundary character, or EOF.
DigitOrBoundary,
/// The character `'.'`, a boundary character, or EOF.
DotOrBoundary,
/// A character that may follow the backslash of an escape sequence.
EscChar,
/// A string character.
StrChar,
/// A character that can start a token.
TokenStartChar,
/// A hex digit within a Unicode escape sequence.
UnicodeEscHexDigit,
}
impl fmt::Display for Expect {
    /// Writes the expectation as a phrase that reads naturally after the word "expected".
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let phrase = match self {
            // The only variant carrying data, hence the only one that needs formatting.
            Self::Char(c) => return write!(f, "character '{c}'"),
            Self::Boundary => "boundary character or EOF",
            Self::Digit => "digit character '0'..'9'",
            Self::DigitOrBoundary => "digit character '0'..'9', boundary character, or EOF",
            Self::DigitOrExpSign => {
                "exponent sign character '+' or '-', or exponent digit character '0'..'9'"
            }
            Self::DotOrBoundary => "character '.', boundary character, or EOF",
            // Lists every escape selector, including 'b' and 'f', which the unescaping code
            // in this module accepts alongside the others.
            Self::EscChar => {
                "escape sequence character '\\', '\"', '/', 'b', 'f', 'n', 'r', 't', or 'u'"
            }
            Self::StrChar => "string character",
            Self::TokenStartChar => "token start character",
            Self::UnicodeEscHexDigit => {
                "Unicode escape sequence hex digit '0'..'9', 'A'..'F', or 'a'..'f'"
            }
        };
        f.write_str(phrase)
    }
}
/// Category of lexical error, with the details needed to describe it.
///
/// `ErrorKind::fmt_at` (also used by the `Display` implementation) renders each variant as a
/// message.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum ErrorKind {
/// A Unicode escape sequence contains an invalid surrogate.
///
/// As rendered by `fmt_at`: with `second == None` and `first` in `0xDC00..=0xDFFF`, a low
/// surrogate appeared without a preceding high surrogate; with `second == None` otherwise, a
/// high surrogate was not followed by a low surrogate; with `second == Some(..)`, a high
/// surrogate was followed by an invalid low surrogate.
BadSurrogate {
/// First surrogate code unit encountered.
first: u16,
/// Code unit that followed `first`, if any.
second: Option<u16>,
/// NOTE(review): not used by the formatting code in this file; presumably locates the bad
/// escape sequence relative to some position — confirm against the code that sets it.
offset: u8,
},
/// A byte inside a multi-byte UTF-8 sequence is not a valid continuation byte.
BadUtf8ContByte {
/// Length in bytes of the UTF-8 sequence, as reported in the message.
seq_len: u8,
/// Reported in the message as `byte #offset`; whether it is zero- or one-based is not
/// visible here.
offset: u8,
/// The offending byte value.
value: u8,
},
/// Rendered as `read error`; presumably a failure reading the underlying input — confirm.
Read,
/// A byte was encountered that is not allowed at this point.
UnexpectedByte {
/// Token in which the byte was found; `None` when the byte could not start any token.
token: Option<Token>,
/// What was expected instead.
expect: Expect,
/// The byte actually found.
actual: u8,
},
/// Input ended inside the given token.
UnexpectedEof(Token),
}
impl ErrorKind {
    /// Creates an [`ErrorKind::BadUtf8ContByte`] from its three fields.
    pub(crate) fn bad_utf8_cont_byte(seq_len: u8, offset: u8, value: u8) -> ErrorKind {
        ErrorKind::BadUtf8ContByte {
            seq_len,
            offset,
            value,
        }
    }

    /// Unexpected byte `actual` in `token` where [`Expect::Boundary`] was required.
    pub(crate) fn expect_boundary(token: Token, actual: u8) -> ErrorKind {
        ErrorKind::UnexpectedByte {
            token: Some(token),
            expect: Expect::Boundary,
            actual,
        }
    }

    /// Unexpected byte `actual` in `token` where the specific character `expect` was required.
    pub(crate) fn expect_char(token: Token, actual: u8, expect: char) -> ErrorKind {
        ErrorKind::UnexpectedByte {
            token: Some(token),
            expect: Expect::Char(expect),
            actual,
        }
    }

    /// Unexpected byte `actual` in a number where [`Expect::Digit`] was required.
    pub(crate) fn expect_digit(actual: u8) -> ErrorKind {
        ErrorKind::UnexpectedByte {
            token: Some(Token::Num),
            expect: Expect::Digit,
            actual,
        }
    }

    /// Unexpected byte `actual` in a number where [`Expect::DigitOrBoundary`] was required.
    pub(crate) fn expect_digit_or_boundary(actual: u8) -> ErrorKind {
        ErrorKind::UnexpectedByte {
            token: Some(Token::Num),
            expect: Expect::DigitOrBoundary,
            actual,
        }
    }

    /// Unexpected byte `actual` in a number where [`Expect::DotOrBoundary`] was required.
    pub(crate) fn expect_dot_or_boundary(actual: u8) -> ErrorKind {
        ErrorKind::UnexpectedByte {
            token: Some(Token::Num),
            expect: Expect::DotOrBoundary,
            actual,
        }
    }

    /// Unexpected byte `actual` in a string where [`Expect::EscChar`] was required.
    pub(crate) fn expect_esc_char(actual: u8) -> ErrorKind {
        ErrorKind::UnexpectedByte {
            token: Some(Token::Str),
            expect: Expect::EscChar,
            actual,
        }
    }

    /// Unexpected byte `actual` in a number where [`Expect::DigitOrExpSign`] was required.
    pub(crate) fn expect_exp_sign_or_digit(actual: u8) -> ErrorKind {
        ErrorKind::UnexpectedByte {
            token: Some(Token::Num),
            expect: Expect::DigitOrExpSign,
            actual,
        }
    }

    /// Unexpected byte `actual` in a string where [`Expect::StrChar`] was required.
    pub(crate) fn expect_str_char(actual: u8) -> ErrorKind {
        ErrorKind::UnexpectedByte {
            token: Some(Token::Str),
            expect: Expect::StrChar,
            actual,
        }
    }

    /// Unexpected byte `actual` where a token had to start; no token is attached to the error.
    pub(crate) fn expect_token_start_char(actual: u8) -> ErrorKind {
        ErrorKind::UnexpectedByte {
            token: None,
            expect: Expect::TokenStartChar,
            actual,
        }
    }

    /// Unexpected byte `actual` in a string where [`Expect::UnicodeEscHexDigit`] was required.
    pub(crate) fn expect_unicode_esc_hex_digit(actual: u8) -> ErrorKind {
        ErrorKind::UnexpectedByte {
            token: Some(Token::Str),
            expect: Expect::UnicodeEscHexDigit,
            actual,
        }
    }

    /// Writes the error message to `f`, followed by ` at <pos>` when `pos` is given.
    ///
    /// Every optional part of the message (` in <token> token`, ` at <pos>`) starts with its
    /// own separating space, so the parts can be appended in any combination.
    pub(crate) fn fmt_at(&self, f: &mut fmt::Formatter, pos: Option<&Pos>) -> fmt::Result {
        match self {
            // A lone code unit from the low surrogate range.
            Self::BadSurrogate {
                first: lo,
                second: None,
                ..
            } if (0xdc00..=0xdfff).contains(lo) => {
                write!(
                    f,
                    "bad Unicode escape sequence: low surrogate '\\u{lo:04X}' without preceding high surrogate"
                )?;
            }
            Self::BadSurrogate {
                first: hi,
                second: None,
                ..
            } => {
                write!(
                    f,
                    "bad Unicode escape sequence: high surrogate '\\u{hi:04X}' not followed by low surrogate"
                )?;
            }
            Self::BadSurrogate {
                first: hi,
                second: Some(lo),
                ..
            } => {
                write!(
                    f,
                    "bad Unicode escape sequence surrogate pair: high surrogate '\\u{hi:04X}' followed by invalid low surrogate '\\u{lo:04X}'"
                )?;
            }
            Self::BadUtf8ContByte {
                seq_len,
                offset,
                value,
            } => {
                write!(
                    f,
                    "bad UTF-8 continuation byte 0x{value:02x} in {seq_len}-byte UTF-8 sequence (byte #{offset})"
                )?;
            }
            Self::Read => write!(f, "read error")?,
            Self::UnexpectedByte {
                token,
                expect,
                actual,
            } => {
                if (b' '..=0x7e).contains(actual) {
                    // Printable ASCII: show the character as well as its code.
                    write!(
                        f,
                        "expected {expect} but got character '{}' (ASCII 0x{actual:02x})",
                        *actual as char
                    )?;
                } else {
                    // The `0x` prefix keeps a value such as `10` from being read as decimal.
                    write!(f, "expected {expect} but got byte 0x{actual:02x}")?;
                }
                if let Some(t) = token {
                    write!(f, " in {t} token")?;
                }
            }
            Self::UnexpectedEof(token) => {
                write!(f, "unexpected EOF in {token} token")?;
            }
        };
        if let Some(p) = pos {
            write!(f, " at {p}")?;
        }
        Ok(())
    }
}
impl fmt::Display for ErrorKind {
    /// Writes the error message without any position information.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Same rendering as `fmt_at`, just with no position to append.
        Self::fmt_at(self, f, None)
    }
}
/// A lexical error: a standard error type that also exposes an [`ErrorKind`] and a position.
///
/// Implementors must be `Send + Sync` in addition to implementing `std::error::Error`.
pub trait Error: std::error::Error + Send + Sync {
/// Returns the category of the error, by value.
fn kind(&self) -> ErrorKind;
/// Returns the position associated with the error — presumably where in the input it was
/// detected; confirm against implementors.
fn pos(&self) -> &Pos;
}
/// A lexical analyzer that yields [`Token`]s one at a time.
///
/// NOTE(review): implementors are not visible from this file; the method notes below are
/// inferred from the signatures and should be confirmed against them.
pub trait Analyzer {
/// Type giving access to a token's text content.
type Content: Content;
/// Error type reported by `content`.
type Error: Error;
/// Advances the analyzer and returns the kind of the next token. Failure is not expressed
/// through the return type; presumably it is signalled by `Token::Err` — confirm.
fn next(&mut self) -> Token;
/// Returns the content of the current token, or the error — presumably the one behind a
/// `Token::Err` result from `next`; confirm.
fn content(&self) -> Result<Self::Content, Self::Error>;
/// Returns the analyzer's current position; exactly which point of the input it refers to
/// is not visible here.
fn pos(&self) -> &Pos;
}
/// Converts one ASCII hex digit (`0-9`, `a-f`, `A-F`) to its numeric value, `0..=15`.
///
/// # Panics
///
/// Panics if `b` is not an ASCII hex digit.
pub(crate) fn hex2u16(b: u8) -> u16 {
    // `to_digit(16)` accepts exactly `0-9`, `a-f`, and `A-F`; bytes above 0x7f map to
    // non-ASCII chars, which it rejects as well.
    match char::from(b).to_digit(16) {
        Some(value) => value as u16,
        None => panic!("invalid hex character: 0x{b:02x}"),
    }
}
pub(crate) fn unescape(literal: &str, buf: &mut Vec<u8>) {
debug_assert!(literal.len() >= 2);
debug_assert!(matches!(literal.chars().next(), Some('"')));
debug_assert!(matches!(literal.chars().nth_back(0), Some('"')));
let bytes = literal.as_bytes();
buf.reserve(bytes.len() - 1);
let (mut i, mut j) = (0usize, 0usize);
let mut hi_surrogate: Option<u32> = None;
while j < bytes.len() {
if bytes[j] != b'\\' {
j += 1;
} else {
buf.extend_from_slice(&bytes[i..j]);
let x = bytes[j + 1];
let mut len = 2;
match x {
b'"' | b'\\' | b'/' => buf.push(x),
b'b' => buf.push(b'\x08'),
b't' => buf.push(b'\t'),
b'f' => buf.push(b'\x0c'),
b'n' => buf.push(b'\n'),
b'r' => buf.push(b'\r'),
b'u' => {
len = 6;
let (b0, b1, b2, b3) = (bytes[j + 2], bytes[j + 3], bytes[j + 4], bytes[j + 5]);
let x: u32 =
(hex2u16(b0) << 12 | hex2u16(b1) << 8 | hex2u16(b2) << 4 | hex2u16(b3))
as u32;
let code_point = match (hi_surrogate, x as u32) {
(None, 0xd800..=0xdbff) => {
hi_surrogate = Some(x);
None
}
(None, _) => Some(x),
(Some(hi), 0xdc00..=0xdfff) => {
hi_surrogate = None;
Some(0x10000 + (((hi - 0xd800) << 10) | (x - 0xdc00)))
}
(Some(hi), _) => panic!(
"high surrogate followed by invalid low surrogate: [0x{hi:04x}], [0x{x:04x}]"
),
};
if let Some(c) = code_point {
match char::from_u32(c) {
Some(y) => {
let mut seq = [0u8; 4];
let utf8_str = y.encode_utf8(&mut seq);
buf.extend_from_slice(utf8_str.as_bytes());
}
None => unreachable!(),
}
}
}
_ => panic!("invalid escape sequence byte after '\\': 0x{x:02x}"),
}
j += len;
i = j;
}
}
debug_assert!(hi_surrogate.is_none());
buf.extend_from_slice(&bytes[i..j]);
}
#[cfg(test)]
mod tests {
    use super::*;
    use rstest::rstest;

    // One case per `Token` variant for each of the three classification tests below.

    #[rstest]
    #[case(Token::ArrBegin, false)]
    #[case(Token::ArrEnd, false)]
    #[case(Token::Eof, false)]
    #[case(Token::Err, false)]
    #[case(Token::LitFalse, true)]
    #[case(Token::LitNull, true)]
    #[case(Token::LitTrue, true)]
    #[case(Token::NameSep, false)]
    #[case(Token::Num, true)]
    #[case(Token::ObjBegin, false)]
    #[case(Token::ObjEnd, false)]
    #[case(Token::Str, true)]
    #[case(Token::ValueSep, false)]
    #[case(Token::White, false)]
    fn test_token_is_value(#[case] token: Token, #[case] is_value: bool) {
        assert_eq!(is_value, token.is_value());
    }

    #[rstest]
    #[case(Token::ArrBegin, true)]
    #[case(Token::ArrEnd, true)]
    #[case(Token::Eof, false)]
    #[case(Token::Err, false)]
    #[case(Token::LitFalse, false)]
    #[case(Token::LitNull, false)]
    #[case(Token::LitTrue, false)]
    #[case(Token::NameSep, true)]
    #[case(Token::Num, false)]
    #[case(Token::ObjBegin, true)]
    #[case(Token::ObjEnd, true)]
    #[case(Token::Str, false)]
    #[case(Token::ValueSep, true)]
    #[case(Token::White, false)]
    fn test_token_is_punct(#[case] token: Token, #[case] is_punct: bool) {
        assert_eq!(is_punct, token.is_punct());
    }

    #[rstest]
    #[case(Token::ArrBegin, Some("["))]
    #[case(Token::ArrEnd, Some("]"))]
    #[case(Token::Eof, None)]
    #[case(Token::Err, None)]
    #[case(Token::LitFalse, Some("false"))]
    #[case(Token::LitNull, Some("null"))]
    #[case(Token::LitTrue, Some("true"))]
    #[case(Token::NameSep, Some(":"))]
    #[case(Token::Num, None)]
    #[case(Token::ObjBegin, Some("{"))]
    #[case(Token::ObjEnd, Some("}"))]
    #[case(Token::Str, None)]
    #[case(Token::ValueSep, Some(","))]
    #[case(Token::White, None)]
    fn test_token_static_content(#[case] token: Token, #[case] static_content: Option<&str>) {
        assert_eq!(static_content, token.static_content());
    }

    #[rstest]
    // No escapes: the literal is copied through unchanged.
    #[case(r#""""#, r#""""#)]
    #[case(r#""f""#, r#""f""#)]
    #[case(r#""fo""#, r#""fo""#)]
    #[case(r#""foo""#, r#""foo""#)]
    // Two-character escapes.
    #[case(r#""\\""#, r#""\""#)]
    #[case(r#""\/""#, r#""/""#)]
    #[case(r#""\"""#, r#"""""#)]
    #[case(r#""\b""#, "\"\x08\"")]
    #[case(r#""\t""#, "\"\t\"")]
    #[case(r#""\f""#, "\"\x0c\"")]
    #[case(r#""\n""#, "\"\n\"")]
    #[case(r#""\r""#, "\"\r\"")]
    // Unicode escapes that encode to one UTF-8 byte, in both hex digit cases.
    #[case(r#""\u0000""#, "\"\0\"")]
    #[case(r#""\u0008""#, "\"\x08\"")]
    #[case(r#""\u0009""#, "\"\t\"")]
    #[case(r#""\u000c""#, "\"\x0c\"")]
    #[case(r#""\u000C""#, "\"\x0C\"")]
    #[case(r#""\u000a""#, "\"\n\"")]
    #[case(r#""\u000A""#, "\"\n\"")]
    #[case(r#""\u000d""#, "\"\r\"")]
    #[case(r#""\u000D""#, "\"\r\"")]
    #[case(r#""\u0021""#, r#""!""#)]
    #[case(r#""\u0030""#, r#""0""#)]
    #[case(r#""\u0041""#, r#""A""#)]
    #[case(r#""\u0062""#, r#""b""#)]
    #[case(r#""\u007F""#, "\"\x7f\"")]
    // Unicode escapes that encode to two UTF-8 bytes.
    #[case(r#""\u00A9""#, r#""©""#)]
    #[case(r#""\u03A9""#, r#""Ω""#)]
    #[case(r#""\u0080""#, "\"\u{80}\"")]
    #[case(r#""\u07FF""#, "\"\u{7ff}\"")]
    // Unicode escapes that encode to three UTF-8 bytes.
    #[case(r#""\u20AC""#, r#""€""#)]
    #[case(r#""\u2603""#, r#""☃""#)]
    #[case(r#""\u0800""#, "\"\u{800}\"")]
    #[case(r#""\uFFFF""#, "\"\u{ffff}\"")]
    // Surrogate pairs, which encode to four UTF-8 bytes.
    #[case(r#""\ud83D\uDe00""#, r#""😀""#)]
    #[case(r#""\ud800\uDC00""#, "\"\u{10000}\"")]
    #[case(r#""\uDBFF\udfff""#, "\"\u{10FFFF}\"")]
    fn test_unescape_ok(#[case] literal: &str, #[case] expect: &str) {
        // Into an empty buffer.
        {
            let mut buf = Vec::new();
            unescape(literal, &mut buf);
            let actual = String::from_utf8(buf).unwrap();
            assert_eq!(actual, expect);
        }
        // Into a buffer that already has content: the result must be appended.
        {
            let mut buf = Vec::new();
            buf.extend_from_slice(b"foo");
            unescape(literal, &mut buf);
            let actual = String::from_utf8(buf).unwrap();
            assert_eq!(actual, format!("foo{expect}"));
        }
    }

    #[rstest]
    #[case(r#""\ud800\u0000""#)]
    #[case(r#""\uDBFF\ud800""#)]
    #[should_panic(expected = "high surrogate followed by invalid low surrogate")]
    fn test_unescape_panic_invalid_surrogate_pair(#[case] literal: &str) {
        let mut buf = Vec::new();
        unescape(literal, &mut buf);
    }

    #[rstest]
    #[case(r#""\a""#)]
    #[case(r#""\U""#)]
    #[case(r#""\:""#)]
    #[should_panic(expected = "invalid escape sequence byte after '\\'")]
    fn test_unescape_panic_invalid_esc_seq_byte(#[case] literal: &str) {
        let mut buf = Vec::new();
        unescape(literal, &mut buf);
    }
}