use crate::error::Reason;
use crate::input::Input;
use crate::raw_token::RawToken;
use std::io::Read;
/// Byte-level JSON tokenizer state.
///
/// The scanner does not own its input; every scanning method takes the `Input` to read from.
/// `Default` yields the same empty state as `Scanner::new`.
#[derive(Debug, Default)]
pub struct Scanner {
    /// Byte that was read while scanning a number but turned out not to belong to it.
    /// `read_token` consumes it before pulling anything else from the input.
    leftover: Option<u8>,
    /// Reusable buffer holding the bytes of the string or number token currently being built.
    scratch: Vec<u8>,
}
impl Scanner {
/// Creates a scanner with no pending byte and an empty scratch buffer.
pub fn new() -> Self {
    let scratch = Vec::new();
    Self {
        leftover: None,
        scratch,
    }
}
/// Returns the scanner to its freshly constructed state.
///
/// Both pieces of state are cleared:
/// * the scratch buffer is emptied (`Vec::clear` keeps its capacity for reuse), and
/// * any byte stashed after a number scan is dropped, so it cannot be replayed as the
///   first byte of whatever input is scanned next.
pub fn reset(&mut self) {
    self.leftover = None;
    self.scratch.clear();
}
/// Skips whitespace and scans the next token from `input`.
///
/// A byte stashed by a previous number scan is consumed before any new byte is read.
/// Tab, line feed, carriage return and space are skipped. When `input.next()` reports that
/// no byte is available, `RawToken::Eof` is returned.
///
/// # Errors
///
/// * `Reason::UnexpectedChar` if the first non-whitespace byte cannot start a token.
/// * Whatever the token-specific scanners or `input.next()` report.
#[inline]
pub fn read_token(&mut self, input: &mut Input<impl Read>) -> Result<RawToken, Reason> {
    // Only the first iteration can see a stashed byte; nothing below refills it.
    let mut pending = self.leftover.take();
    let first = loop {
        let byte = match pending.take() {
            Some(stashed) => stashed,
            None => {
                let Some(fresh) = input.next()? else {
                    return Ok(RawToken::Eof);
                };
                fresh
            }
        };
        if !matches!(byte, b'\t' | b'\n' | b'\r' | b' ') {
            break byte;
        }
    };

    match first {
        b'{' => Ok(RawToken::ObjectStart),
        b'}' => Ok(RawToken::ObjectEnd),
        b'[' => Ok(RawToken::ArrayStart),
        b']' => Ok(RawToken::ArrayEnd),
        b',' => Ok(RawToken::Comma),
        b':' => Ok(RawToken::Colon),
        b'"' => self.string(input),
        b'n' => self.null(input),
        b't' => self.bool_true(input),
        b'f' => self.bool_false(input),
        lead @ (b'-' | b'0'..=b'9') => self.number(lead, input),
        _ => Err(Reason::UnexpectedChar),
    }
}
/// Scans the rest of the literal `null`; the leading `n` must already have been consumed.
///
/// # Errors
///
/// `Reason::ExpectedNull` unless the next three bytes obtained from `input` are `ull`.
pub fn null(&mut self, input: &mut Input<impl Read>) -> Result<RawToken, Reason> {
    match input.read_n::<3>()? {
        Some([b'u', b'l', b'l']) => Ok(RawToken::Null),
        _ => Err(Reason::ExpectedNull),
    }
}
/// Scans the rest of the literal `true`; the leading `t` must already have been consumed.
///
/// # Errors
///
/// `Reason::ExpectedBool` unless the next three bytes obtained from `input` are `rue`.
pub fn bool_true(&mut self, input: &mut Input<impl Read>) -> Result<RawToken, Reason> {
    match input.read_n::<3>()? {
        Some([b'r', b'u', b'e']) => Ok(RawToken::Bool(true)),
        _ => Err(Reason::ExpectedBool),
    }
}
/// Scans the rest of the literal `false`; the leading `f` must already have been consumed.
///
/// # Errors
///
/// `Reason::ExpectedBool` unless the next four bytes obtained from `input` are `alse`.
pub fn bool_false(&mut self, input: &mut Input<impl Read>) -> Result<RawToken, Reason> {
    match input.read_n::<4>()? {
        Some([b'a', b'l', b's', b'e']) => Ok(RawToken::Bool(false)),
        _ => Err(Reason::ExpectedBool),
    }
}
/// Scans a string token; the opening quote must already have been consumed.
///
/// Bytes are collected into the scratch buffer with escape sequences replaced by the
/// bytes they stand for. The returned `RawToken::String` borrows from that buffer.
///
/// # Errors
///
/// * `Reason::UnexpectedEof` if the input ends before the closing quote or inside an escape.
/// * `Reason::UnexpectedCtrlChar` for an unescaped byte in `0x00..=0x1f`.
/// * `Reason::InvalidEscapeCode` for an unknown character after a backslash.
/// * `Reason::InvalidUtf8` if the collected bytes are not valid UTF-8.
/// * Whatever `read_unicode_escape` or `input.next()` report.
pub fn string(&mut self, input: &mut Input<impl Read>) -> Result<RawToken, Reason> {
    self.scratch.clear();
    loop {
        match input.next()? {
            None => return Err(Reason::UnexpectedEof),
            // Closing quote: the body is complete.
            Some(b'"') => break,
            Some(b'\\') => {
                let unescaped = match input.next()? {
                    None => return Err(Reason::UnexpectedEof),
                    Some(b'u') => {
                        // The helper appends the decoded character itself.
                        self.read_unicode_escape(input)?;
                        continue;
                    }
                    Some(b'b') => 0x08,
                    Some(b'f') => 0x0c,
                    Some(b't') => b'\t',
                    Some(b'r') => b'\r',
                    Some(b'n') => b'\n',
                    Some(literal @ (b'/' | b'"' | b'\\')) => literal,
                    Some(_) => return Err(Reason::InvalidEscapeCode),
                };
                self.scratch.push(unescaped);
            }
            Some(0x00..=0x1f) => return Err(Reason::UnexpectedCtrlChar),
            Some(plain) => self.scratch.push(plain),
        }
    }

    match std::str::from_utf8(&self.scratch) {
        Ok(text) => Ok(RawToken::String(text)),
        Err(_) => Err(Reason::InvalidUtf8),
    }
}
/// Decodes a `\uXXXX` escape (the `\u` already consumed) and appends the resulting
/// character to the scratch buffer as UTF-8.
///
/// When the four hex digits denote a UTF-16 surrogate, a second `\uXXXX` escape is read
/// and the two code units are decoded together.
///
/// # Errors
///
/// * `Reason::UnexpectedEof` if `read_n` cannot supply the bytes of an escape.
/// * `Reason::InvalidEscapeCode` if any of the expected hex digits is not a hex digit.
/// * `Reason::InvalidUtf8` if a surrogate is unpaired: either the bytes after it are not
///   a `\u` escape, or the two code units do not form a valid surrogate pair.
fn read_unicode_escape(&mut self, input: &mut Input<impl Read>) -> Result<(), Reason> {
    let Some([h1, h2, h3, h4]) = input.read_n::<4>()? else {
        return Err(Reason::UnexpectedEof);
    };
    let Some(first) = Self::codepoint_value_from_hex(h1, h2, h3, h4) else {
        return Err(Reason::InvalidEscapeCode);
    };

    let decoded = if (0xd800..=0xdfff).contains(&first) {
        let second = match input.read_n::<6>()? {
            Some([b'\\', b'u', h1, h2, h3, h4]) => {
                let Some(unit) = Self::codepoint_value_from_hex(h1, h2, h3, h4) else {
                    return Err(Reason::InvalidEscapeCode);
                };
                unit
            }
            // Bytes were available but are not a `\u` escape: the surrogate is unpaired.
            // Only a genuinely short read is reported as end of input.
            Some(_) => return Err(Reason::InvalidUtf8),
            None => return Err(Reason::UnexpectedEof),
        };
        char::decode_utf16([first, second])
            .next()
            .expect("an unpaired surrogate or a valid surrogate pair")
            .map_err(|_| Reason::InvalidUtf8)?
    } else {
        char::from_u32(u32::from(first)).ok_or(Reason::InvalidUtf8)?
    };

    let mut buf = [0; 4];
    self.scratch
        .extend_from_slice(decoded.encode_utf8(&mut buf).as_bytes());
    Ok(())
}
/// Converts one ASCII hex digit (`0-9`, `a-f`, `A-F`) to its value in `0..=15`.
///
/// Returns `None` for any other byte.
fn decode_hex_digit(hex: u8) -> Option<u8> {
    // `to_digit(16)` accepts exactly the ASCII hex digits in either case; the result is
    // below 16, so narrowing it to `u8` cannot truncate.
    char::from(hex).to_digit(16).map(|value| value as u8)
}
fn codepoint_value_from_hex(h1: u8, h2: u8, h3: u8, h4: u8) -> Option<u16> {
Some(
(Self::decode_hex_digit(h1)? as u16) << 12
| (Self::decode_hex_digit(h2)? as u16) << 8
| (Self::decode_hex_digit(h3)? as u16) << 4
| Self::decode_hex_digit(h4)? as u16,
)
}
/// Scans a number token whose first byte, `leading_byte`, has already been consumed.
///
/// This method handles the optional minus sign and the integer part, then hands the first
/// byte after the integer part to `read_decimal`. The accepted text is collected in the
/// scratch buffer.
///
/// # Errors
///
/// `Reason::ExpectedNumber` if no digit follows the sign, if `leading_byte` is neither `-`
/// nor a digit, or if a leading `0` is followed directly by another digit. Errors from
/// `input.next()` and from the later scanning stages are passed through.
pub fn number(
    &mut self,
    leading_byte: u8,
    input: &mut Input<impl Read>,
) -> Result<RawToken, Reason> {
    self.scratch.clear();

    let mut head = Some(leading_byte);
    if leading_byte == b'-' {
        self.scratch.push(b'-');
        head = input.next()?;
    }

    let after_integer = match head {
        Some(b'0') => {
            // A leading zero must stand alone.
            let following = input.next()?;
            if matches!(following, Some(b'0'..=b'9')) {
                return Err(Reason::ExpectedNumber);
            }
            self.scratch.push(b'0');
            following
        }
        Some(digit @ b'1'..=b'9') => {
            self.scratch.push(digit);
            loop {
                match input.next()? {
                    Some(more @ b'0'..=b'9') => self.scratch.push(more),
                    other => break other,
                }
            }
        }
        _ => return Err(Reason::ExpectedNumber),
    };

    self.read_decimal(after_integer, input)
}
/// Scans the optional fraction part of a number.
///
/// `leading_byte` is the first byte after the integer part (`None` if `input.next()`
/// yielded nothing). If it is `.`, at least one digit must follow. The first byte that is
/// not part of the fraction is handed to `read_exponent`.
///
/// # Errors
///
/// `Reason::ExpectedNumber` if `.` is not followed by a digit.
fn read_decimal(
    &mut self,
    leading_byte: Option<u8>,
    input: &mut Input<impl Read>,
) -> Result<RawToken, Reason> {
    if leading_byte != Some(b'.') {
        return self.read_exponent(leading_byte, input);
    }

    self.scratch.push(b'.');
    let Some(first @ b'0'..=b'9') = input.next()? else {
        return Err(Reason::ExpectedNumber);
    };
    self.scratch.push(first);

    let after_fraction = loop {
        match input.next()? {
            Some(digit @ b'0'..=b'9') => self.scratch.push(digit),
            other => break other,
        }
    };
    self.read_exponent(after_fraction, input)
}
/// Scans the optional exponent part of a number and produces the finished token.
///
/// `leading_byte` is the first byte after the integer/fraction part. The exponent is stored
/// in normalised form: the marker is always written as lowercase `e`, a `+` sign is
/// dropped and a `-` sign is kept. The byte that ends the number (or `None`) is saved in
/// `self.leftover`, and the returned `RawToken::Number` borrows the scratch buffer.
///
/// # Errors
///
/// `Reason::ExpectedNumber` if the exponent marker is not followed by at least one digit
/// (after the optional sign).
fn read_exponent(
    &mut self,
    leading_byte: Option<u8>,
    input: &mut Input<impl Read>,
) -> Result<RawToken, Reason> {
    let terminator = if matches!(leading_byte, Some(b'e' | b'E')) {
        self.scratch.push(b'e');

        let after_sign = match input.next()? {
            None => return Err(Reason::ExpectedNumber),
            Some(b'+') => input.next()?,
            Some(b'-') => {
                self.scratch.push(b'-');
                input.next()?
            }
            unsigned => unsigned,
        };
        let Some(first @ b'0'..=b'9') = after_sign else {
            return Err(Reason::ExpectedNumber);
        };
        self.scratch.push(first);

        loop {
            match input.next()? {
                Some(digit @ b'0'..=b'9') => self.scratch.push(digit),
                other => break other,
            }
        }
    } else {
        leading_byte
    };

    // The terminating byte was already taken from the input; keep it for the next token.
    self.leftover = terminator;
    Ok(RawToken::Number(&self.scratch[..]))
}
}
#[cfg(test)]
mod tests {
use super::*;
/// Scans `json` and asserts that the tokens produced equal `expected`, in order.
///
/// Only as many tokens as `expected` yields are read; anything after them is not inspected.
#[track_caller]
fn pass<I>(json: &str, expected: I)
where
    I: IntoIterator<Item = RawToken<'static>>,
{
    let mut input = Input::new(json.as_bytes());
    let mut scanner = Scanner::new();
    for want in expected {
        let outcome = scanner.read_token(&mut input);
        match outcome {
            Ok(got) => assert_eq!(got, want),
            Err(reason) => panic!("failed to parse token: {}\n{:?}", json, reason),
        }
    }
}
/// Asserts that scanning the first token of `json` fails with exactly `expected`.
#[track_caller]
fn fail(json: &str, expected: Reason) {
    let mut input = Input::new(json.as_bytes());
    let mut scanner = Scanner::new();
    let outcome = scanner.read_token(&mut input);
    match outcome {
        Ok(_) => panic!("unexpectedly succeeded to parse token: {}", json),
        Err(reason) => assert_eq!(reason, expected),
    }
}
/// Reading past the end must keep yielding `Eof` on every call.
#[test]
fn emits_eof_once_end_is_encountered() {
    let mut input = Input::new("".as_bytes());
    let mut scanner = Scanner::new();
    for _ in 0..6 {
        assert_eq!(Ok(RawToken::Eof), scanner.read_token(&mut input));
    }
}
/// Integer numbers: sign handling, the leading-zero rule and the byte that ends a number.
#[test]
fn number_integer() {
    fail("-", Reason::ExpectedNumber);
    pass("0", [RawToken::Number(b"0")]);
    pass("1", [RawToken::Number(b"1")]);
    fail("01", Reason::ExpectedNumber);
    pass("419", [RawToken::Number(b"419")]);
    pass("-419", [RawToken::Number(b"-419")]);
    pass("-0", [RawToken::Number(b"-0")]);
    // The byte that terminates a number has already been read from the input, so it
    // must come back as the next token instead of being lost.
    pass("-0]", [RawToken::Number(b"-0"), RawToken::ArrayEnd]);
}
/// Numbers with a fraction part; a `.` must be followed by at least one digit.
#[test]
fn number_decimal() {
    let accepted = [
        ("23.32", "23.32"),
        ("-0.1234", "-0.1234"),
        ("-0.1234]", "-0.1234"),
    ];
    for (json, digits) in accepted {
        pass(json, [RawToken::Number(digits.as_bytes())]);
    }

    for json in ["3.", "0."] {
        fail(json, Reason::ExpectedNumber);
    }
}
/// Numbers with an exponent part, including the normalised form stored in the token.
#[test]
fn number_exponent() {
    pass("123e12", [RawToken::Number(b"123e12")]);
    pass("43.43e-12", [RawToken::Number(b"43.43e-12")]);
    // `+` is dropped and `E` is lowercased in the scanned representation.
    pass("43.43e+12", [RawToken::Number(b"43.43e12")]);
    pass("43.43E+12", [RawToken::Number(b"43.43e12")]);

    // These fail in the fraction part: `.` is not followed by a digit.
    fail("3.e", Reason::ExpectedNumber);
    fail("3.e-", Reason::ExpectedNumber);
    fail("3.e+", Reason::ExpectedNumber);

    // These reach the exponent part and fail there: no digit after the marker or sign.
    fail("3e", Reason::ExpectedNumber);
    fail("3e-", Reason::ExpectedNumber);
    fail("3e+", Reason::ExpectedNumber);
    fail("3.0e", Reason::ExpectedNumber);
    fail("3ex", Reason::ExpectedNumber);
}
/// String tokens: plain text, `\u` escapes (single unit and surrogate pair), simple escapes.
#[test]
fn string() {
    pass(r#""hello""#, [RawToken::String("hello")]);
    // U+E691 is a private-use code point with no visible glyph, so the expected value is
    // written as an escape rather than as a literal character.
    pass(r#""\uE691""#, [RawToken::String("\u{e691}")]);
    pass(r#""\uD834\uDD1E""#, [RawToken::String("𝄞")]);
    pass(r#"" \n""#, [RawToken::String(" \n")]);
}
/// `true` / `false` literals, plus truncated and misspelled variants.
#[test]
fn bool() {
    for (json, value) in [("true", true), ("false", false)] {
        pass(json, [RawToken::Bool(value)]);
    }
    for broken in ["tru", "truf", "fals", "falsh"] {
        fail(broken, Reason::ExpectedBool);
    }
}
/// The `null` literal, plus a truncated and a misspelled variant.
#[test]
fn null() {
    pass("null", [RawToken::Null]);
    for broken in ["nul", "nulh"] {
        fail(broken, Reason::ExpectedNull);
    }
}
/// Each single-byte structural token, and `Eof` for empty input.
#[test]
fn structural() {
    let cases = [
        ("", RawToken::Eof),
        ("[", RawToken::ArrayStart),
        ("]", RawToken::ArrayEnd),
        ("{", RawToken::ObjectStart),
        ("}", RawToken::ObjectEnd),
        (",", RawToken::Comma),
        (":", RawToken::Colon),
    ];
    for (json, token) in cases {
        pass(json, [token]);
    }
}
/// A run of tokens with no separating whitespace is split into the individual tokens.
#[test]
fn multiple_tokens() {
    let json = r#"{}[],:"hello"nullfalsetrue1"#;
    let expected = [
        RawToken::ObjectStart,
        RawToken::ObjectEnd,
        RawToken::ArrayStart,
        RawToken::ArrayEnd,
        RawToken::Comma,
        RawToken::Colon,
        RawToken::String("hello"),
        RawToken::Null,
        RawToken::Bool(false),
        RawToken::Bool(true),
        RawToken::Number(b"1"),
    ];
    pass(json, expected);
}
}