use std::{char, mem};
use lexical::{format, parse_partial_with_options, parse_with_options};
use crate::buf::Buf;
use crate::error::{self, Error, InvalidInput};
#[derive(Debug, PartialEq)]
pub enum Token<'a> {
LeftBrace,
RightBrace,
LeftBracket,
RightBracket,
Comma,
StrLit(&'a [u8]),
NumLit(Number),
True,
False,
Colon,
Null,
Eof,
}
#[derive(Debug, PartialEq, Copy, Clone)]
pub enum Number {
PositiveInt(u64),
NegativeInt(i64),
Float(f64),
}
pub struct Lexer<R> {
buf: Buf<R>,
}
impl<R> Lexer<R>
where
R: std::io::Read,
{
pub fn new(reader: R, cap: usize) -> Self {
Self {
buf: Buf::new(reader, cap),
}
}
pub fn pos(&self) -> u64 {
self.buf.stream_pos()
}
#[allow(clippy::should_implement_trait)]
#[inline]
pub fn next(&mut self) -> error::Result<Token<'_>> {
self.seek_past_whitespace()?;
if self.buf.ensure_capacity(1).is_err() {
return Ok(Token::Eof);
}
let c = self.buf.munch();
match c {
b'"' => Ok(Token::StrLit(self.parse_string_literal()?)),
b'-' | b'0'..=b'9' => {
self.buf.seek_back(1);
Ok(Token::NumLit(self.parse_number()?))
}
b'n' => {
self.parse_byte_literal(b"ull")?;
Ok(Token::Null)
}
b't' => {
self.parse_byte_literal(b"rue")?;
Ok(Token::True)
}
b'f' => {
self.parse_byte_literal(b"alse")?;
Ok(Token::False)
}
b'{' => Ok(Token::LeftBrace),
b'}' => Ok(Token::RightBrace),
b'[' => Ok(Token::LeftBracket),
b']' => Ok(Token::RightBracket),
b',' => Ok(Token::Comma),
b':' => Ok(Token::Colon),
x => InvalidInput::InvalidTokenStart(x).res(self.buf.stream_pos()),
}
}
fn parse_string_literal(&mut self) -> error::Result<&[u8]> {
let idx = match mycroft_search_string_escapes(self.buf.bytes()) {
Some(idx) => idx,
None => {
let offset = self.buf.bytes().len();
self.buf.refill()?;
mycroft_search_string_escapes(&self.buf.bytes()[offset..])
.ok_or(Error::TokenTooLarge)?
+ offset
}
};
let c = self.buf.bytes()[idx];
match c {
b'"' => Ok(self.buf.consume_and_slice(0, idx, idx + 1)),
b'\\' => self.parse_string_with_escape(idx),
_ => InvalidInput::ControlCharacterInString(c).res(self.pos()),
}
}
fn parse_string_with_escape(&mut self, first_pos: usize) -> error::Result<&[u8]> {
let mut read_pos = first_pos;
let mut write_pos = first_pos;
let mut escape = false;
macro_rules! write_buf {
($buf:ident, $byte:expr) => {{
$buf[write_pos] = $byte;
write_pos += 1;
}};
}
macro_rules! read_buf {
($buf:ident, $count:expr) => {{
let parts = &$buf[read_pos..read_pos + $count];
read_pos += $count;
parts
}};
}
for _ in 0..=1 {
let stream_pos = self.pos() + read_pos as u64;
let mut buf = self.buf.bytes_mut();
let len = buf.len();
while read_pos < len {
let b = buf[read_pos];
read_pos += 1;
if escape {
escape = false;
match b {
b'\\' | b'/' | b'"' => write_buf!(buf, b),
b'b' => write_buf!(buf, b'\x08'),
b'f' => write_buf!(buf, b'\x0C'),
b'n' => write_buf!(buf, b'\n'),
b'r' => write_buf!(buf, b'\r'),
b't' => write_buf!(buf, b'\t'),
b'u' => {
self.buf.ensure_capacity(read_pos + 4)?;
buf = self.buf.bytes_mut();
let parts = read_buf!(buf, 4);
let surrogate = decode_surrogate(
parts[0], parts[1], parts[2], parts[3], stream_pos,
)?;
if !(0xD800..0xDBFF).contains(&surrogate) {
let c =
char::from_u32(surrogate).expect("Already checked the bounds");
write_pos += c.encode_utf8(&mut buf[write_pos..]).len();
continue;
}
if surrogate >= 0xDC00 {
return InvalidInput::InvalidHighSurrogate(surrogate)
.res(stream_pos + read_pos as u64);
}
self.buf.ensure_capacity(read_pos + 6)?;
buf = self.buf.bytes_mut();
if read_buf!(buf, 2) != b"\\u" {
return InvalidInput::MissingLowSurrogate(surrogate)
.res(stream_pos);
}
let parts = read_buf!(buf, 4);
let low_surrogate = decode_surrogate(
parts[0], parts[1], parts[2], parts[3], stream_pos,
)?;
if !(0xDC00..0xDFFF).contains(&low_surrogate) {
return InvalidInput::InvalidLowSurrogate(low_surrogate)
.res(stream_pos + read_pos as u64);
}
let codepoint = (((surrogate - 0xD800) << 10)
| (low_surrogate - 0xDC00))
+ 0x1_0000;
let c = char::from_u32(codepoint).unwrap();
write_pos += c.encode_utf8(&mut buf[write_pos..]).len();
}
x => return InvalidInput::InvalidEscape(x).res(self.pos()),
}
} else {
match b {
b'\\' => {
escape = true;
}
b'"' => return Ok(self.buf.consume_and_slice(0, write_pos, read_pos)),
0x00..=0x1F => {
return InvalidInput::ControlCharacterInString(b)
.res(self.pos() + read_pos as u64);
}
_ => {
write_buf!(buf, b)
}
};
}
}
self.buf.refill()?;
}
InvalidInput::UnterminatedString.res(self.pos())
}
fn parse_number(&mut self) -> error::Result<Number> {
let mut idx = 0;
for _ in 0..=1 {
let buf = self.buf.bytes();
while idx < buf.len() {
match buf[idx] {
b'0'..=b'9' | b'-' => {}
b'.' | b'e' | b'E' => {
let f = self.parse_float()?;
return Ok(Number::Float(f));
}
b'\t' | b'\n' | b'\r' | b' ' | b'}' | b']' | b',' => {
let n = parse_integer(&buf[0..idx])?;
self.buf.consume(idx);
return Ok(n);
}
_ => {
return InvalidInput::InvalidNumber(None).res(self.pos());
}
}
idx += 1;
}
self.buf.refill()?;
}
if idx == self.buf.cap() {
return Err(Error::TokenTooLarge);
}
let n = parse_integer(&self.buf.bytes()[0..idx])?;
self.buf.consume(idx);
Ok(n)
}
fn parse_float(&mut self) -> error::Result<f64> {
let opts = lexical::ParseFloatOptions::new();
match parse_partial_with_options::<_, _, { format::JSON }>(self.buf.bytes(), &opts) {
Ok((num, len)) => {
if len != self.buf.bytes().len() {
self.buf.consume(len);
return Ok(num);
}
}
_ => {}
}
self.buf.refill()?;
let (num, len) =
parse_partial_with_options::<_, _, { format::JSON }>(self.buf.bytes(), &opts)?;
if len == self.buf.cap() {
return Err(Error::TokenTooLarge);
}
self.buf.consume(len);
Ok(num)
}
fn parse_byte_literal(&mut self, lit: &[u8]) -> error::Result<()> {
let len = lit.len();
self.buf.ensure_capacity(len)?;
if self.buf.peek(len) == lit {
self.buf.consume(len);
Ok(())
} else {
InvalidInput::InvalidLiteral.res(self.pos())
}
}
fn seek_past_whitespace(&mut self) -> error::Result<()> {
loop {
let buf = self.buf.bytes();
let mut i = 0;
while i < buf.len() {
match buf[i] {
b'\n' | b'\t' | b'\r' | b' ' => {}
_ => {
self.buf.consume(i);
return Ok(());
}
}
i += 1
}
if self.buf.consume_all_and_refill()? == 0 {
return Ok(());
}
}
}
}
fn mycroft_search_string_escapes(haystack: &[u8]) -> Option<usize> {
type Chunk = u64;
const STEP_SIZE: usize = mem::size_of::<Chunk>();
const ONE_BYTES: Chunk = Chunk::MAX / 255;
for chunk in haystack.chunks_exact(STEP_SIZE) {
let chars = Chunk::from_le_bytes(chunk.try_into().unwrap());
let contains_ctrl = chars.wrapping_sub(ONE_BYTES * 0x20) & !chars;
let chars_quote = chars ^ (ONE_BYTES * Chunk::from(b'"'));
let contains_quote = chars_quote.wrapping_sub(ONE_BYTES) & !chars_quote;
let chars_backslash = chars ^ (ONE_BYTES * Chunk::from(b'\\'));
let contains_backslash = chars_backslash.wrapping_sub(ONE_BYTES) & !chars_backslash;
let masked = (contains_ctrl | contains_quote | contains_backslash) & (ONE_BYTES << 7);
if masked != 0 {
let idx = unsafe { chunk.as_ptr().offset_from(haystack.as_ptr()) } as usize
+ masked.trailing_zeros() as usize / 8;
return Some(idx);
}
}
let mut idx = haystack.len() / STEP_SIZE * STEP_SIZE;
while idx < haystack.len() {
let b = haystack[idx];
if b == b'"' || b == b'\\' || b <= 0x1F {
return Some(idx);
}
idx += 1;
}
None
}
#[inline]
fn decode_surrogate(b1: u8, b2: u8, b3: u8, b4: u8, pos: u64) -> error::Result<u32> {
let Some(surrogate) = decode_four_hex_digits(b1, b2, b3, b4) else {
return InvalidInput::InvalidEscapeSequence(b1, b2, b3, b4).res(pos);
};
Ok(surrogate as u32)
}
#[inline]
fn parse_integer(buf: &[u8]) -> error::Result<Number> {
if buf[0] == b'-' {
Ok(Number::NegativeInt(parse_negative_integer(buf)?))
} else {
Ok(Number::PositiveInt(parse_positive_integer(buf)?))
}
}
#[inline]
fn parse_positive_integer(buf: &[u8]) -> error::Result<u64> {
let opts = lexical::ParseIntegerOptions::new();
let num = parse_with_options::<_, _, { format::JSON }>(buf, &opts)?;
Ok(num)
}
#[inline]
fn parse_negative_integer(buf: &[u8]) -> error::Result<i64> {
let opts = lexical::ParseIntegerOptions::new();
let num = parse_with_options::<_, _, { format::JSON }>(buf, &opts)?;
Ok(num)
}
const fn decode_hex_val_slow(val: u8) -> Option<u8> {
match val {
b'0'..=b'9' => Some(val - b'0'),
b'A'..=b'F' => Some(val - b'A' + 10),
b'a'..=b'f' => Some(val - b'a' + 10),
_ => None,
}
}
const fn build_hex_table(shift: usize) -> [i16; 256] {
let mut table = [0; 256];
let mut ch = 0;
while ch < 256 {
table[ch] = match decode_hex_val_slow(ch as u8) {
Some(val) => (val as i16) << shift,
None => -1,
};
ch += 1;
}
table
}
static HEX0: [i16; 256] = build_hex_table(0);
static HEX1: [i16; 256] = build_hex_table(4);
fn decode_four_hex_digits(a: u8, b: u8, c: u8, d: u8) -> Option<u16> {
let a = HEX1[a as usize] as i32;
let b = HEX0[b as usize] as i32;
let c = HEX1[c as usize] as i32;
let d = HEX0[d as usize] as i32;
let codepoint = ((a | b) << 8) | c | d;
if codepoint >= 0 {
Some(codepoint as u16)
} else {
None
}
}
#[cfg(test)]
mod test {
use std::io::Cursor;
use super::*;
#[test]
#[should_panic]
fn test_control_char() {
let data = "\"s\tring\"";
test_string_pos(data.as_bytes(), 32, "".as_bytes(), 0);
}
#[test]
#[should_panic]
fn test_control_char_after_escape() {
let data = "\"s\\\\\tring\"";
test_string_pos(data.as_bytes(), 32, "".as_bytes(), 0);
}
#[test]
fn test_buf_roll_escape_sequence() {
let data = r#" "\uD83D\uDE0A""#;
let expected = "\u{1F60A}";
test_string_pos(data.as_bytes(), 14, expected.as_bytes(), 2);
}
#[test]
fn test_escape_sequence() {
let data = r#""\uD83D\uDE0A""#;
let expected = "\u{1F60A}";
test_string_pos(data.as_bytes(), 20, expected.as_bytes(), 0);
}
#[test]
fn test_all_escapes() {
let data = r#" "\"\\\t\r\b\n\f\/""#;
let expected = b"\"\\\t\r\x08\n\x0C/";
test_string_pos(data.as_bytes(), 20, expected, 3);
}
#[test]
fn test_multiple_escape() {
let data = r#" "fo\"o\\\\""#;
let expected = r#"fo"o\\"#;
test_string_pos(data.as_bytes(), 12, expected.as_bytes(), 2);
}
#[test]
fn test_escape_string_refill() {
let data = r#" "foo\\""#;
let expected = r#"foo\"#;
test_string_pos(data.as_bytes(), 6, expected.as_bytes(), 2);
}
#[test]
fn test_backslash_escape_string() {
let data = r#""foo\\""#;
let expected = r#"foo\"#;
test_string(data.as_bytes(), 12, expected.as_bytes());
}
#[test]
fn test_quote_escape_strings() {
let data = r#""\"foo""#;
let expected = r#""foo"#;
test_string(data.as_bytes(), 12, expected.as_bytes());
}
fn test_string(input: &[u8], cap: usize, expected: &[u8]) {
test_string_pos(input, cap, expected, 0);
}
fn test_string_pos(input: &[u8], cap: usize, expected: &[u8], _pos: usize) {
let expected = Token::StrLit(expected);
test_single_match(input, cap, expected);
}
#[test]
fn test_simple_string() {
let data = r#" "hello world" "#;
let expected = Token::StrLit(b"hello world");
test_single_match(data.as_bytes(), 12, expected);
}
#[test]
fn test_float() {
let data = b"\n\t123";
let expected = Token::NumLit(Number::PositiveInt(123));
test_single_match(data, 4, expected);
}
#[test]
fn test_null() {
let data = b"\n\tnull";
let expected = Token::Null;
test_single_match(data, 9, expected);
}
#[test]
fn test_true() {
let data = b"\n\ttrue";
let expected = Token::True;
test_single_match(data, 10, expected);
}
#[test]
fn test_false() {
let data = b"\n\tfalse";
let expected = Token::False;
test_single_match(data, 10, expected);
}
#[test]
fn test_colon() {
let data = b"\n\t:";
let expected = Token::Colon;
test_single_match(data, 10, expected);
}
#[test]
fn test_comma() {
let data = b",\n\t";
let expected = Token::Comma;
test_single_match(data, 16, expected);
}
#[test]
fn test_right_bracket() {
let data = b"]\n\t";
let expected = Token::RightBracket;
test_single_match(data, 2, expected);
}
#[test]
fn test_left_bracket() {
let data = b"\t [\n\t";
let expected = Token::LeftBracket;
test_single_match(data, 2, expected);
}
#[test]
fn test_right_brace() {
let data = b"\t \n\t }";
let expected = Token::RightBrace;
test_single_match(data, 2, expected);
}
#[test]
fn test_left_brace() {
let data = b"\t\t\n \n\t {";
let expected = Token::LeftBrace;
test_single_match(data, 5, expected);
}
#[test]
fn test_whitespace() {
let data = Cursor::new(" \t\t\t\t\t \n\n\n \t\n\n\n\n ");
let mut lexer = Lexer::new(data, 5);
let next = lexer.next();
eprintln!("Next: {:?}", &next);
assert!(matches!(next, Ok::<Token<'_>, error::Error>(Token::Eof)));
let next = lexer.next();
eprintln!("Next: {:?}", &next);
assert!(matches!(next, Ok::<Token<'_>, error::Error>(Token::Eof)));
}
#[test]
#[should_panic]
fn float_same_size_as_buf() {
let data = b" 128.01";
let expected = Token::NumLit(Number::Float(128.0));
test_single_match(data, 5, expected);
}
fn test_single_match(data: &[u8], cap: usize, expected: Token) {
let mut lexer = Lexer::new(Cursor::new(data), cap);
let next = lexer.next();
eprintln!("Next: {:?}", &next);
assert_eq!(expected, next.unwrap());
let next = lexer.next();
eprintln!("Next: {:?}", &next);
assert!(matches!(next, Ok::<Token<'_>, error::Error>(Token::Eof)));
}
#[test]
fn test_large_object() {
let literal = r#"
{
"foo": 1,
"bar": 2,
"baz": 3,
"bing": 4
}"#;
let expected = vec![
Token::LeftBrace,
Token::StrLit("foo".as_bytes()),
Token::Colon,
Token::NumLit(Number::PositiveInt(1)),
Token::Comma,
Token::StrLit("bar".as_bytes()),
Token::Colon,
Token::NumLit(Number::PositiveInt(2)),
Token::Comma,
Token::StrLit("baz".as_bytes()),
Token::Colon,
Token::NumLit(Number::PositiveInt(3)),
Token::Comma,
Token::StrLit("bing".as_bytes()),
Token::Colon,
Token::NumLit(Number::PositiveInt(4)),
Token::RightBrace,
];
test_helper(literal, expected, 6)
}
#[test]
fn test_multi_json() {
let literal = r#"
{ "xq": 1, "yq": 2 }
{ "xq": 3, "yq": 4 }
"#;
let expected = vec![
Token::LeftBrace,
Token::StrLit("xq".as_bytes()),
Token::Colon,
Token::NumLit(Number::PositiveInt(1)),
Token::Comma,
Token::StrLit("yq".as_bytes()),
Token::Colon,
Token::NumLit(Number::PositiveInt(2)),
Token::RightBrace,
Token::LeftBrace,
Token::StrLit("xq".as_bytes()),
Token::Colon,
Token::NumLit(Number::PositiveInt(3)),
Token::Comma,
Token::StrLit("yq".as_bytes()),
Token::Colon,
Token::NumLit(Number::PositiveInt(4)),
Token::RightBrace,
];
test_helper(literal, expected, 4);
}
#[test]
fn test_object() {
let literal = "{ \"foo\": 123.456 }".to_string();
let expected = vec![
Token::LeftBrace,
Token::StrLit("foo".as_bytes()),
Token::Colon,
Token::NumLit(Number::Float(123.456)),
Token::RightBrace,
];
test_helper(&literal, expected, 8)
}
#[test]
fn test_list() {
let literal = "[{\"foo\": 123.456}]".to_string();
let expected = vec![
Token::LeftBracket,
Token::LeftBrace,
Token::StrLit("foo".as_bytes()),
Token::Colon,
Token::NumLit(Number::Float(123.456)),
Token::RightBrace,
Token::RightBracket,
];
test_helper(&literal, expected, 8);
}
#[test]
fn test_list_of_stuff() {
let literal = "[{\"foo\": 123.456}, true,\tfalse,\n3, {\"bar\": 4}]".to_string();
let expected = vec![
Token::LeftBracket,
Token::LeftBrace,
Token::StrLit("foo".as_bytes()),
Token::Colon,
Token::NumLit(Number::Float(123.456)),
Token::RightBrace,
Token::Comma,
Token::True,
Token::Comma,
Token::False,
Token::Comma,
Token::NumLit(Number::PositiveInt(3)),
Token::Comma,
Token::LeftBrace,
Token::StrLit("bar".as_bytes()),
Token::Colon,
Token::NumLit(Number::PositiveInt(4)),
Token::RightBrace,
Token::RightBracket,
];
test_helper(&literal, expected, 8)
}
fn test_helper(s: &str, tokens: Vec<Token>, cap: usize) {
let mut lexer = Lexer::new(Cursor::new(&s), cap);
let mut count = 0;
while let Ok(token) = lexer.next() {
eprintln!("Token: {:?}", &token);
if count >= tokens.len() {
if token == Token::Eof {
break;
}
panic!(
"More tokens in iterator than expected. Idx: '{}', token: '{:?}'",
count, token
);
}
assert_eq!(tokens[count], token);
count += 1;
}
assert_eq!(tokens.len(), count)
}
}