#![expect(
clippy::arithmetic_side_effects,
reason = "pos is bounded by source.len() and only advances after a successful byte read; arithmetic is safe within parser state machine invariants"
)]
#![expect(
clippy::as_conversions,
reason = "byte position casts between usize and u32 are safe: usize->u32 is bounded by available memory, u32->usize always fits"
)]
#![expect(
clippy::cast_possible_truncation,
reason = "source length is bounded by available memory, so byte positions always fit in u32"
)]
#![expect(
clippy::string_slice,
reason = "span boundaries are always at ASCII JSON token boundaries, so slices are valid UTF-8"
)]
#[cfg(test)]
mod test;
#[cfg(test)]
mod test_basics;
#[cfg(test)]
mod test_parser;
#[cfg(test)]
mod test_type_sizes;
use std::rc::Rc;
use crate::{string, warning};
use super::{
Document, DocumentInner, ElemId, Element, Field, Location, PathEntry, PathTable, RawStr, Span,
Value,
};
/// Maximum nesting depth of arrays and objects; deeper input is rejected with
/// `ErrorKind::DepthLimitExceeded`.
const MAX_DEPTH: usize = 128;
// Whitespace bytes skipped between tokens.
const SPACE: u8 = b' ';
const TAB: u8 = b'\t';
const LF: u8 = b'\n';
const CR: u8 = b'\r';
// String delimiters.
const QUOTE: u8 = b'"';
const BACKSLASH: u8 = b'\\';
// Structural bytes.
const COMMA: u8 = b',';
const COLON: u8 = b':';
const ARRAY_OPEN: u8 = b'[';
const ARRAY_CLOSE: u8 = b']';
const OBJECT_OPEN: u8 = b'{';
const OBJECT_CLOSE: u8 = b'}';
// Bytes that can appear in a number.
const MINUS: u8 = b'-';
const PLUS: u8 = b'+';
const DECIMAL_POINT: u8 = b'.';
const EXP_LOWER: u8 = b'e';
const EXP_UPPER: u8 = b'E';
const DIGIT_0: u8 = b'0';
const DIGIT_1: u8 = b'1';
const DIGIT_9: u8 = b'9';
// Keyword literals.
const NULL: &str = "null";
const TRUE: &str = "true";
const FALSE: &str = "false";
/// UTF-8 byte-order mark; `parse` skips it when the input starts with it.
const BOM: &[u8; 3] = b"\xEF\xBB\xBF";
pub(crate) fn parse(source: string::ReasonableLen<'_>) -> Result<Document<'_>, Error> {
let mut p = Parser::new(source.into_inner());
if p.bytes.starts_with(BOM) {
p.pos = BOM.len();
}
let raw_root = p.parse_value(PathEntry::Root)?;
p.skip_ws();
if p.pos < p.bytes.len() {
return Err(p.error(ErrorKind::TrailingContent));
}
let inner = Rc::new(DocumentInner {
source: source.into_inner(),
paths: p.table,
});
let root = into_element(raw_root, &inner);
Ok(Document { inner, root })
}
/// A JSON parse failure: what went wrong and where in the input it was detected.
#[derive(Debug)]
pub struct Error {
/// Number of input bytes the parser had consumed when the error was raised.
byte_offset: usize,
/// Result of `super::line_col` applied to the consumed prefix of the input.
position: Location,
/// The category of failure.
kind: ErrorKind,
}
impl Error {
    /// Returns the byte offset into the input at which the error was raised.
    pub fn byte_offset(&self) -> usize {
        let Self { byte_offset, .. } = *self;
        byte_offset
    }

    /// Borrows the kind of failure.
    pub fn kind(&self) -> &ErrorKind {
        let Self { kind, .. } = self;
        kind
    }

    /// Consumes the error, keeping only its kind.
    pub fn into_kind(self) -> ErrorKind {
        let Self { kind, .. } = self;
        kind
    }

    /// Consumes the error and returns `(byte_offset, kind)`.
    pub fn into_parts(self) -> (usize, ErrorKind) {
        let Self {
            byte_offset, kind, ..
        } = self;
        (byte_offset, kind)
    }
}
/// The category of a JSON parse failure.
#[derive(Debug, Eq, PartialEq)]
pub enum ErrorKind {
/// A digit was required while reading a number, but another byte was found.
ExpectedNumeral,
/// The byte at the cursor cannot begin a JSON value.
ExpectedStart,
/// A `null`, `true` or `false` keyword was started but did not match;
/// `expected` is the full keyword that was being matched.
ExpectedLiteral { expected: &'static str },
/// After an array element, something other than `,` or `]` was found.
/// Also raised when `]` directly follows a comma.
ExpectedEndArray,
/// After an object member, something other than `,` or `}` was found.
/// Also raised when `}` directly follows a comma.
ExpectedEndObject,
/// A specific byte was required but a different one was found;
/// `expected` is the byte that was required.
UnexpectedChar { expected: char },
/// The input ended where more bytes were required.
UnexpectedEOF,
/// Non-whitespace bytes follow the root value.
TrailingContent,
/// Arrays/objects are nested more deeply than `MAX_DEPTH`.
DepthLimitExceeded,
/// The element id counter could not be incremented without overflow.
MaxElements,
}
impl crate::Warning for Error {
    /// Maps the error kind to its static snake_case identifier.
    fn id(&self) -> warning::Id {
        warning::Id::from_static(match &self.kind {
            ErrorKind::ExpectedNumeral => "expected_numeral",
            ErrorKind::ExpectedStart => "expected_start",
            ErrorKind::ExpectedLiteral { .. } => "expected_literal",
            ErrorKind::ExpectedEndArray => "expected_end_array",
            ErrorKind::ExpectedEndObject => "expected_end_object",
            ErrorKind::UnexpectedChar { .. } => "unexpected_char",
            ErrorKind::UnexpectedEOF => "unexpected_eof",
            ErrorKind::TrailingContent => "trailing_content",
            ErrorKind::DepthLimitExceeded => "depth_limit_exceeded",
            ErrorKind::MaxElements => "max_elements",
        })
    }
}
impl std::fmt::Display for Error {
    /// Writes a human-readable description of the failure.
    ///
    /// Every message except the depth-limit and element-limit ones includes
    /// the position and byte offset at which the error was raised.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self {
            byte_offset,
            position,
            kind,
        } = self;
        match kind {
            ErrorKind::ExpectedLiteral { expected } => {
                write!(
                    f,
                    "unexpected literal found at line: `{position}`, byte `{byte_offset}`; expected: `{expected:?}`"
                )
            }
            ErrorKind::ExpectedNumeral => {
                write!(
                    f,
                    "unexpected numeral found at line: `{position}`, byte `{byte_offset}`; expected: `0-9`"
                )
            }
            ErrorKind::ExpectedStart => {
                write!(
                    f,
                    "unexpected start character found at line: `{position}`, byte `{byte_offset}`; expected one of: `[n, t, f, \", -, 0-9, [, {{]`"
                )
            }
            ErrorKind::ExpectedEndArray => {
                write!(
                    f,
                    "unexpected character found at line: `{position}`, byte `{byte_offset}`; expected: `,` or `]`"
                )
            }
            ErrorKind::ExpectedEndObject => {
                write!(
                    f,
                    "unexpected character found at line: `{position}`, byte `{byte_offset}`; expected: `,` or `}}`"
                )
            }
            ErrorKind::UnexpectedChar { expected } => {
                // `expected` holds the character the parser required, not the
                // one it found, so it is reported as the expectation (matching
                // the wording of the other "expected" messages).
                write!(
                    f,
                    "unexpected character found at line: `{position}`, byte `{byte_offset}`; expected: `{expected}`"
                )
            }
            ErrorKind::UnexpectedEOF => write!(
                f,
                "unexpected end of input found at line: `{position}`, byte `{byte_offset}`"
            ),
            ErrorKind::TrailingContent => write!(
                f,
                "trailing content found at line: `{position}`, byte `{byte_offset}`"
            ),
            ErrorKind::DepthLimitExceeded => {
                write!(f, "nesting depth exceeds the {MAX_DEPTH}-level limit")
            }
            // NOTE(review): the message cites `u32::MAX`, while the id counter
            // in this file is a `usize` — confirm which limit is intended.
            ErrorKind::MaxElements => write!(f, "document exceeds {} JSON elements", u32::MAX),
        }
    }
}
// Marker impl: no trait methods are overridden, so the trait's defaults apply.
impl std::error::Error for Error {}
/// Intermediate result for one JSON value, produced by `Parser::parse_value`
/// and later converted into an `Element` by `into_element`.
struct RawElement<'buf> {
/// Id allocated for this value by `Parser::alloc_id`.
id: ElemId,
/// Byte span from the value's first byte to just past its last byte.
span: Span,
/// Equals `span.end` unless the value is followed by a comma inside an array
/// or object; then it is the byte position after that comma and any
/// whitespace following it.
full_span_end: u32,
/// The parsed payload.
value: RawValue<'buf>,
}
/// Payload of a `RawElement`; mirrors the JSON value kinds.
enum RawValue<'buf> {
/// The `null` keyword.
Null,
/// The `true` keyword.
True,
/// The `false` keyword.
False,
/// The source text between the string's quotes, exactly as written
/// (escape sequences are not decoded here).
String(RawStr<'buf>),
/// The source text of the number, exactly as written.
Number(&'buf str),
/// Array elements in document order.
Array(Vec<RawElement<'buf>>),
/// Object members in document order.
Object(Vec<RawField<'buf>>),
}
/// One `key: value` member of a raw object.
struct RawField<'buf> {
/// Byte span of the key, from its opening quote to just past its closing quote.
key_span: Span,
/// The member's value.
element: RawElement<'buf>,
}
/// Cursor-based recursive-descent JSON parser over a borrowed input string.
struct Parser<'buf> {
/// The input text; slices of it are returned for strings and numbers.
source: &'buf str,
/// The same input viewed as bytes; all scanning is done on this view.
bytes: &'buf [u8],
/// Index into `bytes` of the next byte to read.
pos: usize,
/// The id that the next call to `alloc_id` will hand out.
next_id: usize,
/// Path entries; `parse_value` pushes one entry per value right after allocating its id.
table: PathTable<'buf>,
/// Number of arrays/objects currently being parsed (nesting depth).
depth: usize,
}
impl<'buf> Parser<'buf> {
/// Creates a parser positioned at the first byte of `source`, with an empty
/// path table, no ids allocated and a nesting depth of zero.
fn new(source: &'buf str) -> Self {
    let bytes = source.as_bytes();
    let table = PathTable::default();
    Self {
        source,
        bytes,
        pos: 0,
        next_id: 0,
        table,
        depth: 0,
    }
}
fn alloc_id(&mut self) -> Result<ElemId, Error> {
let id = ElemId(self.next_id);
self.next_id = self
.next_id
.checked_add(1)
.ok_or_else(|| self.error(ErrorKind::MaxElements))?;
Ok(id)
}
/// Moves the cursor past any run of space, tab, line-feed and
/// carriage-return bytes.
fn skip_ws(&mut self) {
    while matches!(self.peek(), Some(SPACE | TAB | LF | CR)) {
        self.chomp();
    }
}
/// Returns the byte at the cursor without consuming it, or `None` at the end
/// of the input.
fn peek(&self) -> Option<u8> {
    let byte = self.bytes.get(self.pos)?;
    Some(*byte)
}
/// Advances the cursor by one byte.
///
/// Performs no bounds check itself; the callers in this impl invoke it only
/// after observing a byte at `pos`.
#[inline]
fn chomp(&mut self) {
self.pos += 1;
}
/// Builds an [`Error`] of the given kind located at the current cursor.
///
/// The location is computed by `super::line_col` over the part of the input
/// before the cursor.
fn error(&self, kind: ErrorKind) -> Error {
    let consumed = &self.source[..self.pos];
    let position = super::line_col(consumed);
    let byte_offset = consumed.len();
    Error {
        byte_offset,
        position,
        kind,
    }
}
/// Consumes and returns the byte at the cursor; returns `None` and leaves
/// the cursor untouched at the end of the input.
fn advance(&mut self) -> Option<u8> {
    let byte = self.peek()?;
    self.chomp();
    Some(byte)
}
fn expect_byte(&mut self, byte: u8) -> Result<(), Error> {
match self.bytes.get(self.pos) {
Some(&b) if b == byte => {
self.chomp();
Ok(())
}
Some(_) => Err(self.error(ErrorKind::UnexpectedChar {
expected: char::from(byte),
})),
None => Err(self.error(ErrorKind::UnexpectedEOF)),
}
}
fn expect_literal(&mut self, literal: &'static str) -> Result<(), Error> {
for &expected in literal.as_bytes() {
match self.advance() {
Some(b) if b == expected => {}
Some(_) => {
self.pos -= 1;
return Err(self.error(ErrorKind::ExpectedLiteral { expected: literal }));
}
None => return Err(self.error(ErrorKind::UnexpectedEOF)),
}
}
Ok(())
}
/// Parses one JSON value, skipping leading whitespace first.
///
/// An id is allocated and `entry` is pushed onto the path table before the
/// value itself is parsed. The returned element's span runs from the first
/// byte of the value to the cursor position after it.
///
/// # Errors
///
/// Propagates any error from id allocation or from parsing the value.
fn parse_value(&mut self, entry: PathEntry<'buf>) -> Result<RawElement<'buf>, Error> {
    self.skip_ws();
    let id = self.alloc_id()?;
    self.table.push(entry);

    let first_byte = self.pos as u32;
    let value = self.parse_value_kind(id)?;
    let past_last_byte = self.pos as u32;

    let span = Span::new(first_byte, past_last_byte);
    let full_span_end = span.end;
    Ok(RawElement {
        id,
        span,
        full_span_end,
        value,
    })
}
fn parse_value_kind(&mut self, id: ElemId) -> Result<RawValue<'buf>, Error> {
match self
.peek()
.ok_or_else(|| self.error(ErrorKind::UnexpectedEOF))?
{
b'n' => {
self.expect_literal(NULL)?;
Ok(RawValue::Null)
}
b't' => {
self.expect_literal(TRUE)?;
Ok(RawValue::True)
}
b'f' => {
self.expect_literal(FALSE)?;
Ok(RawValue::False)
}
QUOTE => Ok(RawValue::String(self.parse_raw_str()?)),
MINUS | DIGIT_0..=DIGIT_9 => Ok(RawValue::Number(self.parse_number_str()?)),
ARRAY_OPEN => self.parse_array(id),
OBJECT_OPEN => self.parse_object(id),
_ => Err(self.error(ErrorKind::ExpectedStart)),
}
}
fn parse_number_str(&mut self) -> Result<&'buf str, Error> {
let start = self.pos;
if self.peek() == Some(MINUS) {
self.chomp();
}
match self
.peek()
.ok_or_else(|| self.error(ErrorKind::UnexpectedEOF))?
{
DIGIT_0 => self.chomp(),
DIGIT_1..=DIGIT_9 => {
while matches!(self.peek(), Some(DIGIT_0..=DIGIT_9)) {
self.chomp();
}
}
_ => return Err(self.error(ErrorKind::ExpectedNumeral)),
}
if self.peek() == Some(DECIMAL_POINT) {
self.chomp();
if !matches!(self.peek(), Some(DIGIT_0..=DIGIT_9)) {
return Err(match self.peek() {
Some(_) => self.error(ErrorKind::ExpectedNumeral),
None => self.error(ErrorKind::UnexpectedEOF),
});
}
while matches!(self.peek(), Some(DIGIT_0..=DIGIT_9)) {
self.chomp();
}
}
if matches!(self.peek(), Some(EXP_LOWER | EXP_UPPER)) {
self.chomp();
if matches!(self.peek(), Some(PLUS | MINUS)) {
self.chomp();
}
if !matches!(self.peek(), Some(DIGIT_0..=DIGIT_9)) {
return Err(match self.peek() {
Some(_) => self.error(ErrorKind::ExpectedNumeral),
None => self.error(ErrorKind::UnexpectedEOF),
});
}
while matches!(self.peek(), Some(DIGIT_0..=DIGIT_9)) {
self.chomp();
}
}
Ok(&self.source[start..self.pos])
}
fn parse_raw_str(&mut self) -> Result<RawStr<'buf>, Error> {
self.expect_byte(QUOTE)?;
let content_start = self.pos;
loop {
match self
.advance()
.ok_or_else(|| self.error(ErrorKind::UnexpectedEOF))?
{
QUOTE => break,
BACKSLASH => {
self.advance()
.ok_or_else(|| self.error(ErrorKind::UnexpectedEOF))?;
}
_ => {}
}
}
Ok(RawStr(&self.source[content_start..self.pos - 1]))
}
fn parse_array(&mut self, parent_id: ElemId) -> Result<RawValue<'buf>, Error> {
self.depth += 1;
if self.depth > MAX_DEPTH {
return Err(self.error(ErrorKind::DepthLimitExceeded));
}
self.expect_byte(ARRAY_OPEN)?;
self.skip_ws();
let mut elements: Vec<RawElement<'buf>> = Vec::new();
if self.peek() != Some(ARRAY_CLOSE) {
loop {
let entry = PathEntry::Item {
parent: parent_id,
index: elements.len() as u32,
};
let mut elem = self.parse_value(entry)?;
self.skip_ws();
match self
.peek()
.ok_or_else(|| self.error(ErrorKind::UnexpectedEOF))?
{
COMMA => {
self.chomp();
self.skip_ws();
if self.peek() == Some(ARRAY_CLOSE) {
return Err(self.error(ErrorKind::ExpectedEndArray));
}
elem.full_span_end = self.pos as u32;
elements.push(elem);
}
ARRAY_CLOSE => {
elements.push(elem);
break;
}
_ => return Err(self.error(ErrorKind::ExpectedEndArray)),
}
}
}
self.expect_byte(ARRAY_CLOSE)?;
self.depth -= 1;
Ok(RawValue::Array(elements))
}
fn parse_object(&mut self, parent_id: ElemId) -> Result<RawValue<'buf>, Error> {
self.depth += 1;
if self.depth > MAX_DEPTH {
return Err(self.error(ErrorKind::DepthLimitExceeded));
}
self.expect_byte(OBJECT_OPEN)?;
self.skip_ws();
let mut fields: Vec<RawField<'buf>> = Vec::new();
if self.peek() != Some(OBJECT_CLOSE) {
loop {
let key_start = self.pos;
let key = self.parse_raw_str()?;
let key_span = Span::new(key_start as u32, self.pos as u32);
self.skip_ws();
self.expect_byte(COLON)?;
let entry = PathEntry::Field {
parent: parent_id,
key,
};
let mut elem = self.parse_value(entry)?;
self.skip_ws();
match self
.peek()
.ok_or_else(|| self.error(ErrorKind::UnexpectedEOF))?
{
COMMA => {
self.chomp();
self.skip_ws();
if self.peek() == Some(OBJECT_CLOSE) {
return Err(self.error(ErrorKind::ExpectedEndObject));
}
elem.full_span_end = self.pos as u32;
fields.push(RawField {
key_span,
element: elem,
});
}
OBJECT_CLOSE => {
fields.push(RawField {
key_span,
element: elem,
});
break;
}
_ => return Err(self.error(ErrorKind::ExpectedEndObject)),
}
}
}
self.expect_byte(OBJECT_CLOSE)?;
self.depth -= 1;
Ok(RawValue::Object(fields))
}
}
fn into_element<'buf>(raw: RawElement<'buf>, inner: &Rc<DocumentInner<'buf>>) -> Element<'buf> {
enum Task<'buf> {
Process(RawElement<'buf>),
BuildArray {
id: ElemId,
span: Span,
full_span_end: u32,
count: usize,
},
BuildObject {
id: ElemId,
span: Span,
full_span_end: u32,
key_spans: Vec<Span>,
},
}
let mut work: Vec<Task<'buf>> = vec![Task::Process(raw)];
let mut done: Vec<Element<'buf>> = Vec::new();
while let Some(task) = work.pop() {
match task {
Task::Process(raw) => {
let value = match raw.value {
RawValue::Null => Value::Null,
RawValue::True => Value::True,
RawValue::False => Value::False,
RawValue::String(s) => Value::String(s),
RawValue::Number(n) => Value::Number(n),
RawValue::Array(items) => {
work.push(Task::BuildArray {
id: raw.id,
span: raw.span,
full_span_end: raw.full_span_end,
count: items.len(),
});
for item in items.into_iter().rev() {
work.push(Task::Process(item));
}
continue;
}
RawValue::Object(fields) => {
let key_spans = fields.iter().map(|f| f.key_span).collect();
work.push(Task::BuildObject {
id: raw.id,
span: raw.span,
full_span_end: raw.full_span_end,
key_spans,
});
for field in fields.into_iter().rev() {
work.push(Task::Process(field.element));
}
continue;
}
};
done.push(Element {
doc: Rc::clone(inner),
id: raw.id,
span: raw.span,
full_span_end: raw.full_span_end,
value,
});
}
Task::BuildArray {
id,
span,
full_span_end,
count,
} => {
let start = done.len() - count;
let items: Vec<Element<'buf>> = done.drain(start..).collect();
done.push(Element {
doc: Rc::clone(inner),
id,
span,
full_span_end,
value: Value::Array(items),
});
}
Task::BuildObject {
id,
span,
full_span_end,
key_spans,
} => {
let count = key_spans.len();
let start = done.len() - count;
let elements: Vec<Element<'buf>> = done.drain(start..).collect();
let fields = key_spans
.into_iter()
.zip(elements)
.map(|(key_span, element)| Field { key_span, element })
.collect();
done.push(Element {
doc: Rc::clone(inner),
id,
span,
full_span_end,
value: Value::Object(fields),
});
}
}
}
done.swap_remove(0)
}