json_syntax/parse/
mod.rs

1use decoded_char::DecodedChar;
2use locspan::{Meta, Span};
3use std::{fmt, io};
4
5mod array;
6mod boolean;
7mod null;
8mod number;
9mod object;
10mod string;
11mod value;
12
13use crate::CodeMap;
14
/// Parser options.
///
/// Every flag relaxes one strictness rule applied while parsing JSON
/// strings. The [`Default`] value is [`Options::strict`].
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)]
pub struct Options {
	/// Whether or not to accept a high surrogate without its low counterpart
	/// in strings.
	///
	/// In such instance, the high surrogate will be replaced with the Unicode
	/// REPLACEMENT CHARACTER, U+FFFD.
	pub accept_truncated_surrogate_pair: bool,

	/// Whether or not to accept invalid Unicode codepoints in strings.
	///
	/// Invalid codepoints will be replaced with the Unicode
	/// REPLACEMENT CHARACTER, U+FFFD.
	pub accept_invalid_codepoints: bool,
}

impl Options {
	/// Strict mode.
	///
	/// All options are set to `false`.
	///
	/// This is a `const fn`, so it can initialise `const` and `static` items.
	pub const fn strict() -> Self {
		Self {
			accept_truncated_surrogate_pair: false,
			accept_invalid_codepoints: false,
		}
	}

	/// Flexible mode.
	///
	/// All options are set to `true`.
	///
	/// This is a `const fn`, so it can initialise `const` and `static` items.
	pub const fn flexible() -> Self {
		Self {
			accept_truncated_surrogate_pair: true,
			accept_invalid_codepoints: true,
		}
	}
}

impl Default for Options {
	/// Returns [`Options::strict`].
	fn default() -> Self {
		Self::strict()
	}
}
59
60pub trait Parse: Sized {
61	fn parse_slice(content: &[u8]) -> Result<(Self, CodeMap), Error> {
62		Self::parse_utf8(utf8_decode::Decoder::new(content.iter().copied()))
63			.map_err(Error::io_into_utf8)
64	}
65
66	fn parse_slice_with(content: &[u8], options: Options) -> Result<(Self, CodeMap), Error> {
67		Self::parse_utf8_with(utf8_decode::Decoder::new(content.iter().copied()), options)
68			.map_err(Error::io_into_utf8)
69	}
70
71	fn parse_str(content: &str) -> Result<(Self, CodeMap), Error> {
72		Self::parse_utf8(content.chars().map(Ok))
73	}
74
75	fn parse_str_with(content: &str, options: Options) -> Result<(Self, CodeMap), Error> {
76		Self::parse_utf8_with(content.chars().map(Ok), options)
77	}
78
79	fn parse_infallible_utf8<C>(chars: C) -> Result<(Self, CodeMap), Error>
80	where
81		C: Iterator<Item = char>,
82	{
83		Self::parse_infallible(chars.map(DecodedChar::from_utf8))
84	}
85
86	fn parse_utf8_infallible_with<C>(chars: C, options: Options) -> Result<(Self, CodeMap), Error>
87	where
88		C: Iterator<Item = char>,
89	{
90		Self::parse_infallible_with(chars.map(DecodedChar::from_utf8), options)
91	}
92
93	fn parse_utf8<C, E>(chars: C) -> Result<(Self, CodeMap), Error<E>>
94	where
95		C: Iterator<Item = Result<char, E>>,
96	{
97		Self::parse(chars.map(|c| c.map(DecodedChar::from_utf8)))
98	}
99
100	fn parse_utf8_with<C, E>(chars: C, options: Options) -> Result<(Self, CodeMap), Error<E>>
101	where
102		C: Iterator<Item = Result<char, E>>,
103	{
104		Self::parse_with(chars.map(|c| c.map(DecodedChar::from_utf8)), options)
105	}
106
107	fn parse_infallible<C>(chars: C) -> Result<(Self, CodeMap), Error>
108	where
109		C: Iterator<Item = DecodedChar>,
110	{
111		let mut parser = Parser::new(chars.map(Ok));
112		let value = Self::parse_in(&mut parser, Context::None)?.into_value();
113		Ok((value, parser.code_map))
114	}
115
116	fn parse_infallible_with<C>(chars: C, options: Options) -> Result<(Self, CodeMap), Error>
117	where
118		C: Iterator<Item = DecodedChar>,
119	{
120		let mut parser = Parser::new_with(chars.map(Ok), options);
121		let value = Self::parse_in(&mut parser, Context::None)?.into_value();
122		Ok((value, parser.code_map))
123	}
124
125	fn parse<C, E>(chars: C) -> Result<(Self, CodeMap), Error<E>>
126	where
127		C: Iterator<Item = Result<DecodedChar, E>>,
128	{
129		let mut parser = Parser::new(chars);
130		let value = Self::parse_in(&mut parser, Context::None)?.into_value();
131		Ok((value, parser.code_map))
132	}
133
134	fn parse_with<C, E>(chars: C, options: Options) -> Result<(Self, CodeMap), Error<E>>
135	where
136		C: Iterator<Item = Result<DecodedChar, E>>,
137	{
138		let mut parser = Parser::new_with(chars, options);
139		let value = Self::parse_in(&mut parser, Context::None)?.into_value();
140		Ok((value, parser.code_map))
141	}
142
143	fn parse_in<C, E>(
144		parser: &mut Parser<C, E>,
145		context: Context,
146	) -> Result<Meta<Self, usize>, Error<E>>
147	where
148		C: Iterator<Item = Result<DecodedChar, E>>;
149}
150
/// JSON parser.
///
/// Reads decoded characters from a fallible stream `C` (whose error type is
/// `E`), keeping a one-character look-ahead, the current position and the
/// code map built during parsing.
pub struct Parser<C: Iterator<Item = Result<DecodedChar, E>>, E> {
	/// Character stream.
	chars: C,

	/// Pending next char.
	///
	/// Filled by `peek_char` when it reads a character from the stream
	/// without consuming it, and emptied by `next_char`.
	pending: Option<DecodedChar>,

	/// Position in the stream.
	///
	/// Starts at 0 and grows by `DecodedChar::len()` for every character
	/// consumed through `next_char`. A peeked character does not move it.
	position: usize,

	/// Parser options.
	options: Options,

	/// Code-map.
	///
	/// Entries are reserved by `begin_fragment` and completed by
	/// `end_fragment`.
	code_map: CodeMap,
}
168
/// Tells whether `c` is a JSON whitespace character, that is a space, a
/// horizontal tab, a carriage return or a line feed.
#[inline(always)]
pub fn is_whitespace(c: char) -> bool {
	c == ' ' || c == '\t' || c == '\r' || c == '\n'
}
174
175impl<C: Iterator<Item = Result<DecodedChar, E>>, E> Parser<C, E> {
176	pub fn new(chars: C) -> Self {
177		Self {
178			chars,
179			pending: None,
180			position: 0,
181			options: Options::default(),
182			code_map: CodeMap::default(),
183		}
184	}
185
186	pub fn new_with(chars: C, options: Options) -> Self {
187		Self {
188			chars,
189			pending: None,
190			position: 0,
191			options,
192			code_map: CodeMap::default(),
193		}
194	}
195
196	fn begin_fragment(&mut self) -> usize {
197		self.code_map.reserve(self.position)
198	}
199
200	fn end_fragment(&mut self, i: usize) {
201		let entry_count = self.code_map.len();
202		let entry = self.code_map.get_mut(i).unwrap();
203		entry.span.set_end(self.position);
204		entry.volume = entry_count - i;
205	}
206
207	fn peek_char(&mut self) -> Result<Option<char>, Error<E>> {
208		match self.pending {
209			Some(c) => Ok(Some(c.chr())),
210			None => match self.chars.next() {
211				Some(Ok(c)) => {
212					self.pending = Some(c);
213					Ok(Some(c.chr()))
214				}
215				Some(Err(e)) => Err(Error::Stream(self.position, e)),
216				None => Ok(None),
217			},
218		}
219	}
220
221	fn next_char(&mut self) -> Result<(usize, Option<char>), Error<E>> {
222		let c = match self.pending.take() {
223			Some(c) => Some(c),
224			None => self
225				.chars
226				.next()
227				.transpose()
228				.map_err(|e| Error::Stream(self.position, e))?,
229		};
230
231		let p = self.position;
232		let c = c.map(|c| {
233			self.position += c.len();
234			c.chr()
235		});
236
237		Ok((p, c))
238	}
239
240	fn skip_whitespaces(&mut self) -> Result<(), Error<E>> {
241		while let Some(c) = self.peek_char()? {
242			if is_whitespace(c) {
243				self.next_char()?;
244			} else {
245				break;
246			}
247		}
248
249		Ok(())
250	}
251}
252
/// Parse error.
///
/// `E` is the error type of the underlying character stream. It defaults to
/// [`core::convert::Infallible`].
#[derive(Debug)]
pub enum Error<E = core::convert::Infallible> {
	/// Stream error.
	///
	/// The first parameter is the byte index at which the error occurred.
	/// The second parameter is the error yielded by the character stream.
	Stream(usize, E),

	/// Unexpected character or end of stream.
	///
	/// The first parameter is the byte index at which the error occurred.
	/// The second parameter is the unexpected character, or `None` when the
	/// end of the stream was reached instead.
	Unexpected(usize, Option<char>),

	/// Invalid unicode codepoint.
	///
	/// The first parameter is the span at which the error occurred.
	/// The second parameter is the invalid code point value.
	InvalidUnicodeCodePoint(Span, u32),

	/// Missing low surrogate in a string.
	///
	/// The first parameter is the span at which the error occurred.
	/// NOTE(review): the second parameter is presumably the high surrogate
	/// left without a low counterpart — confirm against the string parser.
	MissingLowSurrogate(Span, u16),

	/// Invalid low surrogate in a string.
	///
	/// The first parameter is the span at which the error occurred.
	/// NOTE(review): the remaining parameters are presumably the high
	/// surrogate and the value found in place of the low surrogate — confirm
	/// against the string parser.
	InvalidLowSurrogate(Span, u16, u32),

	/// UTF-8 encoding error.
	///
	/// The parameter is the position at which the error occurred. This
	/// variant is what `io_into_utf8` turns a `Stream` error into.
	InvalidUtf8(usize),
}
284
285impl<E> Error<E> {
286	/// Creates an `Unexpected` error.
287	#[inline(always)]
288	fn unexpected(position: usize, c: Option<char>) -> Self {
289		// panic!("unexpected {:?}", c);
290		Self::Unexpected(position, c)
291	}
292
293	pub fn position(&self) -> usize {
294		match self {
295			Self::Stream(p, _) => *p,
296			Self::Unexpected(p, _) => *p,
297			Self::InvalidUnicodeCodePoint(span, _) => span.start(),
298			Self::MissingLowSurrogate(span, _) => span.start(),
299			Self::InvalidLowSurrogate(span, _, _) => span.start(),
300			Self::InvalidUtf8(p) => *p,
301		}
302	}
303
304	pub fn span(&self) -> Span {
305		match self {
306			Self::Stream(p, _) => Span::new(*p, *p),
307			Self::Unexpected(p, _) => Span::new(*p, *p),
308			Self::InvalidUnicodeCodePoint(span, _) => *span,
309			Self::MissingLowSurrogate(span, _) => *span,
310			Self::InvalidLowSurrogate(span, _, _) => *span,
311			Self::InvalidUtf8(p) => Span::new(*p, *p),
312		}
313	}
314}
315
316impl Error<io::Error> {
317	fn io_into_utf8(self) -> Error {
318		match self {
319			Self::Stream(p, _) => Error::InvalidUtf8(p),
320			Self::Unexpected(p, e) => Error::Unexpected(p, e),
321			Self::InvalidUnicodeCodePoint(s, e) => Error::InvalidUnicodeCodePoint(s, e),
322			Self::MissingLowSurrogate(s, e) => Error::MissingLowSurrogate(s, e),
323			Self::InvalidLowSurrogate(s, a, b) => Error::InvalidLowSurrogate(s, a, b),
324			Self::InvalidUtf8(p) => Error::InvalidUtf8(p),
325		}
326	}
327}
328
329impl<E: fmt::Display> fmt::Display for Error<E> {
330	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
331		match self {
332			Self::Stream(_, e) => e.fmt(f),
333			Self::Unexpected(_, Some(c)) => write!(f, "unexpected character `{}`", c),
334			Self::Unexpected(_, None) => write!(f, "unexpected end of file"),
335			Self::InvalidUnicodeCodePoint(_, c) => write!(f, "invalid Unicode code point {:x}", *c),
336			Self::MissingLowSurrogate(_, _) => write!(f, "missing low surrogate"),
337			Self::InvalidLowSurrogate(_, _, _) => write!(f, "invalid low surrogate"),
338			Self::InvalidUtf8(_) => write!(f, "invalid UTF-8"),
339		}
340	}
341}
342
343impl<E: 'static + std::error::Error> std::error::Error for Error<E> {
344	fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
345		match self {
346			Self::Stream(_, e) => Some(e),
347			_ => None,
348		}
349	}
350}
351
/// Parsing context.
///
/// Defines what characters are allowed after a value.
///
/// In every context a JSON whitespace may follow the value; the variants
/// only differ by the extra delimiters they accept (see `Context::follows`).
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub enum Context {
	/// No extra delimiter: only whitespace may follow the value.
	None,
	/// The value may also be followed by `,` or `]`.
	Array,
	/// The value may also be followed by `:`.
	ObjectKey,
	/// The value may also be followed by `,` or `}`.
	ObjectValue,
}
362
363impl Context {
364	/// Checks if the given character `c` can follow a value in this context.
365	pub fn follows(&self, c: char) -> bool {
366		match self {
367			Self::None => is_whitespace(c),
368			Self::Array => is_whitespace(c) || matches!(c, ',' | ']'),
369			Self::ObjectKey => is_whitespace(c) || matches!(c, ':'),
370			Self::ObjectValue => is_whitespace(c) || matches!(c, ',' | '}'),
371		}
372	}
373}