1use std::fmt;
2use std::iter::Peekable;
3
4use smallvec::SmallVec;
5
/// A single lexical token produced by [`JsonTokenizer`].
#[derive(Debug, Clone, PartialEq, PartialOrd)]
pub enum JsonToken {
    /// A numeric literal, parsed into an `f64`.
    Number(f64),
    /// The literal `true`.
    True,
    /// The literal `false`.
    False,
    /// A string literal with its escape sequences already decoded.
    String(String),
    /// The literal `null`.
    Null,
    /// `[`
    ArrayOpen,
    /// `,`
    Comma,
    /// `]`
    ArrayClose,
    /// `{`
    ObjOpen,
    /// `:`
    Colon,
    /// `}`
    ObjClose,
}
21
/// A position inside the byte stream being tokenized.
///
/// All three counters start at zero (see `Default`) and are measured in bytes,
/// not in Unicode scalar values.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Location {
    /// Number of bytes consumed so far.
    pub byte_offset: u64,
    /// Number of `\n` bytes consumed so far.
    pub line: u64,
    /// Number of bytes consumed since the most recent `\n`.
    pub col: u64,
}

impl Location {
    /// Moves the position past one consumed byte `c`.
    ///
    /// A line feed starts a new line and resets the column; any other byte
    /// (including `\r`) only advances the column.
    fn advance_by_byte(&mut self, c: u8) {
        match c {
            b'\n' => {
                self.line += 1;
                self.col = 0;
            }
            _ => self.col += 1,
        }
        self.byte_offset += 1;
    }
}
41
/// Error produced when the input is not valid JSON at the token level.
#[derive(Debug)]
pub struct JsonParseError {
    /// Human-readable description of what went wrong.
    msg: String,
    /// Tokenizer position recorded when the error was created.
    location: Location,
}
48
49impl JsonParseError {
50 pub fn new(msg: String, location: Location) -> JsonParseError {
52 JsonParseError { msg, location }
53 }
54
55 pub fn msg(&self) -> &str {
57 &self.msg
58 }
59
60 pub fn location(&self) -> Location {
62 self.location
63 }
64}
65
66impl fmt::Display for JsonParseError {
67 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
68 write!(
69 f,
70 "Parse error at line:{}, col:{}: {}",
71 self.location.line, self.location.col, &self.msg,
72 )
73 }
74}
75
// Marker impl so `JsonParseError` can be used wherever a `std::error::Error`
// is expected; the default trait methods are used unchanged.
impl std::error::Error for JsonParseError {}
77
/// Shorthand for a `Result` whose error type is [`JsonParseError`].
pub type JsonParseResult<T> = Result<T, JsonParseError>;
80
/// Returns `true` when `c` is one of the four bytes the tokenizer skips
/// between tokens: space, line feed, carriage return, or horizontal tab.
fn is_whitespace(c: u8) -> bool {
    const SKIPPED: [u8; 4] = [b' ', b'\n', b'\r', b'\t'];
    SKIPPED.contains(&c)
}
87
/// Pull-based tokenizer that turns an iterator of bytes into [`JsonToken`]s.
pub struct JsonTokenizer<I: Iterator<Item = u8>> {
    /// Byte source, wrapped so the next byte can be inspected without consuming it.
    bytes: Peekable<I>,
    /// Position reached so far; advanced once for every byte taken from `bytes`.
    location: Location,
}
93
94impl<I: Iterator<Item = u8>> JsonTokenizer<I> {
95 pub fn new(it: I) -> Self {
97 JsonTokenizer {
98 bytes: it.peekable(),
99 location: Location::default(),
100 }
101 }
102
103 pub fn location(&self) -> Location {
105 self.location
106 }
107
108 pub fn expect_eof(&mut self) -> Result<(), JsonParseError> {
110 match self.peek_byte_skip_whitespace() {
111 Some(b) => self.err(format!("Expected EOF but found byte {b:#x}")),
112 None => Ok(()),
113 }
114 }
115
116 fn err<T>(&self, msg: String) -> Result<T, JsonParseError> {
117 Err(JsonParseError::new(msg, self.location))
118 }
119
120 fn eof_err(&self) -> JsonParseError {
121 JsonParseError::new(String::from("Unexpected EOF"), self.location)
122 }
123
124 fn peek_byte_skip_whitespace(&mut self) -> Option<u8> {
125 while let Some(c) = self.bytes.peek().copied() {
126 if is_whitespace(c) {
127 self.bytes.next().unwrap();
128 self.location.advance_by_byte(c);
129 continue;
130 }
131 return Some(c);
132 }
133 None
134 }
135
136 fn consume_byte(&mut self) -> Result<u8, JsonParseError> {
137 match self.bytes.next() {
138 Some(b) => {
139 self.location.advance_by_byte(b);
140 Ok(b)
141 }
142 None => Err(self.eof_err()),
143 }
144 }
145
146 fn consume_string(&mut self) -> JsonParseResult<JsonToken> {
147 if self.consume_byte().unwrap() != b'"' {
148 panic!("This function should only be called after the caller has encountered a start quote");
149 }
150
151 let mut s = SmallVec::<[u8; 10]>::new();
152 loop {
153 let b = match self.consume_byte()? {
154 b'\\' => match self.consume_byte()? {
155 b'\\' => b'\\',
156 b'/' => b'/',
157 b'"' => b'"',
158 b'b' => 0x8,
159 b'f' => 0xc,
160 b'n' => b'\n',
161 b'r' => b'\r',
162 b't' => b'\t',
163 b'u' => {
164 let mut u = 0u16;
165 for _ in 0..4 {
166 let b = self.consume_byte()?;
167 if let Some(h) = ascii_byte_to_hex_digit(b) {
168 u = u * 0x10 + h as u16;
169 } else {
170 return self.err(format!("Unicode character must be \\uXXXX (X is hex character) format but found byte {b:#x}"));
171 }
172 }
173 let c = match u {
174 0xD800..=0xDBFF => {
175 if self.consume_byte()? != b'\\' || self.consume_byte()? != b'u' {
179 return self.err(format!("First UTF-16 surragate {u:#x} must be directly followed by a second \\uXXXX surrogate."));
180 }
181 let mut u2 = 0u16;
182 for _ in 0..4 {
183 let b = self.consume_byte()?;
184 if let Some(h) = ascii_byte_to_hex_digit(b) {
185 u2 = u2 * 0x10 + h as u16;
186 } else {
187 return self.err(format!("Unicode character must be \\uXXXX (X is hex character) format but found byte '{b:#x}'"));
188 }
189 }
190 if !matches!(u2, 0xDC00..=0xDFFF) {
191 return self.err(format!("First UTF-16 surrogate {u:#x} must be directly followed by a second \\uXXXX surrogate, but found something that's not a second surrogate: {u2:#x}."));
192 }
193
194 let c =
196 (((u & 0x3ff) as u32) << 10 | (u2 & 0x3ff) as u32) + 0x1_0000;
197 char::from_u32(c).unwrap()
198 }
199 0xDC00..=0xDFFF => {
200 return self
201 .err(format!("Unpaired UTF-16 second surrogate: {u:#x}"));
202 }
203 _ => char::from_u32(u as u32).unwrap(),
204 };
205 match c.len_utf8() {
206 1 => s.push(c as u8),
207 _ => s.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes()),
208 }
209 continue;
210 }
211 b => return self.err(format!("{b:#x} is invalid escaped character")),
212 },
213 b'"' => {
214 let s = String::from_utf8(s.to_vec())
215 .or_else(|_| self.err("Invalid UTF-8 in string".into()))?;
216 return Ok(JsonToken::String(s));
217 }
218 b if b < 0x20 => {
223 return self.err(format!("Unexpected control character {b:#x} in string"));
224 }
225 b => b,
226 };
227
228 s.push(b);
229 }
230 }
231
232 fn consume_constant(&mut self, s: &'static str) -> Result<(), JsonParseError> {
233 for expected_byte in s.as_bytes() {
234 let b = self.consume_byte()?;
235 if b != *expected_byte {
236 return Err(JsonParseError::new(
237 format!("Unexpected byte {b:#x} while parsing '{s}'",),
238 self.location,
239 ));
240 }
241 }
242 Ok(())
243 }
244
245 fn consume_null(&mut self) -> JsonParseResult<JsonToken> {
246 self.consume_constant("null")?;
247 Ok(JsonToken::Null)
248 }
249
250 fn consume_true(&mut self) -> JsonParseResult<JsonToken> {
251 self.consume_constant("true")?;
252 Ok(JsonToken::True)
253 }
254
255 fn consume_false(&mut self) -> JsonParseResult<JsonToken> {
256 self.consume_constant("false")?;
257 Ok(JsonToken::False)
258 }
259
260 fn consume_number(&mut self) -> JsonParseResult<JsonToken> {
261 let neg = *self.bytes.peek().unwrap() == b'-';
262 if neg {
263 self.consume_byte().unwrap();
264 }
265
266 let mut s = SmallVec::<[u8; 16]>::new();
267 let mut saw_dot = false;
268 let mut saw_exp = false;
269
270 while let Some(d) = self.bytes.peek() {
271 match d {
272 b'0'..=b'9' => s.push(*d),
273 b'.' => {
274 saw_dot = true;
275 break;
276 }
277 b'e' | b'E' => {
278 saw_exp = true;
279 break;
280 }
281 _ => break,
282 }
283 self.consume_byte().unwrap();
284 }
285
286 if s.is_empty() {
287 return self.err("Integer part must not be empty in number literal".to_string());
288 }
289
290 if s.starts_with(b"0") && s.len() > 1 {
291 return self
292 .err("Integer part of number must not start with 0 except for '0'".to_string());
293 }
294
295 if saw_dot {
296 s.push(self.consume_byte().unwrap()); while let Some(d) = self.bytes.peek() {
298 match d {
299 b'0'..=b'9' => s.push(*d),
300 b'e' | b'E' => {
301 saw_exp = true;
302 break;
303 }
304 _ => break,
305 }
306 self.consume_byte().unwrap();
307 }
308 if s.ends_with(b".") {
309 return self.err("Fraction part of number must not be empty".to_string());
310 }
311 }
312
313 if saw_exp {
314 s.push(self.consume_byte().unwrap()); if let Some(b'+') | Some(b'-') = self.bytes.peek() {
316 s.push(self.consume_byte().unwrap());
317 }
318
319 let mut saw_digit = false;
320 while let Some(d) = self.bytes.peek() {
321 match d {
322 b'0'..=b'9' => s.push(*d),
323 _ => break,
324 }
325 saw_digit = true;
326 self.consume_byte().unwrap();
327 }
328
329 if !saw_digit {
330 return self.err("Exponent part must not be empty in number literal".to_string());
331 }
332 }
333
334 let s = std::str::from_utf8(&s).unwrap();
335 match s.parse::<f64>() {
336 Ok(n) => Ok(JsonToken::Number(if neg { -n } else { n })),
337 Err(err) => self.err(format!("Invalid number literal '{}': {}", s, err)),
338 }
339 }
340
341 pub fn next_token(&mut self) -> JsonParseResult<JsonToken> {
343 let b = self
344 .peek_byte_skip_whitespace()
345 .ok_or_else(|| self.eof_err())?;
346 self.next_token_with_peeked_byte(b)
347 }
348
349 pub fn next_token_and_location(&mut self) -> JsonParseResult<(JsonToken, Location)> {
351 let b = self
352 .peek_byte_skip_whitespace()
353 .ok_or_else(|| self.eof_err())?;
354 let location = self.location;
355 let token = self.next_token_with_peeked_byte(b)?;
356 Ok((token, location))
357 }
358
359 fn next_token_with_peeked_byte(&mut self, b: u8) -> JsonParseResult<JsonToken> {
360 let token = match b {
361 b'[' => JsonToken::ArrayOpen,
362 b']' => JsonToken::ArrayClose,
363 b'{' => JsonToken::ObjOpen,
364 b'}' => JsonToken::ObjClose,
365 b':' => JsonToken::Colon,
366 b',' => JsonToken::Comma,
367 b'0'..=b'9' | b'-' => return self.consume_number(),
368 b'"' => return self.consume_string(),
369 b't' => return self.consume_true(),
370 b'f' => return self.consume_false(),
371 b'n' => return self.consume_null(),
372 c => return self.err(format!("Invalid byte: {c:#x}")),
373 };
374 self.consume_byte()?;
375 Ok(token)
376 }
377}
378
/// Converts an ASCII hexadecimal digit (`0-9`, `a-f`, `A-F`) to its numeric
/// value in `0..=15`, or returns `None` for any other byte.
fn ascii_byte_to_hex_digit(c: u8) -> Option<u8> {
    match c {
        b'0'..=b'9' => Some(c - b'0'),
        b'a'..=b'f' => Some(c - b'a' + 10),
        b'A'..=b'F' => Some(c - b'A' + 10),
        _ => None,
    }
}