1use core::fmt;
13
14use bumpalo::Bump;
15use bumpalo::collections::Vec as BumpVec;
16
17use crate::number::NumberValue;
18use crate::value::DataValue;
19
20#[derive(Debug, Clone, PartialEq, Eq)]
21pub struct ParseError {
22 pub kind: ParseErrorKind,
23 pub position: usize,
24}
25
/// Category of a [`ParseError`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ParseErrorKind {
    /// The input ended while more bytes were required.
    UnexpectedEof,
    /// A byte that is not valid at this point; carries the offending byte.
    UnexpectedByte(u8),
    /// A backslash in a string was followed by an unsupported escape character.
    InvalidEscape,
    /// A `\u` escape was malformed (bad or missing hex digits, or an
    /// unpaired/mismatched surrogate).
    InvalidUnicodeEscape,
    /// A number literal did not follow the JSON number grammar.
    InvalidNumber,
    /// Non-whitespace bytes remained after the top-level value.
    TrailingData,
    /// Arrays/objects were nested deeper than the parser allows.
    DepthLimitExceeded,
}
36
37impl fmt::Display for ParseError {
38 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
39 write!(f, "json parse error at byte {}: ", self.position)?;
40 match self.kind {
41 ParseErrorKind::UnexpectedEof => write!(f, "unexpected end of input"),
42 ParseErrorKind::UnexpectedByte(b) => {
43 write!(f, "unexpected byte 0x{:02x} ({:?})", b, b as char)
44 }
45 ParseErrorKind::InvalidEscape => write!(f, "invalid string escape"),
46 ParseErrorKind::InvalidUnicodeEscape => write!(f, "invalid \\u escape"),
47 ParseErrorKind::InvalidNumber => write!(f, "invalid number literal"),
48 ParseErrorKind::TrailingData => write!(f, "unexpected data after JSON value"),
49 ParseErrorKind::DepthLimitExceeded => write!(f, "nesting depth limit exceeded"),
50 }
51 }
52}
53
54impl std::error::Error for ParseError {}
55
/// Deepest nesting level `parse_value` accepts; a value at a depth greater
/// than this fails with `ParseErrorKind::DepthLimitExceeded`. The parser is
/// recursive, so this also bounds its recursion depth.
const MAX_DEPTH: u16 = 256;
60
/// `0x01` replicated into every byte lane of a `u64`.
const SWAR_ONES: u64 = 0x0101_0101_0101_0101;
/// `0x80` replicated into every byte lane of a `u64`.
const SWAR_HIGHS: u64 = 0x8080_8080_8080_8080;

/// Scans eight string bytes at once for anything that ends a "plain" run.
///
/// `w` holds eight input bytes (callers load it little-endian, so the lowest
/// byte lane is the earliest input byte). The result is zero when no lane is
/// a double quote, a backslash, or a control byte below `0x20`. Otherwise the
/// lowest set bit sits in the lane of the first such byte, so
/// `mask.trailing_zeros() / 8` is that byte's index. Lanes above the first
/// hit may be flagged spuriously and must not be relied on.
#[inline(always)]
fn string_terminator_mask(w: u64) -> u64 {
    // Classic "has a zero byte" test: the high bit of a lane survives only
    // when that lane was zero (up to borrow noise above a true zero lane).
    let zero_lanes = |lanes: u64| lanes.wrapping_sub(SWAR_ONES) & !lanes & SWAR_HIGHS;

    // XOR against a broadcast byte turns matching lanes into zero.
    let quote_lanes = zero_lanes(w ^ (u64::from(b'"') * SWAR_ONES));
    let backslash_lanes = zero_lanes(w ^ (u64::from(b'\\') * SWAR_ONES));
    // Bytes below 0x20 have their top three bits clear.
    let control_lanes = zero_lanes(w & 0xE0E0_E0E0_E0E0_E0E0);

    quote_lanes | backslash_lanes | control_lanes
}
82
83impl<'a> DataValue<'a> {
84 pub fn from_str(input: &'a str, arena: &'a Bump) -> Result<DataValue<'a>, ParseError> {
89 let mut p = Parser {
90 bytes: input.as_bytes(),
91 input,
92 pos: 0,
93 arena,
94 };
95 p.skip_ws();
96 let value = p.parse_value(0)?;
97 p.skip_ws();
98 if p.pos != p.bytes.len() {
99 return Err(p.err(ParseErrorKind::TrailingData));
100 }
101 Ok(value)
102 }
103}
104
105struct Parser<'a> {
106 bytes: &'a [u8],
107 input: &'a str,
108 pos: usize,
109 arena: &'a Bump,
110}
111
112impl<'a> Parser<'a> {
113 #[inline(always)]
114 fn err(&self, kind: ParseErrorKind) -> ParseError {
115 ParseError {
116 kind,
117 position: self.pos,
118 }
119 }
120
121 #[inline(always)]
122 fn peek(&self) -> Result<u8, ParseError> {
123 self.bytes
124 .get(self.pos)
125 .copied()
126 .ok_or_else(|| self.err(ParseErrorKind::UnexpectedEof))
127 }
128
129 #[inline(always)]
130 fn bump(&mut self) -> Result<u8, ParseError> {
131 let b = self.peek()?;
132 self.pos += 1;
133 Ok(b)
134 }
135
136 #[inline(always)]
137 fn skip_ws(&mut self) {
138 while self.pos < self.bytes.len() {
139 match self.bytes[self.pos] {
140 b' ' | b'\t' | b'\n' | b'\r' => self.pos += 1,
141 _ => break,
142 }
143 }
144 }
145
146 fn parse_value(&mut self, depth: u16) -> Result<DataValue<'a>, ParseError> {
147 if depth > MAX_DEPTH {
148 return Err(self.err(ParseErrorKind::DepthLimitExceeded));
149 }
150 self.skip_ws();
151 let b = self.peek()?;
152 match b {
153 b'"' => self.parse_string().map(DataValue::String),
154 b'{' => self.parse_object(depth),
155 b'[' => self.parse_array(depth),
156 b't' | b'f' => self.parse_bool(),
157 b'n' => self.parse_null(),
158 b'-' | b'0'..=b'9' => self.parse_number(),
159 other => Err(self.err(ParseErrorKind::UnexpectedByte(other))),
160 }
161 }
162
163 fn parse_null(&mut self) -> Result<DataValue<'a>, ParseError> {
164 if self.bytes.get(self.pos..self.pos + 4) == Some(b"null") {
165 self.pos += 4;
166 Ok(DataValue::Null)
167 } else {
168 Err(self.err(ParseErrorKind::UnexpectedByte(self.bytes[self.pos])))
169 }
170 }
171
172 fn parse_bool(&mut self) -> Result<DataValue<'a>, ParseError> {
173 if self.bytes.get(self.pos..self.pos + 4) == Some(b"true") {
174 self.pos += 4;
175 Ok(DataValue::Bool(true))
176 } else if self.bytes.get(self.pos..self.pos + 5) == Some(b"false") {
177 self.pos += 5;
178 Ok(DataValue::Bool(false))
179 } else {
180 Err(self.err(ParseErrorKind::UnexpectedByte(self.bytes[self.pos])))
181 }
182 }
183
184 fn parse_number(&mut self) -> Result<DataValue<'a>, ParseError> {
185 let start = self.pos;
186 let mut is_float = false;
187
188 let neg = if self.bytes[self.pos] == b'-' {
193 self.pos += 1;
194 true
195 } else {
196 false
197 };
198 let mut acc: i64 = 0;
199 let mut int_overflowed = false;
200
201 match self.peek()? {
202 b'0' => {
203 self.pos += 1;
204 }
205 c @ b'1'..=b'9' => {
206 acc = -((c - b'0') as i64);
207 self.pos += 1;
208 let mut digits: u32 = 1;
211 while let Some(&d) = self.bytes.get(self.pos) {
212 match d {
213 b'0'..=b'9' => {
214 if digits < 18 {
215 acc = acc * 10 - (d - b'0') as i64;
216 digits += 1;
217 } else {
218 int_overflowed = true;
219 }
220 self.pos += 1;
221 }
222 _ => break,
223 }
224 }
225 }
226 _ => return Err(self.err(ParseErrorKind::InvalidNumber)),
227 }
228 if let Some(&b'.') = self.bytes.get(self.pos) {
230 is_float = true;
231 self.pos += 1;
232 let frac_start = self.pos;
233 while let Some(&c) = self.bytes.get(self.pos) {
234 if c.is_ascii_digit() {
235 self.pos += 1;
236 } else {
237 break;
238 }
239 }
240 if self.pos == frac_start {
241 return Err(self.err(ParseErrorKind::InvalidNumber));
242 }
243 }
244 if matches!(self.bytes.get(self.pos), Some(b'e' | b'E')) {
246 is_float = true;
247 self.pos += 1;
248 if matches!(self.bytes.get(self.pos), Some(b'+' | b'-')) {
249 self.pos += 1;
250 }
251 let exp_start = self.pos;
252 while let Some(&d) = self.bytes.get(self.pos) {
253 if d.is_ascii_digit() {
254 self.pos += 1;
255 } else {
256 break;
257 }
258 }
259 if self.pos == exp_start {
260 return Err(self.err(ParseErrorKind::InvalidNumber));
261 }
262 }
263
264 if !is_float && !int_overflowed {
265 let result = if neg { Some(acc) } else { acc.checked_neg() };
270 if let Some(i) = result {
271 return Ok(DataValue::Number(NumberValue::Integer(i)));
272 }
273 }
274
275 let slice = &self.bytes[start..self.pos];
280 match fast_float2::parse::<f64, _>(slice) {
281 Ok(f) => Ok(DataValue::Number(NumberValue::Float(f))),
282 Err(_) => Err(ParseError {
283 kind: ParseErrorKind::InvalidNumber,
284 position: start,
285 }),
286 }
287 }
288
289 fn parse_string(&mut self) -> Result<&'a str, ParseError> {
293 debug_assert_eq!(self.bytes[self.pos], b'"');
295 self.pos += 1;
296 let start = self.pos;
297
298 while self.pos + 8 <= self.bytes.len() {
305 let w = u64::from_le_bytes(self.bytes[self.pos..self.pos + 8].try_into().unwrap());
306 let mask = string_terminator_mask(w);
307 if mask != 0 {
308 self.pos += (mask.trailing_zeros() / 8) as usize;
309 break;
310 }
311 self.pos += 8;
312 }
313
314 loop {
316 let b = match self.bytes.get(self.pos) {
317 Some(&b) => b,
318 None => return Err(self.err(ParseErrorKind::UnexpectedEof)),
319 };
320 match b {
321 b'"' => {
322 let s = &self.input[start..self.pos];
323 self.pos += 1;
324 return Ok(s);
325 }
326 b'\\' => {
327 return self.parse_string_with_escapes(start);
330 }
331 0..=0x1F => {
332 return Err(self.err(ParseErrorKind::UnexpectedByte(b)));
333 }
334 _ => self.pos += 1,
335 }
336 }
337 }
338
339 fn parse_string_with_escapes(&mut self, start: usize) -> Result<&'a str, ParseError> {
340 let mut out: BumpVec<u8> = BumpVec::with_capacity_in(self.pos - start + 16, self.arena);
341 out.extend_from_slice(&self.bytes[start..self.pos]);
342
343 loop {
344 let chunk_start = self.pos;
348 while self.pos + 8 <= self.bytes.len() {
349 let w = u64::from_le_bytes(self.bytes[self.pos..self.pos + 8].try_into().unwrap());
350 let mask = string_terminator_mask(w);
351 if mask != 0 {
352 self.pos += (mask.trailing_zeros() / 8) as usize;
353 break;
354 }
355 self.pos += 8;
356 }
357 while let Some(&b) = self.bytes.get(self.pos) {
358 if matches!(b, b'"' | b'\\') || b < 0x20 {
359 break;
360 }
361 self.pos += 1;
362 }
363 if self.pos > chunk_start {
364 out.extend_from_slice(&self.bytes[chunk_start..self.pos]);
365 }
366
367 let b = match self.bytes.get(self.pos) {
368 Some(&b) => b,
369 None => return Err(self.err(ParseErrorKind::UnexpectedEof)),
370 };
371 match b {
372 b'"' => {
373 self.pos += 1;
374 let slice = out.into_bump_slice();
375 return Ok(unsafe { core::str::from_utf8_unchecked(slice) });
379 }
380 b'\\' => {
381 self.pos += 1;
382 let esc = self.bump()?;
383 match esc {
384 b'"' => out.push(b'"'),
385 b'\\' => out.push(b'\\'),
386 b'/' => out.push(b'/'),
387 b'b' => out.push(0x08),
388 b'f' => out.push(0x0C),
389 b'n' => out.push(b'\n'),
390 b'r' => out.push(b'\r'),
391 b't' => out.push(b'\t'),
392 b'u' => {
393 let code = self.parse_hex4()?;
394 let ch = if (0xD800..=0xDBFF).contains(&code) {
396 if self.bytes.get(self.pos) != Some(&b'\\')
397 || self.bytes.get(self.pos + 1) != Some(&b'u')
398 {
399 return Err(self.err(ParseErrorKind::InvalidUnicodeEscape));
400 }
401 self.pos += 2;
402 let low = self.parse_hex4()?;
403 if !(0xDC00..=0xDFFF).contains(&low) {
404 return Err(self.err(ParseErrorKind::InvalidUnicodeEscape));
405 }
406 let scalar = 0x10000
407 + (((code - 0xD800) as u32) << 10)
408 + ((low - 0xDC00) as u32);
409 char::from_u32(scalar)
410 .ok_or_else(|| self.err(ParseErrorKind::InvalidUnicodeEscape))?
411 } else if (0xDC00..=0xDFFF).contains(&code) {
412 return Err(self.err(ParseErrorKind::InvalidUnicodeEscape));
413 } else {
414 char::from_u32(code as u32)
415 .ok_or_else(|| self.err(ParseErrorKind::InvalidUnicodeEscape))?
416 };
417 let mut buf = [0u8; 4];
418 let s = ch.encode_utf8(&mut buf);
419 out.extend_from_slice(s.as_bytes());
420 }
421 _ => return Err(self.err(ParseErrorKind::InvalidEscape)),
422 }
423 }
424 _ => return Err(self.err(ParseErrorKind::UnexpectedByte(b))),
425 }
426 }
427 }
428
429 fn parse_hex4(&mut self) -> Result<u16, ParseError> {
430 if self.pos + 4 > self.bytes.len() {
431 return Err(self.err(ParseErrorKind::InvalidUnicodeEscape));
432 }
433 let mut v: u16 = 0;
434 for _ in 0..4 {
435 let b = self.bytes[self.pos];
436 let d = match b {
437 b'0'..=b'9' => b - b'0',
438 b'a'..=b'f' => b - b'a' + 10,
439 b'A'..=b'F' => b - b'A' + 10,
440 _ => return Err(self.err(ParseErrorKind::InvalidUnicodeEscape)),
441 } as u16;
442 v = (v << 4) | d;
443 self.pos += 1;
444 }
445 Ok(v)
446 }
447
448 fn parse_array(&mut self, depth: u16) -> Result<DataValue<'a>, ParseError> {
449 debug_assert_eq!(self.bytes[self.pos], b'[');
450 self.pos += 1;
451 self.skip_ws();
452 let mut items: BumpVec<DataValue<'a>> = BumpVec::with_capacity_in(8, self.arena);
459 if let Some(&b']') = self.bytes.get(self.pos) {
460 self.pos += 1;
461 return Ok(DataValue::Array(items.into_bump_slice()));
462 }
463 loop {
464 let v = self.parse_value(depth + 1)?;
465 items.push(v);
466 match self.bytes.get(self.pos) {
470 Some(&b',') => {
471 self.pos += 1;
472 self.skip_ws();
473 }
474 Some(&b']') => {
475 self.pos += 1;
476 return Ok(DataValue::Array(items.into_bump_slice()));
477 }
478 _ => {
479 self.skip_ws();
480 match self.bump()? {
481 b',' => self.skip_ws(),
482 b']' => return Ok(DataValue::Array(items.into_bump_slice())),
483 other => return Err(self.err(ParseErrorKind::UnexpectedByte(other))),
484 }
485 }
486 }
487 }
488 }
489
490 fn parse_object(&mut self, depth: u16) -> Result<DataValue<'a>, ParseError> {
491 debug_assert_eq!(self.bytes[self.pos], b'{');
492 self.pos += 1;
493 self.skip_ws();
494 let mut pairs: BumpVec<(&'a str, DataValue<'a>)> =
503 BumpVec::with_capacity_in(32, self.arena);
504 if let Some(&b'}') = self.bytes.get(self.pos) {
505 self.pos += 1;
506 return Ok(DataValue::Object(pairs.into_bump_slice()));
507 }
508 loop {
509 if self.peek()? != b'"' {
511 return Err(self.err(ParseErrorKind::UnexpectedByte(self.bytes[self.pos])));
512 }
513 let key = self.parse_string()?;
514
515 match self.bytes.get(self.pos) {
517 Some(&b':') => self.pos += 1,
518 _ => {
519 self.skip_ws();
520 if self.bump()? != b':' {
521 return Err(
522 self.err(ParseErrorKind::UnexpectedByte(self.bytes[self.pos - 1]))
523 );
524 }
525 }
526 }
527
528 let value = self.parse_value(depth + 1)?;
530 pairs.push((key, value));
531
532 match self.bytes.get(self.pos) {
534 Some(&b',') => {
535 self.pos += 1;
536 self.skip_ws();
537 }
538 Some(&b'}') => {
539 self.pos += 1;
540 return Ok(DataValue::Object(pairs.into_bump_slice()));
541 }
542 _ => {
543 self.skip_ws();
544 match self.bump()? {
545 b',' => self.skip_ws(),
546 b'}' => return Ok(DataValue::Object(pairs.into_bump_slice())),
547 other => return Err(self.err(ParseErrorKind::UnexpectedByte(other))),
548 }
549 }
550 }
551 }
552 }
553}
554
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `s`, panicking on failure. The arena is deliberately leaked so
    /// the returned value can outlive this helper; fine for short-lived tests.
    fn parse(s: &str) -> DataValue<'_> {
        let arena = Box::leak(Box::new(Bump::new()));
        DataValue::from_str(s, arena).expect("parse")
    }

    #[test]
    fn primitives() {
        assert!(parse("null").is_null());
        assert_eq!(parse("true").as_bool(), Some(true));
        assert_eq!(parse("false").as_bool(), Some(false));
        assert_eq!(parse("0").as_i64(), Some(0));
        assert_eq!(parse("-7").as_i64(), Some(-7));
        assert_eq!(parse("3.5").as_f64(), Some(3.5));
        assert_eq!(parse("1e3").as_f64(), Some(1000.0));
        assert_eq!(parse(r#""hello""#).as_str(), Some("hello"));
    }

    #[test]
    fn integer_overflow_falls_to_float() {
        let v = parse("123456789012345678901234567890");
        assert!(v.is_f64());
    }

    #[test]
    fn i64_boundaries() {
        assert_eq!(parse("9223372036854775807").as_i64(), Some(i64::MAX));
        assert_eq!(parse("-9223372036854775808").as_i64(), Some(i64::MIN));
        // One past either end no longer fits and must fall back to float.
        assert!(parse("9223372036854775808").is_f64());
        assert!(parse("-9223372036854775809").is_f64());
    }

    #[test]
    fn empty_collections() {
        assert_eq!(parse("[]").len(), Some(0));
        assert_eq!(parse("{}").len(), Some(0));
    }

    #[test]
    fn arrays_and_objects() {
        let v = parse(r#"{"a":[1,2,3],"b":{"c":true}}"#);
        assert_eq!(v["a"][0].as_i64(), Some(1));
        assert_eq!(v["a"][2].as_i64(), Some(3));
        assert_eq!(v["b"]["c"].as_bool(), Some(true));
    }

    #[test]
    fn string_escapes() {
        assert_eq!(parse(r#""a\nb""#).as_str(), Some("a\nb"));
        assert_eq!(parse(r#""a\\b""#).as_str(), Some("a\\b"));
        // Remaining single-character escapes.
        assert_eq!(
            parse(r#""\"\/\b\f\r\t""#).as_str(),
            Some("\"/\u{8}\u{c}\r\t")
        );
        // Raw (unescaped) non-ASCII text passes through untouched.
        assert_eq!(parse(r#""é""#).as_str(), Some("é"));
        assert_eq!(parse(r#""😀""#).as_str(), Some("😀"));
        // \u escapes: a BMP code point and a surrogate pair.
        assert_eq!(parse(r#""\u00e9""#).as_str(), Some("é"));
        assert_eq!(parse(r#""\u00E9""#).as_str(), Some("é"));
        assert_eq!(parse(r#""\ud83d\ude00""#).as_str(), Some("😀"));
    }

    #[test]
    fn rejects_malformed_unicode_escapes() {
        let arena = Bump::new();
        let bad_inputs = [
            r#""\ud83d""#,       // high surrogate with nothing after it
            r#""\ud83dx""#,      // high surrogate followed by plain text
            r#""\ud83d\u0041""#, // high surrogate followed by a non-surrogate
            r#""\ude00""#,       // low surrogate on its own
            r#""\u12""#,         // too few hex digits
            r#""\u12g4""#,       // non-hex digit
        ];
        for bad in bad_inputs.iter() {
            assert!(
                DataValue::from_str(*bad, &arena).is_err(),
                "{bad} should be rejected",
            );
        }
    }

    #[test]
    fn whitespace_tolerant() {
        let v = parse(" {\n \"a\" :\t1 ,\n \"b\":2\n} ");
        assert_eq!(v["a"].as_i64(), Some(1));
        assert_eq!(v["b"].as_i64(), Some(2));
    }

    #[test]
    fn rejects_trailing_data() {
        let arena = Bump::new();
        assert!(DataValue::from_str("1 2", &arena).is_err());
    }

    #[test]
    fn rejects_bad_escape() {
        let arena = Bump::new();
        assert!(DataValue::from_str(r#""\q""#, &arena).is_err());
    }

    #[test]
    fn rejects_unescaped_control_bytes_in_string() {
        let arena = Bump::new();
        for ctl in 0u8..0x20 {
            // 16 plain bytes first, so the control byte is reached via the
            // eight-bytes-at-a-time scan rather than only the byte-wise tail.
            let mut s = Vec::from(b"\"abcdefghijklmnop");
            s.push(ctl);
            s.push(b'"');
            let input = std::str::from_utf8(&s).unwrap();
            assert!(
                DataValue::from_str(input, &arena).is_err(),
                "control byte 0x{ctl:02x} should error",
            );
        }
    }

    #[test]
    fn long_escape_string_round_trips() {
        let mut json = String::from("\"");
        // Long plain runs separated by escapes exercise the chunked copy.
        for _ in 0..10 {
            json.push_str(&"x".repeat(40));
            json.push_str(r"\n");
        }
        json.push('"');
        let arena = Bump::new();
        let v = DataValue::from_str(&json, &arena).unwrap();
        let s = v.as_str().unwrap();
        assert_eq!(s.matches('\n').count(), 10);
        assert!(s.starts_with(&"x".repeat(40)));
    }

    #[test]
    fn long_string_round_trips() {
        let s = "x".repeat(200);
        let json = format!("\"{s}\"");
        let arena = Bump::new();
        let v = DataValue::from_str(&json, &arena).unwrap();
        assert_eq!(v.as_str(), Some(s.as_str()));
    }

    #[test]
    fn deep_nesting_under_limit_ok() {
        let n = 200;
        let s = "[".repeat(n) + &"]".repeat(n);
        let arena = Bump::new();
        assert!(DataValue::from_str(&s, &arena).is_ok());
    }

    #[test]
    fn deep_nesting_over_limit_errors() {
        let n = 1000;
        let s = "[".repeat(n) + &"]".repeat(n);
        let arena = Bump::new();
        assert!(DataValue::from_str(&s, &arena).is_err());
    }
}
697}