Skip to main content

datavalue_rs/
parser.rs

1//! Bump-allocated JSON parser. See [`DataValue::from_str`] for the entry point.
2//!
3//! Strategy:
4//! - Single linear scan over the input bytes.
5//! - Strings without escape sequences are borrowed directly from the input
6//!   (zero-copy). Strings with escapes are unescaped into the arena.
7//! - Arrays/objects are accumulated in `bumpalo::collections::Vec` then
8//!   frozen into `&[..]` slices via `into_bump_slice`.
9//! - Numbers parse on the integer fast path (i64) and only fall back to f64
10//!   when a decimal point or exponent is present (or i64 overflows).
11
12use core::fmt;
13
14use bumpalo::Bump;
15use bumpalo::collections::Vec as BumpVec;
16
17use crate::number::NumberValue;
18use crate::value::DataValue;
19
/// Error produced when the input is not a valid JSON document.
///
/// `position` is the byte offset of the parser cursor at the moment the
/// error was raised.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParseError {
    /// What went wrong.
    pub kind: ParseErrorKind,
    /// Byte offset into the input associated with the failure.
    pub position: usize,
}

/// The category of a [`ParseError`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ParseErrorKind {
    /// The input ended while more bytes were required.
    UnexpectedEof,
    /// A byte appeared where the grammar does not allow it.
    UnexpectedByte(u8),
    /// A backslash was followed by a character that is not a JSON escape.
    InvalidEscape,
    /// A `\u` escape was malformed (bad hex digits or bad surrogate pairing).
    InvalidUnicodeEscape,
    /// A number literal was malformed.
    InvalidNumber,
    /// Non-whitespace bytes followed the top-level value.
    TrailingData,
    /// Arrays/objects were nested deeper than the parser allows.
    DepthLimitExceeded,
}

impl fmt::Display for ParseError {
    /// Renders `json parse error at byte N: <description>`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "json parse error at byte {}: ", self.position)?;
        // Every kind except `UnexpectedByte` maps to a fixed description.
        let description = match &self.kind {
            ParseErrorKind::UnexpectedByte(byte) => {
                return write!(
                    f,
                    "unexpected byte 0x{:02x} ({:?})",
                    byte,
                    char::from(*byte)
                );
            }
            ParseErrorKind::UnexpectedEof => "unexpected end of input",
            ParseErrorKind::InvalidEscape => "invalid string escape",
            ParseErrorKind::InvalidUnicodeEscape => "invalid \\u escape",
            ParseErrorKind::InvalidNumber => "invalid number literal",
            ParseErrorKind::TrailingData => "unexpected data after JSON value",
            ParseErrorKind::DepthLimitExceeded => "nesting depth limit exceeded",
        };
        f.write_str(description)
    }
}

impl std::error::Error for ParseError {}
55
/// Upper bound on array/object nesting accepted by the recursive-descent
/// parser. `parse_value` rejects any value whose depth is greater than this,
/// which keeps stack usage bounded on hostile input. 256 is far deeper than
/// any legitimate JSON document nests.
const MAX_DEPTH: u16 = 256;
60
/// `0x01` replicated into each of the eight byte lanes of a `u64`.
const SWAR_ONES: u64 = 0x0101_0101_0101_0101;
/// `0x80` replicated into each of the eight byte lanes of a `u64`.
const SWAR_HIGHS: u64 = 0x8080_8080_8080_8080;

/// SWAR test for bytes that end the string fast path: a double quote, a
/// backslash, or a control byte (< 0x20).
///
/// `w` holds eight input bytes loaded little-endian. The result is zero when
/// none of the lanes match. Otherwise the *lowest* set bit is the high bit
/// of the first matching lane, so `trailing_zeros() / 8` is that lane's
/// index. Bits above the first match may be spurious (the subtraction can
/// borrow into higher lanes), so only the lowest set bit should be used.
#[inline(always)]
fn string_terminator_mask(w: u64) -> u64 {
    // Classic zero-byte detector: a lane's high bit is set when the lane is
    // zero (`v - 1` borrows into the high bit while `!v` keeps it).
    let zero_lanes = |v: u64| v.wrapping_sub(SWAR_ONES) & !v;

    // XOR with the broadcast target turns "lane == target" into "lane == 0".
    let quote = w ^ (SWAR_ONES * u64::from(b'"'));
    let backslash = w ^ (SWAR_ONES * u64::from(b'\\'));
    // A byte is < 0x20 exactly when its top three bits are clear, so keeping
    // only those bits turns "lane < 0x20" into "lane == 0".
    let control = w & (SWAR_ONES * 0xE0);

    (zero_lanes(quote) | zero_lanes(backslash) | zero_lanes(control)) & SWAR_HIGHS
}
82
83impl<'a> DataValue<'a> {
84    /// Parse a JSON document into a [`DataValue`] tree allocated in `arena`.
85    ///
86    /// Strings without escape sequences are borrowed directly from `input`
87    /// (the returned tree's lifetime is the shorter of `input` and `arena`).
88    pub fn from_str(input: &'a str, arena: &'a Bump) -> Result<DataValue<'a>, ParseError> {
89        let mut p = Parser {
90            bytes: input.as_bytes(),
91            input,
92            pos: 0,
93            arena,
94        };
95        p.skip_ws();
96        let value = p.parse_value(0)?;
97        p.skip_ws();
98        if p.pos != p.bytes.len() {
99            return Err(p.err(ParseErrorKind::TrailingData));
100        }
101        Ok(value)
102    }
103}
104
105struct Parser<'a> {
106    bytes: &'a [u8],
107    input: &'a str,
108    pos: usize,
109    arena: &'a Bump,
110}
111
112impl<'a> Parser<'a> {
113    #[inline(always)]
114    fn err(&self, kind: ParseErrorKind) -> ParseError {
115        ParseError {
116            kind,
117            position: self.pos,
118        }
119    }
120
121    #[inline(always)]
122    fn peek(&self) -> Result<u8, ParseError> {
123        self.bytes
124            .get(self.pos)
125            .copied()
126            .ok_or_else(|| self.err(ParseErrorKind::UnexpectedEof))
127    }
128
129    #[inline(always)]
130    fn bump(&mut self) -> Result<u8, ParseError> {
131        let b = self.peek()?;
132        self.pos += 1;
133        Ok(b)
134    }
135
136    #[inline(always)]
137    fn skip_ws(&mut self) {
138        while self.pos < self.bytes.len() {
139            match self.bytes[self.pos] {
140                b' ' | b'\t' | b'\n' | b'\r' => self.pos += 1,
141                _ => break,
142            }
143        }
144    }
145
146    fn parse_value(&mut self, depth: u16) -> Result<DataValue<'a>, ParseError> {
147        if depth > MAX_DEPTH {
148            return Err(self.err(ParseErrorKind::DepthLimitExceeded));
149        }
150        self.skip_ws();
151        let b = self.peek()?;
152        match b {
153            b'"' => self.parse_string().map(DataValue::String),
154            b'{' => self.parse_object(depth),
155            b'[' => self.parse_array(depth),
156            b't' | b'f' => self.parse_bool(),
157            b'n' => self.parse_null(),
158            b'-' | b'0'..=b'9' => self.parse_number(),
159            other => Err(self.err(ParseErrorKind::UnexpectedByte(other))),
160        }
161    }
162
163    fn parse_null(&mut self) -> Result<DataValue<'a>, ParseError> {
164        if self.bytes.get(self.pos..self.pos + 4) == Some(b"null") {
165            self.pos += 4;
166            Ok(DataValue::Null)
167        } else {
168            Err(self.err(ParseErrorKind::UnexpectedByte(self.bytes[self.pos])))
169        }
170    }
171
172    fn parse_bool(&mut self) -> Result<DataValue<'a>, ParseError> {
173        if self.bytes.get(self.pos..self.pos + 4) == Some(b"true") {
174            self.pos += 4;
175            Ok(DataValue::Bool(true))
176        } else if self.bytes.get(self.pos..self.pos + 5) == Some(b"false") {
177            self.pos += 5;
178            Ok(DataValue::Bool(false))
179        } else {
180            Err(self.err(ParseErrorKind::UnexpectedByte(self.bytes[self.pos])))
181        }
182    }
183
184    fn parse_number(&mut self) -> Result<DataValue<'a>, ParseError> {
185        let start = self.pos;
186        let mut is_float = false;
187
188        // Accumulate the integer as a *negative* i64. This lets the magnitude
189        // reach i64::MIN without wrapping, which a positive accumulator can't.
190        // On overflow we set int_overflowed and stop accumulating; the digit
191        // scan still advances `pos` so the slice for the f64 fallback is right.
192        let neg = if self.bytes[self.pos] == b'-' {
193            self.pos += 1;
194            true
195        } else {
196            false
197        };
198        let mut acc: i64 = 0;
199        let mut int_overflowed = false;
200
201        match self.peek()? {
202            b'0' => {
203                self.pos += 1;
204            }
205            c @ b'1'..=b'9' => {
206                acc = -((c - b'0') as i64);
207                self.pos += 1;
208                // 18 digits fit in i64 unconditionally (i64::MAX ≈ 9.22 × 10^18).
209                // Beyond that we tag overflow and let the f64 fallback handle it.
210                let mut digits: u32 = 1;
211                while let Some(&d) = self.bytes.get(self.pos) {
212                    match d {
213                        b'0'..=b'9' => {
214                            if digits < 18 {
215                                acc = acc * 10 - (d - b'0') as i64;
216                                digits += 1;
217                            } else {
218                                int_overflowed = true;
219                            }
220                            self.pos += 1;
221                        }
222                        _ => break,
223                    }
224                }
225            }
226            _ => return Err(self.err(ParseErrorKind::InvalidNumber)),
227        }
228        // Fraction.
229        if let Some(&b'.') = self.bytes.get(self.pos) {
230            is_float = true;
231            self.pos += 1;
232            let frac_start = self.pos;
233            while let Some(&c) = self.bytes.get(self.pos) {
234                if c.is_ascii_digit() {
235                    self.pos += 1;
236                } else {
237                    break;
238                }
239            }
240            if self.pos == frac_start {
241                return Err(self.err(ParseErrorKind::InvalidNumber));
242            }
243        }
244        // Exponent.
245        if matches!(self.bytes.get(self.pos), Some(b'e' | b'E')) {
246            is_float = true;
247            self.pos += 1;
248            if matches!(self.bytes.get(self.pos), Some(b'+' | b'-')) {
249                self.pos += 1;
250            }
251            let exp_start = self.pos;
252            while let Some(&d) = self.bytes.get(self.pos) {
253                if d.is_ascii_digit() {
254                    self.pos += 1;
255                } else {
256                    break;
257                }
258            }
259            if self.pos == exp_start {
260                return Err(self.err(ParseErrorKind::InvalidNumber));
261            }
262        }
263
264        if !is_float && !int_overflowed {
265            // `acc` is the negative-accumulated value. If the input was
266            // negative we keep it; otherwise negate. The only failure mode is
267            // acc == i64::MIN with !neg (input "9223372036854775808"), which
268            // overflows positive i64 and falls through to f64.
269            let result = if neg { Some(acc) } else { acc.checked_neg() };
270            if let Some(i) = result {
271                return Ok(DataValue::Number(NumberValue::Integer(i)));
272            }
273        }
274
275        // fast-float2 is meaningfully faster than libcore's f64 parser on
276        // float-heavy input (the canada fixture is ~2 MB of floats). The
277        // number literal we just walked is JSON-shaped and is a strict
278        // subset of what the parser accepts.
279        let slice = &self.bytes[start..self.pos];
280        match fast_float2::parse::<f64, _>(slice) {
281            Ok(f) => Ok(DataValue::Number(NumberValue::Float(f))),
282            Err(_) => Err(ParseError {
283                kind: ParseErrorKind::InvalidNumber,
284                position: start,
285            }),
286        }
287    }
288
289    /// Parse a `"..."` string and return the resolved &str. Borrowed from
290    /// the input when there are no escape sequences; otherwise unescaped
291    /// into the arena.
292    fn parse_string(&mut self) -> Result<&'a str, ParseError> {
293        // Already at the opening quote.
294        debug_assert_eq!(self.bytes[self.pos], b'"');
295        self.pos += 1;
296        let start = self.pos;
297
298        // Bulk SWAR scan: 8 bytes at a time, looking for `"`, `\\`, or any
299        // byte < 0x20. The branch-free mask gives us the offset of the first
300        // hit within the window via trailing_zeros / 8. Inlined here rather
301        // than dispatched via a SIMD helper — the call/slice boundary cost
302        // outweighs even NEON's 16-byte stride for the typical mix of short
303        // JSON strings (object keys, IDs).
304        while self.pos + 8 <= self.bytes.len() {
305            let w = u64::from_le_bytes(self.bytes[self.pos..self.pos + 8].try_into().unwrap());
306            let mask = string_terminator_mask(w);
307            if mask != 0 {
308                self.pos += (mask.trailing_zeros() / 8) as usize;
309                break;
310            }
311            self.pos += 8;
312        }
313
314        // Tail (and post-SWAR-hit) per-byte handling.
315        loop {
316            let b = match self.bytes.get(self.pos) {
317                Some(&b) => b,
318                None => return Err(self.err(ParseErrorKind::UnexpectedEof)),
319            };
320            match b {
321                b'"' => {
322                    let s = &self.input[start..self.pos];
323                    self.pos += 1;
324                    return Ok(s);
325                }
326                b'\\' => {
327                    // Switch to slow path: copy what we have so far, then
328                    // resolve escapes one at a time.
329                    return self.parse_string_with_escapes(start);
330                }
331                0..=0x1F => {
332                    return Err(self.err(ParseErrorKind::UnexpectedByte(b)));
333                }
334                _ => self.pos += 1,
335            }
336        }
337    }
338
339    fn parse_string_with_escapes(&mut self, start: usize) -> Result<&'a str, ParseError> {
340        let mut out: BumpVec<u8> = BumpVec::with_capacity_in(self.pos - start + 16, self.arena);
341        out.extend_from_slice(&self.bytes[start..self.pos]);
342
343        loop {
344            // Bulk-copy the safe run between escapes. Same SWAR scan as the
345            // fast path, but here we copy each window into `out` in one
346            // extend_from_slice rather than pushing per byte.
347            let chunk_start = self.pos;
348            while self.pos + 8 <= self.bytes.len() {
349                let w = u64::from_le_bytes(self.bytes[self.pos..self.pos + 8].try_into().unwrap());
350                let mask = string_terminator_mask(w);
351                if mask != 0 {
352                    self.pos += (mask.trailing_zeros() / 8) as usize;
353                    break;
354                }
355                self.pos += 8;
356            }
357            while let Some(&b) = self.bytes.get(self.pos) {
358                if matches!(b, b'"' | b'\\') || b < 0x20 {
359                    break;
360                }
361                self.pos += 1;
362            }
363            if self.pos > chunk_start {
364                out.extend_from_slice(&self.bytes[chunk_start..self.pos]);
365            }
366
367            let b = match self.bytes.get(self.pos) {
368                Some(&b) => b,
369                None => return Err(self.err(ParseErrorKind::UnexpectedEof)),
370            };
371            match b {
372                b'"' => {
373                    self.pos += 1;
374                    let slice = out.into_bump_slice();
375                    // The input is &str (already valid UTF-8) and our
376                    // unescape path only ever produces valid UTF-8 byte
377                    // sequences, so this is sound.
378                    return Ok(unsafe { core::str::from_utf8_unchecked(slice) });
379                }
380                b'\\' => {
381                    self.pos += 1;
382                    let esc = self.bump()?;
383                    match esc {
384                        b'"' => out.push(b'"'),
385                        b'\\' => out.push(b'\\'),
386                        b'/' => out.push(b'/'),
387                        b'b' => out.push(0x08),
388                        b'f' => out.push(0x0C),
389                        b'n' => out.push(b'\n'),
390                        b'r' => out.push(b'\r'),
391                        b't' => out.push(b'\t'),
392                        b'u' => {
393                            let code = self.parse_hex4()?;
394                            // Handle surrogate pairs.
395                            let ch = if (0xD800..=0xDBFF).contains(&code) {
396                                if self.bytes.get(self.pos) != Some(&b'\\')
397                                    || self.bytes.get(self.pos + 1) != Some(&b'u')
398                                {
399                                    return Err(self.err(ParseErrorKind::InvalidUnicodeEscape));
400                                }
401                                self.pos += 2;
402                                let low = self.parse_hex4()?;
403                                if !(0xDC00..=0xDFFF).contains(&low) {
404                                    return Err(self.err(ParseErrorKind::InvalidUnicodeEscape));
405                                }
406                                let scalar = 0x10000
407                                    + (((code - 0xD800) as u32) << 10)
408                                    + ((low - 0xDC00) as u32);
409                                char::from_u32(scalar)
410                                    .ok_or_else(|| self.err(ParseErrorKind::InvalidUnicodeEscape))?
411                            } else if (0xDC00..=0xDFFF).contains(&code) {
412                                return Err(self.err(ParseErrorKind::InvalidUnicodeEscape));
413                            } else {
414                                char::from_u32(code as u32)
415                                    .ok_or_else(|| self.err(ParseErrorKind::InvalidUnicodeEscape))?
416                            };
417                            let mut buf = [0u8; 4];
418                            let s = ch.encode_utf8(&mut buf);
419                            out.extend_from_slice(s.as_bytes());
420                        }
421                        _ => return Err(self.err(ParseErrorKind::InvalidEscape)),
422                    }
423                }
424                _ => return Err(self.err(ParseErrorKind::UnexpectedByte(b))),
425            }
426        }
427    }
428
429    fn parse_hex4(&mut self) -> Result<u16, ParseError> {
430        if self.pos + 4 > self.bytes.len() {
431            return Err(self.err(ParseErrorKind::InvalidUnicodeEscape));
432        }
433        let mut v: u16 = 0;
434        for _ in 0..4 {
435            let b = self.bytes[self.pos];
436            let d = match b {
437                b'0'..=b'9' => b - b'0',
438                b'a'..=b'f' => b - b'a' + 10,
439                b'A'..=b'F' => b - b'A' + 10,
440                _ => return Err(self.err(ParseErrorKind::InvalidUnicodeEscape)),
441            } as u16;
442            v = (v << 4) | d;
443            self.pos += 1;
444        }
445        Ok(v)
446    }
447
448    fn parse_array(&mut self, depth: u16) -> Result<DataValue<'a>, ParseError> {
449        debug_assert_eq!(self.bytes[self.pos], b'[');
450        self.pos += 1;
451        self.skip_ws();
452        // Keep array initial capacity small (8). Larger values regress
453        // canada serialize by 2× because canada has hundreds of thousands
454        // of 2-element coordinate arrays; over-provisioned slots stay in
455        // the arena and disperse the tree, destroying serialize-traversal
456        // cache locality. The doubling cost on long arrays (twitter's
457        // 100-status array) is dwarfed by the locality cost of high cap.
458        let mut items: BumpVec<DataValue<'a>> = BumpVec::with_capacity_in(8, self.arena);
459        if let Some(&b']') = self.bytes.get(self.pos) {
460            self.pos += 1;
461            return Ok(DataValue::Array(items.into_bump_slice()));
462        }
463        loop {
464            let v = self.parse_value(depth + 1)?;
465            items.push(v);
466            // Most JSON is minified — the byte right after a value is the
467            // separator. Inspect it directly; fall back to the skip_ws +
468            // bump path only when the next byte isn't `,` or `]`.
469            match self.bytes.get(self.pos) {
470                Some(&b',') => {
471                    self.pos += 1;
472                    self.skip_ws();
473                }
474                Some(&b']') => {
475                    self.pos += 1;
476                    return Ok(DataValue::Array(items.into_bump_slice()));
477                }
478                _ => {
479                    self.skip_ws();
480                    match self.bump()? {
481                        b',' => self.skip_ws(),
482                        b']' => return Ok(DataValue::Array(items.into_bump_slice())),
483                        other => return Err(self.err(ParseErrorKind::UnexpectedByte(other))),
484                    }
485                }
486            }
487        }
488    }
489
490    fn parse_object(&mut self, depth: u16) -> Result<DataValue<'a>, ParseError> {
491        debug_assert_eq!(self.bytes[self.pos], b'{');
492        self.pos += 1;
493        self.skip_ws();
494        // Twitter status objects run ~30 keys, so 32 keeps them in their
495        // first chunk; smaller objects (citm events, 5-6 keys) leave some
496        // unused tail. We can't shrink BumpVec capacity in place inside a
497        // bump arena (the unused slots stay between this allocation and
498        // the next), so the choice is a trade-off: too high spreads the
499        // tree across the arena and tanks serialize traversal cache
500        // locality (canada serialize doubles when arrays go to cap 64);
501        // too low forces a realloc + memmove on every grow.
502        let mut pairs: BumpVec<(&'a str, DataValue<'a>)> =
503            BumpVec::with_capacity_in(32, self.arena);
504        if let Some(&b'}') = self.bytes.get(self.pos) {
505            self.pos += 1;
506            return Ok(DataValue::Object(pairs.into_bump_slice()));
507        }
508        loop {
509            // Key. After the loop entry / a `,` we already skipped WS.
510            if self.peek()? != b'"' {
511                return Err(self.err(ParseErrorKind::UnexpectedByte(self.bytes[self.pos])));
512            }
513            let key = self.parse_string()?;
514
515            // Colon. Fast path: byte right after the key is `:` (minified).
516            match self.bytes.get(self.pos) {
517                Some(&b':') => self.pos += 1,
518                _ => {
519                    self.skip_ws();
520                    if self.bump()? != b':' {
521                        return Err(
522                            self.err(ParseErrorKind::UnexpectedByte(self.bytes[self.pos - 1]))
523                        );
524                    }
525                }
526            }
527
528            // Value. parse_value skips its own leading WS; no skip_ws here.
529            let value = self.parse_value(depth + 1)?;
530            pairs.push((key, value));
531
532            // Separator. Same fast path as parse_array.
533            match self.bytes.get(self.pos) {
534                Some(&b',') => {
535                    self.pos += 1;
536                    self.skip_ws();
537                }
538                Some(&b'}') => {
539                    self.pos += 1;
540                    return Ok(DataValue::Object(pairs.into_bump_slice()));
541                }
542                _ => {
543                    self.skip_ws();
544                    match self.bump()? {
545                        b',' => self.skip_ws(),
546                        b'}' => return Ok(DataValue::Object(pairs.into_bump_slice())),
547                        other => return Err(self.err(ParseErrorKind::UnexpectedByte(other))),
548                    }
549                }
550            }
551        }
552    }
553}
554
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `s`, panicking on error. The arena is deliberately leaked so
    /// the returned tree can outlive this helper; fine for short-lived tests.
    fn parse(s: &str) -> DataValue<'_> {
        let arena = Box::leak(Box::new(Bump::new()));
        DataValue::from_str(s, arena).expect("parse")
    }

    #[test]
    fn primitives() {
        assert!(parse("null").is_null());
        assert_eq!(parse("true").as_bool(), Some(true));
        assert_eq!(parse("false").as_bool(), Some(false));
        assert_eq!(parse("0").as_i64(), Some(0));
        assert_eq!(parse("-7").as_i64(), Some(-7));
        assert_eq!(parse("3.5").as_f64(), Some(3.5));
        assert_eq!(parse("1e3").as_f64(), Some(1000.0));
        assert_eq!(parse(r#""hello""#).as_str(), Some("hello"));
    }

    #[test]
    fn integer_overflow_falls_to_float() {
        let v = parse("123456789012345678901234567890");
        assert!(v.is_f64());
    }

    #[test]
    fn i64_boundaries() {
        assert_eq!(parse("9223372036854775807").as_i64(), Some(i64::MAX));
        assert_eq!(parse("-9223372036854775808").as_i64(), Some(i64::MIN));
        // Just past i64::MAX must demote to f64, not silently wrap.
        assert!(parse("9223372036854775808").is_f64());
        // Just past i64::MIN must demote to f64.
        assert!(parse("-9223372036854775809").is_f64());
    }

    #[test]
    fn empty_collections() {
        assert_eq!(parse("[]").len(), Some(0));
        assert_eq!(parse("{}").len(), Some(0));
    }

    #[test]
    fn arrays_and_objects() {
        let v = parse(r#"{"a":[1,2,3],"b":{"c":true}}"#);
        assert_eq!(v["a"][0].as_i64(), Some(1));
        assert_eq!(v["a"][2].as_i64(), Some(3));
        assert_eq!(v["b"]["c"].as_bool(), Some(true));
    }

    #[test]
    fn string_escapes() {
        assert_eq!(parse(r#""a\nb""#).as_str(), Some("a\nb"));
        assert_eq!(parse(r#""a\\b""#).as_str(), Some("a\\b"));
        // `\uXXXX` escape inside the Basic Multilingual Plane.
        assert_eq!(parse(r#""\u00e9""#).as_str(), Some("é"));
        // Surrogate pair for U+1F600 😀
        assert_eq!(parse(r#""\ud83d\ude00""#).as_str(), Some("😀"));
        // Literal (unescaped) non-ASCII text passes through untouched.
        assert_eq!(parse(r#""é""#).as_str(), Some("é"));
        assert_eq!(parse(r#""😀""#).as_str(), Some("😀"));
    }

    #[test]
    fn whitespace_tolerant() {
        let v = parse(" {\n \"a\" :\t1 ,\n \"b\":2\n} ");
        assert_eq!(v["a"].as_i64(), Some(1));
        assert_eq!(v["b"].as_i64(), Some(2));
    }

    #[test]
    fn rejects_trailing_data() {
        let arena = Bump::new();
        assert!(DataValue::from_str("1 2", &arena).is_err());
    }

    #[test]
    fn rejects_bad_escape() {
        let arena = Bump::new();
        assert!(DataValue::from_str(r#""\q""#, &arena).is_err());
    }

    #[test]
    fn rejects_unescaped_control_bytes_in_string() {
        // The SWAR scan must still surface every control byte (0x00..=0x1F),
        // including ones that fall inside an 8-byte window after several
        // safe bytes.
        let arena = Bump::new();
        for ctl in 0u8..0x20 {
            // Pad with safe bytes so the control byte lands somewhere in
            // the bulk-scan path rather than the head.
            let mut s = Vec::from(b"\"abcdefghijklmnop");
            s.push(ctl);
            s.push(b'"');
            let input = std::str::from_utf8(&s).unwrap();
            assert!(
                DataValue::from_str(input, &arena).is_err(),
                "control byte 0x{ctl:02x} should error",
            );
        }
    }

    #[test]
    fn long_escape_string_round_trips() {
        // Force the escape slow path's SWAR loop to run several iterations
        // by interleaving long safe runs with escapes.
        let mut json = String::from("\"");
        for _ in 0..10 {
            json.push_str(&"x".repeat(40));
            json.push_str(r"\n");
        }
        json.push('"');
        let arena = Bump::new();
        let v = DataValue::from_str(&json, &arena).unwrap();
        let s = v.as_str().unwrap();
        assert_eq!(s.matches('\n').count(), 10);
        assert!(s.starts_with(&"x".repeat(40)));
    }

    #[test]
    fn long_string_round_trips() {
        // Force the SWAR loop to fire several iterations and the tail to
        // take over for the final < 8 bytes.
        let s = "x".repeat(200);
        let json = format!("\"{s}\"");
        let arena = Bump::new();
        let v = DataValue::from_str(&json, &arena).unwrap();
        assert_eq!(v.as_str(), Some(s.as_str()));
    }

    #[test]
    fn deep_nesting_under_limit_ok() {
        let n = 200;
        let s = "[".repeat(n) + &"]".repeat(n);
        let arena = Bump::new();
        assert!(DataValue::from_str(&s, &arena).is_ok());
    }

    #[test]
    fn deep_nesting_over_limit_errors() {
        let n = 1000;
        let s = "[".repeat(n) + &"]".repeat(n);
        let arena = Bump::new();
        assert!(DataValue::from_str(&s, &arena).is_err());
    }
}