Skip to main content

jzon/
scanner.rs

1use crate::{simd, Error};
2
3#[cold]
4#[inline]
5fn err_eof() -> Error { Error::UnexpectedEof }
6
7#[cold]
8#[inline]
9fn err_token() -> Error { Error::UnexpectedToken }
10
/// A string value decoded from JSON input.
///
/// The borrowed variants point straight into the input buffer; `Owned`
/// carries a heap-allocated `String`.  [`Scanner::read_str`] produces
/// `BorrowedNoEsc` when the JSON text contained no escape sequences, which
/// lets the serializer skip its `find_escape` scan for that string.
pub enum JsonStr<'de> {
    /// Zero-copy borrow with no guarantee about escapes.  **Not emitted by
    /// [`Scanner::read_str`] any more** (it returns
    /// [`JsonStr::BorrowedNoEsc`]); retained for API compatibility.
    /// `ToJson` runs `write_escaped_str` on this variant.
    Borrowed(&'de str),
    /// Zero-copy borrow that is **provably escape-free**: the scanner reached
    /// the closing `"` before any backslash.  The serializer may write the
    /// bytes directly without a `find_escape` scan.
    BorrowedNoEsc(&'de str),
    /// Heap-allocated string.
    Owned(String),
}

impl<'de> JsonStr<'de> {
    /// Returns the slice with the input lifetime `'de` for either borrowed
    /// variant, or `None` when the string is `Owned`.
    #[inline]
    pub fn as_borrowed(&self) -> Option<&'de str> {
        match *self {
            JsonStr::Borrowed(slice) | JsonStr::BorrowedNoEsc(slice) => Some(slice),
            JsonStr::Owned(_) => None,
        }
    }

    /// Views the string contents regardless of variant.
    #[inline]
    pub fn as_str(&self) -> &str {
        match self {
            JsonStr::Owned(owned) => owned.as_str(),
            JsonStr::Borrowed(slice) | JsonStr::BorrowedNoEsc(slice) => slice,
        }
    }

    /// Converts into a `String`: borrowed variants are copied, `Owned` is
    /// moved out without reallocating.
    #[inline]
    pub fn into_owned(self) -> String {
        match self {
            JsonStr::Owned(owned) => owned,
            JsonStr::Borrowed(slice) | JsonStr::BorrowedNoEsc(slice) => String::from(slice),
        }
    }
}
56
/// Cursor over a borrowed JSON input buffer.
///
/// All reading methods operate on `input` starting at `pos` and move `pos`
/// forward as bytes are consumed.
pub struct Scanner<'de> {
    /// The bytes being scanned; borrowed for `'de`.
    input: &'de [u8],
    /// Byte offset into `input` of the next unread byte.
    pos: usize,
    /// Scan counters; the field only exists when the `stats` feature is enabled.
    #[cfg(feature = "stats")]
    pub stats: crate::stats::ScannerStats,
}
63
64impl<'de> Scanner<'de> {
65    #[inline]
66    pub fn new(input: &'de [u8]) -> Self {
67        Scanner {
68            input,
69            pos: 0,
70            #[cfg(feature = "stats")]
71            stats: crate::stats::ScannerStats::default(),
72        }
73    }
74
75    #[inline]
76    pub fn new_str(s: &'de str) -> Self {
77        Self::new(s.as_bytes())
78    }
79
80    #[inline]
81    pub fn peek_byte(&self) -> Result<u8, Error> {
82        self.input.get(self.pos).copied().ok_or_else(err_eof)
83    }
84
85    #[inline]
86    pub fn advance(&mut self) {
87        self.pos += 1;
88    }
89
90    /// Byte offset into the input slice — used by internally-tagged enum parsers to checkpoint and re-scan.
91    #[inline]
92    pub fn pos(&self) -> usize { self.pos }
93
94    #[inline]
95    pub fn set_pos(&mut self, saved_pos: usize) { self.pos = saved_pos; }
96
97    #[inline]
98    pub fn advance_by(&mut self, n: usize) {
99        self.pos += n;
100    }
101
102    /// Remaining unprocessed input — used by single-pass float parsers (`fast_float2::parse_partial`).
103    #[inline]
104    pub fn remaining_input(&self) -> &'de [u8] {
105        &self.input[self.pos..]
106    }
107
108    #[inline]
109    pub fn expect_byte(&mut self, expected: u8) -> Result<(), Error> {
110        match self.input.get(self.pos) {
111            Some(&b) if b == expected => { self.pos += 1; Ok(()) }
112            _ => Err(err_token()),
113        }
114    }
115
116    pub fn expect_bytes(&mut self, expected: &[u8]) -> Result<(), Error> {
117        let end = self.pos + expected.len();
118        if self.input.get(self.pos..end) == Some(expected) {
119            self.pos = end;
120            Ok(())
121        } else {
122            Err(err_token())
123        }
124    }
125
126    #[inline(always)]
127    pub fn skip_whitespace(&mut self) {
128        // Fast path: compact JSON has no leading whitespace — skip the loop entirely.
129        // All structural bytes are > b' ' (32), so this correctly identifies non-whitespace.
130        if let Some(&b) = self.input.get(self.pos) {
131            if b > b' ' { return; }
132        } else {
133            return;
134        }
135        self.skip_whitespace_swar();
136    }
137
138    /// SWAR whitespace skipper — called only when the first byte IS whitespace.
139    ///
140    /// JSON whitespace (ECMA-404 §2) is exactly: 0x09 (TAB), 0x0A (LF),
141    /// 0x0D (CR), 0x20 (SP).  VT (0x0B) and FF (0x0C) are NOT valid JSON
142    /// whitespace even though they are ≤ 0x20.
143    ///
144    /// Not `#[cold]` — pretty-printed JSON calls this on every field separator.
145    #[inline]
146    fn skip_whitespace_swar(&mut self) {
147        // 8-byte SWAR bulk scan: all 4 JSON whitespace bytes (0x09,0x0A,0x0D,0x20)
148        // are ≤ 0x20, so the "all high-bits set after sub(0x21)" trick bulk-skips
149        // them. VT(0x0B) and FF(0x0C) also satisfy this, so the byte-by-byte tail
150        // re-validates: bulk skip advances past any ≤0x20 byte, tail rejects non-WS.
151        while self.pos + 8 <= self.input.len() {
152            let chunk = u64::from_le_bytes(
153                self.input[self.pos..self.pos + 8].try_into().unwrap(),
154            );
155            let sub = chunk.wrapping_sub(0x2121_2121_2121_2121_u64);
156            if (sub & 0x8080_8080_8080_8080_u64) == 0x8080_8080_8080_8080_u64 {
157                self.pos += 8;
158            } else {
159                break;
160            }
161        }
162        // Byte-by-byte tail: precise WS test rejects VT/FF.
163        while let Some(&b) = self.input.get(self.pos) {
164            if b == b' ' || b == b'\t' || b == b'\n' || b == b'\r' {
165                self.pos += 1;
166            } else {
167                break;
168            }
169        }
170    }
171
172    #[inline(always)]
173    pub fn peek_byte_after_ws(&mut self) -> Result<u8, Error> {
174        self.skip_whitespace();
175        self.peek_byte()
176    }
177
178    /// After parsing a top-level value, skip trailing whitespace and verify
179    /// that no non-whitespace bytes remain (ECMA-404 requires a single value).
180    #[inline]
181    pub fn expect_eof(&mut self) -> Result<(), Error> {
182        self.skip_whitespace();
183        if self.pos < self.input.len() {
184            Err(Error::UnexpectedToken)
185        } else {
186            Ok(())
187        }
188    }
189
190    /// Read a JSON object key as a zero-copy `&'de [u8]`.
191    /// Returns `Error::EscapedKey` if the key contains backslashes.
192    pub fn read_key(&mut self) -> Result<&'de [u8], Error> {
193        self.skip_whitespace();
194        self.expect_byte(b'"')?;
195        let start = self.pos;
196        // When simd-intrinsics are active the find_escape kernels (NEON/AVX2)
197        // fuse quote+backslash+ctrl<0x20 in a single SIMD pass — negligible extra
198        // cost over find(). On scalar/SWAR paths, use find() + SWAR has_control_char
199        // to avoid a byte-by-byte second pass.
200        #[cfg(feature = "simd-intrinsics")]
201        let stop = simd::find_escape(self.input, self.pos);
202        #[cfg(not(feature = "simd-intrinsics"))]
203        let stop = simd::find(self.input, self.pos);
204
205        match self.input.get(stop) {
206            Some(&b'"') => {
207                #[cfg(not(feature = "simd-intrinsics"))]
208                if simd::has_control_char(&self.input[start..stop]) {
209                    return Err(Error::InvalidEscape);
210                }
211                self.pos = stop + 1;
212                Ok(&self.input[start..stop])
213            }
214            Some(&b'\\') => Err(Error::EscapedKey),
215            Some(_) => Err(Error::InvalidEscape), // control char from find_escape
216            _ => Err(err_eof()),
217        }
218    }
219
220    /// Read a JSON object key and the mandatory `:` separator in one call.
221    #[inline]
222    pub fn read_key_colon(&mut self) -> Result<&'de [u8], Error> {
223        let key = self.read_key()?;
224        // Fast path: ':' almost always immediately follows the closing '"' in compact JSON.
225        if self.input.get(self.pos) == Some(&b':') {
226            self.pos += 1;
227        } else {
228            self.skip_whitespace();
229            self.expect_byte(b':')?;
230        }
231        Ok(key)
232    }
233
234    /// Read a JSON string value.
235    ///
236    /// Returns [`JsonStr::BorrowedNoEsc`] when no escape sequences are present
237    /// (zero allocation, provably escape-free), or [`JsonStr::Owned`] after
238    /// unescaping.
239    pub fn read_str(&mut self) -> Result<JsonStr<'de>, Error> {
240        self.skip_whitespace();
241        self.expect_byte(b'"')?;
242        let start = self.pos;
243        #[cfg(feature = "simd-intrinsics")]
244        let stop = simd::find_escape(self.input, start);
245        #[cfg(not(feature = "simd-intrinsics"))]
246        let stop = simd::find(self.input, start);
247
248        match self.input.get(stop) {
249            Some(&b'"') => {
250                #[cfg(not(feature = "simd-intrinsics"))]
251                if simd::has_control_char(&self.input[start..stop]) {
252                    return Err(Error::InvalidEscape);
253                }
254                let s = core::str::from_utf8(&self.input[start..stop])
255                    .map_err(|_| Error::InvalidUtf8)?;
256                self.pos = stop + 1;
257
258                #[cfg(feature = "stats")]
259                { self.stats.zero_copy_borrows += 1; }
260
261                Ok(JsonStr::BorrowedNoEsc(s))
262            }
263            Some(&b'\\') => {
264                self.pos = stop;
265                let owned = self.unescape_from(start)?;
266
267                #[cfg(feature = "stats")]
268                { self.stats.heap_allocations += 1; }
269
270                Ok(JsonStr::Owned(owned))
271            }
272            Some(_) => Err(Error::InvalidEscape), // control char < 0x20
273            None => Err(err_eof()),
274        }
275    }
276
277    /// Scan a JSON number and return the raw byte slice (zero-copy).
278    pub fn read_number_bytes(&mut self) -> Result<&'de [u8], Error> {
279        self.skip_whitespace();
280        let start = self.pos;
281        if self.input.get(self.pos) == Some(&b'-') { self.pos += 1; }
282
283        // SWAR digit scan: for byte b, b is b'0'..=b'9' iff (b - 0x30) is 0..=9.
284        // Two conditions: (1) sub has no high bits (rules out bytes ≥ 0xB0),
285        // (2) sub + 0x76 has no high bits (rules out sub bytes 10..=0x7F).
286        #[inline(always)]
287        fn swar_all_digits(chunk: u64) -> bool {
288            let sub = chunk.wrapping_sub(0x3030_3030_3030_3030_u64);
289            if (sub & 0x8080_8080_8080_8080_u64) != 0 { return false; }
290            let check = sub.wrapping_add(0x7676_7676_7676_7676_u64);
291            (check & 0x8080_8080_8080_8080_u64) == 0
292        }
293
294        // Read the integer part.  If it starts with '0', the spec forbids any
295        // further digit immediately following (leading zeros like "01" are invalid).
296        match self.input.get(self.pos) {
297            Some(&b'0') => {
298                self.pos += 1;
299                // Leading zero: next byte must NOT be another digit.
300                if matches!(self.input.get(self.pos), Some(b'0'..=b'9')) {
301                    return Err(Error::InvalidNumber);
302                }
303            }
304            Some(&(b'1'..=b'9')) => {
305                self.pos += 1;
306                // Scan remaining integer digits with SWAR then byte-by-byte.
307                while self.pos + 8 <= self.input.len() {
308                    let chunk = u64::from_le_bytes(
309                        self.input[self.pos..self.pos + 8].try_into().unwrap(),
310                    );
311                    if swar_all_digits(chunk) { self.pos += 8; } else { break; }
312                }
313                while let Some(&b) = self.input.get(self.pos) { if b.is_ascii_digit() { self.pos += 1; } else { break; } }
314            }
315            _ => {} // will be caught by the end-check below
316        }
317
318        if self.input.get(self.pos) == Some(&b'.') {
319            self.pos += 1;
320            // At least one digit must follow the decimal point.
321            let digits_start = self.pos;
322            while self.pos + 8 <= self.input.len() {
323                let chunk = u64::from_le_bytes(
324                    self.input[self.pos..self.pos + 8].try_into().unwrap(),
325                );
326                if swar_all_digits(chunk) { self.pos += 8; } else { break; }
327            }
328            while let Some(&b) = self.input.get(self.pos) { if b.is_ascii_digit() { self.pos += 1; } else { break; } }
329            if self.pos == digits_start {
330                // No digit after '.': "1." is invalid JSON.
331                return Err(Error::InvalidNumber);
332            }
333        }
334        if matches!(self.input.get(self.pos), Some(b'e') | Some(b'E')) {
335            self.pos += 1;
336            if matches!(self.input.get(self.pos), Some(b'+') | Some(b'-')) { self.pos += 1; }
337            while let Some(&b) = self.input.get(self.pos) { if b.is_ascii_digit() { self.pos += 1; } else { break; } }
338        }
339        let end = self.pos;
340        if end == start || (end == start + 1 && self.input[start] == b'-') {
341            return Err(Error::InvalidNumber);
342        }
343
344        #[cfg(feature = "stats")]
345        { self.stats.bytes_scanned += (end - start) as u64; }
346
347        Ok(&self.input[start..end])
348    }
349
350    /// Returns true if the next (non-whitespace) bytes are `null` — does NOT consume.
351    #[inline]
352    pub fn peek_null(&mut self) -> bool {
353        self.skip_whitespace();
354        self.input.get(self.pos..self.pos + 4) == Some(b"null")
355    }
356
357    pub fn read_null(&mut self) -> Result<(), Error> {
358        self.skip_whitespace();
359        self.expect_bytes(b"null")
360    }
361
362    pub fn read_bool(&mut self) -> Result<bool, Error> {
363        self.skip_whitespace();
364        match self.input.get(self.pos) {
365            Some(&b't') => {
366                self.pos += 4;
367                if self.input.get(self.pos - 3..self.pos) == Some(b"rue") {
368                    Ok(true)
369                } else {
370                    self.pos -= 4;
371                    Err(err_token())
372                }
373            }
374            Some(&b'f') => {
375                self.pos += 5;
376                if self.input.get(self.pos - 4..self.pos) == Some(b"alse") {
377                    Ok(false)
378                } else {
379                    self.pos -= 5;
380                    Err(err_token())
381                }
382            }
383            _ => Err(err_token()),
384        }
385    }
386
387    /// Skip over any JSON value — used for unknown fields.
388    pub fn skip_value(&mut self) -> Result<(), Error> {
389        self.skip_whitespace();
390        match self.peek_byte()? {
391            b'"'              => self.skip_string(),
392            b'{'              => self.skip_object(),
393            b'['              => self.skip_array(),
394            b't'              => self.expect_bytes(b"true"),
395            b'f'              => self.expect_bytes(b"false"),
396            b'n'              => self.expect_bytes(b"null"),
397            b'-' | b'0'..=b'9' => { self.read_number_bytes()?; Ok(()) }
398            _                 => Err(err_token()),
399        }
400    }
401
402    fn skip_string(&mut self) -> Result<(), Error> {
403        self.expect_byte(b'"')?;
404        loop {
405            match self.input.get(self.pos) {
406                Some(&b'"')  => { self.pos += 1; return Ok(()); }
407                Some(&b'\\') => { self.pos += 2; }
408                Some(_)      => { self.pos += 1; }
409                None         => return Err(err_eof()),
410            }
411        }
412    }
413
414    /// Skip remaining array elements and the closing `]`.
415    /// Call this after a partial `SeqAccess` visit to drain any unconsumed
416    /// elements so the scanner is positioned after the `]`.
417    pub fn skip_array_tail(&mut self) -> Result<(), Error> {
418        loop {
419            self.skip_whitespace();
420            match self.peek_byte()? {
421                b']' => { self.pos += 1; return Ok(()); }
422                b',' => { self.pos += 1; self.skip_value()?; }
423                _    => { self.skip_value()?; }
424            }
425        }
426    }
427
428    /// Skip remaining fields of an already-opened object (cursor is just past `{`).
429    /// Used by internally-tagged enum deserialization when the variant is unknown.
430    pub fn skip_object_tail(&mut self) -> Result<(), Error> {
431        loop {
432            self.skip_whitespace();
433            match self.peek_byte()? {
434                b'}' => { self.pos += 1; return Ok(()); }
435                b'"' => {
436                    self.skip_string()?;
437                    self.skip_whitespace();
438                    self.expect_byte(b':')?;
439                    self.skip_value()?;
440                    self.skip_whitespace();
441                    match self.peek_byte()? {
442                        b',' => { self.pos += 1; }
443                        b'}' => { self.pos += 1; return Ok(()); }
444                        _ => return Err(err_token()),
445                    }
446                }
447                _ => return Err(err_token()),
448            }
449        }
450    }
451
452    fn skip_object(&mut self) -> Result<(), Error> {
453        self.expect_byte(b'{')?;
454        self.skip_whitespace();
455        if self.input.get(self.pos) == Some(&b'}') { self.pos += 1; return Ok(()); }
456        loop {
457            self.skip_string()?;
458            self.skip_whitespace();
459            self.expect_byte(b':')?;
460            self.skip_value()?;
461            self.skip_whitespace();
462            match self.peek_byte()? {
463                b',' => { self.pos += 1; self.skip_whitespace(); }
464                b'}' => { self.pos += 1; break; }
465                _    => return Err(err_token()),
466            }
467        }
468        Ok(())
469    }
470
471    fn skip_array(&mut self) -> Result<(), Error> {
472        self.expect_byte(b'[')?;
473        self.skip_whitespace();
474        if self.input.get(self.pos) == Some(&b']') { self.pos += 1; return Ok(()); }
475        loop {
476            self.skip_value()?;
477            self.skip_whitespace();
478            match self.peek_byte()? {
479                b',' => { self.pos += 1; self.skip_whitespace(); }
480                b']' => { self.pos += 1; break; }
481                _    => return Err(err_token()),
482            }
483        }
484        Ok(())
485    }
486
487    /// Unescape a JSON string whose content starts at `content_start` and whose
488    /// first backslash is at `self.pos`.  Returns the fully decoded `String`.
489    fn unescape_from(&mut self, content_start: usize) -> Result<String, Error> {
490        // Output is at most as long as the remaining input — preallocating
491        // avoids the Vec-doubling realloc chain on escape-heavy strings.
492        let mut buf: Vec<u8> =
493            Vec::with_capacity(self.input.len().saturating_sub(content_start));
494        // The caller (read_str) already positioned self.pos at the first `\`;
495        // find_escape already verified no control chars before that point,
496        // so the prefix is clean and we copy it directly.
497        buf.extend_from_slice(&self.input[content_start..self.pos]);
498
499        loop {
500            match self.input.get(self.pos) {
501                Some(&b'"') => { self.pos += 1; break; }
502                Some(&b'\\') => {
503                    self.pos += 1;
504                    let esc = self.input.get(self.pos).copied().ok_or_else(err_eof)?;
505                    self.pos += 1;
506                    match esc {
507                        b'"'  => buf.push(b'"'),
508                        b'\\' => buf.push(b'\\'),
509                        b'/'  => buf.push(b'/'),
510                        b'n'  => buf.push(b'\n'),
511                        b't'  => buf.push(b'\t'),
512                        b'r'  => buf.push(b'\r'),
513                        b'b'  => buf.push(0x08),
514                        b'f'  => buf.push(0x0C),
515                        b'u'  => {
516                            let hex = self.input.get(self.pos..self.pos + 4).ok_or(Error::InvalidEscape)?;
517                            let s = core::str::from_utf8(hex).map_err(|_| Error::InvalidEscape)?;
518                            let code = u32::from_str_radix(s, 16).map_err(|_| Error::InvalidEscape)?;
519                            let c = if (0xD800..=0xDBFF).contains(&code) {
520                                self.pos += 4;
521                                if self.input.get(self.pos..self.pos + 2) != Some(b"\\u") {
522                                    return Err(Error::InvalidEscape);
523                                }
524                                self.pos += 2;
525                                let lo_hex = self.input.get(self.pos..self.pos + 4).ok_or(Error::InvalidEscape)?;
526                                let lo_s = core::str::from_utf8(lo_hex).map_err(|_| Error::InvalidEscape)?;
527                                let lo = u32::from_str_radix(lo_s, 16).map_err(|_| Error::InvalidEscape)?;
528                                self.pos += 4;
529                                let combined = 0x10000 + ((code - 0xD800) << 10) + (lo - 0xDC00);
530                                char::from_u32(combined).ok_or(Error::InvalidEscape)?
531                            } else {
532                                self.pos += 4;
533                                char::from_u32(code).ok_or(Error::InvalidEscape)?
534                            };
535                            let mut tmp = [0u8; 4];
536                            buf.extend_from_slice(c.encode_utf8(&mut tmp).as_bytes());
537                            continue;
538                        }
539                        _ => return Err(Error::InvalidEscape),
540                    }
541                }
542                Some(_) => {
543                    let seg_start = self.pos;
544                    // Single pass: find_escape stops at `"`, `\`, or any byte < 0x20.
545                    let stop = simd::find_escape(self.input, self.pos);
546                    match self.input.get(stop) {
547                        Some(&b'"') | Some(&b'\\') => {
548                            buf.extend_from_slice(&self.input[seg_start..stop]);
549                            self.pos = stop;
550                        }
551                        Some(_) => return Err(Error::InvalidEscape), // control char
552                        None => return Err(err_eof()),
553                    }
554                }
555                None => return Err(err_eof()),
556            }
557        }
558
559        String::from_utf8(buf).map_err(|_| Error::InvalidUtf8)
560    }
561}