Skip to main content

jzon/
scanner.rs

1use crate::{simd, Error};
2
3#[cold]
4#[inline]
5fn err_eof() -> Error { Error::UnexpectedEof }
6
7#[cold]
8#[inline]
9fn err_token() -> Error { Error::UnexpectedToken }
10
/// A JSON string value after parsing.
///
/// The string is either a slice that points straight into the input buffer
/// (no allocation) or a `String` that had to be built on the heap.
pub enum JsonStr<'de> {
    /// Slice of the original input; valid for the input lifetime `'de`.
    Borrowed(&'de str),
    /// Heap-allocated text owned by this value.
    Owned(String),
}

impl<'de> JsonStr<'de> {
    /// Returns the input-lifetime slice if this value is `Borrowed`, otherwise `None`.
    #[inline]
    pub fn as_borrowed(&self) -> Option<&'de str> {
        if let JsonStr::Borrowed(text) = *self {
            Some(text)
        } else {
            None
        }
    }

    /// Views the text as a `&str` regardless of which variant holds it.
    #[inline]
    pub fn as_str(&self) -> &str {
        match *self {
            JsonStr::Owned(ref heap) => heap.as_str(),
            JsonStr::Borrowed(slice) => slice,
        }
    }

    /// Converts into an owned `String`; copies the text only for the `Borrowed` variant.
    #[inline]
    pub fn into_owned(self) -> String {
        match self {
            JsonStr::Owned(heap) => heap,
            JsonStr::Borrowed(slice) => String::from(slice),
        }
    }
}
42
/// Cursor over a borrowed JSON byte buffer.
///
/// Slices handed out by the scanner point into `input` and therefore live for `'de`.
pub struct Scanner<'de> {
    /// The complete buffer being scanned.
    input: &'de [u8],
    /// Byte offset of the next unread byte in `input`.
    pos: usize,
    /// Scanner counters; the field only exists when the `stats` feature is enabled.
    #[cfg(feature = "stats")]
    pub stats: crate::stats::ScannerStats,
}
49
50impl<'de> Scanner<'de> {
51    #[inline]
52    pub fn new(input: &'de [u8]) -> Self {
53        Scanner {
54            input,
55            pos: 0,
56            #[cfg(feature = "stats")]
57            stats: crate::stats::ScannerStats::default(),
58        }
59    }
60
61    #[inline]
62    pub fn new_str(s: &'de str) -> Self {
63        Self::new(s.as_bytes())
64    }
65
66    #[inline]
67    pub fn peek_byte(&self) -> Result<u8, Error> {
68        self.input.get(self.pos).copied().ok_or_else(err_eof)
69    }
70
71    #[inline]
72    pub fn advance(&mut self) {
73        self.pos += 1;
74    }
75
76    /// Byte offset into the input slice — used by internally-tagged enum parsers to checkpoint and re-scan.
77    #[inline]
78    pub fn pos(&self) -> usize { self.pos }
79
80    #[inline]
81    pub fn set_pos(&mut self, saved_pos: usize) { self.pos = saved_pos; }
82
83    #[inline]
84    pub fn advance_by(&mut self, n: usize) {
85        self.pos += n;
86    }
87
88    /// Remaining unprocessed input — used by single-pass float parsers (`fast_float2::parse_partial`).
89    #[inline]
90    pub fn remaining_input(&self) -> &'de [u8] {
91        &self.input[self.pos..]
92    }
93
94    #[inline]
95    pub fn expect_byte(&mut self, expected: u8) -> Result<(), Error> {
96        match self.input.get(self.pos) {
97            Some(&b) if b == expected => { self.pos += 1; Ok(()) }
98            _ => Err(err_token()),
99        }
100    }
101
102    pub fn expect_bytes(&mut self, expected: &[u8]) -> Result<(), Error> {
103        let end = self.pos + expected.len();
104        if self.input.get(self.pos..end) == Some(expected) {
105            self.pos = end;
106            Ok(())
107        } else {
108            Err(err_token())
109        }
110    }
111
112    #[inline(always)]
113    pub fn skip_whitespace(&mut self) {
114        // Fast path: compact JSON has no leading whitespace — skip the loop entirely.
115        // All structural bytes are > b' ' (32), so this correctly identifies non-whitespace.
116        if let Some(&b) = self.input.get(self.pos) {
117            if b > b' ' { return; }
118        } else {
119            return;
120        }
121        self.skip_whitespace_swar();
122    }
123
124    /// SWAR whitespace skipper — called only when the first byte IS whitespace.
125    ///
126    /// All JSON whitespace bytes (0x09, 0x0A, 0x0D, 0x20) are ≤ 0x20.
127    /// All structural bytes are > 0x20.  Trick: subtract 0x21 from a byte —
128    /// values ≤ 0x20 wrap and get their high bit set; values ≥ 0x21 do not.
129    /// Applied to all 8 bytes at once with 64-bit arithmetic.
130    ///
131    /// Not `#[cold]` — pretty-printed JSON calls this on every field separator.
132    #[inline]
133    fn skip_whitespace_swar(&mut self) {
134        while self.pos + 8 <= self.input.len() {
135            let chunk = u64::from_le_bytes(
136                self.input[self.pos..self.pos + 8].try_into().unwrap(),
137            );
138            let sub = chunk.wrapping_sub(0x2121_2121_2121_2121_u64);
139            if (sub & 0x8080_8080_8080_8080_u64) == 0x8080_8080_8080_8080_u64 {
140                self.pos += 8;
141            } else {
142                break;
143            }
144        }
145        while let Some(&b) = self.input.get(self.pos) {
146            if b > b' ' { break; }
147            self.pos += 1;
148        }
149    }
150
151    #[inline(always)]
152    pub fn peek_byte_after_ws(&mut self) -> Result<u8, Error> {
153        self.skip_whitespace();
154        self.peek_byte()
155    }
156
157    /// Read a JSON object key as a zero-copy `&'de [u8]`.
158    /// Returns `Error::EscapedKey` if the key contains backslashes.
159    pub fn read_key(&mut self) -> Result<&'de [u8], Error> {
160        self.skip_whitespace();
161        self.expect_byte(b'"')?;
162        let start = self.pos;
163        let stop = simd::find(self.input, self.pos);
164        match self.input.get(stop) {
165            Some(&b'"') => {
166                let k = &self.input[start..stop];
167                self.pos = stop + 1;
168                Ok(k)
169            }
170            Some(&b'\\') => Err(Error::EscapedKey),
171            _ => Err(err_eof()),
172        }
173    }
174
175    /// Read a JSON object key and the mandatory `:` separator in one call.
176    #[inline]
177    pub fn read_key_colon(&mut self) -> Result<&'de [u8], Error> {
178        let key = self.read_key()?;
179        // Fast path: ':' almost always immediately follows the closing '"' in compact JSON.
180        if self.input.get(self.pos) == Some(&b':') {
181            self.pos += 1;
182        } else {
183            self.skip_whitespace();
184            self.expect_byte(b':')?;
185        }
186        Ok(key)
187    }
188
189    /// Read a JSON string value.
190    ///
191    /// Returns `Borrowed(&'de str)` when no escape sequences are present
192    /// (zero allocation), or `Owned(String)` after unescaping.
193    pub fn read_str(&mut self) -> Result<JsonStr<'de>, Error> {
194        self.skip_whitespace();
195        self.expect_byte(b'"')?;
196        let start = self.pos;
197        let stop = simd::find(self.input, start);
198
199        match self.input.get(stop) {
200            Some(&b'"') => {
201                let s = core::str::from_utf8(&self.input[start..stop])
202                    .map_err(|_| Error::InvalidUtf8)?;
203                self.pos = stop + 1;
204
205                #[cfg(feature = "stats")]
206                { self.stats.zero_copy_borrows += 1; }
207
208                Ok(JsonStr::Borrowed(s))
209            }
210            Some(&b'\\') => {
211                self.pos = stop;
212                let owned = self.unescape_from(start)?;
213
214                #[cfg(feature = "stats")]
215                { self.stats.heap_allocations += 1; }
216
217                Ok(JsonStr::Owned(owned))
218            }
219            _ => Err(err_eof()),
220        }
221    }
222
223    /// Scan a JSON number and return the raw byte slice (zero-copy).
224    pub fn read_number_bytes(&mut self) -> Result<&'de [u8], Error> {
225        self.skip_whitespace();
226        let start = self.pos;
227        if self.input.get(self.pos) == Some(&b'-') { self.pos += 1; }
228
229        // SWAR digit scan: for byte b, b is b'0'..=b'9' iff (b - 0x30) is 0..=9.
230        // Two conditions: (1) sub has no high bits (rules out bytes ≥ 0xB0),
231        // (2) sub + 0x76 has no high bits (rules out sub bytes 10..=0x7F).
232        #[inline(always)]
233        fn swar_all_digits(chunk: u64) -> bool {
234            let sub = chunk.wrapping_sub(0x3030_3030_3030_3030_u64);
235            if (sub & 0x8080_8080_8080_8080_u64) != 0 { return false; }
236            let check = sub.wrapping_add(0x7676_7676_7676_7676_u64);
237            (check & 0x8080_8080_8080_8080_u64) == 0
238        }
239        while self.pos + 8 <= self.input.len() {
240            let chunk = u64::from_le_bytes(
241                self.input[self.pos..self.pos + 8].try_into().unwrap(),
242            );
243            if swar_all_digits(chunk) { self.pos += 8; } else { break; }
244        }
245        while let Some(&b) = self.input.get(self.pos) { if b.is_ascii_digit() { self.pos += 1; } else { break; } }
246        if self.input.get(self.pos) == Some(&b'.') {
247            self.pos += 1;
248            while self.pos + 8 <= self.input.len() {
249                let chunk = u64::from_le_bytes(
250                    self.input[self.pos..self.pos + 8].try_into().unwrap(),
251                );
252                if swar_all_digits(chunk) { self.pos += 8; } else { break; }
253            }
254            while let Some(&b) = self.input.get(self.pos) { if b.is_ascii_digit() { self.pos += 1; } else { break; } }
255        }
256        if matches!(self.input.get(self.pos), Some(b'e') | Some(b'E')) {
257            self.pos += 1;
258            if matches!(self.input.get(self.pos), Some(b'+') | Some(b'-')) { self.pos += 1; }
259            while let Some(&b) = self.input.get(self.pos) { if b.is_ascii_digit() { self.pos += 1; } else { break; } }
260        }
261        let end = self.pos;
262        if end == start || (end == start + 1 && self.input[start] == b'-') {
263            return Err(Error::InvalidNumber);
264        }
265
266        #[cfg(feature = "stats")]
267        { self.stats.bytes_scanned += (end - start) as u64; }
268
269        Ok(&self.input[start..end])
270    }
271
272    /// Returns true if the next (non-whitespace) bytes are `null` — does NOT consume.
273    #[inline]
274    pub fn peek_null(&mut self) -> bool {
275        self.skip_whitespace();
276        self.input.get(self.pos..self.pos + 4) == Some(b"null")
277    }
278
279    pub fn read_null(&mut self) -> Result<(), Error> {
280        self.skip_whitespace();
281        self.expect_bytes(b"null")
282    }
283
284    pub fn read_bool(&mut self) -> Result<bool, Error> {
285        self.skip_whitespace();
286        match self.input.get(self.pos) {
287            Some(&b't') => {
288                self.pos += 4;
289                if self.input.get(self.pos - 3..self.pos) == Some(b"rue") {
290                    Ok(true)
291                } else {
292                    self.pos -= 4;
293                    Err(err_token())
294                }
295            }
296            Some(&b'f') => {
297                self.pos += 5;
298                if self.input.get(self.pos - 4..self.pos) == Some(b"alse") {
299                    Ok(false)
300                } else {
301                    self.pos -= 5;
302                    Err(err_token())
303                }
304            }
305            _ => Err(err_token()),
306        }
307    }
308
309    /// Skip over any JSON value — used for unknown fields.
310    pub fn skip_value(&mut self) -> Result<(), Error> {
311        self.skip_whitespace();
312        match self.peek_byte()? {
313            b'"'              => self.skip_string(),
314            b'{'              => self.skip_object(),
315            b'['              => self.skip_array(),
316            b't'              => self.expect_bytes(b"true"),
317            b'f'              => self.expect_bytes(b"false"),
318            b'n'              => self.expect_bytes(b"null"),
319            b'-' | b'0'..=b'9' => { self.read_number_bytes()?; Ok(()) }
320            _                 => Err(err_token()),
321        }
322    }
323
324    fn skip_string(&mut self) -> Result<(), Error> {
325        self.expect_byte(b'"')?;
326        loop {
327            match self.input.get(self.pos) {
328                Some(&b'"')  => { self.pos += 1; return Ok(()); }
329                Some(&b'\\') => { self.pos += 2; }
330                Some(_)      => { self.pos += 1; }
331                None         => return Err(err_eof()),
332            }
333        }
334    }
335
336    /// Skip remaining fields of an already-opened object (cursor is just past `{`).
337    /// Used by internally-tagged enum deserialization when the variant is unknown.
338    pub fn skip_object_tail(&mut self) -> Result<(), Error> {
339        loop {
340            self.skip_whitespace();
341            match self.peek_byte()? {
342                b'}' => { self.pos += 1; return Ok(()); }
343                b'"' => {
344                    self.skip_string()?;
345                    self.skip_whitespace();
346                    self.expect_byte(b':')?;
347                    self.skip_value()?;
348                    self.skip_whitespace();
349                    match self.peek_byte()? {
350                        b',' => { self.pos += 1; }
351                        b'}' => { self.pos += 1; return Ok(()); }
352                        _ => return Err(err_token()),
353                    }
354                }
355                _ => return Err(err_token()),
356            }
357        }
358    }
359
360    fn skip_object(&mut self) -> Result<(), Error> {
361        self.expect_byte(b'{')?;
362        self.skip_whitespace();
363        if self.input.get(self.pos) == Some(&b'}') { self.pos += 1; return Ok(()); }
364        loop {
365            self.skip_string()?;
366            self.skip_whitespace();
367            self.expect_byte(b':')?;
368            self.skip_value()?;
369            self.skip_whitespace();
370            match self.peek_byte()? {
371                b',' => { self.pos += 1; self.skip_whitespace(); }
372                b'}' => { self.pos += 1; break; }
373                _    => return Err(err_token()),
374            }
375        }
376        Ok(())
377    }
378
379    fn skip_array(&mut self) -> Result<(), Error> {
380        self.expect_byte(b'[')?;
381        self.skip_whitespace();
382        if self.input.get(self.pos) == Some(&b']') { self.pos += 1; return Ok(()); }
383        loop {
384            self.skip_value()?;
385            self.skip_whitespace();
386            match self.peek_byte()? {
387                b',' => { self.pos += 1; self.skip_whitespace(); }
388                b']' => { self.pos += 1; break; }
389                _    => return Err(err_token()),
390            }
391        }
392        Ok(())
393    }
394
395    /// Unescape a JSON string whose content starts at `content_start` and whose
396    /// first backslash is at `self.pos`.  Returns the fully decoded `String`.
397    fn unescape_from(&mut self, content_start: usize) -> Result<String, Error> {
398        // Output is at most as long as the remaining input — preallocating
399        // avoids the Vec-doubling realloc chain on escape-heavy strings.
400        let mut buf: Vec<u8> =
401            Vec::with_capacity(self.input.len().saturating_sub(content_start));
402        buf.extend_from_slice(&self.input[content_start..self.pos]);
403
404        loop {
405            match self.input.get(self.pos) {
406                Some(&b'"') => { self.pos += 1; break; }
407                Some(&b'\\') => {
408                    self.pos += 1;
409                    let esc = self.input.get(self.pos).copied().ok_or_else(err_eof)?;
410                    self.pos += 1;
411                    match esc {
412                        b'"'  => buf.push(b'"'),
413                        b'\\' => buf.push(b'\\'),
414                        b'/'  => buf.push(b'/'),
415                        b'n'  => buf.push(b'\n'),
416                        b't'  => buf.push(b'\t'),
417                        b'r'  => buf.push(b'\r'),
418                        b'b'  => buf.push(0x08),
419                        b'f'  => buf.push(0x0C),
420                        b'u'  => {
421                            let hex = self.input.get(self.pos..self.pos + 4).ok_or(Error::InvalidEscape)?;
422                            let s = core::str::from_utf8(hex).map_err(|_| Error::InvalidEscape)?;
423                            let code = u32::from_str_radix(s, 16).map_err(|_| Error::InvalidEscape)?;
424                            let c = if (0xD800..=0xDBFF).contains(&code) {
425                                self.pos += 4;
426                                if self.input.get(self.pos..self.pos + 2) != Some(b"\\u") {
427                                    return Err(Error::InvalidEscape);
428                                }
429                                self.pos += 2;
430                                let lo_hex = self.input.get(self.pos..self.pos + 4).ok_or(Error::InvalidEscape)?;
431                                let lo_s = core::str::from_utf8(lo_hex).map_err(|_| Error::InvalidEscape)?;
432                                let lo = u32::from_str_radix(lo_s, 16).map_err(|_| Error::InvalidEscape)?;
433                                self.pos += 4;
434                                let combined = 0x10000 + ((code - 0xD800) << 10) + (lo - 0xDC00);
435                                char::from_u32(combined).ok_or(Error::InvalidEscape)?
436                            } else {
437                                self.pos += 4;
438                                char::from_u32(code).ok_or(Error::InvalidEscape)?
439                            };
440                            let mut tmp = [0u8; 4];
441                            buf.extend_from_slice(c.encode_utf8(&mut tmp).as_bytes());
442                            continue;
443                        }
444                        _ => return Err(Error::InvalidEscape),
445                    }
446                }
447                Some(_) => {
448                    let seg_start = self.pos;
449                    let stop = simd::find(self.input, self.pos);
450                    buf.extend_from_slice(&self.input[seg_start..stop]);
451                    self.pos = stop;
452                }
453                None => return Err(err_eof()),
454            }
455        }
456
457        String::from_utf8(buf).map_err(|_| Error::InvalidUtf8)
458    }
459}