1use crate::{simd, Error};
2
3#[cold]
4#[inline]
5fn err_eof() -> Error { Error::UnexpectedEof }
6
7#[cold]
8#[inline]
9fn err_token() -> Error { Error::UnexpectedToken }
10
/// A decoded JSON string that either borrows from the input or owns its text.
pub enum JsonStr<'de> {
    /// Text borrowed from the input buffer.
    Borrowed(&'de str),
    /// Text borrowed from the input buffer; `Scanner::read_str` returns this
    /// variant when the literal contained no backslash escape.
    BorrowedNoEsc(&'de str),
    /// Text that had to be copied into a fresh allocation (for example after
    /// escape sequences were decoded).
    Owned(String),
}

impl<'de> JsonStr<'de> {
    /// Returns the input-lifetime slice for either borrowed variant, or `None`
    /// when the text is heap-allocated.
    #[inline]
    pub fn as_borrowed(&self) -> Option<&'de str> {
        match *self {
            JsonStr::Borrowed(text) | JsonStr::BorrowedNoEsc(text) => Some(text),
            JsonStr::Owned(_) => None,
        }
    }

    /// Views the text as a `&str` regardless of how it is stored.
    #[inline]
    pub fn as_str(&self) -> &str {
        match *self {
            JsonStr::Owned(ref heap) => heap.as_str(),
            JsonStr::Borrowed(text) | JsonStr::BorrowedNoEsc(text) => text,
        }
    }

    /// Converts into a `String`, copying only when the text was borrowed.
    #[inline]
    pub fn into_owned(self) -> String {
        match self {
            JsonStr::Owned(heap) => heap,
            JsonStr::Borrowed(text) | JsonStr::BorrowedNoEsc(text) => String::from(text),
        }
    }
}
56
57pub struct Scanner<'de> {
58 input: &'de [u8],
59 pos: usize,
60 #[cfg(feature = "stats")]
61 pub stats: crate::stats::ScannerStats,
62}
63
64impl<'de> Scanner<'de> {
65 #[inline]
66 pub fn new(input: &'de [u8]) -> Self {
67 Scanner {
68 input,
69 pos: 0,
70 #[cfg(feature = "stats")]
71 stats: crate::stats::ScannerStats::default(),
72 }
73 }
74
75 #[inline]
76 pub fn new_str(s: &'de str) -> Self {
77 Self::new(s.as_bytes())
78 }
79
80 #[inline]
81 pub fn peek_byte(&self) -> Result<u8, Error> {
82 self.input.get(self.pos).copied().ok_or_else(err_eof)
83 }
84
85 #[inline]
86 pub fn advance(&mut self) {
87 self.pos += 1;
88 }
89
90 #[inline]
92 pub fn pos(&self) -> usize { self.pos }
93
94 #[inline]
95 pub fn set_pos(&mut self, saved_pos: usize) { self.pos = saved_pos; }
96
97 #[inline]
98 pub fn advance_by(&mut self, n: usize) {
99 self.pos += n;
100 }
101
102 #[inline]
104 pub fn remaining_input(&self) -> &'de [u8] {
105 &self.input[self.pos..]
106 }
107
108 #[inline]
109 pub fn expect_byte(&mut self, expected: u8) -> Result<(), Error> {
110 match self.input.get(self.pos) {
111 Some(&b) if b == expected => { self.pos += 1; Ok(()) }
112 _ => Err(err_token()),
113 }
114 }
115
116 pub fn expect_bytes(&mut self, expected: &[u8]) -> Result<(), Error> {
117 let end = self.pos + expected.len();
118 if self.input.get(self.pos..end) == Some(expected) {
119 self.pos = end;
120 Ok(())
121 } else {
122 Err(err_token())
123 }
124 }
125
126 #[inline(always)]
127 pub fn skip_whitespace(&mut self) {
128 if let Some(&b) = self.input.get(self.pos) {
131 if b > b' ' { return; }
132 } else {
133 return;
134 }
135 self.skip_whitespace_swar();
136 }
137
138 #[inline]
146 fn skip_whitespace_swar(&mut self) {
147 while self.pos + 8 <= self.input.len() {
152 let chunk = u64::from_le_bytes(
153 self.input[self.pos..self.pos + 8].try_into().unwrap(),
154 );
155 let sub = chunk.wrapping_sub(0x2121_2121_2121_2121_u64);
156 if (sub & 0x8080_8080_8080_8080_u64) == 0x8080_8080_8080_8080_u64 {
157 self.pos += 8;
158 } else {
159 break;
160 }
161 }
162 while let Some(&b) = self.input.get(self.pos) {
164 if b == b' ' || b == b'\t' || b == b'\n' || b == b'\r' {
165 self.pos += 1;
166 } else {
167 break;
168 }
169 }
170 }
171
172 #[inline(always)]
173 pub fn peek_byte_after_ws(&mut self) -> Result<u8, Error> {
174 self.skip_whitespace();
175 self.peek_byte()
176 }
177
178 #[inline]
181 pub fn expect_eof(&mut self) -> Result<(), Error> {
182 self.skip_whitespace();
183 if self.pos < self.input.len() {
184 Err(Error::UnexpectedToken)
185 } else {
186 Ok(())
187 }
188 }
189
190 pub fn read_key(&mut self) -> Result<&'de [u8], Error> {
193 self.skip_whitespace();
194 self.expect_byte(b'"')?;
195 let start = self.pos;
196 #[cfg(feature = "simd-intrinsics")]
201 let stop = simd::find_escape(self.input, self.pos);
202 #[cfg(not(feature = "simd-intrinsics"))]
203 let stop = simd::find(self.input, self.pos);
204
205 match self.input.get(stop) {
206 Some(&b'"') => {
207 #[cfg(not(feature = "simd-intrinsics"))]
208 if simd::has_control_char(&self.input[start..stop]) {
209 return Err(Error::InvalidEscape);
210 }
211 self.pos = stop + 1;
212 Ok(&self.input[start..stop])
213 }
214 Some(&b'\\') => Err(Error::EscapedKey),
215 Some(_) => Err(Error::InvalidEscape), _ => Err(err_eof()),
217 }
218 }
219
220 #[inline]
222 pub fn read_key_colon(&mut self) -> Result<&'de [u8], Error> {
223 let key = self.read_key()?;
224 if self.input.get(self.pos) == Some(&b':') {
226 self.pos += 1;
227 } else {
228 self.skip_whitespace();
229 self.expect_byte(b':')?;
230 }
231 Ok(key)
232 }
233
234 pub fn read_str(&mut self) -> Result<JsonStr<'de>, Error> {
240 self.skip_whitespace();
241 self.expect_byte(b'"')?;
242 let start = self.pos;
243 #[cfg(feature = "simd-intrinsics")]
244 let stop = simd::find_escape(self.input, start);
245 #[cfg(not(feature = "simd-intrinsics"))]
246 let stop = simd::find(self.input, start);
247
248 match self.input.get(stop) {
249 Some(&b'"') => {
250 #[cfg(not(feature = "simd-intrinsics"))]
251 if simd::has_control_char(&self.input[start..stop]) {
252 return Err(Error::InvalidEscape);
253 }
254 let s = core::str::from_utf8(&self.input[start..stop])
255 .map_err(|_| Error::InvalidUtf8)?;
256 self.pos = stop + 1;
257
258 #[cfg(feature = "stats")]
259 { self.stats.zero_copy_borrows += 1; }
260
261 Ok(JsonStr::BorrowedNoEsc(s))
262 }
263 Some(&b'\\') => {
264 self.pos = stop;
265 let owned = self.unescape_from(start)?;
266
267 #[cfg(feature = "stats")]
268 { self.stats.heap_allocations += 1; }
269
270 Ok(JsonStr::Owned(owned))
271 }
272 Some(_) => Err(Error::InvalidEscape), None => Err(err_eof()),
274 }
275 }
276
277 pub fn read_number_bytes(&mut self) -> Result<&'de [u8], Error> {
279 self.skip_whitespace();
280 let start = self.pos;
281 if self.input.get(self.pos) == Some(&b'-') { self.pos += 1; }
282
283 #[inline(always)]
287 fn swar_all_digits(chunk: u64) -> bool {
288 let sub = chunk.wrapping_sub(0x3030_3030_3030_3030_u64);
289 if (sub & 0x8080_8080_8080_8080_u64) != 0 { return false; }
290 let check = sub.wrapping_add(0x7676_7676_7676_7676_u64);
291 (check & 0x8080_8080_8080_8080_u64) == 0
292 }
293
294 match self.input.get(self.pos) {
297 Some(&b'0') => {
298 self.pos += 1;
299 if matches!(self.input.get(self.pos), Some(b'0'..=b'9')) {
301 return Err(Error::InvalidNumber);
302 }
303 }
304 Some(&(b'1'..=b'9')) => {
305 self.pos += 1;
306 while self.pos + 8 <= self.input.len() {
308 let chunk = u64::from_le_bytes(
309 self.input[self.pos..self.pos + 8].try_into().unwrap(),
310 );
311 if swar_all_digits(chunk) { self.pos += 8; } else { break; }
312 }
313 while let Some(&b) = self.input.get(self.pos) { if b.is_ascii_digit() { self.pos += 1; } else { break; } }
314 }
315 _ => {} }
317
318 if self.input.get(self.pos) == Some(&b'.') {
319 self.pos += 1;
320 let digits_start = self.pos;
322 while self.pos + 8 <= self.input.len() {
323 let chunk = u64::from_le_bytes(
324 self.input[self.pos..self.pos + 8].try_into().unwrap(),
325 );
326 if swar_all_digits(chunk) { self.pos += 8; } else { break; }
327 }
328 while let Some(&b) = self.input.get(self.pos) { if b.is_ascii_digit() { self.pos += 1; } else { break; } }
329 if self.pos == digits_start {
330 return Err(Error::InvalidNumber);
332 }
333 }
334 if matches!(self.input.get(self.pos), Some(b'e') | Some(b'E')) {
335 self.pos += 1;
336 if matches!(self.input.get(self.pos), Some(b'+') | Some(b'-')) { self.pos += 1; }
337 while let Some(&b) = self.input.get(self.pos) { if b.is_ascii_digit() { self.pos += 1; } else { break; } }
338 }
339 let end = self.pos;
340 if end == start || (end == start + 1 && self.input[start] == b'-') {
341 return Err(Error::InvalidNumber);
342 }
343
344 #[cfg(feature = "stats")]
345 { self.stats.bytes_scanned += (end - start) as u64; }
346
347 Ok(&self.input[start..end])
348 }
349
350 #[inline]
352 pub fn peek_null(&mut self) -> bool {
353 self.skip_whitespace();
354 self.input.get(self.pos..self.pos + 4) == Some(b"null")
355 }
356
357 pub fn read_null(&mut self) -> Result<(), Error> {
358 self.skip_whitespace();
359 self.expect_bytes(b"null")
360 }
361
362 pub fn read_bool(&mut self) -> Result<bool, Error> {
363 self.skip_whitespace();
364 match self.input.get(self.pos) {
365 Some(&b't') => {
366 self.pos += 4;
367 if self.input.get(self.pos - 3..self.pos) == Some(b"rue") {
368 Ok(true)
369 } else {
370 self.pos -= 4;
371 Err(err_token())
372 }
373 }
374 Some(&b'f') => {
375 self.pos += 5;
376 if self.input.get(self.pos - 4..self.pos) == Some(b"alse") {
377 Ok(false)
378 } else {
379 self.pos -= 5;
380 Err(err_token())
381 }
382 }
383 _ => Err(err_token()),
384 }
385 }
386
387 pub fn skip_value(&mut self) -> Result<(), Error> {
389 self.skip_whitespace();
390 match self.peek_byte()? {
391 b'"' => self.skip_string(),
392 b'{' => self.skip_object(),
393 b'[' => self.skip_array(),
394 b't' => self.expect_bytes(b"true"),
395 b'f' => self.expect_bytes(b"false"),
396 b'n' => self.expect_bytes(b"null"),
397 b'-' | b'0'..=b'9' => { self.read_number_bytes()?; Ok(()) }
398 _ => Err(err_token()),
399 }
400 }
401
402 fn skip_string(&mut self) -> Result<(), Error> {
403 self.expect_byte(b'"')?;
404 loop {
405 match self.input.get(self.pos) {
406 Some(&b'"') => { self.pos += 1; return Ok(()); }
407 Some(&b'\\') => { self.pos += 2; }
408 Some(_) => { self.pos += 1; }
409 None => return Err(err_eof()),
410 }
411 }
412 }
413
414 pub fn skip_array_tail(&mut self) -> Result<(), Error> {
418 loop {
419 self.skip_whitespace();
420 match self.peek_byte()? {
421 b']' => { self.pos += 1; return Ok(()); }
422 b',' => { self.pos += 1; self.skip_value()?; }
423 _ => { self.skip_value()?; }
424 }
425 }
426 }
427
428 pub fn skip_object_tail(&mut self) -> Result<(), Error> {
431 loop {
432 self.skip_whitespace();
433 match self.peek_byte()? {
434 b'}' => { self.pos += 1; return Ok(()); }
435 b'"' => {
436 self.skip_string()?;
437 self.skip_whitespace();
438 self.expect_byte(b':')?;
439 self.skip_value()?;
440 self.skip_whitespace();
441 match self.peek_byte()? {
442 b',' => { self.pos += 1; }
443 b'}' => { self.pos += 1; return Ok(()); }
444 _ => return Err(err_token()),
445 }
446 }
447 _ => return Err(err_token()),
448 }
449 }
450 }
451
452 fn skip_object(&mut self) -> Result<(), Error> {
453 self.expect_byte(b'{')?;
454 self.skip_whitespace();
455 if self.input.get(self.pos) == Some(&b'}') { self.pos += 1; return Ok(()); }
456 loop {
457 self.skip_string()?;
458 self.skip_whitespace();
459 self.expect_byte(b':')?;
460 self.skip_value()?;
461 self.skip_whitespace();
462 match self.peek_byte()? {
463 b',' => { self.pos += 1; self.skip_whitespace(); }
464 b'}' => { self.pos += 1; break; }
465 _ => return Err(err_token()),
466 }
467 }
468 Ok(())
469 }
470
471 fn skip_array(&mut self) -> Result<(), Error> {
472 self.expect_byte(b'[')?;
473 self.skip_whitespace();
474 if self.input.get(self.pos) == Some(&b']') { self.pos += 1; return Ok(()); }
475 loop {
476 self.skip_value()?;
477 self.skip_whitespace();
478 match self.peek_byte()? {
479 b',' => { self.pos += 1; self.skip_whitespace(); }
480 b']' => { self.pos += 1; break; }
481 _ => return Err(err_token()),
482 }
483 }
484 Ok(())
485 }
486
487 fn unescape_from(&mut self, content_start: usize) -> Result<String, Error> {
490 let mut buf: Vec<u8> =
493 Vec::with_capacity(self.input.len().saturating_sub(content_start));
494 buf.extend_from_slice(&self.input[content_start..self.pos]);
498
499 loop {
500 match self.input.get(self.pos) {
501 Some(&b'"') => { self.pos += 1; break; }
502 Some(&b'\\') => {
503 self.pos += 1;
504 let esc = self.input.get(self.pos).copied().ok_or_else(err_eof)?;
505 self.pos += 1;
506 match esc {
507 b'"' => buf.push(b'"'),
508 b'\\' => buf.push(b'\\'),
509 b'/' => buf.push(b'/'),
510 b'n' => buf.push(b'\n'),
511 b't' => buf.push(b'\t'),
512 b'r' => buf.push(b'\r'),
513 b'b' => buf.push(0x08),
514 b'f' => buf.push(0x0C),
515 b'u' => {
516 let hex = self.input.get(self.pos..self.pos + 4).ok_or(Error::InvalidEscape)?;
517 let s = core::str::from_utf8(hex).map_err(|_| Error::InvalidEscape)?;
518 let code = u32::from_str_radix(s, 16).map_err(|_| Error::InvalidEscape)?;
519 let c = if (0xD800..=0xDBFF).contains(&code) {
520 self.pos += 4;
521 if self.input.get(self.pos..self.pos + 2) != Some(b"\\u") {
522 return Err(Error::InvalidEscape);
523 }
524 self.pos += 2;
525 let lo_hex = self.input.get(self.pos..self.pos + 4).ok_or(Error::InvalidEscape)?;
526 let lo_s = core::str::from_utf8(lo_hex).map_err(|_| Error::InvalidEscape)?;
527 let lo = u32::from_str_radix(lo_s, 16).map_err(|_| Error::InvalidEscape)?;
528 self.pos += 4;
529 let combined = 0x10000 + ((code - 0xD800) << 10) + (lo - 0xDC00);
530 char::from_u32(combined).ok_or(Error::InvalidEscape)?
531 } else {
532 self.pos += 4;
533 char::from_u32(code).ok_or(Error::InvalidEscape)?
534 };
535 let mut tmp = [0u8; 4];
536 buf.extend_from_slice(c.encode_utf8(&mut tmp).as_bytes());
537 continue;
538 }
539 _ => return Err(Error::InvalidEscape),
540 }
541 }
542 Some(_) => {
543 let seg_start = self.pos;
544 let stop = simd::find_escape(self.input, self.pos);
546 match self.input.get(stop) {
547 Some(&b'"') | Some(&b'\\') => {
548 buf.extend_from_slice(&self.input[seg_start..stop]);
549 self.pos = stop;
550 }
551 Some(_) => return Err(Error::InvalidEscape), None => return Err(err_eof()),
553 }
554 }
555 None => return Err(err_eof()),
556 }
557 }
558
559 String::from_utf8(buf).map_err(|_| Error::InvalidUtf8)
560 }
561}