1#![doc = include_str!("../README.md")]
2
3#[cfg(feature = "serde")]
4pub mod de;
5pub mod dom;
6pub mod sax;
7
8#[cfg(feature = "serde")]
9pub use de::from_taperef;
10pub use dom::json_ref::JsonRef;
11pub use dom::{Dom, DomArrayIter, DomEntry, DomEntryKind, DomObjectIter, DomRef};
12pub use sax::Sax;
13
14use dom::DomWriter;
15
#[cfg(target_arch = "x86_64")]
/// Table of C-ABI callbacks passed (by pointer) to `parse_json_zmm_sax`.
///
/// There is one entry per SAX event. Every callback takes the opaque writer
/// pointer as its first argument; the text-carrying events additionally take a
/// pointer/length pair and `bool_val` takes the boolean value.
///
/// The struct is `#[repr(C)]`, so the declaration order below is the in-memory
/// order. NOTE(review): the foreign parser presumably indexes this table by
/// fixed offsets — confirm before reordering or inserting fields.
#[repr(C)]
struct ZmmVtab {
    null: unsafe extern "C" fn(*mut ()),
    bool_val: unsafe extern "C" fn(*mut (), bool),
    number: unsafe extern "C" fn(*mut (), *const u8, usize),
    string: unsafe extern "C" fn(*mut (), *const u8, usize),
    escaped_string: unsafe extern "C" fn(*mut (), *const u8, usize),
    key: unsafe extern "C" fn(*mut (), *const u8, usize),
    escaped_key: unsafe extern "C" fn(*mut (), *const u8, usize),
    start_object: unsafe extern "C" fn(*mut ()),
    end_object: unsafe extern "C" fn(*mut ()),
    start_array: unsafe extern "C" fn(*mut ()),
    end_array: unsafe extern "C" fn(*mut ()),
}
43
#[cfg(target_arch = "x86_64")]
/// Lifetime-erased mirror of the [`Sax`] event methods, taking raw
/// pointer/length pairs instead of `&str`.
///
/// The `zw_*` trampolines are generic over this trait so that they do not need
/// to name the `Sax` lifetime. A blanket impl in this file forwards every
/// method to the corresponding `Sax` method.
///
/// # Safety
/// All methods are `unsafe`: for the pointer-taking ones the blanket impl
/// builds a `&str` from `ptr`/`len` without validation, so the pair must
/// describe valid UTF-8 that lives as long as the writer expects.
pub(crate) trait WriterForZmm {
    unsafe fn wfz_null(&mut self);
    unsafe fn wfz_bool_val(&mut self, v: bool);
    unsafe fn wfz_number(&mut self, ptr: *const u8, len: usize);
    unsafe fn wfz_string(&mut self, ptr: *const u8, len: usize);
    unsafe fn wfz_escaped_string(&mut self, ptr: *const u8, len: usize);
    unsafe fn wfz_key(&mut self, ptr: *const u8, len: usize);
    unsafe fn wfz_escaped_key(&mut self, ptr: *const u8, len: usize);
    unsafe fn wfz_start_object(&mut self);
    unsafe fn wfz_end_object(&mut self);
    unsafe fn wfz_start_array(&mut self);
    unsafe fn wfz_end_array(&mut self);
}
70
71#[cfg(target_arch = "x86_64")]
72impl<'a, W: Sax<'a>> WriterForZmm for W {
73 unsafe fn wfz_null(&mut self) {
74 self.null()
75 }
76 unsafe fn wfz_bool_val(&mut self, v: bool) {
77 self.bool_val(v)
78 }
79 unsafe fn wfz_number(&mut self, ptr: *const u8, len: usize) {
80 let s: &'a str = unsafe {
81 std::mem::transmute(std::str::from_utf8_unchecked(std::slice::from_raw_parts(
82 ptr, len,
83 )))
84 };
85 self.number(s)
86 }
87 unsafe fn wfz_string(&mut self, ptr: *const u8, len: usize) {
88 let s: &'a str = unsafe {
89 std::mem::transmute(std::str::from_utf8_unchecked(std::slice::from_raw_parts(
90 ptr, len,
91 )))
92 };
93 self.string(s)
94 }
95 unsafe fn wfz_escaped_string(&mut self, ptr: *const u8, len: usize) {
96 let s = unsafe { std::str::from_utf8_unchecked(std::slice::from_raw_parts(ptr, len)) };
97 self.escaped_string(s)
98 }
99 unsafe fn wfz_key(&mut self, ptr: *const u8, len: usize) {
100 let s: &'a str = unsafe {
101 std::mem::transmute(std::str::from_utf8_unchecked(std::slice::from_raw_parts(
102 ptr, len,
103 )))
104 };
105 self.key(s)
106 }
107 unsafe fn wfz_escaped_key(&mut self, ptr: *const u8, len: usize) {
108 let s = unsafe { std::str::from_utf8_unchecked(std::slice::from_raw_parts(ptr, len)) };
109 self.escaped_key(s)
110 }
111 unsafe fn wfz_start_object(&mut self) {
112 self.start_object()
113 }
114 unsafe fn wfz_end_object(&mut self) {
115 self.end_object()
116 }
117 unsafe fn wfz_start_array(&mut self) {
118 self.start_array()
119 }
120 unsafe fn wfz_end_array(&mut self) {
121 self.end_array()
122 }
123}
124
125#[cfg(target_arch = "x86_64")]
126unsafe extern "C" fn zw_null<W: WriterForZmm>(data: *mut ()) {
127 unsafe { (*(data as *mut W)).wfz_null() }
128}
129#[cfg(target_arch = "x86_64")]
130unsafe extern "C" fn zw_bool_val<W: WriterForZmm>(data: *mut (), v: bool) {
131 unsafe { (*(data as *mut W)).wfz_bool_val(v) }
132}
133#[cfg(target_arch = "x86_64")]
134unsafe extern "C" fn zw_number<W: WriterForZmm>(data: *mut (), ptr: *const u8, len: usize) {
135 unsafe { (*(data as *mut W)).wfz_number(ptr, len) }
136}
137#[cfg(target_arch = "x86_64")]
138unsafe extern "C" fn zw_string<W: WriterForZmm>(data: *mut (), ptr: *const u8, len: usize) {
139 unsafe { (*(data as *mut W)).wfz_string(ptr, len) }
140}
141#[cfg(target_arch = "x86_64")]
142unsafe extern "C" fn zw_escaped_string<W: WriterForZmm>(data: *mut (), ptr: *const u8, len: usize) {
143 unsafe { (*(data as *mut W)).wfz_escaped_string(ptr, len) }
144}
145#[cfg(target_arch = "x86_64")]
146unsafe extern "C" fn zw_key<W: WriterForZmm>(data: *mut (), ptr: *const u8, len: usize) {
147 unsafe { (*(data as *mut W)).wfz_key(ptr, len) }
148}
149#[cfg(target_arch = "x86_64")]
150unsafe extern "C" fn zw_escaped_key<W: WriterForZmm>(data: *mut (), ptr: *const u8, len: usize) {
151 unsafe { (*(data as *mut W)).wfz_escaped_key(ptr, len) }
152}
153#[cfg(target_arch = "x86_64")]
154unsafe extern "C" fn zw_start_object<W: WriterForZmm>(data: *mut ()) {
155 unsafe { (*(data as *mut W)).wfz_start_object() }
156}
157#[cfg(target_arch = "x86_64")]
158unsafe extern "C" fn zw_end_object<W: WriterForZmm>(data: *mut ()) {
159 unsafe { (*(data as *mut W)).wfz_end_object() }
160}
161#[cfg(target_arch = "x86_64")]
162unsafe extern "C" fn zw_start_array<W: WriterForZmm>(data: *mut ()) {
163 unsafe { (*(data as *mut W)).wfz_start_array() }
164}
165#[cfg(target_arch = "x86_64")]
166unsafe extern "C" fn zw_end_array<W: WriterForZmm>(data: *mut ()) {
167 unsafe { (*(data as *mut W)).wfz_end_array() }
168}
169
170#[cfg(target_arch = "x86_64")]
174fn build_zmm_vtab<W: WriterForZmm>() -> ZmmVtab {
175 ZmmVtab {
176 null: zw_null::<W>,
177 bool_val: zw_bool_val::<W>,
178 number: zw_number::<W>,
179 string: zw_string::<W>,
180 escaped_string: zw_escaped_string::<W>,
181 key: zw_key::<W>,
182 escaped_key: zw_escaped_key::<W>,
183 start_object: zw_start_object::<W>,
184 end_object: zw_end_object::<W>,
185 start_array: zw_start_array::<W>,
186 end_array: zw_end_array::<W>,
187 }
188}
189
#[cfg(target_arch = "x86_64")]
// NOTE(review): `improper_ctypes` is presumably silenced because
// `DomEntry` is not an FFI-safe type — confirm against its definition.
#[allow(improper_ctypes)]
unsafe extern "C" {
    /// Foreign SAX parser.
    ///
    /// As called from `parse_with_zmm`: `src_ptr`/`src_len` describe the input
    /// text, `writer_data` is the opaque writer pointer handed back to every
    /// callback in `writer_vtab`, and `frames_buf` points at a
    /// `MAX_JSON_DEPTH`-element `FrameKind` array. A `true` return is treated
    /// by the caller as a successful parse.
    fn parse_json_zmm_sax(
        src_ptr: *const u8,
        src_len: usize,
        writer_data: *mut (),
        writer_vtab: *const ZmmVtab,
        frames_buf: *mut u8,
    ) -> bool;

    /// Foreign DOM (tape) parser.
    ///
    /// As called from `parse_to_dom_zmm`: `tape_ptr` is the start of an
    /// uninitialised `Vec<DomEntry>` buffer with room for `tape_cap` entries,
    /// `tape_len_out` and `has_escapes_out` are out-parameters, and
    /// `frames_buf`/`open_buf` are `MAX_JSON_DEPTH`-element scratch arrays.
    /// The caller interprets the return value as `0` = ok, `1` = parse error,
    /// `2` = tape overflow.
    fn parse_json_zmm_dom(
        src_ptr: *const u8,
        src_len: usize,
        tape_ptr: *mut DomEntry<'static>,
        tape_len_out: *mut usize,
        frames_buf: *mut u8,
        open_buf: *mut u64,
        has_escapes_out: *mut bool,
        tape_cap: usize,
    ) -> u8;
}
224
/// Global per-state counters, available with the `stats` feature.
///
/// The reference parser bumps one counter each time it executes a step in the
/// corresponding state (via the crate-internal `stat!` macro). All accesses use
/// relaxed atomics.
#[cfg(feature = "stats")]
pub mod stats {
    use std::sync::atomic::{AtomicU64, Ordering::Relaxed};

    pub static VALUE_WHITESPACE: AtomicU64 = AtomicU64::new(0);
    pub static STRING_CHARS: AtomicU64 = AtomicU64::new(0);
    pub static STRING_ESCAPE: AtomicU64 = AtomicU64::new(0);
    pub static KEY_CHARS: AtomicU64 = AtomicU64::new(0);
    pub static KEY_ESCAPE: AtomicU64 = AtomicU64::new(0);
    pub static KEY_END: AtomicU64 = AtomicU64::new(0);
    pub static AFTER_COLON: AtomicU64 = AtomicU64::new(0);
    pub static ATOM_CHARS: AtomicU64 = AtomicU64::new(0);
    pub static OBJECT_START: AtomicU64 = AtomicU64::new(0);
    pub static ARRAY_START: AtomicU64 = AtomicU64::new(0);
    pub static AFTER_VALUE: AtomicU64 = AtomicU64::new(0);

    /// Every counter, in declaration order.
    fn all() -> [&'static AtomicU64; 11] {
        [
            &VALUE_WHITESPACE,
            &STRING_CHARS,
            &STRING_ESCAPE,
            &KEY_CHARS,
            &KEY_ESCAPE,
            &KEY_END,
            &AFTER_COLON,
            &ATOM_CHARS,
            &OBJECT_START,
            &ARRAY_START,
            &AFTER_VALUE,
        ]
    }

    /// Sets every counter back to zero.
    pub fn reset() {
        all().iter().for_each(|counter| counter.store(0, Relaxed));
    }

    /// Plain-value copy of all counters, as returned by [`get`].
    pub struct StateStats {
        pub value_whitespace: u64,
        pub string_chars: u64,
        pub string_escape: u64,
        pub key_chars: u64,
        pub key_escape: u64,
        pub key_end: u64,
        pub after_colon: u64,
        pub atom_chars: u64,
        pub object_start: u64,
        pub array_start: u64,
        pub after_value: u64,
    }

    /// Reads every counter. Each load is independent, so the result is not an
    /// atomic snapshot across counters.
    pub fn get() -> StateStats {
        let read = |counter: &AtomicU64| counter.load(Relaxed);
        StateStats {
            value_whitespace: read(&VALUE_WHITESPACE),
            string_chars: read(&STRING_CHARS),
            string_escape: read(&STRING_ESCAPE),
            key_chars: read(&KEY_CHARS),
            key_escape: read(&KEY_ESCAPE),
            key_end: read(&KEY_END),
            after_colon: read(&AFTER_COLON),
            atom_chars: read(&ATOM_CHARS),
            object_start: read(&OBJECT_START),
            array_start: read(&ARRAY_START),
            after_value: read(&AFTER_VALUE),
        }
    }
}
297
/// Increments the given counter by one (relaxed ordering) when the `stats`
/// feature is enabled. Without the feature the generated statement is
/// configured out, so the macro expands to nothing.
macro_rules! stat {
    ($counter:path) => {
        #[cfg(feature = "stats")]
        $counter.fetch_add(1, ::std::sync::atomic::Ordering::Relaxed);
    };
}
305
/// States of the reference parser's state machine (`parse_json_impl`).
#[derive(PartialEq)]
enum State {
    /// Initial state: skipping whitespace before the top-level value.
    ValueWhitespace,

    /// Inside a string value, scanning for the next backslash or quote.
    StringChars,

    /// Inside an object key, scanning for the next backslash or quote.
    KeyChars,
    /// Key closed; skipping whitespace and expecting `:`.
    KeyEnd,
    /// After `:`; skipping whitespace and expecting a value.
    AfterColon,

    /// Inside a bare atom (number, `true`, `false`, `null`), scanning for the
    /// delimiter that ends it.
    AtomChars,

    /// The input was rejected.
    Error,

    /// After `{` or after `,` inside an object: expects a key or `}`.
    ObjectStart,

    /// After `[` or after `,` inside an array: expects a value or `]`.
    ArrayStart,

    /// After a complete value: expects `,`, `}` or `]`.
    AfterValue,
}
336
/// Kind of container on the nesting stack.
///
/// `#[repr(u8)]` with explicit discriminants: the zmm entry points pass the
/// frame array to the foreign parser as a `*mut u8`.
#[derive(Copy, Clone, PartialEq)]
#[repr(u8)]
enum FrameKind {
    /// An open `{ ... }`.
    Object = 0,
    /// An open `[ ... ]`.
    Array = 1,
}
349
/// Maximum container nesting depth. It is the length of the frame stack; the
/// reference parser rejects input that would open a deeper object or array.
pub const MAX_JSON_DEPTH: usize = 64;
352
/// Returns `true` when `s` is, in its entirety, a JSON number:
/// optional `-`, an integer part (`0` or a non-zero digit followed by digits),
/// an optional `.digits` fraction and an optional `e`/`E` exponent with an
/// optional sign. Leading zeros, a bare `-`, empty digit runs and any trailing
/// bytes are rejected.
fn is_valid_json_number(s: &[u8]) -> bool {
    /// Splits off the leading run of ASCII digits: `(run length, remainder)`.
    fn digits(bytes: &[u8]) -> (usize, &[u8]) {
        let run = bytes.iter().take_while(|b| b.is_ascii_digit()).count();
        (run, &bytes[run..])
    }

    // Optional sign.
    let unsigned = s.strip_prefix(b"-").unwrap_or(s);

    // Integer part: a lone `0`, or a non-zero digit followed by more digits.
    let after_int = match unsigned.split_first() {
        Some((b'0', tail)) => {
            if tail.first().is_some_and(u8::is_ascii_digit) {
                return false;
            }
            tail
        }
        Some((b'1'..=b'9', tail)) => digits(tail).1,
        _ => return false,
    };

    // Optional fraction: `.` followed by at least one digit.
    let after_frac = match after_int.split_first() {
        Some((b'.', tail)) => {
            let (count, rest) = digits(tail);
            if count == 0 {
                return false;
            }
            rest
        }
        _ => after_int,
    };

    // Optional exponent: `e`/`E`, optional sign, at least one digit.
    let after_exp = match after_frac.split_first() {
        Some((b'e' | b'E', tail)) => {
            let unsigned_exp = match tail.split_first() {
                Some((b'+' | b'-', rest)) => rest,
                _ => tail,
            };
            let (count, rest) = digits(unsigned_exp);
            if count == 0 {
                return false;
            }
            rest
        }
        _ => after_frac,
    };

    after_exp.is_empty()
}
407
408#[doc(hidden)]
411#[unsafe(no_mangle)]
412pub extern "C" fn is_valid_json_number_c(ptr: *const u8, len: usize) -> bool {
413 let s = unsafe { std::slice::from_raw_parts(ptr, len) };
414 is_valid_json_number(s)
415}
416
417#[doc(hidden)]
426#[cfg(target_arch = "x86_64")]
427#[unsafe(no_mangle)]
428#[inline(never)]
429pub extern "C" fn dom_unescape_to_box_str(
430 raw_ptr: *const u8,
431 raw_len: usize,
432 out_ptr: *mut *const u8,
433 out_len: *mut usize,
434) {
435 unsafe {
436 let raw = std::str::from_utf8_unchecked(std::slice::from_raw_parts(raw_ptr, raw_len));
437 let mut buf = String::new();
438 unescape_str(raw, &mut buf);
439 let boxed: Box<str> = buf.into_boxed_str();
440 let len = boxed.len();
441 let raw_out: *mut str = Box::into_raw(boxed);
442 *out_ptr = raw_out as *mut u8 as *const u8;
443 *out_len = len;
444 }
445}
446
447fn write_atom<'a, W: Sax<'a>>(s: &'a str, w: &mut W) -> bool {
448 match s {
449 "true" => {
450 w.bool_val(true);
451 true
452 }
453 "false" => {
454 w.bool_val(false);
455 true
456 }
457 "null" => {
458 w.null();
459 true
460 }
461 n => {
462 if is_valid_json_number(n.as_bytes()) {
463 w.number(n);
464 true
465 } else {
466 false
467 }
468 }
469 }
470}
471
472pub fn parse_to_dom<'a>(src: &'a str) -> Option<Dom<'a>> {
492 parse_with(src, DomWriter::new())
493}
494
495#[cfg(target_arch = "x86_64")]
525pub unsafe fn parse_to_dom_zmm<'a>(
526 src: &'a str,
527 initial_capacity: Option<usize>,
528) -> Option<Dom<'a>> {
529 const RESULT_OK: u8 = 0;
531 const RESULT_PARSE_ERROR: u8 = 1;
532 const RESULT_TAPE_OVERFLOW: u8 = 2;
533
534 let mut frames_buf = [FrameKind::Object; MAX_JSON_DEPTH];
535 let mut open_buf = [0u64; MAX_JSON_DEPTH];
536
537 let mut capacity = initial_capacity.unwrap_or_else(|| (src.len() / 4).max(2));
542
543 loop {
544 let mut tape_data: Vec<DomEntry<'a>> = Vec::with_capacity(capacity);
545 let tape_ptr = tape_data.as_mut_ptr() as *mut DomEntry<'static>;
546 let mut tape_len: usize = 0;
547 let mut has_escapes: bool = false;
548
549 let result = unsafe {
559 parse_json_zmm_dom(
560 src.as_ptr(),
561 src.len(),
562 tape_ptr,
563 &raw mut tape_len,
564 frames_buf.as_mut_ptr() as *mut u8,
565 open_buf.as_mut_ptr(),
566 &raw mut has_escapes,
567 capacity,
568 )
569 };
570
571 match result {
572 RESULT_OK => {
573 unsafe { tape_data.set_len(tape_len) };
575 return Some(Dom {
576 entries: tape_data,
577 has_escapes,
578 });
579 }
580 RESULT_PARSE_ERROR => return None,
581 RESULT_TAPE_OVERFLOW => {
582 unsafe { tape_data.set_len(tape_len) };
588 capacity = capacity.saturating_mul(2).max(capacity + 1);
589 continue;
590 }
591 _ => return None, }
593 }
594}
595
596pub fn parse_with<'a, W: Sax<'a>>(src: &'a str, writer: W) -> Option<W::Output> {
604 let mut frames_buf = [FrameKind::Object; MAX_JSON_DEPTH];
605 parse_json_impl(src, writer, &mut frames_buf)
606}
607
608#[cfg(target_arch = "x86_64")]
621pub unsafe fn parse_with_zmm<'a, W: Sax<'a>>(src: &'a str, mut writer: W) -> Option<W::Output> {
622 let vtab = build_zmm_vtab::<W>();
623 let mut frames_buf = [FrameKind::Object; MAX_JSON_DEPTH];
624 let ok = unsafe {
628 parse_json_zmm_sax(
629 src.as_ptr(),
630 src.len(),
631 &raw mut writer as *mut (),
632 &vtab,
633 frames_buf.as_mut_ptr() as *mut u8,
634 )
635 };
636 if ok { writer.finish() } else { None }
637}
638
639fn parse_json_impl<'a, W: Sax<'a>>(
640 src: &'a str,
641 mut writer: W,
642 frames_buf: &mut [FrameKind; MAX_JSON_DEPTH],
643) -> Option<W::Output> {
644 let bytes = src.as_bytes();
645 let mut frames_depth: usize = 0;
646 let mut str_start: usize = 0; let mut str_escaped = false; let mut bs_count: usize = 0; let mut atom_start: usize = 0; let mut current_key_raw: &'a str = ""; let mut current_key_escaped = false; let mut after_comma = false; let mut state = State::ValueWhitespace;
654
655 let mut pos = 0;
656 while pos < bytes.len() {
657 let chunk_len = (bytes.len() - pos).min(64);
658 let chunk = &bytes[pos..pos + chunk_len];
659 let byte_state = classify_u64(chunk);
660
661 let mut chunk_offset = 0;
662 'inner: while chunk_offset < chunk_len {
663 state = match state {
664 State::ValueWhitespace => {
665 stat!(crate::stats::VALUE_WHITESPACE);
666 let ahead = (!byte_state.whitespace) >> chunk_offset;
667 let skip = ahead.trailing_zeros() as usize;
668 chunk_offset += skip;
669 if chunk_offset >= chunk_len {
670 break 'inner;
671 }
672 let byte = chunk[chunk_offset];
673 match byte {
674 b'{' => {
675 if frames_depth >= MAX_JSON_DEPTH {
676 State::Error
677 } else {
678 frames_buf[frames_depth] = FrameKind::Object;
679 frames_depth += 1;
680 writer.start_object();
681 State::ObjectStart
682 }
683 }
684 b'[' => {
685 if frames_depth >= MAX_JSON_DEPTH {
686 State::Error
687 } else {
688 frames_buf[frames_depth] = FrameKind::Array;
689 frames_depth += 1;
690 writer.start_array();
691 State::ArrayStart
692 }
693 }
694 b'"' => {
695 str_start = pos + chunk_offset + 1;
696 str_escaped = false;
697 bs_count = 0;
698 State::StringChars
699 }
700 _ => {
701 atom_start = pos + chunk_offset;
702 State::AtomChars
703 }
704 }
705 }
706
707 State::StringChars => {
708 stat!(crate::stats::STRING_CHARS);
709 let interesting = (byte_state.backslashes | byte_state.quotes) >> chunk_offset;
713 let skip = interesting.trailing_zeros() as usize;
714 chunk_offset = (chunk_offset + skip).min(chunk_len);
715 if chunk_offset >= chunk_len {
716 break 'inner;
717 }
718 if skip > 0 {
720 bs_count = 0;
721 }
722 let byte = chunk[chunk_offset];
723 match byte {
724 b'\\' => {
725 bs_count += 1;
728 str_escaped = true;
729 State::StringChars
730 }
731 b'"' if bs_count & 1 == 1 => {
732 bs_count = 0;
734 State::StringChars
735 }
736 _ => {
737 bs_count = 0;
739 let raw = &src[str_start..pos + chunk_offset];
740 if str_escaped {
741 writer.escaped_string(raw);
742 } else {
743 writer.string(raw);
744 }
745 State::AfterValue
746 }
747 }
748 }
749
750 State::KeyChars => {
751 stat!(crate::stats::KEY_CHARS);
752 let interesting = (byte_state.backslashes | byte_state.quotes) >> chunk_offset;
753 let skip = interesting.trailing_zeros() as usize;
754 chunk_offset = (chunk_offset + skip).min(chunk_len);
755 if chunk_offset >= chunk_len {
756 break 'inner;
757 }
758 if skip > 0 {
759 bs_count = 0;
760 }
761 let byte = chunk[chunk_offset];
762 match byte {
763 b'\\' => {
764 bs_count += 1;
765 str_escaped = true;
766 State::KeyChars
767 }
768 b'"' if bs_count & 1 == 1 => {
769 bs_count = 0;
771 State::KeyChars
772 }
773 _ => {
774 bs_count = 0;
776 current_key_raw = &src[str_start..pos + chunk_offset];
777 current_key_escaped = str_escaped;
778 State::KeyEnd
779 }
780 }
781 }
782 State::KeyEnd => {
783 stat!(crate::stats::KEY_END);
784 let ahead = (!byte_state.whitespace) >> chunk_offset;
785 let skip = ahead.trailing_zeros() as usize;
786 chunk_offset += skip;
787 if chunk_offset >= chunk_len {
788 break 'inner;
789 }
790 let byte = chunk[chunk_offset];
791 match byte {
792 b':' => {
793 if current_key_escaped {
794 writer.escaped_key(current_key_raw);
795 } else {
796 writer.key(current_key_raw);
797 }
798 State::AfterColon
799 }
800 _ => State::Error,
801 }
802 }
803 State::AfterColon => {
804 stat!(crate::stats::AFTER_COLON);
805 let ahead = (!byte_state.whitespace) >> chunk_offset;
806 let skip = ahead.trailing_zeros() as usize;
807 chunk_offset += skip;
808 if chunk_offset >= chunk_len {
809 break 'inner;
810 }
811 let byte = chunk[chunk_offset];
812 match byte {
813 b'{' => {
814 if frames_depth >= MAX_JSON_DEPTH {
815 State::Error
816 } else {
817 frames_buf[frames_depth] = FrameKind::Object;
818 frames_depth += 1;
819 writer.start_object();
820 State::ObjectStart
821 }
822 }
823 b'[' => {
824 if frames_depth >= MAX_JSON_DEPTH {
825 State::Error
826 } else {
827 frames_buf[frames_depth] = FrameKind::Array;
828 frames_depth += 1;
829 writer.start_array();
830 State::ArrayStart
831 }
832 }
833 b'"' => {
834 str_start = pos + chunk_offset + 1;
835 str_escaped = false;
836 bs_count = 0;
837 State::StringChars
838 }
839 _ => {
840 atom_start = pos + chunk_offset;
841 State::AtomChars
842 }
843 }
844 }
845
846 State::AtomChars => {
847 stat!(crate::stats::ATOM_CHARS);
848 let ahead = byte_state.delimiters >> chunk_offset;
849 let skip = ahead.trailing_zeros() as usize;
850 chunk_offset += skip;
851 if chunk_offset >= chunk_len {
852 break 'inner;
853 }
854 let byte = chunk[chunk_offset];
855 if !write_atom(&src[atom_start..pos + chunk_offset], &mut writer) {
856 State::Error
857 } else {
858 match byte {
859 b'}' => {
860 if frames_depth == 0
861 || frames_buf[frames_depth - 1] != FrameKind::Object
862 {
863 State::Error
864 } else {
865 frames_depth -= 1;
866 writer.end_object();
867 State::AfterValue
868 }
869 }
870 b']' => {
871 if frames_depth == 0
872 || frames_buf[frames_depth - 1] != FrameKind::Array
873 {
874 State::Error
875 } else {
876 frames_depth -= 1;
877 writer.end_array();
878 State::AfterValue
879 }
880 }
881 b',' => {
882 if frames_depth == 0 {
883 State::Error
884 } else {
885 match frames_buf[frames_depth - 1] {
886 FrameKind::Array => {
887 after_comma = true;
888 State::ArrayStart
889 }
890 FrameKind::Object => {
891 after_comma = true;
892 State::ObjectStart
893 }
894 }
895 }
896 }
897 _ => State::AfterValue, }
899 }
900 }
901
902 State::Error => break 'inner,
903
904 State::ObjectStart => {
905 stat!(crate::stats::OBJECT_START);
906 let ahead = (!byte_state.whitespace) >> chunk_offset;
907 let skip = ahead.trailing_zeros() as usize;
908 chunk_offset += skip;
909 if chunk_offset >= chunk_len {
910 break 'inner;
911 }
912 let byte = chunk[chunk_offset];
913 match byte {
914 b'"' => {
915 after_comma = false;
916 str_start = pos + chunk_offset + 1;
917 str_escaped = false;
918 bs_count = 0;
919 State::KeyChars
920 }
921 b'}' => {
922 if after_comma {
923 State::Error
924 } else if frames_depth > 0
925 && frames_buf[frames_depth - 1] == FrameKind::Object
926 {
927 frames_depth -= 1;
928 writer.end_object();
929 State::AfterValue
930 } else {
931 State::Error
932 }
933 }
934 _ => State::Error,
935 }
936 }
937
938 State::ArrayStart => {
939 stat!(crate::stats::ARRAY_START);
940 let ahead = (!byte_state.whitespace) >> chunk_offset;
941 let skip = ahead.trailing_zeros() as usize;
942 chunk_offset += skip;
943 if chunk_offset >= chunk_len {
944 break 'inner;
945 }
946 let byte = chunk[chunk_offset];
947 match byte {
948 b']' => {
949 if after_comma {
950 State::Error
951 } else if frames_depth > 0
952 && frames_buf[frames_depth - 1] == FrameKind::Array
953 {
954 frames_depth -= 1;
955 writer.end_array();
956 State::AfterValue
957 } else {
958 State::Error
959 }
960 }
961 b'{' => {
962 after_comma = false;
963 if frames_depth >= MAX_JSON_DEPTH {
964 State::Error
965 } else {
966 frames_buf[frames_depth] = FrameKind::Object;
967 frames_depth += 1;
968 writer.start_object();
969 State::ObjectStart
970 }
971 }
972 b'[' => {
973 after_comma = false;
974 if frames_depth >= MAX_JSON_DEPTH {
975 State::Error
976 } else {
977 frames_buf[frames_depth] = FrameKind::Array;
978 frames_depth += 1;
979 writer.start_array();
980 State::ArrayStart
981 }
982 }
983 b'"' => {
984 after_comma = false;
985 str_start = pos + chunk_offset + 1;
986 str_escaped = false;
987 bs_count = 0;
988 State::StringChars
989 }
990 _ => {
991 after_comma = false;
992 atom_start = pos + chunk_offset;
993 State::AtomChars
994 }
995 }
996 }
997
998 State::AfterValue => {
999 stat!(crate::stats::AFTER_VALUE);
1000 let ahead = (!byte_state.whitespace) >> chunk_offset;
1001 let skip = ahead.trailing_zeros() as usize;
1002 chunk_offset += skip;
1003 if chunk_offset >= chunk_len {
1004 break 'inner;
1005 }
1006 let byte = chunk[chunk_offset];
1007 match byte {
1008 b',' => {
1009 if frames_depth == 0 {
1010 State::Error
1011 } else {
1012 match frames_buf[frames_depth - 1] {
1013 FrameKind::Object => {
1014 after_comma = true;
1015 State::ObjectStart
1016 }
1017 FrameKind::Array => {
1018 after_comma = true;
1019 State::ArrayStart
1020 }
1021 }
1022 }
1023 }
1024 b'}' => {
1025 if frames_depth > 0 && frames_buf[frames_depth - 1] == FrameKind::Object
1026 {
1027 frames_depth -= 1;
1028 writer.end_object();
1029 State::AfterValue
1030 } else {
1031 State::Error
1032 }
1033 }
1034 b']' => {
1035 if frames_depth > 0 && frames_buf[frames_depth - 1] == FrameKind::Array
1036 {
1037 frames_depth -= 1;
1038 writer.end_array();
1039 State::AfterValue
1040 } else {
1041 State::Error
1042 }
1043 }
1044 _ => State::Error,
1045 }
1046 }
1047 };
1048 chunk_offset += 1;
1049 }
1050 pos += chunk_len;
1051 }
1052
1053 if state == State::AtomChars {
1055 if !write_atom(&src[atom_start..], &mut writer) {
1056 return None;
1057 }
1058 } else if state != State::AfterValue {
1059 return None;
1060 }
1061
1062 if state == State::Error {
1063 return None;
1064 }
1065
1066 if frames_depth != 0 {
1068 return None;
1069 }
1070
1071 writer.finish()
1072}
1073
/// Decodes the JSON escape sequences in `s` into `out` (which is cleared first).
///
/// Handles `\"`, `\\`, `\/`, `\b`, `\f`, `\n`, `\r`, `\t` and `\uXXXX`,
/// including UTF-16 surrogate pairs. The function is deliberately lenient and
/// never panics on malformed input:
///
/// * an unknown escape such as `\q` is copied through unchanged;
/// * a `\u` that is not followed by four hex digits is dropped and the
///   following text is copied as-is;
/// * a `\uXXXX` that does not form a valid scalar value (e.g. a lone
///   surrogate) is dropped;
/// * a trailing lone backslash is dropped.
#[doc(hidden)]
#[unsafe(no_mangle)]
#[inline(never)]
pub fn unescape_str(s: &str, out: &mut String) {
    /// Parses exactly four hex digits starting at byte offset `at`.
    ///
    /// Returns `None` when fewer than four bytes remain, when the range does
    /// not end on a character boundary, or when any byte is not a hex digit
    /// (`from_str_radix` alone would also accept a leading `+`).
    fn hex4(s: &str, at: usize) -> Option<u16> {
        let digits = s.get(at..at.checked_add(4)?)?;
        if digits.bytes().all(|b| b.is_ascii_hexdigit()) {
            u16::from_str_radix(digits, 16).ok()
        } else {
            None
        }
    }

    out.clear();
    let bytes = s.as_bytes();
    let mut i = 0;
    while i < bytes.len() {
        if bytes[i] != b'\\' {
            // Copy the whole run up to the next backslash in one go. `\` is
            // ASCII, so it can never occur inside a multi-byte character and
            // both ends of the run are character boundaries.
            let run_end = bytes[i..]
                .iter()
                .position(|&b| b == b'\\')
                .map_or(bytes.len(), |n| i + n);
            out.push_str(&s[i..run_end]);
            i = run_end;
            continue;
        }

        // Skip the backslash; a trailing lone backslash is dropped.
        i += 1;
        let Some(&esc) = bytes.get(i) else {
            break;
        };

        let simple = match esc {
            b'"' => Some('"'),
            b'\\' => Some('\\'),
            b'/' => Some('/'),
            b'b' => Some('\x08'),
            b'f' => Some('\x0C'),
            b'n' => Some('\n'),
            b'r' => Some('\r'),
            b't' => Some('\t'),
            _ => None,
        };
        if let Some(ch) = simple {
            out.push(ch);
            i += 1;
            continue;
        }

        if esc != b'u' {
            // Unknown escape: keep the backslash and let the next iteration
            // copy the following character whole (it may be multi-byte).
            out.push('\\');
            continue;
        }

        // `\uXXXX`
        i += 1;
        let Some(hi) = hex4(s, i) else {
            continue;
        };
        i += 4;

        // A high surrogate directly followed by `\uXXXX` with a low surrogate
        // combines into one supplementary-plane character.
        if (0xD800..0xDC00).contains(&hi)
            && bytes.get(i) == Some(&b'\\')
            && bytes.get(i + 1) == Some(&b'u')
        {
            if let Some(lo) = hex4(s, i + 2) {
                if (0xDC00..=0xDFFF).contains(&lo) {
                    let cp = 0x1_0000u32
                        + ((u32::from(hi) - 0xD800) << 10)
                        + (u32::from(lo) - 0xDC00);
                    if let Some(ch) = char::from_u32(cp) {
                        out.push(ch);
                        i += 6;
                        continue;
                    }
                }
            }
        }

        // `from_u32` rejects lone surrogates, which are therefore dropped.
        if let Some(ch) = char::from_u32(u32::from(hi)) {
            out.push(ch);
        }
    }
}
1172
/// Per-chunk classification bitmasks produced by `classify_u64`.
///
/// Bit `n` of each mask describes byte `n` of the classified chunk.
#[repr(C)]
#[derive(Debug, PartialEq)]
pub struct ByteState {
    /// Bytes with value `<= 0x20` (this includes space, tab, LF and CR).
    whitespace: u64,
    /// Bytes equal to `"`.
    quotes: u64,
    /// Bytes equal to `\`.
    backslashes: u64,
    /// Bytes that end an atom: whitespace, `,`, `}` or `]`.
    delimiters: u64,
}

/// Classifies up to 64 bytes using portable SWAR (8 bytes per `u64`) arithmetic.
///
/// `src` must contain between 1 and 64 bytes (asserted). Shorter inputs are
/// zero-padded to 64 bytes; since `0x00 <= 0x20`, the padding positions show up
/// as set bits in `whitespace` and `delimiters`, so callers must bound their
/// scans by the real chunk length.
///
/// Every mask is exact per byte. In particular the equality test does not use
/// the borrow-propagating `(v - 0x01..) & !v & 0x80..` trick, which reports a
/// false match for a byte equal to `target ^ 1` that directly follows a real
/// match (for example `#` after `"`, or `]` after `\`).
fn classify_u64(src: &[u8]) -> ByteState {
    const HIGH: u64 = 0x8080_8080_8080_8080;
    const LOW7: u64 = 0x7f7f_7f7f_7f7f_7f7f;
    const ONES: u64 = 0x0101_0101_0101_0101;

    assert!(!src.is_empty() && src.len() <= 64);
    let mut buf = [0u8; 64];
    buf[..src.len()].copy_from_slice(src);

    /// High bit set in exactly those byte lanes of `v` that are zero.
    ///
    /// `(v & LOW7) + LOW7` sets a lane's high bit iff its low seven bits are
    /// non-zero and can never carry into the next lane; OR-ing `v` adds the
    /// lanes whose own high bit is set.
    #[inline(always)]
    fn zero_bytes(v: u64) -> u64 {
        !(((v & LOW7) + LOW7) | v) & HIGH
    }

    /// High bit set in exactly those byte lanes of `v` that equal `b`.
    #[inline(always)]
    fn eq_byte(v: u64, b: u8) -> u64 {
        zero_bytes(v ^ (u64::from(b) * ONES))
    }

    /// Gathers the eight lane high bits into one byte (lane 0 -> bit 0).
    #[inline(always)]
    fn movemask8(v: u64) -> u8 {
        ((v & HIGH).wrapping_mul(0x0002_0408_1020_4081_u64) >> 56) as u8
    }

    let mut ws = [0u8; 8];
    let mut q = [0u8; 8];
    let mut bs = [0u8; 8];
    let mut dl = [0u8; 8];

    for (i, lane_bytes) in buf.chunks_exact(8).enumerate() {
        let mut word = [0u8; 8];
        word.copy_from_slice(lane_bytes);
        let v = u64::from_le_bytes(word);

        // Whitespace: lanes <= 0x20. Adding 0x5f to the low seven bits sets the
        // high bit iff they are >= 0x21 (no inter-lane carry is possible);
        // OR-ing `v` excludes lanes >= 0x80.
        let masked = v & LOW7;
        let sum = masked.wrapping_add(0x5f5f_5f5f_5f5f_5f5f_u64);
        let w = !(sum | v) & HIGH;

        let quotes = eq_byte(v, b'"');
        let backslashes = eq_byte(v, b'\\');
        let commas = eq_byte(v, b',');
        let cl_brace = eq_byte(v, b'}');
        let cl_bracket = eq_byte(v, b']');
        let delims = w | commas | cl_brace | cl_bracket;

        ws[i] = movemask8(w);
        q[i] = movemask8(quotes);
        bs[i] = movemask8(backslashes);
        dl[i] = movemask8(delims);
    }

    ByteState {
        whitespace: u64::from_le_bytes(ws),
        quotes: u64::from_le_bytes(q),
        backslashes: u64::from_le_bytes(bs),
        delimiters: u64::from_le_bytes(dl),
    }
}
1262
/// Unit tests. Most of them cross-check the foreign "zmm" parsers against the
/// portable reference parser and therefore only build on x86_64.
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that both DOM parsers accept `src` and produce equal tapes.
    #[cfg(target_arch = "x86_64")]
    fn zmm_dom_matches(src: &str) {
        let ref_tape = parse_to_dom(src).unwrap_or_else(|| panic!("reference rejected: {src:?}"));
        let asm_tape = unsafe { parse_to_dom_zmm(src, None) }
            .unwrap_or_else(|| panic!("zmm_tape rejected: {src:?}"));
        assert_eq!(
            ref_tape.entries, asm_tape.entries,
            "tape mismatch for {src:?}"
        );
    }

    /// Asserts that the zmm DOM parser rejects `src`.
    #[cfg(target_arch = "x86_64")]
    fn zmm_dom_rejects(src: &str) {
        assert!(
            unsafe { parse_to_dom_zmm(src, None) }.is_none(),
            "zmm_tape should reject {src:?}"
        );
    }

    /// Top-level atoms, including numbers of every length from 1 to 9 digits.
    #[cfg(target_arch = "x86_64")]
    #[test]
    fn zmm_dom_atoms() {
        for src in &[
            "null",
            "true",
            "false",
            "0",
            "42",
            "-7",
            "3.14",
            "1e10",
            "-0.5e-3",
            "1",
            "12",
            "123",
            "1234",
            "12345",
            "123456",
            "1234567",
            "12345678",
            "123456789",
        ] {
            zmm_dom_matches(src);
        }
    }

    /// Top-level strings: empty, plain, simple escapes, `\u` escapes and a
    /// surrogate pair.
    #[cfg(target_arch = "x86_64")]
    #[test]
    fn zmm_dom_strings() {
        for src in &[
            r#""hello""#,
            r#""""#,
            r#""with \"escape\"""#,
            r#""newline\nand\ttab""#,
            r#""\u0041\u0042\u0043""#,
            r#""\u0000""#,
            r#""surrogate \uD83D\uDE00""#,
        ] {
            zmm_dom_matches(src);
        }
    }

    /// Flat objects, including the empty object.
    #[cfg(target_arch = "x86_64")]
    #[test]
    fn zmm_dom_simple_object() {
        zmm_dom_matches(r#"{"x":1}"#);
        zmm_dom_matches(r#"{"a":1,"b":2,"c":3}"#);
        zmm_dom_matches(r#"{}"#);
    }

    /// Flat arrays, including the empty array and mixed element types.
    #[cfg(target_arch = "x86_64")]
    #[test]
    fn zmm_dom_simple_array() {
        zmm_dom_matches(r#"[1,2,3]"#);
        zmm_dom_matches(r#"[]"#);
        zmm_dom_matches(r#"[null,true,false,"x",42]"#);
    }

    /// Nested containers.
    #[cfg(target_arch = "x86_64")]
    #[test]
    fn zmm_dom_nested() {
        zmm_dom_matches(r#"{"a":{"b":[1,true,null]}}"#);
        zmm_dom_matches(r#"[[1,[2,[3]]]]"#);
        zmm_dom_matches(r#"{"k":{"k":{"k":{}}}}"#);
        zmm_dom_matches(r#"[{"a":1},{"b":2}]"#);
    }

    /// Object keys that contain escape sequences.
    #[cfg(target_arch = "x86_64")]
    #[test]
    fn zmm_dom_escaped_keys() {
        zmm_dom_matches(r#"{"key\nname":1}"#);
        zmm_dom_matches(r#"{"key\u0041":true}"#);
        zmm_dom_matches(r#"{"a\"b":null}"#);
    }

    /// Whitespace around and between tokens.
    #[cfg(target_arch = "x86_64")]
    #[test]
    fn zmm_dom_whitespace() {
        zmm_dom_matches(" { \"x\" : 1 } ");
        zmm_dom_matches("[ 1 , 2 , 3 ]");
        zmm_dom_matches("\t\r\nnull\t\r\n");
    }

    /// Strings longer than one 64-byte chunk, with and without an escape.
    #[cfg(target_arch = "x86_64")]
    #[test]
    fn zmm_dom_long_string() {
        let long = format!(r#""{}""#, "a".repeat(200));
        zmm_dom_matches(&long);
        let long_esc = format!(r#""{}\n{}""#, "b".repeat(100), "c".repeat(100));
        zmm_dom_matches(&long_esc);
    }

    /// Inputs the zmm DOM parser must reject: empty input, unbalanced
    /// containers, a missing value and numbers with leading zeros.
    #[cfg(target_arch = "x86_64")]
    #[test]
    fn zmm_dom_reject_invalid() {
        zmm_dom_rejects("");
        zmm_dom_rejects("{");
        zmm_dom_rejects("[");
        zmm_dom_rejects("}");
        zmm_dom_rejects(r#"{"a":}"#);
        zmm_dom_rejects(r#"{"a":1"#);
        zmm_dom_rejects("01");
        zmm_dom_rejects("00");
        zmm_dom_rejects("007");
        zmm_dom_rejects("01234567");
    }

    /// Asserts that both SAX parsers accept `src` and emit the same sequence
    /// of events, recorded as text by a small logging writer.
    #[cfg(target_arch = "x86_64")]
    fn zmm_sax_matches(src: &str) {
        /// Writer that appends a textual token to a string for every event.
        #[derive(Default)]
        struct EventLog(String);

        impl<'s> Sax<'s> for EventLog {
            type Output = String;
            fn null(&mut self) {
                self.0.push_str("null;");
            }
            fn bool_val(&mut self, v: bool) {
                self.0.push_str(if v { "true;" } else { "false;" });
            }
            fn number(&mut self, s: &str) {
                self.0.push_str(s);
                self.0.push(';');
            }
            fn string(&mut self, s: &str) {
                self.0.push_str("s:");
                self.0.push_str(s);
                self.0.push(';');
            }
            fn escaped_string(&mut self, s: &str) {
                self.0.push_str("es:");
                self.0.push_str(s);
                self.0.push(';');
            }
            fn key(&mut self, s: &str) {
                self.0.push_str("k:");
                self.0.push_str(s);
                self.0.push(';');
            }
            fn escaped_key(&mut self, s: &str) {
                self.0.push_str("ek:");
                self.0.push_str(s);
                self.0.push(';');
            }
            fn start_object(&mut self) {
                self.0.push('{');
            }
            fn end_object(&mut self) {
                self.0.push('}');
            }
            fn start_array(&mut self) {
                self.0.push('[');
            }
            fn end_array(&mut self) {
                self.0.push(']');
            }
            fn finish(self) -> Option<String> {
                Some(self.0)
            }
        }

        let ref_log = parse_with(src, EventLog::default())
            .unwrap_or_else(|| panic!("reference rejected: {src:?}"));
        let asm_log = unsafe { parse_with_zmm(src, EventLog::default()) }
            .unwrap_or_else(|| panic!("parse_with_zmm rejected: {src:?}"));
        assert_eq!(ref_log, asm_log, "event log mismatch for {src:?}");
    }

    /// Escaped keys and values through the SAX path, including a key and a
    /// value that are each longer than one 64-byte chunk.
    #[cfg(target_arch = "x86_64")]
    #[test]
    fn zmm_sax_escaped_strings() {
        zmm_sax_matches(r#"{"key":"\n\t\r\""}"#);
        zmm_sax_matches(r#"{"key\nname":"val\u0041"}"#);
        zmm_sax_matches(r#"["\u0041","\u0042\u0043"]"#);
        zmm_sax_matches(r#"{"a\"b":"c\"d"}"#);
        let long = format!(r#"{{"{}\n":"{}\t"}}"#, "x".repeat(70), "y".repeat(70));
        zmm_sax_matches(&long);
    }

    /// Reference parser: an even run of backslashes before a quote does not
    /// escape it, an odd run does.
    #[test]
    fn rust_even_backslash_before_quote() {
        use crate::JsonRef;
        let t = parse_to_dom(r#"{"k":"\\"}"#).expect("parse failed");
        assert_eq!(t.root().get("k").as_str(), Some("\\"));
        let t = parse_to_dom(r#"{"k":"\\\\"}"#).expect("parse failed");
        assert_eq!(t.root().get("k").as_str(), Some("\\\\"));
        let t = parse_to_dom(r#"["\\"]"#).expect("parse failed");
        assert_eq!(t.root().index_at(0).as_str(), Some("\\"));
        let t = parse_to_dom(r#"{"k":"abc\\"}"#).expect("parse failed");
        assert_eq!(t.root().get("k").as_str(), Some("abc\\"));
        let t = parse_to_dom("{\"k\":\"\\\\\\\"\"}").expect("parse failed");
        assert_eq!(t.root().get("k").as_str(), Some("\\\""));
    }

    /// Starts the zmm DOM parser with a tape of only 4 entries so that it has
    /// to go through the overflow-and-retry path to parse 200 objects.
    #[cfg(target_arch = "x86_64")]
    #[test]
    fn zmm_dom_overflow_retry() {
        let big: String = {
            let mut s = String::from("[");
            for i in 0..200u32 {
                if i > 0 {
                    s.push(',');
                }
                s.push_str(&format!(r#"{{"k":{i}}}"#));
            }
            s.push(']');
            s
        };
        let tape =
            unsafe { parse_to_dom_zmm(&big, Some(4)) }.expect("overflow retry should succeed");
        assert_eq!(tape.root().unwrap().array_iter().unwrap().count(), 200);
    }
}