#![doc = include_str!("../README.md")]
#[cfg(feature = "serde")]
pub mod de;
pub mod dom;
pub mod sax;
#[cfg(feature = "serde")]
pub use de::from_taperef;
pub use dom::json_ref::JsonRef;
pub use dom::{Dom, DomArrayIter, DomEntry, DomEntryKind, DomObjectIter, DomRef};
pub use sax::Sax;
use dom::DomWriter;
/// Table of C-ABI callbacks handed to the external SAX parser
/// (`parse_json_zmm_sax`), one entry per [`Sax`] event.
///
/// Every callback takes the opaque writer pointer as its first argument; the
/// string-like callbacks additionally take a pointer/length pair.
///
/// NOTE(review): `#[repr(C)]` fixes the field order, so the external routine
/// presumably relies on this exact layout — confirm before reordering fields.
#[cfg(target_arch = "x86_64")]
#[repr(C)]
struct ZmmVtab {
// Scalar events.
null: unsafe extern "C" fn(*mut ()),
bool_val: unsafe extern "C" fn(*mut (), bool),
// Text events: (writer, pointer to first byte, length in bytes).
number: unsafe extern "C" fn(*mut (), *const u8, usize),
string: unsafe extern "C" fn(*mut (), *const u8, usize),
escaped_string: unsafe extern "C" fn(*mut (), *const u8, usize),
key: unsafe extern "C" fn(*mut (), *const u8, usize),
escaped_key: unsafe extern "C" fn(*mut (), *const u8, usize),
// Container open/close events.
start_object: unsafe extern "C" fn(*mut ()),
end_object: unsafe extern "C" fn(*mut ()),
start_array: unsafe extern "C" fn(*mut ()),
end_array: unsafe extern "C" fn(*mut ()),
}
/// Raw-pointer flavoured mirror of [`Sax`], used by the `zw_*` trampolines.
///
/// Each method corresponds to the `Sax` method of the same name (minus the
/// `wfz_` prefix). Text arguments arrive as a pointer/length pair instead of a
/// `&str`, which is why every method is `unsafe`.
#[cfg(target_arch = "x86_64")]
pub(crate) trait WriterForZmm {
unsafe fn wfz_null(&mut self);
unsafe fn wfz_bool_val(&mut self, v: bool);
// For the pointer/length methods the implementation in this file builds a
// `&str` with `from_utf8_unchecked`, so the bytes must be readable, valid UTF-8.
unsafe fn wfz_number(&mut self, ptr: *const u8, len: usize);
unsafe fn wfz_string(&mut self, ptr: *const u8, len: usize);
unsafe fn wfz_escaped_string(&mut self, ptr: *const u8, len: usize);
unsafe fn wfz_key(&mut self, ptr: *const u8, len: usize);
unsafe fn wfz_escaped_key(&mut self, ptr: *const u8, len: usize);
unsafe fn wfz_start_object(&mut self);
unsafe fn wfz_end_object(&mut self);
unsafe fn wfz_start_array(&mut self);
unsafe fn wfz_end_array(&mut self);
}
/// Blanket adapter: every [`Sax`] writer can be driven through the raw
/// pointer/length interface of [`WriterForZmm`].
///
/// The un-escaped text events (`number`, `string`, `key`) hand the writer a
/// `&'a str`, i.e. a slice with the same lifetime as the source text. The
/// escaped variants pass a slice with a local, inferred lifetime.
#[cfg(target_arch = "x86_64")]
impl<'a, W: Sax<'a>> WriterForZmm for W {
    unsafe fn wfz_null(&mut self) {
        self.null()
    }

    unsafe fn wfz_bool_val(&mut self, v: bool) {
        self.bool_val(v)
    }

    unsafe fn wfz_number(&mut self, ptr: *const u8, len: usize) {
        // SAFETY: the caller promises `ptr..ptr + len` is readable UTF-8 that
        // lives for `'a`; `from_raw_parts` lets us pick that lifetime directly.
        let raw: &'a [u8] = unsafe { std::slice::from_raw_parts(ptr, len) };
        let text: &'a str = unsafe { std::str::from_utf8_unchecked(raw) };
        self.number(text)
    }

    unsafe fn wfz_string(&mut self, ptr: *const u8, len: usize) {
        // SAFETY: see `wfz_number`.
        let raw: &'a [u8] = unsafe { std::slice::from_raw_parts(ptr, len) };
        let text: &'a str = unsafe { std::str::from_utf8_unchecked(raw) };
        self.string(text)
    }

    unsafe fn wfz_escaped_string(&mut self, ptr: *const u8, len: usize) {
        // SAFETY: the caller promises `ptr..ptr + len` is readable UTF-8.
        let raw = unsafe { std::slice::from_raw_parts(ptr, len) };
        let text = unsafe { std::str::from_utf8_unchecked(raw) };
        self.escaped_string(text)
    }

    unsafe fn wfz_key(&mut self, ptr: *const u8, len: usize) {
        // SAFETY: see `wfz_number`.
        let raw: &'a [u8] = unsafe { std::slice::from_raw_parts(ptr, len) };
        let text: &'a str = unsafe { std::str::from_utf8_unchecked(raw) };
        self.key(text)
    }

    unsafe fn wfz_escaped_key(&mut self, ptr: *const u8, len: usize) {
        // SAFETY: the caller promises `ptr..ptr + len` is readable UTF-8.
        let raw = unsafe { std::slice::from_raw_parts(ptr, len) };
        let text = unsafe { std::str::from_utf8_unchecked(raw) };
        self.escaped_key(text)
    }

    unsafe fn wfz_start_object(&mut self) {
        self.start_object()
    }

    unsafe fn wfz_end_object(&mut self) {
        self.end_object()
    }

    unsafe fn wfz_start_array(&mut self) {
        self.start_array()
    }

    unsafe fn wfz_end_array(&mut self) {
        self.end_array()
    }
}
// C-ABI trampolines stored in `ZmmVtab`. Each one reinterprets the opaque
// `data` pointer as `&mut W` and forwards to the matching `WriterForZmm`
// method. `data` must point to a live, exclusively accessible `W`.

/// Trampoline for the `null` event.
#[cfg(target_arch = "x86_64")]
unsafe extern "C" fn zw_null<W: WriterForZmm>(data: *mut ()) {
    let writer = unsafe { &mut *data.cast::<W>() };
    unsafe { writer.wfz_null() }
}

/// Trampoline for the boolean event.
#[cfg(target_arch = "x86_64")]
unsafe extern "C" fn zw_bool_val<W: WriterForZmm>(data: *mut (), v: bool) {
    let writer = unsafe { &mut *data.cast::<W>() };
    unsafe { writer.wfz_bool_val(v) }
}

/// Trampoline for the number event (`ptr`/`len` delimit the number text).
#[cfg(target_arch = "x86_64")]
unsafe extern "C" fn zw_number<W: WriterForZmm>(data: *mut (), ptr: *const u8, len: usize) {
    let writer = unsafe { &mut *data.cast::<W>() };
    unsafe { writer.wfz_number(ptr, len) }
}

/// Trampoline for the un-escaped string event.
#[cfg(target_arch = "x86_64")]
unsafe extern "C" fn zw_string<W: WriterForZmm>(data: *mut (), ptr: *const u8, len: usize) {
    let writer = unsafe { &mut *data.cast::<W>() };
    unsafe { writer.wfz_string(ptr, len) }
}

/// Trampoline for the escaped string event.
#[cfg(target_arch = "x86_64")]
unsafe extern "C" fn zw_escaped_string<W: WriterForZmm>(
    data: *mut (),
    ptr: *const u8,
    len: usize,
) {
    let writer = unsafe { &mut *data.cast::<W>() };
    unsafe { writer.wfz_escaped_string(ptr, len) }
}

/// Trampoline for the un-escaped key event.
#[cfg(target_arch = "x86_64")]
unsafe extern "C" fn zw_key<W: WriterForZmm>(data: *mut (), ptr: *const u8, len: usize) {
    let writer = unsafe { &mut *data.cast::<W>() };
    unsafe { writer.wfz_key(ptr, len) }
}

/// Trampoline for the escaped key event.
#[cfg(target_arch = "x86_64")]
unsafe extern "C" fn zw_escaped_key<W: WriterForZmm>(data: *mut (), ptr: *const u8, len: usize) {
    let writer = unsafe { &mut *data.cast::<W>() };
    unsafe { writer.wfz_escaped_key(ptr, len) }
}

/// Trampoline for the object-start event.
#[cfg(target_arch = "x86_64")]
unsafe extern "C" fn zw_start_object<W: WriterForZmm>(data: *mut ()) {
    let writer = unsafe { &mut *data.cast::<W>() };
    unsafe { writer.wfz_start_object() }
}

/// Trampoline for the object-end event.
#[cfg(target_arch = "x86_64")]
unsafe extern "C" fn zw_end_object<W: WriterForZmm>(data: *mut ()) {
    let writer = unsafe { &mut *data.cast::<W>() };
    unsafe { writer.wfz_end_object() }
}

/// Trampoline for the array-start event.
#[cfg(target_arch = "x86_64")]
unsafe extern "C" fn zw_start_array<W: WriterForZmm>(data: *mut ()) {
    let writer = unsafe { &mut *data.cast::<W>() };
    unsafe { writer.wfz_start_array() }
}

/// Trampoline for the array-end event.
#[cfg(target_arch = "x86_64")]
unsafe extern "C" fn zw_end_array<W: WriterForZmm>(data: *mut ()) {
    let writer = unsafe { &mut *data.cast::<W>() };
    unsafe { writer.wfz_end_array() }
}
/// Builds the callback table for writer type `W` by instantiating every
/// `zw_*` trampoline with `W`.
///
/// The resulting table is only meaningful together with a data pointer that
/// really points to a `W` (the trampolines cast the opaque pointer to `*mut W`).
#[cfg(target_arch = "x86_64")]
fn build_zmm_vtab<W: WriterForZmm>() -> ZmmVtab {
ZmmVtab {
null: zw_null::<W>,
bool_val: zw_bool_val::<W>,
number: zw_number::<W>,
string: zw_string::<W>,
escaped_string: zw_escaped_string::<W>,
key: zw_key::<W>,
escaped_key: zw_escaped_key::<W>,
start_object: zw_start_object::<W>,
end_object: zw_end_object::<W>,
start_array: zw_start_array::<W>,
end_array: zw_end_array::<W>,
}
}
// Entry points implemented outside this file (x86-64 only).
// NOTE(review): `improper_ctypes` is presumably silenced because `DomEntry`
// is not an FFI-safe type — confirm, and keep the allow as narrow as possible.
#[cfg(target_arch = "x86_64")]
#[allow(improper_ctypes)]
unsafe extern "C" {
// SAX-style parse: reports events through `writer_vtab`, passing
// `writer_data` back as the first argument of every callback.
// `parse_with_zmm` treats `true` as success. `frames_buf` is caller-provided
// scratch space (`MAX_JSON_DEPTH` bytes at the call site in this file).
fn parse_json_zmm_sax(
src_ptr: *const u8,
src_len: usize,
writer_data: *mut (),
writer_vtab: *const ZmmVtab,
frames_buf: *mut u8,
) -> bool;
// DOM-style parse: writes up to `tape_cap` entries to `tape_ptr` and stores
// the entry count in `*tape_len_out`. `parse_to_dom_zmm` interprets the
// return value as 0 = ok, 1 = parse error, 2 = tape overflow.
fn parse_json_zmm_dom(
src_ptr: *const u8,
src_len: usize,
tape_ptr: *mut DomEntry<'static>,
tape_len_out: *mut usize,
frames_buf: *mut u8,
open_buf: *mut u64,
has_escapes_out: *mut bool,
tape_cap: usize,
) -> u8;
}
/// States of the scalar state machine in `parse_json_impl`.
#[derive(PartialEq)]
enum State {
/// Initial state: skipping whitespace before the top-level value.
ValueWhitespace,
/// Inside a string value, scanning for the closing quote.
StringChars,
/// Inside an object key, scanning for the closing quote.
KeyChars,
/// Key closed; skipping whitespace until the `:`.
KeyEnd,
/// After `:`; skipping whitespace until the member's value starts.
AfterColon,
/// Inside an unquoted atom (literal or number), scanning for a delimiter.
AtomChars,
/// Terminal state: the input has been rejected.
Error,
/// After `{` or after `,` inside an object: expects a key (or `}` after `{`).
ObjectStart,
/// After `[` or after `,` inside an array: expects a value (or `]` after `[`).
ArrayStart,
/// A value just ended: expects `,`, a closing bracket, or end of input.
AfterValue,
}
/// Kind of an open container on the nesting stack.
///
/// `#[repr(u8)]` with explicit discriminants: the stack buffer is also handed
/// to the external parsers as a `*mut u8` scratch area.
#[derive(Copy, Clone, PartialEq)]
#[repr(u8)]
enum FrameKind {
Object = 0,
Array = 1,
}
/// Maximum container nesting depth; the scalar parser rejects input that
/// tries to open more than this many nested objects/arrays.
pub const MAX_JSON_DEPTH: usize = 64;
/// Returns `true` when `s` is exactly one JSON number:
/// `-? (0 | [1-9][0-9]*) (. [0-9]+)? ([eE] [+-]? [0-9]+)?`.
///
/// Leading zeros (`01`), a bare sign, a dangling `.` or exponent marker and
/// any trailing bytes all make the input invalid.
fn is_valid_json_number(s: &[u8]) -> bool {
    /// Length of the leading run of ASCII digits.
    fn digit_run(bytes: &[u8]) -> usize {
        bytes.iter().take_while(|b| b.is_ascii_digit()).count()
    }

    // Optional minus sign.
    let rest = s.strip_prefix(b"-").unwrap_or(s);

    // Integer part: a lone `0`, or a non-zero digit followed by more digits.
    let rest = match rest {
        [b'0', tail @ ..] => {
            if tail.first().is_some_and(u8::is_ascii_digit) {
                return false;
            }
            tail
        }
        [b'1'..=b'9', ..] => &rest[digit_run(rest)..],
        _ => return false,
    };

    // Optional fraction: `.` followed by at least one digit.
    let rest = match rest {
        [b'.', tail @ ..] => {
            let n = digit_run(tail);
            if n == 0 {
                return false;
            }
            &tail[n..]
        }
        _ => rest,
    };

    // Optional exponent: `e`/`E`, optional sign, at least one digit.
    let rest = match rest {
        [b'e' | b'E', tail @ ..] => {
            let unsigned = match tail {
                [b'+' | b'-', after_sign @ ..] => after_sign,
                _ => tail,
            };
            let n = digit_run(unsigned);
            if n == 0 {
                return false;
            }
            &unsigned[n..]
        }
        _ => rest,
    };

    rest.is_empty()
}
/// C-ABI wrapper around `is_valid_json_number` for callers outside Rust.
///
/// `ptr`/`len` must describe `len` readable bytes. A null pointer or a zero
/// length is answered with `false` without touching memory: an empty slice is
/// never a valid number, and `slice::from_raw_parts` must not be called with
/// a null pointer even when `len` is 0.
#[doc(hidden)]
#[unsafe(no_mangle)]
pub extern "C" fn is_valid_json_number_c(ptr: *const u8, len: usize) -> bool {
    if ptr.is_null() || len == 0 {
        return false;
    }
    // SAFETY: `ptr` is non-null and the caller guarantees `len` readable bytes.
    let s = unsafe { std::slice::from_raw_parts(ptr, len) };
    is_valid_json_number(s)
}
/// C-ABI helper: decodes the JSON escapes in `raw_ptr..raw_ptr + raw_len`
/// into a freshly allocated `Box<str>` and hands its raw parts back through
/// `out_ptr` / `out_len`.
///
/// Ownership of the allocation is transferred to the caller (the box is
/// released with `Box::into_raw`); it must eventually be rebuilt into a
/// `Box<str>` to be freed. The input bytes must be valid UTF-8 and both out
/// pointers must be writable.
#[doc(hidden)]
#[cfg(target_arch = "x86_64")]
#[unsafe(no_mangle)]
#[inline(never)]
pub extern "C" fn dom_unescape_to_box_str(
    raw_ptr: *const u8,
    raw_len: usize,
    out_ptr: *mut *const u8,
    out_len: *mut usize,
) {
    // SAFETY: the caller passes a readable, valid UTF-8 byte range.
    let escaped = unsafe {
        let raw_bytes = std::slice::from_raw_parts(raw_ptr, raw_len);
        std::str::from_utf8_unchecked(raw_bytes)
    };

    let mut decoded = String::new();
    unescape_str(escaped, &mut decoded);

    let boxed = decoded.into_boxed_str();
    let decoded_len = boxed.len();
    let leaked: *mut str = Box::into_raw(boxed);

    // SAFETY: the caller passes writable out-pointers.
    unsafe {
        *out_ptr = leaked as *mut u8 as *const u8;
        *out_len = decoded_len;
    }
}
/// Emits the event for an unquoted atom: `true`/`false`, `null`, or a number.
///
/// Returns `false` without emitting anything when `s` is none of these.
fn write_atom<'a, W: Sax<'a>>(s: &'a str, w: &mut W) -> bool {
    if s == "true" {
        w.bool_val(true);
    } else if s == "false" {
        w.bool_val(false);
    } else if s == "null" {
        w.null();
    } else if is_valid_json_number(s.as_bytes()) {
        w.number(s);
    } else {
        return false;
    }
    true
}
/// Parses `src` into a [`Dom`] with the portable (non-assembly) parser.
///
/// `initial_capacity` is forwarded to `DomWriter::with_capacity`; `None`
/// means a capacity of 0. Returns `None` when the input is rejected.
pub fn parse_to_dom<'a>(src: &'a str, initial_capacity: Option<usize>) -> Option<Dom<'a>> {
    let writer = DomWriter::with_capacity(initial_capacity.unwrap_or_default());
    parse_with(src, writer)
}
/// Parses `src` into a [`Dom`] using the external `parse_json_zmm_dom` routine.
///
/// The tape is allocated up front with `initial_capacity` entries (default:
/// `src.len() / 4`, but at least 2). When the routine reports a tape overflow
/// the capacity is at least doubled and the parse is restarted from scratch.
///
/// Returns `None` on a parse error or on an unrecognised result code.
///
/// # Safety
///
/// NOTE(review): within this file the routine is only selected after
/// `is_x86_feature_detected!("avx512bw")` succeeds; callers presumably must
/// guarantee the CPU supports whatever instructions the routine uses —
/// confirm against the assembly source.
#[cfg(target_arch = "x86_64")]
pub unsafe fn parse_to_dom_zmm<'a>(
src: &'a str,
initial_capacity: Option<usize>,
) -> Option<Dom<'a>> {
// Result codes returned by `parse_json_zmm_dom`.
const RESULT_OK: u8 = 0;
const RESULT_PARSE_ERROR: u8 = 1;
const RESULT_TAPE_OVERFLOW: u8 = 2;
// Scratch buffers handed to the routine; reused across retries.
let mut frames_buf = [FrameKind::Object; MAX_JSON_DEPTH];
let mut open_buf = [0u64; MAX_JSON_DEPTH];
let mut capacity = initial_capacity.unwrap_or_else(|| (src.len() / 4).max(2));
loop {
// Fresh, uninitialised tape for this attempt; its length stays 0 until
// the routine has reported how many entries it wrote.
let mut tape_data: Vec<DomEntry<'a>> = Vec::with_capacity(capacity);
let tape_ptr = tape_data.as_mut_ptr() as *mut DomEntry<'static>;
let mut tape_len: usize = 0;
let mut has_escapes: bool = false;
let result = unsafe {
parse_json_zmm_dom(
src.as_ptr(),
src.len(),
tape_ptr,
&raw mut tape_len,
frames_buf.as_mut_ptr() as *mut u8,
open_buf.as_mut_ptr(),
&raw mut has_escapes,
capacity,
)
};
match result {
RESULT_OK => {
// Relies on the routine having initialised exactly `tape_len` entries.
unsafe { tape_data.set_len(tape_len) };
return Some(Dom {
entries: tape_data,
has_escapes,
});
}
RESULT_PARSE_ERROR => return None,
RESULT_TAPE_OVERFLOW => {
// NOTE(review): presumably marks the entries written so far as
// initialised so that dropping `tape_data` releases anything they own
// (e.g. strings boxed by `dom_unescape_to_box_str`) — confirm. The
// parse-error path above does not do this.
unsafe { tape_data.set_len(tape_len) };
// Grow by at least one entry so a capacity of 0 still makes progress.
capacity = capacity.saturating_mul(2).max(capacity + 1);
continue;
}
_ => return None, }
}
}
/// Safe-signature wrapper around `parse_to_dom_zmm`, so it can be returned
/// from `dom_parser` as a plain `fn` pointer.
///
/// Private on purpose: within this file it is only handed out by `dom_parser`
/// after the `avx512bw` feature check has succeeded.
#[cfg(target_arch = "x86_64")]
fn parse_to_dom_zmm_safe<'a>(src: &'a str, cap: Option<usize>) -> Option<Dom<'a>> {
unsafe { parse_to_dom_zmm(src, cap) }
}
/// Picks a DOM parser implementation for the current CPU.
///
/// On x86-64 with `avx512bw` detected at run time this returns the
/// assembly-backed parser; everywhere else it returns [`parse_to_dom`].
/// Both have the signature `(src, initial_capacity) -> Option<Dom>`.
pub fn dom_parser() -> for<'a> fn(&'a str, Option<usize>) -> Option<Dom<'a>> {
    #[cfg(target_arch = "x86_64")]
    {
        let has_avx512bw = is_x86_feature_detected!("avx512bw");
        if has_avx512bw {
            return parse_to_dom_zmm_safe;
        }
    }
    parse_to_dom
}
/// Handle that remembers which SAX parser implementation to use.
///
/// Obtain one with [`sax_parser`] and call [`SaxParser::parse`].
#[derive(Copy, Clone)]
pub struct SaxParser {
// `true` when `sax_parser` detected `avx512bw`; `parse` then takes the
// assembly-backed path. The field only exists on x86-64.
#[cfg(target_arch = "x86_64")]
zmm: bool,
}
impl SaxParser {
    /// Parses `src`, feeding events to `writer`, and returns the writer's
    /// output.
    ///
    /// Takes the assembly-backed path when this handle was created on a CPU
    /// with `avx512bw`, and the portable parser otherwise. Returns `None`
    /// when the input is rejected or the writer's `finish` returns `None`.
    pub fn parse<'a, W: Sax<'a>>(&self, src: &'a str, writer: W) -> Option<W::Output> {
        #[cfg(target_arch = "x86_64")]
        {
            if self.zmm {
                // SAFETY: `zmm` is only set by `sax_parser` after the
                // run-time `avx512bw` check succeeded.
                return unsafe { parse_with_zmm(src, writer) };
            }
        }
        parse_with(src, writer)
    }
}
/// Creates a [`SaxParser`], performing the CPU feature check once.
///
/// On x86-64 the handle records whether `avx512bw` was detected; on other
/// architectures the handle carries no state.
pub fn sax_parser() -> SaxParser {
SaxParser {
#[cfg(target_arch = "x86_64")]
zmm: is_x86_feature_detected!("avx512bw"),
}
}
/// Parses `src` with the portable parser, feeding events to `writer`.
///
/// Allocates the container-nesting stack (`MAX_JSON_DEPTH` frames) on the
/// stack and delegates to `parse_json_impl`. Returns `None` when the input is
/// rejected, otherwise the result of the writer's `finish`.
pub fn parse_with<'a, W: Sax<'a>>(src: &'a str, writer: W) -> Option<W::Output> {
let mut frames_buf = [FrameKind::Object; MAX_JSON_DEPTH];
parse_json_impl(src, writer, &mut frames_buf)
}
/// Parses `src` with the external `parse_json_zmm_sax` routine, feeding
/// events to `writer` through a [`ZmmVtab`] built for `W`.
///
/// Returns `None` when the routine reports failure, otherwise the result of
/// the writer's `finish`.
///
/// # Safety
///
/// NOTE(review): within this file the routine is only used after `avx512bw`
/// has been detected; callers presumably must guarantee the CPU supports the
/// instructions the routine uses — confirm against the assembly source.
#[cfg(target_arch = "x86_64")]
pub unsafe fn parse_with_zmm<'a, W: Sax<'a>>(src: &'a str, mut writer: W) -> Option<W::Output> {
    let callbacks = build_zmm_vtab::<W>();
    let mut frames_buf = [FrameKind::Object; MAX_JSON_DEPTH];

    // The callbacks cast this opaque pointer back to `*mut W`.
    let writer_data = (&raw mut writer).cast::<()>();
    let frames_ptr = frames_buf.as_mut_ptr().cast::<u8>();

    let accepted = unsafe {
        parse_json_zmm_sax(src.as_ptr(), src.len(), writer_data, &callbacks, frames_ptr)
    };

    if !accepted {
        return None;
    }
    writer.finish()
}
fn parse_json_impl<'a, W: Sax<'a>>(
src: &'a str,
mut writer: W,
frames_buf: &mut [FrameKind; MAX_JSON_DEPTH],
) -> Option<W::Output> {
let bytes = src.as_bytes();
let mut frames_depth: usize = 0;
let mut str_start: usize = 0; let mut str_escaped = false; let mut bs_count: usize = 0; let mut atom_start: usize = 0; let mut current_key_raw: &'a str = ""; let mut current_key_escaped = false; let mut after_comma = false; let mut state = State::ValueWhitespace;
let mut pos = 0;
while pos < bytes.len() {
let chunk_len = (bytes.len() - pos).min(64);
let chunk = &bytes[pos..pos + chunk_len];
let byte_state = classify_u64(chunk);
let mut chunk_offset = 0;
'inner: while chunk_offset < chunk_len {
state = match state {
State::ValueWhitespace => {
let ahead = (!byte_state.whitespace) >> chunk_offset;
let skip = ahead.trailing_zeros() as usize;
chunk_offset += skip;
if chunk_offset >= chunk_len {
break 'inner;
}
let byte = chunk[chunk_offset];
match byte {
b'{' => {
if frames_depth >= MAX_JSON_DEPTH {
State::Error
} else {
frames_buf[frames_depth] = FrameKind::Object;
frames_depth += 1;
writer.start_object();
State::ObjectStart
}
}
b'[' => {
if frames_depth >= MAX_JSON_DEPTH {
State::Error
} else {
frames_buf[frames_depth] = FrameKind::Array;
frames_depth += 1;
writer.start_array();
State::ArrayStart
}
}
b'"' => {
str_start = pos + chunk_offset + 1;
str_escaped = false;
bs_count = 0;
State::StringChars
}
_ => {
atom_start = pos + chunk_offset;
State::AtomChars
}
}
}
State::StringChars => {
let interesting = (byte_state.backslashes | byte_state.quotes) >> chunk_offset;
let skip = interesting.trailing_zeros() as usize;
chunk_offset = (chunk_offset + skip).min(chunk_len);
if chunk_offset >= chunk_len {
break 'inner;
}
if skip > 0 {
bs_count = 0;
}
let byte = chunk[chunk_offset];
match byte {
b'\\' => {
bs_count += 1;
str_escaped = true;
State::StringChars
}
b'"' if bs_count & 1 == 1 => {
bs_count = 0;
State::StringChars
}
_ => {
bs_count = 0;
let raw = &src[str_start..pos + chunk_offset];
if str_escaped {
writer.escaped_string(raw);
} else {
writer.string(raw);
}
State::AfterValue
}
}
}
State::KeyChars => {
let interesting = (byte_state.backslashes | byte_state.quotes) >> chunk_offset;
let skip = interesting.trailing_zeros() as usize;
chunk_offset = (chunk_offset + skip).min(chunk_len);
if chunk_offset >= chunk_len {
break 'inner;
}
if skip > 0 {
bs_count = 0;
}
let byte = chunk[chunk_offset];
match byte {
b'\\' => {
bs_count += 1;
str_escaped = true;
State::KeyChars
}
b'"' if bs_count & 1 == 1 => {
bs_count = 0;
State::KeyChars
}
_ => {
bs_count = 0;
current_key_raw = &src[str_start..pos + chunk_offset];
current_key_escaped = str_escaped;
State::KeyEnd
}
}
}
State::KeyEnd => {
let ahead = (!byte_state.whitespace) >> chunk_offset;
let skip = ahead.trailing_zeros() as usize;
chunk_offset += skip;
if chunk_offset >= chunk_len {
break 'inner;
}
let byte = chunk[chunk_offset];
match byte {
b':' => {
if current_key_escaped {
writer.escaped_key(current_key_raw);
} else {
writer.key(current_key_raw);
}
State::AfterColon
}
_ => State::Error,
}
}
State::AfterColon => {
let ahead = (!byte_state.whitespace) >> chunk_offset;
let skip = ahead.trailing_zeros() as usize;
chunk_offset += skip;
if chunk_offset >= chunk_len {
break 'inner;
}
let byte = chunk[chunk_offset];
match byte {
b'{' => {
if frames_depth >= MAX_JSON_DEPTH {
State::Error
} else {
frames_buf[frames_depth] = FrameKind::Object;
frames_depth += 1;
writer.start_object();
State::ObjectStart
}
}
b'[' => {
if frames_depth >= MAX_JSON_DEPTH {
State::Error
} else {
frames_buf[frames_depth] = FrameKind::Array;
frames_depth += 1;
writer.start_array();
State::ArrayStart
}
}
b'"' => {
str_start = pos + chunk_offset + 1;
str_escaped = false;
bs_count = 0;
State::StringChars
}
_ => {
atom_start = pos + chunk_offset;
State::AtomChars
}
}
}
State::AtomChars => {
let ahead = byte_state.delimiters >> chunk_offset;
let skip = ahead.trailing_zeros() as usize;
chunk_offset += skip;
if chunk_offset >= chunk_len {
break 'inner;
}
let byte = chunk[chunk_offset];
if !write_atom(&src[atom_start..pos + chunk_offset], &mut writer) {
State::Error
} else {
match byte {
b'}' => {
if frames_depth == 0
|| frames_buf[frames_depth - 1] != FrameKind::Object
{
State::Error
} else {
frames_depth -= 1;
writer.end_object();
State::AfterValue
}
}
b']' => {
if frames_depth == 0
|| frames_buf[frames_depth - 1] != FrameKind::Array
{
State::Error
} else {
frames_depth -= 1;
writer.end_array();
State::AfterValue
}
}
b',' => {
if frames_depth == 0 {
State::Error
} else {
match frames_buf[frames_depth - 1] {
FrameKind::Array => {
after_comma = true;
State::ArrayStart
}
FrameKind::Object => {
after_comma = true;
State::ObjectStart
}
}
}
}
_ => State::AfterValue, }
}
}
State::Error => break 'inner,
State::ObjectStart => {
let ahead = (!byte_state.whitespace) >> chunk_offset;
let skip = ahead.trailing_zeros() as usize;
chunk_offset += skip;
if chunk_offset >= chunk_len {
break 'inner;
}
let byte = chunk[chunk_offset];
match byte {
b'"' => {
after_comma = false;
str_start = pos + chunk_offset + 1;
str_escaped = false;
bs_count = 0;
State::KeyChars
}
b'}' => {
if after_comma {
State::Error
} else if frames_depth > 0
&& frames_buf[frames_depth - 1] == FrameKind::Object
{
frames_depth -= 1;
writer.end_object();
State::AfterValue
} else {
State::Error
}
}
_ => State::Error,
}
}
State::ArrayStart => {
let ahead = (!byte_state.whitespace) >> chunk_offset;
let skip = ahead.trailing_zeros() as usize;
chunk_offset += skip;
if chunk_offset >= chunk_len {
break 'inner;
}
let byte = chunk[chunk_offset];
match byte {
b']' => {
if after_comma {
State::Error
} else if frames_depth > 0
&& frames_buf[frames_depth - 1] == FrameKind::Array
{
frames_depth -= 1;
writer.end_array();
State::AfterValue
} else {
State::Error
}
}
b'{' => {
after_comma = false;
if frames_depth >= MAX_JSON_DEPTH {
State::Error
} else {
frames_buf[frames_depth] = FrameKind::Object;
frames_depth += 1;
writer.start_object();
State::ObjectStart
}
}
b'[' => {
after_comma = false;
if frames_depth >= MAX_JSON_DEPTH {
State::Error
} else {
frames_buf[frames_depth] = FrameKind::Array;
frames_depth += 1;
writer.start_array();
State::ArrayStart
}
}
b'"' => {
after_comma = false;
str_start = pos + chunk_offset + 1;
str_escaped = false;
bs_count = 0;
State::StringChars
}
_ => {
after_comma = false;
atom_start = pos + chunk_offset;
State::AtomChars
}
}
}
State::AfterValue => {
let ahead = (!byte_state.whitespace) >> chunk_offset;
let skip = ahead.trailing_zeros() as usize;
chunk_offset += skip;
if chunk_offset >= chunk_len {
break 'inner;
}
let byte = chunk[chunk_offset];
match byte {
b',' => {
if frames_depth == 0 {
State::Error
} else {
match frames_buf[frames_depth - 1] {
FrameKind::Object => {
after_comma = true;
State::ObjectStart
}
FrameKind::Array => {
after_comma = true;
State::ArrayStart
}
}
}
}
b'}' => {
if frames_depth > 0 && frames_buf[frames_depth - 1] == FrameKind::Object
{
frames_depth -= 1;
writer.end_object();
State::AfterValue
} else {
State::Error
}
}
b']' => {
if frames_depth > 0 && frames_buf[frames_depth - 1] == FrameKind::Array
{
frames_depth -= 1;
writer.end_array();
State::AfterValue
} else {
State::Error
}
}
_ => State::Error,
}
}
};
chunk_offset += 1;
}
pos += chunk_len;
}
if state == State::AtomChars {
if !write_atom(&src[atom_start..], &mut writer) {
return None;
}
} else if state != State::AfterValue {
return None;
}
if state == State::Error {
return None;
}
if frames_depth != 0 {
return None;
}
writer.finish()
}
/// Decodes the JSON escape sequences in `s` into `out`.
///
/// `out` is cleared first, so its allocation can be reused across calls.
///
/// * `\"`, `\\`, `\/`, `\b`, `\f`, `\n`, `\r`, `\t` map to their characters.
/// * `\uXXXX` needs exactly four hex digits; a high surrogate immediately
///   followed by a `\uXXXX` low surrogate is combined into one character.
/// * Lenient handling of malformed input (this function never fails): an
///   unknown escape keeps its backslash and the following character; a `\u`
///   without four hex digits is dropped and the text after it is copied
///   as-is; an unpaired surrogate and a trailing lone backslash are dropped.
///
/// Never panics, whatever the contents of `s`.
#[doc(hidden)]
#[unsafe(no_mangle)]
#[inline(never)]
pub fn unescape_str(s: &str, out: &mut String) {
    /// Reads exactly four ASCII hex digits starting at byte offset `at`.
    /// Works on bytes so it can never slice through a multi-byte character,
    /// and (unlike `from_str_radix`) does not accept a leading `+`.
    fn hex4(bytes: &[u8], at: usize) -> Option<u16> {
        let digits = bytes.get(at..at.checked_add(4)?)?;
        digits.iter().try_fold(0u16, |acc, &d| {
            let value = (d as char).to_digit(16)?;
            Some((acc << 4) | value as u16)
        })
    }

    out.clear();
    let bytes = s.as_bytes();
    let mut i = 0;
    // Invariant: `i` is always on a character boundary of `s`, because it
    // only ever advances over whole plain runs or over ASCII bytes.
    while i < bytes.len() {
        if bytes[i] != b'\\' {
            // Copy everything up to the next backslash in one go.
            let end = bytes[i..]
                .iter()
                .position(|&b| b == b'\\')
                .map_or(bytes.len(), |offset| i + offset);
            out.push_str(&s[i..end]);
            i = end;
            continue;
        }
        i += 1;
        if i >= bytes.len() {
            // Trailing lone backslash.
            break;
        }
        match bytes[i] {
            b'"' => {
                out.push('"');
                i += 1;
            }
            b'\\' => {
                out.push('\\');
                i += 1;
            }
            b'/' => {
                out.push('/');
                i += 1;
            }
            b'b' => {
                out.push('\x08');
                i += 1;
            }
            b'f' => {
                out.push('\x0C');
                i += 1;
            }
            b'n' => {
                out.push('\n');
                i += 1;
            }
            b'r' => {
                out.push('\r');
                i += 1;
            }
            b't' => {
                out.push('\t');
                i += 1;
            }
            b'u' => {
                i += 1;
                if let Some(hi) = hex4(bytes, i) {
                    i += 4;
                    let low_escape_follows = (0xD800..0xDC00).contains(&hi)
                        && bytes.get(i) == Some(&b'\\')
                        && bytes.get(i + 1) == Some(&b'u');
                    if low_escape_follows {
                        if let Some(lo) = hex4(bytes, i + 2) {
                            if (0xDC00..=0xDFFF).contains(&lo) {
                                let cp = 0x1_0000u32
                                    + ((u32::from(hi) - 0xD800) << 10)
                                    + (u32::from(lo) - 0xDC00);
                                if let Some(ch) = char::from_u32(cp) {
                                    out.push(ch);
                                    i += 6;
                                    continue;
                                }
                            }
                        }
                    }
                    // `None` for an unpaired surrogate: nothing is emitted.
                    if let Some(ch) = char::from_u32(u32::from(hi)) {
                        out.push(ch);
                    }
                }
            }
            _ => {
                // Unknown escape: keep the backslash and do not advance, so
                // the next character is copied whole by the plain-run path
                // (it may be multi-byte).
                out.push('\\');
            }
        }
    }
}
/// Per-chunk classification bitmasks produced by `classify_u64`.
///
/// Bit `i` of each mask describes byte `i` of the (up to 64-byte) chunk.
#[repr(C)]
#[derive(Debug, PartialEq)]
pub struct ByteState {
    /// Bytes in `0x00..=0x20` (this includes the zero padding past the chunk).
    whitespace: u64,
    /// `"` bytes.
    quotes: u64,
    /// `\` bytes.
    backslashes: u64,
    /// Bytes that end an atom: whitespace, `,`, `}` and `]`.
    delimiters: u64,
}

/// Classifies up to 64 bytes into the four [`ByteState`] bitmasks using
/// SWAR arithmetic on eight 64-bit words.
///
/// The chunk is zero-padded to 64 bytes; padding bytes are reported as
/// whitespace (and therefore as delimiters).
///
/// # Panics
///
/// Panics when `src` is empty or longer than 64 bytes.
fn classify_u64(src: &[u8]) -> ByteState {
    assert!(!src.is_empty() && src.len() <= 64);

    /// High bit of every byte lane.
    const HIGH_BITS: u64 = 0x8080_8080_8080_8080;
    /// Low seven bits of every byte lane.
    const LOW_BITS: u64 = 0x7f7f_7f7f_7f7f_7f7f;
    /// `0x01` in every byte lane; multiplying by a byte broadcasts it.
    const LANE_ONES: u64 = 0x0101_0101_0101_0101;

    /// Sets the high bit of every lane of `word` that equals `needle`.
    #[inline(always)]
    fn lanes_equal(word: u64, needle: u8) -> u64 {
        let diff = word ^ (u64::from(needle) * LANE_ONES);
        // A lane of `diff` is zero iff neither its low seven bits (detected
        // via the carry into bit 7) nor its high bit are set.
        !((diff & LOW_BITS).wrapping_add(LOW_BITS) | diff) & HIGH_BITS
    }

    /// Gathers the eight lane high bits of `word` into one byte.
    #[inline(always)]
    fn gather_high_bits(word: u64) -> u8 {
        ((word & HIGH_BITS).wrapping_mul(0x0002_0408_1020_4081) >> 56) as u8
    }

    let mut padded = [0u8; 64];
    padded[..src.len()].copy_from_slice(src);

    let mut whitespace = 0u64;
    let mut quotes = 0u64;
    let mut backslashes = 0u64;
    let mut delimiters = 0u64;

    for (lane_group, group) in padded.chunks_exact(8).enumerate() {
        let word = u64::from_le_bytes(group.try_into().unwrap());

        // Lane is whitespace iff it is < 0x21 with its high bit clear:
        // adding 0x5f carries into bit 7 exactly when the low bits are >= 0x21.
        let ws_lanes = !((word & LOW_BITS).wrapping_add(0x5f5f_5f5f_5f5f_5f5f) | word) & HIGH_BITS;
        let quote_lanes = lanes_equal(word, b'"');
        let backslash_lanes = lanes_equal(word, b'\\');
        let delim_lanes = ws_lanes
            | lanes_equal(word, b',')
            | lanes_equal(word, b'}')
            | lanes_equal(word, b']');

        let shift = lane_group * 8;
        whitespace |= u64::from(gather_high_bits(ws_lanes)) << shift;
        quotes |= u64::from(gather_high_bits(quote_lanes)) << shift;
        backslashes |= u64::from(gather_high_bits(backslash_lanes)) << shift;
        delimiters |= u64::from(gather_high_bits(delim_lanes)) << shift;
    }

    ByteState {
        whitespace,
        quotes,
        backslashes,
        delimiters,
    }
}
// Unit tests. The `zmm_*` tests compare the assembly-backed parsers against
// the portable parser and silently pass on CPUs without `avx512bw`.
#[cfg(test)]
mod tests {
use super::*;
// Asserts that both DOM parsers accept `src` and produce identical tapes.
#[cfg(target_arch = "x86_64")]
fn zmm_dom_matches(src: &str) {
if !is_x86_feature_detected!("avx512bw") {
return;
}
let ref_tape =
parse_to_dom(src, None).unwrap_or_else(|| panic!("reference rejected: {src:?}"));
let asm_tape = unsafe { parse_to_dom_zmm(src, None) }
.unwrap_or_else(|| panic!("zmm_tape rejected: {src:?}"));
assert_eq!(
ref_tape.entries, asm_tape.entries,
"tape mismatch for {src:?}"
);
}
// Asserts that the assembly-backed DOM parser rejects `src`.
#[cfg(target_arch = "x86_64")]
fn zmm_dom_rejects(src: &str) {
if !is_x86_feature_detected!("avx512bw") {
return;
}
assert!(
unsafe { parse_to_dom_zmm(src, None) }.is_none(),
"zmm_tape should reject {src:?}"
);
}
// Top-level literals and numbers, including numbers of every length from 1 to 9 digits.
#[cfg(target_arch = "x86_64")]
#[test]
fn zmm_dom_atoms() {
for src in &[
"null",
"true",
"false",
"0",
"42",
"-7",
"3.14",
"1e10",
"-0.5e-3",
"1",
"12",
"123",
"1234",
"12345",
"123456",
"1234567",
"12345678",
"123456789",
] {
zmm_dom_matches(src);
}
}
// Top-level strings: plain, empty, simple escapes, `\u` escapes and a surrogate pair.
#[cfg(target_arch = "x86_64")]
#[test]
fn zmm_dom_strings() {
for src in &[
r#""hello""#,
r#""""#,
r#""with \"escape\"""#,
r#""newline\nand\ttab""#,
r#""\u0041\u0042\u0043""#,
r#""\u0000""#,
r#""surrogate \uD83D\uDE00""#,
] {
zmm_dom_matches(src);
}
}
// Flat objects, including the empty object.
#[cfg(target_arch = "x86_64")]
#[test]
fn zmm_dom_simple_object() {
zmm_dom_matches(r#"{"x":1}"#);
zmm_dom_matches(r#"{"a":1,"b":2,"c":3}"#);
zmm_dom_matches(r#"{}"#);
}
// Flat arrays, including the empty array and mixed element types.
#[cfg(target_arch = "x86_64")]
#[test]
fn zmm_dom_simple_array() {
zmm_dom_matches(r#"[1,2,3]"#);
zmm_dom_matches(r#"[]"#);
zmm_dom_matches(r#"[null,true,false,"x",42]"#);
}
// Containers nested inside each other.
#[cfg(target_arch = "x86_64")]
#[test]
fn zmm_dom_nested() {
zmm_dom_matches(r#"{"a":{"b":[1,true,null]}}"#);
zmm_dom_matches(r#"[[1,[2,[3]]]]"#);
zmm_dom_matches(r#"{"k":{"k":{"k":{}}}}"#);
zmm_dom_matches(r#"[{"a":1},{"b":2}]"#);
}
// Object keys that contain escape sequences.
#[cfg(target_arch = "x86_64")]
#[test]
fn zmm_dom_escaped_keys() {
zmm_dom_matches(r#"{"key\nname":1}"#);
zmm_dom_matches(r#"{"key\u0041":true}"#);
zmm_dom_matches(r#"{"a\"b":null}"#);
}
// Whitespace around tokens and around the top-level value.
#[cfg(target_arch = "x86_64")]
#[test]
fn zmm_dom_whitespace() {
zmm_dom_matches(" { \"x\" : 1 } ");
zmm_dom_matches("[ 1 , 2 , 3 ]");
zmm_dom_matches("\t\r\nnull\t\r\n");
}
// Strings longer than one 64-byte chunk, with and without an escape.
#[cfg(target_arch = "x86_64")]
#[test]
fn zmm_dom_long_string() {
let long = format!(r#""{}""#, "a".repeat(200));
zmm_dom_matches(&long);
let long_esc = format!(r#""{}\n{}""#, "b".repeat(100), "c".repeat(100));
zmm_dom_matches(&long_esc);
}
// Inputs that must be rejected: empty, unbalanced, missing value, leading zeros.
#[cfg(target_arch = "x86_64")]
#[test]
fn zmm_dom_reject_invalid() {
zmm_dom_rejects("");
zmm_dom_rejects("{");
zmm_dom_rejects("[");
zmm_dom_rejects("}");
zmm_dom_rejects(r#"{"a":}"#);
zmm_dom_rejects(r#"{"a":1"#);
zmm_dom_rejects("01");
zmm_dom_rejects("00");
zmm_dom_rejects("007");
zmm_dom_rejects("01234567"); }
// Asserts that both SAX parsers accept `src` and emit the same event stream.
// Events are recorded as text by the local `EventLog` writer.
#[cfg(target_arch = "x86_64")]
fn zmm_sax_matches(src: &str) {
#[derive(Default)]
struct EventLog(String);
impl<'s> Sax<'s> for EventLog {
type Output = String;
fn null(&mut self) {
self.0.push_str("null;");
}
fn bool_val(&mut self, v: bool) {
self.0.push_str(if v { "true;" } else { "false;" });
}
fn number(&mut self, s: &str) {
self.0.push_str(s);
self.0.push(';');
}
fn string(&mut self, s: &str) {
self.0.push_str("s:");
self.0.push_str(s);
self.0.push(';');
}
fn escaped_string(&mut self, s: &str) {
self.0.push_str("es:");
self.0.push_str(s);
self.0.push(';');
}
fn key(&mut self, s: &str) {
self.0.push_str("k:");
self.0.push_str(s);
self.0.push(';');
}
fn escaped_key(&mut self, s: &str) {
self.0.push_str("ek:");
self.0.push_str(s);
self.0.push(';');
}
fn start_object(&mut self) {
self.0.push('{');
}
fn end_object(&mut self) {
self.0.push('}');
}
fn start_array(&mut self) {
self.0.push('[');
}
fn end_array(&mut self) {
self.0.push(']');
}
fn finish(self) -> Option<String> {
Some(self.0)
}
}
if !is_x86_feature_detected!("avx512bw") {
return;
}
let ref_log = parse_with(src, EventLog::default())
.unwrap_or_else(|| panic!("reference rejected: {src:?}"));
let asm_log = unsafe { parse_with_zmm(src, EventLog::default()) }
.unwrap_or_else(|| panic!("parse_with_zmm rejected: {src:?}"));
assert_eq!(ref_log, asm_log, "event log mismatch for {src:?}");
}
// Escaped keys and values through the SAX interface, including ones that
// span more than one 64-byte chunk.
#[cfg(target_arch = "x86_64")]
#[test]
fn zmm_sax_escaped_strings() {
zmm_sax_matches(r#"{"key":"\n\t\r\""}"#);
zmm_sax_matches(r#"{"key\nname":"val\u0041"}"#);
zmm_sax_matches(r#"["\u0041","\u0042\u0043"]"#);
zmm_sax_matches(r#"{"a\"b":"c\"d"}"#);
let long = format!(r#"{{"{}\n":"{}\t"}}"#, "x".repeat(70), "y".repeat(70));
zmm_sax_matches(&long);
}
// Portable parser: a closing quote preceded by an even number of backslashes
// terminates the string; an odd number escapes the quote.
#[test]
fn rust_even_backslash_before_quote() {
use crate::JsonRef;
let t = parse_to_dom(r#"{"k":"\\"}"#, None).expect("parse failed");
assert_eq!(t.root().get("k").as_str(), Some("\\"));
let t = parse_to_dom(r#"{"k":"\\\\"}"#, None).expect("parse failed");
assert_eq!(t.root().get("k").as_str(), Some("\\\\"));
let t = parse_to_dom(r#"["\\"]"#, None).expect("parse failed");
assert_eq!(t.root().index_at(0).as_str(), Some("\\"));
let t = parse_to_dom(r#"{"k":"abc\\"}"#, None).expect("parse failed");
assert_eq!(t.root().get("k").as_str(), Some("abc\\"));
let t = parse_to_dom("{\"k\":\"\\\\\\\"\"}", None).expect("parse failed");
assert_eq!(t.root().get("k").as_str(), Some("\\\""));
}
// Starts with a tape capacity of 4 so the overflow-and-retry path of
// `parse_to_dom_zmm` is exercised on a 200-element array.
#[cfg(target_arch = "x86_64")]
#[test]
fn zmm_dom_overflow_retry() {
if !is_x86_feature_detected!("avx512bw") {
return;
}
let big: String = {
let mut s = String::from("[");
for i in 0..200u32 {
if i > 0 {
s.push(',');
}
s.push_str(&format!(r#"{{"k":{i}}}"#));
}
s.push(']');
s
};
let tape =
unsafe { parse_to_dom_zmm(&big, Some(4)) }.expect("overflow retry should succeed");
assert_eq!(tape.root().unwrap().array_iter().unwrap().count(), 200);
}
// Regression (per the test name): the SWAR byte comparison must not report a
// quote for bytes adjacent to `"` such as `#`, next to backslash runs.
#[test]
fn swar_eq_byte_quote_false_positive_regression() {
use crate::JsonRef;
let dom = parse_to_dom("\"#\\\\\"", None).expect("\"#\\\\\" should parse");
assert_eq!(dom.root().as_str(), Some("#\\"));
let dom = parse_to_dom("\"# \\\\\"", None).expect("\"# \\\\\" should parse");
assert_eq!(dom.root().as_str(), Some("# \\"));
let dom = parse_to_dom("\"=\\\\\\\"#\"", None).expect("\"=\\\\\\\"#\\\" should parse");
assert_eq!(dom.root().as_str(), Some("=\\\"#"));
}
// Same inputs through the SAX interface; the writer sees the raw
// (still escaped) string body.
#[test]
fn swar_eq_byte_quote_false_positive_regression_sax() {
struct Capture(Option<String>);
impl<'s> Sax<'s> for Capture {
type Output = String;
fn string(&mut self, s: &str) {
self.0 = Some(s.to_owned());
}
fn escaped_string(&mut self, s: &str) {
self.0 = Some(s.to_owned());
}
fn finish(self) -> Option<String> {
self.0
}
}
let cases: &[(&str, &str)] = &[
("\"#\\\\\"", "#\\\\"),
("\"# \\\\\"", "# \\\\"),
("\"=\\\\\\\"#\"", "=\\\\\\\"#"),
];
for &(src, expected_raw) in cases {
let got = parse_with(src, Capture(None))
.unwrap_or_else(|| panic!("parse_with rejected {src:?}"));
assert_eq!(got, expected_raw, "SAX raw string mismatch for {src:?}");
}
}
}