// simd-json 0.7.0
//
// High performance JSON parser based on a port of simdjson
// Documentation
#![allow(dead_code)]
use crate::charutils::is_not_structural_or_whitespace;
use crate::safer_unchecked::GetSaferUnchecked;
use crate::value::tape::Node;
use crate::{Deserializer, Error, ErrorType, Result};
use value_trait::StaticNode;

#[cfg_attr(not(feature = "no-inline"), inline(always))]
#[allow(clippy::cast_ptr_alignment)]
pub fn is_valid_true_atom(loc: &[u8]) -> bool {
    debug_assert!(loc.len() >= 8, "loc too short for a u64 read");

    // TODO is this expensive?
    let mut error: u64;
    unsafe {
        //let tv: u64 = *(b"true    ".as_ptr() as *const u64);
        // this is the same:
        const TV: u64 = 0x00_00_00_00_65_75_72_74;
        const MASK4: u64 = 0x00_00_00_00_ff_ff_ff_ff;

        // TODO: does this has the same effect as:
        //   std::memcpy(&locval, loc, sizeof(uint64_t));
        let locval: u64 = loc.as_ptr().cast::<u64>().read_unaligned();

        error = (locval & MASK4) ^ TV;
        error |= u64::from(is_not_structural_or_whitespace(*loc.get_kinda_unchecked(4)));
    }
    error == 0
}

// Element / sub-slice access used throughout this file.
//
// With the `safe` feature enabled this is ordinary, bounds-checked indexing.
#[cfg(feature = "safe")]
macro_rules! get {
    ($slice:expr, $index:expr) => {
        &$slice[$index]
    };
}

// Without the `safe` feature the access is delegated to
// `get_kinda_unchecked`; the macro supplies the `unsafe` block itself, so
// call sites look the same under both configurations.
#[cfg(not(feature = "safe"))]
macro_rules! get {
    ($slice:expr, $index:expr) => {{
        unsafe { $slice.get_kinda_unchecked($index) }
    }};
}

/// Returns `true` when `loc` starts with the literal `false` and the byte
/// right after it is accepted by `is_not_structural_or_whitespace`
/// (i.e. that helper returns zero for it).
///
/// The first eight bytes of `loc` are loaded as a single `u64` and the low
/// five bytes are compared against the literal in one operation.
///
/// `loc` must be at least 8 bytes long. This is only verified by a
/// `debug_assert!`; release builds perform the 8-byte load unchecked, so
/// callers have to guarantee the length themselves.
#[cfg_attr(not(feature = "no-inline"), inline(always))]
#[allow(clippy::cast_ptr_alignment, unused_unsafe)]
pub fn is_valid_false_atom(loc: &[u8]) -> bool {
    // `b"false"` laid out as the five low bytes of a little-endian u64.
    const FV: u64 = 0x00_00_00_65_73_6c_61_66;
    const MASK5: u64 = 0x00_00_00_ff_ff_ff_ff_ff;

    debug_assert!(loc.len() >= 8, "loc too short for a u64 read");

    let mut error: u64;
    // SAFETY: the caller guarantees `loc` holds at least 8 bytes (asserted in
    // debug builds), so both the unaligned 8-byte load and index 5 are in
    // bounds. `read_unaligned` places no alignment requirement on `loc`.
    unsafe {
        // The constants above are written for little-endian byte order, so the
        // loaded word is normalised with `from_le`; this is a no-op on
        // little-endian targets and keeps the comparison correct elsewhere.
        let locval = u64::from_le(loc.as_ptr().cast::<u64>().read_unaligned());

        // The comparison is deliberately kept at full u64 width: truncating
        // the result to u32 (as the code this was ported from did) would drop
        // the fifth byte and let inputs such as `falsy` pass.
        error = (locval & MASK5) ^ FV;
        // The byte following the literal must be an allowed terminator.
        error |= u64::from(is_not_structural_or_whitespace(*get!(loc, 5)));
    }
    error == 0
}

/// Returns `true` when `loc` starts with the literal `null` and the byte
/// right after it is accepted by `is_not_structural_or_whitespace`
/// (i.e. that helper returns zero for it).
///
/// The first eight bytes of `loc` are loaded as a single `u64` and the low
/// four bytes are compared against the literal in one operation.
///
/// `loc` must be at least 8 bytes long. This is only verified by a
/// `debug_assert!`; release builds perform the 8-byte load unchecked, so
/// callers have to guarantee the length themselves.
#[cfg_attr(not(feature = "no-inline"), inline(always))]
#[allow(clippy::cast_ptr_alignment, unused_unsafe)]
pub fn is_valid_null_atom(loc: &[u8]) -> bool {
    // `b"null"` laid out as the four low bytes of a little-endian u64.
    const NV: u64 = 0x00_00_00_00_6c_6c_75_6e;
    const MASK4: u64 = 0x00_00_00_00_ff_ff_ff_ff;

    debug_assert!(loc.len() >= 8, "loc too short for a u64 read");

    let mut error: u64;
    // SAFETY: the caller guarantees `loc` holds at least 8 bytes (asserted in
    // debug builds), so both the unaligned 8-byte load and index 4 are in
    // bounds. `read_unaligned` places no alignment requirement on `loc`.
    unsafe {
        // The constants above are written for little-endian byte order, so the
        // loaded word is normalised with `from_le`; this is a no-op on
        // little-endian targets and keeps the comparison correct elsewhere.
        let locval = u64::from_le(loc.as_ptr().cast::<u64>().read_unaligned());

        // Non-zero bits mean the literal did not match ...
        error = (locval & MASK4) ^ NV;
        // ... or the byte following the literal is not an allowed terminator.
        error |= u64::from(is_not_structural_or_whitespace(*get!(loc, 4)));
    }
    error == 0
}

/// States of the main loop in `Deserializer::build_tape`.
#[derive(Debug)]
enum State {
    /// An object key has just been written to the tape; a `:` followed by a
    /// value is expected next.
    ObjectKey,
    /// The current array or object was closed; its tape node gets patched and
    /// the enclosing scope is restored from the stack.
    ScopeEnd,
    /// The current character is dispatched as an array element.
    MainArraySwitch,
}
/// Kind of scope that was active when a nested array/object was opened.
///
/// `build_tape` stores one of these (together with the saved `last_start` and
/// `cnt`) per nesting level and uses it at scope end to decide how to continue.
#[derive(Debug)]
enum StackState {
    /// The container was opened at the top level; closing it ends the document.
    Start,
    /// The container was opened as the value of an object member.
    Object,
    /// The container was opened as an element of an array.
    Array,
}

impl<'de> Deserializer<'de> {
    /// Builds the tape of [`Node`]s by walking the structural characters
    /// listed in `structural_indexes`.
    ///
    /// * `input` – mutable input, forwarded to `parse_str_` when a string is
    ///   encountered.
    /// * `input2` – the bytes that `structural_indexes` points into; used to
    ///   read the structural characters, the atoms and the numbers
    ///   (presumably a padded copy of `input` — TODO confirm against caller).
    /// * `buffer` – scratch buffer forwarded to `parse_str_`.
    /// * `structural_indexes` – offsets of the structural characters inside
    ///   `input2`. Entry 0 is skipped.
    ///
    /// The returned tape starts with a placeholder `Null` node at index 0.
    /// `Array`/`Object` nodes are first written as `(0, 0)` and patched when
    /// their scope ends: the first field becomes the element count (number of
    /// values for arrays, number of key/value pairs for objects) and the
    /// second the tape index right after the container's last node.
    ///
    /// # Errors
    ///
    /// Returns an [`Error`] carrying the offset and character that were being
    /// looked at when
    /// * the structural indexes run out before the document is complete
    ///   (`Syntax`),
    /// * an unexpected character is found (`ExpectedArrayContent`,
    ///   `ExpectedObjectKey`, `ExpectedObjectContent`, `ExpectedObjectColon`,
    ///   or `InternalError` for characters that can not start a value),
    /// * a `true`/`false`/`null` literal is malformed (`ExpectedNull` at the
    ///   top level; `ExpectedBoolean`/`ExpectedNull` inside containers),
    /// * a top-level scalar is followed by more structurals
    ///   (`TrailingCharacters`),
    /// * `parse_str_` or `parse_number` fail (their error is passed through).
    #[allow(
        clippy::cognitive_complexity,
        clippy::too_many_lines,
        unused_unsafe,
        clippy::uninit_vec
    )]
    pub(crate) fn build_tape(
        input: &'de mut [u8],
        input2: &[u8],
        buffer: &mut [u8],
        structural_indexes: &[u32],
    ) -> Result<Vec<Node<'de>>> {
        // A valid JSON document has at most len/2 relevant elements
        // (`[[[]]]`), but an invalid one such as `[[[[[[` can exceed that, so
        // one slot per structural index is reserved to protect against it.
        // Both vectors are only ever written through raw pointers below; they
        // never reallocate.
        let mut res: Vec<Node<'de>> = Vec::with_capacity(structural_indexes.len());
        let mut stack = Vec::with_capacity(structural_indexes.len());

        // Current nesting depth; also the next free slot in `stack`.
        let mut depth: usize = 0;
        // Tape index of the Array/Object node of the scope currently open.
        let mut last_start = 1;
        // Number of elements seen so far in the scope currently open.
        let mut cnt: usize = 0;
        // Number of nodes written to `res` so far (next write position).
        let mut r_i = 0;

        // let mut i: usize = 0; // index of the structural character (0,1,2,3...)
        // location of the structural character in the input (buf)
        let mut idx: usize = 0;
        // used to track the (structural) character we are looking at, updated
        // by the `update_char!` macro
        let mut c: u8 = 0;
        // skip the zero index
        let mut i: usize = 1;
        let mut state;

        // Like `?`, but publishes the number of initialised nodes to `res`
        // before returning the error.
        macro_rules! s2try {
            ($e:expr) => {
                match $e {
                    ::std::result::Result::Ok(val) => val,
                    ::std::result::Result::Err(err) => {
                        // We need to ensure that rust doesn't
                        // try to free strings that we never
                        // allocated
                        unsafe {
                            res.set_len(r_i);
                        };
                        return ::std::result::Result::Err(err);
                    }
                }
            };
        }

        // Appends a node to the tape by writing straight into the reserved
        // capacity of `res`; the length is only set when the function returns.
        macro_rules! insert_res {
            ($t:expr) => {
                unsafe {
                    res.as_mut_ptr().add(r_i).write($t);
                    r_i += 1;
                }
            };
        }
        // Finalises the length of `res` and returns the tape.
        macro_rules! success {
            () => {
                unsafe {
                    res.set_len(r_i);
                }
                return Ok(res);
            };
        }
        // Advances to the next structural index, updating `idx` and `c`;
        // fails with a syntax error when there is none left.
        macro_rules! update_char {
            () => {
                if i < structural_indexes.len() {
                    idx = *get!(structural_indexes, i) as usize;
                    i += 1;
                    c = *get!(input2, idx);
                } else {
                    fail!(ErrorType::Syntax);
                }
            };
        }

        // Switches the state machine to `$state` and restarts the main loop.
        macro_rules! goto {
            ($state:expr) => {{
                state = $state;
                continue;
            }};
        }

        // Placeholder node at tape index 0.
        insert_res!(Node::Static(StaticNode::Null));

        // Parses the string starting at `idx` and appends it to the tape.
        macro_rules! insert_str {
            () => {
                insert_res!(Node::String(s2try!(Self::parse_str_(
                    input, &input2, buffer, idx
                ))));
            };
        }

        // The continue cases are the most frequently called ones, so it is
        // worth pulling them out into a macro (aka inlining them),
        // since we don't have a `goto` in Rust.
        //
        // After an array element: expect `,` (count it and dispatch the next
        // element) or `]` (close the scope).
        macro_rules! array_continue {
            () => {{
                update_char!();
                match c {
                    b',' => {
                        cnt += 1;
                        update_char!();
                        goto!(MainArraySwitch);
                    }
                    b']' => {
                        goto!(ScopeEnd);
                    }
                    _c => {
                        fail!(ErrorType::ExpectedArrayContent);
                    }
                }
            }};
        }

        // After an object value: expect `,` followed by the next key, or `}`
        // (close the scope).
        macro_rules! object_continue {
            () => {{
                update_char!();
                match c {
                    b',' => {
                        cnt += 1;
                        update_char!();
                        if c == b'"' {
                            insert_str!();
                            goto!(ObjectKey);
                        }
                        fail!(ErrorType::ExpectedObjectKey);
                    }
                    b'}' => {
                        goto!(ScopeEnd);
                    }
                    _ => {
                        fail!(ErrorType::ExpectedObjectContent);
                    }
                }
            }};
        }

        // Right after `[`: an immediate `]` is an empty array (count reset to
        // 0), anything else is dispatched as the first element.
        macro_rules! array_begin {
            () => {
                update_char!();
                if c == b']' {
                    cnt = 0;
                    goto!(ScopeEnd);
                }
                goto!(MainArraySwitch);
            };
        }

        // Right after `{`: either the first key, or `}` for an empty object
        // (count reset to 0).
        macro_rules! object_begin {
            () => {{
                update_char!();
                match c {
                    b'"' => {
                        insert_str!();
                        goto!(ObjectKey)
                    }
                    b'}' => {
                        cnt = 0;
                        goto!(ScopeEnd);
                    }
                    _c => {
                        fail!(ErrorType::ExpectedObjectContent);
                    }
                }
            }};
        }

        // Returns an error for the current `idx`/`c`. The macros above refer
        // to `fail!` before this definition; that works because none of them
        // is expanded before this point.
        macro_rules! fail {
            () => {
                // We need to ensure that rust doesn't
                // try to free strings that we never
                // allocated
                unsafe {
                    res.set_len(r_i);
                };
                return Err(Error::new(idx, c as char, ErrorType::InternalError));
            };
            ($t:expr) => {
                // We need to ensure that rust doesn't
                // try to free strings that we never
                // allocated
                unsafe {
                    res.set_len(r_i);
                };
                return Err(Error::new(idx, c as char, $t));
            };
        }
        // State start, we pull this outside of the
        // loop to reduce the number of required checks
        update_char!();
        match c {
            b'{' => {
                // Save the enclosing scope. SAFETY: relies on `depth` staying
                // below the capacity reserved for `stack` (one slot per
                // structural index).
                unsafe {
                    let s: *mut (StackState, usize, usize) = stack.as_mut_ptr();
                    s.add(depth).write((StackState::Start, last_start, cnt));
                }

                last_start = r_i;
                // Placeholder, patched in `ScopeEnd`.
                insert_res!(Node::Object(0, 0));

                depth += 1;
                cnt = 1;

                update_char!();
                match c {
                    b'"' => {
                        insert_str!();
                        state = State::ObjectKey;
                    }
                    b'}' => {
                        cnt = 0;
                        state = State::ScopeEnd;
                    }
                    _c => {
                        fail!(ErrorType::ExpectedObjectContent);
                    }
                }
            }
            b'[' => {
                // Save the enclosing scope, see the `{` arm above.
                unsafe {
                    let s: *mut (StackState, usize, usize) = stack.as_mut_ptr();
                    s.add(depth).write((StackState::Start, last_start, cnt));
                }

                last_start = r_i;
                // Placeholder, patched in `ScopeEnd`.
                insert_res!(Node::Array(0, 0));

                depth += 1;
                cnt = 1;

                update_char!();
                if c == b']' {
                    cnt = 0;
                    state = State::ScopeEnd;
                } else {
                    state = State::MainArraySwitch;
                }
            }
            // Top-level scalars: the document is only valid if the scalar was
            // the last structural.
            b't' => {
                unsafe {
                    if !is_valid_true_atom(get!(input2, idx..)) {
                        fail!(ErrorType::ExpectedNull); // TODO: better error
                    }
                };
                insert_res!(Node::Static(StaticNode::Bool(true)));
                if i == structural_indexes.len() {
                    success!();
                }
                fail!(ErrorType::TrailingCharacters);
            }
            b'f' => {
                unsafe {
                    if !is_valid_false_atom(get!(input2, idx..)) {
                        fail!(ErrorType::ExpectedNull); // TODO: better error
                    }
                };
                insert_res!(Node::Static(StaticNode::Bool(false)));
                if i == structural_indexes.len() {
                    success!();
                }
                fail!(ErrorType::TrailingCharacters);
            }
            b'n' => {
                unsafe {
                    if !is_valid_null_atom(get!(input2, idx..)) {
                        fail!(ErrorType::ExpectedNull); // TODO: better error
                    }
                };
                insert_res!(Node::Static(StaticNode::Null));
                if i == structural_indexes.len() {
                    success!();
                }
                fail!(ErrorType::TrailingCharacters);
            }
            b'"' => {
                insert_str!();
                if i == structural_indexes.len() {
                    success!();
                }
                fail!(ErrorType::TrailingCharacters);
            }
            b'-' => {
                // Third argument is `true` only for a leading `-`
                // (presumably the "negative" flag — verify in `parse_number`).
                insert_res!(Node::Static(s2try!(Self::parse_number(idx, input2, true))));

                if i == structural_indexes.len() {
                    success!();
                }
                fail!(ErrorType::TrailingCharacters);
            }
            b'0'..=b'9' => {
                insert_res!(Node::Static(s2try!(Self::parse_number(idx, input2, false))));

                if i == structural_indexes.len() {
                    success!();
                }
                fail!(ErrorType::TrailingCharacters);
            }
            _ => {
                fail!();
            }
        }

        // Main state machine; only reached when the document starts with an
        // array or object. Every arm leaves via `goto!`, `success!` or `fail!`.
        loop {
            use self::State::{MainArraySwitch, ObjectKey, ScopeEnd};
            match state {
                ////////////////////////////// OBJECT STATES /////////////////////////////
                ObjectKey => {
                    // The key is already on the tape: expect `:` and a value.
                    update_char!();
                    if unlikely!(c != b':') {
                        fail!(ErrorType::ExpectedObjectColon);
                    }
                    update_char!();
                    match c {
                        b'"' => {
                            insert_str!();
                            object_continue!();
                        }
                        b't' => {
                            // The node is written before the literal is
                            // validated; on failure the whole tape is
                            // discarded with the error.
                            insert_res!(Node::Static(StaticNode::Bool(true)));
                            if !is_valid_true_atom(get!(input2, idx..)) {
                                fail!(ErrorType::ExpectedBoolean); // TODO: better error
                            }
                            object_continue!();
                        }
                        b'f' => {
                            insert_res!(Node::Static(StaticNode::Bool(false)));
                            if !is_valid_false_atom(get!(input2, idx..)) {
                                fail!(ErrorType::ExpectedBoolean); // TODO: better error
                            }
                            object_continue!();
                        }
                        b'n' => {
                            insert_res!(Node::Static(StaticNode::Null));
                            if !is_valid_null_atom(get!(input2, idx..)) {
                                fail!(ErrorType::ExpectedNull); // TODO: better error
                            }
                            object_continue!();
                        }
                        b'-' => {
                            insert_res!(Node::Static(s2try!(Self::parse_number(
                                idx, input2, true
                            ))));

                            object_continue!();
                        }
                        b'0'..=b'9' => {
                            insert_res!(Node::Static(s2try!(Self::parse_number(
                                idx, input2, false
                            ))));

                            object_continue!();
                        }
                        b'{' => {
                            // Nested object as a member value: remember that
                            // we return into an object scope.
                            unsafe {
                                let s: *mut (StackState, usize, usize) = stack.as_mut_ptr();
                                s.add(depth).write((StackState::Object, last_start, cnt));
                            }
                            last_start = r_i;
                            insert_res!(Node::Object(0, 0));
                            depth += 1;
                            cnt = 1;
                            object_begin!();
                        }
                        b'[' => {
                            // Nested array as a member value.
                            unsafe {
                                let s: *mut (StackState, usize, usize) = stack.as_mut_ptr();
                                s.add(depth).write((StackState::Object, last_start, cnt));
                            }
                            last_start = r_i;
                            insert_res!(Node::Array(0, 0));
                            depth += 1;
                            cnt = 1;
                            array_begin!();
                        }
                        _c => {
                            fail!();
                        }
                    }
                }
                ////////////////////////////// COMMON STATE /////////////////////////////
                ScopeEnd => {
                    if depth == 0 {
                        fail!(ErrorType::Syntax);
                    }
                    depth -= 1;
                    // Patch the placeholder of the scope being closed with its
                    // element count and the tape index following its last
                    // node. `last_start` is only ever set to the index of an
                    // Array/Object node, hence the `unreachable!()`.
                    unsafe {
                        match *res.as_mut_ptr().add(last_start) {
                            Node::Array(ref mut len, ref mut end)
                            | Node::Object(ref mut len, ref mut end) => {
                                *len = cnt;
                                *end = r_i;
                            }
                            _ => unreachable!(),
                        };
                    }
                    // Restore the enclosing scope that was saved when this
                    // one was opened and continue according to its kind.
                    unsafe {
                        let a = stack.as_ptr().add(depth);

                        last_start = (*a).1;
                        cnt = (*a).2;

                        match (*a).0 {
                            StackState::Object => object_continue!(),
                            StackState::Array => array_continue!(),
                            StackState::Start => {
                                // Top-level container closed: nothing may
                                // follow it.
                                if i == structural_indexes.len() {
                                    success!();
                                }
                                fail!();
                            }
                        };
                    }
                }

                ////////////////////////////// ARRAY STATES /////////////////////////////
                MainArraySwitch => {
                    // `update_char!` is called on every path into this state,
                    // so `c` already holds the element's first character; this
                    // lets the paths that may accept a closing square bracket
                    // (after `[`) peek at it before coming here.
                    match c {
                        b'"' => {
                            insert_str!();
                            array_continue!();
                        }
                        b't' => {
                            insert_res!(Node::Static(StaticNode::Bool(true)));
                            if !is_valid_true_atom(get!(input2, idx..)) {
                                fail!(ErrorType::ExpectedBoolean); // TODO: better error
                            }
                            array_continue!();
                        }
                        b'f' => {
                            insert_res!(Node::Static(StaticNode::Bool(false)));
                            if !is_valid_false_atom(get!(input2, idx..)) {
                                fail!(ErrorType::ExpectedBoolean); // TODO: better error
                            }
                            array_continue!();
                        }
                        b'n' => {
                            insert_res!(Node::Static(StaticNode::Null));
                            if !is_valid_null_atom(get!(input2, idx..)) {
                                fail!(ErrorType::ExpectedNull); // TODO: better error
                            }
                            array_continue!();
                        }
                        b'-' => {
                            insert_res!(Node::Static(s2try!(Self::parse_number(
                                idx, input2, true
                            ))));

                            array_continue!();
                        }
                        b'0'..=b'9' => {
                            insert_res!(Node::Static(s2try!(Self::parse_number(
                                idx, input2, false
                            ))));

                            array_continue!();
                        }
                        b'{' => {
                            // Nested object as an array element: remember
                            // that we return into an array scope.
                            unsafe {
                                let s: *mut (StackState, usize, usize) = stack.as_mut_ptr();
                                s.add(depth).write((StackState::Array, last_start, cnt));
                            }
                            last_start = r_i;
                            insert_res!(Node::Object(0, 0));
                            depth += 1;
                            cnt = 1;
                            object_begin!();
                        }
                        b'[' => {
                            // Nested array as an array element.
                            unsafe {
                                let s: *mut (StackState, usize, usize) = stack.as_mut_ptr();
                                s.add(depth).write((StackState::Array, last_start, cnt));
                            }
                            last_start = r_i;
                            insert_res!(Node::Array(0, 0));
                            depth += 1;
                            cnt = 1;
                            array_begin!();
                        }
                        _c => {
                            fail!();
                        }
                    }
                }
            }
        }
    }
}

#[cfg(test)]
mod test {
    use super::*;

    // Every input below is exactly 8 bytes long, as required by the atom
    // validators (they load a full u64).

    #[test]
    fn true_atom() {
        let rejected: [&[u8]; 2] = [b"tru     ", b" rue    "];

        assert!(is_valid_true_atom(b"true    "));
        for &candidate in &rejected {
            assert!(!is_valid_true_atom(candidate));
        }
    }

    #[test]
    fn false_atom() {
        // `falsy` guards against comparing only the first four bytes.
        let rejected: [&[u8]; 4] = [b"falze   ", b"falsy   ", b"fals    ", b" alse   "];

        assert!(is_valid_false_atom(b"false   "));
        for &candidate in &rejected {
            assert!(!is_valid_false_atom(candidate));
        }
    }

    #[test]
    fn null_atom() {
        let rejected: [&[u8]; 2] = [b"nul     ", b" ull    "];

        assert!(is_valid_null_atom(b"null    "));
        for &candidate in &rejected {
            assert!(!is_valid_null_atom(candidate));
        }
    }

    #[cfg(feature = "serde_impl")]
    #[test]
    fn parsing_errors() {
        // Deserialises `json` as a `bool` and checks that it fails with
        // exactly the given error position, character and kind.
        #[track_caller]
        fn expect_error(json: &[u8], idx: usize, c: char, kind: ErrorType) {
            let mut data = json.to_vec();
            assert_eq!(
                crate::serde::from_slice::<bool>(&mut data),
                Err(Error::new(idx, c, kind))
            );
        }

        // Malformed literals at the top level are all reported as
        // `ExpectedNull`.
        expect_error(b"time", 0, 't', ErrorType::ExpectedNull);
        expect_error(b"falsy", 0, 'f', ErrorType::ExpectedNull);
        expect_error(b"new", 0, 'n', ErrorType::ExpectedNull);

        // Inside an array.
        expect_error(b"[true, time]", 7, 't', ErrorType::ExpectedBoolean);
        expect_error(b"[true, falsy]", 7, 'f', ErrorType::ExpectedBoolean);
        expect_error(b"[null, new]", 7, 'n', ErrorType::ExpectedNull);

        // As an object member value.
        expect_error(br#"{"1":time}"#, 5, 't', ErrorType::ExpectedBoolean);
        expect_error(br#"{"0":falsy}"#, 5, 'f', ErrorType::ExpectedBoolean);
        expect_error(br#"{"0":new}"#, 5, 'n', ErrorType::ExpectedNull);
    }
}