simd-json 0.7.0

High performance JSON parser based on a port of simdjson
Documentation
mod numberconst;

use self::numberconst::{MANTISSA_128, POW10, POW10_COMPONENTS};
use super::{is_integer, is_not_structural_or_whitespace_or_exponent_or_decimal};

#[cfg(feature = "swar-number-parsing")]
use super::{is_made_of_eight_digits_fast, parse_eight_digits_unrolled};

use crate::charutils::is_structural_or_whitespace;
use crate::error::Error;
use crate::safer_unchecked::GetSaferUnchecked;
use crate::unlikely;
use crate::StaticNode;
use crate::{mem, static_cast_i64, Deserializer, ErrorType, Result};

/// Reads element `$pos` of `$slice` through `get_kinda_unchecked`, casting the
/// index to `usize` first.
///
/// The access is wrapped in `unsafe`; every call site is responsible for passing
/// an index that is valid for the slice.
/// NOTE(review): presumably bounds-checked in debug builds only — confirm in
/// `crate::safer_unchecked`.
macro_rules! get {
    ($slice:ident, $pos:expr) => {
        unsafe { *$slice.get_kinda_unchecked($pos as usize) }
    };
}
/// Returns early from the enclosing function with an `ErrorType::InvalidNumber`
/// error built from the position `$at` and the byte `$byte` (converted to `char`).
macro_rules! err {
    ($at:ident, $byte:expr) => {
        return Err(Error::new($at, $byte as char, ErrorType::InvalidNumber))
    };
}

/// Multiplies two `u64` values with full 128-bit precision.
///
/// Returns `(low, high)`: the least significant and the most significant
/// 64 bits of the product, in that order.
#[cfg_attr(not(feature = "no-inline"), inline)]
#[allow(clippy::cast_possible_truncation)]
fn multiply_as_u128(a: u64, b: u64) -> (u64, u64) {
    let product = u128::from(a) * u128::from(b);
    // the cast keeps only the low 64 bits, which is exactly the intent here
    let low = product as u64;
    let high = (product >> 64) as u64;
    (low, high)
}

impl<'de> Deserializer<'de> {
    #[allow(
        unused_unsafe,
        clippy::cast_possible_wrap,
        clippy::cast_possible_truncation,
        clippy::too_many_lines
    )]
    pub(crate) fn parse_number(idx: usize, buf: &[u8], negative: bool) -> Result<StaticNode> {
        let start_idx = idx;
        let mut idx = idx;
        if negative {
            idx += 1;
            if !is_integer(get!(buf, idx)) {
                err!(idx, get!(buf, idx))
            }
        }
        let mut start = idx;
        let mut num: u64 = 0;
        if get!(buf, idx) == b'0' {
            idx += 1;
            if is_not_structural_or_whitespace_or_exponent_or_decimal(get!(buf, idx)) {
                err!(idx, get!(buf, idx))
            }
        } else {
            if !is_integer(get!(buf, idx)) {
                err!(idx, get!(buf, idx))
            }
            num = u64::from(get!(buf, idx) - b'0');
            idx += 1;
            while is_integer(get!(buf, idx)) {
                num = 10_u64
                    .wrapping_mul(num)
                    .wrapping_add(u64::from(get!(buf, idx) - b'0'));
                idx += 1;
            }
        }
        let mut exponent: i64 = 0;
        let mut is_float = false;
        if get!(buf, idx) == b'.' {
            is_float = true;
            idx += 1;
            let first_after_period = idx as i64;
            if is_integer(get!(buf, idx)) {
                num = 10_u64.wrapping_mul(num) + u64::from(get!(buf, idx) - b'0');
                idx += 1;
            } else {
                err!(idx, get!(buf, idx))
            }

            #[cfg(feature = "swar-number-parsing")]
            {
                if is_made_of_eight_digits_fast(&buf[idx..]) {
                    num = 100_000_000_u64
                        .wrapping_mul(num)
                        .wrapping_add(u64::from(parse_eight_digits_unrolled(&buf[idx..])));
                    idx += 8;
                }
            }
            while is_integer(get!(buf, idx)) {
                num = 10_u64.wrapping_mul(num) + u64::from(get!(buf, idx) - b'0');
                idx += 1;
            }
            exponent = first_after_period.wrapping_sub(idx as i64);
        }
        let mut digit_count = idx - start_idx - 1;
        match get!(buf, idx) {
            b'e' | b'E' => {
                is_float = true;
                idx += 1;
                let neg_exp: bool;
                match get!(buf, idx) {
                    b'-' => {
                        neg_exp = true;
                        idx += 1;
                    }
                    b'+' => {
                        neg_exp = false;
                        idx += 1;
                    }
                    _ => {
                        neg_exp = false;
                    }
                }
                if !is_integer(get!(buf, idx)) {
                    err!(idx, get!(buf, idx))
                }
                let mut exp_number = i64::from(get!(buf, idx) - b'0');
                idx += 1;
                if is_integer(get!(buf, idx)) {
                    exp_number = 10 * exp_number + i64::from(get!(buf, idx) - b'0');
                    idx += 1;
                }
                if is_integer(get!(buf, idx)) {
                    exp_number = 10 * exp_number + i64::from(get!(buf, idx) - b'0');
                    idx += 1;
                }
                while is_integer(get!(buf, idx)) {
                    if exp_number > 0x0001_0000_0000 {
                        err!(idx, get!(buf, idx))
                    }
                    exp_number = 10 * exp_number + i64::from(get!(buf, idx) - b'0');
                    idx += 1;
                }
                exponent += if neg_exp { -exp_number } else { exp_number };
            }
            _ => {}
        }
        if is_float {
            if unlikely!(digit_count >= 19) {
                let start_digits = get!(buf, start) as usize;
                while get!(buf, start) == b'0' || get!(buf, start) == b'.' {
                    start += 1;
                }
                digit_count = digit_count.wrapping_sub((start.wrapping_sub(start_digits)) as usize);
                if digit_count >= 19 {
                    return f64_from_parts_slow(&buf[start_idx..idx], start_idx);
                }
            }
            if is_structural_or_whitespace(get!(buf, idx)) == 0 {
                err!(idx, get!(buf, idx))
            }
            f64_from_parts(
                !negative,
                num,
                exponent as i32,
                &buf[start_idx..idx],
                start_idx,
            )
        } else if unlikely!(digit_count >= 18) {
            parse_large_integer(start_idx, buf, negative)
        } else if is_structural_or_whitespace(get!(buf, idx)) == 0 {
            err!(idx, get!(buf, idx))
        } else {
            Ok(StaticNode::I64(if negative {
                unsafe { static_cast_i64!(num.wrapping_neg()) } // -(num as i64)
            } else {
                num as i64
            }))
        }
    }
}

/// Overflow-checked integer parser used when the `128bit` feature is disabled.
///
/// Re-reads the digits starting at `buf[start_idx]` (skipping one sign byte when
/// `negative` is set) and accumulates them into a `u64`, checking every step.
///
/// Returns `StaticNode::I64` when the signed value fits in an `i64`, and
/// `StaticNode::U64` for larger non-negative values.
///
/// # Errors
///
/// Returns an `ErrorType::InvalidNumber` error when the magnitude overflows a
/// `u64`, or when a negative value is smaller than `i64::MIN`.
#[cfg(not(feature = "128bit"))]
#[cold]
#[allow(clippy::cast_possible_wrap)]
fn parse_large_integer(start_idx: usize, buf: &[u8], negative: bool) -> Result<StaticNode> {
    // magnitude of `i64::MIN`: the only negative value whose magnitude exceeds `i64::MAX`
    const I64_MIN_MAGNITUDE: u64 = 9_223_372_036_854_775_808;

    let mut pos = start_idx + usize::from(negative);
    let lead = get!(buf, pos);
    pos += 1;

    let mut magnitude: u64 = 0;
    if lead != b'0' {
        magnitude = u64::from(lead - b'0');
        loop {
            let byte = get!(buf, pos);
            if !is_integer(byte) {
                break;
            }
            let digit = u64::from(byte - b'0');
            magnitude = match magnitude
                .checked_mul(10)
                .and_then(|scaled| scaled.checked_add(digit))
            {
                Some(next) => next,
                None => err!(pos, byte),
            };
            pos += 1;
        }
    }

    if magnitude < I64_MIN_MAGNITUDE {
        let value = magnitude as i64;
        Ok(StaticNode::I64(if negative { -value } else { value }))
    } else if !negative {
        Ok(StaticNode::U64(magnitude))
    } else if magnitude == I64_MIN_MAGNITUDE {
        Ok(StaticNode::I64(i64::MIN))
    } else {
        err!(pos, get!(buf, pos))
    }
}

/// Overflow-checked integer parser used when the `128bit` feature is enabled.
///
/// Re-reads the digits starting at `buf[start_idx]` (skipping one sign byte when
/// `negative` is set) and accumulates them into a `u128`, checking every step.
///
/// Non-negative values are returned as `StaticNode::U64` when they fit in a
/// `u64` and as `StaticNode::U128` otherwise. Negative values are returned as
/// `StaticNode::I64` when they fit in an `i64` and as `StaticNode::I128`
/// otherwise.
///
/// # Errors
///
/// Returns an `ErrorType::InvalidNumber` error when the magnitude overflows a
/// `u128`, or when a negative value is smaller than `i128::MIN`.
#[cfg(feature = "128bit")]
#[cold]
#[allow(clippy::cast_possible_wrap)]
fn parse_large_integer(start_idx: usize, buf: &[u8], negative: bool) -> Result<StaticNode> {
    // magnitude of `i128::MIN`: the only negative value whose magnitude exceeds `i128::MAX`
    const I128_MIN_MAGNITUDE: u128 = 170_141_183_460_469_231_731_687_303_715_884_105_728_u128;

    let mut pos = start_idx + usize::from(negative);
    let lead = get!(buf, pos);
    pos += 1;

    let mut magnitude: u128 = 0;
    if lead != b'0' {
        magnitude = u128::from(lead - b'0');
        loop {
            let byte = get!(buf, pos);
            if !is_integer(byte) {
                break;
            }
            let digit = u128::from(byte - b'0');
            magnitude = match magnitude
                .checked_mul(10)
                .and_then(|scaled| scaled.checked_add(digit))
            {
                Some(next) => next,
                None => err!(pos, byte),
            };
            pos += 1;
        }
    }

    if !negative {
        return Ok(match u64::try_from(magnitude) {
            Ok(small) => StaticNode::U64(small),
            Err(_) => StaticNode::U128(magnitude),
        });
    }
    if magnitude > I128_MIN_MAGNITUDE {
        err!(pos, get!(buf, pos))
    }
    if magnitude == I128_MIN_MAGNITUDE {
        return Ok(StaticNode::I128(i128::MIN));
    }
    let value = -(magnitude as i128);
    Ok(match i64::try_from(value) {
        Ok(small) => StaticNode::I64(small),
        Err(_) => StaticNode::I128(value),
    })
}

/// Builds an `f64` from a decimal significand and a power-of-ten exponent.
///
/// * `positive` - `false` produces the negated value.
/// * `significand` - the decimal digits of the number folded into an integer.
/// * `exponent` - the power of ten the significand has to be scaled by.
/// * `slice` - the raw text of the number; only used for the slow fallback and
///   for error reporting.
/// * `offset` - position of the number in the input, reported in errors.
///
/// Whenever the fast computation cannot guarantee a correctly rounded result the
/// function defers to `f64_from_parts_slow`, which re-parses `slice`.
///
/// # Errors
///
/// Returns an `ErrorType::InvalidNumber` error when the result would be
/// infinite, or when the slow fallback fails.
#[allow(
    clippy::cast_sign_loss,
    clippy::cast_possible_truncation,
    clippy::cast_precision_loss,
    clippy::cast_possible_wrap
)]
fn f64_from_parts(
    positive: bool,
    significand: u64,
    exponent: i32,
    slice: &[u8],
    offset: usize,
) -> Result<StaticNode> {
    // 9_007_199_254_740_991 is 2^53 - 1: such a significand converts to f64
    // exactly, so a single multiplication or division by the `POW10` entry
    // (presumably the matching power of ten) is enough for small exponents
    if (-22..=22).contains(&exponent) && significand <= 9_007_199_254_740_991 {
        let mut f = significand as f64;
        if exponent < 0 {
            f /= get!(POW10, -exponent);
        } else {
            f *= get!(POW10, exponent);
        }
        Ok(StaticNode::F64(if positive { f } else { -f }))
    } else if significand == 0 {
        // zero stays zero whatever the exponent; only the sign is carried over
        Ok(StaticNode::F64(if positive { 0.0 } else { -0.0 }))
    } else if (-325..=308).contains(&exponent) {
        // the tables are indexed by `exponent + 325`, i.e. they start at 10^-325
        let (factor_mantissa, factor_exponent) = get!(POW10_COMPONENTS, exponent + 325);
        // normalise the significand so that its top bit is set
        let mut leading_zeroes = u64::from(significand.leading_zeros());
        let f = significand << leading_zeroes;
        let (mut lower, mut upper) = multiply_as_u128(f, factor_mantissa);
        // the low nine bits of `upper` being all ones, together with `lower + f`
        // overflowing, means the 64-bit factor was not precise enough: refine
        // the product with the second table
        if upper & 0x1FF == 0x1FF && lower.wrapping_add(f) < lower {
            let factor_mantissa_low = get!(MANTISSA_128, exponent + 325);
            let (product_low, product_middle2) = multiply_as_u128(f, factor_mantissa_low);
            let product_middle1 = lower;
            let mut product_high = upper;
            let product_middle = product_middle1.wrapping_add(product_middle2);
            // propagate the carry of the middle-word addition
            if product_middle < product_middle1 {
                product_high += 1;
            }
            // still ambiguous after the refinement: give up on the fast path
            if product_middle.wrapping_add(1) == 0
                && product_high & 0x1FF == 0x1FF
                && product_low.wrapping_add(f) < product_low
            {
                return f64_from_parts_slow(slice, offset);
            }
            upper = product_high;
            lower = product_middle;
        }
        // keep 54 bits of the product: 53 mantissa bits plus one rounding bit
        let upperbit = upper.wrapping_shr(63);
        let mut mantissa = upper.wrapping_shr((upperbit + 9) as u32);
        leading_zeroes += 1 ^ upperbit;

        // nothing below the rounding bit and an even mantissa with the rounding
        // bit set: bail out rather than guess the rounding direction
        if lower == 0 && upper.trailing_zeros() >= 9 && mantissa & 3 == 1 {
            return f64_from_parts_slow(slice, offset);
        }
        // round on the extra bit, then drop it
        mantissa += mantissa & 1;
        mantissa >>= 1;

        // rounding carried into bit 53: renormalise
        if mantissa >= 1 << 53 {
            mantissa = 1 << 52;
            leading_zeroes -= 1;
        }
        // bit 52 is implicit in the f64 encoding
        mantissa &= !(1 << 52);
        let real_exponent = (factor_exponent as u64).wrapping_sub(leading_zeroes);
        // we have to check that real_exponent is in range, otherwise we bail out
        if !(1..=2046).contains(&real_exponent) {
            return f64_from_parts_slow(slice, offset);
        }
        // assemble sign, exponent and mantissa bits
        mantissa |= real_exponent.wrapping_shl(52);
        mantissa |= u64::from(!positive) << 63;
        let res = f64::from_bits(mantissa);
        if res.is_infinite() {
            // `slice` starts at the number itself, so the byte to report is its
            // first one; `offset` is a position in the whole input and must not
            // be used to index `slice`
            err!(offset, slice.first().copied().unwrap_or_default())
        }
        Ok(StaticNode::F64(res))
    } else {
        f64_from_parts_slow(slice, offset)
    }
}

/// Fallback float parser: hands the raw number text in `slice` to
/// `lexical_core` using its JSON number format.
///
/// `offset` is the position of the number in the whole input; it is only used
/// for error reporting.
///
/// # Errors
///
/// Returns an `ErrorType::InvalidNumber` error when `lexical_core` rejects the
/// text or when the parsed value is infinite.
#[cold]
fn f64_from_parts_slow(slice: &[u8], offset: usize) -> Result<StaticNode> {
    // `slice` starts at the number itself, so the byte to report is its first
    // one; `offset` is a position in the whole input and must not be used to
    // index `slice`
    let first_byte = slice.first().copied().unwrap_or_default();
    match lexical_core::parse_with_options::<f64, { lexical_core::format::JSON }>(
        slice,
        &lexical_core::parse_float_options::JSON,
    ) {
        Ok(val) => {
            if val.is_infinite() {
                err!(offset, first_byte)
            }

            Ok(StaticNode::F64(val))
        }
        Err(_) => err!(offset, first_byte),
    }
}

#[cfg(test)]
mod test {
    #![allow(clippy::default_trait_access)]
    use crate::error::Error;
    use crate::value::owned::to_value;
    use crate::value::owned::Value;
    use crate::value::owned::Value::Static;
    use value_trait::StaticNode::F64;
    use value_trait::StaticNode::I64;

    /// Parses `buf` as JSON, working on a private mutable copy of its bytes.
    fn to_value_from_str(buf: &str) -> Result<Value, Error> {
        // `to_value` needs a mutable byte slice; an owned `Vec<u8>` provides one
        // without `unsafe` and without touching a `String`'s bytes in place
        let mut bytes = buf.as_bytes().to_vec();
        to_value(bytes.as_mut_slice())
    }

    /// Floats with fractions and exponents in various spellings.
    #[test]
    fn float() {
        assert_eq!(
            to_value_from_str("0.4e5").expect("40000.0"),
            Static(F64(40000.0))
        );
        assert_eq!(
            to_value_from_str("-12345678901234.56789012").unwrap(),
            Static(F64(-12_345_678_901_234.568))
        );
        assert_eq!(to_value_from_str("0.4e-001").unwrap(), Static(F64(0.04)));
        assert_eq!(
            to_value_from_str("0.123456789e-12").unwrap(),
            Static(F64(1.234_567_89e-13))
        );
        assert_eq!(
            to_value_from_str("1.234567890E+34").unwrap(),
            1.234_567_89e34
        );
        assert_eq!(
            to_value_from_str("23456789012E66").unwrap(),
            Static(F64(2.345_678_901_2e76))
        );
        // 49 zeros followed by "123" are 52 fractional digits, so `e50` scales
        // the value to exactly 1.23
        let long_fraction = format!("0.{}123e50", "0".repeat(49));
        assert_eq!(
            to_value_from_str(&long_fraction).expect("1.23"),
            Static(F64(1.23))
        );
        assert_eq!(to_value_from_str("0.6").expect("0.6"), Static(F64(0.6)));
    }

    /// Values with 17 significant digits, positive and negative.
    #[test]
    fn float_precision() {
        assert_eq!(
            to_value_from_str("31.245270191439438").unwrap(),
            31.245_270_191_439_438
        );
        assert_eq!(
            to_value_from_str("-31.245270191439438").unwrap(),
            -31.245_270_191_439_438
        );
        assert_eq!(
            to_value_from_str("121.48791951161945").unwrap(),
            121.487_919_511_619_45
        );
        assert_eq!(
            to_value_from_str("-121.48791951161945").unwrap(),
            -121.487_919_511_619_45
        );
        assert_eq!(
            to_value_from_str("100.78399658203125").unwrap(),
            100.783_996_582_031_25
        );
        assert_eq!(
            to_value_from_str("-100.78399658203125").unwrap(),
            -100.783_996_582_031_25
        );
    }

    #[test]
    fn int_trailing_invalid() {
        // todo: these should fail but is not distinguished from trailing padding
        assert!(to_value_from_str("123\x00").is_ok());
        assert!(to_value_from_str("[123\x00]").is_ok());
    }

    /// Garbage directly after a float has to be rejected.
    #[test]
    fn float_trailing_invalid() {
        assert!(to_value_from_str("0.1.2").is_err());
        assert!(to_value_from_str("[0.1.2]").is_err());
        assert!(to_value_from_str("[-1.0.]").is_err());
        assert!(to_value_from_str("[1.2a-3]").is_err());
        assert!(to_value_from_str("[1.8011670033376514H-308]").is_err());
    }

    /// A decimal point has to be followed by at least one digit.
    #[test]
    fn bad_dot() {
        assert!(to_value_from_str("1.").is_err());
        assert!(to_value_from_str("1.e").is_err());
        assert!(to_value_from_str("100000000000000000000000000000000000000000000.").is_err());
        assert!(to_value_from_str("100000000000000000000000000000000000000000000.e").is_err());
    }

    /// An exponent marker has to be followed by at least one digit.
    #[test]
    fn bad_e() {
        assert!(to_value_from_str("1.0e").is_err());
        assert!(to_value_from_str("100000000000000000000000000000000000000000000.0e").is_err());
        assert!(to_value_from_str("100000000000000000000000000000000000000000000.0ee").is_err());
    }

    /// Textual infinities are not JSON.
    #[test]
    fn infinite_literal() {
        assert!(to_value_from_str("inf").is_err());
        assert!(to_value_from_str("Inf").is_err());
        assert!(to_value_from_str("-inf").is_err());
        assert!(to_value_from_str("-Inf").is_err());
        assert!(to_value_from_str("infinity").is_err());
        assert!(to_value_from_str("Infinity").is_err());
        assert!(to_value_from_str("-infinity").is_err());
        assert!(to_value_from_str("-Infinity").is_err());
    }

    /// Exponents that push the value beyond the f64 range are errors.
    #[test]
    fn infinite_exponent() {
        assert!(to_value_from_str("1e309").is_err());
        assert!(to_value_from_str("1e1000").is_err());
        assert!(to_value_from_str("100000000000000000000000000000000000000000000e309").is_err());
        assert!(to_value_from_str("100000000000000000000000000000000000000000000e1000").is_err());
    }

    #[test]
    fn nan() {
        assert!(to_value_from_str("NaN").is_err());
        assert!(to_value_from_str("nan").is_err());
    }

    #[test]
    fn zero_int() {
        assert_eq!(to_value_from_str("0").expect("0"), Static(I64(0)));
    }

    /// Zero written as a float keeps the sign of the significand.
    #[test]
    fn zero_float() {
        assert_eq!(to_value_from_str("0e1").expect("0e1"), Static(F64(0.0)));
        assert_eq!(to_value_from_str("0.00e-00").unwrap(), Static(F64(0.0)));
        // a negative exponent does not make the zero negative
        assert_eq!(to_value_from_str("0e-1").expect("0e-1"), Static(F64(0.0)));
        assert_eq!(to_value_from_str("-0.00e-00").unwrap(), Static(F64(-0.0)));
    }

    #[test]
    fn int() {
        assert_eq!(to_value_from_str("1").unwrap(), Static(I64(1)));
        assert_eq!(to_value_from_str("257").unwrap(), Static(I64(257)));
    }

    /// A value below the smallest normal f64, written with an exponent.
    #[test]
    fn minus_309() {
        assert_eq!(
            to_value_from_str("-5.96916642387374e-309").unwrap(),
            Static(F64(-5.969_166_423_873_74e-_309))
        );
    }

    /// The value of `minus_309` written positionally: 308 zeros after the
    /// decimal point, then the digits, and no exponent.
    #[test]
    fn tiny_float() {
        let positional = format!("-0.{}596916642387374", "0".repeat(308));
        assert_eq!(
            to_value_from_str(&positional).unwrap(),
            Static(F64(-5.969_166_423_873_74e-309))
        );
    }
}