nojson 0.3.11 - Docs.rs

//! SWAR (SIMD Within A Register) optimization for JSON string scanning.
//!
//! Checks 8 bytes at a time using bitwise operations on a `u64` to find
//! bytes that require special handling in JSON strings: control characters
//! (< 0x20), double quote (0x22), and backslash (0x5C).

const MASK_80: u64 = 0x8080808080808080;
const MASK_01: u64 = 0x0101010101010101;

/// Returns the number of leading bytes in `s` that are "plain" ASCII for JSON strings:
/// - Not a control character (byte >= 0x20)
/// - Not a double quote `"` (0x22)
/// - Not a backslash `\` (0x5C)
/// - ASCII (byte < 0x80), so byte/char boundaries are guaranteed to align
pub(crate) fn skip_plain_ascii_bytes(s: &[u8]) -> usize {
    let mut i = 0;

    // Process 8 bytes at a time
    while i + 8 <= s.len() {
        let chunk: [u8; 8] = s[i..i + 8].try_into().unwrap();
        let w = u64::from_ne_bytes(chunk);

        if is_all_plain_ascii(w) {
            i += 8;
        } else {
            // Find the first non-plain byte within this chunk
            return i + first_non_plain_offset(w);
        }
    }

    // Handle remaining bytes (< 8)
    while i < s.len() {
        if is_plain_ascii_byte(s[i]) {
            i += 1;
        } else {
            break;
        }
    }

    i
}

/// Returns the number of leading bytes in `s` that belong to a non-ASCII UTF-8 run.
///
/// This assumes `s` starts at a UTF-8 character boundary. It stops at the first ASCII byte,
/// which is also a character boundary in valid UTF-8.
#[inline(always)]
pub(crate) fn skip_non_ascii_bytes(s: &[u8]) -> usize {
    let mut i = 0;

    // Process 8 bytes at a time: all non-ASCII bytes have bit 7 set
    while i + 8 <= s.len() {
        let chunk: [u8; 8] = s[i..i + 8].try_into().unwrap();
        let w = u64::from_ne_bytes(chunk);
        if w & MASK_80 == MASK_80 {
            i += 8;
        } else {
            break;
        }
    }

    while i < s.len() && s[i] & 0x80 != 0 {
        i += 1;
    }
    i
}

/// Returns the number of leading bytes in `s` that are JSON whitespace
/// (` `, `\t`, `\r`, or `\n`).
///
/// Most calls in compact JSON pass strings whose first byte is non-whitespace,
/// so a byte-by-byte loop is faster than an 8-byte SWAR setup cost. The
/// compiler turns the match into a fast dispatch and unrolls the loop for
/// any longer run.
#[inline]
pub(crate) fn skip_json_whitespace(s: &[u8]) -> usize {
    let mut i = 0;
    while i < s.len() && matches!(s[i], b' ' | b'\t' | b'\r' | b'\n') {
        i += 1;
    }
    i
}

/// Returns the number of leading bytes in `s` that are ASCII digits (`'0'..='9'`).
pub(crate) fn skip_ascii_digits(s: &[u8]) -> usize {
    let mut i = 0;

    // Process 8 bytes at a time.
    while i + 8 <= s.len() {
        let chunk: [u8; 8] = s[i..i + 8].try_into().unwrap();
        let w = u64::from_ne_bytes(chunk);

        if is_all_ascii_digits(w) {
            i += 8;
        } else {
            return i + first_non_digit_offset(w);
        }
    }

    while i < s.len() && s[i].is_ascii_digit() {
        i += 1;
    }

    i
}

/// Build a mask with bit 7 set for each byte in `w` that is NOT an ASCII digit.
/// A byte is non-digit if: byte < 0x30, byte > 0x39, or byte >= 0x80.
#[inline(always)]
fn non_digit_mask(w: u64) -> u64 {
    // Non-ASCII: byte >= 0x80
    let non_ascii = w & MASK_80;

    // byte < 0x30: b + 0x50 has bit 7 set iff b >= 0x30 (within ASCII).
    // XOR with MASK_80 inverts so bit 7 set iff b < 0x30.
    let lt_30 = (w.wrapping_add(0x5050505050505050) ^ MASK_80) & MASK_80;

    // byte > 0x39 (within ASCII): b + 0x46 has bit 7 set iff b >= 0x3A.
    // For non-ASCII (b >= 0x80), addition can wrap; that case is covered by
    // `non_ascii`, and any spurious carry into a later byte is harmless
    // because we only report the first non-digit position.
    let gt_39 = w.wrapping_add(0x4646464646464646) & MASK_80;

    non_ascii | lt_30 | gt_39
}

#[inline(always)]
fn is_all_ascii_digits(w: u64) -> bool {
    non_digit_mask(w) == 0
}

#[inline(always)]
fn first_non_digit_offset(w: u64) -> usize {
    let fail = non_digit_mask(w);
    #[cfg(target_endian = "little")]
    {
        (fail.trailing_zeros() / 8) as usize
    }
    #[cfg(target_endian = "big")]
    {
        (fail.leading_zeros() / 8) as usize
    }
}

/// Build a mask with bit 7 set for each byte in `w` that is NOT plain ASCII for JSON strings.
/// A byte is non-plain if: >= 128, < 32, == 0x22 (`"`), or == 0x5C (`\`).
#[inline(always)]
fn non_plain_mask(w: u64) -> u64 {
    // Non-ASCII: byte >= 128
    let non_ascii = w & MASK_80;

    // Control chars: byte < 32 (for bytes < 128)
    // For b in [0x00, 0x7F]: b + 0x60 sets bit 7 iff b >= 0x20
    let control = (w.wrapping_add(0x6060606060606060) ^ MASK_80) & MASK_80;

    // Quote (0x22): Mycroft's zero-byte detection
    let xor_quote = w ^ 0x2222222222222222;
    let quote = xor_quote.wrapping_sub(MASK_01) & !xor_quote & MASK_80;

    // Backslash (0x5C): Mycroft's zero-byte detection
    let xor_bslash = w ^ 0x5C5C5C5C5C5C5C5C;
    let bslash = xor_bslash.wrapping_sub(MASK_01) & !xor_bslash & MASK_80;

    non_ascii | control | quote | bslash
}

/// Check if all 8 bytes in `w` are plain ASCII for JSON strings.
#[inline(always)]
fn is_all_plain_ascii(w: u64) -> bool {
    non_plain_mask(w) == 0
}

/// Find the byte offset of the first non-plain byte within a u64 word.
/// Assumes at least one non-plain byte exists.
#[inline(always)]
fn first_non_plain_offset(w: u64) -> usize {
    let fail = non_plain_mask(w);

    // Find the first set bit. On little-endian, trailing_zeros gives the
    // lowest-address byte. On big-endian, leading_zeros does.
    #[cfg(target_endian = "little")]
    {
        (fail.trailing_zeros() / 8) as usize
    }
    #[cfg(target_endian = "big")]
    {
        (fail.leading_zeros() / 8) as usize
    }
}

#[inline(always)]
fn is_plain_ascii_byte(b: u8) -> bool {
    (0x20..0x80).contains(&b) && b != b'"' && b != b'\\'
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn empty() {
        assert_eq!(skip_plain_ascii_bytes(b""), 0);
    }

    #[test]
    fn all_plain_short() {
        assert_eq!(skip_plain_ascii_bytes(b"hello"), 5);
    }

    #[test]
    fn all_plain_8_bytes() {
        assert_eq!(skip_plain_ascii_bytes(b"abcdefgh"), 8);
    }

    #[test]
    fn all_plain_16_bytes() {
        assert_eq!(skip_plain_ascii_bytes(b"abcdefghijklmnop"), 16);
    }

    #[test]
    fn quote_at_various_positions() {
        for pos in 0..16 {
            let mut buf = [b'a'; 16];
            buf[pos] = b'"';
            assert_eq!(
                skip_plain_ascii_bytes(&buf),
                pos,
                "quote at position {}",
                pos
            );
        }
    }

    #[test]
    fn backslash_at_various_positions() {
        for pos in 0..16 {
            let mut buf = [b'a'; 16];
            buf[pos] = b'\\';
            assert_eq!(
                skip_plain_ascii_bytes(&buf),
                pos,
                "backslash at position {}",
                pos
            );
        }
    }

    #[test]
    fn control_chars() {
        for b in 0..0x20u8 {
            let buf = [b'a', b'b', b'c', b];
            assert_eq!(skip_plain_ascii_bytes(&buf), 3, "control char 0x{:02x}", b);
        }
    }

    #[test]
    fn control_char_at_start() {
        assert_eq!(skip_plain_ascii_bytes(b"\x00abc"), 0);
        assert_eq!(skip_plain_ascii_bytes(b"\nabc"), 0);
        assert_eq!(skip_plain_ascii_bytes(b"\tabc"), 0);
    }

    #[test]
    fn non_ascii_stops() {
        assert_eq!(skip_plain_ascii_bytes("abc日本語".as_bytes()), 3);
        assert_eq!(skip_plain_ascii_bytes("あ".as_bytes()), 0);
    }

    #[test]
    fn non_ascii_run() {
        assert_eq!(skip_non_ascii_bytes("日本語x".as_bytes()), "日本語".len());
        assert_eq!(skip_non_ascii_bytes("あa".as_bytes()), "あ".len());
        assert_eq!(skip_non_ascii_bytes("abc".as_bytes()), 0);
    }

    #[test]
    fn high_ascii_boundary() {
        // 0x7F (DEL) is a control char in JSON context? No, JSON spec says < 0x20 are control.
        // DEL (0x7F) is technically a control char but JSON spec only requires escaping < 0x20.
        // Our function treats 0x7F as plain ASCII (>= 0x20 and < 0x80).
        assert_eq!(skip_plain_ascii_bytes(&[0x7F]), 1);
        // 0x80 is non-ASCII
        assert_eq!(skip_plain_ascii_bytes(&[0x80]), 0);
        // 0x20 (space) is the first plain byte
        assert_eq!(skip_plain_ascii_bytes(&[0x20]), 1);
        // 0x1F is last control char
        assert_eq!(skip_plain_ascii_bytes(&[0x1F]), 0);
    }

    #[test]
    fn mixed_content() {
        let input = b"Hello, World!\"rest";
        assert_eq!(skip_plain_ascii_bytes(input), 13); // stops at "
    }

    #[test]
    fn all_printable_ascii() {
        // All printable ASCII except " and \ should be plain
        let mut count = 0;
        for b in 0x20..0x80u8 {
            if b != b'"' && b != b'\\' {
                assert_eq!(
                    skip_plain_ascii_bytes(&[b]),
                    1,
                    "byte 0x{:02x} ('{}') should be plain",
                    b,
                    b as char,
                );
                count += 1;
            }
        }
        assert_eq!(count, 94); // 96 printable minus " and \
    }

    #[test]
    fn long_plain_then_special() {
        let buf = [b'x'; 1024];
        // All 1024 bytes are plain 'x'
        assert_eq!(skip_plain_ascii_bytes(&buf), 1024);

        let mut buf2 = [b'x'; 64];
        buf2[50] = b'"';
        assert_eq!(skip_plain_ascii_bytes(&buf2), 50);
    }

    #[test]
    fn digits_empty_and_short() {
        assert_eq!(skip_ascii_digits(b""), 0);
        assert_eq!(skip_ascii_digits(b"0"), 1);
        assert_eq!(skip_ascii_digits(b"9"), 1);
        assert_eq!(skip_ascii_digits(b"a"), 0);
        assert_eq!(skip_ascii_digits(b"123"), 3);
    }

    #[test]
    fn digits_full_chunk() {
        assert_eq!(skip_ascii_digits(b"01234567"), 8);
        assert_eq!(skip_ascii_digits(b"0123456789012345"), 16);
    }

    #[test]
    fn digits_stop_at_non_digit() {
        assert_eq!(skip_ascii_digits(b"12345abc"), 5);
        assert_eq!(skip_ascii_digits(b"1234567a"), 7);
        assert_eq!(skip_ascii_digits(b"12345678a"), 8);
        assert_eq!(skip_ascii_digits(b"123456789a"), 9);
    }

    #[test]
    fn digits_boundary_chars() {
        // '/' = 0x2F, just below '0' = 0x30
        assert_eq!(skip_ascii_digits(b"/"), 0);
        // ':' = 0x3A, just above '9' = 0x39
        assert_eq!(skip_ascii_digits(b":"), 0);
        // '0' boundary
        assert_eq!(skip_ascii_digits(b"0/"), 1);
        // '9' boundary
        assert_eq!(skip_ascii_digits(b"9:"), 1);
    }

    #[test]
    fn digits_non_ascii() {
        // Non-ASCII byte stops the run.
        let mut buf = b"12345\xC2\xA00".to_vec();
        assert_eq!(skip_ascii_digits(&buf), 5);
        // Non-ASCII at start.
        buf = b"\xFF1234".to_vec();
        assert_eq!(skip_ascii_digits(&buf), 0);
    }

    #[test]
    fn digits_each_non_digit_position() {
        for pos in 0..16 {
            let mut buf = [b'5'; 16];
            buf[pos] = b'.';
            assert_eq!(
                skip_ascii_digits(&buf),
                pos,
                "non-digit '.' at position {pos}"
            );
        }
    }

    #[test]
    fn ws_empty_and_short() {
        assert_eq!(skip_json_whitespace(b""), 0);
        assert_eq!(skip_json_whitespace(b" "), 1);
        assert_eq!(skip_json_whitespace(b"\t"), 1);
        assert_eq!(skip_json_whitespace(b"\r"), 1);
        assert_eq!(skip_json_whitespace(b"\n"), 1);
        assert_eq!(skip_json_whitespace(b"  \t \n "), 6);
        assert_eq!(skip_json_whitespace(b"a"), 0);
    }

    #[test]
    fn ws_full_chunk() {
        assert_eq!(skip_json_whitespace(b"        "), 8);
        assert_eq!(skip_json_whitespace(b"\t\n\r \t\n\r "), 8);
        assert_eq!(skip_json_whitespace(b"        \t       "), 16);
    }

    #[test]
    fn ws_stop_at_non_ws() {
        assert_eq!(skip_json_whitespace(b"   abc"), 3);
        assert_eq!(skip_json_whitespace(b"\n\n{"), 2);
        // Non-whitespace at every chunk boundary position.
        for pos in 0..16 {
            let mut buf = [b' '; 16];
            buf[pos] = b'a';
            assert_eq!(
                skip_json_whitespace(&buf),
                pos,
                "non-ws 'a' at position {pos}"
            );
        }
    }

    #[test]
    fn ws_excludes_non_json_whitespace() {
        // Vertical tab (0x0B), form feed (0x0C), and other low ASCII are NOT
        // JSON whitespace.
        assert_eq!(skip_json_whitespace(b"\x0B"), 0);
        assert_eq!(skip_json_whitespace(b"\x0C"), 0);
        assert_eq!(skip_json_whitespace(b"\x00"), 0);
        // Non-breaking space (Unicode U+00A0, UTF-8 0xC2 0xA0) is not JSON ws.
        assert_eq!(skip_json_whitespace(b"\xC2\xA0"), 0);
    }

    #[test]
    fn ws_long_run() {
        let buf = [b' '; 1024];
        assert_eq!(skip_json_whitespace(&buf), 1024);
        let mut buf2 = [b' '; 256];
        buf2[200] = b'{';
        assert_eq!(skip_json_whitespace(&buf2), 200);
    }
}