simd_json/
charutils.rs

1use crate::safer_unchecked::GetSaferUnchecked;
2
/// Byte-indexed table: `0` for the bytes treated as structural or whitespace (NUL, tab, line
/// feed, carriage return, space, `,`, `:`, `[`, `]`, `{`, `}`) and `1` for every other byte.
const STRUCTURAL_OR_WHITESPACE_NEGATED: [u32; 256] = {
    // The only bytes whose entry is cleared; everything else keeps the default of 1.
    const CLEARED: [u8; 11] = [
        0x00, b'\t', b'\n', b'\r', b' ', b',', b':', b'[', b']', b'{', b'}',
    ];

    let mut table = [1_u32; 256];
    // `for` loops are not available in constant evaluation, hence the manual counter.
    let mut i = 0;
    while i < CLEARED.len() {
        table[CLEARED[i] as usize] = 0;
        i += 1;
    }
    table
};
13
/// Byte-indexed table: `1` for the bytes treated as structural or whitespace (NUL, tab, line
/// feed, carriage return, space, `,`, `:`, `[`, `]`, `{`, `}`) and `0` for every other byte.
const STRUCTURAL_OR_WHITESPACE: [u32; 256] = {
    // The only bytes whose entry is set; everything else keeps the default of 0.
    const MARKED: [u8; 11] = [
        0x00, b'\t', b'\n', b'\r', b' ', b',', b':', b'[', b']', b'{', b'}',
    ];

    let mut table = [0_u32; 256];
    // `for` loops are not available in constant evaluation, hence the manual counter.
    let mut i = 0;
    while i < MARKED.len() {
        table[MARKED[i] as usize] = 1;
        i += 1;
    }
    table
};
24
25#[cfg_attr(not(feature = "no-inline"), inline)]
26pub fn is_not_structural_or_whitespace(c: u8) -> u32 {
27    unsafe { *STRUCTURAL_OR_WHITESPACE_NEGATED.get_kinda_unchecked(c as usize) }
28}
29
30#[cfg_attr(not(feature = "no-inline"), inline)]
31pub fn is_structural_or_whitespace(c: u8) -> u32 {
32    unsafe { *STRUCTURAL_OR_WHITESPACE.get_kinda_unchecked(c as usize) }
33}
34
/// Byte-indexed table giving the numeric value of an ASCII hex digit (`'0'..='9'` -> 0..=9,
/// `'A'..='F'` and `'a'..='f'` -> 10..=15) and `-1` for every byte that is not a hex digit.
const DIGITTOVAL: [i8; 256] = {
    let mut table = [-1_i8; 256];

    // Decimal digits. `for` loops are not available in constant evaluation, so the position
    // and the stored value are advanced by hand.
    let mut offset = 0_usize;
    let mut value = 0_i8;
    while value < 10 {
        table[b'0' as usize + offset] = value;
        offset += 1;
        value += 1;
    }

    // Hex letters, upper and lower case share the same values.
    let mut offset = 0_usize;
    let mut value = 10_i8;
    while value < 16 {
        table[b'A' as usize + offset] = value;
        table[b'a' as usize + offset] = value;
        offset += 1;
        value += 1;
    }

    table
};
48
49// returns a value with the high 16 bits set if not valid
50// otherwise returns the conversion of the 4 hex digits at src into the bottom 16 bits of the 32-bit
51// return register
52#[cfg_attr(not(feature = "no-inline"), inline)]
53#[allow(clippy::cast_sign_loss)]
54pub fn hex_to_u32_nocheck(src: &[u8]) -> u32 {
55    // strictly speaking, static inline is a C-ism
56    // all these will sign-extend the chars looked up, placing 1-bits into the high 28 bits of every
57    // invalid value. After the shifts, this will *still* result in the outcome that the high 16 bits of any
58    // value with any invalid char will be all 1's. We check for this in the caller.
59    unsafe {
60        let v1: i32 =
61            i32::from(*DIGITTOVAL.get_kinda_unchecked(*src.get_kinda_unchecked(0) as usize));
62        let v2: i32 =
63            i32::from(*DIGITTOVAL.get_kinda_unchecked(*src.get_kinda_unchecked(1) as usize));
64        let v3: i32 =
65            i32::from(*DIGITTOVAL.get_kinda_unchecked(*src.get_kinda_unchecked(2) as usize));
66        let v4: i32 =
67            i32::from(*DIGITTOVAL.get_kinda_unchecked(*src.get_kinda_unchecked(3) as usize));
68        ((v1 << 12) | (v2 << 8) | (v3 << 4) | v4) as u32
69    }
70}
71
/// Encodes the code point `cp` as UTF-8 into the start of `c`.
///
/// Returns the number of bytes written (1 to 4). Returns `0`, writing nothing, when `cp` is
/// larger than `0x10_FFFF` and therefore not a valid code point.
///
/// Surrogate code points (`0xD800..=0xDFFF`) are *not* rejected: they are encoded like any
/// other three-byte value, so callers must handle surrogates before calling this function.
///
/// The implementation is a plain branch per encoded length; JSON documents contain few escaped
/// code points, so nothing more elaborate is warranted.
///
/// # Panics
///
/// Panics if `c` is shorter than the encoding of `cp`. Nothing is written in that case.
#[cfg_attr(not(feature = "no-inline"), inline)]
// Every `as u8` below acts on a value that has already been reduced to at most eight bits.
#[allow(clippy::cast_possible_truncation)]
pub fn codepoint_to_utf8(cp: u32, c: &mut [u8]) -> usize {
    // Marker bits of a continuation byte and the mask selecting its six payload bits.
    const CONT: u32 = 0x80;
    const LOW6: u32 = 0x3F;

    match cp {
        // ASCII: one byte, stored as-is.
        0..=0x7F => {
            c[0] = cp as u8;
            1
        }
        // Two bytes: 110xxxxx 10xxxxxx.
        0x80..=0x7FF => {
            let encoded = [(0xC0 | (cp >> 6)) as u8, (CONT | (cp & LOW6)) as u8];
            // Slicing to the exact length is the single bounds check for this branch.
            c[..2].copy_from_slice(&encoded);
            2
        }
        // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx (this range includes the surrogates).
        0x800..=0xFFFF => {
            let encoded = [
                (0xE0 | (cp >> 12)) as u8,
                (CONT | ((cp >> 6) & LOW6)) as u8,
                (CONT | (cp & LOW6)) as u8,
            ];
            c[..3].copy_from_slice(&encoded);
            3
        }
        // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
        0x1_0000..=0x10_FFFF => {
            let encoded = [
                (0xF0 | (cp >> 18)) as u8,
                (CONT | ((cp >> 12) & LOW6)) as u8,
                (CONT | ((cp >> 6) & LOW6)) as u8,
                (CONT | (cp & LOW6)) as u8,
            ];
            c[..4].copy_from_slice(&encoded);
            4
        }
        // Beyond the Unicode range: report failure with a zero length.
        _ => 0,
    }
}