json_flat_parser/
lexer.rs

1use crate::string_from_bytes;
2
/// A lexical token produced by the lexer.
///
/// Variants that carry text borrow it directly from the input buffer for `'json`;
/// nothing is copied or unescaped.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Token<'json> {
    /// `{`
    CurlyOpen,
    /// `}`
    CurlyClose,
    /// `[`
    SquareOpen,
    /// `]`
    SquareClose,
    /// `:`
    Colon,
    /// `,`
    Comma,
    /// Raw text between the quotes of a string literal (escape sequences left as written).
    String(&'json str),
    /// Raw text of a numeric literal.
    Number(&'json str),
    /// The literal text `true` or `false`.
    Boolean(&'json str),
    /// The literal `null`.
    Null,
}
16
17
/// Forward-moving cursor over a borrowed byte slice.
pub struct SliceRead<'json> {
    /// The complete input.
    slice: &'json [u8],
    /// Offset of the next unread byte.
    index: usize,
}

impl<'json> SliceRead<'json> {
    /// Creates a cursor positioned at offset 0 of `slice`.
    pub fn new(slice: &'json [u8]) -> Self {
        Self { slice, index: 0 }
    }

    /// Consumes and returns the next byte, or `None` once the input is exhausted.
    #[inline]
    pub fn next(&mut self) -> Option<u8> {
        let byte = self.peek()?;
        self.index += 1;
        Some(byte)
    }

    /// Consumes up to eight bytes and packs them little-endian into a `u64`.
    ///
    /// Returns the packed word together with the number of bytes consumed. When fewer
    /// than eight bytes remain, the unused high-order bytes of the word are zero.
    #[inline]
    pub fn next_u64(&mut self) -> (u64, usize) {
        let mut lanes = [0u8; 8];
        let taken = match self.slice.get(self.index..self.index + 8) {
            Some(chunk) => {
                lanes.copy_from_slice(chunk);
                8
            }
            None => {
                // Short read: copy whatever is left (possibly nothing).
                let tail = self.slice.get(self.index..).unwrap_or(&[]);
                lanes[..tail.len()].copy_from_slice(tail);
                tail.len()
            }
        };
        self.index += taken;
        (u64::from_le_bytes(lanes), taken)
    }

    /// Returns the next byte without consuming it, or `None` at the end of the input.
    #[inline]
    pub fn peek(&self) -> Option<u8> {
        self.slice.get(self.index).copied()
    }

    /// Returns the bytes from `start` up to (not including) the current position.
    ///
    /// Panics if `start` lies beyond the current position.
    #[inline]
    pub fn slice_from(&self, start: usize) -> &'json [u8] {
        let consumed = &self.slice[..self.index];
        &consumed[start..]
    }

    /// Returns `true` when no unread bytes remain.
    #[inline]
    pub fn is_at_end(&self) -> bool {
        self.slice.len() <= self.index
    }

    /// Consumes `pattern` if the unread input starts with it.
    ///
    /// Returns whether the pattern matched; on a mismatch the position is unchanged.
    #[inline]
    pub fn match_pattern(&mut self, pattern: &[u8]) -> bool {
        let matched = self
            .slice
            .get(self.index..)
            .map_or(false, |rest| rest.starts_with(pattern));
        if matched {
            self.index += pattern.len();
        }
        matched
    }

    /// Returns the complete underlying input, independent of the current position.
    pub fn data(&self) -> &'json [u8] {
        self.slice
    }
}
88
89
90pub struct Lexer<'json> {
91    reader: SliceRead<'json>,
92}
93
94
95const MASK_OPEN_CURLY: u64 = 0x0101010101010101 * b'{' as u64;
96const MASK_CLOSE_CURLY: u64 = 0x0101010101010101 * b'}' as u64;
97const MASK_OPEN_SQUARE: u64 = 0x0101010101010101 * b'[' as u64;
98const MASK_CLOSE_SQUARE: u64 = 0x0101010101010101 * b']' as u64;
99const MASK_QUOTE: u64 = 0x0101010101010101 * b'"' as u64;
100
101impl<'json> Lexer<'json> {
102    pub fn new(input: &'json [u8]) -> Self {
103        Lexer {
104            reader: SliceRead::new(input),
105        }
106    }
107
108    #[inline]
109    pub fn consume_string_until_end_of_array(&mut self, array_start_index: usize, nested_array: bool) -> Option<&'json str> {
110        let mut square_close_count = 1;
111        if nested_array {
112            square_close_count += 1;
113        }
114        while !self.reader.is_at_end() {
115            let current_index = self.reader.index;
116            let (bytes, _) = self.reader.next_u64();
117            let comparison_square_close = MASK_CLOSE_SQUARE ^ bytes;
118            let comparison_square_open = MASK_OPEN_SQUARE ^ bytes;
119            let high_bit_mask_square_close = (((comparison_square_close >> 1) | 0x8080808080808080) - comparison_square_close) & 0x8080808080808080;
120            let high_bit_mask_square_open = (((comparison_square_open >> 1) | 0x8080808080808080) - comparison_square_open) & 0x8080808080808080;
121            if high_bit_mask_square_close == 0 && high_bit_mask_square_open == 0 {
122                continue;
123            } else {
124                let mut index = 0;
125                if high_bit_mask_square_close != 0 {
126                    index = (high_bit_mask_square_close.trailing_zeros() >> 3) as usize;
127                }
128                if high_bit_mask_square_open != 0 {
129                    let open_index = (high_bit_mask_square_open.trailing_zeros() >> 3) as usize;
130                    if open_index < index {
131                        index = open_index;
132                    }
133                }
134                self.reader.index = current_index + index;
135            }
136            match self.reader.next()? {
137                b'[' => square_close_count += 1,
138                b']' => {
139                    if square_close_count == 1 {
140                        return string_from_bytes(&self.reader.slice[array_start_index..self.reader.index]);
141                    } else {
142                        square_close_count -= 1;
143                    }
144                }
145                _ => {}
146            }
147        }
148        None
149    }
150
151    pub fn reader_index(&self) -> usize {
152        self.reader.index
153    }
154    pub fn reader(&mut self) -> &SliceRead<'json> {
155        &self.reader
156    }
157
158    pub fn set_reader_index(&mut self, index: usize) {
159        self.reader.index = index;
160    }
161
162    #[inline]
163    pub fn consume_string_until_end_of_object(&mut self, should_return: bool) -> Option<&'json str> {
164        let mut square_close_count = 1;
165        let start = self.reader.index - 1;
166        while !self.reader.is_at_end() {
167            let current_index = self.reader.index;
168            let (bytes, _) = self.reader.next_u64();
169            let comparison_curly_close = MASK_CLOSE_CURLY ^ bytes;
170            let comparison_curly_open = MASK_OPEN_CURLY ^ bytes;
171            let high_bit_mask_curly_close = (((comparison_curly_close >> 1) | 0x8080808080808080) - comparison_curly_close) & 0x8080808080808080;
172            let high_bit_mask_curly_open = (((comparison_curly_open >> 1) | 0x8080808080808080) - comparison_curly_open) & 0x8080808080808080;
173
174            if high_bit_mask_curly_close == 0 && high_bit_mask_curly_open == 0 {
175                continue;
176            } else {
177                let mut index = 0;
178                if high_bit_mask_curly_close != 0 {
179                    index = (high_bit_mask_curly_close.trailing_zeros() >> 3) as usize;
180                }
181                if high_bit_mask_curly_open != 0 {
182                    let open_index = (high_bit_mask_curly_open.trailing_zeros() >> 3) as usize;
183                    if open_index < index {
184                        index = open_index;
185                    }
186                }
187                self.reader.index = current_index + index;
188            }
189
190            match self.reader.next()? {
191                b'{' => square_close_count += 1,
192                b'}' => {
193                    if square_close_count == 1 {
194                        if should_return {
195                            let value = string_from_bytes(&self.reader.slice[start..self.reader.index])?;
196                            return Some(value);
197                        } else {
198                            break;
199                        }
200                    } else {
201                        square_close_count -= 1;
202                    }
203                }
204                _ => {}
205            }
206        }
207        None
208    }
209    #[inline]
210    pub fn next_token(&mut self) -> Option<Token<'json>> {
211        loop {
212            match self.reader.next()? {
213                b'{' => return Some(Token::CurlyOpen),
214                b'}' => return Some(Token::CurlyClose),
215                b'[' => return Some(Token::SquareOpen),
216                b']' => return Some(Token::SquareClose),
217                b',' => return Some(Token::Comma),
218                b':' => return Some(Token::Colon),
219                b'-' | b'0' | b'1' | b'2' | b'3' | b'4' | b'5' | b'6' | b'7' | b'8' | b'9' => {
220                    let start = self.reader.index - 1;
221                    while let Some(b) = self.reader.next() {
222                        if !((0x30..=0x39).contains(&b) || b == b'.' || b == b'e' || b == b'+' || b == b'-') {
223                            break;
224                        }
225                    }
226                    self.reader.index -= 1;
227                    let s = string_from_bytes(&self.reader.slice[start..self.reader.index])?;
228                    return Some(Token::Number(s));
229                }
230                b'"' => {
231                    let start = self.reader.index;
232                    while !self.reader.is_at_end() {
233                        let (bytes, read_bytes) = self.reader.next_u64();
234                        let comparison = MASK_QUOTE ^ bytes;
235                        let high_bit_mask1 = (((comparison >> 1) | 0x8080808080808080) - comparison) & 0x8080808080808080;
236                        // println!("...{}", String::from_utf8_lossy(&self.reader.slice[self.reader.index - read_bytes..self.reader.index]));
237                        if high_bit_mask1 != 0 {
238                            let position = (high_bit_mask1.trailing_zeros() >> 3) as usize;
239                            if self.reader.slice[self.reader.index - read_bytes + position - 1] != b'\\' {
240                                self.reader.index = self.reader.index - read_bytes + position + 1;
241                                break;
242                            } else {
243                                self.reader.index = self.reader.index - read_bytes + position + 1;
244                            }
245                        }
246                    }
247                    let s = string_from_bytes(&self.reader.slice[start..self.reader.index - 1])?;
248                    return Some(Token::String(s));
249                }
250                b't' if self.reader.match_pattern(b"rue") => return Some(Token::Boolean(string_from_bytes(&self.reader.slice[self.reader.index - 4..self.reader.index])?)),
251                b'f' if self.reader.match_pattern(b"alse") => return Some(Token::Boolean(string_from_bytes(&self.reader.slice[self.reader.index - 5..self.reader.index])?)),
252                b'n' if self.reader.match_pattern(b"ull") => return Some(Token::Null),
253                _ => {}
254            }
255        }
256    }
257}
258
259