json_tools/lexer.rs
/// A lexer for UTF-8 encoded JSON data
pub struct Lexer<I: IntoIterator<Item = u8>> {
    chars: I::IntoIter,
    next_byte: Option<u8>,
    cursor: u64,
    buffer_type: BufferType,
}

#[derive(Debug, PartialEq, Clone)]
pub enum TokenType {
    /// `{`
    CurlyOpen,
    /// `}`
    CurlyClose,

    /// `[`
    BracketOpen,
    /// `]`
    BracketClose,

    /// `:`
    Colon,
    /// `,`
    Comma,

    /// A JSON string, like `"foo"`
    String,
    /// `true`
    BooleanTrue,
    /// `false`
    BooleanFalse,
    /// A number, like `1.1234` or `123` or `-0.0` or `-1` or `.0` or `.`
    Number,

    // NOTE: This predates the `Number` variant above. We couldn't do numbers with our
    // simplified lexer, as it would require us to read a byte just to see that it's not
    // part of a number, at which point the previous token would already have to be
    // returned. We could not peek without drastically complicating our so far quite
    // speedy implementation; `put_back()` now provides that single byte of lookahead.
    /// `null`
    Null,

    /// The type of the token could not be identified.
    /// Should be removed if this lexer is ever to be feature complete.
    Invalid,
}

impl AsRef<str> for TokenType {
    fn as_ref(&self) -> &str {
        match *self {
            TokenType::CurlyOpen => "{",
            TokenType::CurlyClose => "}",
            TokenType::BracketOpen => "[",
            TokenType::BracketClose => "]",
            TokenType::Colon => ":",
            TokenType::Comma => ",",
            TokenType::BooleanTrue => "true",
            TokenType::BooleanFalse => "false",
            TokenType::Null => "null",

            TokenType::Invalid => panic!("Cannot convert invalid TokenType"),
            _ => panic!("Cannot convert `String` or `Number` TokenTypes; use the token's `Buffer` instead"),
        }
    }
}

/// A pair of indices into the byte stream returned by our source
/// iterator.
/// It is a half-open range: `first` is inclusive, `end` is exclusive.
#[derive(Debug, PartialEq, Clone, Default)]
pub struct Span {
    /// Index of the first byte
    pub first: u64,
    /// Index one past the last byte
    pub end: u64,
}

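// For example, lexing the input `true` from the start of a byte stream yields a token
// whose span is `Span { first: 0, end: 4 }`, covering the four bytes `t`, `r`, `u` and `e`.
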
/// A lexical token, identifying its kind and the bytes that make it up.
#[derive(Debug, PartialEq, Clone)]
pub struct Token {
    /// The exact type of the token
    pub kind: TokenType,

    /// A buffer representing the bytes of this Token.
    pub buf: Buffer,
}

/// Representation of a buffer containing items making up a `Token`.
///
/// It is either a `Span` pointing back into the source, or an owned `MultiByte` buffer.
#[derive(Debug, PartialEq, Clone)]
pub enum Buffer {
    /// Multiple bytes making up a token. Only set for `TokenType::String` and
    /// `TokenType::Number`.
    MultiByte(Vec<u8>),
    /// The span allows referencing back into the source byte stream
    /// to obtain the string making up the token.
    /// Please note that for control characters, booleans and null (i.e.
    /// anything that is not `Buffer::MultiByte`) you should use
    /// `<TokenType as AsRef<str>>::as_ref()`.
    Span(Span),
}

/// The type of `Buffer` you want in each `Token`
#[derive(Debug, PartialEq, Clone)]
pub enum BufferType {
    /// Use a `Buffer::MultiByte` where appropriate. Initialize it with the
    /// given capacity (to obtain higher performance when pushing characters).
    Bytes(usize),
    /// Use a `Buffer::Span` referencing byte offsets into the source stream.
    Span,
}

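// Illustrative sketch of the two buffer types (the values shown are what the code below
// produces for the 4-byte input `"hi"`): a lexer built with `BufferType::Bytes(..)` emits
//     Token { kind: TokenType::String, buf: Buffer::MultiByte(b"\"hi\"".to_vec()) }
// (the surrounding quotes are included in the buffer), while `BufferType::Span` emits
//     Token { kind: TokenType::String, buf: Buffer::Span(Span { first: 0, end: 4 }) }
// and leaves the actual bytes in the source stream.
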
impl<I> Lexer<I>
where
    I: IntoIterator<Item = u8>,
{
    /// Returns a new Lexer from a given byte iterator.
    pub fn new(chars: I, buffer_type: BufferType) -> Lexer<I> {
        Lexer {
            chars: chars.into_iter(),
            next_byte: None,
            cursor: 0,
            buffer_type,
        }
    }

    /// Consumes the lexer and returns the underlying byte iterator.
    pub fn into_inner(self) -> I::IntoIter {
        self.chars
    }

    /// Puts a single byte back so the next call to `next_byte()` yields it again.
    /// Only one byte of lookahead is supported.
    fn put_back(&mut self, c: u8) {
        debug_assert!(self.next_byte.is_none());
        self.next_byte = Some(c);
        self.cursor -= 1;
    }

    /// Returns the next byte, preferring a previously put-back byte, and keeps
    /// `cursor` in sync with the number of bytes consumed so far.
    fn next_byte(&mut self) -> Option<u8> {
        match self.next_byte.take() {
            Some(c) => {
                self.cursor += 1;
                Some(c)
            }
            None => {
                let res = self.chars.next();
                match res {
                    None => None,
                    Some(_) => {
                        self.cursor += 1;
                        res
                    }
                }
            }
        }
    }
}

// Identifies the state of the lexer
enum Mode {
    // String parse mode: bool = ignore_next, usize = ignore_digits
    String(bool, usize),
    // `null` parse mode: buf, buf-index
    Null([u8; 4], usize),
    // `true` parse mode: buf, buf-index
    True([u8; 4], usize),
    // `false` parse mode: buf, buf-index
    False([u8; 5], usize),
    // `Number` parse mode
    Number,
    // No token recognized yet; the next significant byte decides what we are lexing
    SlowPath,
}

impl<I> Iterator for Lexer<I>
where
    I: IntoIterator<Item = u8>,
{
    type Item = Token;

    /// Lex the underlying byte stream to generate tokens
    fn next(&mut self) -> Option<Token> {
        let mut t: Option<TokenType> = None;

        let mut first = 0;
        let mut state = Mode::SlowPath;
        let last_cursor = self.cursor;
        let mut buf = match self.buffer_type {
            BufferType::Bytes(capacity) => Some(Vec::<u8>::with_capacity(capacity)),
            BufferType::Span => None,
        };

        while let Some(c) = self.next_byte() {
            let mut set_cursor = |cursor| {
                first = cursor - 1;
            };

            match state {
                Mode::String(ref mut ign_next, ref mut ign_digits) => {
                    if let Some(ref mut v) = buf {
                        v.push(c);
                    }
                    if *ign_next {
                        match c {
                            b'"' | b'\\' | b'/' | b'b' | b'f' | b'n' | b'r' | b't' => {
                                *ign_next = false;
                                continue;
                            }
                            b'u' => {
                                *ign_next = false;
                                *ign_digits = 4;
                                continue;
                            }
                            _ => {
                                t = Some(TokenType::Invalid);
                                break;
                            }
                        }
                    }
                    if *ign_digits > 0 {
                        match c {
                            b'0'..=b'9' | b'A'..=b'F' | b'a'..=b'f' => {
                                *ign_digits -= 1;
                                continue;
                            }
                            _ => {
                                t = Some(TokenType::Invalid);
                                break;
                            }
                        }
                    }
                    match c {
                        b'"' => {
                            t = Some(TokenType::String);
                            break;
                        }
                        b'\\' => {
                            *ign_next = true;
                            continue;
                        }
                        _ => {
                            continue;
                        }
                    }
                }
                Mode::Null(ref mut b, ref mut i) => {
                    b[*i] = c;
                    if *i == 3 {
                        // we know b[0] is b'n'
                        if b[1] == b'u' && b[2] == b'l' && b[3] == b'l' {
                            t = Some(TokenType::Null);
                        } else {
                            t = Some(TokenType::Invalid);
                        }
                        break;
                    } else {
                        *i += 1;
                        continue;
                    }
                }
                Mode::Number => match c {
                    b'0'..=b'9' | b'-' | b'+' | b'.' | b'E' | b'e' => {
                        if let Some(ref mut v) = buf {
                            v.push(c);
                        }
                        continue;
                    }
                    _ => {
                        t = Some(TokenType::Number);
                        self.put_back(c);
                        break;
                    }
                },
                Mode::True(ref mut b, ref mut i) => {
                    b[*i] = c;
                    if *i == 3 {
                        // we know b[0] is b't'
                        if b[1] == b'r' && b[2] == b'u' && b[3] == b'e' {
                            t = Some(TokenType::BooleanTrue);
                        } else {
                            t = Some(TokenType::Invalid);
                        }
                        break;
                    } else {
                        *i += 1;
                        continue;
                    }
                }
                Mode::False(ref mut b, ref mut i) => {
                    b[*i] = c;
                    if *i == 4 {
                        // we know b[0] is b'f'
                        if b[1] == b'a' && b[2] == b'l' && b[3] == b's' && b[4] == b'e' {
                            t = Some(TokenType::BooleanFalse);
                        } else {
                            t = Some(TokenType::Invalid);
                        }
                        break;
                    } else {
                        *i += 1;
                        continue;
                    }
                }
                Mode::SlowPath => {
                    match c {
                        b'{' => {
                            t = Some(TokenType::CurlyOpen);
                            set_cursor(self.cursor);
                            break;
                        }
                        b'}' => {
                            t = Some(TokenType::CurlyClose);
                            set_cursor(self.cursor);
                            break;
                        }
                        b'"' => {
                            state = Mode::String(false, 0);
                            if let Some(ref mut v) = buf {
                                v.push(c);
                            } else {
                                set_cursor(self.cursor);
                                // it starts at invalid, and once we know it closes, it's a string
                                t = Some(TokenType::Invalid);
                            }
                        }
                        b'n' => {
                            state = Mode::Null([c, b'x', b'x', b'x'], 1);
                            set_cursor(self.cursor);
                        }
                        b'0'..=b'9' | b'-' | b'.' => {
                            state = Mode::Number;
                            if let Some(ref mut v) = buf {
                                v.push(c);
                            } else {
                                set_cursor(self.cursor);
                            }
                        }
                        b't' => {
                            state = Mode::True([c, b'x', b'x', b'x'], 1);
                            set_cursor(self.cursor);
                        }
                        b'f' => {
                            state = Mode::False([c, b'x', b'x', b'x', b'x'], 1);
                            set_cursor(self.cursor);
                        }
                        b'[' => {
                            t = Some(TokenType::BracketOpen);
                            set_cursor(self.cursor);
                            break;
                        }
                        b']' => {
                            t = Some(TokenType::BracketClose);
                            set_cursor(self.cursor);
                            break;
                        }
                        b':' => {
                            t = Some(TokenType::Colon);
                            set_cursor(self.cursor);
                            break;
                        }
                        b',' => {
                            t = Some(TokenType::Comma);
                            set_cursor(self.cursor);
                            break;
                        }
                        b'\\' => {
                            // invalid
                            t = Some(TokenType::Invalid);
                            set_cursor(self.cursor);
                            break;
                        }
                        _ => {}
                    } // end single byte match
                } // end case SlowPath
            } // end match state
        } // end for each byte

        match t {
            None => match (buf, state) {
                (Some(b), Mode::Number) => Some(Token {
                    kind: TokenType::Number,
                    buf: Buffer::MultiByte(b),
                }),
                (None, Mode::Number) => Some(Token {
                    kind: TokenType::Number,
                    buf: Buffer::Span(Span {
                        first,
                        end: self.cursor,
                    }),
                }),
                _ => None,
            },
            Some(t) => {
                if self.cursor == last_cursor {
                    None
                } else {
                    let buf = match (&t, buf) {
                        (&TokenType::String, Some(b)) | (&TokenType::Number, Some(b)) => {
                            Buffer::MultiByte(b)
                        }
                        _ => Buffer::Span(Span {
                            first,
                            end: self.cursor,
                        }),
                    };
                    Some(Token { kind: t, buf })
                }
            }
        }
    }
}
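
// A minimal usage sketch, assuming the items above compile as-is; the module and test
// names are illustrative. It checks only the token kinds produced for a tiny document,
// which is independent of the chosen `BufferType`.
#[cfg(test)]
mod usage_sketch {
    use super::*;

    #[test]
    fn lexes_a_small_object_into_the_expected_token_kinds() {
        let json = br#"{"a":1}"#;
        // Feed the raw bytes through the lexer and keep only each token's kind.
        let kinds: Vec<TokenType> = Lexer::new(json.iter().cloned(), BufferType::Span)
            .map(|token| token.kind)
            .collect();
        assert_eq!(
            kinds,
            vec![
                TokenType::CurlyOpen,
                TokenType::String,
                TokenType::Colon,
                TokenType::Number,
                TokenType::CurlyClose,
            ]
        );
    }
}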