// json_tools/lexer.rs

1/// A lexer for utf-8 encoded json data
2pub struct Lexer<I: IntoIterator<Item = u8>> {
3    chars: I::IntoIter,
4    next_byte: Option<u8>,
5    cursor: u64,
6    buffer_type: BufferType,
7}
8
/// The kind of a lexical token.
///
/// The type is a plain tag without payload, so it is cheap to copy, and can
/// be compared and hashed.
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
pub enum TokenType {
    /// `{`
    CurlyOpen,
    /// `}`
    CurlyClose,

    /// `[`
    BracketOpen,
    /// `]`
    BracketClose,

    /// `:`
    Colon,
    /// `,`
    Comma,

    /// A json string, like `"foo"`
    String,
    /// `true`
    BooleanTrue,
    /// `false`
    BooleanFalse,
    /// A number, like `1.1234` or `123` or `-0.0` or `-1`.
    ///
    /// The lexer does not validate the syntax of numbers, which is why
    /// degenerate input like `.0` or `.` is reported as a number as well.
    Number,
    /// `null`
    Null,

    /// The type of the token could not be identified.
    /// Should be removed if this lexer is ever to be feature complete
    Invalid,
}
48
49impl AsRef<str> for TokenType {
50    fn as_ref(&self) -> &str {
51        match *self {
52            TokenType::CurlyOpen => "{",
53            TokenType::CurlyClose => "}",
54            TokenType::BracketOpen => "[",
55            TokenType::BracketClose => "]",
56            TokenType::Colon => ":",
57            TokenType::Comma => ",",
58            TokenType::BooleanTrue => "true",
59            TokenType::BooleanFalse => "false",
60            TokenType::Null => "null",
61
62            TokenType::Invalid => panic!("Cannot convert invalid TokenType"),
63            _ => panic!("Cannot convert variant TokenTypes"),
64        }
65    }
66}
67
/// A pair of indices into the byte stream returned by our source
/// iterator.
///
/// The range is half-open: `first` is part of it, `end` is not.
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Default)]
pub struct Span {
    /// Index of the first byte
    pub first: u64,
    /// Index one past the last byte
    pub end: u64,
}
78
79/// A lexical token, identifying its kind and span.
80#[derive(Debug, PartialEq, Clone)]
81pub struct Token {
82    /// The exact type of the token
83    pub kind: TokenType,
84
85    /// A buffer representing the bytes of this Token.
86    pub buf: Buffer,
87}
88
89/// Representation of a buffer containing items making up a `Token`.
90///
91/// It's either always `Span`, or one of the `*Byte` variants.
92#[derive(Debug, PartialEq, Clone)]
93pub enum Buffer {
94    /// Multiple bytes making up a token. Only set for `TokenType::String` and
95    /// `TokenType::Number`.
96    MultiByte(Vec<u8>),
97    /// The span allows to reference back into the source byte stream
98    /// to obtain the string making up the token.
99    /// Please note that for control characters, booleans and null (i.e
100    /// anything that is not `Buffer::MultiByte` you should use
101    /// `<TokenType as AsRef<str>>::as_ref()`)
102    Span(Span),
103}
104
/// The type of `Buffer` you want in each `Token`
///
/// This is a small configuration value which can be copied freely, for
/// instance to set up several lexers the same way.
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
pub enum BufferType {
    /// Use a `Buffer::MultiByte` where appropriate. Initialize it with the
    /// given capacity (to obtain higher performance when pushing characters)
    Bytes(usize),
    /// Always use a `Buffer::Span`, which only refers to the bytes in the
    /// source instead of copying them.
    Span,
}
113
114impl<I> Lexer<I>
115where
116    I: IntoIterator<Item = u8>,
117{
118    /// Returns a new Lexer from a given byte iterator.
119    pub fn new(chars: I, buffer_type: BufferType) -> Lexer<I> {
120        Lexer {
121            chars: chars.into_iter(),
122            next_byte: None,
123            cursor: 0,
124            buffer_type,
125        }
126    }
127
128    pub fn into_inner(self) -> I::IntoIter {
129        self.chars
130    }
131
132    fn put_back(&mut self, c: u8) {
133        debug_assert!(self.next_byte.is_none());
134        self.next_byte = Some(c);
135        self.cursor -= 1;
136    }
137
138    fn next_byte(&mut self) -> Option<u8> {
139        match self.next_byte.take() {
140            Some(c) => {
141                self.cursor += 1;
142                Some(c)
143            }
144            None => {
145                let res = self.chars.next();
146                match res {
147                    None => None,
148                    Some(_) => {
149                        self.cursor += 1;
150                        res
151                    }
152                }
153            }
154        }
155    }
156}
157
/// What the lexer is in the middle of reading.
enum Mode {
    /// Inside a string. The `bool` is set right after a backslash, when the
    /// next byte has to be an escape character. The `usize` is the amount of
    /// hex digits still expected after a `\u`.
    String(bool, usize),
    /// Reading `null`: the bytes seen so far, and the index of the next one
    Null([u8; 4], usize),
    /// Reading `true`: the bytes seen so far, and the index of the next one
    True([u8; 4], usize),
    /// Reading `false`: the bytes seen so far, and the index of the next one
    False([u8; 5], usize),
    /// Reading the bytes of a number
    Number,
    /// No token was started yet, the next byte decides what to do
    SlowPath,
}
172
173impl<I> Iterator for Lexer<I>
174where
175    I: IntoIterator<Item = u8>,
176{
177    type Item = Token;
178
179    /// Lex the underlying byte stream to generate tokens
180    fn next(&mut self) -> Option<Token> {
181        let mut t: Option<TokenType> = None;
182
183        let mut first = 0;
184        let mut state = Mode::SlowPath;
185        let last_cursor = self.cursor;
186        let mut buf = match self.buffer_type {
187            BufferType::Bytes(capacity) => Some(Vec::<u8>::with_capacity(capacity)),
188            BufferType::Span => None,
189        };
190
191        while let Some(c) = self.next_byte() {
192            let mut set_cursor = |cursor| {
193                first = cursor - 1;
194            };
195
196            match state {
197                Mode::String(ref mut ign_next, ref mut ign_digits) => {
198                    if let Some(ref mut v) = buf {
199                        v.push(c);
200                    }
201                    if *ign_next {
202                        match c {
203                            b'"' | b'\\' | b'/' | b'b' | b'f' | b'n' | b'r' | b't' => {
204                                *ign_next = false;
205                                continue;
206                            }
207                            b'u' => {
208                                *ign_next = false;
209                                *ign_digits = 4;
210                                continue;
211                            }
212                            _ => {
213                                t = Some(TokenType::Invalid);
214                                break;
215                            }
216                        }
217                    }
218                    if *ign_digits > 0 {
219                        match c {
220                            b'0'..=b'9' | b'A'..=b'F' | b'a'..=b'f' => {
221                                *ign_digits -= 1;
222                                continue;
223                            }
224                            _ => {
225                                t = Some(TokenType::Invalid);
226                                break;
227                            }
228                        }
229                    }
230                    match c {
231                        b'"' => {
232                            t = Some(TokenType::String);
233                            break;
234                        }
235                        b'\\' => {
236                            *ign_next = true;
237                            continue;
238                        }
239                        _ => {
240                            continue;
241                        }
242                    }
243                }
244                Mode::Null(ref mut b, ref mut i) => {
245                    b[*i] = c;
246                    if *i == 3 {
247                        // we know b[0] is b'n'
248                        if b[1] == b'u' && b[2] == b'l' && b[3] == b'l' {
249                            t = Some(TokenType::Null);
250                        } else {
251                            t = Some(TokenType::Invalid);
252                        }
253                        break;
254                    } else {
255                        *i += 1;
256                        continue;
257                    }
258                }
259                Mode::Number => match c {
260                    b'0'..=b'9' | b'-' | b'+' | b'.' | b'E' | b'e' => {
261                        if let Some(ref mut v) = buf {
262                            v.push(c);
263                        }
264                        continue;
265                    }
266                    _ => {
267                        t = Some(TokenType::Number);
268                        self.put_back(c);
269                        break;
270                    }
271                },
272                Mode::True(ref mut b, ref mut i) => {
273                    b[*i] = c;
274                    if *i == 3 {
275                        // we know b[0] is b't'
276                        if b[1] == b'r' && b[2] == b'u' && b[3] == b'e' {
277                            t = Some(TokenType::BooleanTrue);
278                        } else {
279                            t = Some(TokenType::Invalid);
280                        }
281                        break;
282                    } else {
283                        *i += 1;
284                        continue;
285                    }
286                }
287                Mode::False(ref mut b, ref mut i) => {
288                    b[*i] = c;
289                    if *i == 4 {
290                        // we know b[0] is b'f'
291                        if b[1] == b'a' && b[2] == b'l' && b[3] == b's' && b[4] == b'e' {
292                            t = Some(TokenType::BooleanFalse);
293                        } else {
294                            t = Some(TokenType::Invalid);
295                        }
296                        break;
297                    } else {
298                        *i += 1;
299                        continue;
300                    }
301                }
302                Mode::SlowPath => {
303                    match c {
304                        b'{' => {
305                            t = Some(TokenType::CurlyOpen);
306                            set_cursor(self.cursor);
307                            break;
308                        }
309                        b'}' => {
310                            t = Some(TokenType::CurlyClose);
311                            set_cursor(self.cursor);
312                            break;
313                        }
314                        b'"' => {
315                            state = Mode::String(false, 0);
316                            if let Some(ref mut v) = buf {
317                                v.push(c);
318                            } else {
319                                set_cursor(self.cursor);
320                                // it starts at invalid, and once we know it closes, it's a string
321                                t = Some(TokenType::Invalid);
322                            }
323                        }
324                        b'n' => {
325                            state = Mode::Null([c, b'x', b'x', b'x'], 1);
326                            set_cursor(self.cursor);
327                        }
328                        b'0'..=b'9' | b'-' | b'.' => {
329                            state = Mode::Number;
330                            if let Some(ref mut v) = buf {
331                                v.push(c);
332                            } else {
333                                set_cursor(self.cursor);
334                            }
335                        }
336                        b't' => {
337                            state = Mode::True([c, b'x', b'x', b'x'], 1);
338                            set_cursor(self.cursor);
339                        }
340                        b'f' => {
341                            state = Mode::False([c, b'x', b'x', b'x', b'x'], 1);
342                            set_cursor(self.cursor);
343                        }
344                        b'[' => {
345                            t = Some(TokenType::BracketOpen);
346                            set_cursor(self.cursor);
347                            break;
348                        }
349                        b']' => {
350                            t = Some(TokenType::BracketClose);
351                            set_cursor(self.cursor);
352                            break;
353                        }
354                        b':' => {
355                            t = Some(TokenType::Colon);
356                            set_cursor(self.cursor);
357                            break;
358                        }
359                        b',' => {
360                            t = Some(TokenType::Comma);
361                            set_cursor(self.cursor);
362                            break;
363                        }
364                        b'\\' => {
365                            // invalid
366                            t = Some(TokenType::Invalid);
367                            set_cursor(self.cursor);
368                            break;
369                        }
370                        _ => {}
371                    } // end single byte match
372                } // end case SlowPath
373            } // end match state
374        } // end for each byte
375
376        match t {
377            None => match (buf, state) {
378                (Some(b), Mode::Number) => Some(Token {
379                    kind: TokenType::Number,
380                    buf: Buffer::MultiByte(b),
381                }),
382                (None, Mode::Number) => Some(Token {
383                    kind: TokenType::Number,
384                    buf: Buffer::Span(Span { first, end: self.cursor }),
385                }),
386                _ => None,
387            },
388            Some(t) => {
389                if self.cursor == last_cursor {
390                    None
391                } else {
392                    let buf = match (&t, buf) {
393                        (&TokenType::String, Some(b)) | (&TokenType::Number, Some(b)) => Buffer::MultiByte(b),
394                        _ => Buffer::Span(Span { first, end: self.cursor }),
395                    };
396                    Some(Token { kind: t, buf })
397                }
398            }
399        }
400    }
401}