// json_session/tokenizer.rs

1use std::fmt;
2use std::iter::Peekable;
3
4use smallvec::SmallVec;
5
/// A single lexical token of a JSON document.
///
/// Numbers and strings carry their decoded value; the literal constants and the structural
/// punctuation are unit variants. The derived [`PartialOrd`] compares variants in declaration
/// order before comparing payloads, so the order of the variants below is significant.
#[derive(Clone, Debug, PartialEq, PartialOrd)]
pub enum JsonToken {
    /// A number literal, stored as an `f64`.
    Number(f64),
    /// The literal `true`.
    True,
    /// The literal `false`.
    False,
    /// A string literal with all escape sequences already decoded.
    String(String),
    /// The literal `null`.
    Null,
    /// `[`
    ArrayOpen,
    /// `,`
    Comma,
    /// `]`
    ArrayClose,
    /// `{`
    ObjOpen,
    /// `:`
    Colon,
    /// `}`
    ObjClose,
}
21
/// A position in the input: a byte offset together with the matching line and column.
///
/// All three counters start at zero (see [`Default`]). The column is counted in bytes, not in
/// characters, because the position is advanced one byte at a time.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Location {
    /// Number of bytes consumed before this position.
    pub byte_offset: u64,
    /// Number of `\n` bytes consumed before this position.
    pub line: u64,
    /// Number of bytes consumed since the most recent `\n` (or since the start of input).
    pub col: u64,
}

impl Location {
    /// Moves the position past the byte `c`, which the caller has just consumed.
    fn advance_by_byte(&mut self, c: u8) {
        self.byte_offset += 1;
        match c {
            // A line feed starts a new line and resets the column.
            b'\n' => {
                self.line += 1;
                self.col = 0;
            }
            _ => self.col += 1,
        }
    }
}
41
42/// The error type used in this crate. Comes with Location information.
43#[derive(Debug)]
44pub struct JsonParseError {
45    msg: String,
46    location: Location,
47}
48
49impl JsonParseError {
50    /// Creates a new [`JsonParseError`].
51    pub fn new(msg: String, location: Location) -> JsonParseError {
52        JsonParseError { msg, location }
53    }
54
55    /// The error message.
56    pub fn msg(&self) -> &str {
57        &self.msg
58    }
59
60    /// The location in the source document at which the parse error was encountered.
61    pub fn location(&self) -> Location {
62        self.location
63    }
64}
65
66impl fmt::Display for JsonParseError {
67    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
68        write!(
69            f,
70            "Parse error at line:{}, col:{}: {}",
71            self.location.line, self.location.col, &self.msg,
72        )
73    }
74}
75
76impl std::error::Error for JsonParseError {}
77
78/// A type alias for `Result<T, JsonParseError>`.
79pub type JsonParseResult<T> = Result<T, JsonParseError>;
80
// Note: `u8::is_ascii_whitespace` cannot be used here because its definition of whitespace
// differs from the JSON spec. For example, U+000C FORM FEED is whitespace for Rust but not for
// JSON, which only allows space, line feed, carriage return and horizontal tab.
fn is_whitespace(c: u8) -> bool {
    matches!(c, b' ' | b'\n' | b'\r' | b'\t')
}
87
88/// A pull-based tokenizer which takes an iterator over bytes and emits [`JsonToken`]s.
89pub struct JsonTokenizer<I: Iterator<Item = u8>> {
90    bytes: Peekable<I>,
91    location: Location,
92}
93
94impl<I: Iterator<Item = u8>> JsonTokenizer<I> {
95    /// Create a new [`JsonTokenizer`]
96    pub fn new(it: I) -> Self {
97        JsonTokenizer {
98            bytes: it.peekable(),
99            location: Location::default(),
100        }
101    }
102
103    /// The location after the end of the last token that was returned from `next_token()`.
104    pub fn location(&self) -> Location {
105        self.location
106    }
107
108    /// Returns an error if there is more than just white space in the remaining bytes.
109    pub fn expect_eof(&mut self) -> Result<(), JsonParseError> {
110        match self.peek_byte_skip_whitespace() {
111            Some(b) => self.err(format!("Expected EOF but found byte {b:#x}")),
112            None => Ok(()),
113        }
114    }
115
116    fn err<T>(&self, msg: String) -> Result<T, JsonParseError> {
117        Err(JsonParseError::new(msg, self.location))
118    }
119
120    fn eof_err(&self) -> JsonParseError {
121        JsonParseError::new(String::from("Unexpected EOF"), self.location)
122    }
123
124    fn peek_byte_skip_whitespace(&mut self) -> Option<u8> {
125        while let Some(c) = self.bytes.peek().copied() {
126            if is_whitespace(c) {
127                self.bytes.next().unwrap();
128                self.location.advance_by_byte(c);
129                continue;
130            }
131            return Some(c);
132        }
133        None
134    }
135
136    fn consume_byte(&mut self) -> Result<u8, JsonParseError> {
137        match self.bytes.next() {
138            Some(b) => {
139                self.location.advance_by_byte(b);
140                Ok(b)
141            }
142            None => Err(self.eof_err()),
143        }
144    }
145
146    fn consume_string(&mut self) -> JsonParseResult<JsonToken> {
147        if self.consume_byte().unwrap() != b'"' {
148            panic!("This function should only be called after the caller has encountered a start quote");
149        }
150
151        let mut s = SmallVec::<[u8; 10]>::new();
152        loop {
153            let b = match self.consume_byte()? {
154                b'\\' => match self.consume_byte()? {
155                    b'\\' => b'\\',
156                    b'/' => b'/',
157                    b'"' => b'"',
158                    b'b' => 0x8,
159                    b'f' => 0xc,
160                    b'n' => b'\n',
161                    b'r' => b'\r',
162                    b't' => b'\t',
163                    b'u' => {
164                        let mut u = 0u16;
165                        for _ in 0..4 {
166                            let b = self.consume_byte()?;
167                            if let Some(h) = ascii_byte_to_hex_digit(b) {
168                                u = u * 0x10 + h as u16;
169                            } else {
170                                return self.err(format!("Unicode character must be \\uXXXX (X is hex character) format but found byte {b:#x}"));
171                            }
172                        }
173                        let c = match u {
174                            0xD800..=0xDBFF => {
175                                // First surrogate
176
177                                // Parse the second surrogate, which must be directly following.
178                                if self.consume_byte()? != b'\\' || self.consume_byte()? != b'u' {
179                                    return self.err(format!("First UTF-16 surragate {u:#x} must be directly followed by a second \\uXXXX surrogate."));
180                                }
181                                let mut u2 = 0u16;
182                                for _ in 0..4 {
183                                    let b = self.consume_byte()?;
184                                    if let Some(h) = ascii_byte_to_hex_digit(b) {
185                                        u2 = u2 * 0x10 + h as u16;
186                                    } else {
187                                        return self.err(format!("Unicode character must be \\uXXXX (X is hex character) format but found byte '{b:#x}'"));
188                                    }
189                                }
190                                if !matches!(u2, 0xDC00..=0xDFFF) {
191                                    return self.err(format!("First UTF-16 surrogate {u:#x} must be directly followed by a second \\uXXXX surrogate, but found something that's not a second surrogate: {u2:#x}."));
192                                }
193
194                                // Now we have both the first and the second surrogate. Assemble them into a char, the same way that char::decode_utf16 does it.
195                                let c =
196                                    (((u & 0x3ff) as u32) << 10 | (u2 & 0x3ff) as u32) + 0x1_0000;
197                                char::from_u32(c).unwrap()
198                            }
199                            0xDC00..=0xDFFF => {
200                                return self
201                                    .err(format!("Unpaired UTF-16 second surrogate: {u:#x}"));
202                            }
203                            _ => char::from_u32(u as u32).unwrap(),
204                        };
205                        match c.len_utf8() {
206                            1 => s.push(c as u8),
207                            _ => s.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes()),
208                        }
209                        continue;
210                    }
211                    b => return self.err(format!("{b:#x} is invalid escaped character")),
212                },
213                b'"' => {
214                    let s = String::from_utf8(s.to_vec())
215                        .or_else(|_| self.err("Invalid UTF-8 in string".into()))?;
216                    return Ok(JsonToken::String(s));
217                }
218                // Note: c.is_control() is not available here because JSON accepts 0x7f (DEL) in
219                // string literals but 0x7f is control character.
220                // Rough spec of JSON says string literal cannot contain control characters. But it
221                // can actually contain 0x7f.
222                b if b < 0x20 => {
223                    return self.err(format!("Unexpected control character {b:#x} in string"));
224                }
225                b => b,
226            };
227
228            s.push(b);
229        }
230    }
231
232    fn consume_constant(&mut self, s: &'static str) -> Result<(), JsonParseError> {
233        for expected_byte in s.as_bytes() {
234            let b = self.consume_byte()?;
235            if b != *expected_byte {
236                return Err(JsonParseError::new(
237                    format!("Unexpected byte {b:#x} while parsing '{s}'",),
238                    self.location,
239                ));
240            }
241        }
242        Ok(())
243    }
244
245    fn consume_null(&mut self) -> JsonParseResult<JsonToken> {
246        self.consume_constant("null")?;
247        Ok(JsonToken::Null)
248    }
249
250    fn consume_true(&mut self) -> JsonParseResult<JsonToken> {
251        self.consume_constant("true")?;
252        Ok(JsonToken::True)
253    }
254
255    fn consume_false(&mut self) -> JsonParseResult<JsonToken> {
256        self.consume_constant("false")?;
257        Ok(JsonToken::False)
258    }
259
260    fn consume_number(&mut self) -> JsonParseResult<JsonToken> {
261        let neg = *self.bytes.peek().unwrap() == b'-';
262        if neg {
263            self.consume_byte().unwrap();
264        }
265
266        let mut s = SmallVec::<[u8; 16]>::new();
267        let mut saw_dot = false;
268        let mut saw_exp = false;
269
270        while let Some(d) = self.bytes.peek() {
271            match d {
272                b'0'..=b'9' => s.push(*d),
273                b'.' => {
274                    saw_dot = true;
275                    break;
276                }
277                b'e' | b'E' => {
278                    saw_exp = true;
279                    break;
280                }
281                _ => break,
282            }
283            self.consume_byte().unwrap();
284        }
285
286        if s.is_empty() {
287            return self.err("Integer part must not be empty in number literal".to_string());
288        }
289
290        if s.starts_with(b"0") && s.len() > 1 {
291            return self
292                .err("Integer part of number must not start with 0 except for '0'".to_string());
293        }
294
295        if saw_dot {
296            s.push(self.consume_byte().unwrap()); // eat '.'
297            while let Some(d) = self.bytes.peek() {
298                match d {
299                    b'0'..=b'9' => s.push(*d),
300                    b'e' | b'E' => {
301                        saw_exp = true;
302                        break;
303                    }
304                    _ => break,
305                }
306                self.consume_byte().unwrap();
307            }
308            if s.ends_with(b".") {
309                return self.err("Fraction part of number must not be empty".to_string());
310            }
311        }
312
313        if saw_exp {
314            s.push(self.consume_byte().unwrap()); // eat 'e' or 'E'
315            if let Some(b'+') | Some(b'-') = self.bytes.peek() {
316                s.push(self.consume_byte().unwrap());
317            }
318
319            let mut saw_digit = false;
320            while let Some(d) = self.bytes.peek() {
321                match d {
322                    b'0'..=b'9' => s.push(*d),
323                    _ => break,
324                }
325                saw_digit = true;
326                self.consume_byte().unwrap();
327            }
328
329            if !saw_digit {
330                return self.err("Exponent part must not be empty in number literal".to_string());
331            }
332        }
333
334        let s = std::str::from_utf8(&s).unwrap();
335        match s.parse::<f64>() {
336            Ok(n) => Ok(JsonToken::Number(if neg { -n } else { n })),
337            Err(err) => self.err(format!("Invalid number literal '{}': {}", s, err)),
338        }
339    }
340
341    /// Parses a token and returns it, or an error.
342    pub fn next_token(&mut self) -> JsonParseResult<JsonToken> {
343        let b = self
344            .peek_byte_skip_whitespace()
345            .ok_or_else(|| self.eof_err())?;
346        self.next_token_with_peeked_byte(b)
347    }
348
349    /// Parses a token and returns it along with its location, or an error.
350    pub fn next_token_and_location(&mut self) -> JsonParseResult<(JsonToken, Location)> {
351        let b = self
352            .peek_byte_skip_whitespace()
353            .ok_or_else(|| self.eof_err())?;
354        let location = self.location;
355        let token = self.next_token_with_peeked_byte(b)?;
356        Ok((token, location))
357    }
358
359    fn next_token_with_peeked_byte(&mut self, b: u8) -> JsonParseResult<JsonToken> {
360        let token = match b {
361            b'[' => JsonToken::ArrayOpen,
362            b']' => JsonToken::ArrayClose,
363            b'{' => JsonToken::ObjOpen,
364            b'}' => JsonToken::ObjClose,
365            b':' => JsonToken::Colon,
366            b',' => JsonToken::Comma,
367            b'0'..=b'9' | b'-' => return self.consume_number(),
368            b'"' => return self.consume_string(),
369            b't' => return self.consume_true(),
370            b'f' => return self.consume_false(),
371            b'n' => return self.consume_null(),
372            c => return self.err(format!("Invalid byte: {c:#x}")),
373        };
374        self.consume_byte()?;
375        Ok(token)
376    }
377}
378
/// Returns the value (0..=15) of the ASCII hex digit `c`, accepting both upper and lower case,
/// or `None` if `c` is not a hex digit.
fn ascii_byte_to_hex_digit(c: u8) -> Option<u8> {
    match c {
        b'0'..=b'9' => Some(c - b'0'),
        b'a'..=b'f' => Some(c - b'a' + 10),
        b'A'..=b'F' => Some(c - b'A' + 10),
        _ => None,
    }
}
// json_session/tokenizer.rs