json_threat_protection/
lexer.rs

1use crate::read::{Position, Read};
2use thiserror::Error;
3
4#[derive(Error, Debug)]
5/// An error that occurred while lexing a JSON input.
6pub enum LexerError {
7    /// Invalid UTF-8 sequence at the given position.
8    #[error("invalid utf-8 sequence ({0}")]
9    InvalidUtf8Sequence(Position),
10
11    /// Unexpected byte at the given position.
12    #[error("unpexpected byte at byte ({0})")]
13    UnexpectedByte(Position),
14
15    /// Error occurred while reading from the underlying reader.
16    #[error("I/O Error")]
17    ReadError(#[from] crate::read::ReadError),
18}
19
/// A lexical token of a JSON document.
///
/// Tokens carry no payload: they only identify what kind of lexeme was read.
/// Equality is total for this fieldless enum, so `Eq` is derived alongside
/// `PartialEq`.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum Token {
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// A numeric literal.
    Number,
    /// A string literal.
    String,
    /// The literal `true`.
    True,
    /// The literal `false`.
    False,
    /// The literal `null`.
    Null,
}
34
35/// A JSON lexer, which reads a JSON input and produces a stream of tokens.
36pub struct Lexer<R: Read> {
37    reader: R,
38    peeked_str_buf: Vec<u8>,
39    peeked: Option<Token>,
40}
41
42impl<R: Read> Lexer<R> {
43    pub fn new(reader: R) -> Self {
44        Lexer {
45            reader,
46            peeked_str_buf: Vec::with_capacity(64),
47            peeked: None,
48        }
49    }
50
51    pub fn position(&self) -> Position {
52        self.reader.position()
53    }
54
55    pub fn peek(&mut self, str_buf: &mut Vec<u8>) -> Result<Option<Token>, LexerError> {
56        if self.peeked.is_none() {
57            self.peeked = self.next(str_buf)?;
58            self.peeked_str_buf.clear();
59            self.peeked_str_buf.extend_from_slice(&str_buf);
60        }
61        Ok(self.peeked)
62    }
63
64    pub fn next(&mut self, str_buf: &mut Vec<u8>) -> Result<Option<Token>, LexerError> {
65        if self.peeked.is_some() {
66            let peeked = self.peeked.clone();
67            self.peeked = None;
68
69            if matches!(peeked, Some(Token::String)) {
70                str_buf.clear();
71                str_buf.extend_from_slice(&self.peeked_str_buf);
72            }
73
74            return Ok(peeked);
75        }
76
77        loop {
78            self.reader.skip_whitespace()?;
79            let peek = self.reader.peek()?;
80            if peek.is_none() {
81                return Ok(None);
82            }
83
84            // unwrap is safe because peek is not None
85            match peek.unwrap() {
86                b'{' => {
87                    // unwrap is safe because peek is not None
88                    self.reader.next()?.unwrap();
89                    return Ok(Some(Token::LBrace));
90                }
91                b'}' => {
92                    // unwrap is safe because peek is not None
93                    self.reader.next()?.unwrap();
94                    return Ok(Some(Token::RBrace));
95                }
96                b'[' => {
97                    // unwrap is safe because peek is not None
98                    self.reader.next()?.unwrap();
99                    return Ok(Some(Token::LBracket));
100                }
101                b']' => {
102                    // unwrap is safe because peek is not None
103                    self.reader.next()?.unwrap();
104                    return Ok(Some(Token::RBracket));
105                }
106                b',' => {
107                    // unwrap is safe because peek is not None
108                    self.reader.next()?.unwrap();
109                    return Ok(Some(Token::Comma));
110                }
111                b':' => {
112                    // unwrap is safe because peek is not None
113                    self.reader.next()?.unwrap();
114                    return Ok(Some(Token::Colon));
115                }
116                b'"' => {
117                    return Ok(Some(self.parse_string(str_buf)?));
118                }
119                b't' => {
120                    return Ok(Some(self.parse_true()?));
121                }
122                b'f' => {
123                    return Ok(Some(self.parse_false()?));
124                }
125                b'n' => {
126                    return Ok(Some(self.parse_null()?));
127                }
128                b'-' | b'+' | b'0'..=b'9' => {
129                    return Ok(Some(self.parse_number()?));
130                }
131                _ => return Err(LexerError::UnexpectedByte(self.position())),
132            }
133        }
134    }
135
136    fn parse_string(&mut self, str_buf: &mut Vec<u8>) -> Result<Token, LexerError> {
137        str_buf.clear();
138        self.reader.next_likely_string(str_buf)?;
139
140        let str = std::str::from_utf8(str_buf);
141        if str.is_err() {
142            return Err(LexerError::InvalidUtf8Sequence(self.position()));
143        }
144
145        Ok(Token::String)
146    }
147
148    fn parse_number(&mut self) -> Result<Token, LexerError> {
149        match self.reader.next_number() {
150            Ok(_) => Ok(Token::Number),
151            Err(e) => Err(e.into()),
152        }
153    }
154
155    fn parse_true(&mut self) -> Result<Token, LexerError> {
156        match self.reader.next4()? {
157            [b't', b'r', b'u', b'e'] => Ok(Token::True),
158            _ => Err(LexerError::UnexpectedByte(self.position())),
159        }
160    }
161
162    fn parse_false(&mut self) -> Result<Token, LexerError> {
163        match self.reader.next5()? {
164            [b'f', b'a', b'l', b's', b'e'] => Ok(Token::False),
165            _ => Err(LexerError::UnexpectedByte(self.position())),
166        }
167    }
168
169    fn parse_null(&mut self) -> Result<Token, LexerError> {
170        match self.reader.next4()? {
171            [b'n', b'u', b'l', b'l'] => Ok(Token::Null),
172            _ => Err(LexerError::UnexpectedByte(self.position())),
173        }
174    }
175}