json_threat_protection/
lexer.rs1use crate::read::{Position, Read};
2use thiserror::Error;
3
4#[derive(Error, Debug)]
5pub enum LexerError {
7 #[error("invalid utf-8 sequence ({0}")]
9 InvalidUtf8Sequence(Position),
10
11 #[error("unpexpected byte at byte ({0})")]
13 UnexpectedByte(Position),
14
15 #[error("I/O Error")]
17 ReadError(#[from] crate::read::ReadError),
18}
19
/// The kinds of lexical tokens the lexer can produce.
///
/// Tokens carry no payload; the text of a [`Token::String`] is delivered
/// through the byte buffer handed to the lexer.
#[derive(Debug, Copy, Clone, PartialEq)]
pub enum Token {
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// A numeric literal.
    Number,
    /// A string literal.
    String,
    /// The literal `true`.
    True,
    /// The literal `false`.
    False,
    /// The literal `null`.
    Null,
}
34
35pub struct Lexer<R: Read> {
37 reader: R,
38 peeked_str_buf: Vec<u8>,
39 peeked: Option<Token>,
40}
41
42impl<R: Read> Lexer<R> {
43 pub fn new(reader: R) -> Self {
44 Lexer {
45 reader,
46 peeked_str_buf: Vec::with_capacity(64),
47 peeked: None,
48 }
49 }
50
51 pub fn position(&self) -> Position {
52 self.reader.position()
53 }
54
55 pub fn peek(&mut self, str_buf: &mut Vec<u8>) -> Result<Option<Token>, LexerError> {
56 if self.peeked.is_none() {
57 self.peeked = self.next(str_buf)?;
58 self.peeked_str_buf.clear();
59 self.peeked_str_buf.extend_from_slice(&str_buf);
60 }
61 Ok(self.peeked)
62 }
63
64 pub fn next(&mut self, str_buf: &mut Vec<u8>) -> Result<Option<Token>, LexerError> {
65 if self.peeked.is_some() {
66 let peeked = self.peeked.clone();
67 self.peeked = None;
68
69 if matches!(peeked, Some(Token::String)) {
70 str_buf.clear();
71 str_buf.extend_from_slice(&self.peeked_str_buf);
72 }
73
74 return Ok(peeked);
75 }
76
77 loop {
78 self.reader.skip_whitespace()?;
79 let peek = self.reader.peek()?;
80 if peek.is_none() {
81 return Ok(None);
82 }
83
84 match peek.unwrap() {
86 b'{' => {
87 self.reader.next()?.unwrap();
89 return Ok(Some(Token::LBrace));
90 }
91 b'}' => {
92 self.reader.next()?.unwrap();
94 return Ok(Some(Token::RBrace));
95 }
96 b'[' => {
97 self.reader.next()?.unwrap();
99 return Ok(Some(Token::LBracket));
100 }
101 b']' => {
102 self.reader.next()?.unwrap();
104 return Ok(Some(Token::RBracket));
105 }
106 b',' => {
107 self.reader.next()?.unwrap();
109 return Ok(Some(Token::Comma));
110 }
111 b':' => {
112 self.reader.next()?.unwrap();
114 return Ok(Some(Token::Colon));
115 }
116 b'"' => {
117 return Ok(Some(self.parse_string(str_buf)?));
118 }
119 b't' => {
120 return Ok(Some(self.parse_true()?));
121 }
122 b'f' => {
123 return Ok(Some(self.parse_false()?));
124 }
125 b'n' => {
126 return Ok(Some(self.parse_null()?));
127 }
128 b'-' | b'+' | b'0'..=b'9' => {
129 return Ok(Some(self.parse_number()?));
130 }
131 _ => return Err(LexerError::UnexpectedByte(self.position())),
132 }
133 }
134 }
135
136 fn parse_string(&mut self, str_buf: &mut Vec<u8>) -> Result<Token, LexerError> {
137 str_buf.clear();
138 self.reader.next_likely_string(str_buf)?;
139
140 let str = std::str::from_utf8(str_buf);
141 if str.is_err() {
142 return Err(LexerError::InvalidUtf8Sequence(self.position()));
143 }
144
145 Ok(Token::String)
146 }
147
148 fn parse_number(&mut self) -> Result<Token, LexerError> {
149 match self.reader.next_number() {
150 Ok(_) => Ok(Token::Number),
151 Err(e) => Err(e.into()),
152 }
153 }
154
155 fn parse_true(&mut self) -> Result<Token, LexerError> {
156 match self.reader.next4()? {
157 [b't', b'r', b'u', b'e'] => Ok(Token::True),
158 _ => Err(LexerError::UnexpectedByte(self.position())),
159 }
160 }
161
162 fn parse_false(&mut self) -> Result<Token, LexerError> {
163 match self.reader.next5()? {
164 [b'f', b'a', b'l', b's', b'e'] => Ok(Token::False),
165 _ => Err(LexerError::UnexpectedByte(self.position())),
166 }
167 }
168
169 fn parse_null(&mut self) -> Result<Token, LexerError> {
170 match self.reader.next4()? {
171 [b'n', b'u', b'l', b'l'] => Ok(Token::Null),
172 _ => Err(LexerError::UnexpectedByte(self.position())),
173 }
174 }
175}