1use std::u8;
2
3use crate::json::{self, JSONError};
4
/// A lexical token produced by [`Lexer::tokenify`].
///
/// Literal variants carry the raw source text of the token; no unescaping or
/// numeric validation is performed at this stage.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum Token {
    /// `{`
    OpenBrace,
    /// `}`
    CloseBrace,
    /// `[`
    OpenBracket,
    /// `]`
    CloseBracket,
    /// `:`
    Colon,
    /// `,`
    Comma,
    /// A string literal exactly as written in the source, including the
    /// surrounding double quotes and any backslash escapes.
    StringLiteral(String),
    /// A run of number-like characters (digits, `.`, `e`, `E`, `+`, `-`)
    /// exactly as written; it is not checked for being a well-formed number.
    NumericLiteral(String),
    /// The keyword `true`.
    True,
    /// The keyword `false`.
    False,
    /// The keyword `null`.
    Null,
    /// A word that is not one of the recognised keywords.
    Unknown(String),
}
20
21#[derive(Clone, Debug)]
22pub struct TokenPos(pub Token, pub usize, pub usize);
23
/// Byte-oriented lexer that turns a JSON document into a list of tokens.
pub struct Lexer {
    /// The raw input being tokenised.
    buffer: Vec<u8>,
    /// Index of the next unread byte; the start of the token being scanned.
    pos: usize,
    /// Scan cursor; `buffer[pos..marker]` is the text highlighted so far.
    marker: usize,
    /// Line of the byte at `pos`, starting at 1.
    line: usize,
    /// Column of the byte at `pos`, starting at 1.
    column: usize,
}
31
32impl Lexer {
33 pub fn new(buffer: Vec<u8>) -> Lexer {
34 Lexer {
35 buffer,
36 pos: 0,
37 marker: 0,
38 line: 1,
39 column: 1,
40 }
41 }
42
43 #[inline]
44 fn curr(&self) -> u8 {
45 self.buffer[self.pos]
46 }
47 #[inline]
48 fn mark(&self) -> u8 {
49 self.buffer[self.marker]
50 }
51
52 fn advance(&mut self, len: usize) -> json::Result<()> {
54 if self.pos + len > self.buffer.len() {
56 return Err(JSONError::LexerError(
57 format!(
58 "new position {} out of bounds for buffer length {}",
59 self.pos + len,
60 self.buffer.len(),
61 )
62 ));
63 }
64
65 let mut line_breaks: Vec<usize> = vec![];
67 for i in self.pos..self.pos + len {
68 if self.buffer[i] == b'\n' {
69 line_breaks.push(i);
70 }
71 }
72
73 self.pos += len;
75 self.marker = self.pos;
76
77 if line_breaks.len() > 0 {
79 self.line += line_breaks.len();
81 self.column = self.pos - line_breaks.pop().unwrap();
83 } else {
84 self.column += len;
86 }
87
88 Ok(())
89 }
90
91 #[allow(dead_code)]
93 fn seek(&mut self, codepoint: u8) -> json::Result<()> {
94 self.marker = self.pos + 1;
96 while self.mark() != codepoint {
97 self.marker += 1;
98 if self.marker >= self.buffer.len() {
99 return Err(JSONError::LexerError(
100 format!(
101 "codepoint {} never found",
102 codepoint as char,
103 )
104 ));
105 }
106 }
107 self.marker += 1;
109
110 Ok(())
111 }
112
113 fn seek_in(&mut self, low: u8, high: u8) {
114 while self.marker < self.buffer.len() && self.mark() >= low && self.mark() <= high {
115 self.marker += 1;
116 }
117 }
118
119 fn seek_all(&mut self, values: &[u8]) {
120 while self.marker < self.buffer.len() {
121 if values.iter().any(|&val| val == self.mark()) {
122 self.marker += 1;
123 } else {
124 break;
125 }
126 }
127 }
128
129 fn highlighted(&self) -> &str {
130 core::str::from_utf8(&self.buffer[self.pos..self.marker]).unwrap()
131 }
132
133 pub fn tokenify(&mut self) -> json::Result<Vec<TokenPos>> {
134 const ALPHABET: [u8; 52] = [
136 b'a', b'b', b'c', b'd', b'e', b'f', b'g',
137 b'h', b'i', b'j', b'k', b'l', b'm', b'n',
138 b'o', b'p', b'q', b'r', b's', b't', b'u',
139 b'v', b'w', b'x', b'y', b'z',
140 b'A', b'B', b'C', b'D', b'E', b'F', b'G',
141 b'H', b'I', b'J', b'K', b'L', b'M', b'N',
142 b'O', b'P', b'Q', b'R', b'S', b'T', b'U',
143 b'V', b'W', b'X', b'Y', b'Z',
144 ];
145
146 self.pos = 0;
147
148 let mut tokens: Vec<TokenPos> = vec![];
149
150 loop {
151 if self.pos == self.buffer.len() {
152 break Ok(tokens);
153 }
154 match self.curr() {
155 b'{' => {
156 tokens.push(TokenPos(Token::OpenBrace, self.line, self.column));
157 self.advance(1)?;
158 },
159 b'}' => {
160 tokens.push(TokenPos(Token::CloseBrace, self.line, self.column));
161 self.advance(1)?;
162 },
163 b'[' => {
164 tokens.push(TokenPos(Token::OpenBracket, self.line, self.column));
165 self.advance(1)?;
166 },
167 b']' => {
168 tokens.push(TokenPos(Token::CloseBracket, self.line, self.column));
169 self.advance(1)?;
170 },
171 b':' => {
172 tokens.push(TokenPos(Token::Colon, self.line, self.column));
173 self.advance(1)?;
174 },
175 b',' => {
176 tokens.push(TokenPos(Token::Comma, self.line, self.column));
177 self.advance(1)?;
178 },
179 b' ' => {
180 self.advance(1)?;
181 },
182 b'\n' => {
183 self.advance(1)?;
184 },
185 b'"' => {
186 self.marker = self.pos + 1;
188 loop {
189 self.marker += 1;
190 if self.mark() == b'"' {
191 if self.buffer[self.marker - 1] != b'\\' {
192 break;
193 }
194 }
195 if self.marker >= self.buffer.len() {
196 return Err(JSONError::LexerError(
197 format!(
198 "ending \" never found"
199 )
200 ));
201 }
202 }
203 self.marker += 1;
205 tokens.push(TokenPos(
206 Token::StringLiteral(self.highlighted().to_owned()),
207 self.line,
208 self.column,
209 ));
210 self.advance(self.marker - self.pos)?;
211 },
212 b't' => {
213 self.seek_all(&ALPHABET);
214
215 if self.highlighted() == "true" {
216 tokens.push(TokenPos(Token::True, self.line, self.column));
217 } else {
218 tokens.push(TokenPos(
219 Token::Unknown(self.highlighted().to_owned()),
220 self.line,
221 self.column,
222 ));
223 }
224
225 self.advance(self.marker - self.pos)?;
226 },
227 b'f' => {
228 self.seek_all(&ALPHABET);
229
230 if self.highlighted() == "false" {
231 tokens.push(TokenPos(Token::False, self.line, self.column));
232 } else {
233 tokens.push(TokenPos(
234 Token::Unknown(self.highlighted().to_owned()),
235 self.line,
236 self.column,
237 ));
238 }
239
240 self.advance(self.marker - self.pos)?;
241 },
242 b'n' => {
243 self.seek_all(&ALPHABET);
244
245 if self.highlighted() == "null" {
246 tokens.push(TokenPos(Token::Null, self.line, self.column));
247 } else {
248 tokens.push(TokenPos(
249 Token::Unknown(self.highlighted().to_owned()),
250 self.line,
251 self.column,
252 ));
253 }
254
255 self.advance(self.marker - self.pos)?;
256 },
257 b'A'..=b'z' => {
258 self.seek_in(b'A', b'z');
259 tokens.push(TokenPos(
260 Token::Unknown(self.highlighted().to_owned()),
261 self.line,
262 self.column,
263 ));
264 self.advance(self.marker - self.pos)?;
265 },
266 b'0'..=b'9' | b'-' | b'+' | b'.' => {
267 const NUM_CHARS: [u8; 15] = [
268 b'0', b'1', b'2', b'3',
269 b'4', b'5', b'6', b'7',
270 b'8', b'9', b'.', b'e',
271 b'E', b'+', b'-',
272 ];
273 self.seek_all(&NUM_CHARS);
274 tokens.push(TokenPos(
275 Token::NumericLiteral(self.highlighted().to_owned()),
276 self.line,
277 self.column,
278 ));
279 self.advance(self.marker - self.pos)?;
280 },
281 _ => {
282 break Err(JSONError::LexerError(
283 format!(
284 "invalid character '{}' at line {}, column {}",
285 self.curr() as char,
286 self.line,
287 self.column,
288 )
289 ));
290 }
291 }
292 }
293 }
294}