json_rs/
lexer.rs

1use std::u8;
2
3use crate::json::{self, JSONError};
4
/// A single lexical token produced by the lexer.
///
/// Literal variants carry the raw source text of the token; no unescaping or
/// numeric parsing is performed at this stage.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum Token {
    /// `{`
    OpenBrace,
    /// `}`
    CloseBrace,
    /// `[`
    OpenBracket,
    /// `]`
    CloseBracket,
    /// `:`
    Colon,
    /// `,`
    Comma,
    /// Raw text of a string literal, including its surrounding double quotes.
    StringLiteral(String),
    /// Raw text of a number-like run of characters (digits, `.`, `e`, `E`, `+`, `-`).
    NumericLiteral(String),
    /// The keyword `true`.
    True,
    /// The keyword `false`.
    False,
    /// The keyword `null`.
    Null,
    /// A word that is not one of the recognised keywords.
    Unknown(String),
}
20
/// A token together with the position at which it starts in the source.
///
/// Fields, in order: the token, the line number, and the column number.
/// The lexer fills the last two from its own `line` and `column` counters,
/// both of which start at 1.
#[derive(Clone, Debug)]
pub struct TokenPos(pub Token, pub usize, pub usize);
23
/// Byte-oriented lexer state over an in-memory buffer.
pub struct Lexer {
    /// The raw input bytes being tokenized.
    buffer: Vec<u8>,
    /// Index of the current byte (start of the token being scanned).
    pos: usize,
    /// Scan-ahead index; `buffer[pos..marker]` is the currently highlighted span.
    marker: usize, 
    /// Line number of `pos`; initialised to 1.
    line: usize,
    /// Column number of `pos`; initialised to 1.
    column: usize,
}
31
32impl Lexer {
33    pub fn new(buffer: Vec<u8>) -> Lexer {
34        Lexer {
35            buffer,
36            pos: 0,
37            marker: 0,
38            line: 1,
39            column: 1,
40        }
41    }
42
43    #[inline]
44    fn curr(&self) -> u8 {
45        self.buffer[self.pos]
46    }
47    #[inline]
48    fn mark(&self) -> u8 {
49        self.buffer[self.marker]
50    }
51
52    /// Advance lexer by `len` bytes, adjusting column and line positions as necessary
53    fn advance(&mut self, len: usize) -> json::Result<()> {
54        // err if out of bounds
55        if self.pos + len > self.buffer.len() {
56            return Err(JSONError::LexerError(
57                format!(
58                    "new position {} out of bounds for buffer length {}",
59                    self.pos + len,
60                    self.buffer.len(),
61                )
62            ));
63        }
64
65        // find locations of all the line breaks
66        let mut line_breaks: Vec<usize> = vec![];
67        for i in self.pos..self.pos + len {
68            if self.buffer[i] == b'\n' {
69                line_breaks.push(i);
70            }
71        }
72
73        // advance raw character position
74        self.pos += len;
75        self.marker = self.pos;
76
77        // if line breaks were found...
78        if line_breaks.len() > 0 {
79            // increment line count by number of '\n' chars found
80            self.line += line_breaks.len();
81            // set column pos to the offset from the last line break
82            self.column = self.pos - line_breaks.pop().unwrap();
83        } else {
84            // advance column position
85            self.column += len;
86        }
87
88        Ok(())
89    }
90
91    // may use later idk
92    #[allow(dead_code)]
93    fn seek(&mut self, codepoint: u8) -> json::Result<()> {
94        // this ensures that we don't select the current position
95        self.marker = self.pos + 1;
96        while self.mark() != codepoint {
97            self.marker += 1;
98            if self.marker >= self.buffer.len() {
99                return Err(JSONError::LexerError(
100                    format!(
101                        "codepoint {} never found",
102                        codepoint as char,
103                    )
104                ));
105            }
106        }
107        // to include seeked-for character
108        self.marker += 1;
109
110        Ok(())
111    }
112
113    fn seek_in(&mut self, low: u8, high: u8) {
114        while self.marker < self.buffer.len() && self.mark() >= low && self.mark() <= high {
115            self.marker += 1;
116        }
117    }
118
119    fn seek_all(&mut self, values: &[u8]) {
120        while self.marker < self.buffer.len() {
121            if values.iter().any(|&val| val == self.mark()) {
122                self.marker += 1;
123            } else {
124                break;
125            }
126        }
127    }
128
129    fn highlighted(&self) -> &str {
130        core::str::from_utf8(&self.buffer[self.pos..self.marker]).unwrap()
131    }
132
133    pub fn tokenify(&mut self) -> json::Result<Vec<TokenPos>> {
134        // quick and dirty; will switch to better system later
135        const ALPHABET: [u8; 52] = [
136            b'a', b'b', b'c', b'd', b'e', b'f', b'g',
137            b'h', b'i', b'j', b'k', b'l', b'm', b'n',
138            b'o', b'p', b'q', b'r', b's', b't', b'u',
139            b'v', b'w', b'x', b'y', b'z',
140            b'A', b'B', b'C', b'D', b'E', b'F', b'G',
141            b'H', b'I', b'J', b'K', b'L', b'M', b'N',
142            b'O', b'P', b'Q', b'R', b'S', b'T', b'U',
143            b'V', b'W', b'X', b'Y', b'Z',
144        ];
145
146        self.pos = 0;
147
148        let mut tokens: Vec<TokenPos> = vec![];
149
150        loop {
151            if self.pos == self.buffer.len() {
152                break Ok(tokens);
153            }
154            match self.curr() {
155                b'{' => {
156                    tokens.push(TokenPos(Token::OpenBrace, self.line, self.column));
157                    self.advance(1)?;
158                },
159                b'}' => {
160                    tokens.push(TokenPos(Token::CloseBrace, self.line, self.column));
161                    self.advance(1)?;
162                },
163                b'[' => {
164                    tokens.push(TokenPos(Token::OpenBracket, self.line, self.column));
165                    self.advance(1)?;
166                },
167                b']' => {
168                    tokens.push(TokenPos(Token::CloseBracket, self.line, self.column));
169                    self.advance(1)?;
170                },
171                b':' => {
172                    tokens.push(TokenPos(Token::Colon, self.line, self.column));
173                    self.advance(1)?;
174                },
175                b',' => {
176                    tokens.push(TokenPos(Token::Comma, self.line, self.column));
177                    self.advance(1)?;
178                },
179                b' ' => {
180                    self.advance(1)?;
181                },
182                b'\n' => {
183                    self.advance(1)?;
184                },
185                b'"' => {
186                    // this ensures that we don't select the current position
187                    self.marker = self.pos + 1;
188                    loop {
189                        self.marker += 1;
190                        if self.mark() == b'"' {
191                            if self.buffer[self.marker - 1] != b'\\' {
192                                break;
193                            }
194                        }
195                        if self.marker >= self.buffer.len() {
196                            return Err(JSONError::LexerError(
197                                format!(
198                                    "ending \" never found"
199                                )
200                            ));
201                        }
202                    }
203                    // to include seeked-for character
204                    self.marker += 1;
205                    tokens.push(TokenPos(
206                        Token::StringLiteral(self.highlighted().to_owned()),
207                        self.line,
208                        self.column,
209                    ));
210                    self.advance(self.marker - self.pos)?;
211                },
212                b't' => {
213                    self.seek_all(&ALPHABET);
214
215                    if self.highlighted() == "true" {
216                        tokens.push(TokenPos(Token::True, self.line, self.column));
217                    } else {
218                        tokens.push(TokenPos(
219                            Token::Unknown(self.highlighted().to_owned()),
220                            self.line,
221                            self.column,
222                        ));
223                    }
224
225                    self.advance(self.marker - self.pos)?;
226                },
227                b'f' => {
228                    self.seek_all(&ALPHABET);
229
230                    if self.highlighted() == "false" {
231                        tokens.push(TokenPos(Token::False, self.line, self.column));
232                    } else {
233                        tokens.push(TokenPos(
234                            Token::Unknown(self.highlighted().to_owned()),
235                            self.line,
236                            self.column,
237                        ));
238                    }
239
240                    self.advance(self.marker - self.pos)?;
241                },
242                b'n' => {
243                    self.seek_all(&ALPHABET);
244
245                    if self.highlighted() == "null" {
246                        tokens.push(TokenPos(Token::Null, self.line, self.column));
247                    } else {
248                        tokens.push(TokenPos(
249                            Token::Unknown(self.highlighted().to_owned()),
250                            self.line,
251                            self.column,
252                        ));
253                    }
254
255                    self.advance(self.marker - self.pos)?;
256                },
257                b'A'..=b'z' => {
258                    self.seek_in(b'A', b'z');
259                    tokens.push(TokenPos(
260                        Token::Unknown(self.highlighted().to_owned()),
261                        self.line,
262                        self.column,
263                    ));
264                    self.advance(self.marker - self.pos)?;
265                },
266                b'0'..=b'9' | b'-' | b'+' | b'.' => {
267                    const NUM_CHARS: [u8; 15] = [
268                        b'0', b'1', b'2', b'3',
269                        b'4', b'5', b'6', b'7',
270                        b'8', b'9', b'.', b'e',
271                        b'E', b'+', b'-',
272                    ];
273                    self.seek_all(&NUM_CHARS);
274                    tokens.push(TokenPos(
275                        Token::NumericLiteral(self.highlighted().to_owned()),
276                        self.line,
277                        self.column,
278                    ));
279                    self.advance(self.marker - self.pos)?;
280                },
281                _ => {
282                    break Err(JSONError::LexerError(
283                        format!(
284                            "invalid character '{}' at line {}, column {}",
285                            self.curr() as char,
286                            self.line,
287                            self.column,
288                        )
289                    ));
290                }
291            }
292        }
293    }
294}