brainterpreter/lexer/
mod.rs

1//! Lexer for the language tokens
2
3use log::error;
4
5use token::Token;
6
7use crate::source::Position;
8
9pub mod token;
10
11/// Adds debug information to the token
12#[derive(Debug, Clone, PartialEq)]
13pub struct SourceToken {
14    kind: Token,
15    source: Position,
16}
17
18#[derive(Debug)]
19pub struct Lexer<'a> {
20    source: &'a str,
21    start: usize,
22    pos: usize,
23    line: usize,
24    column: usize,
25}
26
27impl<'a> Lexer<'a> {
28    pub fn new(source: &'a str) -> Self {
29        Lexer {
30            source,
31            pos: 0,
32            start: 0,
33            line: 1,
34            column: 1,
35        }
36    }
37
38    pub fn next_token(&mut self) -> SourceToken {
39        let mut maybe_token = self.advance_token();
40        while maybe_token.is_none() {
41            maybe_token = self.advance_token();
42        }
43        maybe_token.unwrap()
44    }
45
46    fn advance_token(&mut self) -> Option<SourceToken> {
47        self.skip_whitespace();
48        if self.at_end() {
49            return Some(Token::EndOfFile.with_position(self.src_pos()));
50        }
51        self.start = self.pos;
52        let c = self.advance().expect("character exhausted prematurely");
53        match c {
54            '+' => Some(Token::Plus.with_position(self.src_pos())),
55            '-' => Some(Token::Minus.with_position(self.src_pos())),
56            '*' => Some(Token::Star.with_position(self.src_pos())),
57            '/' => {
58                if let Some('/') = self.peek(0) {
59                    self.advance();
60                    while let Some(c) = self.peek(0) {
61                        if c == '\n' {
62                            break;
63                        }
64                        self.advance();
65                    }
66                    None
67                } else {
68                    Some(Token::Slash.with_position(self.src_pos()))
69                }
70            }
71            '(' => Some(Token::LeftParen.with_position(self.src_pos())),
72            ')' => Some(Token::RightParen.with_position(self.src_pos())),
73            '{' => Some(Token::LeftCurly.with_position(self.src_pos())),
74            '}' => Some(Token::RightCurly.with_position(self.src_pos())),
75            '[' => Some(Token::LeftSquare.with_position(self.src_pos())),
76            ']' => Some(Token::RightSquare.with_position(self.src_pos())),
77            '=' => {
78                if self.advance_if('=') {
79                    Some(Token::EqualEqual.with_position(self.src_pos()))
80                } else {
81                    Some(Token::Equal.with_position(self.src_pos()))
82                }
83            }
84            '!' => {
85                if self.advance_if('=') {
86                    Some(Token::BangEqual.with_position(self.src_pos()))
87                } else {
88                    Some(Token::Bang.with_position(self.src_pos()))
89                }
90            }
91            '<' => {
92                if self.advance_if('=') {
93                    Some(Token::LessEqual.with_position(self.src_pos()))
94                } else {
95                    Some(Token::Less.with_position(self.src_pos()))
96                }
97            }
98            '>' => {
99                if self.advance_if('=') {
100                    Some(Token::GreaterEqual.with_position(self.src_pos()))
101                } else {
102                    Some(Token::Greater.with_position(self.src_pos()))
103                }
104            }
105            ';' => Some(Token::Semicolon.with_position(self.src_pos())),
106            ',' => Some(Token::Comma.with_position(self.src_pos())),
107            '0'..='9' => Some(self.number()),
108            'a'..='z' | 'A'..='Z' | '_' => Some(self.identifier()),
109            '"' => Some(self.string_literal()),
110            _ => {
111                error!("unknown token: {}", c);
112                Some(Token::Error.with_position(self.src_pos()))
113            }
114        }
115    }
116
117    fn number(&mut self) -> SourceToken {
118        while let Some(c) = self.peek(0) {
119            if !c.is_ascii_digit() {
120                break;
121            }
122            self.advance();
123        }
124
125        if let Some('.') = self.peek(0) {
126            self.advance();
127            while let Some(c) = self.peek(0) {
128                if !c.is_ascii_digit() {
129                    break;
130                }
131                self.advance();
132            }
133        }
134        let number_literal = &self.source[self.start..self.pos];
135        let value: f64 = number_literal.parse().expect("must be a correct number");
136        Token::Number(value).with_position(self.src_pos())
137    }
138
139    fn string_literal(&mut self) -> SourceToken {
140        while let Some(c) = self.peek(0) {
141            if c == '"' {
142                break;
143            }
144            self.advance();
145        }
146        self.advance();
147        let string_literal = &self.source[(self.start + 1)..(self.pos - 1)];
148        Token::StringLiteral(string_literal.to_string()).with_position(self.src_pos())
149    }
150
151    fn identifier(&mut self) -> SourceToken {
152        while let Some(c) = self.peek(0) {
153            if !c.is_ascii_alphanumeric() && c != '_' {
154                break;
155            }
156            self.advance();
157        }
158        let identifier = &self.source[self.start..self.pos];
159        match identifier {
160            "print" => Token::Print.with_position(self.src_pos()),
161            "let" => Token::Let.with_position(self.src_pos()),
162            "true" => Token::True.with_position(self.src_pos()),
163            "false" => Token::False.with_position(self.src_pos()),
164            "if" => Token::If.with_position(self.src_pos()),
165            "else" => Token::Else.with_position(self.src_pos()),
166            "while" => Token::While.with_position(self.src_pos()),
167            "fun" => Token::Fun.with_position(self.src_pos()),
168            "return" => Token::Return.with_position(self.src_pos()),
169            "nil" => Token::Nil.with_position(self.src_pos()),
170            _ => Token::Identifier(identifier.to_string()).with_position(self.src_pos()),
171        }
172    }
173
174    fn advance(&mut self) -> Option<char> {
175        let c = self.source.chars().nth(self.pos);
176        self.pos += 1;
177        self.column += 1;
178        c
179    }
180
181    fn advance_if(&mut self, c: char) -> bool {
182        if self.peek(0) == Some(c) {
183            self.advance();
184            true
185        } else {
186            false
187        }
188    }
189
190    fn peek(&self, offset: usize) -> Option<char> {
191        self.source.chars().nth(self.pos + offset)
192    }
193
194    fn at_end(&self) -> bool {
195        self.pos >= self.source.len()
196    }
197
198    fn skip_whitespace(&mut self) {
199        while let Some(c) = self.peek(0) {
200            if !c.is_ascii_whitespace() {
201                break;
202            }
203            if c == '\n' {
204                self.line += 1;
205                self.column = 0;
206            }
207            self.advance();
208        }
209    }
210
211    fn src_pos(&self) -> Position {
212        Position::new(self.line, self.column - 1)
213    }
214}
215
216impl<'a> Iterator for Lexer<'a> {
217    type Item = SourceToken;
218
219    fn next(&mut self) -> Option<Self::Item> {
220        match self.next_token() {
221            SourceToken {
222                kind: Token::EndOfFile,
223                ..
224            } => None,
225            t => Some(t),
226        }
227    }
228}
229
230impl From<Token> for SourceToken {
231    fn from(token: Token) -> Self {
232        SourceToken {
233            kind: token,
234            source: Position::default(),
235        }
236    }
237}
238
239impl SourceToken {
240    pub fn new(token: Token, source: Position) -> Self {
241        SourceToken {
242            kind: token,
243            source,
244        }
245    }
246
247    pub fn kind(&self) -> &Token {
248        &self.kind
249    }
250
251    pub fn source(&self) -> &Position {
252        &self.source
253    }
254}
255
256impl PartialEq<Token> for SourceToken {
257    fn eq(&self, other: &Token) -> bool {
258        &self.kind == other
259    }
260}
261
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn empty_source() {
        let mut lexer = Lexer::new("");
        let token = lexer.next_token();
        assert_eq!(token, Token::EndOfFile);
        // End of file is sticky: asking again yields it again.
        assert_eq!(lexer.next_token(), Token::EndOfFile);
    }

    #[test]
    fn arithmetic_operators() {
        let mut lexer = Lexer::new("+");
        assert_eq!(lexer.next_token(), Token::Plus);
        assert_eq!(lexer.next_token(), Token::EndOfFile);
    }

    #[test]
    fn integer() {
        let mut lexer = Lexer::new("42");
        assert_eq!(lexer.next_token(), Token::Number(42.0));
        assert_eq!(lexer.next_token(), Token::EndOfFile);
    }

    #[test]
    fn float_point_literal() {
        let mut lexer = Lexer::new("5.52");
        assert_eq!(lexer.next_token(), Token::Number(5.52));
        assert_eq!(lexer.next_token(), Token::EndOfFile);
    }

    #[test]
    fn arithmetic_expressions() {
        let mut lexer = Lexer::new("42 + 8 / 2");
        assert_eq!(lexer.next_token(), Token::Number(42.0));
        assert_eq!(lexer.next_token(), Token::Plus);
        assert_eq!(lexer.next_token(), Token::Number(8.0));
        assert_eq!(lexer.next_token(), Token::Slash);
        assert_eq!(lexer.next_token(), Token::Number(2.0));
        assert_eq!(lexer.next_token(), Token::EndOfFile);
    }

    #[test]
    fn inline_comment() {
        let mut lexer = Lexer::new("42 + 7 // this is a comment");
        assert_eq!(lexer.next_token(), Token::Number(42.0));
        assert_eq!(lexer.next_token(), Token::Plus);
        assert_eq!(lexer.next_token(), Token::Number(7.0));
        assert_eq!(lexer.next_token(), Token::EndOfFile);
    }

    #[test]
    fn line_comment() {
        let mut lexer = Lexer::new(
            "// comment
            42 + 7",
        );
        assert_eq!(lexer.next_token(), Token::Number(42.0));
        assert_eq!(lexer.next_token(), Token::Plus);
        assert_eq!(lexer.next_token(), Token::Number(7.0));
        assert_eq!(lexer.next_token(), Token::EndOfFile);
    }

    #[test]
    fn print_statement() {
        let mut lexer = Lexer::new("print 42");
        assert_eq!(lexer.next_token(), Token::Print);
        assert_eq!(lexer.next_token(), Token::Number(42.0));
        assert_eq!(lexer.next_token(), Token::EndOfFile);
    }

    #[test]
    fn identifier() {
        let mut lexer = Lexer::new("foo");
        assert_eq!(lexer.next_token(), Token::Identifier("foo".to_string()));
        assert_eq!(lexer.next_token(), Token::EndOfFile);
    }

    #[test]
    fn variable_declaration_and_assignment() {
        let mut lexer = Lexer::new("let foo = 42;");
        assert_eq!(lexer.next_token(), Token::Let);
        assert_eq!(lexer.next_token(), Token::Identifier("foo".to_string()));
        assert_eq!(lexer.next_token(), Token::Equal);
        assert_eq!(lexer.next_token(), Token::Number(42.0));
        assert_eq!(lexer.next_token(), Token::Semicolon);
        assert_eq!(lexer.next_token(), Token::EndOfFile);
    }

    #[test]
    fn comparisons() {
        let mut lexer = Lexer::new("= == != > >= < <=");
        assert_eq!(lexer.next_token(), Token::Equal);
        assert_eq!(lexer.next_token(), Token::EqualEqual);
        assert_eq!(lexer.next_token(), Token::BangEqual);
        assert_eq!(lexer.next_token(), Token::Greater);
        assert_eq!(lexer.next_token(), Token::GreaterEqual);
        assert_eq!(lexer.next_token(), Token::Less);
        assert_eq!(lexer.next_token(), Token::LessEqual);
        assert_eq!(lexer.next_token(), Token::EndOfFile);
    }

    #[test]
    fn keywords() {
        let mut lexer = Lexer::new("print let true false if else while fun return nil");
        assert_eq!(lexer.next_token(), Token::Print);
        assert_eq!(lexer.next_token(), Token::Let);
        assert_eq!(lexer.next_token(), Token::True);
        assert_eq!(lexer.next_token(), Token::False);
        assert_eq!(lexer.next_token(), Token::If);
        assert_eq!(lexer.next_token(), Token::Else);
        assert_eq!(lexer.next_token(), Token::While);
        assert_eq!(lexer.next_token(), Token::Fun);
        assert_eq!(lexer.next_token(), Token::Return);
        assert_eq!(lexer.next_token(), Token::Nil);
        assert_eq!(lexer.next_token(), Token::EndOfFile);
    }

    #[test]
    fn punctuation() {
        let mut lexer = Lexer::new("( ) { } [ ] ; , ! * -");
        assert_eq!(lexer.next_token(), Token::LeftParen);
        assert_eq!(lexer.next_token(), Token::RightParen);
        assert_eq!(lexer.next_token(), Token::LeftCurly);
        assert_eq!(lexer.next_token(), Token::RightCurly);
        assert_eq!(lexer.next_token(), Token::LeftSquare);
        assert_eq!(lexer.next_token(), Token::RightSquare);
        assert_eq!(lexer.next_token(), Token::Semicolon);
        assert_eq!(lexer.next_token(), Token::Comma);
        assert_eq!(lexer.next_token(), Token::Bang);
        assert_eq!(lexer.next_token(), Token::Star);
        assert_eq!(lexer.next_token(), Token::Minus);
        assert_eq!(lexer.next_token(), Token::EndOfFile);
    }

    #[test]
    fn string_literal() {
        let mut lexer = Lexer::new("\"hello world\"");
        assert_eq!(
            lexer.next_token(),
            Token::StringLiteral("hello world".to_string())
        );
        assert_eq!(lexer.next_token(), Token::EndOfFile);
    }

    #[test]
    fn unknown_character() {
        let mut lexer = Lexer::new("@");
        assert_eq!(lexer.next_token(), Token::Error);
        assert_eq!(lexer.next_token(), Token::EndOfFile);
    }

    #[test]
    fn iterator_stops_before_end_of_file() {
        let tokens: Vec<SourceToken> = Lexer::new("1 + 2").collect();
        assert_eq!(tokens.len(), 3);
        assert_eq!(tokens[0], Token::Number(1.0));
        assert_eq!(tokens[1], Token::Plus);
        assert_eq!(tokens[2], Token::Number(2.0));
    }
}