phoenix_lang/
scanner.rs

1use crate::scanner::TokenType::Identifier;
2
// TODO: hold the file name by reference; right now the filename gets cloned each time.
/// On-demand scanner that turns source text into [`Token`]s one at a time.
#[derive(Debug, Clone)]
pub struct Scanner {
    /// Name of the file the source text belongs to.
    pub file: String,
    /// The source text being scanned.
    pub code: String,
    /// Line counter; bumped every time the scanner consumes a `\n`.
    pub cur_line: usize,
    /// Byte offset in `code` where the token currently being scanned begins.
    pub start_pos: usize,
    /// Byte offset in `code` of the next byte that has not been consumed yet.
    pub cur_pos: usize,
}
12
13impl Default for Scanner {
14    fn default() -> Self {
15        Scanner {
16            file: String::new(),
17            code: "".to_string(),
18            cur_line: 0,
19            start_pos: 0,
20            cur_pos: 0,
21        }
22    }
23}
24
25#[derive(Debug, Clone)]
26pub struct Token {
27    pub token_type: TokenType,
28    pub line_num: usize,
29    pub lexeme: String,
30}
31
/// Every kind of token the scanner can produce.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum TokenType {
    /// `(`
    LeftParen,
    /// `)`
    RightParen,
    /// `{`
    LeftBrace,
    /// `}`
    RightBrace,
    /// `[`
    LeftBracket,
    /// `]`
    RightBracket,
    /// `,`
    Comma,
    /// `.`
    Dot,
    /// `;`
    Semicolon,
    /// `-`
    Minus,
    /// `-=`
    MinusAssign,
    /// `--`
    MinusMinus,
    /// `+`
    Plus,
    /// `+=`
    PlusAssign,
    /// `++`
    PlusPlus,
    /// `/`
    Slash,
    /// `/=`
    SlashAssign,
    /// `#`
    HashTag,
    /// `*`
    Star,
    /// `*=`
    StarAssign,
    /// `!`
    Bang,
    /// `!=`
    BangEqual,
    /// `=`
    Equal,
    /// `==`
    EqualEqual,
    /// `>`
    Greater,
    /// `>=`
    GreaterEqual,
    /// `<`
    Less,
    /// `<=`
    LessEqual,
    /// `:`
    Colon,

    /// A name made of ASCII letters, digits and `_` that is not a keyword.
    Identifier,
    /// A string literal delimited by double quotes.
    String,
    /// A numeric literal: digits, an optional fractional part, and an optional trailing `f`.
    Number,

    /// Keyword `and`.
    And,
    /// Keyword `class`.
    Class,
    /// Keyword `else`.
    Else,
    /// Keyword `false`.
    False,
    /// Keyword `for`.
    For,
    /// Keyword `fun`.
    Fun,
    /// Keyword `if`.
    If,
    /// Keyword `nil`.
    Nil,
    /// Keyword `or`.
    Or,
    /// Keyword `print`.
    Print,
    /// Keyword `return`.
    Return,
    /// Keyword `super`.
    Super,
    /// Keyword `this`.
    This,
    /// Keyword `true`.
    True,
    /// Keyword `var`.
    Var,
    /// Keyword `while`.
    While,
    /// A scanning failure; the token's lexeme carries the error message.
    Error,
    /// End of the source text.
    EOF,
    /// Keyword `import`.
    Import,
}
94
95impl Scanner {
96    pub fn new(file: String, code: String, cur_line: usize) -> Scanner {
97        Scanner {
98            file,
99            code,
100            cur_line,
101            start_pos: 0,
102            cur_pos: 0,
103        }
104    }
105
106    fn create_token(&self, token_type: TokenType) -> Token {
107        // println!("code: {}|", self.code);
108        Token {
109            token_type,
110            line_num: self.cur_line,
111            lexeme: if self.code.is_empty() {
112                "".to_string()
113            } else {
114                self.code[self.start_pos..self.cur_pos].to_string()
115            },
116        }
117    }
118
119    fn error_token(&self, message: &str) -> Token {
120        Token {
121            token_type: TokenType::Error,
122            line_num: self.cur_line,
123            lexeme: message.to_string(),
124        }
125    }
126
127    pub fn scan_token(&mut self) -> Token {
128        self.skip_whitespace();
129        self.start_pos = self.cur_pos;
130
131        if self.is_at_end() {
132            return self.create_token(TokenType::EOF);
133        }
134
135        let c = self.advance();
136
137        if Self::is_digit(c) {
138            return self.number();
139        }
140
141        if Self::is_alpha(c) {
142            return self.identifier();
143        }
144
145        match c {
146            b'(' => self.create_token(TokenType::LeftParen),
147            b')' => self.create_token(TokenType::RightParen),
148            b'{' => self.create_token(TokenType::LeftBrace),
149            b'}' => self.create_token(TokenType::RightBrace),
150            b'[' => self.create_token(TokenType::LeftBracket),
151            b']' => self.create_token(TokenType::RightBracket),
152            b',' => self.create_token(TokenType::Comma),
153            b'.' => self.create_token(TokenType::Dot),
154            b'-' => {
155                if self.match_char(b'=') {
156                    self.create_token(TokenType::MinusAssign)
157                } else if self.match_char(b'-') {
158                    self.create_token(TokenType::MinusMinus)
159                } else {
160                    self.create_token(TokenType::Minus)
161                }
162            }
163            b'+' => {
164                if self.match_char(b'=') {
165                    self.create_token(TokenType::PlusAssign)
166                } else if self.match_char(b'+') {
167                    self.create_token(TokenType::PlusPlus)
168                } else {
169                    self.create_token(TokenType::Plus)
170                }
171            }
172            b';' => self.create_token(TokenType::Semicolon),
173            b'*' => {
174                if self.match_char(b'=') {
175                    self.create_token(TokenType::StarAssign)
176                } else {
177                    self.create_token(TokenType::Star)
178                }
179            }
180            b'/' => {
181                if self.match_char(b'=') {
182                    self.create_token(TokenType::SlashAssign)
183                } else {
184                    self.create_token(TokenType::Slash)
185                }
186            }
187            b'#' => self.create_token(TokenType::HashTag),
188            b'!' => {
189                if self.match_char(b'=') {
190                    self.create_token(TokenType::BangEqual)
191                } else {
192                    self.create_token(TokenType::Bang)
193                }
194            }
195            b'=' => {
196                if self.match_char(b'=') {
197                    self.create_token(TokenType::EqualEqual)
198                } else {
199                    self.create_token(TokenType::Equal)
200                }
201            }
202            b'<' => {
203                if self.match_char(b'=') {
204                    self.create_token(TokenType::LessEqual)
205                } else {
206                    self.create_token(TokenType::Less)
207                }
208            }
209            b'>' => {
210                if self.match_char(b'=') {
211                    self.create_token(TokenType::GreaterEqual)
212                } else {
213                    self.create_token(TokenType::Greater)
214                }
215            }
216            b'"' => self.string(),
217            b':' => self.create_token(TokenType::Colon),
218            _ => self.error_token("Unexpected character."),
219        }
220    }
221
222    fn is_at_end(&self) -> bool {
223        self.cur_pos >= self.code.len()
224    }
225
226    fn is_digit(c: u8) -> bool {
227        c.is_ascii_digit()
228    }
229
230    fn is_alpha(c: u8) -> bool {
231        c.is_ascii_lowercase() || c.is_ascii_uppercase() || c == b'_'
232    }
233
234    fn can_peek_next(&self) -> bool {
235        self.cur_pos + 2 <= self.code.len()
236    }
237
238    fn advance(&mut self) -> u8 {
239        let ret = self.peek();
240        self.cur_pos += 1;
241        ret
242    }
243
244    fn match_char(&mut self, expected: u8) -> bool {
245        if self.is_at_end() || self.peek() != expected {
246            false
247        } else {
248            self.cur_pos += 1;
249            true
250        }
251    }
252
253    fn peek(&self) -> u8 {
254        if self.is_at_end() {
255            return b'\0';
256        }
257        self.code.as_bytes()[self.cur_pos]
258    }
259
260    fn peek_next(&self) -> u8 {
261        if self.is_at_end() {
262            return b'\0';
263        }
264        self.code.as_bytes()[self.cur_pos + 1]
265    }
266
267    fn skip_whitespace(&mut self) {
268        while !self.is_at_end() {
269            let next = self.peek();
270            if (next == b' ') || (next == b'\t') || (next == b'\r') {
271                self.advance();
272            } else if next == b'\n' {
273                self.advance();
274                self.cur_line += 1;
275            } else if next == b'/' {
276                if self.can_peek_next() && self.peek_next() == b'/' {
277                    while !self.is_at_end() && self.peek() != b'\n' {
278                        self.advance(); // Advance over the second '/' and the rest of the line, or until we hit the end of the file
279                    }
280
281                    if !self.is_at_end() {
282                        self.advance(); // consume the \n
283                        self.cur_line += 1;
284                    }
285                } else {
286                    return; // Return on single slash
287                }
288                // check for #! at the start of the file
289            } else if next == b'#' && self.cur_pos == 0 {
290                while !self.is_at_end() && self.peek() != b'\n' {
291                    self.advance();
292                }
293
294                if !self.is_at_end() {
295                    self.advance();
296                    self.cur_line += 1;
297                }
298            } else {
299                return;
300            }
301        }
302    }
303
304    fn string(&mut self) -> Token {
305        while !self.is_at_end() && self.peek() != b'"' {
306            if self.peek() == b'\n' {
307                self.cur_line += 1;
308            }
309            self.advance();
310        }
311
312        if self.is_at_end() {
313            return self.error_token("Unterminated string.");
314        }
315
316        self.advance(); // consume the closing "
317
318        self.create_token(TokenType::String)
319    }
320
321    fn number(&mut self) -> Token {
322        if self.cur_pos == self.code.len() {
323            return self.error_token("Unexpected end of file.");
324        }
325        while Self::is_digit(self.peek()) {
326            self.advance();
327        }
328
329        if self.peek() == b'.' && Self::is_digit(self.peek_next()) {
330            self.advance(); // consume the .
331
332            while Self::is_digit(self.peek()) {
333                self.advance();
334            }
335        }
336
337        // also take an f after the number to make it a float
338        if self.peek() == b'f' {
339            self.advance();
340        }
341
342        self.create_token(TokenType::Number)
343    }
344
345    fn identifier(&mut self) -> Token {
346        if self.cur_pos == self.code.len() {
347            return self.error_token("Unexpected end of file.");
348        }
349        while Self::is_alpha(self.peek()) || Self::is_digit(self.peek()) {
350            self.advance();
351        }
352
353        self.create_token(self.identifier_type())
354    }
355
356    fn identifier_type(&self) -> TokenType {
357        let c = self.code.as_bytes()[self.start_pos];
358        return match c {
359            b'a' => self.check_for_keyword(1, 2, "nd", TokenType::And),
360            b'c' => self.check_for_keyword(1, 4, "lass", TokenType::Class),
361            b'e' => self.check_for_keyword(1, 3, "lse", TokenType::Else),
362            b'i' => {
363                if self.cur_pos - self.start_pos > 1 {
364                    // more than 1 char in this maybe keyword
365                    match self.code.as_bytes()[self.start_pos + 1] {
366                        b'f' => TokenType::If,
367                        b'm' => self.check_for_keyword(2, 4, "port", TokenType::Import),
368                        _ => Identifier,
369                    }
370                } else {
371                    Identifier
372                }
373            }
374            b'n' => self.check_for_keyword(1, 2, "il", TokenType::Nil),
375            b'o' => self.check_for_keyword(1, 1, "r", TokenType::Or),
376            b'p' => self.check_for_keyword(1, 4, "rint", TokenType::Print),
377            b'r' => self.check_for_keyword(1, 5, "eturn", TokenType::Return),
378            b's' => self.check_for_keyword(1, 4, "uper", TokenType::Super),
379            b'v' => self.check_for_keyword(1, 2, "ar", TokenType::Var),
380            b'w' => self.check_for_keyword(1, 4, "hile", TokenType::While),
381            b'f' => {
382                if self.cur_pos - self.start_pos > 1 {
383                    // more than 1 char in this maybe keyword
384                    match self.code.as_bytes()[self.start_pos + 1] {
385                        b'a' => self.check_for_keyword(2, 3, "lse", TokenType::False),
386                        b'o' => self.check_for_keyword(2, 1, "r", TokenType::For),
387                        b'u' => self.check_for_keyword(2, 1, "n", TokenType::Fun),
388                        _ => Identifier,
389                    }
390                } else {
391                    Identifier
392                }
393            }
394            b't' => {
395                if self.cur_pos - self.start_pos > 1 {
396                    // more than 1 char in this maybe keyword
397                    match self.code.as_bytes()[self.start_pos + 1] {
398                        b'h' => self.check_for_keyword(2, 2, "is", TokenType::This),
399                        b'r' => self.check_for_keyword(2, 2, "ue", TokenType::True),
400                        _ => Identifier,
401                    }
402                } else {
403                    Identifier
404                }
405            }
406            _ => Identifier,
407        };
408    }
409
410    fn check_for_keyword(
411        &self,
412        start: usize,
413        length: usize,
414        rest: &str,
415        keyword_type: TokenType,
416    ) -> TokenType {
417        if self.cur_pos - self.start_pos == start + length {
418            // this will check that begin + length is within the array, since we already moved cur_pos exactly that far
419            let begin = self.start_pos + start;
420            if &self.code[begin..begin + length] == rest {
421                return keyword_type;
422            }
423        }
424        Identifier
425    }
426}