Skip to main content

lust/lexer/
lexer_impl.rs

1use super::token::{Token, TokenKind};
2use crate::error::{LustError, Result};
3use alloc::{
4    format,
5    string::{String, ToString},
6    vec::Vec,
7};
8pub struct Lexer {
9    input: Vec<char>,
10    position: usize,
11    line: usize,
12    column: usize,
13}
14
15impl Lexer {
16    pub fn new(input: &str) -> Self {
17        Self {
18            input: input.chars().collect(),
19            position: 0,
20            line: 1,
21            column: 1,
22        }
23    }
24
25    pub fn tokenize(&mut self) -> Result<Vec<Token>> {
26        let mut tokens = Vec::new();
27        while !self.is_at_end() {
28            self.skip_whitespace_and_comments()?;
29            if self.is_at_end() {
30                break;
31            }
32
33            let token = self.next_token()?;
34            tokens.push(token);
35        }
36
37        tokens.push(Token::new(
38            TokenKind::Eof,
39            String::new(),
40            self.line,
41            self.column,
42        ));
43        Ok(tokens)
44    }
45
46    fn next_token(&mut self) -> Result<Token> {
47        let start_line = self.line;
48        let start_column = self.column;
49        let ch = self.current_char();
50        let (kind, lexeme) = match ch {
51            '(' => {
52                self.advance();
53                (TokenKind::LeftParen, "(".to_string())
54            }
55
56            ')' => {
57                self.advance();
58                (TokenKind::RightParen, ")".to_string())
59            }
60
61            '{' => {
62                self.advance();
63                (TokenKind::LeftBrace, "{".to_string())
64            }
65
66            '}' => {
67                self.advance();
68                (TokenKind::RightBrace, "}".to_string())
69            }
70
71            '[' => {
72                self.advance();
73                (TokenKind::LeftBracket, "[".to_string())
74            }
75
76            ']' => {
77                self.advance();
78                (TokenKind::RightBracket, "]".to_string())
79            }
80
81            ',' => {
82                self.advance();
83                (TokenKind::Comma, ",".to_string())
84            }
85
86            ';' => {
87                self.advance();
88                (TokenKind::Semicolon, ";".to_string())
89            }
90
91            '%' => {
92                self.advance();
93                (TokenKind::Percent, "%".to_string())
94            }
95
96            '^' => {
97                self.advance();
98                (TokenKind::Caret, "^".to_string())
99            }
100
101            '?' => {
102                self.advance();
103                (TokenKind::Question, "?".to_string())
104            }
105
106            '&' => {
107                self.advance();
108                (TokenKind::Ampersand, "&".to_string())
109            }
110
111            '|' => {
112                self.advance();
113                (TokenKind::Pipe, "|".to_string())
114            }
115
116            '+' => {
117                self.advance();
118                if self.current_char() == '=' {
119                    self.advance();
120                    (TokenKind::PlusEqual, "+=".to_string())
121                } else {
122                    (TokenKind::Plus, "+".to_string())
123                }
124            }
125
126            '-' => {
127                self.advance();
128                if self.current_char() == '=' {
129                    self.advance();
130                    (TokenKind::MinusEqual, "-=".to_string())
131                } else if self.current_char() == '>' {
132                    self.advance();
133                    (TokenKind::Arrow, "->".to_string())
134                } else {
135                    (TokenKind::Minus, "-".to_string())
136                }
137            }
138
139            '*' => {
140                self.advance();
141                if self.current_char() == '=' {
142                    self.advance();
143                    (TokenKind::StarEqual, "*=".to_string())
144                } else {
145                    (TokenKind::Star, "*".to_string())
146                }
147            }
148
149            '/' => {
150                self.advance();
151                if self.current_char() == '=' {
152                    self.advance();
153                    (TokenKind::SlashEqual, "/=".to_string())
154                } else {
155                    (TokenKind::Slash, "/".to_string())
156                }
157            }
158
159            '=' => {
160                self.advance();
161                if self.current_char() == '=' {
162                    self.advance();
163                    (TokenKind::DoubleEqual, "==".to_string())
164                } else if self.current_char() == '>' {
165                    self.advance();
166                    (TokenKind::FatArrow, "=>".to_string())
167                } else {
168                    (TokenKind::Equal, "=".to_string())
169                }
170            }
171
172            '~' => {
173                self.advance();
174                if self.current_char() == '=' {
175                    self.advance();
176                    (TokenKind::NotEqual, "~=".to_string())
177                } else {
178                    return Err(LustError::LexerError {
179                        line: start_line,
180                        column: start_column,
181                        message: format!("Unexpected character: {}", ch),
182                        module: None,
183                    });
184                }
185            }
186
187            '!' => {
188                self.advance();
189                if self.current_char() == '=' {
190                    self.advance();
191                    (TokenKind::NotEqual, "!=".to_string())
192                } else {
193                    return Err(LustError::LexerError {
194                        line: start_line,
195                        column: start_column,
196                        message: format!("Unexpected character: {}", ch),
197                        module: None,
198                    });
199                }
200            }
201
202            '<' => {
203                self.advance();
204                if self.current_char() == '=' {
205                    self.advance();
206                    (TokenKind::LessEqual, "<=".to_string())
207                } else {
208                    (TokenKind::Less, "<".to_string())
209                }
210            }
211
212            '>' => {
213                self.advance();
214                if self.current_char() == '=' {
215                    self.advance();
216                    (TokenKind::GreaterEqual, ">=".to_string())
217                } else {
218                    (TokenKind::Greater, ">".to_string())
219                }
220            }
221
222            ':' => {
223                self.advance();
224                if self.current_char() == ':' {
225                    self.advance();
226                    (TokenKind::DoubleColon, "::".to_string())
227                } else {
228                    (TokenKind::Colon, ":".to_string())
229                }
230            }
231
232            '.' => {
233                self.advance();
234                if self.current_char() == '.' {
235                    self.advance();
236                    (TokenKind::DoubleDot, "..".to_string())
237                } else {
238                    (TokenKind::Dot, ".".to_string())
239                }
240            }
241
242            '"' | '\'' => self.scan_string()?,
243            '0'..='9' => self.scan_number()?,
244            'a'..='z' | 'A'..='Z' | '_' => self.scan_identifier()?,
245            _ => {
246                return Err(LustError::LexerError {
247                    line: start_line,
248                    column: start_column,
249                    message: format!("Unexpected character: {}", ch),
250                    module: None,
251                });
252            }
253        };
254        Ok(Token::new(kind, lexeme, start_line, start_column))
255    }
256
257    fn scan_string(&mut self) -> Result<(TokenKind, String)> {
258        let quote = self.current_char();
259        let start_line = self.line;
260        let start_column = self.column;
261        self.advance();
262        let mut value = String::new();
263        value.push(quote);
264        while !self.is_at_end() && self.current_char() != quote {
265            if self.current_char() == '\\' {
266                value.push(self.current_char());
267                self.advance();
268                if !self.is_at_end() {
269                    value.push(self.current_char());
270                    self.advance();
271                }
272            } else {
273                value.push(self.current_char());
274                self.advance();
275            }
276        }
277
278        if self.is_at_end() {
279            return Err(LustError::LexerError {
280                line: start_line,
281                column: start_column,
282                message: "Unterminated string".to_string(),
283                module: None,
284            });
285        }
286
287        value.push(self.current_char());
288        self.advance();
289        Ok((TokenKind::String, value))
290    }
291
292    fn scan_number(&mut self) -> Result<(TokenKind, String)> {
293        let mut value = String::new();
294        let mut is_float = false;
295        while !self.is_at_end() && self.current_char().is_ascii_digit() {
296            value.push(self.current_char());
297            self.advance();
298        }
299
300        if !self.is_at_end() && self.current_char() == '.' {
301            if self.peek(1) != Some('.') && self.peek(1).map_or(false, |c| c.is_ascii_digit()) {
302                is_float = true;
303                value.push(self.current_char());
304                self.advance();
305                while !self.is_at_end() && self.current_char().is_ascii_digit() {
306                    value.push(self.current_char());
307                    self.advance();
308                }
309            }
310        }
311
312        if !self.is_at_end() && (self.current_char() == 'e' || self.current_char() == 'E') {
313            is_float = true;
314            value.push(self.current_char());
315            self.advance();
316            if !self.is_at_end() && (self.current_char() == '+' || self.current_char() == '-') {
317                value.push(self.current_char());
318                self.advance();
319            }
320
321            while !self.is_at_end() && self.current_char().is_ascii_digit() {
322                value.push(self.current_char());
323                self.advance();
324            }
325        }
326
327        let kind = if is_float {
328            TokenKind::Float
329        } else {
330            TokenKind::Integer
331        };
332        Ok((kind, value))
333    }
334
335    fn scan_identifier(&mut self) -> Result<(TokenKind, String)> {
336        let mut value = String::new();
337        while !self.is_at_end()
338            && (self.current_char().is_alphanumeric() || self.current_char() == '_')
339        {
340            value.push(self.current_char());
341            self.advance();
342        }
343
344        let kind = TokenKind::keyword(&value).unwrap_or(TokenKind::Identifier);
345        Ok((kind, value))
346    }
347
348    fn skip_whitespace_and_comments(&mut self) -> Result<()> {
349        while !self.is_at_end() {
350            match self.current_char() {
351                ' ' | '\t' | '\r' => {
352                    self.advance();
353                }
354
355                '\n' => {
356                    self.advance();
357                    self.line += 1;
358                    self.column = 1;
359                }
360
361                '-' => {
362                    if self.peek(1) == Some('-') {
363                        if self.peek(2) == Some('[') && self.peek(3) == Some('[') {
364                            self.advance();
365                            self.advance();
366                            self.advance();
367                            self.advance();
368                            self.skip_block_comment()?;
369                            continue;
370                        }
371
372                        self.advance();
373                        self.advance();
374                        while !self.is_at_end() && self.current_char() != '\n' {
375                            self.advance();
376                        }
377                    } else {
378                        break;
379                    }
380                }
381
382                '#' => {
383                    self.advance();
384                    while !self.is_at_end() && self.current_char() != '\n' {
385                        self.advance();
386                    }
387                }
388
389                _ => break,
390            }
391        }
392
393        Ok(())
394    }
395
396    fn skip_block_comment(&mut self) -> Result<()> {
397        while !self.is_at_end() {
398            if self.current_char() == ']' && self.peek(1) == Some(']') {
399                self.advance();
400                self.advance();
401                return Ok(());
402            }
403
404            if self.current_char() == '\n' {
405                self.advance();
406                self.line += 1;
407                self.column = 1;
408            } else {
409                self.advance();
410            }
411        }
412
413        Err(LustError::LexerError {
414            line: self.line,
415            column: self.column,
416            message: "Unterminated block comment".to_string(),
417            module: None,
418        })
419    }
420
421    fn current_char(&self) -> char {
422        if self.is_at_end() {
423            '\0'
424        } else {
425            self.input[self.position]
426        }
427    }
428
429    fn peek(&self, offset: usize) -> Option<char> {
430        let pos = self.position + offset;
431        if pos < self.input.len() {
432            Some(self.input[pos])
433        } else {
434            None
435        }
436    }
437
438    fn advance(&mut self) {
439        if !self.is_at_end() {
440            self.position += 1;
441            self.column += 1;
442        }
443    }
444
445    fn is_at_end(&self) -> bool {
446        self.position >= self.input.len()
447    }
448}