aether/
lexer.rs

1// src/lexer.rs
2//! Lexer for the Aether language
3//!
4//! Converts source code into a stream of tokens
5
6use crate::token::Token;
7
8/// Lexer state
9pub struct Lexer {
10    input: Vec<char>,
11    position: usize,      // current position in input (points to current char)
12    read_position: usize, // current reading position in input (after current char)
13    ch: char,             // current char under examination
14    line: usize,          // current line number (for error reporting)
15    column: usize,        // current column number (for error reporting)
16    had_whitespace_before_token: bool, // whether whitespace was skipped before current token
17}
18
19impl Lexer {
20    /// Create a new lexer from input string
21    pub fn new(input: &str) -> Self {
22        let mut lexer = Lexer {
23            input: input.chars().collect(),
24            position: 0,
25            read_position: 0,
26            ch: '\0',
27            line: 1,
28            column: 0,
29            had_whitespace_before_token: false,
30        };
31        lexer.read_char(); // Initialize by reading the first character
32        lexer
33    }
34
35    /// Get current line number
36    pub fn line(&self) -> usize {
37        self.line
38    }
39
40    /// Get current column number
41    pub fn column(&self) -> usize {
42        self.column
43    }
44
45    /// Check if whitespace was skipped before the last token
46    pub fn had_whitespace(&self) -> bool {
47        self.had_whitespace_before_token
48    }
49
50    /// Read the next character and advance position
51    fn read_char(&mut self) {
52        if self.read_position >= self.input.len() {
53            self.ch = '\0'; // EOF
54        } else {
55            self.ch = self.input[self.read_position];
56        }
57
58        // Update line and column tracking
59        if self.ch == '\n' {
60            self.line += 1;
61            self.column = 0;
62        } else {
63            self.column += 1;
64        }
65
66        self.position = self.read_position;
67        self.read_position += 1;
68    }
69
70    /// Peek at the next character without advancing
71    fn peek_char(&self) -> char {
72        if self.read_position >= self.input.len() {
73            '\0'
74        } else {
75            self.input[self.read_position]
76        }
77    }
78
79    /// Get the next token
80    pub fn next_token(&mut self) -> Token {
81        let had_ws = self.skip_whitespace();
82        self.had_whitespace_before_token = had_ws;
83
84        let token = match self.ch {
85            // Operators
86            '+' => Token::Plus,
87            '-' => {
88                if self.peek_char() == '>' {
89                    self.read_char();
90                    Token::Arrow
91                } else {
92                    Token::Minus
93                }
94            }
95            '*' => Token::Multiply,
96            '/' => {
97                // Check for comments
98                if self.peek_char() == '/' {
99                    self.skip_line_comment();
100                    return self.next_token();
101                } else if self.peek_char() == '*' {
102                    self.skip_block_comment();
103                    return self.next_token();
104                } else {
105                    Token::Divide
106                }
107            }
108            '%' => Token::Modulo,
109
110            // Comparison and logical
111            '=' => {
112                if self.peek_char() == '=' {
113                    self.read_char();
114                    Token::Equal
115                } else {
116                    Token::Assign
117                }
118            }
119            '!' => {
120                if self.peek_char() == '=' {
121                    self.read_char();
122                    Token::NotEqual
123                } else {
124                    Token::Not
125                }
126            }
127            '<' => {
128                if self.peek_char() == '=' {
129                    self.read_char();
130                    Token::LessEqual
131                } else {
132                    Token::Less
133                }
134            }
135            '>' => {
136                if self.peek_char() == '=' {
137                    self.read_char();
138                    Token::GreaterEqual
139                } else {
140                    Token::Greater
141                }
142            }
143            '&' => {
144                if self.peek_char() == '&' {
145                    self.read_char();
146                    Token::And
147                } else {
148                    Token::Illegal('&')
149                }
150            }
151            '|' => {
152                if self.peek_char() == '|' {
153                    self.read_char();
154                    Token::Or
155                } else {
156                    Token::Illegal('|')
157                }
158            }
159
160            // Delimiters
161            '(' => Token::LeftParen,
162            ')' => Token::RightParen,
163            '{' => Token::LeftBrace,
164            '}' => Token::RightBrace,
165            '[' => Token::LeftBracket,
166            ']' => Token::RightBracket,
167            ',' => Token::Comma,
168            ':' => Token::Colon,
169            ';' => Token::Semicolon,
170
171            // String literals
172            '"' => return self.read_string(),
173
174            // Newline (statement separator)
175            '\n' => Token::Newline,
176
177            // EOF
178            '\0' => Token::EOF,
179
180            // Identifiers, keywords, and numbers
181            _ => {
182                if self.ch.is_alphabetic() || self.ch == '_' {
183                    return self.read_identifier();
184                } else if self.ch.is_numeric() {
185                    return self.read_number();
186                } else {
187                    Token::Illegal(self.ch)
188                }
189            }
190        };
191
192        self.read_char();
193        token
194    }
195
196    /// Skip whitespace (except newlines, which are significant)
197    /// Returns true if any whitespace was skipped
198    fn skip_whitespace(&mut self) -> bool {
199        let mut skipped = false;
200        while self.ch == ' ' || self.ch == '\t' || self.ch == '\r' {
201            skipped = true;
202            self.read_char();
203        }
204        skipped
205    }
206
207    /// Skip single-line comment (// ...)
208    fn skip_line_comment(&mut self) {
209        while self.ch != '\n' && self.ch != '\0' {
210            self.read_char();
211        }
212    }
213
214    /// Skip block comment (/* ... */)
215    fn skip_block_comment(&mut self) {
216        self.read_char(); // skip '/'
217        self.read_char(); // skip '*'
218
219        while !(self.ch == '*' && self.peek_char() == '/') && self.ch != '\0' {
220            if self.ch == '\n' {
221                self.line += 1;
222                self.column = 0;
223            }
224            self.read_char();
225        }
226
227        if self.ch != '\0' {
228            self.read_char(); // skip '*'
229            self.read_char(); // skip '/'
230        }
231    }
232
233    /// Read an identifier or keyword
234    fn read_identifier(&mut self) -> Token {
235        let start = self.position;
236
237        // Aether 标识符: 大写字母、数字、下划线
238        while self.ch.is_alphanumeric() || self.ch == '_' {
239            self.read_char();
240        }
241
242        let ident: String = self.input[start..self.position].iter().collect();
243        Token::lookup_keyword(&ident)
244    }
245
246    /// Read a number (integer or float)
247    fn read_number(&mut self) -> Token {
248        let start = self.position;
249        let mut has_dot = false;
250
251        while self.ch.is_numeric() || (self.ch == '.' && !has_dot) {
252            if self.ch == '.' {
253                // Check if next character is a digit
254                if !self.peek_char().is_numeric() {
255                    break;
256                }
257                has_dot = true;
258            }
259            self.read_char();
260        }
261
262        let num_str: String = self.input[start..self.position].iter().collect();
263
264        // 如果是整数且位数较多(超过15位,接近f64精度极限),作为大整数处理
265        if !has_dot && num_str.len() > 15 {
266            return Token::BigInteger(num_str);
267        }
268
269        match num_str.parse::<f64>() {
270            Ok(num) => Token::Number(num),
271            Err(_) => Token::Illegal('0'), // Invalid number
272        }
273    }
274
275    /// Read a string literal
276    fn read_string(&mut self) -> Token {
277        self.read_char(); // Skip opening quote
278        let start = self.position;
279
280        while self.ch != '"' && self.ch != '\0' {
281            // Handle escape sequences
282            if self.ch == '\\' {
283                self.read_char(); // Skip backslash
284                if self.ch != '\0' {
285                    self.read_char(); // Skip escaped character
286                }
287            } else {
288                if self.ch == '\n' {
289                    self.line += 1;
290                    self.column = 0;
291                }
292                self.read_char();
293            }
294        }
295
296        if self.ch == '\0' {
297            return Token::Illegal('"'); // Unterminated string
298        }
299
300        let string: String = self.input[start..self.position].iter().collect();
301        self.read_char(); // Skip closing quote
302
303        // Process escape sequences
304        Token::String(self.process_escapes(&string))
305    }
306
307    /// Process escape sequences in strings
308    fn process_escapes(&self, s: &str) -> String {
309        let mut result = String::new();
310        let mut chars = s.chars();
311
312        while let Some(ch) = chars.next() {
313            if ch == '\\' {
314                match chars.next() {
315                    Some('n') => result.push('\n'),
316                    Some('t') => result.push('\t'),
317                    Some('r') => result.push('\r'),
318                    Some('\\') => result.push('\\'),
319                    Some('"') => result.push('"'),
320                    Some(c) => {
321                        result.push('\\');
322                        result.push(c);
323                    }
324                    None => result.push('\\'),
325                }
326            } else {
327                result.push(ch);
328            }
329        }
330
331        result
332    }
333}
334
#[cfg(test)]
mod tests {
    use super::*;

    /// Pull one token from `lexer` per entry in `expected` and compare them in order.
    fn expect_tokens(lexer: &mut Lexer, expected: Vec<Token>) {
        for want in expected {
            assert_eq!(lexer.next_token(), want);
        }
    }

    /// Shorthand for an identifier token.
    fn ident(name: &str) -> Token {
        Token::Identifier(name.to_string())
    }

    /// Shorthand for a string-literal token.
    fn text(value: &str) -> Token {
        Token::String(value.to_string())
    }

    /// A keyword, an identifier and an integer, then end of input.
    #[test]
    fn test_basic_tokens() {
        let mut lexer = Lexer::new("Set X 10");
        expect_tokens(
            &mut lexer,
            vec![Token::Set, ident("X"), Token::Number(10.0), Token::EOF],
        );
    }

    /// Every one- and two-character operator, separated by spaces.
    #[test]
    fn test_operators() {
        let mut lexer = Lexer::new("+ - * / % == != < <= > >= && || !");
        expect_tokens(
            &mut lexer,
            vec![
                Token::Plus,
                Token::Minus,
                Token::Multiply,
                Token::Divide,
                Token::Modulo,
                Token::Equal,
                Token::NotEqual,
                Token::Less,
                Token::LessEqual,
                Token::Greater,
                Token::GreaterEqual,
                Token::And,
                Token::Or,
                Token::Not,
                Token::EOF,
            ],
        );
    }

    /// A plain string literal keeps its inner spaces.
    #[test]
    fn test_string_literal() {
        let mut lexer = Lexer::new(r#"Set MSG "Hello World""#);
        expect_tokens(
            &mut lexer,
            vec![Token::Set, ident("MSG"), text("Hello World"), Token::EOF],
        );
    }

    /// `\n` and `\t` escapes are resolved inside string literals.
    #[test]
    fn test_string_with_escapes() {
        let mut lexer = Lexer::new(r#""Hello\nWorld\t!""#);
        expect_tokens(&mut lexer, vec![text("Hello\nWorld\t!")]);
    }

    /// Integers and decimals both come out as `Number`.
    #[test]
    fn test_numbers() {
        let mut lexer = Lexer::new("123 45.67 0.5");
        expect_tokens(
            &mut lexer,
            vec![
                Token::Number(123.0),
                Token::Number(45.67),
                Token::Number(0.5),
                Token::EOF,
            ],
        );
    }

    /// Reserved words map to their dedicated tokens.
    #[test]
    fn test_keywords() {
        let mut lexer = Lexer::new("Set Func If Else While For Return True False Null");
        expect_tokens(
            &mut lexer,
            vec![
                Token::Set,
                Token::Func,
                Token::If,
                Token::Else,
                Token::While,
                Token::For,
                Token::Return,
                Token::Boolean(true),
                Token::Boolean(false),
                Token::Null,
                Token::EOF,
            ],
        );
    }

    /// Upper-case names with underscores are identifiers.
    #[test]
    fn test_identifiers() {
        let mut lexer = Lexer::new("USER_NAME CALCULATE_TOTAL MY_VAR");
        expect_tokens(
            &mut lexer,
            vec![
                ident("USER_NAME"),
                ident("CALCULATE_TOTAL"),
                ident("MY_VAR"),
                Token::EOF,
            ],
        );
    }

    /// Brackets and punctuation.
    #[test]
    fn test_delimiters() {
        let mut lexer = Lexer::new("( ) { } [ ] , : ;");
        expect_tokens(
            &mut lexer,
            vec![
                Token::LeftParen,
                Token::RightParen,
                Token::LeftBrace,
                Token::RightBrace,
                Token::LeftBracket,
                Token::RightBracket,
                Token::Comma,
                Token::Colon,
                Token::Semicolon,
                Token::EOF,
            ],
        );
    }

    /// A `//` comment is dropped but the newline ending it is still emitted.
    #[test]
    fn test_line_comment() {
        let mut lexer = Lexer::new("Set X 10 // This is a comment\nSet Y 20");
        expect_tokens(
            &mut lexer,
            vec![
                Token::Set,
                ident("X"),
                Token::Number(10.0),
                Token::Newline,
                Token::Set,
                ident("Y"),
                Token::Number(20.0),
            ],
        );
    }

    /// A `/* */` comment in the middle of a statement is invisible.
    #[test]
    fn test_block_comment() {
        let mut lexer = Lexer::new("Set X /* block comment */ 10");
        expect_tokens(
            &mut lexer,
            vec![Token::Set, ident("X"), Token::Number(10.0)],
        );
    }

    /// Newlines are tokens and advance the reported line number.
    #[test]
    fn test_newlines() {
        let mut lexer = Lexer::new("Set X 10\nSet Y 20");
        expect_tokens(
            &mut lexer,
            vec![Token::Set, ident("X"), Token::Number(10.0), Token::Newline],
        );
        assert_eq!(lexer.line(), 2);
        expect_tokens(&mut lexer, vec![Token::Set]);
    }

    /// A small multi-line function definition.
    #[test]
    fn test_complex_expression() {
        let input = r#"
            Func ADD (A, B) {
                Return (A + B)
            }
        "#;
        let mut lexer = Lexer::new(input);
        expect_tokens(
            &mut lexer,
            vec![
                Token::Newline,
                Token::Func,
                ident("ADD"),
                Token::LeftParen,
                ident("A"),
                Token::Comma,
                ident("B"),
                Token::RightParen,
                Token::LeftBrace,
                Token::Newline,
                Token::Return,
                Token::LeftParen,
                ident("A"),
                Token::Plus,
                ident("B"),
                Token::RightParen,
                Token::Newline,
                Token::RightBrace,
            ],
        );
    }
}