aether/
lexer.rs

// src/lexer.rs
//! Lexer for the Aether language
//!
//! Converts source code into a stream of tokens
5
6use crate::token::Token;
7
8/// Lexer state
9pub struct Lexer {
10    input: Vec<char>,
11    position: usize,      // current position in input (points to current char)
12    read_position: usize, // current reading position in input (after current char)
13    ch: char,             // current char under examination
14    line: usize,          // current line number (for error reporting)
15    column: usize,        // current column number (for error reporting)
16    had_whitespace_before_token: bool, // whether whitespace was skipped before current token
17}
18
19impl Lexer {
20    /// Create a new lexer from input string
21    pub fn new(input: &str) -> Self {
22        let mut lexer = Lexer {
23            input: input.chars().collect(),
24            position: 0,
25            read_position: 0,
26            ch: '\0',
27            line: 1,
28            column: 0,
29            had_whitespace_before_token: false,
30        };
31        lexer.read_char(); // Initialize by reading the first character
32        lexer
33    }
34
35    /// Get current line number
36    pub fn line(&self) -> usize {
37        self.line
38    }
39
40    /// Get current column number
41    pub fn column(&self) -> usize {
42        self.column
43    }
44
45    /// Check if whitespace was skipped before the last token
46    pub fn had_whitespace(&self) -> bool {
47        self.had_whitespace_before_token
48    }
49
50    /// Read the next character and advance position
51    fn read_char(&mut self) {
52        if self.read_position >= self.input.len() {
53            self.ch = '\0'; // EOF
54        } else {
55            self.ch = self.input[self.read_position];
56        }
57
58        // Update line and column tracking
59        if self.ch == '\n' {
60            self.line += 1;
61            self.column = 0;
62        } else {
63            self.column += 1;
64        }
65
66        self.position = self.read_position;
67        self.read_position += 1;
68    }
69
70    /// Peek at the next character without advancing
71    fn peek_char(&self) -> char {
72        if self.read_position >= self.input.len() {
73            '\0'
74        } else {
75            self.input[self.read_position]
76        }
77    }
78
79    /// Peek at the character n positions ahead without advancing
80    fn peek_char_n(&self, n: usize) -> char {
81        let pos = self.position + n;
82        if pos >= self.input.len() {
83            '\0'
84        } else {
85            self.input[pos]
86        }
87    }
88
89    /// Get the next token
90    pub fn next_token(&mut self) -> Token {
91        let had_ws = self.skip_whitespace();
92        self.had_whitespace_before_token = had_ws;
93
94        let token = match self.ch {
95            // Operators
96            '+' => Token::Plus,
97            '-' => {
98                if self.peek_char() == '>' {
99                    self.read_char();
100                    Token::Arrow
101                } else {
102                    Token::Minus
103                }
104            }
105            '*' => Token::Multiply,
106            '/' => {
107                // Check for comments
108                if self.peek_char() == '/' {
109                    self.skip_line_comment();
110                    return self.next_token();
111                } else if self.peek_char() == '*' {
112                    self.skip_block_comment();
113                    return self.next_token();
114                } else {
115                    Token::Divide
116                }
117            }
118            '%' => Token::Modulo,
119
120            // Comparison and logical
121            '=' => {
122                if self.peek_char() == '=' {
123                    self.read_char();
124                    Token::Equal
125                } else {
126                    Token::Assign
127                }
128            }
129            '!' => {
130                if self.peek_char() == '=' {
131                    self.read_char();
132                    Token::NotEqual
133                } else {
134                    Token::Not
135                }
136            }
137            '<' => {
138                if self.peek_char() == '=' {
139                    self.read_char();
140                    Token::LessEqual
141                } else {
142                    Token::Less
143                }
144            }
145            '>' => {
146                if self.peek_char() == '=' {
147                    self.read_char();
148                    Token::GreaterEqual
149                } else {
150                    Token::Greater
151                }
152            }
153            '&' => {
154                if self.peek_char() == '&' {
155                    self.read_char();
156                    Token::And
157                } else {
158                    Token::Illegal('&')
159                }
160            }
161            '|' => {
162                if self.peek_char() == '|' {
163                    self.read_char();
164                    Token::Or
165                } else {
166                    Token::Illegal('|')
167                }
168            }
169
170            // Delimiters
171            '(' => Token::LeftParen,
172            ')' => Token::RightParen,
173            '{' => Token::LeftBrace,
174            '}' => Token::RightBrace,
175            '[' => Token::LeftBracket,
176            ']' => Token::RightBracket,
177            ',' => Token::Comma,
178            ':' => Token::Colon,
179            ';' => Token::Semicolon,
180
181            // String literals
182            '"' => {
183                // Check if it's a multiline string (""")
184                if self.peek_char() == '"' && self.peek_char_n(2) == '"' {
185                    return self.read_multiline_string();
186                } else {
187                    return self.read_string();
188                }
189            }
190
191            // Newline (statement separator)
192            '\n' => Token::Newline,
193
194            // EOF
195            '\0' => Token::EOF,
196
197            // Identifiers, keywords, and numbers
198            _ => {
199                if self.ch.is_alphabetic() || self.ch == '_' {
200                    return self.read_identifier();
201                } else if self.ch.is_numeric() {
202                    return self.read_number();
203                } else {
204                    Token::Illegal(self.ch)
205                }
206            }
207        };
208
209        self.read_char();
210        token
211    }
212
213    /// Skip whitespace (except newlines, which are significant)
214    /// Returns true if any whitespace was skipped
215    fn skip_whitespace(&mut self) -> bool {
216        let mut skipped = false;
217        while self.ch == ' ' || self.ch == '\t' || self.ch == '\r' {
218            skipped = true;
219            self.read_char();
220        }
221        skipped
222    }
223
224    /// Skip single-line comment (// ...)
225    fn skip_line_comment(&mut self) {
226        while self.ch != '\n' && self.ch != '\0' {
227            self.read_char();
228        }
229    }
230
231    /// Skip block comment (/* ... */)
232    fn skip_block_comment(&mut self) {
233        self.read_char(); // skip '/'
234        self.read_char(); // skip '*'
235
236        while !(self.ch == '*' && self.peek_char() == '/') && self.ch != '\0' {
237            if self.ch == '\n' {
238                self.line += 1;
239                self.column = 0;
240            }
241            self.read_char();
242        }
243
244        if self.ch != '\0' {
245            self.read_char(); // skip '*'
246            self.read_char(); // skip '/'
247        }
248    }
249
250    /// Read an identifier or keyword
251    fn read_identifier(&mut self) -> Token {
252        let start = self.position;
253
254        // Aether 标识符: 大写字母、数字、下划线
255        while self.ch.is_alphanumeric() || self.ch == '_' {
256            self.read_char();
257        }
258
259        let ident: String = self.input[start..self.position].iter().collect();
260        Token::lookup_keyword(&ident)
261    }
262
263    /// Read a number (integer or float)
264    fn read_number(&mut self) -> Token {
265        let start = self.position;
266        let mut has_dot = false;
267
268        while self.ch.is_numeric() || (self.ch == '.' && !has_dot) {
269            if self.ch == '.' {
270                // Check if next character is a digit
271                if !self.peek_char().is_numeric() {
272                    break;
273                }
274                has_dot = true;
275            }
276            self.read_char();
277        }
278
279        let num_str: String = self.input[start..self.position].iter().collect();
280
281        // 如果是整数且位数较多(超过15位,接近f64精度极限),作为大整数处理
282        if !has_dot && num_str.len() > 15 {
283            return Token::BigInteger(num_str);
284        }
285
286        match num_str.parse::<f64>() {
287            Ok(num) => Token::Number(num),
288            Err(_) => Token::Illegal('0'), // Invalid number
289        }
290    }
291
292    /// Read a string literal
293    fn read_string(&mut self) -> Token {
294        self.read_char(); // Skip opening quote
295        let start = self.position;
296
297        while self.ch != '"' && self.ch != '\0' {
298            // Handle escape sequences
299            if self.ch == '\\' {
300                self.read_char(); // Skip backslash
301                if self.ch != '\0' {
302                    self.read_char(); // Skip escaped character
303                }
304            } else {
305                if self.ch == '\n' {
306                    self.line += 1;
307                    self.column = 0;
308                }
309                self.read_char();
310            }
311        }
312
313        if self.ch == '\0' {
314            return Token::Illegal('"'); // Unterminated string
315        }
316
317        let string: String = self.input[start..self.position].iter().collect();
318        self.read_char(); // Skip closing quote
319
320        // Process escape sequences
321        Token::String(self.process_escapes(&string))
322    }
323
324    /// Read a multiline string literal (""" ... """)
325    fn read_multiline_string(&mut self) -> Token {
326        // Skip the opening """
327        self.read_char(); // Skip first "
328        self.read_char(); // Skip second "
329        self.read_char(); // Skip third "
330
331        let start = self.position;
332
333        // Read until we find closing """
334        loop {
335            if self.ch == '\0' {
336                return Token::Illegal('"'); // Unterminated multiline string
337            }
338
339            // Check if we found closing """
340            if self.ch == '"' && self.peek_char() == '"' && self.peek_char_n(2) == '"' {
341                let string: String = self.input[start..self.position].iter().collect();
342
343                // Skip the closing """
344                self.read_char(); // Skip first "
345                self.read_char(); // Skip second "
346                self.read_char(); // Skip third "
347
348                // Process escape sequences
349                return Token::String(self.process_escapes(&string));
350            }
351
352            // Handle newlines for line tracking
353            if self.ch == '\n' {
354                self.line += 1;
355                self.column = 0;
356            }
357
358            self.read_char();
359        }
360    }
361
362    /// Process escape sequences in strings
363    fn process_escapes(&self, s: &str) -> String {
364        let mut result = String::new();
365        let mut chars = s.chars();
366
367        while let Some(ch) = chars.next() {
368            if ch == '\\' {
369                match chars.next() {
370                    Some('n') => result.push('\n'),
371                    Some('t') => result.push('\t'),
372                    Some('r') => result.push('\r'),
373                    Some('\\') => result.push('\\'),
374                    Some('"') => result.push('"'),
375                    Some(c) => {
376                        result.push('\\');
377                        result.push(c);
378                    }
379                    None => result.push('\\'),
380                }
381            } else {
382                result.push(ch);
383            }
384        }
385
386        result
387    }
388}
389
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for an identifier token with the given name.
    fn ident(name: &str) -> Token {
        Token::Identifier(name.to_string())
    }

    /// Pulls one token per entry of `expected` from `lexer` and asserts that
    /// each matches, in order. Tokens after the listed ones are not checked.
    fn expect_tokens(lexer: &mut Lexer, expected: &[Token]) {
        for want in expected {
            assert_eq!(&lexer.next_token(), want);
        }
    }

    #[test]
    fn test_basic_tokens() {
        let mut lexer = Lexer::new("Set X 10");

        expect_tokens(
            &mut lexer,
            &[Token::Set, ident("X"), Token::Number(10.0), Token::EOF],
        );
    }

    #[test]
    fn test_operators() {
        let mut lexer = Lexer::new("+ - * / % == != < <= > >= && || !");

        expect_tokens(
            &mut lexer,
            &[
                Token::Plus,
                Token::Minus,
                Token::Multiply,
                Token::Divide,
                Token::Modulo,
                Token::Equal,
                Token::NotEqual,
                Token::Less,
                Token::LessEqual,
                Token::Greater,
                Token::GreaterEqual,
                Token::And,
                Token::Or,
                Token::Not,
                Token::EOF,
            ],
        );
    }

    #[test]
    fn test_string_literal() {
        let mut lexer = Lexer::new(r#"Set MSG "Hello World""#);

        expect_tokens(
            &mut lexer,
            &[
                Token::Set,
                ident("MSG"),
                Token::String("Hello World".to_string()),
                Token::EOF,
            ],
        );
    }

    #[test]
    fn test_string_with_escapes() {
        let mut lexer = Lexer::new(r#""Hello\nWorld\t!""#);

        expect_tokens(
            &mut lexer,
            &[Token::String("Hello\nWorld\t!".to_string())],
        );
    }

    #[test]
    fn test_numbers() {
        let mut lexer = Lexer::new("123 45.67 0.5");

        expect_tokens(
            &mut lexer,
            &[
                Token::Number(123.0),
                Token::Number(45.67),
                Token::Number(0.5),
                Token::EOF,
            ],
        );
    }

    #[test]
    fn test_keywords() {
        let mut lexer = Lexer::new("Set Func If Else While For Return True False Null");

        expect_tokens(
            &mut lexer,
            &[
                Token::Set,
                Token::Func,
                Token::If,
                Token::Else,
                Token::While,
                Token::For,
                Token::Return,
                Token::Boolean(true),
                Token::Boolean(false),
                Token::Null,
                Token::EOF,
            ],
        );
    }

    #[test]
    fn test_identifiers() {
        let mut lexer = Lexer::new("USER_NAME CALCULATE_TOTAL MY_VAR");

        expect_tokens(
            &mut lexer,
            &[
                ident("USER_NAME"),
                ident("CALCULATE_TOTAL"),
                ident("MY_VAR"),
                Token::EOF,
            ],
        );
    }

    #[test]
    fn test_delimiters() {
        let mut lexer = Lexer::new("( ) { } [ ] , : ;");

        expect_tokens(
            &mut lexer,
            &[
                Token::LeftParen,
                Token::RightParen,
                Token::LeftBrace,
                Token::RightBrace,
                Token::LeftBracket,
                Token::RightBracket,
                Token::Comma,
                Token::Colon,
                Token::Semicolon,
                Token::EOF,
            ],
        );
    }

    #[test]
    fn test_line_comment() {
        let mut lexer = Lexer::new("Set X 10 // This is a comment\nSet Y 20");

        // The comment disappears; the newline that ends it is still a token.
        expect_tokens(
            &mut lexer,
            &[
                Token::Set,
                ident("X"),
                Token::Number(10.0),
                Token::Newline,
                Token::Set,
                ident("Y"),
                Token::Number(20.0),
            ],
        );
    }

    #[test]
    fn test_block_comment() {
        let mut lexer = Lexer::new("Set X /* block comment */ 10");

        expect_tokens(&mut lexer, &[Token::Set, ident("X"), Token::Number(10.0)]);
    }

    #[test]
    fn test_newlines() {
        let mut lexer = Lexer::new("Set X 10\nSet Y 20");

        expect_tokens(
            &mut lexer,
            &[Token::Set, ident("X"), Token::Number(10.0), Token::Newline],
        );
        // Once the newline token has been produced the lexer is on line 2.
        assert_eq!(lexer.line(), 2);
        expect_tokens(&mut lexer, &[Token::Set]);
    }

    #[test]
    fn test_complex_expression() {
        let input = r#"
            Func ADD (A, B) {
                Return (A + B)
            }
        "#;
        let mut lexer = Lexer::new(input);

        expect_tokens(
            &mut lexer,
            &[
                Token::Newline,
                Token::Func,
                ident("ADD"),
                Token::LeftParen,
                ident("A"),
                Token::Comma,
                ident("B"),
                Token::RightParen,
                Token::LeftBrace,
                Token::Newline,
                Token::Return,
                Token::LeftParen,
                ident("A"),
                Token::Plus,
                ident("B"),
                Token::RightParen,
                Token::Newline,
                Token::RightBrace,
            ],
        );
    }
}