aether/
lexer.rs

// src/lexer.rs
//! Lexer for the Aether language
//!
//! Converts source code into a stream of tokens.
5
6use crate::token::Token;
7
8/// Lexer state
9pub struct Lexer {
10    input: Vec<char>,
11    position: usize,      // current position in input (points to current char)
12    read_position: usize, // current reading position in input (after current char)
13    ch: char,             // current char under examination
14    line: usize,          // current line number (for error reporting)
15    column: usize,        // current column number (for error reporting)
16    had_whitespace_before_token: bool, // whether whitespace was skipped before current token
17}
18
19impl Lexer {
20    /// Create a new lexer from input string
21    pub fn new(input: &str) -> Self {
22        let mut lexer = Lexer {
23            input: input.chars().collect(),
24            position: 0,
25            read_position: 0,
26            ch: '\0',
27            line: 1,
28            column: 0,
29            had_whitespace_before_token: false,
30        };
31        lexer.read_char(); // Initialize by reading the first character
32        lexer
33    }
34
35    /// Get current line number
36    pub fn line(&self) -> usize {
37        self.line
38    }
39
40    /// Get current column number
41    pub fn column(&self) -> usize {
42        self.column
43    }
44
45    /// Check if whitespace was skipped before the last token
46    pub fn had_whitespace(&self) -> bool {
47        self.had_whitespace_before_token
48    }
49
50    /// Read the next character and advance position
51    fn read_char(&mut self) {
52        if self.read_position >= self.input.len() {
53            self.ch = '\0'; // EOF
54        } else {
55            self.ch = self.input[self.read_position];
56        }
57
58        // Update line and column tracking
59        if self.ch == '\n' {
60            self.line += 1;
61            self.column = 0;
62        } else {
63            self.column += 1;
64        }
65
66        self.position = self.read_position;
67        self.read_position += 1;
68    }
69
70    /// Peek at the next character without advancing
71    fn peek_char(&self) -> char {
72        if self.read_position >= self.input.len() {
73            '\0'
74        } else {
75            self.input[self.read_position]
76        }
77    }
78
79    /// Peek at the character n positions ahead without advancing
80    fn peek_char_n(&self, n: usize) -> char {
81        let pos = self.position + n;
82        if pos >= self.input.len() {
83            '\0'
84        } else {
85            self.input[pos]
86        }
87    }
88
89    /// Get the next token
90    pub fn next_token(&mut self) -> Token {
91        let had_ws = self.skip_whitespace();
92        self.had_whitespace_before_token = had_ws;
93
94        let token = match self.ch {
95            // Operators
96            '+' => Token::Plus,
97            '-' => {
98                if self.peek_char() == '>' {
99                    self.read_char();
100                    Token::Arrow
101                } else {
102                    Token::Minus
103                }
104            }
105            '*' => Token::Multiply,
106            '/' => {
107                // Check for comments
108                if self.peek_char() == '/' {
109                    self.skip_line_comment();
110                    return self.next_token();
111                } else if self.peek_char() == '*' {
112                    self.skip_block_comment();
113                    return self.next_token();
114                } else {
115                    Token::Divide
116                }
117            }
118            '%' => Token::Modulo,
119
120            // Comparison and logical
121            '=' => {
122                if self.peek_char() == '=' {
123                    self.read_char();
124                    Token::Equal
125                } else {
126                    Token::Assign
127                }
128            }
129            '!' => {
130                if self.peek_char() == '=' {
131                    self.read_char();
132                    Token::NotEqual
133                } else {
134                    Token::Not
135                }
136            }
137            '<' => {
138                if self.peek_char() == '=' {
139                    self.read_char();
140                    Token::LessEqual
141                } else {
142                    Token::Less
143                }
144            }
145            '>' => {
146                if self.peek_char() == '=' {
147                    self.read_char();
148                    Token::GreaterEqual
149                } else {
150                    Token::Greater
151                }
152            }
153            '&' => {
154                if self.peek_char() == '&' {
155                    self.read_char();
156                    Token::And
157                } else {
158                    Token::Illegal('&')
159                }
160            }
161            '|' => {
162                if self.peek_char() == '|' {
163                    self.read_char();
164                    Token::Or
165                } else {
166                    Token::Illegal('|')
167                }
168            }
169
170            // Delimiters
171            '(' => Token::LeftParen,
172            ')' => Token::RightParen,
173            '{' => Token::LeftBrace,
174            '}' => Token::RightBrace,
175            '[' => Token::LeftBracket,
176            ']' => Token::RightBracket,
177            ',' => Token::Comma,
178            ':' => Token::Colon,
179            ';' => Token::Semicolon,
180
181            // String literals
182            '"' => {
183                // Check if it's a multiline string (""")
184                if self.peek_char() == '"' && self.peek_char_n(2) == '"' {
185                    return self.read_multiline_string();
186                } else {
187                    return self.read_string();
188                }
189            }
190
191            // Newline (statement separator)
192            '\n' => Token::Newline,
193
194            // EOF
195            '\0' => Token::EOF,
196
197            // Identifiers, keywords, and numbers
198            _ => {
199                if self.ch.is_alphabetic() || self.ch == '_' {
200                    return self.read_identifier();
201                } else if self.ch.is_numeric() {
202                    return self.read_number();
203                } else {
204                    Token::Illegal(self.ch)
205                }
206            }
207        };
208
209        self.read_char();
210        token
211    }
212
213    /// Skip whitespace (except newlines, which are significant)
214    /// Returns true if any whitespace was skipped
215    fn skip_whitespace(&mut self) -> bool {
216        let mut skipped = false;
217        while self.ch == ' ' || self.ch == '\t' || self.ch == '\r' {
218            skipped = true;
219            self.read_char();
220        }
221        skipped
222    }
223
224    /// Skip single-line comment (// ...)
225    fn skip_line_comment(&mut self) {
226        while self.ch != '\n' && self.ch != '\0' {
227            self.read_char();
228        }
229    }
230
231    /// Skip block comment (/* ... */)
232    fn skip_block_comment(&mut self) {
233        self.read_char(); // skip '/'
234        self.read_char(); // skip '*'
235
236        while !(self.ch == '*' && self.peek_char() == '/') && self.ch != '\0' {
237            if self.ch == '\n' {
238                self.line += 1;
239                self.column = 0;
240            }
241            self.read_char();
242        }
243
244        if self.ch != '\0' {
245            self.read_char(); // skip '*'
246            self.read_char(); // skip '/'
247        }
248    }
249
250    /// Read an identifier or keyword
251    fn read_identifier(&mut self) -> Token {
252        let start = self.position;
253
254        // Aether 标识符: 大写字母、数字、下划线
255        while self.ch.is_alphanumeric() || self.ch == '_' {
256            self.read_char();
257        }
258
259        let ident: String = self.input[start..self.position].iter().collect();
260        Token::lookup_keyword(&ident)
261    }
262
263    /// Read a number (integer or float)
264    fn read_number(&mut self) -> Token {
265        let start = self.position;
266        let mut has_dot = false;
267
268        while self.ch.is_numeric() || (self.ch == '.' && !has_dot) {
269            if self.ch == '.' {
270                // Check if next character is a digit
271                if !self.peek_char().is_numeric() {
272                    break;
273                }
274                has_dot = true;
275            }
276            self.read_char();
277        }
278
279        let num_str: String = self.input[start..self.position].iter().collect();
280
281        // 如果是整数且位数较多(超过15位,接近f64精度极限),作为大整数处理
282        if !has_dot && num_str.len() > 15 {
283            return Token::BigInteger(num_str);
284        }
285
286        match num_str.parse::<f64>() {
287            Ok(num) => Token::Number(num),
288            Err(_) => Token::Illegal('0'), // Invalid number
289        }
290    }
291
292    /// Read a string literal
293    fn read_string(&mut self) -> Token {
294        self.read_char(); // Skip opening quote
295        let start = self.position;
296
297        while self.ch != '"' && self.ch != '\0' {
298            // Handle escape sequences
299            if self.ch == '\\' {
300                self.read_char(); // Skip backslash
301                if self.ch != '\0' {
302                    self.read_char(); // Skip escaped character
303                }
304            } else {
305                if self.ch == '\n' {
306                    self.line += 1;
307                    self.column = 0;
308                }
309                self.read_char();
310            }
311        }
312
313        if self.ch == '\0' {
314            return Token::Illegal('"'); // Unterminated string
315        }
316
317        let string: String = self.input[start..self.position].iter().collect();
318        self.read_char(); // Skip closing quote
319
320        // Process escape sequences
321        Token::String(self.process_escapes(&string))
322    }
323
324    /// Read a multiline string literal (""" ... """)
325    fn read_multiline_string(&mut self) -> Token {
326        // Skip the opening """
327        self.read_char(); // Skip first "
328        self.read_char(); // Skip second "
329        self.read_char(); // Skip third "
330
331        let start = self.position;
332
333        // Read until we find closing """
334        loop {
335            if self.ch == '\0' {
336                return Token::Illegal('"'); // Unterminated multiline string
337            }
338
339            // Check if we found closing """
340            if self.ch == '"' && self.peek_char() == '"' && self.peek_char_n(2) == '"' {
341                let string: String = self.input[start..self.position].iter().collect();
342
343                // Skip the closing """
344                self.read_char(); // Skip first "
345                self.read_char(); // Skip second "
346                self.read_char(); // Skip third "
347
348                // Process escape sequences
349                return Token::String(self.process_escapes(&string));
350            }
351
352            // Handle newlines for line tracking
353            if self.ch == '\n' {
354                self.line += 1;
355                self.column = 0;
356            }
357
358            self.read_char();
359        }
360    }
361
362    /// Process escape sequences in strings
363    fn process_escapes(&self, s: &str) -> String {
364        let mut result = String::new();
365        let mut chars = s.chars().peekable();
366
367        while let Some(ch) = chars.next() {
368            if ch == '\\' {
369                match chars.next() {
370                    Some('n') => result.push('\n'),
371                    Some('t') => result.push('\t'),
372                    Some('r') => result.push('\r'),
373                    Some('\\') => result.push('\\'),
374                    Some('"') => result.push('"'),
375                    Some('u') => {
376                        // Handle \uXXXX Unicode escape sequences
377                        let mut hex = String::new();
378                        for _ in 0..4 {
379                            if let Some(c) = chars.next() {
380                                hex.push(c);
381                            } else {
382                                // Invalid escape sequence, keep as is
383                                result.push_str("\\u");
384                                result.push_str(&hex);
385                                break;
386                            }
387                        }
388                        if hex.len() == 4 {
389                            if let Ok(code) = u32::from_str_radix(&hex, 16) {
390                                if let Some(unicode_char) = char::from_u32(code) {
391                                    result.push(unicode_char);
392                                } else {
393                                    // Invalid Unicode code point, keep as is
394                                    result.push_str("\\u");
395                                    result.push_str(&hex);
396                                }
397                            } else {
398                                // Invalid hex, keep as is
399                                result.push_str("\\u");
400                                result.push_str(&hex);
401                            }
402                        }
403                    }
404                    Some(c) => {
405                        result.push('\\');
406                        result.push(c);
407                    }
408                    None => result.push('\\'),
409                }
410            } else {
411                result.push(ch);
412            }
413        }
414
415        result
416    }
417}