TypeScript_Rust_Compiler/
lexer_utf8.rs

1//! UTF-8 compatible lexer for TypeScript
2
3use crate::error::{CompilerError, Result};
4use crate::lexer::{Token, Keyword};
5
/// UTF-8 compatible lexer.
///
/// The input is held as a vector of `char`s, so `position` indexes Unicode
/// scalar values rather than bytes and multi-byte characters are never split.
#[derive(Debug, Clone)]
pub struct Utf8Lexer {
    /// The whole input, decoded into Unicode scalar values.
    chars: Vec<char>,
    /// Index into `chars` of the character currently being examined.
    position: usize,
    /// 1-based line of the current character (used for error reporting).
    line: usize,
    /// 1-based column of the current character (used for error reporting).
    column: usize,
}
13
14impl Utf8Lexer {
15    /// Create a new UTF-8 lexer
16    pub fn new(input: String) -> Self {
17        Self {
18            chars: input.chars().collect(),
19            position: 0,
20            line: 1,
21            column: 1,
22        }
23    }
24
25    /// Tokenize the input string
26    pub fn tokenize(&mut self) -> Result<Vec<Token>> {
27        let mut tokens = Vec::new();
28
29        while self.position < self.chars.len() {
30            match self.next_token()? {
31                Some(token) => {
32                    tokens.push(token);
33                }
34                None => break,
35            }
36        }
37
38        tokens.push(Token::EOF);
39        Ok(tokens)
40    }
41
42    /// Get the next token
43    fn next_token(&mut self) -> Result<Option<Token>> {
44        self.skip_whitespace();
45
46        if self.position >= self.chars.len() {
47            return Ok(None);
48        }
49
50        let ch = self.current_char();
51        let token = match ch {
52            '+' => Ok(Some(Token::Plus)),
53            '-' => Ok(Some(Token::Minus)),
54            '*' => Ok(Some(Token::Multiply)),
55            '/' => {
56                // Check if this is the start of a regular expression
57                if self.peek_char() == Some('^') {
58                    Ok(self.parse_regex()?)
59                } else {
60                    Ok(Some(Token::Divide))
61                }
62            },
63            '%' => Ok(Some(Token::Modulo)),
64            '=' => {
65                if self.peek_char() == Some('=') {
66                    self.advance();
67                    if self.peek_char() == Some('=') {
68                        self.advance();
69                        Ok(Some(Token::StrictEqual))
70                    } else {
71                        Ok(Some(Token::Equal))
72                    }
73                } else if self.peek_char() == Some('>') {
74                    self.advance();
75                    Ok(Some(Token::Arrow))
76                } else {
77                    Ok(Some(Token::Assign))
78                }
79            }
80            '!' => {
81                if self.peek_char() == Some('=') {
82                    self.advance();
83                    if self.peek_char() == Some('=') {
84                        self.advance();
85                        Ok(Some(Token::StrictNotEqual))
86                    } else {
87                        Ok(Some(Token::NotEqual))
88                    }
89                } else {
90                    Ok(Some(Token::Not))
91                }
92            }
93            '<' => {
94                if self.peek_char() == Some('=') {
95                    self.advance();
96                    Ok(Some(Token::LessEqual))
97                } else {
98                    Ok(Some(Token::LessThan))
99                }
100            }
101            '>' => {
102                if self.peek_char() == Some('=') {
103                    self.advance();
104                    Ok(Some(Token::GreaterEqual))
105                } else {
106                    Ok(Some(Token::GreaterThan))
107                }
108            }
109            '&' => {
110                if self.peek_char() == Some('&') {
111                    self.advance();
112                    Ok(Some(Token::And))
113                } else {
114                    Ok(Some(Token::Intersection))
115                }
116            }
117            '|' => {
118                if self.peek_char() == Some('|') {
119                    self.advance();
120                    Ok(Some(Token::Or))
121                } else {
122                    Ok(Some(Token::Union))
123                }
124            }
125            '(' => Ok(Some(Token::LeftParen)),
126            ')' => Ok(Some(Token::RightParen)),
127            '{' => Ok(Some(Token::LeftBrace)),
128            '}' => Ok(Some(Token::RightBrace)),
129            '[' => Ok(Some(Token::LeftBracket)),
130            ']' => Ok(Some(Token::RightBracket)),
131            ';' => Ok(Some(Token::Semicolon)),
132            ',' => Ok(Some(Token::Comma)),
133            '.' => Ok(Some(Token::Dot)),
134            ':' => Ok(Some(Token::Colon)),
135            '?' => Ok(Some(Token::QuestionMark)),
136            '@' => Ok(Some(Token::At)), // Add support for @ decorator symbol
137            '"' | '\'' => Ok(self.parse_string()?),
138            '`' => Ok(self.parse_template_literal()?),
139            '0'..='9' => Ok(self.parse_number()?),
140            _ if ch.is_alphabetic() || ch == '_' || ch == '$' => Ok(self.parse_identifier_or_keyword()?),
141            _ => {
142                return Err(CompilerError::parse_error(
143                    self.line,
144                    self.column,
145                    format!("Unexpected character: {}", ch),
146                ));
147            }
148        };
149
150        // Only advance for simple tokens that don't manage position themselves
151        match ch {
152            '0'..='9' => {
153                // parse_number manages position itself
154            }
155            '"' | '\'' => {
156                // parse_string manages position itself
157            }
158            _ if ch.is_alphabetic() || ch == '_' || ch == '$' => {
159                // parse_identifier_or_keyword manages position itself
160            }
161            _ => {
162                // Simple tokens need to advance
163                self.advance();
164            }
165        }
166        token
167    }
168
169    /// Get current character
170    fn current_char(&self) -> char {
171        self.chars.get(self.position).copied().unwrap_or('\0')
172    }
173
174    /// Peek at next character
175    fn peek_char(&self) -> Option<char> {
176        self.chars.get(self.position + 1).copied()
177    }
178
179    /// Advance position
180    fn advance(&mut self) {
181        if self.position < self.chars.len() {
182            let ch = self.current_char();
183            if ch == '\n' {
184                self.line += 1;
185                self.column = 1;
186            } else {
187                self.column += 1;
188            }
189            self.position += 1;
190        }
191    }
192
193    /// Skip whitespace
194    fn skip_whitespace(&mut self) {
195        while self.position < self.chars.len() {
196            let ch = self.current_char();
197            if ch.is_whitespace() {
198                self.advance();
199            } else if ch == '/' && self.peek_char() == Some('/') {
200                // Skip line comment
201                self.advance(); // skip first /
202                self.advance(); // skip second /
203                while self.position < self.chars.len() && self.current_char() != '\n' {
204                    self.advance();
205                }
206            } else if ch == '/' && self.peek_char() == Some('*') {
207                // Skip block comment
208                self.advance(); // skip /
209                self.advance(); // skip *
210                while self.position < self.chars.len() {
211                    if self.current_char() == '*' && self.peek_char() == Some('/') {
212                        self.advance(); // skip *
213                        self.advance(); // skip /
214                        break;
215                    }
216                    self.advance();
217                }
218            } else {
219                break;
220            }
221        }
222    }
223
224    /// Parse string literal
225    fn parse_string(&mut self) -> Result<Option<Token>> {
226        let quote = self.current_char();
227        let mut value = String::new();
228        self.advance();
229
230        while self.position < self.chars.len() {
231            let ch = self.current_char();
232            if ch == quote {
233                self.advance();
234                return Ok(Some(Token::String(value)));
235            } else if ch == '\\' {
236                self.advance();
237                if self.position < self.chars.len() {
238                    let escaped = self.current_char();
239                    value.push(match escaped {
240                        'n' => '\n',
241                        't' => '\t',
242                        'r' => '\r',
243                        '\\' => '\\',
244                        '"' => '"',
245                        '\'' => '\'',
246                        _ => escaped,
247                    });
248                    self.advance();
249                }
250            } else {
251                value.push(ch);
252                self.advance();
253            }
254        }
255
256        Err(CompilerError::parse_error(
257            self.line,
258            self.column,
259            "Unterminated string literal",
260        ))
261    }
262
263    /// Parse template literal
264    fn parse_template_literal(&mut self) -> Result<Option<Token>> {
265        let mut value = String::new();
266        self.advance();
267
268        while self.position < self.chars.len() {
269            let ch = self.current_char();
270            if ch == '`' {
271                self.advance();
272                return Ok(Some(Token::TemplateLiteral(value)));
273            } else if ch == '\\' {
274                self.advance();
275                if self.position < self.chars.len() {
276                    let escaped = self.current_char();
277                    value.push(match escaped {
278                        'n' => '\n',
279                        't' => '\t',
280                        'r' => '\r',
281                        '\\' => '\\',
282                        '`' => '`',
283                        _ => escaped,
284                    });
285                    self.advance();
286                }
287            } else {
288                value.push(ch);
289                self.advance();
290            }
291        }
292
293        Err(CompilerError::parse_error(
294            self.line,
295            self.column,
296            "Unterminated template literal",
297        ))
298    }
299
300    /// Parse number literal
301    fn parse_number(&mut self) -> Result<Option<Token>> {
302        let mut value = String::new();
303
304        while self.position < self.chars.len() {
305            let ch = self.current_char();
306            if ch.is_ascii_digit() || ch == '.' {
307                value.push(ch);
308                self.advance();
309            } else {
310                break;
311            }
312        }
313
314        match value.parse::<f64>() {
315            Ok(num) => Ok(Some(Token::Number(num))),
316            Err(_) => Err(CompilerError::parse_error(
317                self.line,
318                self.column,
319                format!("Invalid number: {}", value),
320            )),
321        }
322    }
323
324    /// Parse identifier or keyword
325    fn parse_identifier_or_keyword(&mut self) -> Result<Option<Token>> {
326        let mut value = String::new();
327
328        while self.position < self.chars.len() {
329            let ch = self.current_char();
330            if ch.is_alphanumeric() || ch == '_' || ch == '$' {
331                value.push(ch);
332                self.advance();
333            } else {
334                break;
335            }
336        }
337
338        // Check if it's a keyword
339        if let Some(keyword) = self.parse_keyword(&value) {
340            Ok(Some(Token::Keyword(keyword)))
341        } else {
342            Ok(Some(Token::Identifier(value)))
343        }
344    }
345
346    /// Parse keyword from string
347    fn parse_keyword(&self, value: &str) -> Option<Keyword> {
348        match value {
349            "let" => Some(Keyword::Let),
350            "const" => Some(Keyword::Const),
351            "var" => Some(Keyword::Var),
352            "function" => Some(Keyword::Function),
353            "class" => Some(Keyword::Class),
354            "interface" => Some(Keyword::Interface),
355            "type" => Some(Keyword::Type),
356            "enum" => Some(Keyword::Enum),
357            "namespace" => Some(Keyword::Namespace),
358            "module" => Some(Keyword::Module),
359            "export" => Some(Keyword::Export),
360            "import" => Some(Keyword::Import),
361            "public" => Some(Keyword::Public),
362            "private" => Some(Keyword::Private),
363            "protected" => Some(Keyword::Protected),
364            "static" => Some(Keyword::Static),
365            "readonly" => Some(Keyword::Readonly),
366            "abstract" => Some(Keyword::Abstract),
367            "async" => Some(Keyword::Async),
368            "await" => Some(Keyword::Await),
369            "extends" => Some(Keyword::Extends),
370            "implements" => Some(Keyword::Implements),
371            "constructor" => Some(Keyword::Constructor),
372            "get" => Some(Keyword::Get),
373            "set" => Some(Keyword::Set),
374            "this" => Some(Keyword::This),
375            "super" => Some(Keyword::Super),
376            "new" => Some(Keyword::New),
377            "return" => Some(Keyword::Return),
378            "if" => Some(Keyword::If),
379            "else" => Some(Keyword::Else),
380            "while" => Some(Keyword::While),
381            "for" => Some(Keyword::For),
382            "do" => Some(Keyword::Do),
383            "break" => Some(Keyword::Break),
384            "continue" => Some(Keyword::Continue),
385            "switch" => Some(Keyword::Switch),
386            "case" => Some(Keyword::Case),
387            "default" => Some(Keyword::Default),
388            "try" => Some(Keyword::Try),
389            "catch" => Some(Keyword::Catch),
390            "finally" => Some(Keyword::Finally),
391            "throw" => Some(Keyword::Throw),
392            "true" => Some(Keyword::True),
393            "false" => Some(Keyword::False),
394            "null" => Some(Keyword::Null),
395            "undefined" => Some(Keyword::Undefined),
396            "void" => Some(Keyword::Void),
397            "never" => Some(Keyword::Never),
398            "any" => Some(Keyword::Any),
399            "unknown" => Some(Keyword::Unknown),
400            "object" => Some(Keyword::Object),
401            "string" => Some(Keyword::String),
402            "number" => Some(Keyword::Number),
403            "boolean" => Some(Keyword::Boolean),
404            "symbol" => Some(Keyword::Symbol),
405            "bigint" => Some(Keyword::BigInt),
406            "typeof" => Some(Keyword::Typeof),
407            _ => None,
408        }
409    }
410
411    /// Parse regular expression literal
412    fn parse_regex(&mut self) -> Result<Option<Token>> {
413        let mut pattern = String::new();
414        let mut flags = String::new();
415
416        self.advance(); // consume '/'
417
418        // Parse pattern until closing '/'
419        while self.position < self.chars.len() {
420            let ch = self.current_char();
421            if ch == '/' {
422                self.advance();
423                break;
424            } else if ch == '\\' {
425                // Handle escape sequences
426                pattern.push(ch);
427                self.advance();
428                if self.position < self.chars.len() {
429                    pattern.push(self.current_char());
430                    self.advance();
431                }
432            } else {
433                pattern.push(ch);
434                self.advance();
435            }
436        }
437
438        // Parse flags
439        while self.position < self.chars.len() {
440            let ch = self.current_char();
441            if ch.is_alphabetic() {
442                flags.push(ch);
443                self.advance();
444            } else {
445                break;
446            }
447        }
448
449        Ok(Some(Token::RegExp(pattern, flags)))
450    }
451}