TypeScript_Rust_Compiler/
lexer.rs

1//! Lexical analysis for TypeScript code
2
3use crate::error::{CompilerError, Result};
4use serde::{Deserialize, Serialize};
5
6/// Token types for TypeScript
7#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
8pub enum Token {
9    // Literals
10    Number(f64),
11    String(String),
12    TemplateLiteral(String),
13    Boolean(bool),
14    Null,
15    Undefined,
16
17    // Identifiers and keywords
18    Identifier(String),
19    Keyword(Keyword),
20
21    // Operators
22    Plus,
23    Minus,
24    Multiply,
25    Divide,
26    Modulo,
27    Equal,
28    NotEqual,
29    StrictEqual,
30    StrictNotEqual,
31    LessThan,
32    GreaterThan,
33    LessEqual,
34    GreaterEqual,
35    And,
36    Or,
37    Not,
38    Assign,
39    Arrow, // =>
40    PlusAssign,
41    MinusAssign,
42    MultiplyAssign,
43    DivideAssign,
44    Union, // |
45    Intersection, // &
46
47    // Delimiters
48    LeftParen,
49    RightParen,
50    LeftBrace,
51    RightBrace,
52    LeftBracket,
53    RightBracket,
54    Semicolon,
55    Comma,
56    Dot,
57    Colon,
58    QuestionMark,
59    At,
60    RegExp(String, String), // pattern, flags
61
62    // Type annotations
63    TypeAnnotation,
64    GenericStart,
65    GenericEnd,
66
67    // Special
68    Newline,
69    Whitespace,
70    Comment(String),
71    EOF,
72}
73
74/// TypeScript keywords
75#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
76pub enum Keyword {
77    // Declarations
78    Let,
79    Const,
80    Var,
81    Function,
82    Class,
83    Interface,
84    Type,
85    Enum,
86    Namespace,
87    Module,
88    Import,
89    Export,
90    From,
91    As,
92    Default,
93
94    // Control flow
95    If,
96    Else,
97    Switch,
98    Case,
99    DefaultCase,
100    For,
101    While,
102    Do,
103    Break,
104    Continue,
105    Return,
106    Throw,
107    Try,
108    Catch,
109    Finally,
110
111    // OOP
112    Extends,
113    Implements,
114    Super,
115    This,
116    New,
117    Static,
118    Public,
119    Private,
120    Protected,
121    Abstract,
122    Readonly,
123    Get,
124    Set,
125    Constructor,
126
127    // Async
128    Async,
129    Await,
130    Promise,
131
132    // Types
133    Any,
134    Unknown,
135    Never,
136    Void,
137    Null,
138    Undefined,
139    Boolean,
140    Number,
141    String,
142    Symbol,
143    BigInt,
144    Object,
145    Array,
146    Tuple,
147    Union,
148    Intersection,
149    Literal,
150    Mapped,
151    Conditional,
152    Template,
153
154    // Utility types
155    Partial,
156    Required,
157    Pick,
158    Omit,
159    Record,
160    Exclude,
161    Extract,
162    NonNullable,
163    Parameters,
164    ReturnType,
165    InstanceType,
166    ThisParameterType,
167    OmitThisParameter,
168    ThisType,
169
170    // Other
171    True,
172    False,
173    In,
174    Of,
175    Instanceof,
176    Typeof,
177    Keyof,
178    Key,
179    Is,
180    Asserts,
181    Infer,
182    Declare,
183    Ambient,
184    Global,
185}
186
/// Lexer for TypeScript code.
///
/// Holds the source text together with a read cursor and the 1-based
/// line/column of that cursor, which are used when reporting errors.
#[derive(Debug, Clone)]
pub struct Lexer {
    /// Complete source text being tokenized.
    input: String,
    /// Current read offset into `input`.
    position: usize,
    /// 1-based line number of the cursor.
    line: usize,
    /// 1-based column number of the cursor.
    column: usize,
}
194
195impl Lexer {
196    /// Create a new lexer
197    pub fn new(input: String) -> Self {
198        Self {
199            input,
200            position: 0,
201            line: 1,
202            column: 1,
203        }
204    }
205    
206    /// Create a UTF-8 compatible lexer
207    pub fn new_utf8(input: String) -> Self {
208        Self::new(input)
209    }
210
211    /// Tokenize the input string
212    pub fn tokenize(&mut self) -> Result<Vec<Token>> {
213        let mut tokens = Vec::new();
214
215        while self.position < self.input.len() {
216            match self.next_token()? {
217                Some(token) => {
218                    tokens.push(token);
219                }
220                None => break,
221            }
222        }
223
224        tokens.push(Token::EOF);
225        Ok(tokens)
226    }
227
228    /// Get the next token
229    fn next_token(&mut self) -> Result<Option<Token>> {
230        self.skip_whitespace();
231
232        if self.position >= self.input.len() {
233            return Ok(None);
234        }
235
236        let ch = self.current_char();
237        let token = match ch {
238            '+' => {
239                if self.peek_char() == Some('=') {
240                    self.advance();
241                    Ok(Some(Token::PlusAssign))
242                } else if self.peek_char() == Some('+') {
243                    self.advance();
244                    Ok(Some(Token::Plus)) // ++ operator
245                } else {
246                    Ok(Some(Token::Plus))
247                }
248            }
249            '-' => {
250                if self.peek_char() == Some('=') {
251                    self.advance();
252                    Ok(Some(Token::MinusAssign))
253                } else if self.peek_char() == Some('>') {
254                    self.advance();
255                    Ok(Some(Token::Arrow))
256                } else {
257                    Ok(Some(Token::Minus))
258                }
259            }
260            '*' => {
261                if self.peek_char() == Some('=') {
262                    self.advance();
263                    Ok(Some(Token::MultiplyAssign))
264                } else {
265                    Ok(Some(Token::Multiply))
266                }
267            }
268            '/' => {
269                if self.peek_char() == Some('=') {
270                    self.advance();
271                    Ok(Some(Token::DivideAssign))
272                } else if self.peek_char() == Some('/') {
273                    self.advance();
274                    self.skip_line_comment();
275                    Ok(None)
276                } else if self.peek_char() == Some('*') {
277                    self.advance();
278                    self.skip_block_comment();
279                    Ok(None)
280                } else {
281                    Ok(Some(Token::Divide))
282                }
283            }
284            '%' => Ok(Some(Token::Modulo)),
285            '=' => {
286                if self.peek_char() == Some('=') {
287                    self.advance();
288                    if self.peek_char() == Some('=') {
289                        self.advance();
290                        Ok(Some(Token::StrictEqual))
291                    } else {
292                        Ok(Some(Token::Equal))
293                    }
294                } else if self.peek_char() == Some('>') {
295                    self.advance();
296                    Ok(Some(Token::Arrow))
297                } else {
298                    Ok(Some(Token::Assign))
299                }
300            }
301            '!' => {
302                if self.peek_char() == Some('=') {
303                    self.advance();
304                    if self.peek_char() == Some('=') {
305                        self.advance();
306                        Ok(Some(Token::StrictNotEqual))
307                    } else {
308                        Ok(Some(Token::NotEqual))
309                    }
310                } else {
311                    Ok(Some(Token::Not))
312                }
313            }
314            '<' => {
315                if self.peek_char() == Some('=') {
316                    self.advance();
317                    Ok(Some(Token::LessEqual))
318                } else {
319                    Ok(Some(Token::LessThan))
320                }
321            }
322            '>' => {
323                if self.peek_char() == Some('=') {
324                    self.advance();
325                    Ok(Some(Token::GreaterEqual))
326                } else {
327                    Ok(Some(Token::GreaterThan))
328                }
329            }
330            '&' => {
331                if self.peek_char() == Some('&') {
332                    self.advance();
333                    Ok(Some(Token::And))
334                } else {
335                    Ok(Some(Token::Intersection))
336                }
337            }
338            '|' => {
339                if self.peek_char() == Some('|') {
340                    self.advance();
341                    Ok(Some(Token::Or))
342                } else {
343                    Ok(Some(Token::Union))
344                }
345            }
346            '(' => Ok(Some(Token::LeftParen)),
347            ')' => Ok(Some(Token::RightParen)),
348            '{' => Ok(Some(Token::LeftBrace)),
349            '}' => Ok(Some(Token::RightBrace)),
350            '[' => Ok(Some(Token::LeftBracket)),
351            ']' => Ok(Some(Token::RightBracket)),
352            ';' => Ok(Some(Token::Semicolon)),
353            ',' => Ok(Some(Token::Comma)),
354            '.' => Ok(Some(Token::Dot)),
355            ':' => Ok(Some(Token::Colon)),
356            '?' => Ok(Some(Token::QuestionMark)),
357            '@' => Ok(Some(Token::At)), // Add support for @ decorator symbol
358            '"' | '\'' => Ok(self.parse_string()?),
359            '`' => Ok(self.parse_template_literal()?),
360            '0'..='9' => Ok(self.parse_number()?),
361            'a'..='z' | 'A'..='Z' | '_' | '$' => Ok(self.parse_identifier_or_keyword()?),
362            _ if ch.is_alphabetic() || ch.is_alphanumeric() => Ok(self.parse_identifier_or_keyword()?),
363            _ => {
364                return Err(CompilerError::parse_error(
365                    self.line,
366                    self.column,
367                    format!("Unexpected character: {}", ch),
368                ));
369            }
370        };
371
372        // Only advance for simple tokens that don't manage position themselves
373        match ch {
374            'a'..='z' | 'A'..='Z' | '_' | '$' => {
375                // parse_identifier_or_keyword manages position itself
376            }
377            '0'..='9' => {
378                // parse_number manages position itself
379            }
380            '"' | '\'' => {
381                // parse_string manages position itself
382            }
383            _ if ch.is_alphabetic() || ch.is_alphanumeric() => {
384                // parse_identifier_or_keyword manages position itself
385            }
386            _ => {
387                // Simple tokens need to advance
388                self.advance();
389            }
390        }
391        token
392    }
393
394    /// Get current character
395    fn current_char(&self) -> char {
396        self.input.chars().nth(self.position).unwrap_or('\0')
397    }
398    
399
400    /// Peek at next character
401    fn peek_char(&self) -> Option<char> {
402        self.input.chars().nth(self.position + 1)
403    }
404
405    /// Advance position
406    fn advance(&mut self) {
407        if self.position < self.input.len() {
408            let ch = self.current_char();
409            if ch == '\n' {
410                self.line += 1;
411                self.column = 1;
412            } else {
413                self.column += 1;
414            }
415            
416            // Simple advance by one character position
417            self.position += 1;
418        }
419    }
420
421    /// Skip whitespace
422    fn skip_whitespace(&mut self) {
423        while self.position < self.input.len() {
424            let ch = self.current_char();
425            if ch.is_whitespace() {
426                self.advance();
427            } else if ch == '/' && self.peek_char() == Some('/') {
428                // Skip line comment
429                self.advance(); // skip first /
430                self.advance(); // skip second /
431                while self.position < self.input.len() && self.current_char() != '\n' {
432                    self.advance();
433                }
434            } else if ch == '/' && self.peek_char() == Some('*') {
435                // Skip block comment
436                self.advance(); // skip /
437                self.advance(); // skip *
438                while self.position < self.input.len() {
439                    if self.current_char() == '*' && self.peek_char() == Some('/') {
440                        self.advance(); // skip *
441                        self.advance(); // skip /
442                        break;
443                    }
444                    self.advance();
445                }
446            } else {
447                break;
448            }
449        }
450    }
451
452    /// Skip line comment
453    fn skip_line_comment(&mut self) -> Option<Token> {
454        while self.position < self.input.len() && self.current_char() != '\n' {
455            self.advance();
456        }
457        None
458    }
459
460    /// Skip block comment
461    fn skip_block_comment(&mut self) -> Option<Token> {
462        while self.position < self.input.len() {
463            if self.current_char() == '*' && self.peek_char() == Some('/') {
464                self.advance();
465                self.advance();
466                break;
467            }
468            self.advance();
469        }
470        None
471    }
472
473    /// Parse string literal
474    fn parse_string(&mut self) -> Result<Option<Token>> {
475        let quote = self.current_char();
476        let mut value = String::new();
477        self.advance();
478
479        while self.position < self.input.len() {
480            let ch = self.current_char();
481            if ch == quote {
482                self.advance();
483                return Ok(Some(Token::String(value)));
484            } else if ch == '\\' {
485                self.advance();
486                if self.position < self.input.len() {
487                    let escaped = self.current_char();
488                    value.push(match escaped {
489                        'n' => '\n',
490                        't' => '\t',
491                        'r' => '\r',
492                        '\\' => '\\',
493                        '"' => '"',
494                        '\'' => '\'',
495                        _ => escaped,
496                    });
497                    self.advance();
498                }
499            } else {
500                value.push(ch);
501                self.advance();
502            }
503        }
504
505        Err(CompilerError::parse_error(
506            self.line,
507            self.column,
508            "Unterminated string literal",
509        ))
510    }
511
512    /// Parse template literal
513    fn parse_template_literal(&mut self) -> Result<Option<Token>> {
514        let mut value = String::new();
515        self.advance(); // consume opening backtick
516
517        while self.position < self.input.len() {
518            let ch = self.current_char();
519            if ch == '`' {
520                self.advance();
521                return Ok(Some(Token::TemplateLiteral(value)));
522            } else if ch == '\\' {
523                self.advance();
524                if self.position < self.input.len() {
525                    let escaped = self.current_char();
526                    value.push(match escaped {
527                        'n' => '\n',
528                        't' => '\t',
529                        'r' => '\r',
530                        '\\' => '\\',
531                        '`' => '`',
532                        '$' => '$',
533                        _ => escaped,
534                    });
535                    self.advance();
536                }
537            } else if ch == '$' && self.position + 1 < self.input.len() && self.input.chars().nth(self.position + 1) == Some('{') {
538                // Handle ${} interpolation - include the full ${} in the string for now
539                value.push('$');
540                self.advance();
541                if self.position < self.input.len() {
542                    value.push('{');
543                    self.advance();
544                    // Skip to closing brace
545                    while self.position < self.input.len() && self.current_char() != '}' {
546                        value.push(self.current_char());
547                        self.advance();
548                    }
549                    if self.position < self.input.len() {
550                        value.push('}');
551                        self.advance();
552                    }
553                }
554            } else {
555                value.push(ch);
556                self.advance();
557            }
558        }
559
560        Err(CompilerError::parse_error(
561            self.line,
562            self.column,
563            "Unterminated template literal",
564        ))
565    }
566
567    /// Parse number literal
568    fn parse_number(&mut self) -> Result<Option<Token>> {
569        let mut value = String::new();
570        let mut has_dot = false;
571
572        while self.position < self.input.len() {
573            let ch = self.current_char();
574            if ch.is_ascii_digit() {
575                value.push(ch);
576                self.advance();
577            } else if ch == '.' && !has_dot {
578                has_dot = true;
579                value.push(ch);
580                self.advance();
581            } else {
582                break;
583            }
584        }
585
586        let number: f64 = value.parse().map_err(|_| {
587            CompilerError::parse_error(self.line, self.column, "Invalid number literal")
588        })?;
589
590        Ok(Some(Token::Number(number)))
591    }
592
593    /// Parse identifier or keyword
594    fn parse_identifier_or_keyword(&mut self) -> Result<Option<Token>> {
595        let mut value = String::new();
596
597        while self.position < self.input.len() {
598            let ch = self.current_char();
599            if ch.is_ascii_alphanumeric() || ch == '_' || ch == '$' {
600                value.push(ch);
601                self.advance();
602            } else {
603                break;
604            }
605        }
606
607        // Check if it's a literal
608        if value == "true" {
609            Ok(Some(Token::Boolean(true)))
610        } else if value == "false" {
611            Ok(Some(Token::Boolean(false)))
612        } else if let Some(keyword) = self.parse_keyword(&value) {
613            Ok(Some(Token::Keyword(keyword)))
614        } else {
615            Ok(Some(Token::Identifier(value)))
616        }
617    }
618
619    /// Parse keyword from string
620    fn parse_keyword(&self, value: &str) -> Option<Keyword> {
621        match value {
622            "let" => Some(Keyword::Let),
623            "const" => Some(Keyword::Const),
624            "var" => Some(Keyword::Var),
625            "function" => Some(Keyword::Function),
626            "class" => Some(Keyword::Class),
627            "interface" => Some(Keyword::Interface),
628            "type" => Some(Keyword::Type),
629            "enum" => Some(Keyword::Enum),
630            "namespace" => Some(Keyword::Namespace),
631            "module" => Some(Keyword::Module),
632            "import" => Some(Keyword::Import),
633            "export" => Some(Keyword::Export),
634            "from" => Some(Keyword::From),
635            "as" => Some(Keyword::As),
636            "default" => Some(Keyword::Default),
637            "if" => Some(Keyword::If),
638            "else" => Some(Keyword::Else),
639            "switch" => Some(Keyword::Switch),
640            "case" => Some(Keyword::Case),
641            "for" => Some(Keyword::For),
642            "while" => Some(Keyword::While),
643            "do" => Some(Keyword::Do),
644            "break" => Some(Keyword::Break),
645            "continue" => Some(Keyword::Continue),
646            "return" => Some(Keyword::Return),
647            "throw" => Some(Keyword::Throw),
648            "try" => Some(Keyword::Try),
649            "catch" => Some(Keyword::Catch),
650            "finally" => Some(Keyword::Finally),
651            "extends" => Some(Keyword::Extends),
652            "implements" => Some(Keyword::Implements),
653            "super" => Some(Keyword::Super),
654            "this" => Some(Keyword::This),
655            "new" => Some(Keyword::New),
656            "static" => Some(Keyword::Static),
657            "public" => Some(Keyword::Public),
658            "private" => Some(Keyword::Private),
659            "protected" => Some(Keyword::Protected),
660            "abstract" => Some(Keyword::Abstract),
661            "readonly" => Some(Keyword::Readonly),
662            "get" => Some(Keyword::Get),
663            "set" => Some(Keyword::Set),
664            "async" => Some(Keyword::Async),
665            "await" => Some(Keyword::Await),
666            "Promise" => Some(Keyword::Promise),
667            "any" => Some(Keyword::Any),
668            "unknown" => Some(Keyword::Unknown),
669            "never" => Some(Keyword::Never),
670            "void" => Some(Keyword::Void),
671            "null" => Some(Keyword::Null),
672            "undefined" => Some(Keyword::Undefined),
673            "boolean" => Some(Keyword::Boolean),
674            "number" => Some(Keyword::Number),
675            "string" => Some(Keyword::String),
676            "object" => Some(Keyword::Object),
677            "Array" => Some(Keyword::Array),
678            "true" => Some(Keyword::True),
679            "false" => Some(Keyword::False),
680            "in" => Some(Keyword::In),
681            "of" => Some(Keyword::Of),
682            "instanceof" => Some(Keyword::Instanceof),
683            "typeof" => Some(Keyword::Typeof),
684            "keyof" => Some(Keyword::Keyof),
685            "key" => Some(Keyword::Key),
686            "is" => Some(Keyword::Is),
687            "asserts" => Some(Keyword::Asserts),
688            "infer" => Some(Keyword::Infer),
689            "declare" => Some(Keyword::Declare),
690            "global" => Some(Keyword::Global),
691            _ => None,
692        }
693    }
694
695    /// Parse regular expression literal
696    #[allow(dead_code)]
697    fn parse_regex(&mut self) -> Result<Option<Token>> {
698        let mut pattern = String::new();
699        let mut flags = String::new();
700
701        self.advance(); // consume '/'
702
703        // Parse pattern until closing '/'
704        while self.position < self.input.len() {
705            let ch = self.current_char();
706            if ch == '/' {
707                self.advance();
708                break;
709            } else if ch == '\\' {
710                // Handle escape sequences
711                pattern.push(ch);
712                self.advance();
713                if self.position < self.input.len() {
714                    pattern.push(self.current_char());
715                    self.advance();
716                }
717            } else {
718                pattern.push(ch);
719                self.advance();
720            }
721        }
722
723        // Parse flags
724        while self.position < self.input.len() {
725            let ch = self.current_char();
726            if ch.is_alphabetic() {
727                flags.push(ch);
728                self.advance();
729            } else {
730                break;
731            }
732        }
733
734        Ok(Some(Token::RegExp(pattern, flags)))
735    }
736}