oak_javascript/lexer/
mod.rs

1use crate::{kind::JavaScriptSyntaxKind, language::JavaScriptLanguage};
2use oak_core::{
3    IncrementalCache, Lexer, LexerState, OakError,
4    lexer::{LexOutput, StringConfig, WhitespaceConfig},
5    source::Source,
6};
7use std::sync::LazyLock;
8
9type State<S> = LexerState<S, JavaScriptLanguage>;
10
/// Whitespace scanning configuration used by `skip_whitespace`; Unicode whitespace is enabled.
static JS_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// String scanning configuration: `"` and `'` as quote characters, `\` as the escape character.
// NOTE(review): not referenced by the code visible in this file (`lex_string_literal` scans by hand) — confirm it is still needed.
static JS_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"', '\''], escape: Some('\\') });
13
14
/// Lexer that turns JavaScript source text into `JavaScriptSyntaxKind` tokens.
///
/// The lexer borrows its [`JavaScriptLanguage`] configuration for the lifetime `'config`.
#[derive(Clone)]
pub struct JavaScriptLexer<'config> {
    /// Language configuration this lexer was created with.
    config: &'config JavaScriptLanguage,
}
19
20impl<'config> JavaScriptLexer<'config> {
21    pub fn new(config: &'config JavaScriptLanguage) -> Self {
22        Self { config }
23    }
24
25    fn safe_check<S: Source>(&self, state: &State<S>) -> Result<(), OakError> {
26        if state.get_position() <= state.length() {
27            Ok(())
28        }
29        else {
30            Err(OakError::custom_error(format!("Lexer out-of-bounds: pos={}, len={}", state.get_position(), state.length())))
31        }
32    }
33
34    /// 主要的词法分析运行方法
35    fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
36        while state.not_at_end() {
37            let current_pos = state.get_position();
38            let current_char = state.peek();
39
40            self.safe_check(state)?;
41
42            if self.skip_whitespace(state) {
43                continue;
44            }
45
46            if self.lex_newline(state) {
47                continue;
48            }
49
50            if self.lex_comment(state) {
51                continue;
52            }
53
54            if self.lex_string_literal(state) {
55                continue;
56            }
57
58            if self.lex_template_literal(state) {
59                continue;
60            }
61
62            if self.lex_numeric_literal(state) {
63                continue;
64            }
65
66            if self.lex_identifier_or_keyword(state) {
67                continue;
68            }
69
70            if self.lex_operator_or_punctuation(state) {
71                continue;
72            }
73
74            let start = state.get_position();
75            if let Some(ch) = state.peek() {
76                state.advance(ch.len_utf8());
77                state.add_token(JavaScriptSyntaxKind::Error, start, state.get_position());
78            }
79            else {
80                break;
81            }
82        }
83
84        let eof_pos = state.get_position();
85        state.add_token(JavaScriptSyntaxKind::Eof, eof_pos, eof_pos);
86        Ok(())
87    }
88
89    /// 跳过空白字符
90    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
91        match JS_WHITESPACE.scan(state.rest(), state.get_position(), JavaScriptSyntaxKind::Whitespace) {
92            Some(token) => {
93                state.advance_with(token);
94                true
95            }
96            None => false,
97        }
98    }
99
100    /// 处理换行
101    fn lex_newline<S: Source>(&self, state: &mut State<S>) -> bool {
102        let start_pos = state.get_position();
103
104        if let Some('\n') = state.peek() {
105            state.advance(1);
106            state.add_token(JavaScriptSyntaxKind::Newline, start_pos, state.get_position());
107            true
108        }
109        else if let Some('\r') = state.peek() {
110            state.advance(1);
111            if let Some('\n') = state.peek() {
112                state.advance(1);
113            }
114            state.add_token(JavaScriptSyntaxKind::Newline, start_pos, state.get_position());
115            true
116        }
117        else {
118            false
119        }
120    }
121
122    /// 处理注释(行注释和块注释)
123    fn lex_comment<S: Source>(&self, state: &mut State<S>) -> bool {
124        let start = state.get_position();
125        let rest = state.rest();
126
127        // 行注释: // ... 直到换行
128        if rest.starts_with("//") {
129            state.advance(2);
130            while let Some(ch) = state.peek() {
131                if ch == '\n' || ch == '\r' {
132                    break;
133                }
134                state.advance(ch.len_utf8());
135            }
136            state.add_token(JavaScriptSyntaxKind::LineComment, start, state.get_position());
137            return true;
138        }
139
140        // 块注释: /* ... */
141        if rest.starts_with("/*") {
142            state.advance(2);
143            let mut found_end = false;
144            while let Some(ch) = state.peek() {
145                if ch == '*' && state.peek_next_n(1) == Some('/') {
146                    state.advance(2);
147                    found_end = true;
148                    break;
149                }
150                state.advance(ch.len_utf8());
151            }
152
153            if !found_end {
154                let error = state.syntax_error("Unterminated comment", start);
155                state.add_error(error);
156            }
157
158            state.add_token(JavaScriptSyntaxKind::BlockComment, start, state.get_position());
159            return true;
160        }
161
162        false
163    }
164
165    /// 处理字符串字面量
166    fn lex_string_literal<S: Source>(&self, state: &mut State<S>) -> bool {
167        let start_pos = state.get_position();
168
169        if let Some(first_char) = state.peek() {
170            if first_char == '"' || first_char == '\'' {
171                let quote = first_char;
172                state.advance(1);
173                let mut found_end = false;
174
175                while let Some(ch) = state.peek() {
176                    if ch == quote {
177                        state.advance(1);
178                        found_end = true;
179                        break;
180                    }
181                    else if ch == '\\' {
182                        // Skip escaped character
183                        state.advance(1);
184                        if let Some(_escaped) = state.peek() {
185                            state.advance(1);
186                        }
187                    }
188                    else if ch == '\n' || ch == '\r' {
189                        // Strings cannot span multiple lines in JavaScript
190                        break;
191                    }
192                    else {
193                        state.advance(ch.len_utf8());
194                    }
195                }
196
197                if !found_end {
198                    let error = state.syntax_error("Unterminated string literal", start_pos);
199                    state.add_error(error);
200                }
201
202                state.add_token(JavaScriptSyntaxKind::StringLiteral, start_pos, state.get_position());
203                true
204            }
205            else {
206                false
207            }
208        }
209        else {
210            false
211        }
212    }
213
214    /// 处理模板字符
215    fn lex_template_literal<S: Source>(&self, state: &mut State<S>) -> bool {
216        let start_pos = state.get_position();
217
218        if let Some('`') = state.peek() {
219            state.advance(1);
220
221            let mut found_end = false;
222            while let Some(ch) = state.peek() {
223                if ch == '`' {
224                    state.advance(1);
225                    found_end = true;
226                    break;
227                }
228                else if ch == '\\' {
229                    // 处理转义字符
230                    state.advance(1);
231                    if let Some(escaped) = state.peek() {
232                        state.advance(escaped.len_utf8());
233                    }
234                }
235                else if ch == '$' {
236                    if let Some('{') = state.peek_next_n(1) {
237                        // 模板表达式,暂时跳过
238                        state.advance(2);
239                        let mut brace_count = 1;
240                        while let Some(inner_ch) = state.peek() {
241                            if inner_ch == '{' {
242                                brace_count += 1;
243                            }
244                            else if inner_ch == '}' {
245                                brace_count -= 1;
246                                if brace_count == 0 {
247                                    state.advance(1);
248                                    break;
249                                }
250                            }
251                            state.advance(inner_ch.len_utf8());
252                        }
253                    }
254                    else {
255                        state.advance(ch.len_utf8());
256                    }
257                }
258                else {
259                    state.advance(ch.len_utf8());
260                }
261            }
262
263            if !found_end {
264                let error = state.syntax_error("Unterminated template literal", start_pos);
265                state.add_error(error);
266            }
267
268            state.add_token(JavaScriptSyntaxKind::TemplateString, start_pos, state.get_position());
269            true
270        }
271        else {
272            false
273        }
274    }
275
276    /// 处理数字字面量
277    fn lex_numeric_literal<S: Source>(&self, state: &mut State<S>) -> bool {
278        let start_pos = state.get_position();
279
280        if let Some(ch) = state.peek() {
281            // 十六进制数字 (0x 或 0X)
282            if ch == '0' {
283                if let Some(next) = state.peek_next_n(1) {
284                    if next == 'x' || next == 'X' {
285                        state.advance(2); // 跳过 '0x'
286                        let mut has_digits = false;
287                        while let Some(hex_ch) = state.peek() {
288                            if hex_ch.is_ascii_hexdigit() {
289                                state.advance(1);
290                                has_digits = true;
291                            }
292                            else {
293                                break;
294                            }
295                        }
296
297                        if !has_digits {
298                            let error = state.syntax_error("Invalid hexadecimal number", start_pos);
299                            state.add_error(error);
300                        }
301
302                        // 检查 BigInt 后缀
303                        if let Some('n') = state.peek() {
304                            state.advance(1);
305                            state.add_token(JavaScriptSyntaxKind::BigIntLiteral, start_pos, state.get_position());
306                        }
307                        else {
308                            state.add_token(JavaScriptSyntaxKind::NumericLiteral, start_pos, state.get_position());
309                        }
310                        return true;
311                    }
312                }
313            }
314
315            // 普通数字或小数
316            if ch.is_ascii_digit() || (ch == '.' && self.is_next_digit(state)) {
317                // 处理整数部分
318                if ch != '.' {
319                    while let Some(digit) = state.peek() {
320                        if digit.is_ascii_digit() {
321                            state.advance(1);
322                        }
323                        else {
324                            break;
325                        }
326                    }
327                }
328
329                // 处理小数部分
330                if let Some('.') = state.peek() {
331                    state.advance(1);
332                    while let Some(digit) = state.peek() {
333                        if digit.is_ascii_digit() {
334                            state.advance(1);
335                        }
336                        else {
337                            break;
338                        }
339                    }
340                }
341
342                // 处理指数部分
343                if let Some(exp) = state.peek() {
344                    if exp == 'e' || exp == 'E' {
345                        state.advance(1);
346
347                        // 可选的符号
348                        if let Some(sign) = state.peek() {
349                            if sign == '+' || sign == '-' {
350                                state.advance(1);
351                            }
352                        }
353
354                        // 必须有数字
355                        let mut has_exp_digits = false;
356                        while let Some(digit) = state.peek() {
357                            if digit.is_ascii_digit() {
358                                state.advance(1);
359                                has_exp_digits = true;
360                            }
361                            else {
362                                break;
363                            }
364                        }
365
366                        if !has_exp_digits {
367                            let error = state.syntax_error("Invalid number exponent", start_pos);
368                            state.add_error(error);
369                        }
370                    }
371                }
372
373                // 检查 BigInt 后缀
374                if let Some('n') = state.peek() {
375                    state.advance(1);
376                    state.add_token(JavaScriptSyntaxKind::BigIntLiteral, start_pos, state.get_position());
377                }
378                else {
379                    state.add_token(JavaScriptSyntaxKind::NumericLiteral, start_pos, state.get_position());
380                }
381                true
382            }
383            else {
384                false
385            }
386        }
387        else {
388            false
389        }
390    }
391
392    /// 检查下一个字符是否是数字
393    fn is_next_digit<S: Source>(&self, state: &State<S>) -> bool {
394        if let Some(next_ch) = state.peek_next_n(1) { next_ch.is_ascii_digit() } else { false }
395    }
396
397    /// 处理标识符和关键
398    fn lex_identifier_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
399        let start_pos = state.get_position();
400
401        if let Some(ch) = state.peek() {
402            if ch.is_alphabetic() || ch == '_' || ch == '$' {
403                state.advance(ch.len_utf8());
404
405                while let Some(next_ch) = state.peek() {
406                    if next_ch.is_alphanumeric() || next_ch == '_' || next_ch == '$' {
407                        state.advance(next_ch.len_utf8());
408                    }
409                    else {
410                        break;
411                    }
412                }
413
414                let text = state.get_text_in((start_pos..state.get_position()).into());
415                let token_kind = self.keyword_or_identifier(&text);
416                state.add_token(token_kind, start_pos, state.get_position());
417                true
418            }
419            else {
420                false
421            }
422        }
423        else {
424            false
425        }
426    }
427
428    /// 判断是关键字还是标识
429    fn keyword_or_identifier(&self, text: &str) -> JavaScriptSyntaxKind {
430        match text {
431            "abstract" => JavaScriptSyntaxKind::Abstract,
432            "as" => JavaScriptSyntaxKind::As,
433            "async" => JavaScriptSyntaxKind::Async,
434            "await" => JavaScriptSyntaxKind::Await,
435            "break" => JavaScriptSyntaxKind::Break,
436            "case" => JavaScriptSyntaxKind::Case,
437            "catch" => JavaScriptSyntaxKind::Catch,
438            "class" => JavaScriptSyntaxKind::Class,
439            "const" => JavaScriptSyntaxKind::Const,
440            "continue" => JavaScriptSyntaxKind::Continue,
441            "debugger" => JavaScriptSyntaxKind::Debugger,
442            "default" => JavaScriptSyntaxKind::Default,
443            "delete" => JavaScriptSyntaxKind::Delete,
444            "do" => JavaScriptSyntaxKind::Do,
445            "else" => JavaScriptSyntaxKind::Else,
446            "enum" => JavaScriptSyntaxKind::Enum,
447            "export" => JavaScriptSyntaxKind::Export,
448            "extends" => JavaScriptSyntaxKind::Extends,
449            "false" => JavaScriptSyntaxKind::False,
450            "finally" => JavaScriptSyntaxKind::Finally,
451            "for" => JavaScriptSyntaxKind::For,
452            "function" => JavaScriptSyntaxKind::Function,
453            "if" => JavaScriptSyntaxKind::If,
454            "implements" => JavaScriptSyntaxKind::Implements,
455            "import" => JavaScriptSyntaxKind::Import,
456            "in" => JavaScriptSyntaxKind::In,
457            "instanceof" => JavaScriptSyntaxKind::Instanceof,
458            "interface" => JavaScriptSyntaxKind::Interface,
459            "let" => JavaScriptSyntaxKind::Let,
460            "new" => JavaScriptSyntaxKind::New,
461            "null" => JavaScriptSyntaxKind::Null,
462            "package" => JavaScriptSyntaxKind::Package,
463            "private" => JavaScriptSyntaxKind::Private,
464            "protected" => JavaScriptSyntaxKind::Protected,
465            "public" => JavaScriptSyntaxKind::Public,
466            "return" => JavaScriptSyntaxKind::Return,
467            "static" => JavaScriptSyntaxKind::Static,
468            "super" => JavaScriptSyntaxKind::Super,
469            "switch" => JavaScriptSyntaxKind::Switch,
470            "this" => JavaScriptSyntaxKind::This,
471            "throw" => JavaScriptSyntaxKind::Throw,
472            "true" => JavaScriptSyntaxKind::True,
473            "try" => JavaScriptSyntaxKind::Try,
474            "typeof" => JavaScriptSyntaxKind::Typeof,
475            "undefined" => JavaScriptSyntaxKind::Undefined,
476            "var" => JavaScriptSyntaxKind::Var,
477            "void" => JavaScriptSyntaxKind::Void,
478            "while" => JavaScriptSyntaxKind::While,
479            "with" => JavaScriptSyntaxKind::With,
480            "yield" => JavaScriptSyntaxKind::Yield,
481            _ => JavaScriptSyntaxKind::IdentifierName,
482        }
483    }
484
485    /// 处理操作符和标点符号
486    fn lex_operator_or_punctuation<S: Source>(&self, state: &mut State<S>) -> bool {
487        let start_pos = state.get_position();
488
489        if let Some(ch) = state.peek() {
490            let token_kind = match ch {
491                '+' => {
492                    state.advance(1);
493                    match state.peek() {
494                        Some('+') => {
495                            state.advance(1);
496                            JavaScriptSyntaxKind::PlusPlus
497                        }
498                        Some('=') => {
499                            state.advance(1);
500                            JavaScriptSyntaxKind::PlusEqual
501                        }
502                        _ => JavaScriptSyntaxKind::Plus,
503                    }
504                }
505                '-' => {
506                    state.advance(1);
507                    match state.peek() {
508                        Some('-') => {
509                            state.advance(1);
510                            JavaScriptSyntaxKind::MinusMinus
511                        }
512                        Some('=') => {
513                            state.advance(1);
514                            JavaScriptSyntaxKind::MinusEqual
515                        }
516                        _ => JavaScriptSyntaxKind::Minus,
517                    }
518                }
519                '*' => {
520                    state.advance(1);
521                    match state.peek() {
522                        Some('*') => {
523                            state.advance(1);
524                            if let Some('=') = state.peek() {
525                                state.advance(1);
526                                JavaScriptSyntaxKind::StarStarEqual
527                            }
528                            else {
529                                JavaScriptSyntaxKind::StarStar
530                            }
531                        }
532                        Some('=') => {
533                            state.advance(1);
534                            JavaScriptSyntaxKind::StarEqual
535                        }
536                        _ => JavaScriptSyntaxKind::Star,
537                    }
538                }
539                '/' => {
540                    // 检查是否是注释
541                    if let Some(next) = state.peek_next_n(1) {
542                        if next == '/' || next == '*' {
543                            return false; // 让注释处理函数处理                        
544                        }
545                    }
546                    state.advance(1);
547                    if let Some('=') = state.peek() {
548                        state.advance(1);
549                        JavaScriptSyntaxKind::SlashEqual
550                    }
551                    else {
552                        JavaScriptSyntaxKind::Slash
553                    }
554                }
555                '%' => {
556                    state.advance(1);
557                    if let Some('=') = state.peek() {
558                        state.advance(1);
559                        JavaScriptSyntaxKind::PercentEqual
560                    }
561                    else {
562                        JavaScriptSyntaxKind::Percent
563                    }
564                }
565                '<' => {
566                    state.advance(1);
567                    match state.peek() {
568                        Some('<') => {
569                            state.advance(1);
570                            if let Some('=') = state.peek() {
571                                state.advance(1);
572                                JavaScriptSyntaxKind::LeftShiftEqual
573                            }
574                            else {
575                                JavaScriptSyntaxKind::LeftShift
576                            }
577                        }
578                        Some('=') => {
579                            state.advance(1);
580                            JavaScriptSyntaxKind::LessEqual
581                        }
582                        _ => JavaScriptSyntaxKind::Less,
583                    }
584                }
585                '>' => {
586                    state.advance(1);
587                    match state.peek() {
588                        Some('>') => {
589                            state.advance(1);
590                            match state.peek() {
591                                Some('>') => {
592                                    state.advance(1);
593                                    if let Some('=') = state.peek() {
594                                        state.advance(1);
595                                        JavaScriptSyntaxKind::UnsignedRightShiftEqual
596                                    }
597                                    else {
598                                        JavaScriptSyntaxKind::UnsignedRightShift
599                                    }
600                                }
601                                Some('=') => {
602                                    state.advance(1);
603                                    JavaScriptSyntaxKind::RightShiftEqual
604                                }
605                                _ => JavaScriptSyntaxKind::RightShift,
606                            }
607                        }
608                        Some('=') => {
609                            state.advance(1);
610                            JavaScriptSyntaxKind::GreaterEqual
611                        }
612                        _ => JavaScriptSyntaxKind::Greater,
613                    }
614                }
615                '=' => {
616                    state.advance(1);
617                    match state.peek() {
618                        Some('=') => {
619                            state.advance(1);
620                            if let Some('=') = state.peek() {
621                                state.advance(1);
622                                JavaScriptSyntaxKind::EqualEqualEqual
623                            }
624                            else {
625                                JavaScriptSyntaxKind::EqualEqual
626                            }
627                        }
628                        Some('>') => {
629                            state.advance(1);
630                            JavaScriptSyntaxKind::Arrow
631                        }
632                        _ => JavaScriptSyntaxKind::Equal,
633                    }
634                }
635                '!' => {
636                    state.advance(1);
637                    match state.peek() {
638                        Some('=') => {
639                            state.advance(1);
640                            if let Some('=') = state.peek() {
641                                state.advance(1);
642                                JavaScriptSyntaxKind::NotEqualEqual
643                            }
644                            else {
645                                JavaScriptSyntaxKind::NotEqual
646                            }
647                        }
648                        _ => JavaScriptSyntaxKind::Exclamation,
649                    }
650                }
651                '&' => {
652                    state.advance(1);
653                    match state.peek() {
654                        Some('&') => {
655                            state.advance(1);
656                            if let Some('=') = state.peek() {
657                                state.advance(1);
658                                JavaScriptSyntaxKind::AmpersandAmpersandEqual
659                            }
660                            else {
661                                JavaScriptSyntaxKind::AmpersandAmpersand
662                            }
663                        }
664                        Some('=') => {
665                            state.advance(1);
666                            JavaScriptSyntaxKind::AmpersandEqual
667                        }
668                        _ => JavaScriptSyntaxKind::Ampersand,
669                    }
670                }
671                '|' => {
672                    state.advance(1);
673                    match state.peek() {
674                        Some('|') => {
675                            state.advance(1);
676                            if let Some('=') = state.peek() {
677                                state.advance(1);
678                                JavaScriptSyntaxKind::PipePipeEqual
679                            }
680                            else {
681                                JavaScriptSyntaxKind::PipePipe
682                            }
683                        }
684                        Some('=') => {
685                            state.advance(1);
686                            JavaScriptSyntaxKind::PipeEqual
687                        }
688                        _ => JavaScriptSyntaxKind::Pipe,
689                    }
690                }
691                '^' => {
692                    state.advance(1);
693                    if let Some('=') = state.peek() {
694                        state.advance(1);
695                        JavaScriptSyntaxKind::CaretEqual
696                    }
697                    else {
698                        JavaScriptSyntaxKind::Caret
699                    }
700                }
701                '~' => {
702                    state.advance(1);
703                    JavaScriptSyntaxKind::Tilde
704                }
705                '?' => {
706                    state.advance(1);
707                    match state.peek() {
708                        Some('?') => {
709                            state.advance(1);
710                            if let Some('=') = state.peek() {
711                                state.advance(1);
712                                JavaScriptSyntaxKind::QuestionQuestionEqual
713                            }
714                            else {
715                                JavaScriptSyntaxKind::QuestionQuestion
716                            }
717                        }
718                        Some('.') => {
719                            state.advance(1);
720                            JavaScriptSyntaxKind::QuestionDot
721                        }
722                        _ => JavaScriptSyntaxKind::Question,
723                    }
724                }
725                '(' => {
726                    state.advance(1);
727                    JavaScriptSyntaxKind::LeftParen
728                }
729                ')' => {
730                    state.advance(1);
731                    JavaScriptSyntaxKind::RightParen
732                }
733                '{' => {
734                    state.advance(1);
735                    JavaScriptSyntaxKind::LeftBrace
736                }
737                '}' => {
738                    state.advance(1);
739                    JavaScriptSyntaxKind::RightBrace
740                }
741                '[' => {
742                    state.advance(1);
743                    JavaScriptSyntaxKind::LeftBracket
744                }
745                ']' => {
746                    state.advance(1);
747                    JavaScriptSyntaxKind::RightBracket
748                }
749                ';' => {
750                    state.advance(1);
751                    JavaScriptSyntaxKind::Semicolon
752                }
753                ',' => {
754                    state.advance(1);
755                    JavaScriptSyntaxKind::Comma
756                }
757                '.' => {
758                    state.advance(1);
759                    if let Some('.') = state.peek() {
760                        if let Some('.') = state.peek_next_n(1) {
761                            state.advance(2);
762                            JavaScriptSyntaxKind::DotDotDot
763                        }
764                        else {
765                            JavaScriptSyntaxKind::Dot
766                        }
767                    }
768                    else {
769                        JavaScriptSyntaxKind::Dot
770                    }
771                }
772                ':' => {
773                    state.advance(1);
774                    JavaScriptSyntaxKind::Colon
775                }
776                _ => return false,
777            };
778
779            state.add_token(token_kind, start_pos, state.get_position());
780            true
781        }
782        else {
783            false
784        }
785    }
786}
787
788impl<'config> Lexer<JavaScriptLanguage> for JavaScriptLexer<'config> {
789    fn lex_incremental(
790        &self,
791        source: impl Source,
792        changed: usize,
793        cache: IncrementalCache<JavaScriptLanguage>,
794    ) -> LexOutput<JavaScriptLanguage> {
795        let mut state = LexerState::new_with_cache(source, changed, cache);
796        let result = self.run(&mut state);
797        state.finish(result)
798    }
799}