oak_typescript/lexer/
mod.rs

1use crate::{kind::TypeScriptSyntaxKind, language::TypeScriptLanguage};
2use oak_core::{IncrementalCache, Lexer, LexerState, OakError, lexer::LexOutput, source::Source};
3
4#[derive(Clone)]
5pub struct TypeScriptLexer<'config> {
6    config: &'config TypeScriptLanguage,
7}
8
9type State<S> = LexerState<S, TypeScriptLanguage>;
10
11impl<'config> TypeScriptLexer<'config> {
12    pub fn new(config: &'config TypeScriptLanguage) -> Self {
13        Self { config }
14    }
15}
16
17impl<'config> Lexer<TypeScriptLanguage> for TypeScriptLexer<'config> {
18    fn lex_incremental(
19        &self,
20        source: impl Source,
21        changed: usize,
22        cache: IncrementalCache<TypeScriptLanguage>,
23    ) -> LexOutput<TypeScriptLanguage> {
24        let mut state = LexerState::new_with_cache(source, changed, cache);
25        let result = self.run(&mut state);
26        state.finish(result)
27    }
28}
29
30impl<'config> TypeScriptLexer<'config> {
31    fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
32        while state.not_at_end() {
33            let safe_point = state.get_position();
34
35            if self.skip_whitespace(state) {
36                continue;
37            }
38
39            if self.lex_newline(state) {
40                continue;
41            }
42
43            if self.skip_comment(state) {
44                continue;
45            }
46
47            if self.lex_string_literal(state) {
48                continue;
49            }
50
51            if self.lex_template_literal(state) {
52                continue;
53            }
54
55            if self.lex_numeric_literal(state) {
56                continue;
57            }
58
59            if self.lex_identifier_or_keyword(state) {
60                continue;
61            }
62
63            if self.lex_operator_or_punctuation(state) {
64                continue;
65            }
66
67            // 如果所有规则都不匹配,跳过当前字符并标记为错误
68            let start_pos = state.get_position();
69            if let Some(ch) = state.peek() {
70                state.advance(ch.len_utf8());
71                state.add_token(TypeScriptSyntaxKind::Error, start_pos, state.get_position());
72            }
73
74            state.safe_check(safe_point);
75        }
76
77        // 添加 EOF token
78        let eof_pos = state.get_position();
79        state.add_token(TypeScriptSyntaxKind::Eof, eof_pos, eof_pos);
80        Ok(())
81    }
82
83    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
84        let start = state.get_position();
85        let mut found = false;
86
87        while let Some(ch) = state.peek() {
88            if ch == ' ' || ch == '\t' {
89                state.advance(ch.len_utf8());
90                found = true;
91            }
92            else {
93                break;
94            }
95        }
96
97        if found {
98            state.add_token(TypeScriptSyntaxKind::Whitespace, start, state.get_position());
99        }
100
101        found
102    }
103
104    fn lex_newline<S: Source>(&self, state: &mut State<S>) -> bool {
105        let start = state.get_position();
106
107        if let Some(ch) = state.peek() {
108            if ch == '\n' {
109                state.advance(1);
110                state.add_token(TypeScriptSyntaxKind::Newline, start, state.get_position());
111                return true;
112            }
113            else if ch == '\r' {
114                state.advance(1);
115                if state.peek() == Some('\n') {
116                    state.advance(1);
117                }
118                state.add_token(TypeScriptSyntaxKind::Newline, start, state.get_position());
119                return true;
120            }
121        }
122
123        false
124    }
125
126    fn skip_comment<S: Source>(&self, state: &mut State<S>) -> bool {
127        let start = state.get_position();
128        let rest = state.rest();
129
130        // 行注释: // ...
131        if rest.starts_with("//") {
132            state.advance(2);
133            while let Some(ch) = state.peek() {
134                if ch == '\n' || ch == '\r' {
135                    break;
136                }
137                state.advance(ch.len_utf8());
138            }
139            state.add_token(TypeScriptSyntaxKind::LineComment, start, state.get_position());
140            return true;
141        }
142
143        // 块注释: /* ... */
144        if rest.starts_with("/*") {
145            state.advance(2);
146            while let Some(ch) = state.peek() {
147                if ch == '*' && state.peek_next_n(1) == Some('/') {
148                    state.advance(2);
149                    break;
150                }
151                state.advance(ch.len_utf8());
152            }
153            state.add_token(TypeScriptSyntaxKind::BlockComment, start, state.get_position());
154            return true;
155        }
156
157        false
158    }
159
160    fn lex_string_literal<S: Source>(&self, state: &mut State<S>) -> bool {
161        let start = state.get_position();
162
163        if let Some(quote) = state.peek() {
164            if quote == '"' || quote == '\'' {
165                state.advance(1);
166
167                while let Some(ch) = state.peek() {
168                    if ch == quote {
169                        state.advance(1);
170                        break;
171                    }
172                    else if ch == '\\' {
173                        state.advance(1);
174                        if let Some(_) = state.peek() {
175                            state.advance(1);
176                        }
177                    }
178                    else {
179                        state.advance(ch.len_utf8());
180                    }
181                }
182
183                state.add_token(TypeScriptSyntaxKind::StringLiteral, start, state.get_position());
184                return true;
185            }
186        }
187
188        false
189    }
190
191    fn lex_template_literal<S: Source>(&self, state: &mut State<S>) -> bool {
192        let start = state.get_position();
193
194        if state.peek() == Some('`') {
195            state.advance(1);
196
197            while let Some(ch) = state.peek() {
198                if ch == '`' {
199                    state.advance(1);
200                    break;
201                }
202                else if ch == '\\' {
203                    state.advance(1);
204                    if let Some(_) = state.peek() {
205                        state.advance(1);
206                    }
207                }
208                else {
209                    state.advance(ch.len_utf8());
210                }
211            }
212
213            state.add_token(TypeScriptSyntaxKind::TemplateString, start, state.get_position());
214            return true;
215        }
216
217        false
218    }
219
220    fn lex_numeric_literal<S: Source>(&self, state: &mut State<S>) -> bool {
221        let start = state.get_position();
222
223        if let Some(ch) = state.peek() {
224            if ch.is_ascii_digit() {
225                state.advance(1);
226
227                // 处理十六进制
228                if ch == '0' && (state.peek() == Some('x') || state.peek() == Some('X')) {
229                    state.advance(1);
230                    while let Some(ch) = state.peek() {
231                        if ch.is_ascii_hexdigit() {
232                            state.advance(1);
233                        }
234                        else {
235                            break;
236                        }
237                    }
238                }
239                else {
240                    // 处理十进制
241                    while let Some(ch) = state.peek() {
242                        if ch.is_ascii_digit() {
243                            state.advance(1);
244                        }
245                        else if ch == '.' && state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit()) {
246                            state.advance(1);
247                            while let Some(ch) = state.peek() {
248                                if ch.is_ascii_digit() {
249                                    state.advance(1);
250                                }
251                                else {
252                                    break;
253                                }
254                            }
255                            break;
256                        }
257                        else {
258                            break;
259                        }
260                    }
261                }
262
263                // 检查 BigInt 后缀
264                if state.peek() == Some('n') {
265                    state.advance(1);
266                    state.add_token(TypeScriptSyntaxKind::BigIntLiteral, start, state.get_position());
267                }
268                else {
269                    state.add_token(TypeScriptSyntaxKind::NumericLiteral, start, state.get_position());
270                }
271
272                return true;
273            }
274        }
275
276        false
277    }
278
279    fn lex_identifier_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
280        let start = state.get_position();
281
282        if let Some(ch) = state.peek() {
283            if ch.is_alphabetic() || ch == '_' || ch == '$' {
284                state.advance(ch.len_utf8());
285
286                while let Some(ch) = state.peek() {
287                    if ch.is_alphanumeric() || ch == '_' || ch == '$' {
288                        state.advance(ch.len_utf8());
289                    }
290                    else {
291                        break;
292                    }
293                }
294
295                // 获取标识符文本并检查是否为关键字
296                let end = state.get_position();
297                let text = state.get_text_in((start..end).into());
298                let kind = self.keyword_or_identifier(&text);
299
300                state.add_token(kind, start, state.get_position());
301                return true;
302            }
303        }
304
305        false
306    }
307
308    fn keyword_or_identifier(&self, text: &str) -> TypeScriptSyntaxKind {
309        match text {
310            "abstract" => TypeScriptSyntaxKind::Abstract,
311            "any" => TypeScriptSyntaxKind::Any,
312            "as" => TypeScriptSyntaxKind::As,
313            "asserts" => TypeScriptSyntaxKind::Asserts,
314            "async" => TypeScriptSyntaxKind::Async,
315            "await" => TypeScriptSyntaxKind::Await,
316            "boolean" => TypeScriptSyntaxKind::Boolean,
317            "break" => TypeScriptSyntaxKind::Break,
318            "case" => TypeScriptSyntaxKind::Case,
319            "catch" => TypeScriptSyntaxKind::Catch,
320            "class" => TypeScriptSyntaxKind::Class,
321            "const" => TypeScriptSyntaxKind::Const,
322            "constructor" => TypeScriptSyntaxKind::Constructor,
323            "continue" => TypeScriptSyntaxKind::Continue,
324            "debugger" => TypeScriptSyntaxKind::Debugger,
325            "declare" => TypeScriptSyntaxKind::Declare,
326            "default" => TypeScriptSyntaxKind::Default,
327            "delete" => TypeScriptSyntaxKind::Delete,
328            "do" => TypeScriptSyntaxKind::Do,
329            "else" => TypeScriptSyntaxKind::Else,
330            "enum" => TypeScriptSyntaxKind::Enum,
331            "export" => TypeScriptSyntaxKind::Export,
332            "extends" => TypeScriptSyntaxKind::Extends,
333            "false" => TypeScriptSyntaxKind::False,
334            "finally" => TypeScriptSyntaxKind::Finally,
335            "for" => TypeScriptSyntaxKind::For,
336            "from" => TypeScriptSyntaxKind::From,
337            "function" => TypeScriptSyntaxKind::Function,
338            "get" => TypeScriptSyntaxKind::Get,
339            "global" => TypeScriptSyntaxKind::Global,
340            "if" => TypeScriptSyntaxKind::If,
341            "implements" => TypeScriptSyntaxKind::Implements,
342            "import" => TypeScriptSyntaxKind::Import,
343            "in" => TypeScriptSyntaxKind::In,
344            "infer" => TypeScriptSyntaxKind::Infer,
345            "instanceof" => TypeScriptSyntaxKind::Instanceof,
346            "interface" => TypeScriptSyntaxKind::Interface,
347            "is" => TypeScriptSyntaxKind::Is,
348            "keyof" => TypeScriptSyntaxKind::Keyof,
349            "let" => TypeScriptSyntaxKind::Let,
350            "namespace" => TypeScriptSyntaxKind::Namespace,
351            "never" => TypeScriptSyntaxKind::Never,
352            "new" => TypeScriptSyntaxKind::New,
353            "null" => TypeScriptSyntaxKind::Null,
354            "number" => TypeScriptSyntaxKind::Number,
355            "object" => TypeScriptSyntaxKind::Object,
356            "of" => TypeScriptSyntaxKind::Of,
357            "package" => TypeScriptSyntaxKind::Package,
358            "private" => TypeScriptSyntaxKind::Private,
359            "protected" => TypeScriptSyntaxKind::Protected,
360            "public" => TypeScriptSyntaxKind::Public,
361            "readonly" => TypeScriptSyntaxKind::Readonly,
362            "require" => TypeScriptSyntaxKind::Require,
363            "return" => TypeScriptSyntaxKind::Return,
364            "set" => TypeScriptSyntaxKind::Set,
365            "static" => TypeScriptSyntaxKind::Static,
366            "string" => TypeScriptSyntaxKind::String,
367            "super" => TypeScriptSyntaxKind::Super,
368            "switch" => TypeScriptSyntaxKind::Switch,
369            "symbol" => TypeScriptSyntaxKind::Symbol,
370            "this" => TypeScriptSyntaxKind::This,
371            "throw" => TypeScriptSyntaxKind::Throw,
372            "true" => TypeScriptSyntaxKind::True,
373            "try" => TypeScriptSyntaxKind::Try,
374            "type" => TypeScriptSyntaxKind::Type,
375            "typeof" => TypeScriptSyntaxKind::Typeof,
376            "undefined" => TypeScriptSyntaxKind::Undefined,
377            "unique" => TypeScriptSyntaxKind::Unique,
378            "unknown" => TypeScriptSyntaxKind::Unknown,
379            "var" => TypeScriptSyntaxKind::Var,
380            "void" => TypeScriptSyntaxKind::Void,
381            "while" => TypeScriptSyntaxKind::While,
382            "with" => TypeScriptSyntaxKind::With,
383            "yield" => TypeScriptSyntaxKind::Yield,
384            _ => TypeScriptSyntaxKind::IdentifierName,
385        }
386    }
387
388    fn lex_operator_or_punctuation<S: Source>(&self, state: &mut State<S>) -> bool {
389        let start = state.get_position();
390        let rest = state.rest();
391
392        // 三字符操作符
393        if rest.starts_with("===") {
394            state.advance(3);
395            state.add_token(TypeScriptSyntaxKind::EqualEqualEqual, start, state.get_position());
396            return true;
397        }
398        if rest.starts_with("!==") {
399            state.advance(3);
400            state.add_token(TypeScriptSyntaxKind::NotEqualEqual, start, state.get_position());
401            return true;
402        }
403        if rest.starts_with(">>>") {
404            state.advance(3);
405            state.add_token(TypeScriptSyntaxKind::UnsignedRightShift, start, state.get_position());
406            return true;
407        }
408        if rest.starts_with("...") {
409            state.advance(3);
410            state.add_token(TypeScriptSyntaxKind::DotDotDot, start, state.get_position());
411            return true;
412        }
413
414        // 双字符操作符
415        if rest.starts_with("**") {
416            state.advance(2);
417            state.add_token(TypeScriptSyntaxKind::StarStar, start, state.get_position());
418            return true;
419        }
420        if rest.starts_with("<=") {
421            state.advance(2);
422            state.add_token(TypeScriptSyntaxKind::LessEqual, start, state.get_position());
423            return true;
424        }
425        if rest.starts_with(">=") {
426            state.advance(2);
427            state.add_token(TypeScriptSyntaxKind::GreaterEqual, start, state.get_position());
428            return true;
429        }
430        if rest.starts_with("==") {
431            state.advance(2);
432            state.add_token(TypeScriptSyntaxKind::EqualEqual, start, state.get_position());
433            return true;
434        }
435        if rest.starts_with("!=") {
436            state.advance(2);
437            state.add_token(TypeScriptSyntaxKind::NotEqual, start, state.get_position());
438            return true;
439        }
440        if rest.starts_with("&&") {
441            state.advance(2);
442            state.add_token(TypeScriptSyntaxKind::AmpersandAmpersand, start, state.get_position());
443            return true;
444        }
445        if rest.starts_with("||") {
446            state.advance(2);
447            state.add_token(TypeScriptSyntaxKind::PipePipe, start, state.get_position());
448            return true;
449        }
450        if rest.starts_with("<<") {
451            state.advance(2);
452            state.add_token(TypeScriptSyntaxKind::LeftShift, start, state.get_position());
453            return true;
454        }
455        if rest.starts_with(">>") {
456            state.advance(2);
457            state.add_token(TypeScriptSyntaxKind::RightShift, start, state.get_position());
458            return true;
459        }
460        if rest.starts_with("++") {
461            state.advance(2);
462            state.add_token(TypeScriptSyntaxKind::PlusPlus, start, state.get_position());
463            return true;
464        }
465        if rest.starts_with("--") {
466            state.advance(2);
467            state.add_token(TypeScriptSyntaxKind::MinusMinus, start, state.get_position());
468            return true;
469        }
470        if rest.starts_with("=>") {
471            state.advance(2);
472            state.add_token(TypeScriptSyntaxKind::Arrow, start, state.get_position());
473            return true;
474        }
475        if rest.starts_with("?.") {
476            state.advance(2);
477            state.add_token(TypeScriptSyntaxKind::QuestionDot, start, state.get_position());
478            return true;
479        }
480        if rest.starts_with("??") {
481            state.advance(2);
482            state.add_token(TypeScriptSyntaxKind::QuestionQuestion, start, state.get_position());
483            return true;
484        }
485
486        // 单字符操作符
487        if let Some(ch) = state.peek() {
488            let kind = match ch {
489                '+' => TypeScriptSyntaxKind::Plus,
490                '-' => TypeScriptSyntaxKind::Minus,
491                '*' => TypeScriptSyntaxKind::Star,
492                '/' => TypeScriptSyntaxKind::Slash,
493                '%' => TypeScriptSyntaxKind::Percent,
494                '<' => TypeScriptSyntaxKind::Less,
495                '>' => TypeScriptSyntaxKind::Greater,
496                '!' => TypeScriptSyntaxKind::Exclamation,
497                '&' => TypeScriptSyntaxKind::Ampersand,
498                '|' => TypeScriptSyntaxKind::Pipe,
499                '^' => TypeScriptSyntaxKind::Caret,
500                '~' => TypeScriptSyntaxKind::Tilde,
501                '=' => TypeScriptSyntaxKind::Equal,
502                '?' => TypeScriptSyntaxKind::Question,
503                '(' => TypeScriptSyntaxKind::LeftParen,
504                ')' => TypeScriptSyntaxKind::RightParen,
505                '{' => TypeScriptSyntaxKind::LeftBrace,
506                '}' => TypeScriptSyntaxKind::RightBrace,
507                '[' => TypeScriptSyntaxKind::LeftBracket,
508                ']' => TypeScriptSyntaxKind::RightBracket,
509                ';' => TypeScriptSyntaxKind::Semicolon,
510                ',' => TypeScriptSyntaxKind::Comma,
511                '.' => TypeScriptSyntaxKind::Dot,
512                ':' => TypeScriptSyntaxKind::Colon,
513                _ => return false,
514            };
515
516            state.advance(1);
517            state.add_token(kind, start, state.get_position());
518            return true;
519        }
520
521        false
522    }
523}