oak_typescript/lexer/
mod.rs

1use crate::{kind::TypeScriptSyntaxKind, language::TypeScriptLanguage};
2use oak_core::{Lexer, LexerCache, LexerState, OakError, TextEdit, lexer::LexOutput, source::Source};
3
4#[derive(Clone)]
5pub struct TypeScriptLexer<'config> {
6    _config: &'config TypeScriptLanguage,
7}
8
9type State<'a, S> = LexerState<'a, S, TypeScriptLanguage>;
10
11impl<'config> TypeScriptLexer<'config> {
12    pub fn new(config: &'config TypeScriptLanguage) -> Self {
13        Self { _config: config }
14    }
15}
16
17impl<'config> Lexer<TypeScriptLanguage> for TypeScriptLexer<'config> {
18    fn lex<'a, S: Source + ?Sized>(&self, text: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<TypeScriptLanguage>) -> LexOutput<TypeScriptLanguage> {
19        let mut state: State<'_, S> = LexerState::new(text);
20        let result = self.run(&mut state);
21        if result.is_ok() {
22            state.add_eof();
23        }
24        state.finish_with_cache(result, cache)
25    }
26}
27
28impl<'config> TypeScriptLexer<'config> {
29    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
30        while state.not_at_end() {
31            let safe_point = state.get_position();
32
33            if self.skip_whitespace(state) {
34                continue;
35            }
36
37            if self.lex_newline(state) {
38                continue;
39            }
40
41            if self.skip_comment(state) {
42                continue;
43            }
44
45            if self.lex_string_literal(state) {
46                continue;
47            }
48
49            if self.lex_template_literal(state) {
50                continue;
51            }
52
53            if self.lex_numeric_literal(state) {
54                continue;
55            }
56
57            if self.lex_identifier_or_keyword(state) {
58                continue;
59            }
60
61            if self.lex_operator_or_punctuation(state) {
62                continue;
63            }
64
65            // 如果所有规则都不匹配,跳过当前字符并标记为错误
66            let start_pos = state.get_position();
67            if let Some(ch) = state.peek() {
68                state.advance(ch.len_utf8());
69                state.add_token(TypeScriptSyntaxKind::Error, start_pos, state.get_position());
70            }
71
72            state.advance_if_dead_lock(safe_point);
73        }
74
75        Ok(())
76    }
77
78    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
79        let start = state.get_position();
80        let mut found = false;
81
82        while let Some(ch) = state.peek() {
83            if ch == ' ' || ch == '\t' {
84                state.advance(ch.len_utf8());
85                found = true;
86            }
87            else {
88                break;
89            }
90        }
91
92        if found {
93            state.add_token(TypeScriptSyntaxKind::Whitespace, start, state.get_position());
94        }
95
96        found
97    }
98
99    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
100        let start = state.get_position();
101
102        if let Some(ch) = state.peek() {
103            if ch == '\n' {
104                state.advance(1);
105                state.add_token(TypeScriptSyntaxKind::Newline, start, state.get_position());
106                return true;
107            }
108            else if ch == '\r' {
109                state.advance(1);
110                if state.peek() == Some('\n') {
111                    state.advance(1);
112                }
113                state.add_token(TypeScriptSyntaxKind::Newline, start, state.get_position());
114                return true;
115            }
116        }
117
118        false
119    }
120
121    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
122        let start = state.get_position();
123        let rest = state.rest();
124
125        // 行注释: // ...
126        if rest.starts_with("//") {
127            state.advance(2);
128            while let Some(ch) = state.peek() {
129                if ch == '\n' || ch == '\r' {
130                    break;
131                }
132                state.advance(ch.len_utf8());
133            }
134            state.add_token(TypeScriptSyntaxKind::LineComment, start, state.get_position());
135            return true;
136        }
137
138        // 块注释: /* ... */
139        if rest.starts_with("/*") {
140            state.advance(2);
141            while let Some(ch) = state.peek() {
142                if ch == '*' && state.peek_next_n(1) == Some('/') {
143                    state.advance(2);
144                    break;
145                }
146                state.advance(ch.len_utf8());
147            }
148            state.add_token(TypeScriptSyntaxKind::BlockComment, start, state.get_position());
149            return true;
150        }
151
152        false
153    }
154
155    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
156        let start = state.get_position();
157
158        if let Some(quote) = state.peek() {
159            if quote == '"' || quote == '\'' {
160                state.advance(1);
161
162                while let Some(ch) = state.peek() {
163                    if ch == quote {
164                        state.advance(1);
165                        break;
166                    }
167                    else if ch == '\\' {
168                        state.advance(1);
169                        if let Some(_) = state.peek() {
170                            state.advance(1);
171                        }
172                    }
173                    else {
174                        state.advance(ch.len_utf8());
175                    }
176                }
177
178                state.add_token(TypeScriptSyntaxKind::StringLiteral, start, state.get_position());
179                return true;
180            }
181        }
182
183        false
184    }
185
186    fn lex_template_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
187        let start = state.get_position();
188
189        if state.peek() == Some('`') {
190            state.advance(1);
191
192            while let Some(ch) = state.peek() {
193                if ch == '`' {
194                    state.advance(1);
195                    break;
196                }
197                else if ch == '\\' {
198                    state.advance(1);
199                    if let Some(_) = state.peek() {
200                        state.advance(1);
201                    }
202                }
203                else {
204                    state.advance(ch.len_utf8());
205                }
206            }
207
208            state.add_token(TypeScriptSyntaxKind::TemplateString, start, state.get_position());
209            return true;
210        }
211
212        false
213    }
214
215    fn lex_numeric_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
216        let start = state.get_position();
217
218        if let Some(ch) = state.peek() {
219            if ch.is_ascii_digit() {
220                state.advance(1);
221
222                // 处理十六进制
223                if ch == '0' && (state.peek() == Some('x') || state.peek() == Some('X')) {
224                    state.advance(1);
225                    while let Some(ch) = state.peek() {
226                        if ch.is_ascii_hexdigit() {
227                            state.advance(1);
228                        }
229                        else {
230                            break;
231                        }
232                    }
233                }
234                else {
235                    // 处理十进制
236                    while let Some(ch) = state.peek() {
237                        if ch.is_ascii_digit() {
238                            state.advance(1);
239                        }
240                        else if ch == '.' && state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit()) {
241                            state.advance(1);
242                            while let Some(ch) = state.peek() {
243                                if ch.is_ascii_digit() {
244                                    state.advance(1);
245                                }
246                                else {
247                                    break;
248                                }
249                            }
250                            break;
251                        }
252                        else {
253                            break;
254                        }
255                    }
256                }
257
258                // 检查 BigInt 后缀
259                if state.peek() == Some('n') {
260                    state.advance(1);
261                    state.add_token(TypeScriptSyntaxKind::BigIntLiteral, start, state.get_position());
262                }
263                else {
264                    state.add_token(TypeScriptSyntaxKind::NumericLiteral, start, state.get_position());
265                }
266
267                return true;
268            }
269        }
270
271        false
272    }
273
274    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
275        let start = state.get_position();
276
277        if let Some(ch) = state.peek() {
278            if ch.is_alphabetic() || ch == '_' || ch == '$' {
279                state.advance(ch.len_utf8());
280
281                while let Some(ch) = state.peek() {
282                    if ch.is_alphanumeric() || ch == '_' || ch == '$' {
283                        state.advance(ch.len_utf8());
284                    }
285                    else {
286                        break;
287                    }
288                }
289
290                // 获取标识符文本并检查是否为关键字
291                let end = state.get_position();
292                let text = state.get_text_in(oak_core::Range { start, end });
293                let kind = self.keyword_or_identifier(&text);
294
295                state.add_token(kind, start, state.get_position());
296                return true;
297            }
298        }
299
300        false
301    }
302
303    fn keyword_or_identifier(&self, text: &str) -> TypeScriptSyntaxKind {
304        match text {
305            "abstract" => TypeScriptSyntaxKind::Abstract,
306            "any" => TypeScriptSyntaxKind::Any,
307            "as" => TypeScriptSyntaxKind::As,
308            "asserts" => TypeScriptSyntaxKind::Asserts,
309            "async" => TypeScriptSyntaxKind::Async,
310            "await" => TypeScriptSyntaxKind::Await,
311            "boolean" => TypeScriptSyntaxKind::Boolean,
312            "break" => TypeScriptSyntaxKind::Break,
313            "case" => TypeScriptSyntaxKind::Case,
314            "catch" => TypeScriptSyntaxKind::Catch,
315            "class" => TypeScriptSyntaxKind::Class,
316            "const" => TypeScriptSyntaxKind::Const,
317            "constructor" => TypeScriptSyntaxKind::Constructor,
318            "continue" => TypeScriptSyntaxKind::Continue,
319            "debugger" => TypeScriptSyntaxKind::Debugger,
320            "declare" => TypeScriptSyntaxKind::Declare,
321            "default" => TypeScriptSyntaxKind::Default,
322            "delete" => TypeScriptSyntaxKind::Delete,
323            "do" => TypeScriptSyntaxKind::Do,
324            "else" => TypeScriptSyntaxKind::Else,
325            "enum" => TypeScriptSyntaxKind::Enum,
326            "export" => TypeScriptSyntaxKind::Export,
327            "extends" => TypeScriptSyntaxKind::Extends,
328            "false" => TypeScriptSyntaxKind::False,
329            "finally" => TypeScriptSyntaxKind::Finally,
330            "for" => TypeScriptSyntaxKind::For,
331            "from" => TypeScriptSyntaxKind::From,
332            "function" => TypeScriptSyntaxKind::Function,
333            "get" => TypeScriptSyntaxKind::Get,
334            "global" => TypeScriptSyntaxKind::Global,
335            "if" => TypeScriptSyntaxKind::If,
336            "implements" => TypeScriptSyntaxKind::Implements,
337            "import" => TypeScriptSyntaxKind::Import,
338            "in" => TypeScriptSyntaxKind::In,
339            "infer" => TypeScriptSyntaxKind::Infer,
340            "instanceof" => TypeScriptSyntaxKind::Instanceof,
341            "interface" => TypeScriptSyntaxKind::Interface,
342            "is" => TypeScriptSyntaxKind::Is,
343            "keyof" => TypeScriptSyntaxKind::Keyof,
344            "let" => TypeScriptSyntaxKind::Let,
345            "namespace" => TypeScriptSyntaxKind::Namespace,
346            "never" => TypeScriptSyntaxKind::Never,
347            "new" => TypeScriptSyntaxKind::New,
348            "null" => TypeScriptSyntaxKind::Null,
349            "number" => TypeScriptSyntaxKind::Number,
350            "object" => TypeScriptSyntaxKind::Object,
351            "of" => TypeScriptSyntaxKind::Of,
352            "package" => TypeScriptSyntaxKind::Package,
353            "private" => TypeScriptSyntaxKind::Private,
354            "protected" => TypeScriptSyntaxKind::Protected,
355            "public" => TypeScriptSyntaxKind::Public,
356            "readonly" => TypeScriptSyntaxKind::Readonly,
357            "require" => TypeScriptSyntaxKind::Require,
358            "return" => TypeScriptSyntaxKind::Return,
359            "set" => TypeScriptSyntaxKind::Set,
360            "static" => TypeScriptSyntaxKind::Static,
361            "string" => TypeScriptSyntaxKind::String,
362            "super" => TypeScriptSyntaxKind::Super,
363            "switch" => TypeScriptSyntaxKind::Switch,
364            "symbol" => TypeScriptSyntaxKind::Symbol,
365            "this" => TypeScriptSyntaxKind::This,
366            "throw" => TypeScriptSyntaxKind::Throw,
367            "true" => TypeScriptSyntaxKind::True,
368            "try" => TypeScriptSyntaxKind::Try,
369            "type" => TypeScriptSyntaxKind::Type,
370            "typeof" => TypeScriptSyntaxKind::Typeof,
371            "undefined" => TypeScriptSyntaxKind::Undefined,
372            "unique" => TypeScriptSyntaxKind::Unique,
373            "unknown" => TypeScriptSyntaxKind::Unknown,
374            "var" => TypeScriptSyntaxKind::Var,
375            "void" => TypeScriptSyntaxKind::Void,
376            "while" => TypeScriptSyntaxKind::While,
377            "with" => TypeScriptSyntaxKind::With,
378            "yield" => TypeScriptSyntaxKind::Yield,
379            _ => TypeScriptSyntaxKind::IdentifierName,
380        }
381    }
382
383    fn lex_operator_or_punctuation<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
384        let start = state.get_position();
385        let rest = state.rest();
386
387        // 三字符操作符
388        if rest.starts_with("===") {
389            state.advance(3);
390            state.add_token(TypeScriptSyntaxKind::EqualEqualEqual, start, state.get_position());
391            return true;
392        }
393        if rest.starts_with("!==") {
394            state.advance(3);
395            state.add_token(TypeScriptSyntaxKind::NotEqualEqual, start, state.get_position());
396            return true;
397        }
398        if rest.starts_with(">>>") {
399            state.advance(3);
400            state.add_token(TypeScriptSyntaxKind::UnsignedRightShift, start, state.get_position());
401            return true;
402        }
403        if rest.starts_with("...") {
404            state.advance(3);
405            state.add_token(TypeScriptSyntaxKind::DotDotDot, start, state.get_position());
406            return true;
407        }
408
409        // 双字符操作符
410        if rest.starts_with("**") {
411            state.advance(2);
412            state.add_token(TypeScriptSyntaxKind::StarStar, start, state.get_position());
413            return true;
414        }
415        if rest.starts_with("<=") {
416            state.advance(2);
417            state.add_token(TypeScriptSyntaxKind::LessEqual, start, state.get_position());
418            return true;
419        }
420        if rest.starts_with(">=") {
421            state.advance(2);
422            state.add_token(TypeScriptSyntaxKind::GreaterEqual, start, state.get_position());
423            return true;
424        }
425        if rest.starts_with("==") {
426            state.advance(2);
427            state.add_token(TypeScriptSyntaxKind::EqualEqual, start, state.get_position());
428            return true;
429        }
430        if rest.starts_with("!=") {
431            state.advance(2);
432            state.add_token(TypeScriptSyntaxKind::NotEqual, start, state.get_position());
433            return true;
434        }
435        if rest.starts_with("&&") {
436            state.advance(2);
437            state.add_token(TypeScriptSyntaxKind::AmpersandAmpersand, start, state.get_position());
438            return true;
439        }
440        if rest.starts_with("||") {
441            state.advance(2);
442            state.add_token(TypeScriptSyntaxKind::PipePipe, start, state.get_position());
443            return true;
444        }
445        if rest.starts_with("<<") {
446            state.advance(2);
447            state.add_token(TypeScriptSyntaxKind::LeftShift, start, state.get_position());
448            return true;
449        }
450        if rest.starts_with(">>") {
451            state.advance(2);
452            state.add_token(TypeScriptSyntaxKind::RightShift, start, state.get_position());
453            return true;
454        }
455        if rest.starts_with("++") {
456            state.advance(2);
457            state.add_token(TypeScriptSyntaxKind::PlusPlus, start, state.get_position());
458            return true;
459        }
460        if rest.starts_with("--") {
461            state.advance(2);
462            state.add_token(TypeScriptSyntaxKind::MinusMinus, start, state.get_position());
463            return true;
464        }
465        if rest.starts_with("=>") {
466            state.advance(2);
467            state.add_token(TypeScriptSyntaxKind::Arrow, start, state.get_position());
468            return true;
469        }
470        if rest.starts_with("?.") {
471            state.advance(2);
472            state.add_token(TypeScriptSyntaxKind::QuestionDot, start, state.get_position());
473            return true;
474        }
475        if rest.starts_with("??") {
476            state.advance(2);
477            state.add_token(TypeScriptSyntaxKind::QuestionQuestion, start, state.get_position());
478            return true;
479        }
480
481        // 单字符操作符
482        if let Some(ch) = state.peek() {
483            let kind = match ch {
484                '+' => TypeScriptSyntaxKind::Plus,
485                '-' => TypeScriptSyntaxKind::Minus,
486                '*' => TypeScriptSyntaxKind::Star,
487                '/' => TypeScriptSyntaxKind::Slash,
488                '%' => TypeScriptSyntaxKind::Percent,
489                '<' => TypeScriptSyntaxKind::Less,
490                '>' => TypeScriptSyntaxKind::Greater,
491                '!' => TypeScriptSyntaxKind::Exclamation,
492                '&' => TypeScriptSyntaxKind::Ampersand,
493                '|' => TypeScriptSyntaxKind::Pipe,
494                '^' => TypeScriptSyntaxKind::Caret,
495                '~' => TypeScriptSyntaxKind::Tilde,
496                '=' => TypeScriptSyntaxKind::Equal,
497                '?' => TypeScriptSyntaxKind::Question,
498                '(' => TypeScriptSyntaxKind::LeftParen,
499                ')' => TypeScriptSyntaxKind::RightParen,
500                '{' => TypeScriptSyntaxKind::LeftBrace,
501                '}' => TypeScriptSyntaxKind::RightBrace,
502                '[' => TypeScriptSyntaxKind::LeftBracket,
503                ']' => TypeScriptSyntaxKind::RightBracket,
504                ';' => TypeScriptSyntaxKind::Semicolon,
505                ',' => TypeScriptSyntaxKind::Comma,
506                '.' => TypeScriptSyntaxKind::Dot,
507                ':' => TypeScriptSyntaxKind::Colon,
508                _ => return false,
509            };
510
511            state.advance(1);
512            state.add_token(kind, start, state.get_position());
513            return true;
514        }
515
516        false
517    }
518}