Skip to main content

oak_typescript/lexer/
mod.rs

1use crate::{kind::TypeScriptSyntaxKind, language::TypeScriptLanguage};
2use oak_core::{Lexer, LexerCache, LexerState, OakError, TextEdit, lexer::LexOutput, source::Source};
3
4#[derive(Clone, Debug)]
5pub struct TypeScriptLexer<'config> {
6    _config: &'config TypeScriptLanguage,
7}
8
9type State<'a, S> = LexerState<'a, S, TypeScriptLanguage>;
10
11impl<'config> TypeScriptLexer<'config> {
12    pub fn new(config: &'config TypeScriptLanguage) -> Self {
13        Self { _config: config }
14    }
15}
16
17impl<'config> Lexer<TypeScriptLanguage> for TypeScriptLexer<'config> {
18    fn lex<'a, S: Source + ?Sized>(&self, text: &S, edits: &[TextEdit], cache: &'a mut impl LexerCache<TypeScriptLanguage>) -> LexOutput<TypeScriptLanguage> {
19        let relex_from = edits.iter().map(|e| e.span.start).min().unwrap_or(0);
20        let mut state: State<'_, S> = LexerState::new_with_cache(text, relex_from, cache);
21
22        let result = self.run(&mut state);
23        if result.is_ok() {
24            state.add_eof();
25        }
26        state.finish_with_cache(result, cache)
27    }
28}
29
30impl<'config> TypeScriptLexer<'config> {
31    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
32        while state.not_at_end() {
33            let safe_point = state.get_position();
34
35            if self.skip_whitespace(state) {
36                continue;
37            }
38
39            if self.lex_newline(state) {
40                continue;
41            }
42
43            if self.skip_comment(state) {
44                continue;
45            }
46
47            if self.lex_string_literal(state) {
48                continue;
49            }
50
51            if self.lex_template_literal(state) {
52                continue;
53            }
54
55            if self.lex_numeric_literal(state) {
56                continue;
57            }
58
59            if self.lex_identifier_or_keyword(state) {
60                continue;
61            }
62
63            if self.lex_operator_or_punctuation(state) {
64                continue;
65            }
66
67            // 如果所有规则都不匹配,跳过当前字符并标记为错误
68            let start_pos = state.get_position();
69            if let Some(ch) = state.peek() {
70                state.advance(ch.len_utf8());
71                state.add_token(TypeScriptSyntaxKind::Error, start_pos, state.get_position());
72            }
73
74            state.advance_if_dead_lock(safe_point);
75        }
76
77        Ok(())
78    }
79
80    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
81        let start = state.get_position();
82        let mut found = false;
83
84        while let Some(ch) = state.peek() {
85            if ch == ' ' || ch == '\t' {
86                state.advance(ch.len_utf8());
87                found = true;
88            }
89            else {
90                break;
91            }
92        }
93
94        if found {
95            state.add_token(TypeScriptSyntaxKind::Whitespace, start, state.get_position());
96        }
97
98        found
99    }
100
101    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
102        let start = state.get_position();
103
104        if let Some(ch) = state.peek() {
105            if ch == '\n' {
106                state.advance(1);
107                state.add_token(TypeScriptSyntaxKind::Newline, start, state.get_position());
108                return true;
109            }
110            else if ch == '\r' {
111                state.advance(1);
112                if state.peek() == Some('\n') {
113                    state.advance(1);
114                }
115                state.add_token(TypeScriptSyntaxKind::Newline, start, state.get_position());
116                return true;
117            }
118        }
119
120        false
121    }
122
123    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
124        let start = state.get_position();
125        let rest = state.rest();
126
127        // 行注释: // ...
128        if rest.starts_with("//") {
129            state.advance(2);
130            while let Some(ch) = state.peek() {
131                if ch == '\n' || ch == '\r' {
132                    break;
133                }
134                state.advance(ch.len_utf8());
135            }
136            state.add_token(TypeScriptSyntaxKind::LineComment, start, state.get_position());
137            return true;
138        }
139
140        // 块注释: /* ... */
141        if rest.starts_with("/*") {
142            state.advance(2);
143            while let Some(ch) = state.peek() {
144                if ch == '*' && state.peek_next_n(1) == Some('/') {
145                    state.advance(2);
146                    break;
147                }
148                state.advance(ch.len_utf8());
149            }
150            state.add_token(TypeScriptSyntaxKind::BlockComment, start, state.get_position());
151            return true;
152        }
153
154        false
155    }
156
157    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
158        let start = state.get_position();
159
160        if let Some(quote) = state.peek() {
161            if quote == '"' || quote == '\'' {
162                state.advance(1);
163
164                while let Some(ch) = state.peek() {
165                    if ch == quote {
166                        state.advance(1);
167                        break;
168                    }
169                    else if ch == '\\' {
170                        state.advance(1);
171                        if let Some(_) = state.peek() {
172                            state.advance(1);
173                        }
174                    }
175                    else {
176                        state.advance(ch.len_utf8());
177                    }
178                }
179
180                state.add_token(TypeScriptSyntaxKind::StringLiteral, start, state.get_position());
181                return true;
182            }
183        }
184
185        false
186    }
187
188    fn lex_template_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
189        let start = state.get_position();
190
191        if state.peek() == Some('`') {
192            state.advance(1);
193
194            while let Some(ch) = state.peek() {
195                if ch == '`' {
196                    state.advance(1);
197                    break;
198                }
199                else if ch == '\\' {
200                    state.advance(1);
201                    if let Some(_) = state.peek() {
202                        state.advance(1);
203                    }
204                }
205                else {
206                    state.advance(ch.len_utf8());
207                }
208            }
209
210            state.add_token(TypeScriptSyntaxKind::TemplateString, start, state.get_position());
211            return true;
212        }
213
214        false
215    }
216
217    fn lex_numeric_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
218        let start = state.get_position();
219
220        if let Some(ch) = state.peek() {
221            if ch.is_ascii_digit() {
222                state.advance(1);
223
224                // 处理十六进制
225                if ch == '0' && (state.peek() == Some('x') || state.peek() == Some('X')) {
226                    state.advance(1);
227                    while let Some(ch) = state.peek() {
228                        if ch.is_ascii_hexdigit() {
229                            state.advance(1);
230                        }
231                        else {
232                            break;
233                        }
234                    }
235                }
236                else {
237                    // 处理十进制
238                    while let Some(ch) = state.peek() {
239                        if ch.is_ascii_digit() {
240                            state.advance(1);
241                        }
242                        else if ch == '.' && state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit()) {
243                            state.advance(1);
244                            while let Some(ch) = state.peek() {
245                                if ch.is_ascii_digit() {
246                                    state.advance(1);
247                                }
248                                else {
249                                    break;
250                                }
251                            }
252                            break;
253                        }
254                        else {
255                            break;
256                        }
257                    }
258                }
259
260                // 检查 BigInt 后缀
261                if state.peek() == Some('n') {
262                    state.advance(1);
263                    state.add_token(TypeScriptSyntaxKind::BigIntLiteral, start, state.get_position());
264                }
265                else {
266                    state.add_token(TypeScriptSyntaxKind::NumericLiteral, start, state.get_position());
267                }
268
269                return true;
270            }
271        }
272
273        false
274    }
275
276    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
277        let start = state.get_position();
278
279        if let Some(ch) = state.peek() {
280            if ch.is_alphabetic() || ch == '_' || ch == '$' {
281                state.advance(ch.len_utf8());
282
283                while let Some(ch) = state.peek() {
284                    if ch.is_alphanumeric() || ch == '_' || ch == '$' {
285                        state.advance(ch.len_utf8());
286                    }
287                    else {
288                        break;
289                    }
290                }
291
292                // 获取标识符文本并检查是否为关键字
293                let end = state.get_position();
294                let text = state.get_text_in(oak_core::Range { start, end });
295                let kind = self.keyword_or_identifier(&text);
296
297                state.add_token(kind, start, state.get_position());
298                return true;
299            }
300        }
301
302        false
303    }
304
305    fn keyword_or_identifier(&self, text: &str) -> TypeScriptSyntaxKind {
306        TypeScriptSyntaxKind::from_keyword(text).unwrap_or(TypeScriptSyntaxKind::IdentifierName)
307    }
308
309    fn lex_operator_or_punctuation<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
310        let start = state.get_position();
311        let rest = state.rest();
312
313        let ops = [
314            ("===", TypeScriptSyntaxKind::EqualEqualEqual),
315            ("!==", TypeScriptSyntaxKind::NotEqualEqual),
316            (">>>", TypeScriptSyntaxKind::UnsignedRightShift),
317            ("...", TypeScriptSyntaxKind::DotDotDot),
318            ("**=", TypeScriptSyntaxKind::StarStarEqual),
319            ("<<=", TypeScriptSyntaxKind::LeftShiftEqual),
320            (">>=", TypeScriptSyntaxKind::RightShiftEqual),
321            ("&&=", TypeScriptSyntaxKind::AmpersandAmpersandEqual),
322            ("||=", TypeScriptSyntaxKind::PipePipeEqual),
323            ("??=", TypeScriptSyntaxKind::QuestionQuestionEqual),
324            ("**", TypeScriptSyntaxKind::StarStar),
325            ("<=", TypeScriptSyntaxKind::LessEqual),
326            (">=", TypeScriptSyntaxKind::GreaterEqual),
327            ("==", TypeScriptSyntaxKind::EqualEqual),
328            ("!=", TypeScriptSyntaxKind::NotEqual),
329            ("&&", TypeScriptSyntaxKind::AmpersandAmpersand),
330            ("||", TypeScriptSyntaxKind::PipePipe),
331            ("<<", TypeScriptSyntaxKind::LeftShift),
332            (">>", TypeScriptSyntaxKind::RightShift),
333            ("++", TypeScriptSyntaxKind::PlusPlus),
334            ("--", TypeScriptSyntaxKind::MinusMinus),
335            ("=>", TypeScriptSyntaxKind::Arrow),
336            ("?.", TypeScriptSyntaxKind::QuestionDot),
337            ("??", TypeScriptSyntaxKind::QuestionQuestion),
338            ("+=", TypeScriptSyntaxKind::PlusEqual),
339            ("-=", TypeScriptSyntaxKind::MinusEqual),
340            ("*=", TypeScriptSyntaxKind::StarEqual),
341            ("/=", TypeScriptSyntaxKind::SlashEqual),
342            ("%=", TypeScriptSyntaxKind::PercentEqual),
343            ("&=", TypeScriptSyntaxKind::AmpersandEqual),
344            ("|=", TypeScriptSyntaxKind::PipeEqual),
345            ("^=", TypeScriptSyntaxKind::CaretEqual),
346        ];
347
348        for (op, kind) in ops {
349            if rest.starts_with(op) {
350                state.advance(op.len());
351                state.add_token(kind, start, state.get_position());
352                return true;
353            }
354        }
355
356        if let Some(ch) = state.peek() {
357            let kind = match ch {
358                '+' => TypeScriptSyntaxKind::Plus,
359                '-' => TypeScriptSyntaxKind::Minus,
360                '*' => TypeScriptSyntaxKind::Star,
361                '/' => TypeScriptSyntaxKind::Slash,
362                '%' => TypeScriptSyntaxKind::Percent,
363                '<' => TypeScriptSyntaxKind::Less,
364                '>' => TypeScriptSyntaxKind::Greater,
365                '!' => TypeScriptSyntaxKind::Exclamation,
366                '&' => TypeScriptSyntaxKind::Ampersand,
367                '|' => TypeScriptSyntaxKind::Pipe,
368                '^' => TypeScriptSyntaxKind::Caret,
369                '~' => TypeScriptSyntaxKind::Tilde,
370                '=' => TypeScriptSyntaxKind::Equal,
371                '?' => TypeScriptSyntaxKind::Question,
372                '(' => TypeScriptSyntaxKind::LeftParen,
373                ')' => TypeScriptSyntaxKind::RightParen,
374                '{' => TypeScriptSyntaxKind::LeftBrace,
375                '}' => TypeScriptSyntaxKind::RightBrace,
376                '[' => TypeScriptSyntaxKind::LeftBracket,
377                ']' => TypeScriptSyntaxKind::RightBracket,
378                ';' => TypeScriptSyntaxKind::Semicolon,
379                ',' => TypeScriptSyntaxKind::Comma,
380                '.' => TypeScriptSyntaxKind::Dot,
381                ':' => TypeScriptSyntaxKind::Colon,
382                _ => return false,
383            };
384
385            state.advance(1);
386            state.add_token(kind, start, state.get_position());
387            return true;
388        }
389
390        false
391    }
392}