Skip to main content

// oak_typescript/lexer/mod.rs

1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4pub use self::token_type::TypeScriptTokenType;
5use crate::language::TypeScriptLanguage;
6use oak_core::{Lexer, LexerCache, LexerState, OakError, TextEdit, lexer::LexOutput, source::Source};
7
8#[derive(Clone, Debug)]
9pub struct TypeScriptLexer<'config> {
10    _config: &'config TypeScriptLanguage,
11}
12
13type State<'a, S> = LexerState<'a, S, TypeScriptLanguage>;
14
15impl<'config> TypeScriptLexer<'config> {
16    pub fn new(config: &'config TypeScriptLanguage) -> Self {
17        Self { _config: config }
18    }
19}
20
21impl<'config> Lexer<TypeScriptLanguage> for TypeScriptLexer<'config> {
22    fn lex<'a, S: Source + ?Sized>(&self, text: &S, edits: &[TextEdit], cache: &'a mut impl LexerCache<TypeScriptLanguage>) -> LexOutput<TypeScriptLanguage> {
23        let relex_from = edits.iter().map(|e| e.span.start).min().unwrap_or(0);
24        let mut state: State<'_, S> = LexerState::new_with_cache(text, relex_from, cache);
25
26        let result = self.run(&mut state);
27        if result.is_ok() {
28            state.add_eof()
29        }
30        state.finish_with_cache(result, cache)
31    }
32}
33
34impl<'config> TypeScriptLexer<'config> {
35    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
36        while state.not_at_end() {
37            let safe_point = state.get_position();
38
39            if self.skip_whitespace(state) {
40                continue;
41            }
42
43            if self.lex_newline(state) {
44                continue;
45            }
46
47            if self.skip_comment(state) {
48                continue;
49            }
50
51            if self.lex_string_literal(state) {
52                continue;
53            }
54
55            if self.lex_template_literal(state) {
56                continue;
57            }
58
59            if self.lex_numeric_literal(state) {
60                continue;
61            }
62
63            if self.lex_identifier_or_keyword(state) {
64                continue;
65            }
66
67            if self.lex_operator_or_punctuation(state) {
68                continue;
69            }
70
71            // 如果所有规则都不匹配,跳过当前字符并标记为错误
72            let start_pos = state.get_position();
73            if let Some(ch) = state.peek() {
74                state.advance(ch.len_utf8());
75                state.add_token(TypeScriptTokenType::Error, start_pos, state.get_position());
76            }
77
78            state.advance_if_dead_lock(safe_point);
79        }
80
81        Ok(())
82    }
83
84    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
85        let start = state.get_position();
86        let mut found = false;
87
88        while let Some(ch) = state.peek() {
89            if ch == ' ' || ch == '\t' {
90                state.advance(ch.len_utf8());
91                found = true;
92            }
93            else {
94                break;
95            }
96        }
97
98        if found {
99            state.add_token(TypeScriptTokenType::Whitespace, start, state.get_position());
100        }
101
102        found
103    }
104
105    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
106        let start = state.get_position();
107
108        if let Some(ch) = state.peek() {
109            if ch == '\n' {
110                state.advance(1);
111                state.add_token(TypeScriptTokenType::Newline, start, state.get_position());
112                return true;
113            }
114            else if ch == '\r' {
115                state.advance(1);
116                if state.peek() == Some('\n') {
117                    state.advance(1);
118                }
119                state.add_token(TypeScriptTokenType::Newline, start, state.get_position());
120                return true;
121            }
122        }
123
124        false
125    }
126
127    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
128        let start = state.get_position();
129        let rest = state.rest();
130
131        // 行注释: // ...
132        if rest.starts_with("//") {
133            state.advance(2);
134            while let Some(ch) = state.peek() {
135                if ch == '\n' || ch == '\r' {
136                    break;
137                }
138                state.advance(ch.len_utf8());
139            }
140            state.add_token(TypeScriptTokenType::LineComment, start, state.get_position());
141            return true;
142        }
143
144        // 块注释: /* ... */
145        if rest.starts_with("/*") {
146            state.advance(2);
147            while let Some(ch) = state.peek() {
148                if ch == '*' && state.peek_next_n(1) == Some('/') {
149                    state.advance(2);
150                    break;
151                }
152                state.advance(ch.len_utf8());
153            }
154            state.add_token(TypeScriptTokenType::BlockComment, start, state.get_position());
155            return true;
156        }
157
158        false
159    }
160
161    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
162        let start = state.get_position();
163
164        if let Some(quote) = state.peek() {
165            if quote == '"' || quote == '\'' {
166                state.advance(1);
167
168                while let Some(ch) = state.peek() {
169                    if ch == quote {
170                        state.advance(1);
171                        break;
172                    }
173                    else if ch == '\\' {
174                        state.advance(1);
175                        if let Some(_) = state.peek() {
176                            state.advance(1);
177                        }
178                    }
179                    else {
180                        state.advance(ch.len_utf8());
181                    }
182                }
183
184                state.add_token(TypeScriptTokenType::StringLiteral, start, state.get_position());
185                return true;
186            }
187        }
188
189        false
190    }
191
192    fn lex_template_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
193        let start = state.get_position();
194
195        if state.peek() == Some('`') {
196            state.advance(1);
197
198            while let Some(ch) = state.peek() {
199                if ch == '`' {
200                    state.advance(1);
201                    break;
202                }
203                else if ch == '\\' {
204                    state.advance(1);
205                    if let Some(_) = state.peek() {
206                        state.advance(1);
207                    }
208                }
209                else {
210                    state.advance(ch.len_utf8());
211                }
212            }
213
214            state.add_token(TypeScriptTokenType::TemplateString, start, state.get_position());
215            return true;
216        }
217
218        false
219    }
220
221    fn lex_numeric_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
222        let start = state.get_position();
223
224        if let Some(ch) = state.peek() {
225            if ch.is_ascii_digit() {
226                state.advance(1);
227
228                // 处理十六进制
229                if ch == '0' && (state.peek() == Some('x') || state.peek() == Some('X')) {
230                    state.advance(1);
231                    while let Some(ch) = state.peek() {
232                        if ch.is_ascii_hexdigit() {
233                            state.advance(1);
234                        }
235                        else {
236                            break;
237                        }
238                    }
239                }
240                else {
241                    // 处理十进制
242                    while let Some(ch) = state.peek() {
243                        if ch.is_ascii_digit() {
244                            state.advance(1);
245                        }
246                        else if ch == '.' && state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit()) {
247                            state.advance(1);
248                            while let Some(ch) = state.peek() {
249                                if ch.is_ascii_digit() {
250                                    state.advance(1);
251                                }
252                                else {
253                                    break;
254                                }
255                            }
256                            break;
257                        }
258                        else {
259                            break;
260                        }
261                    }
262                }
263
264                // 检查 BigInt 后缀
265                if state.peek() == Some('n') {
266                    state.advance(1);
267                    state.add_token(TypeScriptTokenType::BigIntLiteral, start, state.get_position());
268                }
269                else {
270                    state.add_token(TypeScriptTokenType::NumericLiteral, start, state.get_position());
271                }
272
273                return true;
274            }
275        }
276
277        false
278    }
279
280    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
281        let start = state.get_position();
282
283        if let Some(ch) = state.peek() {
284            if ch.is_alphabetic() || ch == '_' || ch == '$' {
285                state.advance(ch.len_utf8());
286
287                while let Some(ch) = state.peek() {
288                    if ch.is_alphanumeric() || ch == '_' || ch == '$' {
289                        state.advance(ch.len_utf8());
290                    }
291                    else {
292                        break;
293                    }
294                }
295
296                // 获取标识符文本并检查是否为关键字
297                let end = state.get_position();
298                let text = state.get_text_in(oak_core::Range { start, end });
299                let kind = self.keyword_or_identifier(&text);
300
301                state.add_token(kind, start, state.get_position());
302                return true;
303            }
304        }
305
306        false
307    }
308
309    fn keyword_or_identifier(&self, text: &str) -> TypeScriptTokenType {
310        TypeScriptTokenType::from_keyword(text).unwrap_or(TypeScriptTokenType::IdentifierName)
311    }
312
313    fn lex_operator_or_punctuation<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
314        let start = state.get_position();
315        let rest = state.rest();
316
317        let ops = [
318            ("===", TypeScriptTokenType::EqualEqualEqual),
319            ("!==", TypeScriptTokenType::NotEqualEqual),
320            (">>>", TypeScriptTokenType::UnsignedRightShift),
321            ("...", TypeScriptTokenType::DotDotDot),
322            ("**=", TypeScriptTokenType::StarStarEqual),
323            ("<<=", TypeScriptTokenType::LeftShiftEqual),
324            (">>=", TypeScriptTokenType::RightShiftEqual),
325            ("&&=", TypeScriptTokenType::AmpersandAmpersandEqual),
326            ("||=", TypeScriptTokenType::PipePipeEqual),
327            ("??=", TypeScriptTokenType::QuestionQuestionEqual),
328            ("**", TypeScriptTokenType::StarStar),
329            ("<=", TypeScriptTokenType::LessEqual),
330            (">=", TypeScriptTokenType::GreaterEqual),
331            ("==", TypeScriptTokenType::EqualEqual),
332            ("!=", TypeScriptTokenType::NotEqual),
333            ("&&", TypeScriptTokenType::AmpersandAmpersand),
334            ("||", TypeScriptTokenType::PipePipe),
335            ("<<", TypeScriptTokenType::LeftShift),
336            (">>", TypeScriptTokenType::RightShift),
337            ("++", TypeScriptTokenType::PlusPlus),
338            ("--", TypeScriptTokenType::MinusMinus),
339            ("=>", TypeScriptTokenType::Arrow),
340            ("?.", TypeScriptTokenType::QuestionDot),
341            ("??", TypeScriptTokenType::QuestionQuestion),
342            ("+=", TypeScriptTokenType::PlusEqual),
343            ("-=", TypeScriptTokenType::MinusEqual),
344            ("*=", TypeScriptTokenType::StarEqual),
345            ("/=", TypeScriptTokenType::SlashEqual),
346            ("%=", TypeScriptTokenType::PercentEqual),
347            ("&=", TypeScriptTokenType::AmpersandEqual),
348            ("|=", TypeScriptTokenType::PipeEqual),
349            ("^=", TypeScriptTokenType::CaretEqual),
350        ];
351
352        for (op, kind) in ops {
353            if rest.starts_with(op) {
354                state.advance(op.len());
355                state.add_token(kind, start, state.get_position());
356                return true;
357            }
358        }
359
360        if let Some(ch) = state.peek() {
361            let kind = match ch {
362                '+' => TypeScriptTokenType::Plus,
363                '-' => TypeScriptTokenType::Minus,
364                '*' => TypeScriptTokenType::Star,
365                '/' => TypeScriptTokenType::Slash,
366                '%' => TypeScriptTokenType::Percent,
367                '<' => TypeScriptTokenType::Less,
368                '>' => TypeScriptTokenType::Greater,
369                '!' => TypeScriptTokenType::Exclamation,
370                '&' => TypeScriptTokenType::Ampersand,
371                '|' => TypeScriptTokenType::Pipe,
372                '^' => TypeScriptTokenType::Caret,
373                '~' => TypeScriptTokenType::Tilde,
374                '=' => TypeScriptTokenType::Equal,
375                '?' => TypeScriptTokenType::Question,
376                '(' => TypeScriptTokenType::LeftParen,
377                ')' => TypeScriptTokenType::RightParen,
378                '{' => TypeScriptTokenType::LeftBrace,
379                '}' => TypeScriptTokenType::RightBrace,
380                '[' => TypeScriptTokenType::LeftBracket,
381                ']' => TypeScriptTokenType::RightBracket,
382                ';' => TypeScriptTokenType::Semicolon,
383                ',' => TypeScriptTokenType::Comma,
384                '.' => TypeScriptTokenType::Dot,
385                ':' => TypeScriptTokenType::Colon,
386                '@' => TypeScriptTokenType::At,
387                _ => return false,
388            };
389
390            state.advance(1);
391            state.add_token(kind, start, state.get_position());
392            return true;
393        }
394
395        false
396    }
397}