Skip to main content

oak_typescript/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2/// Token types for the TypeScript language.
3pub mod token_type;
4
5pub use self::token_type::TypeScriptTokenType;
6use crate::language::TypeScriptLanguage;
7use oak_core::{Lexer, LexerCache, LexerState, OakError, TextEdit, lexer::LexOutput, source::Source};
8
9/// Lexer for the TypeScript language.
10#[derive(Clone, Debug)]
11pub struct TypeScriptLexer<'config> {
12    config: &'config TypeScriptLanguage,
13}
14
15pub(crate) type State<'a, S> = LexerState<'a, S, TypeScriptLanguage>;
16
17impl<'config> TypeScriptLexer<'config> {
18    /// Creates a new `TypeScriptLexer` with the given language configuration.
19    pub fn new(config: &'config TypeScriptLanguage) -> Self {
20        Self { config }
21    }
22}
23
24impl<'config> Lexer<TypeScriptLanguage> for TypeScriptLexer<'config> {
25    fn lex<'a, S: Source + ?Sized>(&self, text: &S, edits: &[TextEdit], cache: &'a mut impl LexerCache<TypeScriptLanguage>) -> LexOutput<TypeScriptLanguage> {
26        let relex_from = edits.iter().map(|e| e.span.start).min().unwrap_or(0);
27        let mut state: State<'_, S> = LexerState::new_with_cache(text, relex_from, cache);
28
29        let result = self.run(&mut state);
30        if result.is_ok() {
31            state.add_eof()
32        }
33        state.finish_with_cache(result, cache)
34    }
35}
36
37impl<'config> TypeScriptLexer<'config> {
38    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
39        while state.not_at_end() {
40            let safe_point = state.get_position();
41
42            if self.skip_whitespace(state) {
43                continue;
44            }
45
46            if self.lex_newline(state) {
47                continue;
48            }
49
50            if self.skip_comment(state) {
51                continue;
52            }
53
54            if self.lex_string_literal(state) {
55                continue;
56            }
57
58            if self.lex_template_literal(state) {
59                continue;
60            }
61
62            if self.lex_numeric_literal(state) {
63                continue;
64            }
65
66            if self.lex_identifier_or_keyword(state) {
67                continue;
68            }
69
70            if self.lex_operator_or_punctuation(state) {
71                continue;
72            }
73
74            // If all rules do not match, skip current character and mark as error
75            let start_pos = state.get_position();
76            if let Some(ch) = state.peek() {
77                state.advance(ch.len_utf8());
78                state.add_token(TypeScriptTokenType::Error, start_pos, state.get_position());
79            }
80
81            state.advance_if_dead_lock(safe_point);
82        }
83
84        Ok(())
85    }
86
87    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
88        let start = state.get_position();
89        let mut found = false;
90
91        while let Some(ch) = state.peek() {
92            if ch == ' ' || ch == '\t' {
93                state.advance(ch.len_utf8());
94                found = true;
95            }
96            else {
97                break;
98            }
99        }
100
101        if found {
102            state.add_token(TypeScriptTokenType::Whitespace, start, state.get_position());
103        }
104
105        found
106    }
107
108    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
109        let start = state.get_position();
110
111        if let Some(ch) = state.peek() {
112            if ch == '\n' {
113                state.advance(1);
114                state.add_token(TypeScriptTokenType::Newline, start, state.get_position());
115                return true;
116            }
117            else if ch == '\r' {
118                state.advance(1);
119                if state.peek() == Some('\n') {
120                    state.advance(1);
121                }
122                state.add_token(TypeScriptTokenType::Newline, start, state.get_position());
123                return true;
124            }
125        }
126
127        false
128    }
129
130    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
131        let start = state.get_position();
132        let rest = state.rest();
133
134        // Line comment: // ...
135        if rest.starts_with("//") {
136            state.advance(2);
137            while let Some(ch) = state.peek() {
138                if ch == '\n' || ch == '\r' {
139                    break;
140                }
141                state.advance(ch.len_utf8());
142            }
143            state.add_token(TypeScriptTokenType::LineComment, start, state.get_position());
144            return true;
145        }
146
147        // Block comment: /* ... */
148        if rest.starts_with("/*") {
149            state.advance(2);
150            while let Some(ch) = state.peek() {
151                if ch == '*' && state.peek_next_n(1) == Some('/') {
152                    state.advance(2);
153                    break;
154                }
155                state.advance(ch.len_utf8());
156            }
157            state.add_token(TypeScriptTokenType::BlockComment, start, state.get_position());
158            return true;
159        }
160
161        false
162    }
163
164    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
165        let start = state.get_position();
166
167        if let Some(quote) = state.peek() {
168            if quote == '"' || quote == '\'' {
169                state.advance(1);
170
171                while let Some(ch) = state.peek() {
172                    if ch == quote {
173                        state.advance(1);
174                        break;
175                    }
176                    else if ch == '\\' {
177                        state.advance(1);
178                        if let Some(_) = state.peek() {
179                            state.advance(1);
180                        }
181                    }
182                    else {
183                        state.advance(ch.len_utf8());
184                    }
185                }
186
187                state.add_token(TypeScriptTokenType::StringLiteral, start, state.get_position());
188                return true;
189            }
190        }
191
192        false
193    }
194
195    fn lex_template_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
196        let start = state.get_position();
197
198        if state.peek() == Some('`') {
199            state.advance(1);
200
201            while let Some(ch) = state.peek() {
202                if ch == '`' {
203                    state.advance(1);
204                    break;
205                }
206                else if ch == '\\' {
207                    state.advance(1);
208                    if let Some(_) = state.peek() {
209                        state.advance(1);
210                    }
211                }
212                else {
213                    state.advance(ch.len_utf8());
214                }
215            }
216
217            state.add_token(TypeScriptTokenType::TemplateString, start, state.get_position());
218            return true;
219        }
220
221        false
222    }
223
224    fn lex_numeric_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
225        let start = state.get_position();
226
227        if let Some(ch) = state.peek() {
228            if ch.is_ascii_digit() {
229                state.advance(1);
230
231                // Handle hexadecimal
232                if ch == '0' && (state.peek() == Some('x') || state.peek() == Some('X')) {
233                    state.advance(1);
234                    while let Some(ch) = state.peek() {
235                        if ch.is_ascii_hexdigit() {
236                            state.advance(1);
237                        }
238                        else {
239                            break;
240                        }
241                    }
242                }
243                else {
244                    // Handle decimal
245                    while let Some(ch) = state.peek() {
246                        if ch.is_ascii_digit() {
247                            state.advance(1);
248                        }
249                        else if ch == '.' && state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit()) {
250                            state.advance(1);
251                            while let Some(ch) = state.peek() {
252                                if ch.is_ascii_digit() {
253                                    state.advance(1);
254                                }
255                                else {
256                                    break;
257                                }
258                            }
259                            break;
260                        }
261                        else {
262                            break;
263                        }
264                    }
265                }
266
267                // Check BigInt suffix
268                if state.peek() == Some('n') {
269                    state.advance(1);
270                    state.add_token(TypeScriptTokenType::BigIntLiteral, start, state.get_position());
271                }
272                else {
273                    state.add_token(TypeScriptTokenType::NumericLiteral, start, state.get_position());
274                }
275
276                return true;
277            }
278        }
279
280        false
281    }
282
283    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
284        let start = state.get_position();
285
286        if let Some(ch) = state.peek() {
287            if ch.is_alphabetic() || ch == '_' || ch == '$' {
288                state.advance(ch.len_utf8());
289
290                while let Some(ch) = state.peek() {
291                    if ch.is_alphanumeric() || ch == '_' || ch == '$' {
292                        state.advance(ch.len_utf8());
293                    }
294                    else {
295                        break;
296                    }
297                }
298
299                // Get identifier text and check if it's a keyword
300                let end = state.get_position();
301                let text = state.get_text_in(oak_core::Range { start, end });
302                let kind = self.keyword_or_identifier(&text);
303
304                state.add_token(kind, start, state.get_position());
305                return true;
306            }
307        }
308
309        false
310    }
311
312    fn keyword_or_identifier(&self, text: &str) -> TypeScriptTokenType {
313        TypeScriptTokenType::from_keyword(text).unwrap_or(TypeScriptTokenType::IdentifierName)
314    }
315
316    fn lex_operator_or_punctuation<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
317        let start = state.get_position();
318        let rest = state.rest();
319
320        let ops = [
321            ("===", TypeScriptTokenType::EqualEqualEqual),
322            ("!==", TypeScriptTokenType::NotEqualEqual),
323            (">>>", TypeScriptTokenType::UnsignedRightShift),
324            ("...", TypeScriptTokenType::DotDotDot),
325            ("**=", TypeScriptTokenType::StarStarEqual),
326            ("<<=", TypeScriptTokenType::LeftShiftEqual),
327            (">>=", TypeScriptTokenType::RightShiftEqual),
328            ("&&=", TypeScriptTokenType::AmpersandAmpersandEqual),
329            ("||=", TypeScriptTokenType::PipePipeEqual),
330            ("??=", TypeScriptTokenType::QuestionQuestionEqual),
331            ("**", TypeScriptTokenType::StarStar),
332            ("<=", TypeScriptTokenType::LessEqual),
333            (">=", TypeScriptTokenType::GreaterEqual),
334            ("==", TypeScriptTokenType::EqualEqual),
335            ("!=", TypeScriptTokenType::NotEqual),
336            ("&&", TypeScriptTokenType::AmpersandAmpersand),
337            ("||", TypeScriptTokenType::PipePipe),
338            ("<<", TypeScriptTokenType::LeftShift),
339            (">>", TypeScriptTokenType::RightShift),
340            ("++", TypeScriptTokenType::PlusPlus),
341            ("--", TypeScriptTokenType::MinusMinus),
342            ("=>", TypeScriptTokenType::Arrow),
343            ("?.", TypeScriptTokenType::QuestionDot),
344            ("??", TypeScriptTokenType::QuestionQuestion),
345            ("+=", TypeScriptTokenType::PlusEqual),
346            ("-=", TypeScriptTokenType::MinusEqual),
347            ("*=", TypeScriptTokenType::StarEqual),
348            ("/=", TypeScriptTokenType::SlashEqual),
349            ("%=", TypeScriptTokenType::PercentEqual),
350            ("&=", TypeScriptTokenType::AmpersandEqual),
351            ("|=", TypeScriptTokenType::PipeEqual),
352            ("^=", TypeScriptTokenType::CaretEqual),
353        ];
354
355        for (op, kind) in ops {
356            if rest.starts_with(op) {
357                state.advance(op.len());
358                state.add_token(kind, start, state.get_position());
359                return true;
360            }
361        }
362
363        if let Some(ch) = state.peek() {
364            let kind = match ch {
365                '+' => TypeScriptTokenType::Plus,
366                '-' => TypeScriptTokenType::Minus,
367                '*' => TypeScriptTokenType::Star,
368                '/' => TypeScriptTokenType::Slash,
369                '%' => TypeScriptTokenType::Percent,
370                '<' => TypeScriptTokenType::Less,
371                '>' => TypeScriptTokenType::Greater,
372                '!' => TypeScriptTokenType::Exclamation,
373                '&' => TypeScriptTokenType::Ampersand,
374                '|' => TypeScriptTokenType::Pipe,
375                '^' => TypeScriptTokenType::Caret,
376                '~' => TypeScriptTokenType::Tilde,
377                '=' => TypeScriptTokenType::Equal,
378                '?' => TypeScriptTokenType::Question,
379                '(' => TypeScriptTokenType::LeftParen,
380                ')' => TypeScriptTokenType::RightParen,
381                '{' => TypeScriptTokenType::LeftBrace,
382                '}' => TypeScriptTokenType::RightBrace,
383                '[' => TypeScriptTokenType::LeftBracket,
384                ']' => TypeScriptTokenType::RightBracket,
385                ';' => TypeScriptTokenType::Semicolon,
386                ',' => TypeScriptTokenType::Comma,
387                '.' => TypeScriptTokenType::Dot,
388                ':' => TypeScriptTokenType::Colon,
389                '@' => TypeScriptTokenType::At,
390                _ => return false,
391            };
392
393            state.advance(1);
394            state.add_token(kind, start, state.get_position());
395            return true;
396        }
397
398        false
399    }
400}