Skip to main content

oak_javascript/lexer/
mod.rs

1//! JavaScript lexer implementation.
2
3pub mod token_type;
4
5use crate::{language::JavaScriptLanguage, lexer::token_type::JavaScriptTokenType};
6use oak_core::{Lexer, LexerCache, LexerState, OakError, TextEdit, lexer::LexOutput, source::Source};
7use std::simd::prelude::*;
8
9pub(crate) type State<'a, S> = LexerState<'a, S, JavaScriptLanguage>;
10
11/// JavaScript lexer.
12#[derive(Clone, Debug)]
13pub struct JavaScriptLexer<'config> {
14    config: &'config JavaScriptLanguage,
15}
16
17impl<'config> JavaScriptLexer<'config> {
18    /// Creates a new JavaScript lexer.
19    pub fn new(config: &'config JavaScriptLanguage) -> Self {
20        Self { config }
21    }
22
23    fn safe_check<'a, S: Source + ?Sized>(&self, state: &State<'a, S>) -> Result<(), OakError> {
24        if state.get_position() <= state.get_length() { Ok(()) } else { Err(OakError::custom_error(format!("Lexer out-of-bounds: pos={}, len={}", state.get_position(), state.get_length()))) }
25    }
26
27    /// Main lexer run method.
28    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
29        while state.not_at_end() {
30            let safe_point = state.get_position();
31            self.safe_check(state)?;
32
33            if let Some(ch) = state.peek() {
34                match ch {
35                    ' ' | '\t' => {
36                        self.skip_whitespace(state);
37                    }
38                    '\n' | '\r' => {
39                        self.lex_newline(state);
40                    }
41                    '/' => {
42                        // Comment or Slash or SlashEqual
43                        if let Some(next) = state.peek_next_n(1) {
44                            if next == '/' || next == '*' {
45                                self.lex_comment(state);
46                            }
47                            else {
48                                self.lex_operator_or_punctuation(state);
49                            }
50                        }
51                        else {
52                            self.lex_operator_or_punctuation(state);
53                        }
54                    }
55                    '"' | '\'' => {
56                        self.lex_string_literal(state);
57                    }
58                    '`' => {
59                        self.lex_template_literal(state);
60                    }
61                    '0'..='9' => {
62                        self.lex_numeric_literal(state);
63                    }
64                    '.' => {
65                        // Dot, DotDotDot, or Number (.5)
66                        if self.is_next_digit(state) {
67                            self.lex_numeric_literal(state);
68                        }
69                        else {
70                            self.lex_operator_or_punctuation(state);
71                        }
72                    }
73                    'a'..='z' | 'A'..='Z' | '_' | '$' => {
74                        self.lex_identifier_or_keyword(state);
75                    }
76                    '+' | '-' | '*' | '%' | '<' | '>' | '=' | '!' | '&' | '|' | '^' | '~' | '?' | '(' | ')' | '{' | '}' | '[' | ']' | ';' | ',' | ':' => {
77                        self.lex_operator_or_punctuation(state);
78                    }
79                    _ => {
80                        let start = state.get_position();
81                        state.advance(ch.len_utf8());
82                        state.add_token(JavaScriptTokenType::Error, start, state.get_position());
83                    }
84                }
85            }
86
87            state.advance_if_dead_lock(safe_point)
88        }
89
90        Ok(())
91    }
92
93    /// Skips whitespace characters.
94    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
95        let start = state.get_position();
96        let bytes = state.rest_bytes();
97        let mut i = 0;
98        let len = bytes.len();
99        const LANES: usize = 32;
100
101        while i + LANES <= len {
102            let chunk = Simd::<u8, LANES>::from_slice(unsafe { bytes.get_unchecked(i..i + LANES) });
103            let is_space = chunk.simd_eq(Simd::splat(b' '));
104            let is_tab = chunk.simd_eq(Simd::splat(b'\t'));
105            let is_ws = is_space | is_tab;
106
107            if !is_ws.all() {
108                let not_ws = !is_ws;
109                let idx = not_ws.first_set().unwrap();
110                i += idx;
111                state.advance(i);
112                state.add_token(JavaScriptTokenType::Whitespace, start, state.get_position());
113                return true;
114            }
115            i += LANES
116        }
117
118        while i < len {
119            let ch = unsafe { *bytes.get_unchecked(i) };
120            if ch != b' ' && ch != b'\t' {
121                break;
122            }
123            i += 1
124        }
125
126        if i > 0 {
127            state.advance(i);
128            state.add_token(JavaScriptTokenType::Whitespace, start, state.get_position());
129            true
130        }
131        else {
132            false
133        }
134    }
135
136    /// Handles newline characters.
137    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
138        let start_pos = state.get_position();
139
140        if let Some('\n') = state.peek() {
141            state.advance(1);
142            state.add_token(JavaScriptTokenType::Newline, start_pos, state.get_position());
143            true
144        }
145        else if let Some('\r') = state.peek() {
146            state.advance(1);
147            if let Some('\n') = state.peek() {
148                state.advance(1)
149            }
150            state.add_token(JavaScriptTokenType::Newline, start_pos, state.get_position());
151            true
152        }
153        else {
154            false
155        }
156    }
157
158    /// Handles comments (line and block comments).
159    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
160        let start = state.get_position();
161        let rest = state.rest();
162
163        // Line comment: // ... until newline
164        if rest.starts_with("//") {
165            state.advance(2);
166            while let Some(ch) = state.peek() {
167                if ch == '\n' || ch == '\r' {
168                    break;
169                }
170                state.advance(ch.len_utf8())
171            }
172            state.add_token(JavaScriptTokenType::LineComment, start, state.get_position());
173            return true;
174        }
175
176        // Block comment: /* ... */
177        if rest.starts_with("/*") {
178            state.advance(2);
179            let mut found_end = false;
180            while let Some(ch) = state.peek() {
181                if ch == '*' && state.peek_next_n(1) == Some('/') {
182                    state.advance(2);
183                    found_end = true;
184                    break;
185                }
186                state.advance(ch.len_utf8())
187            }
188
189            if !found_end {
190                let error = OakError::syntax_error("Unterminated comment".to_string(), start, None);
191                state.add_error(error)
192            }
193
194            state.add_token(JavaScriptTokenType::BlockComment, start, state.get_position());
195            return true;
196        }
197
198        false
199    }
200
201    /// Handles string literals.
202    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
203        let start_pos = state.get_position();
204
205        if let Some(first_char) = state.peek() {
206            if first_char == '"' || first_char == '\'' {
207                let quote = first_char;
208                state.advance(1);
209                let mut found_end = false;
210
211                while let Some(ch) = state.peek() {
212                    if ch == quote {
213                        state.advance(1);
214                        found_end = true;
215                        break;
216                    }
217                    else if ch == '\\' {
218                        // Skip escaped character
219                        state.advance(1);
220                        if let Some(escaped) = state.peek() {
221                            state.advance(escaped.len_utf8())
222                        }
223                    }
224                    else {
225                        state.advance(ch.len_utf8())
226                    }
227                }
228
229                if !found_end {
230                    let error = OakError::syntax_error("Unterminated string literal".to_string(), start_pos, None);
231                    state.add_error(error)
232                }
233
234                state.add_token(JavaScriptTokenType::StringLiteral, start_pos, state.get_position());
235                return true;
236            }
237        }
238
239        false
240    }
241
242    /// Handles template literals.
243    fn lex_template_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
244        let start_pos = state.get_position();
245
246        if let Some('`') = state.peek() {
247            state.advance(1);
248            let mut found_end = false;
249
250            while let Some(ch) = state.peek() {
251                if ch == '`' {
252                    state.advance(1);
253                    found_end = true;
254                    break;
255                }
256                else if ch == '\\' {
257                    // Handle escaped characters
258                    state.advance(1);
259                    if let Some(escaped) = state.peek() {
260                        state.advance(escaped.len_utf8())
261                    }
262                }
263                else if ch == '$' {
264                    if let Some('{') = state.peek_next_n(1) {
265                        // Template expression, skip for now
266                        state.advance(2);
267                        let mut brace_count = 1;
268                        while let Some(inner_ch) = state.peek() {
269                            if inner_ch == '{' {
270                                brace_count += 1
271                            }
272                            else if inner_ch == '}' {
273                                brace_count -= 1;
274                                if brace_count == 0 {
275                                    state.advance(1);
276                                    break;
277                                }
278                            }
279                            state.advance(inner_ch.len_utf8())
280                        }
281                    }
282                    else {
283                        state.advance(ch.len_utf8())
284                    }
285                }
286                else {
287                    state.advance(ch.len_utf8())
288                }
289            }
290
291            if !found_end {
292                let error = OakError::syntax_error("Unterminated template literal".to_string(), start_pos, None);
293                state.add_error(error)
294            }
295
296            state.add_token(JavaScriptTokenType::TemplateString, start_pos, state.get_position());
297            true
298        }
299        else {
300            false
301        }
302    }
303
304    /// Handles numeric literals.
305    fn lex_numeric_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
306        let start_pos = state.get_position();
307
308        if let Some(ch) = state.peek() {
309            // Hexadecimal number (0x or 0X)
310            if ch == '0' {
311                if let Some(next) = state.peek_next_n(1) {
312                    if next == 'x' || next == 'X' {
313                        state.advance(2); // Skip '0x'
314                        let mut has_digits = false;
315                        while let Some(hex_ch) = state.peek() {
316                            if hex_ch.is_ascii_hexdigit() {
317                                state.advance(1);
318                                has_digits = true
319                            }
320                            else {
321                                break;
322                            }
323                        }
324
325                        if !has_digits {
326                            let error = OakError::syntax_error("Invalid hexadecimal number".to_string(), start_pos, None);
327                            state.add_error(error)
328                        }
329
330                        // Check for BigInt suffix
331                        if let Some('n') = state.peek() {
332                            state.advance(1);
333                            state.add_token(JavaScriptTokenType::BigIntLiteral, start_pos, state.get_position())
334                        }
335                        else {
336                            state.add_token(JavaScriptTokenType::NumericLiteral, start_pos, state.get_position())
337                        }
338                        return true;
339                    }
340                }
341            }
342
343            // Normal number or decimal
344            if ch.is_ascii_digit() || (ch == '.' && self.is_next_digit(state)) {
345                // Handle integer part
346                if ch != '.' {
347                    while let Some(digit) = state.peek() {
348                        if digit.is_ascii_digit() { state.advance(1) } else { break }
349                    }
350                }
351
352                // Handle decimal part
353                if let Some('.') = state.peek() {
354                    state.advance(1);
355                    while let Some(digit) = state.peek() {
356                        if digit.is_ascii_digit() { state.advance(1) } else { break }
357                    }
358                }
359
360                // Handle exponent part
361                if let Some(exp) = state.peek() {
362                    if exp == 'e' || exp == 'E' {
363                        state.advance(1);
364
365                        // Optional sign
366                        if let Some(sign) = state.peek() {
367                            if sign == '+' || sign == '-' {
368                                state.advance(1)
369                            }
370                        }
371
372                        // Must have digits
373                        let mut has_exp_digits = false;
374                        while let Some(digit) = state.peek() {
375                            if digit.is_ascii_digit() {
376                                state.advance(1);
377                                has_exp_digits = true
378                            }
379                            else {
380                                break;
381                            }
382                        }
383
384                        if !has_exp_digits {
385                            let error = OakError::syntax_error("Invalid number exponent".to_string(), start_pos, None);
386                            state.add_error(error)
387                        }
388                    }
389                }
390
391                // Check for BigInt suffix
392                if let Some('n') = state.peek() {
393                    state.advance(1);
394                    state.add_token(JavaScriptTokenType::BigIntLiteral, start_pos, state.get_position())
395                }
396                else {
397                    state.add_token(JavaScriptTokenType::NumericLiteral, start_pos, state.get_position())
398                }
399                true
400            }
401            else {
402                false
403            }
404        }
405        else {
406            false
407        }
408    }
409
410    /// Checks if the next character is a digit.
411    fn is_next_digit<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
412        if let Some(next_ch) = state.peek_next_n(1) { next_ch.is_ascii_digit() } else { false }
413    }
414
415    /// Handles identifiers or keywords.
416    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
417        let start_pos = state.get_position();
418
419        if let Some(ch) = state.peek() {
420            if ch.is_alphabetic() || ch == '_' || ch == '$' {
421                state.advance(ch.len_utf8());
422
423                while let Some(next_ch) = state.peek() {
424                    if next_ch.is_alphanumeric() || next_ch == '_' || next_ch == '$' { state.advance(next_ch.len_utf8()) } else { break }
425                }
426
427                let text = state.get_text_in((start_pos..state.get_position()).into());
428                let token_kind = self.keyword_or_identifier(&text);
429                state.add_token(token_kind, start_pos, state.get_position());
430                true
431            }
432            else {
433                false
434            }
435        }
436        else {
437            false
438        }
439    }
440
441    /// Determines if it's a keyword or an identifier.
442    fn keyword_or_identifier(&self, text: &str) -> JavaScriptTokenType {
443        JavaScriptTokenType::from_keyword(text).unwrap_or(JavaScriptTokenType::IdentifierName)
444    }
445
446    /// Handles operators and punctuation.
447    fn lex_operator_or_punctuation<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
448        let start_pos = state.get_position();
449
450        if let Some(ch) = state.peek() {
451            let token_kind = match ch {
452                '+' => {
453                    state.advance(1);
454                    match state.peek() {
455                        Some('+') => {
456                            state.advance(1);
457                            JavaScriptTokenType::PlusPlus
458                        }
459                        Some('=') => {
460                            state.advance(1);
461                            JavaScriptTokenType::PlusEqual
462                        }
463                        _ => JavaScriptTokenType::Plus,
464                    }
465                }
466                '-' => {
467                    state.advance(1);
468                    match state.peek() {
469                        Some('-') => {
470                            state.advance(1);
471                            JavaScriptTokenType::MinusMinus
472                        }
473                        Some('=') => {
474                            state.advance(1);
475                            JavaScriptTokenType::MinusEqual
476                        }
477                        _ => JavaScriptTokenType::Minus,
478                    }
479                }
480                '*' => {
481                    state.advance(1);
482                    match state.peek() {
483                        Some('*') => {
484                            state.advance(1);
485                            if let Some('=') = state.peek() {
486                                state.advance(1);
487                                JavaScriptTokenType::StarStarEqual
488                            }
489                            else {
490                                JavaScriptTokenType::StarStar
491                            }
492                        }
493                        Some('=') => {
494                            state.advance(1);
495                            JavaScriptTokenType::StarEqual
496                        }
497                        _ => JavaScriptTokenType::Star,
498                    }
499                }
500                '/' => {
501                    // Check if it's a comment
502                    if let Some(next) = state.peek_next_n(1) {
503                        if next == '/' || next == '*' {
504                            return false; // Let the comment handler process it
505                        }
506                    }
507                    state.advance(1);
508                    if let Some('=') = state.peek() {
509                        state.advance(1);
510                        JavaScriptTokenType::SlashEqual
511                    }
512                    else {
513                        JavaScriptTokenType::Slash
514                    }
515                }
516                '%' => {
517                    state.advance(1);
518                    if let Some('=') = state.peek() {
519                        state.advance(1);
520                        JavaScriptTokenType::PercentEqual
521                    }
522                    else {
523                        JavaScriptTokenType::Percent
524                    }
525                }
526                '<' => {
527                    state.advance(1);
528                    match state.peek() {
529                        Some('<') => {
530                            state.advance(1);
531                            if let Some('=') = state.peek() {
532                                state.advance(1);
533                                JavaScriptTokenType::LeftShiftEqual
534                            }
535                            else {
536                                JavaScriptTokenType::LeftShift
537                            }
538                        }
539                        Some('=') => {
540                            state.advance(1);
541                            JavaScriptTokenType::LessEqual
542                        }
543                        _ => JavaScriptTokenType::Less,
544                    }
545                }
546                '>' => {
547                    state.advance(1);
548                    match state.peek() {
549                        Some('>') => {
550                            state.advance(1);
551                            match state.peek() {
552                                Some('>') => {
553                                    state.advance(1);
554                                    if let Some('=') = state.peek() {
555                                        state.advance(1);
556                                        JavaScriptTokenType::UnsignedRightShiftEqual
557                                    }
558                                    else {
559                                        JavaScriptTokenType::UnsignedRightShift
560                                    }
561                                }
562                                Some('=') => {
563                                    state.advance(1);
564                                    JavaScriptTokenType::RightShiftEqual
565                                }
566                                _ => JavaScriptTokenType::RightShift,
567                            }
568                        }
569                        Some('=') => {
570                            state.advance(1);
571                            JavaScriptTokenType::GreaterEqual
572                        }
573                        _ => JavaScriptTokenType::Greater,
574                    }
575                }
576                '=' => {
577                    state.advance(1);
578                    match state.peek() {
579                        Some('=') => {
580                            state.advance(1);
581                            if let Some('=') = state.peek() {
582                                state.advance(1);
583                                JavaScriptTokenType::EqualEqualEqual
584                            }
585                            else {
586                                JavaScriptTokenType::EqualEqual
587                            }
588                        }
589                        Some('>') => {
590                            state.advance(1);
591                            JavaScriptTokenType::Arrow
592                        }
593                        _ => JavaScriptTokenType::Equal,
594                    }
595                }
596                '!' => {
597                    state.advance(1);
598                    match state.peek() {
599                        Some('=') => {
600                            state.advance(1);
601                            if let Some('=') = state.peek() {
602                                state.advance(1);
603                                JavaScriptTokenType::NotEqualEqual
604                            }
605                            else {
606                                JavaScriptTokenType::NotEqual
607                            }
608                        }
609                        _ => JavaScriptTokenType::Exclamation,
610                    }
611                }
612                '&' => {
613                    state.advance(1);
614                    match state.peek() {
615                        Some('&') => {
616                            state.advance(1);
617                            if let Some('=') = state.peek() {
618                                state.advance(1);
619                                JavaScriptTokenType::AmpersandAmpersandEqual
620                            }
621                            else {
622                                JavaScriptTokenType::AmpersandAmpersand
623                            }
624                        }
625                        Some('=') => {
626                            state.advance(1);
627                            JavaScriptTokenType::AmpersandEqual
628                        }
629                        _ => JavaScriptTokenType::Ampersand,
630                    }
631                }
632                '|' => {
633                    state.advance(1);
634                    match state.peek() {
635                        Some('|') => {
636                            state.advance(1);
637                            if let Some('=') = state.peek() {
638                                state.advance(1);
639                                JavaScriptTokenType::PipePipeEqual
640                            }
641                            else {
642                                JavaScriptTokenType::PipePipe
643                            }
644                        }
645                        Some('=') => {
646                            state.advance(1);
647                            JavaScriptTokenType::PipeEqual
648                        }
649                        _ => JavaScriptTokenType::Pipe,
650                    }
651                }
652                '^' => {
653                    state.advance(1);
654                    if let Some('=') = state.peek() {
655                        state.advance(1);
656                        JavaScriptTokenType::CaretEqual
657                    }
658                    else {
659                        JavaScriptTokenType::Caret
660                    }
661                }
662                '~' => {
663                    state.advance(1);
664                    JavaScriptTokenType::Tilde
665                }
666                '?' => {
667                    state.advance(1);
668                    match state.peek() {
669                        Some('?') => {
670                            state.advance(1);
671                            if let Some('=') = state.peek() {
672                                state.advance(1);
673                                JavaScriptTokenType::QuestionQuestionEqual
674                            }
675                            else {
676                                JavaScriptTokenType::QuestionQuestion
677                            }
678                        }
679                        Some('.') => {
680                            state.advance(1);
681                            JavaScriptTokenType::QuestionDot
682                        }
683                        _ => JavaScriptTokenType::Question,
684                    }
685                }
686                '(' => {
687                    state.advance(1);
688                    JavaScriptTokenType::LeftParen
689                }
690                ')' => {
691                    state.advance(1);
692                    JavaScriptTokenType::RightParen
693                }
694                '{' => {
695                    state.advance(1);
696                    JavaScriptTokenType::LeftBrace
697                }
698                '}' => {
699                    state.advance(1);
700                    JavaScriptTokenType::RightBrace
701                }
702                '[' => {
703                    state.advance(1);
704                    JavaScriptTokenType::LeftBracket
705                }
706                ']' => {
707                    state.advance(1);
708                    JavaScriptTokenType::RightBracket
709                }
710                ';' => {
711                    state.advance(1);
712                    JavaScriptTokenType::Semicolon
713                }
714                ',' => {
715                    state.advance(1);
716                    JavaScriptTokenType::Comma
717                }
718                '.' => {
719                    state.advance(1);
720                    if let Some('.') = state.peek() {
721                        if let Some('.') = state.peek_next_n(1) {
722                            state.advance(2);
723                            JavaScriptTokenType::DotDotDot
724                        }
725                        else {
726                            JavaScriptTokenType::Dot
727                        }
728                    }
729                    else {
730                        JavaScriptTokenType::Dot
731                    }
732                }
733                ':' => {
734                    state.advance(1);
735                    JavaScriptTokenType::Colon
736                }
737                _ => return false,
738            };
739
740            state.add_token(token_kind, start_pos, state.get_position());
741            true
742        }
743        else {
744            false
745        }
746    }
747}
748
749impl<'config> Lexer<JavaScriptLanguage> for JavaScriptLexer<'config> {
750    fn lex<'a, S: Source + ?Sized>(&self, text: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<JavaScriptLanguage>) -> LexOutput<JavaScriptLanguage> {
751        let mut state = LexerState::new(text);
752        let result = self.run(&mut state);
753        if result.is_ok() {
754            state.add_eof()
755        }
756        state.finish_with_cache(result, cache)
757    }
758}