1use crate::{kind::JavaScriptSyntaxKind, language::JavaScriptLanguage};
2use oak_core::{
3 IncrementalCache, Lexer, LexerState, OakError,
4 lexer::{LexOutput, StringConfig, WhitespaceConfig},
5 source::Source,
6};
7use std::sync::LazyLock;
8
/// Lexer state specialised to [`JavaScriptLanguage`]; `S` is the source type being lexed.
type State<S> = LexerState<S, JavaScriptLanguage>;

/// Whitespace scanner configuration (Unicode whitespace enabled), used by `skip_whitespace`.
static JS_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// String scanner configuration: `"` and `'` as quotes, `\` as the escape character.
// NOTE(review): no lexer method visible in this file references this static; string literals
// are scanned by hand in `lex_string_literal` — confirm whether it is still needed.
static JS_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"', '\''], escape: Some('\\') });
13
14
/// Lexer that splits JavaScript source text into `JavaScriptSyntaxKind` tokens.
///
/// The lexer only borrows its language configuration, so cloning it is cheap.
#[derive(Clone)]
pub struct JavaScriptLexer<'config> {
    /// Borrowed language configuration supplied to [`JavaScriptLexer::new`].
    // NOTE(review): none of the lexing methods visible in this file read this field.
    config: &'config JavaScriptLanguage,
}
19
20impl<'config> JavaScriptLexer<'config> {
21 pub fn new(config: &'config JavaScriptLanguage) -> Self {
22 Self { config }
23 }
24
25 fn safe_check<S: Source>(&self, state: &State<S>) -> Result<(), OakError> {
26 if state.get_position() <= state.length() {
27 Ok(())
28 }
29 else {
30 Err(OakError::custom_error(format!("Lexer out-of-bounds: pos={}, len={}", state.get_position(), state.length())))
31 }
32 }
33
34 fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
36 while state.not_at_end() {
37 let current_pos = state.get_position();
38 let current_char = state.peek();
39
40 self.safe_check(state)?;
41
42 if self.skip_whitespace(state) {
43 continue;
44 }
45
46 if self.lex_newline(state) {
47 continue;
48 }
49
50 if self.lex_comment(state) {
51 continue;
52 }
53
54 if self.lex_string_literal(state) {
55 continue;
56 }
57
58 if self.lex_template_literal(state) {
59 continue;
60 }
61
62 if self.lex_numeric_literal(state) {
63 continue;
64 }
65
66 if self.lex_identifier_or_keyword(state) {
67 continue;
68 }
69
70 if self.lex_operator_or_punctuation(state) {
71 continue;
72 }
73
74 let start = state.get_position();
75 if let Some(ch) = state.peek() {
76 state.advance(ch.len_utf8());
77 state.add_token(JavaScriptSyntaxKind::Error, start, state.get_position());
78 }
79 else {
80 break;
81 }
82 }
83
84 let eof_pos = state.get_position();
85 state.add_token(JavaScriptSyntaxKind::Eof, eof_pos, eof_pos);
86 Ok(())
87 }
88
89 fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
91 match JS_WHITESPACE.scan(state.rest(), state.get_position(), JavaScriptSyntaxKind::Whitespace) {
92 Some(token) => {
93 state.advance_with(token);
94 true
95 }
96 None => false,
97 }
98 }
99
100 fn lex_newline<S: Source>(&self, state: &mut State<S>) -> bool {
102 let start_pos = state.get_position();
103
104 if let Some('\n') = state.peek() {
105 state.advance(1);
106 state.add_token(JavaScriptSyntaxKind::Newline, start_pos, state.get_position());
107 true
108 }
109 else if let Some('\r') = state.peek() {
110 state.advance(1);
111 if let Some('\n') = state.peek() {
112 state.advance(1);
113 }
114 state.add_token(JavaScriptSyntaxKind::Newline, start_pos, state.get_position());
115 true
116 }
117 else {
118 false
119 }
120 }
121
122 fn lex_comment<S: Source>(&self, state: &mut State<S>) -> bool {
124 let start = state.get_position();
125 let rest = state.rest();
126
127 if rest.starts_with("//") {
129 state.advance(2);
130 while let Some(ch) = state.peek() {
131 if ch == '\n' || ch == '\r' {
132 break;
133 }
134 state.advance(ch.len_utf8());
135 }
136 state.add_token(JavaScriptSyntaxKind::LineComment, start, state.get_position());
137 return true;
138 }
139
140 if rest.starts_with("/*") {
142 state.advance(2);
143 let mut found_end = false;
144 while let Some(ch) = state.peek() {
145 if ch == '*' && state.peek_next_n(1) == Some('/') {
146 state.advance(2);
147 found_end = true;
148 break;
149 }
150 state.advance(ch.len_utf8());
151 }
152
153 if !found_end {
154 let error = state.syntax_error("Unterminated comment", start);
155 state.add_error(error);
156 }
157
158 state.add_token(JavaScriptSyntaxKind::BlockComment, start, state.get_position());
159 return true;
160 }
161
162 false
163 }
164
165 fn lex_string_literal<S: Source>(&self, state: &mut State<S>) -> bool {
167 let start_pos = state.get_position();
168
169 if let Some(first_char) = state.peek() {
170 if first_char == '"' || first_char == '\'' {
171 let quote = first_char;
172 state.advance(1);
173 let mut found_end = false;
174
175 while let Some(ch) = state.peek() {
176 if ch == quote {
177 state.advance(1);
178 found_end = true;
179 break;
180 }
181 else if ch == '\\' {
182 state.advance(1);
184 if let Some(_escaped) = state.peek() {
185 state.advance(1);
186 }
187 }
188 else if ch == '\n' || ch == '\r' {
189 break;
191 }
192 else {
193 state.advance(ch.len_utf8());
194 }
195 }
196
197 if !found_end {
198 let error = state.syntax_error("Unterminated string literal", start_pos);
199 state.add_error(error);
200 }
201
202 state.add_token(JavaScriptSyntaxKind::StringLiteral, start_pos, state.get_position());
203 true
204 }
205 else {
206 false
207 }
208 }
209 else {
210 false
211 }
212 }
213
214 fn lex_template_literal<S: Source>(&self, state: &mut State<S>) -> bool {
216 let start_pos = state.get_position();
217
218 if let Some('`') = state.peek() {
219 state.advance(1);
220
221 let mut found_end = false;
222 while let Some(ch) = state.peek() {
223 if ch == '`' {
224 state.advance(1);
225 found_end = true;
226 break;
227 }
228 else if ch == '\\' {
229 state.advance(1);
231 if let Some(escaped) = state.peek() {
232 state.advance(escaped.len_utf8());
233 }
234 }
235 else if ch == '$' {
236 if let Some('{') = state.peek_next_n(1) {
237 state.advance(2);
239 let mut brace_count = 1;
240 while let Some(inner_ch) = state.peek() {
241 if inner_ch == '{' {
242 brace_count += 1;
243 }
244 else if inner_ch == '}' {
245 brace_count -= 1;
246 if brace_count == 0 {
247 state.advance(1);
248 break;
249 }
250 }
251 state.advance(inner_ch.len_utf8());
252 }
253 }
254 else {
255 state.advance(ch.len_utf8());
256 }
257 }
258 else {
259 state.advance(ch.len_utf8());
260 }
261 }
262
263 if !found_end {
264 let error = state.syntax_error("Unterminated template literal", start_pos);
265 state.add_error(error);
266 }
267
268 state.add_token(JavaScriptSyntaxKind::TemplateString, start_pos, state.get_position());
269 true
270 }
271 else {
272 false
273 }
274 }
275
276 fn lex_numeric_literal<S: Source>(&self, state: &mut State<S>) -> bool {
278 let start_pos = state.get_position();
279
280 if let Some(ch) = state.peek() {
281 if ch == '0' {
283 if let Some(next) = state.peek_next_n(1) {
284 if next == 'x' || next == 'X' {
285 state.advance(2); let mut has_digits = false;
287 while let Some(hex_ch) = state.peek() {
288 if hex_ch.is_ascii_hexdigit() {
289 state.advance(1);
290 has_digits = true;
291 }
292 else {
293 break;
294 }
295 }
296
297 if !has_digits {
298 let error = state.syntax_error("Invalid hexadecimal number", start_pos);
299 state.add_error(error);
300 }
301
302 if let Some('n') = state.peek() {
304 state.advance(1);
305 state.add_token(JavaScriptSyntaxKind::BigIntLiteral, start_pos, state.get_position());
306 }
307 else {
308 state.add_token(JavaScriptSyntaxKind::NumericLiteral, start_pos, state.get_position());
309 }
310 return true;
311 }
312 }
313 }
314
315 if ch.is_ascii_digit() || (ch == '.' && self.is_next_digit(state)) {
317 if ch != '.' {
319 while let Some(digit) = state.peek() {
320 if digit.is_ascii_digit() {
321 state.advance(1);
322 }
323 else {
324 break;
325 }
326 }
327 }
328
329 if let Some('.') = state.peek() {
331 state.advance(1);
332 while let Some(digit) = state.peek() {
333 if digit.is_ascii_digit() {
334 state.advance(1);
335 }
336 else {
337 break;
338 }
339 }
340 }
341
342 if let Some(exp) = state.peek() {
344 if exp == 'e' || exp == 'E' {
345 state.advance(1);
346
347 if let Some(sign) = state.peek() {
349 if sign == '+' || sign == '-' {
350 state.advance(1);
351 }
352 }
353
354 let mut has_exp_digits = false;
356 while let Some(digit) = state.peek() {
357 if digit.is_ascii_digit() {
358 state.advance(1);
359 has_exp_digits = true;
360 }
361 else {
362 break;
363 }
364 }
365
366 if !has_exp_digits {
367 let error = state.syntax_error("Invalid number exponent", start_pos);
368 state.add_error(error);
369 }
370 }
371 }
372
373 if let Some('n') = state.peek() {
375 state.advance(1);
376 state.add_token(JavaScriptSyntaxKind::BigIntLiteral, start_pos, state.get_position());
377 }
378 else {
379 state.add_token(JavaScriptSyntaxKind::NumericLiteral, start_pos, state.get_position());
380 }
381 true
382 }
383 else {
384 false
385 }
386 }
387 else {
388 false
389 }
390 }
391
392 fn is_next_digit<S: Source>(&self, state: &State<S>) -> bool {
394 if let Some(next_ch) = state.peek_next_n(1) { next_ch.is_ascii_digit() } else { false }
395 }
396
397 fn lex_identifier_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
399 let start_pos = state.get_position();
400
401 if let Some(ch) = state.peek() {
402 if ch.is_alphabetic() || ch == '_' || ch == '$' {
403 state.advance(ch.len_utf8());
404
405 while let Some(next_ch) = state.peek() {
406 if next_ch.is_alphanumeric() || next_ch == '_' || next_ch == '$' {
407 state.advance(next_ch.len_utf8());
408 }
409 else {
410 break;
411 }
412 }
413
414 let text = state.get_text_in((start_pos..state.get_position()).into());
415 let token_kind = self.keyword_or_identifier(&text);
416 state.add_token(token_kind, start_pos, state.get_position());
417 true
418 }
419 else {
420 false
421 }
422 }
423 else {
424 false
425 }
426 }
427
428 fn keyword_or_identifier(&self, text: &str) -> JavaScriptSyntaxKind {
430 match text {
431 "abstract" => JavaScriptSyntaxKind::Abstract,
432 "as" => JavaScriptSyntaxKind::As,
433 "async" => JavaScriptSyntaxKind::Async,
434 "await" => JavaScriptSyntaxKind::Await,
435 "break" => JavaScriptSyntaxKind::Break,
436 "case" => JavaScriptSyntaxKind::Case,
437 "catch" => JavaScriptSyntaxKind::Catch,
438 "class" => JavaScriptSyntaxKind::Class,
439 "const" => JavaScriptSyntaxKind::Const,
440 "continue" => JavaScriptSyntaxKind::Continue,
441 "debugger" => JavaScriptSyntaxKind::Debugger,
442 "default" => JavaScriptSyntaxKind::Default,
443 "delete" => JavaScriptSyntaxKind::Delete,
444 "do" => JavaScriptSyntaxKind::Do,
445 "else" => JavaScriptSyntaxKind::Else,
446 "enum" => JavaScriptSyntaxKind::Enum,
447 "export" => JavaScriptSyntaxKind::Export,
448 "extends" => JavaScriptSyntaxKind::Extends,
449 "false" => JavaScriptSyntaxKind::False,
450 "finally" => JavaScriptSyntaxKind::Finally,
451 "for" => JavaScriptSyntaxKind::For,
452 "function" => JavaScriptSyntaxKind::Function,
453 "if" => JavaScriptSyntaxKind::If,
454 "implements" => JavaScriptSyntaxKind::Implements,
455 "import" => JavaScriptSyntaxKind::Import,
456 "in" => JavaScriptSyntaxKind::In,
457 "instanceof" => JavaScriptSyntaxKind::Instanceof,
458 "interface" => JavaScriptSyntaxKind::Interface,
459 "let" => JavaScriptSyntaxKind::Let,
460 "new" => JavaScriptSyntaxKind::New,
461 "null" => JavaScriptSyntaxKind::Null,
462 "package" => JavaScriptSyntaxKind::Package,
463 "private" => JavaScriptSyntaxKind::Private,
464 "protected" => JavaScriptSyntaxKind::Protected,
465 "public" => JavaScriptSyntaxKind::Public,
466 "return" => JavaScriptSyntaxKind::Return,
467 "static" => JavaScriptSyntaxKind::Static,
468 "super" => JavaScriptSyntaxKind::Super,
469 "switch" => JavaScriptSyntaxKind::Switch,
470 "this" => JavaScriptSyntaxKind::This,
471 "throw" => JavaScriptSyntaxKind::Throw,
472 "true" => JavaScriptSyntaxKind::True,
473 "try" => JavaScriptSyntaxKind::Try,
474 "typeof" => JavaScriptSyntaxKind::Typeof,
475 "undefined" => JavaScriptSyntaxKind::Undefined,
476 "var" => JavaScriptSyntaxKind::Var,
477 "void" => JavaScriptSyntaxKind::Void,
478 "while" => JavaScriptSyntaxKind::While,
479 "with" => JavaScriptSyntaxKind::With,
480 "yield" => JavaScriptSyntaxKind::Yield,
481 _ => JavaScriptSyntaxKind::IdentifierName,
482 }
483 }
484
485 fn lex_operator_or_punctuation<S: Source>(&self, state: &mut State<S>) -> bool {
487 let start_pos = state.get_position();
488
489 if let Some(ch) = state.peek() {
490 let token_kind = match ch {
491 '+' => {
492 state.advance(1);
493 match state.peek() {
494 Some('+') => {
495 state.advance(1);
496 JavaScriptSyntaxKind::PlusPlus
497 }
498 Some('=') => {
499 state.advance(1);
500 JavaScriptSyntaxKind::PlusEqual
501 }
502 _ => JavaScriptSyntaxKind::Plus,
503 }
504 }
505 '-' => {
506 state.advance(1);
507 match state.peek() {
508 Some('-') => {
509 state.advance(1);
510 JavaScriptSyntaxKind::MinusMinus
511 }
512 Some('=') => {
513 state.advance(1);
514 JavaScriptSyntaxKind::MinusEqual
515 }
516 _ => JavaScriptSyntaxKind::Minus,
517 }
518 }
519 '*' => {
520 state.advance(1);
521 match state.peek() {
522 Some('*') => {
523 state.advance(1);
524 if let Some('=') = state.peek() {
525 state.advance(1);
526 JavaScriptSyntaxKind::StarStarEqual
527 }
528 else {
529 JavaScriptSyntaxKind::StarStar
530 }
531 }
532 Some('=') => {
533 state.advance(1);
534 JavaScriptSyntaxKind::StarEqual
535 }
536 _ => JavaScriptSyntaxKind::Star,
537 }
538 }
539 '/' => {
540 if let Some(next) = state.peek_next_n(1) {
542 if next == '/' || next == '*' {
543 return false; }
545 }
546 state.advance(1);
547 if let Some('=') = state.peek() {
548 state.advance(1);
549 JavaScriptSyntaxKind::SlashEqual
550 }
551 else {
552 JavaScriptSyntaxKind::Slash
553 }
554 }
555 '%' => {
556 state.advance(1);
557 if let Some('=') = state.peek() {
558 state.advance(1);
559 JavaScriptSyntaxKind::PercentEqual
560 }
561 else {
562 JavaScriptSyntaxKind::Percent
563 }
564 }
565 '<' => {
566 state.advance(1);
567 match state.peek() {
568 Some('<') => {
569 state.advance(1);
570 if let Some('=') = state.peek() {
571 state.advance(1);
572 JavaScriptSyntaxKind::LeftShiftEqual
573 }
574 else {
575 JavaScriptSyntaxKind::LeftShift
576 }
577 }
578 Some('=') => {
579 state.advance(1);
580 JavaScriptSyntaxKind::LessEqual
581 }
582 _ => JavaScriptSyntaxKind::Less,
583 }
584 }
585 '>' => {
586 state.advance(1);
587 match state.peek() {
588 Some('>') => {
589 state.advance(1);
590 match state.peek() {
591 Some('>') => {
592 state.advance(1);
593 if let Some('=') = state.peek() {
594 state.advance(1);
595 JavaScriptSyntaxKind::UnsignedRightShiftEqual
596 }
597 else {
598 JavaScriptSyntaxKind::UnsignedRightShift
599 }
600 }
601 Some('=') => {
602 state.advance(1);
603 JavaScriptSyntaxKind::RightShiftEqual
604 }
605 _ => JavaScriptSyntaxKind::RightShift,
606 }
607 }
608 Some('=') => {
609 state.advance(1);
610 JavaScriptSyntaxKind::GreaterEqual
611 }
612 _ => JavaScriptSyntaxKind::Greater,
613 }
614 }
615 '=' => {
616 state.advance(1);
617 match state.peek() {
618 Some('=') => {
619 state.advance(1);
620 if let Some('=') = state.peek() {
621 state.advance(1);
622 JavaScriptSyntaxKind::EqualEqualEqual
623 }
624 else {
625 JavaScriptSyntaxKind::EqualEqual
626 }
627 }
628 Some('>') => {
629 state.advance(1);
630 JavaScriptSyntaxKind::Arrow
631 }
632 _ => JavaScriptSyntaxKind::Equal,
633 }
634 }
635 '!' => {
636 state.advance(1);
637 match state.peek() {
638 Some('=') => {
639 state.advance(1);
640 if let Some('=') = state.peek() {
641 state.advance(1);
642 JavaScriptSyntaxKind::NotEqualEqual
643 }
644 else {
645 JavaScriptSyntaxKind::NotEqual
646 }
647 }
648 _ => JavaScriptSyntaxKind::Exclamation,
649 }
650 }
651 '&' => {
652 state.advance(1);
653 match state.peek() {
654 Some('&') => {
655 state.advance(1);
656 if let Some('=') = state.peek() {
657 state.advance(1);
658 JavaScriptSyntaxKind::AmpersandAmpersandEqual
659 }
660 else {
661 JavaScriptSyntaxKind::AmpersandAmpersand
662 }
663 }
664 Some('=') => {
665 state.advance(1);
666 JavaScriptSyntaxKind::AmpersandEqual
667 }
668 _ => JavaScriptSyntaxKind::Ampersand,
669 }
670 }
671 '|' => {
672 state.advance(1);
673 match state.peek() {
674 Some('|') => {
675 state.advance(1);
676 if let Some('=') = state.peek() {
677 state.advance(1);
678 JavaScriptSyntaxKind::PipePipeEqual
679 }
680 else {
681 JavaScriptSyntaxKind::PipePipe
682 }
683 }
684 Some('=') => {
685 state.advance(1);
686 JavaScriptSyntaxKind::PipeEqual
687 }
688 _ => JavaScriptSyntaxKind::Pipe,
689 }
690 }
691 '^' => {
692 state.advance(1);
693 if let Some('=') = state.peek() {
694 state.advance(1);
695 JavaScriptSyntaxKind::CaretEqual
696 }
697 else {
698 JavaScriptSyntaxKind::Caret
699 }
700 }
701 '~' => {
702 state.advance(1);
703 JavaScriptSyntaxKind::Tilde
704 }
705 '?' => {
706 state.advance(1);
707 match state.peek() {
708 Some('?') => {
709 state.advance(1);
710 if let Some('=') = state.peek() {
711 state.advance(1);
712 JavaScriptSyntaxKind::QuestionQuestionEqual
713 }
714 else {
715 JavaScriptSyntaxKind::QuestionQuestion
716 }
717 }
718 Some('.') => {
719 state.advance(1);
720 JavaScriptSyntaxKind::QuestionDot
721 }
722 _ => JavaScriptSyntaxKind::Question,
723 }
724 }
725 '(' => {
726 state.advance(1);
727 JavaScriptSyntaxKind::LeftParen
728 }
729 ')' => {
730 state.advance(1);
731 JavaScriptSyntaxKind::RightParen
732 }
733 '{' => {
734 state.advance(1);
735 JavaScriptSyntaxKind::LeftBrace
736 }
737 '}' => {
738 state.advance(1);
739 JavaScriptSyntaxKind::RightBrace
740 }
741 '[' => {
742 state.advance(1);
743 JavaScriptSyntaxKind::LeftBracket
744 }
745 ']' => {
746 state.advance(1);
747 JavaScriptSyntaxKind::RightBracket
748 }
749 ';' => {
750 state.advance(1);
751 JavaScriptSyntaxKind::Semicolon
752 }
753 ',' => {
754 state.advance(1);
755 JavaScriptSyntaxKind::Comma
756 }
757 '.' => {
758 state.advance(1);
759 if let Some('.') = state.peek() {
760 if let Some('.') = state.peek_next_n(1) {
761 state.advance(2);
762 JavaScriptSyntaxKind::DotDotDot
763 }
764 else {
765 JavaScriptSyntaxKind::Dot
766 }
767 }
768 else {
769 JavaScriptSyntaxKind::Dot
770 }
771 }
772 ':' => {
773 state.advance(1);
774 JavaScriptSyntaxKind::Colon
775 }
776 _ => return false,
777 };
778
779 state.add_token(token_kind, start_pos, state.get_position());
780 true
781 }
782 else {
783 false
784 }
785 }
786}
787
788impl<'config> Lexer<JavaScriptLanguage> for JavaScriptLexer<'config> {
789 fn lex_incremental(
790 &self,
791 source: impl Source,
792 changed: usize,
793 cache: IncrementalCache<JavaScriptLanguage>,
794 ) -> LexOutput<JavaScriptLanguage> {
795 let mut state = LexerState::new_with_cache(source, changed, cache);
796 let result = self.run(&mut state);
797 state.finish(result)
798 }
799}