1use crate::{kind::JavaScriptSyntaxKind, language::JavaScriptLanguage};
2use oak_core::{Lexer, LexerCache, LexerState, OakError, TextEdit, lexer::LexOutput, source::Source};
3use std::simd::prelude::*;
4
5type State<'a, S> = LexerState<'a, S, JavaScriptLanguage>;
6
/// Lexer for JavaScript source text.
///
/// The type has no fields, so cloning it or building it through `Default`
/// is free and every instance behaves the same.
#[derive(Clone, Debug, Default)]
pub struct JavaScriptLexer {}
9
10impl JavaScriptLexer {
11 pub fn new(_config: &JavaScriptLanguage) -> Self {
12 Self {}
13 }
14
15 fn safe_check<'a, S: Source + ?Sized>(&self, state: &State<'a, S>) -> Result<(), OakError> {
16 if state.get_position() <= state.get_length() { Ok(()) } else { Err(OakError::custom_error(format!("Lexer out-of-bounds: pos={}, len={}", state.get_position(), state.get_length()))) }
17 }
18
19 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
21 while state.not_at_end() {
22 let safe_point = state.get_position();
23 self.safe_check(state)?;
24
25 if let Some(ch) = state.peek() {
26 match ch {
27 ' ' | '\t' => {
28 self.skip_whitespace(state);
29 }
30 '\n' | '\r' => {
31 self.lex_newline(state);
32 }
33 '/' => {
34 if let Some(next) = state.peek_next_n(1) {
36 if next == '/' || next == '*' {
37 self.lex_comment(state);
38 }
39 else {
40 self.lex_operator_or_punctuation(state);
41 }
42 }
43 else {
44 self.lex_operator_or_punctuation(state);
45 }
46 }
47 '"' | '\'' => {
48 self.lex_string_literal(state);
49 }
50 '`' => {
51 self.lex_template_literal(state);
52 }
53 '0'..='9' => {
54 self.lex_numeric_literal(state);
55 }
56 '.' => {
57 if self.is_next_digit(state) {
59 self.lex_numeric_literal(state);
60 }
61 else {
62 self.lex_operator_or_punctuation(state);
63 }
64 }
65 'a'..='z' | 'A'..='Z' | '_' | '$' => {
66 self.lex_identifier_or_keyword(state);
67 }
68 '+' | '-' | '*' | '%' | '<' | '>' | '=' | '!' | '&' | '|' | '^' | '~' | '?' | '(' | ')' | '{' | '}' | '[' | ']' | ';' | ',' | ':' => {
69 self.lex_operator_or_punctuation(state);
70 }
71 _ => {
72 let start = state.get_position();
73 state.advance(ch.len_utf8());
74 state.add_token(JavaScriptSyntaxKind::Error, start, state.get_position());
75 }
76 }
77 }
78
79 state.advance_if_dead_lock(safe_point);
80 }
81
82 Ok(())
83 }
84
85 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
87 let start = state.get_position();
88 let bytes = state.rest_bytes();
89 let mut i = 0;
90 let len = bytes.len();
91 const LANES: usize = 32;
92
93 while i + LANES <= len {
94 let chunk = Simd::<u8, LANES>::from_slice(unsafe { bytes.get_unchecked(i..i + LANES) });
95 let is_space = chunk.simd_eq(Simd::splat(b' '));
96 let is_tab = chunk.simd_eq(Simd::splat(b'\t'));
97 let is_ws = is_space | is_tab;
98
99 if !is_ws.all() {
100 let not_ws = !is_ws;
101 let idx = not_ws.first_set().unwrap();
102 i += idx;
103 state.advance(i);
104 state.add_token(JavaScriptSyntaxKind::Whitespace, start, state.get_position());
105 return true;
106 }
107 i += LANES;
108 }
109
110 while i < len {
111 let ch = unsafe { *bytes.get_unchecked(i) };
112 if ch != b' ' && ch != b'\t' {
113 break;
114 }
115 i += 1;
116 }
117
118 if i > 0 {
119 state.advance(i);
120 state.add_token(JavaScriptSyntaxKind::Whitespace, start, state.get_position());
121 true
122 }
123 else {
124 false
125 }
126 }
127
128 fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
130 let start_pos = state.get_position();
131
132 if let Some('\n') = state.peek() {
133 state.advance(1);
134 state.add_token(JavaScriptSyntaxKind::Newline, start_pos, state.get_position());
135 true
136 }
137 else if let Some('\r') = state.peek() {
138 state.advance(1);
139 if let Some('\n') = state.peek() {
140 state.advance(1);
141 }
142 state.add_token(JavaScriptSyntaxKind::Newline, start_pos, state.get_position());
143 true
144 }
145 else {
146 false
147 }
148 }
149
150 fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
152 let start = state.get_position();
153 let rest = state.rest();
154
155 if rest.starts_with("//") {
157 state.advance(2);
158 while let Some(ch) = state.peek() {
159 if ch == '\n' || ch == '\r' {
160 break;
161 }
162 state.advance(ch.len_utf8());
163 }
164 state.add_token(JavaScriptSyntaxKind::LineComment, start, state.get_position());
165 return true;
166 }
167
168 if rest.starts_with("/*") {
170 state.advance(2);
171 let mut found_end = false;
172 while let Some(ch) = state.peek() {
173 if ch == '*' && state.peek_next_n(1) == Some('/') {
174 state.advance(2);
175 found_end = true;
176 break;
177 }
178 state.advance(ch.len_utf8());
179 }
180
181 if !found_end {
182 let error = OakError::syntax_error("Unterminated comment".to_string(), start, None);
183 state.add_error(error);
184 }
185
186 state.add_token(JavaScriptSyntaxKind::BlockComment, start, state.get_position());
187 return true;
188 }
189
190 false
191 }
192
193 fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
195 let start_pos = state.get_position();
196
197 if let Some(first_char) = state.peek() {
198 if first_char == '"' || first_char == '\'' {
199 let quote = first_char;
200 state.advance(1);
201 let mut found_end = false;
202
203 while let Some(ch) = state.peek() {
204 if ch == quote {
205 state.advance(1);
206 found_end = true;
207 break;
208 }
209 else if ch == '\\' {
210 state.advance(1);
212 if let Some(escaped) = state.peek() {
213 state.advance(escaped.len_utf8());
214 }
215 }
216 else {
217 state.advance(ch.len_utf8());
218 }
219 }
220
221 if !found_end {
222 let error = OakError::syntax_error("Unterminated string literal".to_string(), start_pos, None);
223 state.add_error(error);
224 }
225
226 state.add_token(JavaScriptSyntaxKind::StringLiteral, start_pos, state.get_position());
227 return true;
228 }
229 }
230
231 false
232 }
233
234 fn lex_template_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
236 let start_pos = state.get_position();
237
238 if let Some('`') = state.peek() {
239 state.advance(1);
240 let mut found_end = false;
241
242 while let Some(ch) = state.peek() {
243 if ch == '`' {
244 state.advance(1);
245 found_end = true;
246 break;
247 }
248 else if ch == '\\' {
249 state.advance(1);
251 if let Some(escaped) = state.peek() {
252 state.advance(escaped.len_utf8());
253 }
254 }
255 else if ch == '$' {
256 if let Some('{') = state.peek_next_n(1) {
257 state.advance(2);
259 let mut brace_count = 1;
260 while let Some(inner_ch) = state.peek() {
261 if inner_ch == '{' {
262 brace_count += 1;
263 }
264 else if inner_ch == '}' {
265 brace_count -= 1;
266 if brace_count == 0 {
267 state.advance(1);
268 break;
269 }
270 }
271 state.advance(inner_ch.len_utf8());
272 }
273 }
274 else {
275 state.advance(ch.len_utf8());
276 }
277 }
278 else {
279 state.advance(ch.len_utf8());
280 }
281 }
282
283 if !found_end {
284 let error = OakError::syntax_error("Unterminated template literal".to_string(), start_pos, None);
285 state.add_error(error);
286 }
287
288 state.add_token(JavaScriptSyntaxKind::TemplateString, start_pos, state.get_position());
289 true
290 }
291 else {
292 false
293 }
294 }
295
296 fn lex_numeric_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
298 let start_pos = state.get_position();
299
300 if let Some(ch) = state.peek() {
301 if ch == '0' {
303 if let Some(next) = state.peek_next_n(1) {
304 if next == 'x' || next == 'X' {
305 state.advance(2); let mut has_digits = false;
307 while let Some(hex_ch) = state.peek() {
308 if hex_ch.is_ascii_hexdigit() {
309 state.advance(1);
310 has_digits = true;
311 }
312 else {
313 break;
314 }
315 }
316
317 if !has_digits {
318 let error = OakError::syntax_error("Invalid hexadecimal number".to_string(), start_pos, None);
319 state.add_error(error);
320 }
321
322 if let Some('n') = state.peek() {
324 state.advance(1);
325 state.add_token(JavaScriptSyntaxKind::BigIntLiteral, start_pos, state.get_position());
326 }
327 else {
328 state.add_token(JavaScriptSyntaxKind::NumericLiteral, start_pos, state.get_position());
329 }
330 return true;
331 }
332 }
333 }
334
335 if ch.is_ascii_digit() || (ch == '.' && self.is_next_digit(state)) {
337 if ch != '.' {
339 while let Some(digit) = state.peek() {
340 if digit.is_ascii_digit() {
341 state.advance(1);
342 }
343 else {
344 break;
345 }
346 }
347 }
348
349 if let Some('.') = state.peek() {
351 state.advance(1);
352 while let Some(digit) = state.peek() {
353 if digit.is_ascii_digit() {
354 state.advance(1);
355 }
356 else {
357 break;
358 }
359 }
360 }
361
362 if let Some(exp) = state.peek() {
364 if exp == 'e' || exp == 'E' {
365 state.advance(1);
366
367 if let Some(sign) = state.peek() {
369 if sign == '+' || sign == '-' {
370 state.advance(1);
371 }
372 }
373
374 let mut has_exp_digits = false;
376 while let Some(digit) = state.peek() {
377 if digit.is_ascii_digit() {
378 state.advance(1);
379 has_exp_digits = true;
380 }
381 else {
382 break;
383 }
384 }
385
386 if !has_exp_digits {
387 let error = OakError::syntax_error("Invalid number exponent".to_string(), start_pos, None);
388 state.add_error(error);
389 }
390 }
391 }
392
393 if let Some('n') = state.peek() {
395 state.advance(1);
396 state.add_token(JavaScriptSyntaxKind::BigIntLiteral, start_pos, state.get_position());
397 }
398 else {
399 state.add_token(JavaScriptSyntaxKind::NumericLiteral, start_pos, state.get_position());
400 }
401 true
402 }
403 else {
404 false
405 }
406 }
407 else {
408 false
409 }
410 }
411
412 fn is_next_digit<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
414 if let Some(next_ch) = state.peek_next_n(1) { next_ch.is_ascii_digit() } else { false }
415 }
416
417 fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
419 let start_pos = state.get_position();
420
421 if let Some(ch) = state.peek() {
422 if ch.is_alphabetic() || ch == '_' || ch == '$' {
423 state.advance(ch.len_utf8());
424
425 while let Some(next_ch) = state.peek() {
426 if next_ch.is_alphanumeric() || next_ch == '_' || next_ch == '$' {
427 state.advance(next_ch.len_utf8());
428 }
429 else {
430 break;
431 }
432 }
433
434 let text = state.get_text_in((start_pos..state.get_position()).into());
435 let token_kind = self.keyword_or_identifier(&text);
436 state.add_token(token_kind, start_pos, state.get_position());
437 true
438 }
439 else {
440 false
441 }
442 }
443 else {
444 false
445 }
446 }
447
448 fn keyword_or_identifier(&self, text: &str) -> JavaScriptSyntaxKind {
450 match text {
451 "abstract" => JavaScriptSyntaxKind::Abstract,
452 "as" => JavaScriptSyntaxKind::As,
453 "async" => JavaScriptSyntaxKind::Async,
454 "await" => JavaScriptSyntaxKind::Await,
455 "break" => JavaScriptSyntaxKind::Break,
456 "case" => JavaScriptSyntaxKind::Case,
457 "catch" => JavaScriptSyntaxKind::Catch,
458 "class" => JavaScriptSyntaxKind::Class,
459 "const" => JavaScriptSyntaxKind::Const,
460 "continue" => JavaScriptSyntaxKind::Continue,
461 "debugger" => JavaScriptSyntaxKind::Debugger,
462 "default" => JavaScriptSyntaxKind::Default,
463 "delete" => JavaScriptSyntaxKind::Delete,
464 "do" => JavaScriptSyntaxKind::Do,
465 "else" => JavaScriptSyntaxKind::Else,
466 "enum" => JavaScriptSyntaxKind::Enum,
467 "export" => JavaScriptSyntaxKind::Export,
468 "extends" => JavaScriptSyntaxKind::Extends,
469 "false" => JavaScriptSyntaxKind::False,
470 "finally" => JavaScriptSyntaxKind::Finally,
471 "for" => JavaScriptSyntaxKind::For,
472 "function" => JavaScriptSyntaxKind::Function,
473 "if" => JavaScriptSyntaxKind::If,
474 "implements" => JavaScriptSyntaxKind::Implements,
475 "import" => JavaScriptSyntaxKind::Import,
476 "in" => JavaScriptSyntaxKind::In,
477 "instanceof" => JavaScriptSyntaxKind::Instanceof,
478 "interface" => JavaScriptSyntaxKind::Interface,
479 "let" => JavaScriptSyntaxKind::Let,
480 "new" => JavaScriptSyntaxKind::New,
481 "null" => JavaScriptSyntaxKind::Null,
482 "package" => JavaScriptSyntaxKind::Package,
483 "private" => JavaScriptSyntaxKind::Private,
484 "protected" => JavaScriptSyntaxKind::Protected,
485 "public" => JavaScriptSyntaxKind::Public,
486 "return" => JavaScriptSyntaxKind::Return,
487 "static" => JavaScriptSyntaxKind::Static,
488 "super" => JavaScriptSyntaxKind::Super,
489 "switch" => JavaScriptSyntaxKind::Switch,
490 "this" => JavaScriptSyntaxKind::This,
491 "throw" => JavaScriptSyntaxKind::Throw,
492 "true" => JavaScriptSyntaxKind::True,
493 "try" => JavaScriptSyntaxKind::Try,
494 "typeof" => JavaScriptSyntaxKind::Typeof,
495 "undefined" => JavaScriptSyntaxKind::Undefined,
496 "var" => JavaScriptSyntaxKind::Var,
497 "void" => JavaScriptSyntaxKind::Void,
498 "while" => JavaScriptSyntaxKind::While,
499 "with" => JavaScriptSyntaxKind::With,
500 "yield" => JavaScriptSyntaxKind::Yield,
501 _ => JavaScriptSyntaxKind::IdentifierName,
502 }
503 }
504
505 fn lex_operator_or_punctuation<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
507 let start_pos = state.get_position();
508
509 if let Some(ch) = state.peek() {
510 let token_kind = match ch {
511 '+' => {
512 state.advance(1);
513 match state.peek() {
514 Some('+') => {
515 state.advance(1);
516 JavaScriptSyntaxKind::PlusPlus
517 }
518 Some('=') => {
519 state.advance(1);
520 JavaScriptSyntaxKind::PlusEqual
521 }
522 _ => JavaScriptSyntaxKind::Plus,
523 }
524 }
525 '-' => {
526 state.advance(1);
527 match state.peek() {
528 Some('-') => {
529 state.advance(1);
530 JavaScriptSyntaxKind::MinusMinus
531 }
532 Some('=') => {
533 state.advance(1);
534 JavaScriptSyntaxKind::MinusEqual
535 }
536 _ => JavaScriptSyntaxKind::Minus,
537 }
538 }
539 '*' => {
540 state.advance(1);
541 match state.peek() {
542 Some('*') => {
543 state.advance(1);
544 if let Some('=') = state.peek() {
545 state.advance(1);
546 JavaScriptSyntaxKind::StarStarEqual
547 }
548 else {
549 JavaScriptSyntaxKind::StarStar
550 }
551 }
552 Some('=') => {
553 state.advance(1);
554 JavaScriptSyntaxKind::StarEqual
555 }
556 _ => JavaScriptSyntaxKind::Star,
557 }
558 }
559 '/' => {
560 if let Some(next) = state.peek_next_n(1) {
562 if next == '/' || next == '*' {
563 return false; }
565 }
566 state.advance(1);
567 if let Some('=') = state.peek() {
568 state.advance(1);
569 JavaScriptSyntaxKind::SlashEqual
570 }
571 else {
572 JavaScriptSyntaxKind::Slash
573 }
574 }
575 '%' => {
576 state.advance(1);
577 if let Some('=') = state.peek() {
578 state.advance(1);
579 JavaScriptSyntaxKind::PercentEqual
580 }
581 else {
582 JavaScriptSyntaxKind::Percent
583 }
584 }
585 '<' => {
586 state.advance(1);
587 match state.peek() {
588 Some('<') => {
589 state.advance(1);
590 if let Some('=') = state.peek() {
591 state.advance(1);
592 JavaScriptSyntaxKind::LeftShiftEqual
593 }
594 else {
595 JavaScriptSyntaxKind::LeftShift
596 }
597 }
598 Some('=') => {
599 state.advance(1);
600 JavaScriptSyntaxKind::LessEqual
601 }
602 _ => JavaScriptSyntaxKind::Less,
603 }
604 }
605 '>' => {
606 state.advance(1);
607 match state.peek() {
608 Some('>') => {
609 state.advance(1);
610 match state.peek() {
611 Some('>') => {
612 state.advance(1);
613 if let Some('=') = state.peek() {
614 state.advance(1);
615 JavaScriptSyntaxKind::UnsignedRightShiftEqual
616 }
617 else {
618 JavaScriptSyntaxKind::UnsignedRightShift
619 }
620 }
621 Some('=') => {
622 state.advance(1);
623 JavaScriptSyntaxKind::RightShiftEqual
624 }
625 _ => JavaScriptSyntaxKind::RightShift,
626 }
627 }
628 Some('=') => {
629 state.advance(1);
630 JavaScriptSyntaxKind::GreaterEqual
631 }
632 _ => JavaScriptSyntaxKind::Greater,
633 }
634 }
635 '=' => {
636 state.advance(1);
637 match state.peek() {
638 Some('=') => {
639 state.advance(1);
640 if let Some('=') = state.peek() {
641 state.advance(1);
642 JavaScriptSyntaxKind::EqualEqualEqual
643 }
644 else {
645 JavaScriptSyntaxKind::EqualEqual
646 }
647 }
648 Some('>') => {
649 state.advance(1);
650 JavaScriptSyntaxKind::Arrow
651 }
652 _ => JavaScriptSyntaxKind::Equal,
653 }
654 }
655 '!' => {
656 state.advance(1);
657 match state.peek() {
658 Some('=') => {
659 state.advance(1);
660 if let Some('=') = state.peek() {
661 state.advance(1);
662 JavaScriptSyntaxKind::NotEqualEqual
663 }
664 else {
665 JavaScriptSyntaxKind::NotEqual
666 }
667 }
668 _ => JavaScriptSyntaxKind::Exclamation,
669 }
670 }
671 '&' => {
672 state.advance(1);
673 match state.peek() {
674 Some('&') => {
675 state.advance(1);
676 if let Some('=') = state.peek() {
677 state.advance(1);
678 JavaScriptSyntaxKind::AmpersandAmpersandEqual
679 }
680 else {
681 JavaScriptSyntaxKind::AmpersandAmpersand
682 }
683 }
684 Some('=') => {
685 state.advance(1);
686 JavaScriptSyntaxKind::AmpersandEqual
687 }
688 _ => JavaScriptSyntaxKind::Ampersand,
689 }
690 }
691 '|' => {
692 state.advance(1);
693 match state.peek() {
694 Some('|') => {
695 state.advance(1);
696 if let Some('=') = state.peek() {
697 state.advance(1);
698 JavaScriptSyntaxKind::PipePipeEqual
699 }
700 else {
701 JavaScriptSyntaxKind::PipePipe
702 }
703 }
704 Some('=') => {
705 state.advance(1);
706 JavaScriptSyntaxKind::PipeEqual
707 }
708 _ => JavaScriptSyntaxKind::Pipe,
709 }
710 }
711 '^' => {
712 state.advance(1);
713 if let Some('=') = state.peek() {
714 state.advance(1);
715 JavaScriptSyntaxKind::CaretEqual
716 }
717 else {
718 JavaScriptSyntaxKind::Caret
719 }
720 }
721 '~' => {
722 state.advance(1);
723 JavaScriptSyntaxKind::Tilde
724 }
725 '?' => {
726 state.advance(1);
727 match state.peek() {
728 Some('?') => {
729 state.advance(1);
730 if let Some('=') = state.peek() {
731 state.advance(1);
732 JavaScriptSyntaxKind::QuestionQuestionEqual
733 }
734 else {
735 JavaScriptSyntaxKind::QuestionQuestion
736 }
737 }
738 Some('.') => {
739 state.advance(1);
740 JavaScriptSyntaxKind::QuestionDot
741 }
742 _ => JavaScriptSyntaxKind::Question,
743 }
744 }
745 '(' => {
746 state.advance(1);
747 JavaScriptSyntaxKind::LeftParen
748 }
749 ')' => {
750 state.advance(1);
751 JavaScriptSyntaxKind::RightParen
752 }
753 '{' => {
754 state.advance(1);
755 JavaScriptSyntaxKind::LeftBrace
756 }
757 '}' => {
758 state.advance(1);
759 JavaScriptSyntaxKind::RightBrace
760 }
761 '[' => {
762 state.advance(1);
763 JavaScriptSyntaxKind::LeftBracket
764 }
765 ']' => {
766 state.advance(1);
767 JavaScriptSyntaxKind::RightBracket
768 }
769 ';' => {
770 state.advance(1);
771 JavaScriptSyntaxKind::Semicolon
772 }
773 ',' => {
774 state.advance(1);
775 JavaScriptSyntaxKind::Comma
776 }
777 '.' => {
778 state.advance(1);
779 if let Some('.') = state.peek() {
780 if let Some('.') = state.peek_next_n(1) {
781 state.advance(2);
782 JavaScriptSyntaxKind::DotDotDot
783 }
784 else {
785 JavaScriptSyntaxKind::Dot
786 }
787 }
788 else {
789 JavaScriptSyntaxKind::Dot
790 }
791 }
792 ':' => {
793 state.advance(1);
794 JavaScriptSyntaxKind::Colon
795 }
796 _ => return false,
797 };
798
799 state.add_token(token_kind, start_pos, state.get_position());
800 true
801 }
802 else {
803 false
804 }
805 }
806}
807
808impl Lexer<JavaScriptLanguage> for JavaScriptLexer {
809 fn lex<'a, S: Source + ?Sized>(&self, text: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<JavaScriptLanguage>) -> LexOutput<JavaScriptLanguage> {
810 let mut state = LexerState::new(text);
811 let result = self.run(&mut state);
812 if result.is_ok() {
813 state.add_eof();
814 }
815 state.finish_with_cache(result, cache)
816 }
817}