/// Token kind definitions; provides the [`JavaScriptTokenType`] used by this lexer.
pub mod token_type;
4
5use crate::{language::JavaScriptLanguage, lexer::token_type::JavaScriptTokenType};
6use oak_core::{Lexer, LexerCache, LexerState, OakError, TextEdit, lexer::LexOutput, source::Source};
7use std::simd::prelude::*;
8
/// Shorthand for the [`LexerState`] specialised to [`JavaScriptLanguage`] over a source of type `S`.
pub(crate) type State<'a, S> = LexerState<'a, S, JavaScriptLanguage>;
10
/// Lexer that turns JavaScript source text into a stream of [`JavaScriptTokenType`] tokens.
///
/// The lexer borrows its [`JavaScriptLanguage`] configuration for the lifetime `'config`.
#[derive(Clone, Debug)]
pub struct JavaScriptLexer<'config> {
    /// Language configuration supplied to [`JavaScriptLexer::new`].
    config: &'config JavaScriptLanguage,
}
16
17impl<'config> JavaScriptLexer<'config> {
18 pub fn new(config: &'config JavaScriptLanguage) -> Self {
20 Self { config }
21 }
22
23 fn safe_check<'a, S: Source + ?Sized>(&self, state: &State<'a, S>) -> Result<(), OakError> {
24 if state.get_position() <= state.get_length() { Ok(()) } else { Err(OakError::custom_error(format!("Lexer out-of-bounds: pos={}, len={}", state.get_position(), state.get_length()))) }
25 }
26
27 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
29 while state.not_at_end() {
30 let safe_point = state.get_position();
31 self.safe_check(state)?;
32
33 if let Some(ch) = state.peek() {
34 match ch {
35 ' ' | '\t' => {
36 self.skip_whitespace(state);
37 }
38 '\n' | '\r' => {
39 self.lex_newline(state);
40 }
41 '/' => {
42 if let Some(next) = state.peek_next_n(1) {
44 if next == '/' || next == '*' {
45 self.lex_comment(state);
46 }
47 else {
48 self.lex_operator_or_punctuation(state);
49 }
50 }
51 else {
52 self.lex_operator_or_punctuation(state);
53 }
54 }
55 '"' | '\'' => {
56 self.lex_string_literal(state);
57 }
58 '`' => {
59 self.lex_template_literal(state);
60 }
61 '0'..='9' => {
62 self.lex_numeric_literal(state);
63 }
64 '.' => {
65 if self.is_next_digit(state) {
67 self.lex_numeric_literal(state);
68 }
69 else {
70 self.lex_operator_or_punctuation(state);
71 }
72 }
73 'a'..='z' | 'A'..='Z' | '_' | '$' => {
74 self.lex_identifier_or_keyword(state);
75 }
76 '+' | '-' | '*' | '%' | '<' | '>' | '=' | '!' | '&' | '|' | '^' | '~' | '?' | '(' | ')' | '{' | '}' | '[' | ']' | ';' | ',' | ':' => {
77 self.lex_operator_or_punctuation(state);
78 }
79 _ => {
80 let start = state.get_position();
81 state.advance(ch.len_utf8());
82 state.add_token(JavaScriptTokenType::Error, start, state.get_position());
83 }
84 }
85 }
86
87 state.advance_if_dead_lock(safe_point)
88 }
89
90 Ok(())
91 }
92
93 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
95 let start = state.get_position();
96 let bytes = state.rest_bytes();
97 let mut i = 0;
98 let len = bytes.len();
99 const LANES: usize = 32;
100
101 while i + LANES <= len {
102 let chunk = Simd::<u8, LANES>::from_slice(unsafe { bytes.get_unchecked(i..i + LANES) });
103 let is_space = chunk.simd_eq(Simd::splat(b' '));
104 let is_tab = chunk.simd_eq(Simd::splat(b'\t'));
105 let is_ws = is_space | is_tab;
106
107 if !is_ws.all() {
108 let not_ws = !is_ws;
109 let idx = not_ws.first_set().unwrap();
110 i += idx;
111 state.advance(i);
112 state.add_token(JavaScriptTokenType::Whitespace, start, state.get_position());
113 return true;
114 }
115 i += LANES
116 }
117
118 while i < len {
119 let ch = unsafe { *bytes.get_unchecked(i) };
120 if ch != b' ' && ch != b'\t' {
121 break;
122 }
123 i += 1
124 }
125
126 if i > 0 {
127 state.advance(i);
128 state.add_token(JavaScriptTokenType::Whitespace, start, state.get_position());
129 true
130 }
131 else {
132 false
133 }
134 }
135
136 fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
138 let start_pos = state.get_position();
139
140 if let Some('\n') = state.peek() {
141 state.advance(1);
142 state.add_token(JavaScriptTokenType::Newline, start_pos, state.get_position());
143 true
144 }
145 else if let Some('\r') = state.peek() {
146 state.advance(1);
147 if let Some('\n') = state.peek() {
148 state.advance(1)
149 }
150 state.add_token(JavaScriptTokenType::Newline, start_pos, state.get_position());
151 true
152 }
153 else {
154 false
155 }
156 }
157
158 fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
160 let start = state.get_position();
161 let rest = state.rest();
162
163 if rest.starts_with("//") {
165 state.advance(2);
166 while let Some(ch) = state.peek() {
167 if ch == '\n' || ch == '\r' {
168 break;
169 }
170 state.advance(ch.len_utf8())
171 }
172 state.add_token(JavaScriptTokenType::LineComment, start, state.get_position());
173 return true;
174 }
175
176 if rest.starts_with("/*") {
178 state.advance(2);
179 let mut found_end = false;
180 while let Some(ch) = state.peek() {
181 if ch == '*' && state.peek_next_n(1) == Some('/') {
182 state.advance(2);
183 found_end = true;
184 break;
185 }
186 state.advance(ch.len_utf8())
187 }
188
189 if !found_end {
190 let error = OakError::syntax_error("Unterminated comment".to_string(), start, None);
191 state.add_error(error)
192 }
193
194 state.add_token(JavaScriptTokenType::BlockComment, start, state.get_position());
195 return true;
196 }
197
198 false
199 }
200
201 fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
203 let start_pos = state.get_position();
204
205 if let Some(first_char) = state.peek() {
206 if first_char == '"' || first_char == '\'' {
207 let quote = first_char;
208 state.advance(1);
209 let mut found_end = false;
210
211 while let Some(ch) = state.peek() {
212 if ch == quote {
213 state.advance(1);
214 found_end = true;
215 break;
216 }
217 else if ch == '\\' {
218 state.advance(1);
220 if let Some(escaped) = state.peek() {
221 state.advance(escaped.len_utf8())
222 }
223 }
224 else {
225 state.advance(ch.len_utf8())
226 }
227 }
228
229 if !found_end {
230 let error = OakError::syntax_error("Unterminated string literal".to_string(), start_pos, None);
231 state.add_error(error)
232 }
233
234 state.add_token(JavaScriptTokenType::StringLiteral, start_pos, state.get_position());
235 return true;
236 }
237 }
238
239 false
240 }
241
242 fn lex_template_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
244 let start_pos = state.get_position();
245
246 if let Some('`') = state.peek() {
247 state.advance(1);
248 let mut found_end = false;
249
250 while let Some(ch) = state.peek() {
251 if ch == '`' {
252 state.advance(1);
253 found_end = true;
254 break;
255 }
256 else if ch == '\\' {
257 state.advance(1);
259 if let Some(escaped) = state.peek() {
260 state.advance(escaped.len_utf8())
261 }
262 }
263 else if ch == '$' {
264 if let Some('{') = state.peek_next_n(1) {
265 state.advance(2);
267 let mut brace_count = 1;
268 while let Some(inner_ch) = state.peek() {
269 if inner_ch == '{' {
270 brace_count += 1
271 }
272 else if inner_ch == '}' {
273 brace_count -= 1;
274 if brace_count == 0 {
275 state.advance(1);
276 break;
277 }
278 }
279 state.advance(inner_ch.len_utf8())
280 }
281 }
282 else {
283 state.advance(ch.len_utf8())
284 }
285 }
286 else {
287 state.advance(ch.len_utf8())
288 }
289 }
290
291 if !found_end {
292 let error = OakError::syntax_error("Unterminated template literal".to_string(), start_pos, None);
293 state.add_error(error)
294 }
295
296 state.add_token(JavaScriptTokenType::TemplateString, start_pos, state.get_position());
297 true
298 }
299 else {
300 false
301 }
302 }
303
304 fn lex_numeric_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
306 let start_pos = state.get_position();
307
308 if let Some(ch) = state.peek() {
309 if ch == '0' {
311 if let Some(next) = state.peek_next_n(1) {
312 if next == 'x' || next == 'X' {
313 state.advance(2); let mut has_digits = false;
315 while let Some(hex_ch) = state.peek() {
316 if hex_ch.is_ascii_hexdigit() {
317 state.advance(1);
318 has_digits = true
319 }
320 else {
321 break;
322 }
323 }
324
325 if !has_digits {
326 let error = OakError::syntax_error("Invalid hexadecimal number".to_string(), start_pos, None);
327 state.add_error(error)
328 }
329
330 if let Some('n') = state.peek() {
332 state.advance(1);
333 state.add_token(JavaScriptTokenType::BigIntLiteral, start_pos, state.get_position())
334 }
335 else {
336 state.add_token(JavaScriptTokenType::NumericLiteral, start_pos, state.get_position())
337 }
338 return true;
339 }
340 }
341 }
342
343 if ch.is_ascii_digit() || (ch == '.' && self.is_next_digit(state)) {
345 if ch != '.' {
347 while let Some(digit) = state.peek() {
348 if digit.is_ascii_digit() { state.advance(1) } else { break }
349 }
350 }
351
352 if let Some('.') = state.peek() {
354 state.advance(1);
355 while let Some(digit) = state.peek() {
356 if digit.is_ascii_digit() { state.advance(1) } else { break }
357 }
358 }
359
360 if let Some(exp) = state.peek() {
362 if exp == 'e' || exp == 'E' {
363 state.advance(1);
364
365 if let Some(sign) = state.peek() {
367 if sign == '+' || sign == '-' {
368 state.advance(1)
369 }
370 }
371
372 let mut has_exp_digits = false;
374 while let Some(digit) = state.peek() {
375 if digit.is_ascii_digit() {
376 state.advance(1);
377 has_exp_digits = true
378 }
379 else {
380 break;
381 }
382 }
383
384 if !has_exp_digits {
385 let error = OakError::syntax_error("Invalid number exponent".to_string(), start_pos, None);
386 state.add_error(error)
387 }
388 }
389 }
390
391 if let Some('n') = state.peek() {
393 state.advance(1);
394 state.add_token(JavaScriptTokenType::BigIntLiteral, start_pos, state.get_position())
395 }
396 else {
397 state.add_token(JavaScriptTokenType::NumericLiteral, start_pos, state.get_position())
398 }
399 true
400 }
401 else {
402 false
403 }
404 }
405 else {
406 false
407 }
408 }
409
410 fn is_next_digit<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
412 if let Some(next_ch) = state.peek_next_n(1) { next_ch.is_ascii_digit() } else { false }
413 }
414
415 fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
417 let start_pos = state.get_position();
418
419 if let Some(ch) = state.peek() {
420 if ch.is_alphabetic() || ch == '_' || ch == '$' {
421 state.advance(ch.len_utf8());
422
423 while let Some(next_ch) = state.peek() {
424 if next_ch.is_alphanumeric() || next_ch == '_' || next_ch == '$' { state.advance(next_ch.len_utf8()) } else { break }
425 }
426
427 let text = state.get_text_in((start_pos..state.get_position()).into());
428 let token_kind = self.keyword_or_identifier(&text);
429 state.add_token(token_kind, start_pos, state.get_position());
430 true
431 }
432 else {
433 false
434 }
435 }
436 else {
437 false
438 }
439 }
440
441 fn keyword_or_identifier(&self, text: &str) -> JavaScriptTokenType {
443 JavaScriptTokenType::from_keyword(text).unwrap_or(JavaScriptTokenType::IdentifierName)
444 }
445
446 fn lex_operator_or_punctuation<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
448 let start_pos = state.get_position();
449
450 if let Some(ch) = state.peek() {
451 let token_kind = match ch {
452 '+' => {
453 state.advance(1);
454 match state.peek() {
455 Some('+') => {
456 state.advance(1);
457 JavaScriptTokenType::PlusPlus
458 }
459 Some('=') => {
460 state.advance(1);
461 JavaScriptTokenType::PlusEqual
462 }
463 _ => JavaScriptTokenType::Plus,
464 }
465 }
466 '-' => {
467 state.advance(1);
468 match state.peek() {
469 Some('-') => {
470 state.advance(1);
471 JavaScriptTokenType::MinusMinus
472 }
473 Some('=') => {
474 state.advance(1);
475 JavaScriptTokenType::MinusEqual
476 }
477 _ => JavaScriptTokenType::Minus,
478 }
479 }
480 '*' => {
481 state.advance(1);
482 match state.peek() {
483 Some('*') => {
484 state.advance(1);
485 if let Some('=') = state.peek() {
486 state.advance(1);
487 JavaScriptTokenType::StarStarEqual
488 }
489 else {
490 JavaScriptTokenType::StarStar
491 }
492 }
493 Some('=') => {
494 state.advance(1);
495 JavaScriptTokenType::StarEqual
496 }
497 _ => JavaScriptTokenType::Star,
498 }
499 }
500 '/' => {
501 if let Some(next) = state.peek_next_n(1) {
503 if next == '/' || next == '*' {
504 return false; }
506 }
507 state.advance(1);
508 if let Some('=') = state.peek() {
509 state.advance(1);
510 JavaScriptTokenType::SlashEqual
511 }
512 else {
513 JavaScriptTokenType::Slash
514 }
515 }
516 '%' => {
517 state.advance(1);
518 if let Some('=') = state.peek() {
519 state.advance(1);
520 JavaScriptTokenType::PercentEqual
521 }
522 else {
523 JavaScriptTokenType::Percent
524 }
525 }
526 '<' => {
527 state.advance(1);
528 match state.peek() {
529 Some('<') => {
530 state.advance(1);
531 if let Some('=') = state.peek() {
532 state.advance(1);
533 JavaScriptTokenType::LeftShiftEqual
534 }
535 else {
536 JavaScriptTokenType::LeftShift
537 }
538 }
539 Some('=') => {
540 state.advance(1);
541 JavaScriptTokenType::LessEqual
542 }
543 _ => JavaScriptTokenType::Less,
544 }
545 }
546 '>' => {
547 state.advance(1);
548 match state.peek() {
549 Some('>') => {
550 state.advance(1);
551 match state.peek() {
552 Some('>') => {
553 state.advance(1);
554 if let Some('=') = state.peek() {
555 state.advance(1);
556 JavaScriptTokenType::UnsignedRightShiftEqual
557 }
558 else {
559 JavaScriptTokenType::UnsignedRightShift
560 }
561 }
562 Some('=') => {
563 state.advance(1);
564 JavaScriptTokenType::RightShiftEqual
565 }
566 _ => JavaScriptTokenType::RightShift,
567 }
568 }
569 Some('=') => {
570 state.advance(1);
571 JavaScriptTokenType::GreaterEqual
572 }
573 _ => JavaScriptTokenType::Greater,
574 }
575 }
576 '=' => {
577 state.advance(1);
578 match state.peek() {
579 Some('=') => {
580 state.advance(1);
581 if let Some('=') = state.peek() {
582 state.advance(1);
583 JavaScriptTokenType::EqualEqualEqual
584 }
585 else {
586 JavaScriptTokenType::EqualEqual
587 }
588 }
589 Some('>') => {
590 state.advance(1);
591 JavaScriptTokenType::Arrow
592 }
593 _ => JavaScriptTokenType::Equal,
594 }
595 }
596 '!' => {
597 state.advance(1);
598 match state.peek() {
599 Some('=') => {
600 state.advance(1);
601 if let Some('=') = state.peek() {
602 state.advance(1);
603 JavaScriptTokenType::NotEqualEqual
604 }
605 else {
606 JavaScriptTokenType::NotEqual
607 }
608 }
609 _ => JavaScriptTokenType::Exclamation,
610 }
611 }
612 '&' => {
613 state.advance(1);
614 match state.peek() {
615 Some('&') => {
616 state.advance(1);
617 if let Some('=') = state.peek() {
618 state.advance(1);
619 JavaScriptTokenType::AmpersandAmpersandEqual
620 }
621 else {
622 JavaScriptTokenType::AmpersandAmpersand
623 }
624 }
625 Some('=') => {
626 state.advance(1);
627 JavaScriptTokenType::AmpersandEqual
628 }
629 _ => JavaScriptTokenType::Ampersand,
630 }
631 }
632 '|' => {
633 state.advance(1);
634 match state.peek() {
635 Some('|') => {
636 state.advance(1);
637 if let Some('=') = state.peek() {
638 state.advance(1);
639 JavaScriptTokenType::PipePipeEqual
640 }
641 else {
642 JavaScriptTokenType::PipePipe
643 }
644 }
645 Some('=') => {
646 state.advance(1);
647 JavaScriptTokenType::PipeEqual
648 }
649 _ => JavaScriptTokenType::Pipe,
650 }
651 }
652 '^' => {
653 state.advance(1);
654 if let Some('=') = state.peek() {
655 state.advance(1);
656 JavaScriptTokenType::CaretEqual
657 }
658 else {
659 JavaScriptTokenType::Caret
660 }
661 }
662 '~' => {
663 state.advance(1);
664 JavaScriptTokenType::Tilde
665 }
666 '?' => {
667 state.advance(1);
668 match state.peek() {
669 Some('?') => {
670 state.advance(1);
671 if let Some('=') = state.peek() {
672 state.advance(1);
673 JavaScriptTokenType::QuestionQuestionEqual
674 }
675 else {
676 JavaScriptTokenType::QuestionQuestion
677 }
678 }
679 Some('.') => {
680 state.advance(1);
681 JavaScriptTokenType::QuestionDot
682 }
683 _ => JavaScriptTokenType::Question,
684 }
685 }
686 '(' => {
687 state.advance(1);
688 JavaScriptTokenType::LeftParen
689 }
690 ')' => {
691 state.advance(1);
692 JavaScriptTokenType::RightParen
693 }
694 '{' => {
695 state.advance(1);
696 JavaScriptTokenType::LeftBrace
697 }
698 '}' => {
699 state.advance(1);
700 JavaScriptTokenType::RightBrace
701 }
702 '[' => {
703 state.advance(1);
704 JavaScriptTokenType::LeftBracket
705 }
706 ']' => {
707 state.advance(1);
708 JavaScriptTokenType::RightBracket
709 }
710 ';' => {
711 state.advance(1);
712 JavaScriptTokenType::Semicolon
713 }
714 ',' => {
715 state.advance(1);
716 JavaScriptTokenType::Comma
717 }
718 '.' => {
719 state.advance(1);
720 if let Some('.') = state.peek() {
721 if let Some('.') = state.peek_next_n(1) {
722 state.advance(2);
723 JavaScriptTokenType::DotDotDot
724 }
725 else {
726 JavaScriptTokenType::Dot
727 }
728 }
729 else {
730 JavaScriptTokenType::Dot
731 }
732 }
733 ':' => {
734 state.advance(1);
735 JavaScriptTokenType::Colon
736 }
737 _ => return false,
738 };
739
740 state.add_token(token_kind, start_pos, state.get_position());
741 true
742 }
743 else {
744 false
745 }
746 }
747}
748
749impl<'config> Lexer<JavaScriptLanguage> for JavaScriptLexer<'config> {
750 fn lex<'a, S: Source + ?Sized>(&self, text: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<JavaScriptLanguage>) -> LexOutput<JavaScriptLanguage> {
751 let mut state = LexerState::new(text);
752 let result = self.run(&mut state);
753 if result.is_ok() {
754 state.add_eof()
755 }
756 state.finish_with_cache(result, cache)
757 }
758}