rslint_lexer/
lib.rs

//! An extremely fast, lookup table based, ECMAScript lexer which yields SyntaxKind tokens used by the rslint_parser parser.  
2//! For the purposes of error recovery, tokens may have an error attached to them, which is reflected in the Iterator Item.  
3//! The lexer will also yield `COMMENT` and `WHITESPACE` tokens.
4//!
5//! The lexer operates on raw bytes to take full advantage of lookup table optimizations, these bytes **must** be valid utf8,
6//! therefore making a lexer from a `&[u8]` is unsafe since you must make sure the bytes are valid utf8.
//! Do not use this to learn how to lex JavaScript, this is just needlessly fast and demonic because I can't control myself :)
8//!
9//! basic ANSI syntax highlighting is also offered through the `highlight` feature.
10//!
11//! # Warning ⚠️
12//!
13//! `>>` and `>>>` are not emitted as single tokens, they are emitted as multiple `>` tokens. This is because of
14//! TypeScript parsing and productions such as `T<U<N>>`
15
16#![allow(clippy::or_fun_call)]
17
18#[macro_use]
19mod token;
20mod state;
21mod tests;
22
23
24#[rustfmt::skip]
25mod tables;
26
27pub use token::Token;
28
29#[cfg(feature = "highlight")]
30mod highlight;
31#[cfg(feature = "highlight")]
32pub use highlight::*;
33
34use rslint_errors::Diagnostic;
35use state::LexerState;
36use tables::derived_property::*;
37
38pub use rslint_syntax::*;
39pub type LexerReturn = (Token, Option<Diagnostic>);
40
// Simple macro for unwinding a loop.
//
// The statements are pasted five times up front and then five more times per
// `loop` iteration, manually unrolling the hot lexer loops. The generated
// `loop` is infinite on its own, so the pasted body is responsible for
// exiting via `return` (which every caller in this file does).
macro_rules! unwind_loop {
    ($($iter:tt)*) => {
        $($iter)*
        $($iter)*
        $($iter)*
        $($iter)*
        $($iter)*

        loop {
            $($iter)*
            $($iter)*
            $($iter)*
            $($iter)*
            $($iter)*
        }
    };
}
59
// The first utf8 byte of every valid unicode whitespace char, used for short circuiting whitespace checks
// NOTE(review): the original comments had been shifted onto the wrong lines by rustfmt;
// each label below names the char(s) whose first utf8 byte follows it.
const UNICODE_WHITESPACE_STARTS: [u8; 5] = [
    // NBSP
    0xC2,
    // BOM
    0xEF,
    // Ogham space mark
    0xE1,
    // En quad .. Hair space, narrow no break space, mathematical space
    0xE2,
    // Ideographic space
    0xE3,
];
69
// Unicode spaces, designated by the `Zs` unicode property
// NOTE(review): U+200B (zero width space) and U+FEFF (BOM/ZWNBSP) are not in `Zs`
// but are included here, presumably to match ECMAScript's WhiteSpace production —
// TODO confirm against the spec.
const UNICODE_SPACES: [char; 19] = [
    '\u{0020}', '\u{00A0}', '\u{1680}', '\u{2000}', '\u{2001}', '\u{2002}', '\u{2003}', '\u{2004}',
    '\u{2005}', '\u{2006}', '\u{2007}', '\u{2008}', '\u{2009}', '\u{200A}', '\u{200B}', '\u{202F}',
    '\u{205F}', '\u{3000}', '\u{FEFF}',
];
76
77fn is_id_start(c: char) -> bool {
78    c == '_' || c == '$' || ID_Start(c)
79}
80
81fn is_id_continue(c: char) -> bool {
82    c == '$' || c == '\u{200d}' || c == '\u{200c}' || ID_Continue(c)
83}
84
/// An extremely fast, lookup table based, lossless ECMAScript lexer
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Lexer<'src> {
    // The raw source; must be valid utf8 (see `from_bytes`'s safety contract)
    bytes: &'src [u8],
    // Byte offset of the lexer's current position into `bytes`
    cur: usize,
    // Extra lexing context, e.g. whether a linebreak was crossed (`had_linebreak`)
    state: LexerState,
    pub file_id: usize,
    // Whether EOF has already been yielded — presumably consulted by the
    // Iterator impl, which is not visible in this chunk; TODO confirm
    returned_eof: bool,
}
94
95impl<'src> Lexer<'src> {
    /// Make a new lexer from raw bytes.
    ///
    /// The lexer starts at offset 0 with fresh state.
    ///
    /// # Safety
    /// You must make sure the bytes are valid utf8, failure to do so is undefined behavior.
    pub unsafe fn from_bytes(bytes: &'src [u8], file_id: usize) -> Self {
        Self {
            bytes,
            cur: 0,
            file_id,
            state: LexerState::new(),
            returned_eof: false,
        }
    }
109
110    /// Make a new lexer from a str, this is safe because strs are valid utf8
111    pub fn from_str(string: &'src str, file_id: usize) -> Self {
112        Self {
113            bytes: string.as_bytes(),
114            cur: 0,
115            file_id,
116            state: LexerState::new(),
117            returned_eof: false,
118        }
119    }
120
121    // Bump the lexer and return the token given in
122    fn eat(&mut self, tok: LexerReturn) -> LexerReturn {
123        self.next();
124        tok
125    }
126
    // Consume all whitespace starting from the current byte.
    // Records `state.had_linebreak` whenever a linebreak char is crossed, so the
    // parser can implement ASI-style rules.
    fn consume_whitespace(&mut self) {
        unwind_loop! {
            if let Some(byte) = self.next().copied() {
                // This is the most likely scenario, unicode spaces are very uncommon
                if DISPATCHER[byte as usize] != Dispatch::WHS {
                    // try to short circuit the branch by checking the first byte of the potential unicode space
                    if byte > 0xC1 && UNICODE_WHITESPACE_STARTS.contains(&byte) {
                        let chr = self.get_unicode_char();
                        // unicode linebreaks (e.g. LS/PS) still count as linebreaks
                        if is_linebreak(chr) {
                            self.state.had_linebreak = true;
                        }
                        // not actually a unicode space — stop, leaving `cur` on its first byte
                        if !UNICODE_SPACES.contains(&chr) {
                            return;
                        }
                        // skip the remaining continuation bytes of the multi-byte char
                        self.cur += chr.len_utf8() - 1;
                    } else {
                        return;
                    }
                }
                // ascii-range linebreaks arrive here; NOTE(review): assumes the
                // dispatch table classifies them as WHS — confirm against DISPATCHER
                if is_linebreak(byte as char) {
                    self.state.had_linebreak = true;
                }
            } else {
                // EOF
                return;
            }
        }
    }
155
    // Get the unicode char which starts at the current byte.
    // NOTE(review): contrary to the old comment, this does NOT advance the cursor —
    // it takes `&self`; every caller advances by `chr.len_utf8()` itself.
    fn get_unicode_char(&self) -> char {
        // This is unreachable for all intents and purposes, but this is just a precautionary measure
        debug_assert!(self.cur < self.bytes.len());

        // Safety: We know this is safe because we require the input to the lexer to be valid utf8 and we always call this when we are at a char
        let string = unsafe { std::str::from_utf8_unchecked(self.bytes.get_unchecked(self.cur..)) };
        let chr = if let Some(chr) = string.chars().next() {
            chr
        } else {
            // Safety: we always call this when we are at a valid char, so this branch is completely unreachable
            unsafe {
                core::hint::unreachable_unchecked();
            }
        };

        chr
    }
174
    // Get the next byte and advance the index.
    // Advances unconditionally, so at EOF `cur` can run one past the end of `bytes`.
    #[inline]
    fn next(&mut self) -> Option<&u8> {
        self.cur += 1;
        self.bytes.get(self.cur)
    }
181
182    // Get the next byte but only advance the index if there is a next byte
183    // This is really just a hack for certain methods like escapes
184    #[inline]
185    fn next_bounded(&mut self) -> Option<&u8> {
186        if let Some(b) = self.bytes.get(self.cur + 1) {
187            self.cur += 1;
188            Some(b)
189        } else {
190            if self.cur != self.bytes.len() {
191                self.cur += 1;
192            }
193            None
194        }
195    }
196
    // Advance the cursor by `amount` bytes without inspecting them
    fn advance(&mut self, amount: usize) {
        self.cur += amount;
    }
200
    // Look up the dispatch category of a byte in the static lookup table
    fn lookup(byte: u8) -> Dispatch {
        // Safety: our lookup table maps all values of u8, so its impossible for a u8 to be out of bounds
        unsafe { *DISPATCHER.get_unchecked(byte as usize) }
    }
205
206    // Read a `\u{000...}` escape sequence, this expects the cur char to be the `{`
207    fn read_codepoint_escape(&mut self) -> Result<char, Diagnostic> {
208        let start = self.cur + 1;
209        self.read_hexnumber();
210
211        if self.bytes.get(self.cur) != Some(&b'}') {
212            // We should not yield diagnostics on a unicode char boundary. That wont make codespan panic
213            // but it may cause a panic for other crates which just consume the diagnostics
214            let invalid = self.get_unicode_char();
215            let err = Diagnostic::error(self.file_id, "", "expected hex digits for a unicode code point escape, but encountered an invalid character")
216                .primary(self.cur .. invalid.len_utf8(), "");
217
218            self.cur -= 1;
219            return Err(err);
220        }
221
222        // Safety: We know for a fact this is in bounds because we must be on the possible char after the } at this point
223        // which means its impossible for the range of the digits to be out of bounds.
224        // We also know we cant possibly be indexing a unicode char boundary because a unicode char (which cant be a hexdigit)
225        // would have triggered the if statement above. We also know this must be valid utf8, both because of read_hexnumber's behavior
226        // and because input to the lexer must be valid utf8
227        let digits_str = unsafe {
228            debug_assert!(self.bytes.get(start..self.cur).is_some());
229            debug_assert!(std::str::from_utf8(self.bytes.get_unchecked(start..self.cur)).is_ok());
230
231            std::str::from_utf8_unchecked(self.bytes.get_unchecked(start..self.cur))
232        };
233
234        match u32::from_str_radix(digits_str, 16) {
235            Ok(digits) if digits <= 0x10FFFF => {
236                let res = std::char::from_u32(digits);
237                if let Some(chr) = res {
238                    Ok(chr)
239                } else {
240                    let err =
241                        Diagnostic::error(self.file_id, "", "invalid codepoint for unicode escape")
242                            .primary(start..self.cur, "");
243
244                    Err(err)
245                }
246            }
247
248            _ => {
249                let err = Diagnostic::error(
250                    self.file_id,
251                    "",
252                    "out of bounds codepoint for unicode codepoint escape sequence",
253                )
254                .primary(start..self.cur, "")
255                .footer_note("Codepoints range from 0 to 0x10FFFF (1114111)");
256
257                Err(err)
258            }
259        }
260    }
261
    // Read a `\u0000` escape sequence, this expects the current char to be the `u`, it also does not skip over the escape sequence
    // The pos after this method is the last hex digit.
    //
    // If `advance` is false the cursor is rewound before returning, so the caller
    // observes no net movement on either the success or error path.
    fn read_unicode_escape(&mut self, advance: bool) -> Result<char, Diagnostic> {
        debug_assert_eq!(self.bytes[self.cur], b'u');

        // Built eagerly so both failure arms below can return the same value;
        // `#![allow(clippy::or_fun_call)]` at the top of the file covers this style
        let diagnostic = Diagnostic::error(
            self.file_id,
            "",
            "invalid digits after unicode escape sequence",
        )
        .primary(
            self.cur - 1..self.cur + 1,
            "expected 4 hex digits following this",
        );

        // exactly 4 hex digits are required
        for idx in 0..4 {
            match self.next_bounded() {
                None => {
                    // rewind past the digits consumed so far plus the `u`
                    if !advance {
                        self.cur -= idx + 1;
                    }
                    return Err(diagnostic);
                }
                Some(b) if !b.is_ascii_hexdigit() => {
                    if !advance {
                        self.cur -= idx + 1;
                    }
                    return Err(diagnostic);
                }
                _ => {}
            }
        }

        unsafe {
            // Safety: input to the lexer is guaranteed to be valid utf8 and so is the range since we return if there is a wrong amount of digits beforehand
            let digits_str = std::str::from_utf8_unchecked(
                self.bytes.get_unchecked((self.cur - 3)..(self.cur + 1)),
            );
            if let Ok(digits) = u32::from_str_radix(digits_str, 16) {
                // rewind over the 4 digits, leaving the cursor on the `u`
                if !advance {
                    self.cur -= 4;
                }
                // Safety: we make sure the 4 chars are hex digits beforehand, and 4 hex digits cannot make an invalid char
                Ok(std::char::from_u32_unchecked(digits))
            } else {
                // Safety: we know this is unreachable because 4 hexdigits cannot make an out of bounds char,
                // and we make sure that the chars are actually hex digits
                core::hint::unreachable_unchecked();
            }
        }
    }
313
314    // Validate a `\x00 escape sequence, this expects the current char to be the `x`, it also does not skip over the escape sequence
315    // The pos after this method is the last hex digit
316    fn validate_hex_escape(&mut self) -> Option<Diagnostic> {
317        debug_assert_eq!(self.bytes[self.cur], b'x');
318
319        let diagnostic =
320            Diagnostic::error(self.file_id, "", "invalid digits after hex escape sequence")
321                .primary(
322                    (self.cur - 1)..(self.cur + 1),
323                    "Expected 2 hex digits following this",
324                );
325
326        for _ in 0..2 {
327            match self.next_bounded() {
328                None => return Some(diagnostic),
329                Some(b) if !(*b as u8).is_ascii_hexdigit() => return Some(diagnostic),
330                _ => {}
331            }
332        }
333        None
334    }
335
    // Validate a `\..` escape sequence and advance the lexer based on it.
    // Dispatches on the byte after the backslash: `\u{...}` codepoint escapes,
    // `\uXXXX` unicode escapes, `\xXX` hex escapes, or any single escaped char.
    fn validate_escape_sequence(&mut self) -> Option<Diagnostic> {
        let cur = self.cur;
        if let Some(escape) = self.bytes.get(self.cur + 1) {
            match escape {
                b'u' if self.bytes.get(self.cur + 2) == Some(&b'{') => {
                    // skip the `\u` so the cursor lands on the `{` expected by read_codepoint_escape
                    self.advance(2);
                    self.read_codepoint_escape().err()
                }
                b'u' => {
                    self.next();
                    self.read_unicode_escape(true).err()
                }
                b'x' => {
                    self.next();
                    self.validate_hex_escape()
                }
                _ => {
                    // We use get_unicode_char to account for escaped source characters which are unicode
                    let chr = self.get_unicode_char();
                    self.cur += chr.len_utf8();
                    None
                }
            }
        } else {
            // a lone trailing backslash at EOF
            // NOTE(review): the error title is the empty string; the primary label carries the message
            Some(Diagnostic::error(self.file_id, "", "").primary(
                cur..cur + 1,
                "expected an escape sequence following a backslash, but found none",
            ))
        }
    }
367
368    // Consume an identifier by recursively consuming IDENTIFIER_PART kind chars
369    #[inline]
370    fn consume_ident(&mut self) {
371        unwind_loop! {
372            if self.next_bounded().is_some() {
373                if self.cur_ident_part().is_none() {
374                    return;
375                }
376            } else {
377                return;
378            }
379        }
380    }
381
382    /// Consumes the identifier at the current position, and fills the given buf with the UTF-8
383    /// encoded identifier that got consumed.
384    ///
385    /// Returns the number of bytes written into the buffer.
386    /// This method will stop writing into the buffer if the buffer is too small to
387    /// fit the whole identifier.
388    #[inline]
389    fn consume_and_get_ident(&mut self, buf: &mut [u8]) -> usize {
390        let mut idx = 0;
391
392        unwind_loop! {
393            if self.next_bounded().is_some() {
394                if let Some(c) = self.cur_ident_part() {
395                    if let Some(buf) = buf.get_mut(idx..idx + 4) {
396                        let res = c.encode_utf8(buf);
397                        idx += res.len();
398                    }
399                } else {
400                    return idx;
401                }
402            } else {
403                return idx;
404            }
405        }
406    }
407
408    // Consume a string literal and advance the lexer, and returning a list of errors that occurred when reading the string
409    // This could include unterminated string and invalid escape sequences
410    fn read_str_literal(&mut self) -> Option<Diagnostic> {
411        // Safety: this is only ever called from lex_token, which is guaranteed to be called on a char position
412        let quote = unsafe { *self.bytes.get_unchecked(self.cur) };
413        let start = self.cur;
414        let mut diagnostic = None;
415
416        while let Some(byte) = self.next_bounded() {
417            match *byte {
418                b'\\' => {
419                    diagnostic = self.validate_escape_sequence();
420                }
421                b if b == quote => {
422                    self.next();
423                    return diagnostic;
424                }
425                _ => {}
426            }
427        }
428
429        let unterminated = Diagnostic::error(self.file_id, "", "unterminated string literal")
430            .primary(self.cur..self.cur, "input ends here")
431            .secondary(start..start + 1, "string literal starts here");
432
433        Some(unterminated)
434    }
435
    /// Returns `Some(x)` if the current position is an identifier, with the character at
    /// the position.
    ///
    /// The character may be a char that was generated from a unicode escape sequence,
    /// e.g. `t` is returned, the actual source code is `\u{74}`
    #[inline]
    fn cur_ident_part(&mut self) -> Option<char> {
        debug_assert!(self.cur < self.bytes.len());

        // Safety: we always call this method on a char
        let b = unsafe { self.bytes.get_unchecked(self.cur) };

        match Self::lookup(*b) {
            // plain ascii identifier chars and digits
            IDT | DIG | ZER => Some(*b as char),
            // FIXME: This should use ID_Continue, not XID_Continue
            UNI => {
                // multi-byte unicode char: decode it, then skip its continuation bytes
                let chr = self.get_unicode_char();
                let res = is_id_continue(chr);
                if res {
                    self.cur += chr.len_utf8() - 1;
                    Some(chr)
                } else {
                    None
                }
            }
            // a `\u...` escape may decode to an identifier-continue char
            BSL if self.bytes.get(self.cur + 1) == Some(&b'u') => {
                let start = self.cur;
                self.next();
                let res = if self.bytes.get(self.cur + 1).copied() == Some(b'{') {
                    self.next();
                    self.read_codepoint_escape()
                } else {
                    self.read_unicode_escape(true)
                };

                if let Ok(c) = res {
                    if is_id_continue(c) {
                        Some(c)
                    } else {
                        // valid escape but not an identifier char: back up one byte
                        self.cur -= 1;
                        None
                    }
                } else {
                    // invalid escape: restore the cursor to the backslash
                    self.cur = start;
                    None
                }
            }
            _ => None,
        }
    }
486
    // check if the current char is an identifier start, this implicitly advances if the char being matched
    // is a `\uxxxx` sequence which is an identifier start, or if the char is a unicode char which is an identifier start
    #[inline]
    fn cur_is_ident_start(&mut self) -> bool {
        debug_assert!(self.cur < self.bytes.len());

        // Safety: we always call this method on a char
        let b = unsafe { self.bytes.get_unchecked(self.cur) };

        match Self::lookup(*b) {
            // `\u....` escape: peek-decode without advancing, then skip the whole
            // 6-byte sequence (`\uXXXX`) only if it decodes to an id-start char
            BSL if self.bytes.get(self.cur + 1) == Some(&b'u') => {
                self.next();
                if let Ok(chr) = self.read_unicode_escape(false) {
                    if is_id_start(chr) {
                        self.advance(5);
                        return true;
                    }
                }
                // not an id start: restore the cursor to the backslash
                self.cur -= 1;
                false
            }
            UNI => {
                let chr = self.get_unicode_char();
                if is_id_start(chr) {
                    // skip the continuation bytes of the multi-byte char
                    self.cur += chr.len_utf8() - 1;
                    true
                } else {
                    false
                }
            }
            IDT => true,
            _ => false,
        }
    }
521
522    /// Returns the identifier token at the current position, or the keyword token if
523    /// the identifier is a keyword.
524    ///
525    /// `first` is a pair of a character that was already consumed,
526    /// but is still part of the identifier, and the characters position.
527    #[inline]
528    fn resolve_identifier(&mut self, first: (char, usize)) -> LexerReturn {
529        use SyntaxKind::*;
530
531        // Note to keep the buffer large enough to fit every possible keyword that
532        // the lexer can return
533        let mut buf = [0u8; 16];
534        let (len, start) = (first.0.encode_utf8(&mut buf).len(), first.1);
535
536        let count = self.consume_and_get_ident(&mut buf[len..]);
537
538        let kind = match &buf[..count + len] {
539            b"await" => Some(AWAIT_KW),
540            b"break" => Some(BREAK_KW),
541            b"case" => Some(CASE_KW),
542            b"catch" => Some(CATCH_KW),
543            b"class" => Some(CLASS_KW),
544            b"const" => Some(CONST_KW),
545            b"continue" => Some(CONTINUE_KW),
546            b"debugger" => Some(DEBUGGER_KW),
547            b"default" => Some(DEFAULT_KW),
548            b"delete" => Some(DELETE_KW),
549            b"do" => Some(DO_KW),
550            b"else" => Some(ELSE_KW),
551            b"enum" => Some(ENUM_KW),
552            b"export" => Some(EXPORT_KW),
553            b"extends" => Some(EXTENDS_KW),
554            b"false" => Some(FALSE_KW),
555            b"finally" => Some(FINALLY_KW),
556            b"for" => Some(FOR_KW),
557            b"function" => Some(FUNCTION_KW),
558            b"if" => Some(IF_KW),
559            b"in" => Some(IN_KW),
560            b"import" => Some(IMPORT_KW),
561            b"instanceof" => Some(INSTANCEOF_KW),
562            b"new" => Some(NEW_KW),
563            b"null" => Some(NULL_KW),
564            b"return" => Some(RETURN_KW),
565            b"super" => Some(SUPER_KW),
566            b"switch" => Some(SWITCH_KW),
567            b"this" => Some(THIS_KW),
568            b"throw" => Some(THROW_KW),
569            b"try" => Some(TRY_KW),
570            b"true" => Some(TRUE_KW),
571            b"typeof" => Some(TYPEOF_KW),
572            b"var" => Some(VAR_KW),
573            b"void" => Some(VOID_KW),
574            b"while" => Some(WHILE_KW),
575            b"with" => Some(WITH_KW),
576            b"yield" => Some(YIELD_KW),
577            _ => None,
578        };
579
580        if let Some(kind) = kind {
581            (Token::new(kind, self.cur - start), None)
582        } else {
583            (Token::new(T![ident], self.cur - start), None)
584        }
585    }
586
587    #[inline]
588    fn special_number_start<F: Fn(char) -> bool>(&mut self, func: F) -> bool {
589        if self
590            .bytes
591            .get(self.cur + 2)
592            .map(|b| func(*b as char))
593            .unwrap_or(false)
594        {
595            self.cur += 1;
596            true
597        } else {
598            false
599        }
600    }
601
602    #[inline]
603    fn maybe_bigint(&mut self) {
604        if let Some(b'n') = self.bytes.get(self.cur) {
605            self.next();
606        }
607    }
608
    // Read a numeric literal which starts with `0`: hex, binary, and octal prefixes,
    // bigints, floats, and exponents all dispatch from here.
    // Expects the cursor to be on the `0`.
    #[inline]
    fn read_zero(&mut self) -> Option<Diagnostic> {
        // TODO: Octal literals
        match self.bytes.get(self.cur + 1) {
            Some(b'x') | Some(b'X') => {
                // `0x` must be followed by at least one hex digit to be a hex literal
                if self.special_number_start(|c| c.is_ascii_hexdigit()) {
                    let diag = self.read_hexnumber();
                    self.maybe_bigint();
                    diag
                } else {
                    // a bare `0x` — just skip the `x`; the caller deals with what follows
                    self.next();
                    None
                }
            }
            Some(b'b') | Some(b'B') => {
                // `0b` binary literal: requires at least one of `0`/`1`
                if self.special_number_start(|c| c == '0' || c == '1') {
                    let diag = self.read_bindigits();
                    self.maybe_bigint();
                    diag
                } else {
                    self.next();
                    None
                }
            }
            Some(b'o') | Some(b'O') => {
                // `0o` octal literal: requires at least one of `0`..`7`
                if self.special_number_start(|c| ('0'..='7').contains(&c)) {
                    let diag = self.read_octaldigits();
                    self.maybe_bigint();
                    diag
                } else {
                    self.next();
                    None
                }
            }
            Some(b'n') => {
                // `0n` — the zero bigint; skip both bytes
                self.cur += 2;
                None
            }
            Some(b'.') => {
                // `0.` — a decimal float
                self.cur += 1;
                self.read_float()
            }
            Some(b'e') | Some(b'E') => {
                // At least one digit is required
                match self.bytes.get(self.cur + 2) {
                    Some(b'-') | Some(b'+') => {
                        if let Some(b'0'..=b'9') = self.bytes.get(self.cur + 3) {
                            self.next();
                            self.read_exponent()
                        } else {
                            None
                        }
                    }
                    Some(b'0'..=b'9') => self.read_exponent(),
                    _ => {
                        self.next();
                        None
                    }
                }
            }
            // FIXME: many engines actually allow things like `09`, but by the spec, this is not allowed
            // maybe we should not allow it if we want to go fully by the spec
            _ => self.read_number(),
        }
    }
674
675    #[inline]
676    fn read_hexnumber(&mut self) -> Option<Diagnostic> {
677        let mut diag = None;
678        unwind_loop! {
679            match self.next() {
680                Some(b'_') => diag = diag.or(self.handle_numeric_separator(16)),
681                Some(b) if char::from(*b).is_ascii_hexdigit() => {},
682                _ => return diag,
683            }
684        }
685    }
686
687    #[inline]
688    fn handle_numeric_separator(&mut self, radix: u8) -> Option<Diagnostic> {
689        debug_assert_eq!(self.bytes[self.cur], b'_');
690
691        let err_diag = Diagnostic::error(
692            self.file_id,
693            "",
694            "numeric separators are only allowed between two digits",
695        )
696        .primary(self.cur..self.cur + 1, "");
697
698        let peeked = self.bytes.get(self.cur + 1).copied();
699
700        if peeked.is_none() || !char::from(peeked.unwrap()).is_digit(radix as u32) {
701            return Some(err_diag);
702        }
703
704        let forbidden = |c: Option<u8>| {
705            if c.is_none() {
706                return true;
707            }
708            let c = c.unwrap();
709
710            if radix == 16 {
711                matches!(c, b'.' | b'X' | b'_' | b'x')
712            } else {
713                matches!(c, b'.' | b'B' | b'E' | b'O' | b'_' | b'b' | b'e' | b'o')
714            }
715        };
716
717        let prev = self.bytes.get(self.cur - 1).copied();
718
719        if forbidden(prev) || forbidden(peeked) {
720            return Some(err_diag);
721        }
722
723        self.next_bounded();
724        None
725    }
726
    // Read a number which does not start with 0, since that can be more things and is handled
    // by another function.
    // Handles separators, the float dot, exponents, and a trailing bigint `n`.
    #[inline]
    fn read_number(&mut self) -> Option<Diagnostic> {
        let mut diag = None;
        unwind_loop! {
            match self.next_bounded() {
                Some(b'_') => diag = diag.or(self.handle_numeric_separator(10)),
                Some(b'0'..=b'9') => {},
                Some(b'.') => {
                    // the fractional part takes over from here
                    return self.read_float();
                },
                // TODO: merge this, and read_float's implementation into one so we dont duplicate exponent code
                Some(b'e') | Some(b'E') => {
                    // At least one digit is required
                    match self.bytes.get(self.cur + 1) {
                        Some(b'-') | Some(b'+') => {
                            if let Some(b'0'..=b'9') = self.bytes.get(self.cur + 2) {
                                self.next();
                                return self.read_exponent();
                            } else {
                                // `1e+` with no digit — not an exponent; stop here
                                return diag;
                            }
                        },
                        Some(b'0'..=b'9') => return self.read_exponent(),
                        _ => return diag,
                    }
                },
                Some(b'n') => {
                    // bigint suffix terminates the literal
                    self.next();
                    return diag;
                }
                _ => return diag,
            }
        }
    }
763
764    #[inline]
765    fn read_float(&mut self) -> Option<Diagnostic> {
766        let mut diag = None;
767
768        unwind_loop! {
769            match self.next_bounded() {
770                Some(b'_') => diag = diag.or(self.handle_numeric_separator(16)),
771                // LLVM has a hard time optimizing inclusive patterns, perhaps we should check if it makes llvm sad,
772                // and optimize this into a lookup table
773                Some(b'0'..=b'9') => {},
774                Some(b'e') | Some(b'E') => {
775                    // At least one digit is required
776                    match self.bytes.get(self.cur + 1) {
777                        Some(b'-') | Some(b'+') => {
778                            if let Some(b'0'..=b'9') = self.bytes.get(self.cur + 2) {
779                                self.next();
780                                return self.read_exponent().or(diag);
781                            } else {
782                                return diag;
783                            }
784                        },
785                        Some(b'0'..=b'9') => return self.read_exponent().or(diag),
786                        _ => return diag,
787                    }
788                },
789                _ => return diag,
790            }
791        }
792    }
793
794    #[inline]
795    fn read_exponent(&mut self) -> Option<Diagnostic> {
796        if let Some(b'-') | Some(b'+') = self.bytes.get(self.cur + 1) {
797            self.next();
798        }
799
800        let mut diag = None;
801        unwind_loop! {
802            match self.next() {
803                Some(b'_') => diag = diag.or(self.handle_numeric_separator(10)),
804                Some(b'0'..=b'9') => {},
805                _ => return diag,
806            }
807        }
808    }
809
810    #[inline]
811    fn read_bindigits(&mut self) -> Option<Diagnostic> {
812        let mut diag = None;
813        unwind_loop! {
814            match self.next() {
815                Some(b'_') => diag = diag.or(self.handle_numeric_separator(2)),
816                Some(b'0') | Some(b'1') => {},
817                _ => return diag,
818            }
819        }
820    }
821
822    #[inline]
823    fn read_octaldigits(&mut self) -> Option<Diagnostic> {
824        let mut diag = None;
825        unwind_loop! {
826            match self.next() {
827                Some(b'_') => diag = diag.or(self.handle_numeric_separator(8)),
828                Some(b'0'..=b'7') => {},
829                _ => return diag,
830            }
831        }
832    }
833
    // Verify that a number literal is not directly followed by an identifier start
    // (e.g. `123abc`); on violation, consumes the identifier and yields an ERROR_TOKEN
    // spanning the whole thing, otherwise yields the NUMBER token. `start` is the
    // byte offset where the literal began.
    #[inline]
    fn verify_number_end(&mut self, start: usize) -> LexerReturn {
        let err_start = self.cur;
        if self.cur < self.bytes.len() && self.cur_is_ident_start() {
            self.consume_ident();
            let err = Diagnostic::error(
                self.file_id,
                "",
                "numbers cannot be followed by identifiers directly after",
            )
            .primary(err_start..self.cur, "an identifier cannot appear here");

            (
                Token::new(SyntaxKind::ERROR_TOKEN, self.cur - start),
                Some(err),
            )
        } else {
            tok!(NUMBER, self.cur - start)
        }
    }
854
    /// Lex a `#!` shebang line, which is only recognized at the very start
    /// of the file.
    ///
    /// If the `#` is not at offset 0 a bare `#` token is returned; if it is
    /// not followed by `!` an error token is returned. Otherwise everything
    /// up to (but not including) the next linebreak becomes a `SHEBANG`
    /// token.
    #[inline]
    fn read_shebang(&mut self) -> LexerReturn {
        let start = self.cur;
        self.next();
        // Shebangs are only valid at the start of the file.
        if start != 0 {
            return (Token::new(T![#], 1), None);
        }

        if let Some(b'!') = self.bytes.get(1) {
            while self.next().is_some() {
                let chr = self.get_unicode_char();

                if is_linebreak(chr) {
                    return tok!(SHEBANG, self.cur);
                }
                // Skip the remaining bytes of a multibyte char
                // (`next` already stepped past the first byte).
                self.cur += chr.len_utf8() - 1;
            }
            // EOF before a linebreak: the whole file is the shebang.
            tok!(SHEBANG, self.cur)
        } else {
            let err = Diagnostic::error(
                self.file_id,
                "",
                "expected `!` following a `#`, but found none",
            )
            .primary(0usize..1usize, "");

            (Token::new(SyntaxKind::ERROR_TOKEN, 1), Some(err))
        }
    }
884
    /// Dispatch a leading `/`: block comment (`/*`), line comment (`//`),
    /// regex literal (when parser state allows an expression here), `/=`,
    /// or a plain division token.
    #[inline]
    fn read_slash(&mut self) -> LexerReturn {
        let start = self.cur;
        match self.bytes.get(self.cur + 1) {
            Some(b'*') => {
                self.next();
                while let Some(b) = self.next().copied() {
                    match b {
                        // `*/` terminates the block comment.
                        b'*' if self.bytes.get(self.cur + 1) == Some(&b'/') => {
                            self.advance(2);
                            return tok!(COMMENT, self.cur - start);
                        }
                        _ => {}
                    }
                }

                // Hit EOF before `*/`: still emit a COMMENT token so the
                // parser can recover, but attach an error.
                let err = Diagnostic::error(self.file_id, "", "unterminated block comment")
                    .primary(self.cur..self.cur + 1, "... but the file ends here")
                    .secondary(start..start + 2, "A block comment starts here");

                (Token::new(SyntaxKind::COMMENT, self.cur - start), Some(err))
            }
            Some(b'/') => {
                self.next();
                // Line comments run to the next linebreak (or EOF).
                while self.next().is_some() {
                    let chr = self.get_unicode_char();

                    if is_linebreak(chr) {
                        return tok!(COMMENT, self.cur - start);
                    }
                    // Account for multibyte chars; `next` advanced one byte.
                    self.cur += chr.len_utf8() - 1;
                }
                tok!(COMMENT, self.cur - start)
            }
            // NOTE: checked before the `/=` arm on purpose — in expression
            // position a `/` always begins a regex literal, so `/=...` is
            // lexed as a regex here, not as a compound assignment.
            _ if self.state.expr_allowed => self.read_regex(),
            Some(b'=') => {
                self.advance(2);
                tok!(SLASHEQ, self.cur - start)
            }
            _ => self.eat(tok![/]),
        }
    }
927
928    #[inline]
929    fn flag_err(&self, flag: char) -> Diagnostic {
930        Diagnostic::error(self.file_id, "", format!("duplicate flag `{}`", flag))
931            .primary(self.cur..self.cur + 1, "this flag was already used")
932    }
933
    // TODO: Due to our return of (Token, Option<Error>) we cant issue more than one regex error
    // This is not a huge issue but it would be helpful to users
    /// Lex a regex literal, assuming the cursor is on the opening `/`.
    ///
    /// Tracks `[...]` character classes (a `/` inside a class does not end
    /// the regex), validates the flags after the closing `/`, and reports
    /// duplicate or invalid flags as well as unterminated literals.
    #[inline]
    #[allow(clippy::many_single_char_names)]
    fn read_regex(&mut self) -> LexerReturn {
        let start = self.cur;
        let mut in_class = false;
        let mut diagnostic = None;

        unwind_loop! {
            match self.next() {
                Some(b'[') => in_class = true,
                Some(b']') => in_class = false,
                Some(b'/') => {
                    // Only a `/` outside a character class terminates the regex.
                    if !in_class {
                        // One bool per known flag so duplicates can be detected.
                        let (mut g, mut i, mut m, mut s, mut u, mut y) = (false, false, false, false, false, false);

                        unwind_loop! {
                            let next = self.next_bounded().copied();
                            let chr_start = self.cur;
                            match next {
                               Some(b'g') => {
                                   if g && diagnostic.is_none() {
                                        diagnostic = Some(self.flag_err('g'))
                                   }
                                   g = true;
                               },
                               Some(b'i') => {
                                    if i && diagnostic.is_none() {
                                        diagnostic = Some(self.flag_err('i'))
                                    }
                                    i = true;
                               },
                               Some(b'm') => {
                                    if m && diagnostic.is_none() {
                                        diagnostic = Some(self.flag_err('m'))
                                    }
                                    m = true;
                               },
                               Some(b's') => {
                                    if s && diagnostic.is_none() {
                                        diagnostic = Some(self.flag_err('s'))
                                    }
                                    s = true;
                                },
                                Some(b'u') => {
                                    if u && diagnostic.is_none() {
                                        diagnostic = Some(self.flag_err('u'))
                                    }
                                    u = true;
                               },
                               Some(b'y') => {
                                    if y && diagnostic.is_none() {
                                        diagnostic = Some(self.flag_err('y'))
                                    }
                                    y = true;
                                },
                                // Any other identifier char after the flags is invalid.
                                Some(_) if self.cur_ident_part().is_some() => {
                                    if diagnostic.is_none() {
                                        diagnostic = Some(Diagnostic::error(self.file_id, "", "invalid regex flag")
                                            .primary(chr_start .. self.cur + 1, "this is not a valid regex flag"));
                                    }
                                },
                                _ => {
                                    return (Token::new(SyntaxKind::REGEX, self.cur - start), diagnostic)
                                }
                            }
                        }
                    }
                },
                Some(b'\\') => {
                    // An escape must be followed by something before EOF.
                    if self.next_bounded().is_none() {
                        let err = Diagnostic::error(self.file_id, "", "expected a character after a regex escape, but found none")
                            .primary(self.cur..self.cur + 1, "expected a character following this");

                        return (Token::new(SyntaxKind::REGEX, self.cur - start), Some(err));
                    }
                },
                None => {
                    // EOF before the closing `/`.
                    let err = Diagnostic::error(self.file_id, "", "unterminated regex literal")
                        .primary(self.cur..self.cur, "...but the file ends here")
                        .secondary(start..start + 1, "a regex literal starts there...");

                    return (Token::new(SyntaxKind::REGEX, self.cur - start), Some(err));
                },
                _ => {},
            }
        }
    }
1023
1024    #[inline]
1025    fn bin_or_assign(&mut self, bin: SyntaxKind, assign: SyntaxKind) -> LexerReturn {
1026        if let Some(b'=') = self.next() {
1027            self.next();
1028            (Token::new(assign, 2), None)
1029        } else {
1030            (Token::new(bin, 1), None)
1031        }
1032    }
1033
1034    #[inline]
1035    fn resolve_bang(&mut self) -> LexerReturn {
1036        match self.next() {
1037            Some(b'=') => {
1038                if let Some(b'=') = self.next() {
1039                    self.next();
1040                    tok!(NEQ2, 3)
1041                } else {
1042                    tok!(NEQ, 2)
1043                }
1044            }
1045            _ => tok!(!),
1046        }
1047    }
1048
1049    #[inline]
1050    fn resolve_amp(&mut self) -> LexerReturn {
1051        match self.next() {
1052            Some(b'&') => {
1053                if let Some(b'=') = self.next() {
1054                    self.next();
1055                    tok!(AMP2EQ, 3)
1056                } else {
1057                    tok!(AMP2, 2)
1058                }
1059            }
1060            Some(b'=') => {
1061                self.next();
1062                tok!(AMPEQ, 2)
1063            }
1064            _ => tok!(&),
1065        }
1066    }
1067
1068    #[inline]
1069    fn resolve_plus(&mut self) -> LexerReturn {
1070        match self.next() {
1071            Some(b'+') => {
1072                self.next();
1073                tok!(PLUS2, 2)
1074            }
1075            Some(b'=') => {
1076                self.next();
1077                tok!(PLUSEQ, 2)
1078            }
1079            _ => tok!(+),
1080        }
1081    }
1082
1083    #[inline]
1084    fn resolve_minus(&mut self) -> LexerReturn {
1085        match self.next() {
1086            Some(b'-') => {
1087                self.next();
1088                tok!(MINUS2, 2)
1089            }
1090            Some(b'=') => {
1091                self.next();
1092                tok!(MINUSEQ, 2)
1093            }
1094            _ => tok!(-),
1095        }
1096    }
1097
1098    #[inline]
1099    fn resolve_less_than(&mut self) -> LexerReturn {
1100        match self.next() {
1101            Some(b'<') => {
1102                if let Some(b'=') = self.next() {
1103                    self.next();
1104                    tok!(SHLEQ, 3)
1105                } else {
1106                    tok!(SHL, 2)
1107                }
1108            }
1109            Some(b'=') => {
1110                self.next();
1111                tok!(LTEQ, 2)
1112            }
1113            _ => tok!(<),
1114        }
1115    }
1116
    /// Lex the tokens starting with `>`.
    ///
    /// Note (see the module docs): `>>` and `>>>` are deliberately NOT
    /// emitted as single shift tokens — each `>` is emitted on its own so
    /// the parser can handle TypeScript generics such as `T<U<N>>`. Only
    /// the compound assignments `>>=` and `>>>=` are emitted whole.
    #[inline]
    fn resolve_greater_than(&mut self) -> LexerReturn {
        match self.next() {
            Some(b'>') => {
                if let Some(b'>') = self.bytes.get(self.cur + 1).copied() {
                    if let Some(b'=') = self.bytes.get(self.cur + 2).copied() {
                        self.advance(3);
                        tok!(USHREQ, 4)
                    } else {
                        // `>>>` without `=`: emit a lone `>`; the remaining
                        // `>`s are re-lexed on subsequent calls.
                        tok!(>)
                    }
                } else if self.bytes.get(self.cur + 1).copied() == Some(b'=') {
                    self.advance(2);
                    tok!(SHREQ, 3)
                } else {
                    // Plain `>>`: emit only the first `>`.
                    tok!(>)
                }
            }
            Some(b'=') => {
                self.next();
                tok!(GTEQ, 2)
            }
            _ => tok!(>),
        }
    }
1142
1143    #[inline]
1144    fn resolve_eq(&mut self) -> LexerReturn {
1145        match self.next() {
1146            Some(b'=') => {
1147                if let Some(b'=') = self.next() {
1148                    self.next();
1149                    tok!(EQ3, 3)
1150                } else {
1151                    tok!(EQ2, 2)
1152                }
1153            }
1154            Some(b'>') => {
1155                self.next();
1156                tok!(FAT_ARROW, 2)
1157            }
1158            _ => tok!(=),
1159        }
1160    }
1161
1162    #[inline]
1163    fn resolve_pipe(&mut self) -> LexerReturn {
1164        match self.next() {
1165            Some(b'|') => {
1166                if let Some(b'=') = self.next() {
1167                    self.next();
1168                    tok!(PIPE2EQ, 3)
1169                } else {
1170                    tok!(PIPE2, 2)
1171                }
1172            }
1173            Some(b'=') => {
1174                self.next();
1175                tok!(PIPEEQ, 2)
1176            }
1177            _ => tok!(|),
1178        }
1179    }
1180
1181    // Dont ask it to resolve the question of life's meaning because you'll be dissapointed
1182    #[inline]
1183    fn resolve_question(&mut self) -> LexerReturn {
1184        match self.next() {
1185            Some(b'?') => {
1186                if let Some(b'=') = self.next() {
1187                    self.next();
1188                    tok!(QUESTION2EQ, 3)
1189                } else {
1190                    tok!(QUESTION2, 2)
1191                }
1192            }
1193            Some(b'.') => {
1194                // 11.7 Optional chaining punctuator
1195                if let Some(b'0'..=b'9') = self.bytes.get(self.cur + 1) {
1196                    tok!(?)
1197                } else {
1198                    self.next();
1199                    tok!(QUESTIONDOT, 2)
1200                }
1201            }
1202            _ => tok!(?),
1203        }
1204    }
1205
1206    #[inline]
1207    fn resolve_star(&mut self) -> LexerReturn {
1208        match self.next() {
1209            Some(b'*') => {
1210                if let Some(b'=') = self.next() {
1211                    self.next();
1212                    tok!(STAR2EQ, 3)
1213                } else {
1214                    tok!(STAR2, 2)
1215                }
1216            }
1217            Some(b'=') => {
1218                self.next();
1219                tok!(STAREQ, 2)
1220            }
1221            _ => tok!(*),
1222        }
1223    }
1224
    /// Lex the next token
    ///
    /// Reads the byte under the cursor, maps it to a [`Dispatch`] handler
    /// via the 256-entry lookup table, and delegates to the matching
    /// lexing routine.
    fn lex_token(&mut self) -> LexerReturn {
        // Safety: we always call lex_token when we are at a valid char
        let byte = unsafe { *self.bytes.get_unchecked(self.cur) };
        let start = self.cur;

        // A lookup table of `byte -> fn(l: &mut Lexer) -> Token` is exponentially slower than this approach
        // The speed difference comes from the difference in table size, a 2kb table is easily fit into cpu cache
        // While a 16kb table will be ejected from cache very often leading to slowdowns, this also allows LLVM
        // to do more aggressive optimizations on the match regarding how to map it to instructions
        let dispatched = Self::lookup(byte);

        match dispatched {
            WHS => {
                self.consume_whitespace();
                tok!(WHITESPACE, self.cur - start)
            }
            EXL => self.resolve_bang(),
            HAS => self.read_shebang(),
            PRC => self.bin_or_assign(T![%], T![%=]),
            AMP => self.resolve_amp(),
            PNO => self.eat(tok!(L_PAREN, 1)),
            PNC => self.eat(tok!(R_PAREN, 1)),
            MUL => self.resolve_star(),
            PLS => self.resolve_plus(),
            COM => self.eat(tok![,]),
            MIN => self.resolve_minus(),
            SLH => self.read_slash(),
            // This simply changes state on the start
            TPL => self.eat(tok!(BACKTICK, 1)),
            ZER => {
                // A leading `0` may begin a prefixed literal; `read_zero`
                // handles that, then the number's ending is validated.
                let diag = self.read_zero();
                let (token, err) = self.verify_number_end(start);
                (token, err.or(diag))
            }
            PRD => {
                // `...` punctuator
                if let Some(b"..") = self.bytes.get(self.cur + 1..self.cur + 3) {
                    self.cur += 3;
                    return tok!(DOT2, 3);
                }
                // `.5`-style float literal
                if let Some(b'0'..=b'9') = self.bytes.get(self.cur + 1) {
                    let diag = self.read_float();
                    let (token, err) = self.verify_number_end(start);
                    (token, err.or(diag))
                } else {
                    self.eat(tok![.])
                }
            }
            BSL => {
                // A backslash is only valid here as a `\u` escape that
                // starts an identifier (e.g. `\u0041bc`).
                if self.bytes.get(self.cur + 1) == Some(&b'u') {
                    self.next();
                    let res = if self.bytes.get(self.cur + 1).copied() == Some(b'{') {
                        self.next();
                        self.read_codepoint_escape()
                    } else {
                        self.read_unicode_escape(true)
                    };

                    match res {
                        Ok(chr) => {
                            if is_id_start(chr) {
                                self.resolve_identifier((chr, start))
                            } else {
                                let err = Diagnostic::error(self.file_id, "", "unexpected unicode escape")
                                    .primary(start..self.cur, "this escape is unexpected, as it does not designate the start of an identifier");

                                self.next();
                                (
                                    Token::new(SyntaxKind::ERROR_TOKEN, self.cur - start),
                                    Some(err),
                                )
                            }
                        }
                        Err(err) => (
                            Token::new(SyntaxKind::ERROR_TOKEN, self.cur - start),
                            Some(err),
                        ),
                    }
                } else {
                    let err = Diagnostic::error(
                        self.file_id,
                        "",
                        format!("unexpected token `{}`", byte as char),
                    )
                    .primary(start..self.cur + 1, "");
                    self.next();

                    (Token::new(SyntaxKind::ERROR_TOKEN, 1), Some(err))
                }
            }
            QOT => {
                if let Some(err) = self.read_str_literal() {
                    (
                        Token::new(SyntaxKind::ERROR_TOKEN, self.cur - start),
                        Some(err),
                    )
                } else {
                    tok!(STRING, self.cur - start)
                }
            }
            IDT => self.resolve_identifier((byte as char, start)),
            DIG => {
                let diag = self.read_number();
                let (token, err) = self.verify_number_end(start);
                (token, err.or(diag))
            }
            COL => self.eat(tok![:]),
            SEM => self.eat(tok![;]),
            LSS => self.resolve_less_than(),
            EQL => self.resolve_eq(),
            MOR => self.resolve_greater_than(),
            QST => self.resolve_question(),
            BTO => self.eat(tok!(L_BRACK, 1)),
            BTC => self.eat(tok![R_BRACK, 1]),
            CRT => self.bin_or_assign(T![^], T![^=]),
            BEO => self.eat(tok![L_CURLY, 1]),
            BEC => self.eat(tok![R_CURLY, 1]),
            PIP => self.resolve_pipe(),
            TLD => self.eat(tok![~]),
            UNI => {
                // Non-ASCII lead byte: decode the full char, then classify it
                // as unicode whitespace/linebreak, identifier start, or error.
                let chr = self.get_unicode_char();
                if UNICODE_WHITESPACE_STARTS.contains(&byte)
                    && (is_linebreak(chr) || UNICODE_SPACES.contains(&chr))
                {
                    if is_linebreak(chr) {
                        self.state.had_linebreak = true;
                    }

                    // Skip the remaining bytes of the multibyte char.
                    self.cur += chr.len_utf8() - 1;
                    self.consume_whitespace();
                    tok!(WHITESPACE, self.cur - start)
                } else {
                    self.cur += chr.len_utf8() - 1;
                    if is_id_start(chr) {
                        self.resolve_identifier((chr, start))
                    } else {
                        // NOTE(review): message casing ("Unexpected") differs
                        // from the other error arms, and `chr as char` is a
                        // no-op — confirm whether this should be normalized.
                        let err = Diagnostic::error(
                            self.file_id,
                            "",
                            format!("Unexpected token `{}`", chr as char),
                        )
                        .primary(start..self.cur + 1, "");
                        self.next();

                        (
                            Token::new(SyntaxKind::ERROR_TOKEN, self.cur - start),
                            Some(err),
                        )
                    }
                }
            }
            AT_ => self.eat(tok![@]),
            _ => {
                // ERR dispatch: invalid byte for the start of any token.
                let err = Diagnostic::error(
                    self.file_id,
                    "",
                    format!("unexpected token `{}`", byte as char),
                )
                .primary(start..self.cur + 1, "");
                self.next();

                (Token::new(SyntaxKind::ERROR_TOKEN, 1), Some(err))
            }
        }
    }
1390
    /// Lex one piece of a template literal: a backtick token, a `${`
    /// interpolation opener (`DOLLARCURLY`), or a `TEMPLATE_CHUNK` of raw
    /// template text.
    ///
    /// Escape sequences inside a chunk are validated; an unterminated
    /// template still yields a chunk token, with an error attached.
    fn lex_template(&mut self) -> LexerReturn {
        let start = self.cur;
        let mut diagnostic = None;

        while let Some(b) = self.bytes.get(self.cur) {
            match *b as char {
                // A backtick at the very start is its own token.
                '`' if self.cur == start => {
                    self.next();
                    return tok!(BACKTICK, 1);
                }
                // A later backtick ends the current chunk; it will be
                // re-lexed as BACKTICK on the next call.
                '`' => {
                    return (
                        Token::new(SyntaxKind::TEMPLATE_CHUNK, self.cur - start),
                        diagnostic,
                    );
                }
                '\\' => {
                    // Keep lexing after a bad escape; the most recent escape
                    // error wins since `diagnostic` is overwritten.
                    if let Some(err) = self.validate_escape_sequence() {
                        diagnostic = Some(err);
                    }
                    self.next_bounded();
                }
                // `${` at the start of the chunk is the interpolation opener itself.
                '$' if self.bytes.get(self.cur + 1) == Some(&b'{') && self.cur == start => {
                    self.advance(2);
                    return (Token::new(SyntaxKind::DOLLARCURLY, 2), diagnostic);
                }
                // `${` later on ends the text chunk; the opener is re-lexed next call.
                '$' if self.bytes.get(self.cur + 1) == Some(&b'{') => {
                    return (
                        Token::new(SyntaxKind::TEMPLATE_CHUNK, self.cur - start),
                        diagnostic,
                    )
                }
                _ => {
                    let _ = self.next();
                }
            }
        }

        // Ran out of input before a closing backtick.
        let err = Diagnostic::error(self.file_id, "", "unterminated template literal")
            .primary(self.cur..self.cur + 1, "");

        (
            Token::new(SyntaxKind::TEMPLATE_CHUNK, self.cur - start),
            Some(err),
        )
    }
1437}
1438
/// Check if a char is a JS linebreak
pub fn is_linebreak(chr: char) -> bool {
    matches!(chr, '\n' | '\r' | '\u{2028}' | '\u{2029}')
}
1443
1444impl Iterator for Lexer<'_> {
1445    type Item = LexerReturn;
1446
1447    fn next(&mut self) -> Option<Self::Item> {
1448        if self.cur >= self.bytes.len() {
1449            if !self.returned_eof {
1450                self.returned_eof = true;
1451                return Some(tok!(EOF, 0));
1452            }
1453            return None;
1454        }
1455
1456        let token = if self.state.is_in_template() {
1457            self.lex_template()
1458        } else {
1459            self.lex_token()
1460        };
1461
1462        if ![
1463            SyntaxKind::COMMENT,
1464            SyntaxKind::WHITESPACE,
1465            SyntaxKind::TEMPLATE_CHUNK,
1466        ]
1467        .contains(&token.0.kind)
1468        {
1469            self.state.update(token.0.kind);
1470        }
1471        Some(token)
1472    }
1473}
1474
// Every handler a byte coming in could be mapped to
// Variant meanings below are taken from the `DISPATCHER` table and the
// `lex_token` match that consumes them.
#[allow(non_camel_case_types, clippy::upper_case_acronyms)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
#[repr(u8)]
enum Dispatch {
    ERR, // invalid byte (ASCII control chars, DEL)
    WHS, // ASCII whitespace (tab, LF, VT, FF, CR, space)
    EXL, // `!`
    QOT, // `"` or `'` — string literal quote
    IDT, // identifier start (`A`-`Z`, `a`-`z`, `$`, `_`)
    HAS, // `#`
    PRC, // `%`
    AMP, // `&`
    PNO, // `(`
    PNC, // `)`
    MUL, // `*`
    PLS, // `+`
    COM, // `,`
    MIN, // `-`
    PRD, // `.`
    SLH, // `/`
    ZER, // `0`
    DIG, // `1`-`9`
    COL, // `:`
    SEM, // `;`
    LSS, // `<`
    EQL, // `=`
    MOR, // `>`
    QST, // `?`
    AT_, // `@`
    BTO, // `[`
    BSL, // `\`
    BTC, // `]`
    CRT, // `^`
    TPL, // `` ` `` — template literal backtick
    BEO, // `{`
    PIP, // `|`
    BEC, // `}`
    TLD, // `~`
    UNI, // non-ASCII byte (>= 0x80), decoded as a full unicode char
}
1516use Dispatch::*;
1517
// A lookup table mapping any incoming byte to a handler function
// This is taken from the ratel project lexer and modified
// Indexed directly by the raw input byte; rows 8-F (>= 0x80) cover every
// UTF-8 lead/continuation byte and all map to UNI.
// FIXME: Should we ignore the first ascii control chars which are nearly never seen instead of returning Err?
static DISPATCHER: [Dispatch; 256] = [
    //   0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F   //
    ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, WHS, WHS, WHS, WHS, WHS, ERR, ERR, // 0
    ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, // 1
    WHS, EXL, QOT, HAS, IDT, PRC, AMP, QOT, PNO, PNC, MUL, PLS, COM, MIN, PRD, SLH, // 2
    ZER, DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, COL, SEM, LSS, EQL, MOR, QST, // 3
    AT_, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, // 4
    IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, BTO, BSL, BTC, CRT, IDT, // 5
    TPL, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, // 6
    IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, BEO, PIP, BEC, TLD, ERR, // 7
    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // 8
    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // 9
    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // A
    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // B
    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // C
    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // D
    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // E
    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // F
];