boa/syntax/lexer/
mod.rs

1//! A lexical analyzer for JavaScript source code.
2//!
3//! This module contains the Boa lexer or tokenizer implementation.
4//!
5//! The Lexer splits its input source code into a sequence of input elements called tokens,
6//! represented by the [Token](../ast/token/struct.Token.html) structure. It also removes
7//! whitespace and comments and attaches them to the next token.
8//!
9//! This is tightly coupled with the parser due to the JavaScript goal-symbol requirements
10//! as documented by the spec.
11//!
12//! More information:
13//!  - [ECMAScript reference][spec]
14//!
15//! [spec]: https://tc39.es/ecma262/#sec-ecmascript-language-lexical-grammar
16
17mod comment;
18mod cursor;
19pub mod error;
20mod identifier;
21mod number;
22mod operator;
23mod regex;
24mod spread;
25mod string;
26mod template;
27pub mod token;
28
29#[cfg(test)]
30mod tests;
31
32use self::{
33    comment::{MultiLineComment, SingleLineComment},
34    cursor::Cursor,
35    identifier::Identifier,
36    number::NumberLiteral,
37    operator::Operator,
38    regex::RegexLiteral,
39    spread::SpreadLiteral,
40    string::StringLiteral,
41    template::TemplateLiteral,
42};
43use crate::syntax::ast::{Punctuator, Span};
44pub use crate::{profiler::BoaProfiler, syntax::ast::Position};
45use core::convert::TryFrom;
46pub use error::Error;
47use std::io::Read;
48pub use token::{Token, TokenKind};
49
/// A sub-lexer that knows how to lex one category of token.
///
/// Implemented by the specialised lexers in this module (comments, strings,
/// numbers, …), which `Lexer::next` dispatches to after inspecting the first
/// character of the input element.
trait Tokenizer<R> {
    /// Lexes the next token, starting from `start_pos` with the given cursor.
    fn lex(&mut self, cursor: &mut Cursor<R>, start_pos: Position) -> Result<Token, Error>
    where
        R: Read;
}
56
/// Lexer or tokenizer for the Boa JavaScript Engine.
#[derive(Debug)]
pub struct Lexer<R> {
    // Cursor over the input; also tracks the current position and the
    // strict-mode flag (see `strict_mode`/`set_strict_mode`).
    cursor: Cursor<R>,
    // Current goal symbol, switched by the parser to disambiguate `/`
    // (division vs. start of a regex literal) — see `lex_slash_token`.
    goal_symbol: InputElement,
}
63
64impl<R> Lexer<R> {
65    /// Checks if a character is whitespace as per ECMAScript standards.
66    ///
67    /// The Rust `char::is_whitespace` function and the ECMAScript standard use different sets of
68    /// characters as whitespaces:
69    ///  * Rust uses `\p{White_Space}`,
70    ///  * ECMAScript standard uses `\{Space_Separator}` + `\u{0009}`, `\u{000B}`, `\u{000C}`, `\u{FEFF}`
71    ///
72    /// [More information](https://tc39.es/ecma262/#table-32)
73    fn is_whitespace(ch: u32) -> bool {
74        matches!(
75            ch,
76            0x0020 | 0x0009 | 0x000B | 0x000C | 0x00A0 | 0xFEFF |
77            // Unicode Space_Seperator category (minus \u{0020} and \u{00A0} which are allready stated above)
78            0x1680 | 0x2000..=0x200A | 0x202F | 0x205F | 0x3000
79        )
80    }
81
    /// Sets the goal symbol for the lexer.
    ///
    /// The parser switches this between the `InputElement` variants to tell
    /// the lexer how an upcoming `/` should be interpreted.
    #[inline]
    pub(crate) fn set_goal(&mut self, elm: InputElement) {
        self.goal_symbol = elm;
    }
87
    /// Gets the goal symbol the lexer is currently using.
    #[inline]
    pub(crate) fn get_goal(&self) -> InputElement {
        self.goal_symbol
    }
93
    /// Returns whether the lexer is in strict mode (stored on the cursor).
    #[inline]
    pub(super) fn strict_mode(&self) -> bool {
        self.cursor.strict_mode()
    }
98
    /// Sets the strict-mode flag (stored on the cursor).
    #[inline]
    pub(super) fn set_strict_mode(&mut self, strict_mode: bool) {
        self.cursor.set_strict_mode(strict_mode)
    }
103
104    /// Creates a new lexer.
105    #[inline]
106    pub fn new(reader: R) -> Self
107    where
108        R: Read,
109    {
110        Self {
111            cursor: Cursor::new(reader),
112            goal_symbol: Default::default(),
113        }
114    }
115
116    // Handles lexing of a token starting '/' with the '/' already being consumed.
117    // This could be a divide symbol or the start of a regex.
118    //
119    // A '/' symbol can always be a comment but if as tested above it is not then
120    // that means it could be multiple different tokens depending on the input token.
121    //
122    // As per https://tc39.es/ecma262/#sec-ecmascript-language-lexical-grammar
123    pub(crate) fn lex_slash_token(&mut self, start: Position) -> Result<Token, Error>
124    where
125        R: Read,
126    {
127        let _timer = BoaProfiler::global().start_event("lex_slash_token", "Lexing");
128
129        if let Some(c) = self.cursor.peek()? {
130            match c {
131                b'/' => {
132                    self.cursor.next_byte()?.expect("/ token vanished"); // Consume the '/'
133                    SingleLineComment.lex(&mut self.cursor, start)
134                }
135                b'*' => {
136                    self.cursor.next_byte()?.expect("* token vanished"); // Consume the '*'
137                    MultiLineComment.lex(&mut self.cursor, start)
138                }
139                ch => {
140                    match self.get_goal() {
141                        InputElement::Div | InputElement::TemplateTail => {
142                            // Only div punctuator allowed, regex not.
143
144                            if ch == b'=' {
145                                // Indicates this is an AssignDiv.
146                                self.cursor.next_byte()?.expect("= token vanished"); // Consume the '='
147                                Ok(Token::new(
148                                    Punctuator::AssignDiv.into(),
149                                    Span::new(start, self.cursor.pos()),
150                                ))
151                            } else {
152                                Ok(Token::new(
153                                    Punctuator::Div.into(),
154                                    Span::new(start, self.cursor.pos()),
155                                ))
156                            }
157                        }
158                        InputElement::RegExp => {
159                            // Can be a regular expression.
160                            RegexLiteral.lex(&mut self.cursor, start)
161                        }
162                    }
163                }
164            }
165        } else {
166            Err(Error::syntax(
167                "Abrupt end: Expecting Token /,*,= or regex",
168                start,
169            ))
170        }
171    }
172
173    /// Retrieves the next token from the lexer.
174    // We intentionally don't implement Iterator trait as Result<Option> is cleaner to handle.
175    #[allow(clippy::should_implement_trait)]
176    pub fn next(&mut self) -> Result<Option<Token>, Error>
177    where
178        R: Read,
179    {
180        let _timer = BoaProfiler::global().start_event("next()", "Lexing");
181
182        let (start, next_ch) = loop {
183            let start = self.cursor.pos();
184            if let Some(next_ch) = self.cursor.next_char()? {
185                // Ignore whitespace
186                if !Self::is_whitespace(next_ch) {
187                    break (start, next_ch);
188                }
189            } else {
190                return Ok(None);
191            }
192        };
193
194        if let Ok(c) = char::try_from(next_ch) {
195            let token = match c {
196                '\r' | '\n' | '\u{2028}' | '\u{2029}' => Ok(Token::new(
197                    TokenKind::LineTerminator,
198                    Span::new(start, self.cursor.pos()),
199                )),
200                '"' | '\'' => StringLiteral::new(c).lex(&mut self.cursor, start),
201                '`' => TemplateLiteral.lex(&mut self.cursor, start),
202                ';' => Ok(Token::new(
203                    Punctuator::Semicolon.into(),
204                    Span::new(start, self.cursor.pos()),
205                )),
206                ':' => Ok(Token::new(
207                    Punctuator::Colon.into(),
208                    Span::new(start, self.cursor.pos()),
209                )),
210                '.' => {
211                    if self.cursor.peek()?.map(|c| (b'0'..=b'9').contains(&c)) == Some(true) {
212                        NumberLiteral::new(next_ch as u8).lex(&mut self.cursor, start)
213                    } else {
214                        SpreadLiteral::new().lex(&mut self.cursor, start)
215                    }
216                }
217                '(' => Ok(Token::new(
218                    Punctuator::OpenParen.into(),
219                    Span::new(start, self.cursor.pos()),
220                )),
221                ')' => Ok(Token::new(
222                    Punctuator::CloseParen.into(),
223                    Span::new(start, self.cursor.pos()),
224                )),
225                ',' => Ok(Token::new(
226                    Punctuator::Comma.into(),
227                    Span::new(start, self.cursor.pos()),
228                )),
229                '{' => Ok(Token::new(
230                    Punctuator::OpenBlock.into(),
231                    Span::new(start, self.cursor.pos()),
232                )),
233                '}' => Ok(Token::new(
234                    Punctuator::CloseBlock.into(),
235                    Span::new(start, self.cursor.pos()),
236                )),
237                '[' => Ok(Token::new(
238                    Punctuator::OpenBracket.into(),
239                    Span::new(start, self.cursor.pos()),
240                )),
241                ']' => Ok(Token::new(
242                    Punctuator::CloseBracket.into(),
243                    Span::new(start, self.cursor.pos()),
244                )),
245                '/' => self.lex_slash_token(start),
246                '=' | '*' | '+' | '-' | '%' | '|' | '&' | '^' | '<' | '>' | '!' | '~' | '?' => {
247                    Operator::new(next_ch as u8).lex(&mut self.cursor, start)
248                }
249                '\\' if self.cursor.peek()? == Some(b'u') => {
250                    Identifier::new(c).lex(&mut self.cursor, start)
251                }
252                _ if Identifier::is_identifier_start(c as u32) => {
253                    Identifier::new(c).lex(&mut self.cursor, start)
254                }
255                _ if c.is_digit(10) => {
256                    NumberLiteral::new(next_ch as u8).lex(&mut self.cursor, start)
257                }
258                _ => {
259                    let details = format!(
260                        "unexpected '{}' at line {}, column {}",
261                        c,
262                        start.line_number(),
263                        start.column_number()
264                    );
265                    Err(Error::syntax(details, start))
266                }
267            }?;
268
269            if token.kind() == &TokenKind::Comment {
270                // Skip comment
271                self.next()
272            } else {
273                Ok(Some(token))
274            }
275        } else {
276            Err(Error::syntax(
277                format!(
278                    "unexpected utf-8 char '\\u{}' at line {}, column {}",
279                    next_ch,
280                    start.line_number(),
281                    start.column_number()
282                ),
283                start,
284            ))
285        }
286    }
287
    /// Lexes a template literal starting at `start`.
    ///
    /// Exposed to the parser so it can re-enter template lexing; simply
    /// delegates to the `TemplateLiteral` tokenizer.
    pub(crate) fn lex_template(&mut self, start: Position) -> Result<Token, Error>
    where
        R: Read,
    {
        TemplateLiteral.lex(&mut self.cursor, start)
    }
294}
295
/// ECMAScript goal symbols.
///
/// <https://tc39.es/ecma262/#sec-ecmascript-language-lexical-grammar>
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum InputElement {
    /// A `/` lexes as a division punctuator (`/` or `/=`), never a regex.
    Div,
    /// A `/` may start a regular expression literal.
    RegExp,
    /// Treated the same as `Div` when lexing `/` (see `lex_slash_token`).
    TemplateTail,
}
305
306impl Default for InputElement {
307    fn default() -> Self {
308        InputElement::RegExp
309    }
310}