mod comment;
mod cursor;
pub mod error;
mod identifier;
mod number;
mod operator;
mod private_identifier;
pub mod regex;
mod spread;
mod string;
mod template;
pub mod token;
#[cfg(test)]
mod tests;
use self::{
comment::{HashbangComment, MultiLineComment, SingleLineComment},
cursor::Cursor,
identifier::Identifier,
number::NumberLiteral,
operator::Operator,
private_identifier::PrivateIdentifier,
regex::RegexLiteral,
spread::SpreadLiteral,
string::StringLiteral,
template::TemplateLiteral,
};
use crate::syntax::ast::{Position, Punctuator, Span};
use boa_interner::Interner;
use boa_profiler::Profiler;
use std::io::Read;
pub use self::{
error::Error,
token::{Token, TokenKind},
};
/// Common interface for the per-token lexers in this module.
///
/// The lexer dispatches to implementors of this trait (comments, string and
/// template literals, numbers, identifiers, operators, ...) once it has
/// looked at the first character of a token.
trait Tokenizer<R> {
/// Lexes a single token from `cursor`.
///
/// * `cursor` - the input cursor to read from.
/// * `start_pos` - the position to use as the start of the token's span;
///   callers in this file pass the position of the token's first character,
///   which they have already consumed.
/// * `interner` - the interner handed through from the caller.
///
/// Returns the lexed token, or an [`Error`] if one could not be produced.
fn lex(
&mut self,
cursor: &mut Cursor<R>,
start_pos: Position,
interner: &mut Interner,
) -> Result<Token, Error>
where
R: Read;
}
/// Lexer that produces [`Token`]s from a reader of type `R`.
///
/// Tokens are pulled one at a time with `Lexer::next`.
#[derive(Debug)]
pub struct Lexer<R> {
// Cursor over the input; also carries the strict-mode flag.
cursor: Cursor<R>,
// Current goal symbol; decides how a `/` is lexed (division vs. regex).
goal_symbol: InputElement,
}
impl<R> Lexer<R> {
/// Returns `true` if the code point `ch` belongs to the whitespace set
/// recognised by this lexer.
///
/// The set is: tab (U+0009), vertical tab (U+000B), form feed (U+000C),
/// space (U+0020), no-break space (U+00A0), U+FEFF, U+1680, the contiguous
/// range U+2000..=U+200A, U+202F, U+205F and U+3000.
///
/// Line terminators (`\n`, `\r`, U+2028, U+2029) are deliberately *not* in
/// the set: the lexer reports them as tokens of their own.
fn is_whitespace(ch: u32) -> bool {
    // Every whitespace code point that is not part of the U+2000 block.
    const SINGLE_CODE_POINTS: [u32; 10] = [
        0x0009, 0x000B, 0x000C, 0x0020, 0x00A0, 0x1680, 0x202F, 0x205F, 0x3000, 0xFEFF,
    ];

    (0x2000..=0x200A).contains(&ch) || SINGLE_CODE_POINTS.contains(&ch)
}
/// Sets the goal symbol of the lexer.
///
/// The goal symbol is consulted when a `/` is lexed to decide between a
/// division punctuator and a regular expression literal.
#[inline]
pub(crate) fn set_goal(&mut self, elm: InputElement) {
self.goal_symbol = elm;
}
/// Returns the goal symbol the lexer is currently using.
#[inline]
pub(crate) fn get_goal(&self) -> InputElement {
self.goal_symbol
}
/// Returns the strict-mode flag, as reported by the underlying cursor.
#[inline]
pub(super) fn strict_mode(&self) -> bool {
self.cursor.strict_mode()
}
/// Sets the strict-mode flag by forwarding it to the underlying cursor.
#[inline]
pub(super) fn set_strict_mode(&mut self, strict_mode: bool) {
self.cursor.set_strict_mode(strict_mode);
}
/// Creates a new lexer that reads its input from `reader`.
///
/// The lexer starts out with the default goal symbol
/// (`InputElement::default()`).
#[inline]
pub fn new(reader: R) -> Self
where
    R: Read,
{
    let cursor = Cursor::new(reader);
    let goal_symbol = InputElement::default();

    Self {
        cursor,
        goal_symbol,
    }
}
pub(crate) fn lex_slash_token(
&mut self,
start: Position,
interner: &mut Interner,
) -> Result<Token, Error>
where
R: Read,
{
let _timer = Profiler::global().start_event("lex_slash_token", "Lexing");
if let Some(c) = self.cursor.peek()? {
match c {
b'/' => {
self.cursor.next_byte()?.expect("/ token vanished"); SingleLineComment.lex(&mut self.cursor, start, interner)
}
b'*' => {
self.cursor.next_byte()?.expect("* token vanished"); MultiLineComment.lex(&mut self.cursor, start, interner)
}
ch => {
match self.get_goal() {
InputElement::Div | InputElement::TemplateTail => {
if ch == b'=' {
self.cursor.next_byte()?.expect("= token vanished"); Ok(Token::new(
Punctuator::AssignDiv.into(),
Span::new(start, self.cursor.pos()),
))
} else {
Ok(Token::new(
Punctuator::Div.into(),
Span::new(start, self.cursor.pos()),
))
}
}
InputElement::RegExp => {
RegexLiteral.lex(&mut self.cursor, start, interner)
}
}
}
}
} else {
Err(Error::syntax(
"Abrupt end: Expecting Token /,*,= or regex",
start,
))
}
}
#[allow(clippy::should_implement_trait)]
pub fn next(&mut self, interner: &mut Interner) -> Result<Option<Token>, Error>
where
R: Read,
{
let _timer = Profiler::global().start_event("next()", "Lexing");
let (start, next_ch) = loop {
let start = self.cursor.pos();
if let Some(next_ch) = self.cursor.next_char()? {
if !Self::is_whitespace(next_ch) {
break (start, next_ch);
}
} else {
return Ok(None);
}
};
if start.column_number() == 1 && start.line_number() == 1 && next_ch == 0x23 {
if let Some(hashbang_peek) = self.cursor.peek()? {
if hashbang_peek == 0x21 {
let _token = HashbangComment.lex(&mut self.cursor, start, interner);
return self.next(interner);
}
}
};
if let Ok(c) = char::try_from(next_ch) {
let token = match c {
'\r' | '\n' | '\u{2028}' | '\u{2029}' => Ok(Token::new(
TokenKind::LineTerminator,
Span::new(start, self.cursor.pos()),
)),
'"' | '\'' => StringLiteral::new(c).lex(&mut self.cursor, start, interner),
'`' => TemplateLiteral.lex(&mut self.cursor, start, interner),
';' => Ok(Token::new(
Punctuator::Semicolon.into(),
Span::new(start, self.cursor.pos()),
)),
':' => Ok(Token::new(
Punctuator::Colon.into(),
Span::new(start, self.cursor.pos()),
)),
'.' => {
if self.cursor.peek()?.map(|c| (b'0'..=b'9').contains(&c)) == Some(true) {
NumberLiteral::new(next_ch as u8).lex(&mut self.cursor, start, interner)
} else {
SpreadLiteral::new().lex(&mut self.cursor, start, interner)
}
}
'(' => Ok(Token::new(
Punctuator::OpenParen.into(),
Span::new(start, self.cursor.pos()),
)),
')' => Ok(Token::new(
Punctuator::CloseParen.into(),
Span::new(start, self.cursor.pos()),
)),
',' => Ok(Token::new(
Punctuator::Comma.into(),
Span::new(start, self.cursor.pos()),
)),
'{' => Ok(Token::new(
Punctuator::OpenBlock.into(),
Span::new(start, self.cursor.pos()),
)),
'}' => Ok(Token::new(
Punctuator::CloseBlock.into(),
Span::new(start, self.cursor.pos()),
)),
'[' => Ok(Token::new(
Punctuator::OpenBracket.into(),
Span::new(start, self.cursor.pos()),
)),
']' => Ok(Token::new(
Punctuator::CloseBracket.into(),
Span::new(start, self.cursor.pos()),
)),
'#' => PrivateIdentifier::new().lex(&mut self.cursor, start, interner),
'/' => self.lex_slash_token(start, interner),
'=' | '*' | '+' | '-' | '%' | '|' | '&' | '^' | '<' | '>' | '!' | '~' | '?' => {
Operator::new(next_ch as u8).lex(&mut self.cursor, start, interner)
}
'\\' if self.cursor.peek()? == Some(b'u') => {
Identifier::new(c).lex(&mut self.cursor, start, interner)
}
_ if Identifier::is_identifier_start(c as u32) => {
Identifier::new(c).lex(&mut self.cursor, start, interner)
}
_ if c.is_ascii_digit() => {
NumberLiteral::new(next_ch as u8).lex(&mut self.cursor, start, interner)
}
_ => {
let details = format!(
"unexpected '{c}' at line {}, column {}",
start.line_number(),
start.column_number()
);
Err(Error::syntax(details, start))
}
}?;
if token.kind() == &TokenKind::Comment {
self.next(interner)
} else {
Ok(Some(token))
}
} else {
Err(Error::syntax(
format!(
"unexpected utf-8 char '\\u{next_ch}' at line {}, column {}",
start.line_number(),
start.column_number()
),
start,
))
}
}
/// Lexes a template literal from the current cursor position.
///
/// This delegates directly to `TemplateLiteral`, using `start` as the start
/// position of the resulting token. Unlike `next`, it does not skip
/// whitespace or inspect the current character first.
///
/// # Errors
///
/// Propagates any error returned by the template lexer.
pub(crate) fn lex_template(
&mut self,
start: Position,
interner: &mut Interner,
) -> Result<Token, Error>
where
R: Read,
{
TemplateLiteral.lex(&mut self.cursor, start, interner)
}
}
/// Goal symbol of the lexer.
///
/// The goal symbol resolves the ambiguity of a `/` that is not part of a
/// comment: it is either a division punctuator or the start of a regular
/// expression literal.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum InputElement {
// `/` is lexed as the `/` or `/=` punctuator.
Div,
// `/` starts a regular expression literal.
RegExp,
// Handled exactly like `Div` when lexing a `/`.
// NOTE(review): presumably set while inside a template substitution; confirm against the parser.
TemplateTail,
}
/// The default goal symbol is `RegExp`; it is what a lexer created with
/// `Lexer::new` starts with.
impl Default for InputElement {
fn default() -> Self {
Self::RegExp
}
}