use super::{Cursor, Error, Tokenizer};
use crate::{
profiler::BoaProfiler,
syntax::{
ast::{Position, Span},
lexer::{Token, TokenKind},
},
};
use std::{
io::{self, ErrorKind, Read},
str,
};
/// Lexer state for a single string literal.
///
/// The only state kept is which quote character closes the literal; it is
/// derived from the opening character passed to `StringLiteral::new`.
#[derive(Debug, Clone, Copy)]
pub(super) struct StringLiteral {
/// Quote kind that ends this literal.
terminator: StringTerminator,
}
impl StringLiteral {
    /// Creates a string literal lexer from the quote character that opened the literal.
    ///
    /// # Panics
    ///
    /// Panics (via `unreachable!`) when `init` is neither `'` nor `"`; the caller is
    /// expected to dispatch here only after seeing one of those two characters.
    pub(super) fn new(init: char) -> Self {
        Self {
            terminator: match init {
                '"' => StringTerminator::DoubleQuote,
                '\'' => StringTerminator::SingleQuote,
                _ => unreachable!(),
            },
        }
    }
}
/// The quote character that terminates a string literal.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum StringTerminator {
/// The literal is closed by `'` (U+0027).
SingleQuote,
/// The literal is closed by `"` (U+0022).
DoubleQuote,
}
/// Operations on a growable buffer of UTF-16 code units.
pub(crate) trait UTF16CodeUnitsBuffer {
    /// Appends `code_point` to the buffer, encoded as one or two UTF-16 code units.
    fn push_code_point(&mut self, code_point: u32);

    /// Decodes the buffer into a `String`, substituting U+FFFD for invalid data
    /// such as unpaired surrogates.
    fn to_string_lossy(&self) -> String;
}

impl UTF16CodeUnitsBuffer for Vec<u16> {
    #[inline]
    fn push_code_point(&mut self, code_point: u32) {
        // First code point that no longer fits in a single code unit.
        const SUPPLEMENTARY_START: u32 = 0x1_0000;

        if code_point < SUPPLEMENTARY_START {
            self.push(code_point as u16);
            return;
        }

        // Split the offset into the supplementary range into its upper bits
        // (lead surrogate) and its lower 10 bits (trail surrogate).
        let offset = code_point - SUPPLEMENTARY_START;
        let lead = 0xD800 + (offset >> 10);
        let trail = 0xDC00 + (offset & 0x3FF);
        self.extend_from_slice(&[lead as u16, trail as u16]);
    }

    #[inline]
    fn to_string_lossy(&self) -> String {
        String::from_utf16_lossy(self)
    }
}
impl<R> Tokenizer<R> for StringLiteral {
    /// Lexes one string literal and wraps its decoded contents in a string-literal token.
    ///
    /// Reading continues from the cursor's current position until the terminator
    /// recorded in `self` is found. The strictness flag for escape handling is
    /// taken from the cursor.
    ///
    /// # Errors
    ///
    /// Propagates any error produced while reading the literal's characters.
    fn lex(&mut self, cursor: &mut Cursor<R>, start_pos: Position) -> Result<Token, Error>
    where
        R: Read,
    {
        let _timer = BoaProfiler::global().start_event("StringLiteral", "Lexing");

        let strict = cursor.strict_mode();
        Self::take_string_characters(cursor, start_pos, self.terminator, strict)
            .map(|(contents, span)| Token::new(TokenKind::string_literal(contents), span))
    }
}
impl StringLiteral {
/// Returns `true` when `ch` is one of the four line terminator code points:
/// line feed, carriage return, line separator or paragraph separator.
#[inline]
pub(super) fn is_line_terminator(ch: u32) -> bool {
    const LINE_FEED: u32 = 0x000A;
    const CARRIAGE_RETURN: u32 = 0x000D;
    const LINE_SEPARATOR: u32 = 0x2028;
    const PARAGRAPH_SEPARATOR: u32 = 0x2029;

    match ch {
        LINE_FEED | CARRIAGE_RETURN | LINE_SEPARATOR | PARAGRAPH_SEPARATOR => true,
        _ => false,
    }
}
#[inline]
fn take_string_characters<R>(
cursor: &mut Cursor<R>,
start_pos: Position,
terminator: StringTerminator,
is_strict_mode: bool,
) -> Result<(String, Span), Error>
where
R: Read,
{
let mut buf = Vec::new();
loop {
let ch_start_pos = cursor.pos();
let ch = cursor.next_char()?;
match ch {
Some(0x0027 ) if terminator == StringTerminator::SingleQuote => break,
Some(0x0022 ) if terminator == StringTerminator::DoubleQuote => break,
Some(0x005C ) => {
let _timer = BoaProfiler::global()
.start_event("StringLiteral - escape sequence", "Lexing");
if let Some(escape_value) = Self::take_escape_sequence_or_line_continuation(
cursor,
ch_start_pos,
is_strict_mode,
false,
)? {
buf.push_code_point(escape_value);
}
}
Some(0x2028) => buf.push(0x2028 ),
Some(0x2029) => buf.push(0x2029 ),
Some(ch) if !Self::is_line_terminator(ch) => {
buf.push_code_point(ch);
}
_ => {
return Err(Error::from(io::Error::new(
ErrorKind::UnexpectedEof,
"unterminated string literal",
)));
}
}
}
Ok((buf.to_string_lossy(), Span::new(start_pos, cursor.pos())))
}
#[inline]
pub(super) fn take_escape_sequence_or_line_continuation<R>(
cursor: &mut Cursor<R>,
start_pos: Position,
is_strict_mode: bool,
is_template_literal: bool,
) -> Result<Option<u32>, Error>
where
R: Read,
{
let escape_ch = cursor.next_char()?.ok_or_else(|| {
Error::from(io::Error::new(
ErrorKind::UnexpectedEof,
"unterminated escape sequence in literal",
))
})?;
let escape_value = match escape_ch {
0x0062 => Some(0x0008 ),
0x0074 => Some(0x0009 ),
0x006E => Some(0x000A ),
0x0076 => Some(0x000B ),
0x0066 => Some(0x000C ),
0x0072 => Some(0x000D ),
0x0022 => Some(0x0022 ),
0x0027 => Some(0x0027 ),
0x005C => Some(0x005C ),
0x0030 if cursor
.peek()?
.filter(|next_byte| (b'0'..=b'9').contains(next_byte))
.is_none() =>
Some(0x0000 ),
0x0078 => {
Some(Self::take_hex_escape_sequence(cursor, start_pos)?)
}
0x0075 => {
Some(Self::take_unicode_escape_sequence(cursor, start_pos)?)
}
0x0038 | 0x0039 => {
if is_template_literal {
return Err(Error::syntax(
"\\8 and \\9 are not allowed in template literal",
start_pos,
));
} else if is_strict_mode {
return Err(Error::syntax(
"\\8 and \\9 are not allowed in strict mode",
start_pos,
));
} else {
Some(escape_ch)
}
}
_ if (0x0030..=0x0037 ).contains(&escape_ch) => {
if is_template_literal {
return Err(Error::syntax(
"octal escape sequences are not allowed in template literal",
start_pos,
));
} else if is_strict_mode {
return Err(Error::syntax(
"octal escape sequences are not allowed in strict mode",
start_pos,
));
} else {
Some(Self::take_legacy_octal_escape_sequence(
cursor,
escape_ch as u8,
)?)
}
}
_ if Self::is_line_terminator(escape_ch) => {
None
}
_ => {
Some(escape_ch)
}
};
Ok(escape_value)
}
/// Lexes the remainder of a Unicode escape sequence; the leading `\u` is
/// expected to have been consumed already.
///
/// Two forms are recognised:
///
/// - `\u{H…}`: hexadecimal digits up to the closing `}`, with a value of at
///   most `0x10FFFF`.
/// - `\uHHHH`: exactly four hexadecimal digits.
///
/// `start_pos` is only used as the position attached to syntax errors.
///
/// # Errors
///
/// Returns a syntax error when the digits are missing, contain anything other
/// than ASCII hexadecimal digits, or (braced form) exceed `0x10FFFF`. Cursor
/// errors are propagated.
#[inline]
pub(super) fn take_unicode_escape_sequence<R>(
    cursor: &mut Cursor<R>,
    start_pos: Position,
) -> Result<u32, Error>
where
    R: Read,
{
    if cursor.next_is(b'{')? {
        // Braced form: collect the bytes between the braces.
        let mut code_point_buf = Vec::with_capacity(6);
        cursor.take_until(b'}', &mut code_point_buf)?;

        let code_point = str::from_utf8(code_point_buf.as_slice())
            .ok()
            // `from_str_radix` tolerates a leading `+`, which is not a hex digit
            // and must not be accepted inside an escape sequence.
            .filter(|digits| digits.bytes().all(|byte| byte.is_ascii_hexdigit()))
            .and_then(|digits| u32::from_str_radix(digits, 16).ok())
            .ok_or_else(|| {
                Error::syntax("malformed Unicode character escape sequence", start_pos)
            })?;

        if code_point > 0x10_FFFF {
            return Err(Error::syntax(
                "Unicode codepoint must not be greater than 0x10FFFF in escape sequence",
                start_pos,
            ));
        }

        Ok(code_point)
    } else {
        // Fixed-width form: exactly four hex digits naming one UTF-16 code unit.
        let mut code_point_utf8_bytes = [0u8; 4];
        cursor.fill_bytes(&mut code_point_utf8_bytes)?;

        let code_point = str::from_utf8(&code_point_utf8_bytes)
            .ok()
            // Reject a leading `+` (accepted by `from_str_radix`) and any other non-hex byte.
            .filter(|digits| digits.bytes().all(|byte| byte.is_ascii_hexdigit()))
            .and_then(|digits| u16::from_str_radix(digits, 16).ok())
            .ok_or_else(|| Error::syntax("invalid Unicode escape sequence", start_pos))?;

        Ok(u32::from(code_point))
    }
}
/// Lexes the two hexadecimal digits of a `\xHH` escape sequence; the leading
/// `\x` is expected to have been consumed already.
///
/// `start_pos` is only used as the position attached to the syntax error.
///
/// # Errors
///
/// Returns a syntax error when the next two bytes are not both ASCII
/// hexadecimal digits. Cursor errors are propagated.
#[inline]
fn take_hex_escape_sequence<R>(
    cursor: &mut Cursor<R>,
    start_pos: Position,
) -> Result<u32, Error>
where
    R: Read,
{
    let mut code_point_utf8_bytes = [0u8; 2];
    cursor.fill_bytes(&mut code_point_utf8_bytes)?;

    let code_point = str::from_utf8(&code_point_utf8_bytes)
        .ok()
        // `from_str_radix` tolerates a leading `+`, so `\x+4` would otherwise
        // be lexed as U+0004; insist on two real hex digits.
        .filter(|digits| digits.bytes().all(|byte| byte.is_ascii_hexdigit()))
        .and_then(|digits| u16::from_str_radix(digits, 16).ok())
        .ok_or_else(|| Error::syntax("invalid Hexadecimal escape sequence", start_pos))?;

    Ok(u32::from(code_point))
}
/// Lexes the remainder of a legacy octal escape sequence and returns its value.
///
/// `init_byte` is the first octal digit (already consumed by the caller) as an
/// ASCII byte. One more octal digit is consumed if present; a third digit is
/// consumed only when the first digit is in `'0'..='3'`, which keeps the value
/// at or below `0o377`.
///
/// # Errors
///
/// Propagates cursor errors.
#[inline]
fn take_legacy_octal_escape_sequence<R>(
    cursor: &mut Cursor<R>,
    init_byte: u8,
) -> Result<u32, Error>
where
    R: Read,
{
    let is_octal_digit = |byte: u8| matches!(byte, b'0'..=b'7');
    let mut value = u32::from(init_byte - b'0');

    // Optional second digit.
    let second = match cursor.peek()? {
        Some(byte) if is_octal_digit(byte) => byte,
        _ => return Ok(value),
    };
    let _ = cursor.next_byte()?;
    value = value * 8 + u32::from(second - b'0');

    // A third digit is only part of the escape when the first one is 0..=3.
    if !matches!(init_byte, b'0'..=b'3') {
        return Ok(value);
    }

    if let Some(third) = cursor.peek()?.filter(|&byte| is_octal_digit(byte)) {
        let _ = cursor.next_byte()?;
        value = value * 8 + u32::from(third - b'0');
    }

    Ok(value)
}
}