1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
//! This module implements lexing for string literals used in the JavaScript programming language.
use super::{Cursor, Error, Tokenizer};
use crate::{
profiler::BoaProfiler,
syntax::{
ast::{Position, Span},
lexer::{Token, TokenKind},
},
};
use core::convert::TryFrom;
use std::{
io::{self, ErrorKind, Read},
str,
};
/// Lexer for JavaScript string literals.
///
/// Note: expects the opening `'` or `"` to have already been consumed from the cursor.
///
/// More information:
/// - [ECMAScript reference][spec]
/// - [MDN documentation][mdn]
///
/// [spec]: https://tc39.es/ecma262/#sec-literals-string-literals
/// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String
#[derive(Debug, Clone, Copy)]
pub(super) struct StringLiteral {
/// What closes the literal being lexed; chosen from the opening quote in `new`.
terminator: StringTerminator,
}
impl StringLiteral {
    /// Builds a string literal lexer from the quote character that opened the literal.
    ///
    /// The literal is closed by the same kind of quote that opened it.
    ///
    /// # Panics
    ///
    /// Panics (via `unreachable!`) if `init` is neither `'` nor `"`.
    pub(super) fn new(init: char) -> Self {
        Self {
            terminator: match init {
                '"' => StringTerminator::DoubleQuote,
                '\'' => StringTerminator::SingleQuote,
                _ => unreachable!(),
            },
        }
    }
}
/// Terminator for the string.
///
/// Tells `unescape_string` what ends the text it is reading.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum StringTerminator {
/// The string ends at the next unescaped `'`.
SingleQuote,
/// The string ends at the next unescaped `"`.
DoubleQuote,
/// The string runs until the end of the input; running out of input is not an error.
End,
}
impl<R> Tokenizer<R> for StringLiteral {
    /// Lexes the remainder of a string literal into a string-literal token.
    ///
    /// The body is read with `unescape_string`, using this lexer's terminator and the
    /// cursor's strict-mode flag. Any error it reports is returned unchanged.
    fn lex(&mut self, cursor: &mut Cursor<R>, start_pos: Position) -> Result<Token, Error>
    where
        R: Read,
    {
        let _timer = BoaProfiler::global().start_event("StringLiteral", "Lexing");

        let strict = cursor.strict_mode();
        unescape_string(cursor, start_pos, self.terminator, strict)
            .map(|(text, span)| Token::new(TokenKind::string_literal(text), span))
    }
}
pub(super) fn unescape_string<R>(
cursor: &mut Cursor<R>,
start_pos: Position,
terminator: StringTerminator,
strict_mode: bool,
) -> Result<(String, Span), Error>
where
R: Read,
{
let mut buf = Vec::new();
loop {
let next_chr = cursor.next_char()?.map(char::try_from).transpose().unwrap();
match next_chr {
Some('\'') if terminator == StringTerminator::SingleQuote => {
break;
}
Some('"') if terminator == StringTerminator::DoubleQuote => {
break;
}
Some('\\') => {
let _timer =
BoaProfiler::global().start_event("StringLiteral - escape sequence", "Lexing");
let escape = cursor.peek()?.ok_or_else(|| {
Error::from(io::Error::new(
ErrorKind::UnexpectedEof,
"unterminated escape sequence in literal",
))
})?;
if escape <= 0x7f {
let _ = cursor.next_byte()?;
match escape {
b'\n' => (),
b'n' => buf.push('\n' as u16),
b'r' => buf.push('\r' as u16),
b't' => buf.push('\t' as u16),
b'b' => buf.push('\x08' as u16),
b'f' => buf.push('\x0c' as u16),
b'0' => buf.push('\0' as u16),
b'x' => {
let mut code_point_utf8_bytes = [0u8; 2];
cursor.fill_bytes(&mut code_point_utf8_bytes)?;
let code_point_str = str::from_utf8(&code_point_utf8_bytes)
.expect("malformed Hexadecimal character escape sequence");
let code_point =
u16::from_str_radix(&code_point_str, 16).map_err(|_| {
Error::syntax(
"invalid Hexadecimal escape sequence",
cursor.pos(),
)
})?;
buf.push(code_point);
}
b'u' => {
// Support \u{X..X} (Unicode Codepoint)
if cursor.next_is(b'{')? {
// TODO: use bytes for a bit better performance (using stack)
let mut code_point_buf = Vec::with_capacity(6);
cursor.take_until(b'}', &mut code_point_buf)?;
let code_point_str =
unsafe { str::from_utf8_unchecked(code_point_buf.as_slice()) };
// We know this is a single unicode codepoint, convert to u32
let code_point =
u32::from_str_radix(&code_point_str, 16).map_err(|_| {
Error::syntax(
"malformed Unicode character escape sequence",
cursor.pos(),
)
})?;
// UTF16Encoding of a numeric code point value
if code_point > 0x10_FFFF {
return Err(Error::syntax("Unicode codepoint must not be greater than 0x10FFFF in escape sequence", cursor.pos()));
} else if code_point <= 65535 {
buf.push(code_point as u16);
} else {
let cu1 = ((code_point - 65536) / 1024 + 0xD800) as u16;
let cu2 = ((code_point - 65536) % 1024 + 0xDC00) as u16;
buf.push(cu1);
buf.push(cu2);
}
} else {
// Collect each character after \u e.g \uD83D will give "D83D"
let mut code_point_utf8_bytes = [0u8; 4];
cursor.fill_bytes(&mut code_point_utf8_bytes)?;
// Convert to u16
let code_point_str = str::from_utf8(&code_point_utf8_bytes)
.expect("malformed Unicode character escape sequence");
let code_point =
u16::from_str_radix(code_point_str, 16).map_err(|_| {
Error::syntax(
"invalid Unicode escape sequence",
cursor.pos(),
)
})?;
buf.push(code_point);
}
}
n if char::is_digit(char::from(n), 8) => {
if strict_mode {
return Err(Error::syntax(
"octal escape sequences are deprecated",
cursor.pos(),
));
}
let mut o = char::from(n).to_digit(8).unwrap();
match cursor.peek()? {
Some(c) if char::is_digit(char::from(c), 8) => {
let _ = cursor.next_byte()?;
o = o * 8 + char::from(n).to_digit(8).unwrap();
if n <= b'3' {
match cursor.peek()? {
Some(c) if char::is_digit(char::from(c), 8) => {
let _ = cursor.next_byte();
o = o * 8 + char::from(n).to_digit(8).unwrap();
}
_ => (),
}
}
}
_ => (),
}
buf.push(o as u16);
}
_ => buf.push(escape as u16),
};
}
}
Some(next_ch) => {
if next_ch.len_utf16() == 1 {
buf.push(next_ch as u16);
} else {
let mut code_point_bytes_buf = [0u16; 2];
let code_point_bytes = next_ch.encode_utf16(&mut code_point_bytes_buf);
buf.extend(code_point_bytes.iter());
}
}
None if terminator != StringTerminator::End => {
return Err(Error::from(io::Error::new(
ErrorKind::UnexpectedEof,
"unterminated string literal",
)));
}
None => {
break;
}
}
}
Ok((
String::from_utf16_lossy(buf.as_slice()),
Span::new(start_pos, cursor.pos()),
))
}