1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
use std::cmp::max;
use oxc_allocator::StringBuilder;
use crate::{config::LexerConfig as Config, diagnostics};
use super::{
Kind, Lexer, LexerContext, Span, Token, cold_branch,
search::{SafeByteMatchTable, byte_search, safe_byte_match_table},
};
/// Convert `char` to UTF-8 bytes array.
const fn to_bytes<const N: usize>(ch: char) -> [u8; N] {
assert!(ch.len_utf8() == N);
let mut bytes = [0u8; N];
ch.encode_utf8(&mut bytes);
bytes
}
/// Lossy replacement character (U+FFFD) as UTF-8 bytes.
const LOSSY_REPLACEMENT_CHAR_BYTES: [u8; 3] = to_bytes('\u{FFFD}');
const LOSSY_REPLACEMENT_CHAR_FIRST_BYTE: u8 = LOSSY_REPLACEMENT_CHAR_BYTES[0];
const _: () = assert!(LOSSY_REPLACEMENT_CHAR_FIRST_BYTE == 0xEF);
const MIN_ESCAPED_STR_LEN: usize = 16;
static DOUBLE_QUOTE_STRING_END_TABLE: SafeByteMatchTable =
safe_byte_match_table!(|b| matches!(b, b'"' | b'\r' | b'\n' | b'\\'));
static SINGLE_QUOTE_STRING_END_TABLE: SafeByteMatchTable =
safe_byte_match_table!(|b| matches!(b, b'\'' | b'\r' | b'\n' | b'\\'));
// Same as above, but with 1st byte of lossy replacement character added
static DOUBLE_QUOTE_ESCAPED_MATCH_TABLE: SafeByteMatchTable = safe_byte_match_table!(|b| matches!(
b,
b'"' | b'\r' | b'\n' | b'\\' | LOSSY_REPLACEMENT_CHAR_FIRST_BYTE
));
static SINGLE_QUOTE_ESCAPED_MATCH_TABLE: SafeByteMatchTable = safe_byte_match_table!(|b| matches!(
b,
b'\'' | b'\r' | b'\n' | b'\\' | LOSSY_REPLACEMENT_CHAR_FIRST_BYTE
));
/// Macro to handle a string literal.
///
/// # SAFETY
/// `$delimiter` must be an ASCII byte.
/// Next char in `lexer.source` must be ASCII.
/// `$table` must be a `SafeByteMatchTable`.
/// `$table` must only match `$delimiter`, '\', '\r' or '\n'.
macro_rules! handle_string_literal {
($lexer:ident, $delimiter:literal, $table:ident, $escaped_table:ident) => {{
debug_assert!($delimiter.is_ascii());
if $lexer.context == LexerContext::JsxAttributeValue {
// SAFETY: Caller guarantees `$delimiter` is ASCII, and next char is ASCII
return $lexer.read_jsx_string_literal($delimiter);
}
// Skip opening quote.
// SAFETY: Caller guarantees next byte is ASCII, so safe to advance past it.
let after_opening_quote = $lexer.source.position().add(1);
// Consume bytes which are part of string
let next_byte = byte_search! {
lexer: $lexer,
table: $table,
start: after_opening_quote,
handle_eof: {
$lexer.error(diagnostics::unterminated_string($lexer.unterminated_range()));
return Kind::Undetermined;
},
};
// Found a matching byte.
// Either end of string found, or a line break, or `\` escape.
match next_byte {
$delimiter => {
// SAFETY: Macro user guarantees delimiter is ASCII, so consuming it cannot move
// `lexer.source` off a UTF-8 character boundary.
$lexer.source.next_byte_unchecked();
Kind::Str
}
b'\\' => cold_branch(|| {
handle_string_literal_escape!(
$lexer,
$delimiter,
$escaped_table,
after_opening_quote
)
}),
_ => {
// Line break. This is impossible in valid JS, so cold path.
cold_branch(|| {
debug_assert!(matches!(next_byte, b'\r' | b'\n'));
$lexer.consume_char();
$lexer.error(diagnostics::unterminated_string($lexer.unterminated_range()));
Kind::Undetermined
})
}
}
}};
}
macro_rules! handle_string_literal_escape {
($lexer:ident, $delimiter:literal, $table:ident, $after_opening_quote:ident) => {{
// Create arena string to hold unescaped string.
// We don't know how long string will end up being. Take a guess that total length
// will be double what we've seen so far, or `MIN_ESCAPED_STR_LEN` minimum.
let so_far = $lexer.source.str_from_pos_to_current($after_opening_quote);
let capacity = max(so_far.len() * 2, MIN_ESCAPED_STR_LEN);
let mut str = StringBuilder::with_capacity_in(capacity, $lexer.allocator);
// Push chunk before `\` into `str`.
str.push_str(so_far);
'outer: loop {
// Consume `\`
let escape_start_offset = $lexer.offset();
$lexer.consume_char();
// Consume escape sequence and add char to `str`
let mut is_valid_escape_sequence = true;
$lexer.read_string_escape_sequence(&mut str, false, &mut is_valid_escape_sequence);
if !is_valid_escape_sequence {
let range = Span::new(escape_start_offset, $lexer.offset());
$lexer.error(diagnostics::invalid_escape_sequence(range));
}
// Consume bytes until reach end of string, line break, or another escape
let mut chunk_start = $lexer.source.position();
while let Some(b) = $lexer.peek_byte() {
match b {
b if !$table.matches(b) => {
// SAFETY: A byte is available, as we just peeked it.
// This may put `source`'s position on a UTF-8 continuation byte, which violates
// `Source`'s invariant temporarily, but the guarantees of `SafeByteMatchTable`
// mean `!table.matches(b)` on this branch prevents exiting this loop until
// `source` is positioned on a UTF-8 character boundary again.
$lexer.source.next_byte_unchecked();
}
b if b == $delimiter => {
// End of string found. Push last chunk to `str`.
let chunk = $lexer.source.str_from_pos_to_current(chunk_start);
str.push_str(chunk);
// Consume closing quote.
// SAFETY: Caller guarantees delimiter is ASCII, so consuming it cannot move
// `lexer.source` off a UTF-8 character boundary
$lexer.source.next_byte_unchecked();
break 'outer;
}
b'\\' => {
// Another escape found. Push last chunk to `str`, and loop back to handle escape.
let chunk = $lexer.source.str_from_pos_to_current(chunk_start);
str.push_str(chunk);
continue 'outer;
}
LOSSY_REPLACEMENT_CHAR_FIRST_BYTE => cold_branch(|| {
// If the string contains lone surrogates, the lossy replacement character (U+FFFD)
// is used as start of an escape sequence.
// So an actual lossy escape character has to be escaped too.
// Output it as `\u{FFFD}fffd`.
// Cold branch because this should be very rare in real-world code.
// SAFETY: A byte is available, as we just peeked it, and it's 0xEF.
// 0xEF is always 1st byte of a 3-byte Unicode sequence, so safe to consume 3 bytes.
$lexer.source.next_byte_unchecked();
let next1 = $lexer.source.next_byte_unchecked();
let next2 = $lexer.source.next_byte_unchecked();
if $lexer.token.lone_surrogates()
&& [next1, next2] == [LOSSY_REPLACEMENT_CHAR_BYTES[1], LOSSY_REPLACEMENT_CHAR_BYTES[2]]
{
let chunk = $lexer.source.str_from_pos_to_current(chunk_start);
str.push_str(chunk);
str.push_str("fffd");
chunk_start = $lexer.source.position();
}
}),
_ => {
// Line break. This is impossible in valid JS, so cold path.
return cold_branch(|| {
debug_assert!(matches!(b, b'\r' | b'\n'));
$lexer.consume_char();
$lexer.error(diagnostics::unterminated_string($lexer.unterminated_range()));
Kind::Undetermined
});
}
}
}
// EOF
$lexer.error(diagnostics::unterminated_string($lexer.unterminated_range()));
return Kind::Undetermined;
}
// Convert `str` to arena slice and save to `escaped_strings`
$lexer.save_string(true, str.into_str());
Kind::Str
}};
}
/// 12.9.4 String Literals
impl<'a, C: Config> Lexer<'a, C> {
/// Read string literal delimited with `"`.
/// # SAFETY
/// Next character must be `"`.
pub(super) unsafe fn read_string_literal_double_quote(&mut self) -> Kind {
// SAFETY: Caller guarantees next char is `"`, which is ASCII.
// b'"' is an ASCII byte. `DOUBLE_QUOTE_STRING_END_TABLE` is a `SafeByteMatchTable`.
unsafe {
handle_string_literal!(
self,
b'"',
DOUBLE_QUOTE_STRING_END_TABLE,
DOUBLE_QUOTE_ESCAPED_MATCH_TABLE
)
}
}
/// Read string literal delimited with `'`.
/// # SAFETY
/// Next character must be `'`.
pub(super) unsafe fn read_string_literal_single_quote(&mut self) -> Kind {
// SAFETY: Caller guarantees next char is `'`, which is ASCII.
// b'\'' is an ASCII byte. `SINGLE_QUOTE_STRING_END_TABLE` is a `SafeByteMatchTable`.
unsafe {
handle_string_literal!(
self,
b'\'',
SINGLE_QUOTE_STRING_END_TABLE,
SINGLE_QUOTE_ESCAPED_MATCH_TABLE
)
}
}
/// Save the string if it is escaped
/// This reduces the overall memory consumption while keeping the `Token` size small
/// Strings without escaped values can be retrieved as is from the token span
pub(super) fn save_string(&mut self, has_escape: bool, s: &'a str) {
if has_escape {
self.save_escaped_string(s);
}
}
#[cold]
fn save_escaped_string(&mut self, s: &'a str) {
self.escaped_strings.insert(self.token.start(), s);
self.token.set_escaped(true);
}
pub(crate) fn get_string(&self, token: Token) -> &'a str {
if token.escaped() {
return self.escaped_strings[&token.start()];
}
let source_text = self.source.whole();
let mut start = token.start() as usize;
let mut end = token.end() as usize;
match token.kind() {
Kind::Str => {
// Omit surrounding quotes
start += 1;
end -= 1;
}
Kind::PrivateIdentifier => {
// Omit leading `#`
start += 1;
}
_ => {}
}
&source_text[start..end]
}
}