1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
//! This module implements lexing for string literals used in the JavaScript programing language.
use super::{Cursor, Error, Tokenizer};
use crate::{
profiler::BoaProfiler,
syntax::{
ast::{Position, Span},
lexer::{Token, TokenKind},
},
};
use std::{
char::{decode_utf16, from_u32},
convert::TryFrom,
io::{self, ErrorKind, Read},
str,
};
/// String literal lexing.
///
/// Note: expects for the initializer `'` or `"` to already be consumed from the cursor.
///
/// More information:
/// - [ECMAScript reference][spec]
/// - [MDN documentation][mdn]
///
/// [spec]: https://tc39.es/ecma262/#sec-literals-string-literals
/// [mdn]: https://developer.cdn.mozilla.net/en-US/docs/Web/JavaScript/Reference/Global_Objects/String
#[derive(Debug, Clone, Copy)]
pub(super) struct StringLiteral {
terminator: StringTerminator,
}
impl StringLiteral {
/// Creates a new string literal lexer.
pub(super) fn new(init: char) -> Self {
let terminator = match init {
'\'' => StringTerminator::SingleQuote,
'"' => StringTerminator::DoubleQuote,
_ => unreachable!(),
};
Self { terminator }
}
}
/// Terminator for the string.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum StringTerminator {
SingleQuote,
DoubleQuote,
}
impl<R> Tokenizer<R> for StringLiteral {
fn lex(&mut self, cursor: &mut Cursor<R>, start_pos: Position) -> Result<Token, Error>
where
R: Read,
{
let _timer = BoaProfiler::global().start_event("StringLiteral", "Lexing");
let mut buf = String::new();
loop {
let next_chr_start = cursor.pos();
let next_chr = cursor.next_char()?.ok_or_else(|| {
Error::from(io::Error::new(
ErrorKind::UnexpectedEof,
"unterminated string literal",
))
})?;
match next_chr {
'\'' if self.terminator == StringTerminator::SingleQuote => {
break;
}
'"' if self.terminator == StringTerminator::DoubleQuote => {
break;
}
'\\' => {
let _timer = BoaProfiler::global()
.start_event("StringLiteral - escape sequence", "Lexing");
let escape = cursor.next_char()?.ok_or_else(|| {
Error::from(io::Error::new(
ErrorKind::UnexpectedEof,
"unterminated escape sequence in string literal",
))
})?;
if escape != '\n' {
let escaped_ch = match escape {
'n' => '\n',
'r' => '\r',
't' => '\t',
'b' => '\x08',
'f' => '\x0c',
'0' => '\0',
'x' => {
let mut nums = [0u8; 2];
cursor.fill_bytes(&mut nums)?;
let nums = str::from_utf8(&nums).expect("non-UTF-8 bytes found");
let as_num = match u64::from_str_radix(&nums, 16) {
Ok(v) => v,
Err(_) => 0,
};
match from_u32(as_num as u32) {
Some(v) => v,
None => {
return Err(Error::syntax(
format!(
"{}: {} is not a valid Unicode scalar value",
cursor.pos(),
as_num
),
cursor.pos(),
))
}
}
}
'u' => {
// There are 2 types of codepoints. Surragate codepoints and
// unicode codepoints. UTF-16 could be surrogate codepoints,
// "\uXXXX\uXXXX" which make up a single unicode codepoint. We will
// need to loop to make sure we catch all UTF-16 codepoints
// Support \u{X..X} (Unicode Codepoint)
if cursor.next_is('{')? {
cursor.next_char()?.expect("{ character vanished"); // Consume the '{'.
// The biggest code point is 0x10FFFF
// TODO: use bytes for a bit better performance (using stack)
let mut code_point = String::with_capacity(6);
cursor.take_until('}', &mut code_point)?;
cursor.next_char()?.expect("} character vanished"); // Consume the '}'.
// We know this is a single unicode codepoint, convert to u32
let as_num =
u32::from_str_radix(&code_point, 16).map_err(|_| {
Error::syntax(
"malformed Unicode character escape sequence",
cursor.pos(),
)
})?;
if as_num > 0x10_FFFF {
return Err(Error::syntax("Unicode codepoint must not be greater than 0x10FFFF in escape sequence", cursor.pos()));
}
char::try_from(as_num).map_err(|_| {
Error::syntax(
"invalid Unicode escape sequence",
cursor.pos(),
)
})?
} else {
let mut codepoints: Vec<u16> = vec![];
loop {
// Collect each character after \u e.g \uD83D will give "D83D"
let mut code_point = [0u8; 4];
cursor.fill_bytes(&mut code_point)?;
// Convert to u16
let as_num = match u16::from_str_radix(
str::from_utf8(&code_point)
.expect("the cursor returned invalid UTF-8"),
16,
) {
Ok(v) => v,
Err(_) => 0,
};
codepoints.push(as_num);
// Check for another UTF-16 codepoint
if cursor.next_is('\\')? && cursor.next_is('u')? {
continue;
}
break;
}
// codepoints length should either be 1 (unicode codepoint) or
// 2 (surrogate codepoint). Rust's decode_utf16 will deal with
// it regardless
// TODO: do not panic with invalid code points.
decode_utf16(codepoints.iter().copied())
.next()
.expect("Could not get next codepoint")
.expect("Could not get next codepoint")
}
}
'\'' | '"' | '\\' => escape,
ch => {
let details = format!(
"invalid escape sequence `{}` at line {}, column {}",
next_chr_start.line_number(),
next_chr_start.column_number(),
ch
);
return Err(Error::syntax(details, cursor.pos()));
}
};
buf.push(escaped_ch);
}
}
next_ch => buf.push(next_ch),
}
}
Ok(Token::new(
TokenKind::string_literal(buf),
Span::new(start_pos, cursor.pos()),
))
}
}