1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
//! Boa's lexing for ECMAScript regex literals.
use crate::lexer::{Cursor, Error, Token, TokenKind, Tokenizer};
use crate::source::ReadChar;
use bitflags::bitflags;
use boa_ast::PositionGroup;
use boa_interner::Interner;
use regress::Flags;
use std::fmt::{Display, Write};
use std::str::{self, FromStr};
/// Capacity of the fixed-size buffer used to collect the flags of a regular expression literal.
///
/// A literal whose flag count exceeds this value is rejected with a
/// "too many flags" syntax error.
const MAXIMUM_REGEX_FLAGS: usize = 8;
/// Regex literal lexing.
///
/// Lexes a regular expression literal: the pattern body up to the closing `/`,
/// followed by the flags.
///
/// Expects: Initial '/' to already be consumed by cursor. When `init_with_eq` is set, the `=`
/// that directly followed it has been consumed as well and is treated as the first character
/// of the pattern.
///
/// More information:
/// - [ECMAScript reference][spec]
/// - [MDN documentation][mdn]
///
/// [spec]: https://tc39.es/ecma262/#sec-literals-regular-expression-literals
/// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions
#[derive(Debug, Clone, Copy)]
pub(super) struct RegexLiteral {
// If there are ever more cases than only `/=`,
// then use `Option<u8>` or (more correct) `Option<enum>`.
/// Whether a `=` right after the opening `/` was already consumed before lexing started.
init_with_eq: bool,
}
impl RegexLiteral {
/// Creates a new regex literal lexer.
///
/// `init_with_eq` states whether a `=` directly following the opening `/` has already been
/// consumed; when `true`, lexing starts with that `=` as the first character of the pattern.
pub(super) fn new(init_with_eq: bool) -> Self {
Self { init_with_eq }
}
}
impl<R> Tokenizer<R> for RegexLiteral {
fn lex(
&mut self,
cursor: &mut Cursor<R>,
start_pos: PositionGroup,
interner: &mut Interner,
) -> Result<Token, Error>
where
R: ReadChar,
{
let mut body = Vec::new();
if self.init_with_eq {
body.push(u32::from(b'='));
}
let mut is_class_char = false;
// Lex RegularExpressionBody.
loop {
match cursor.next_char()? {
None => {
// Abrupt end.
return Err(Error::syntax(
"abrupt end on regular expression",
cursor.pos(),
));
}
Some(b) => {
match b {
// /
0x2F if !is_class_char => break, // RegularExpressionBody finished.
// [
0x5B => {
is_class_char = true;
body.push(b);
}
// ]
0x5D if is_class_char => {
is_class_char = false;
body.push(b);
}
// \n | \r | \u{2028} | \u{2029}
0xA | 0xD | 0x2028 | 0x2029 => {
// Not allowed in Regex literal.
return Err(Error::syntax(
"new lines are not allowed in regular expressions",
cursor.pos(),
));
}
// \
0x5C => {
// Escape sequence
body.push(b);
if let Some(sc) = cursor.next_char()? {
match sc {
// \n | \r | \u{2028} | \u{2029}
0xA | 0xD | 0x2028 | 0x2029 => {
// Not allowed in Regex literal.
return Err(Error::syntax(
"new lines are not allowed in regular expressions",
cursor.pos(),
));
}
b => body.push(b),
}
} else {
// Abrupt end of regex.
return Err(Error::syntax(
"abrupt end on regular expression",
cursor.pos(),
));
}
}
_ => body.push(b),
}
}
}
}
let mut flags: [u32; MAXIMUM_REGEX_FLAGS] = [0; MAXIMUM_REGEX_FLAGS];
let n = cursor.take_array_alphabetic(&mut flags)?;
if n > MAXIMUM_REGEX_FLAGS {
// There can only be a maximum of 8 flags.
return Err(Error::syntax(
"Invalid regular expression: too many flags",
start_pos,
));
}
let flags: RegExpFlags =
RegExpFlags::try_from(&flags[..n]).map_err(|e| Error::syntax(e, start_pos))?;
// We have a vague hint of the size of this vector in the best case scenario.
let mut body_utf16 = Vec::with_capacity(body.len());
// We convert the body to UTF-16 since it may contain code points that are not valid UTF-8.
// We already know that the body is valid UTF-16. Casting is fine.
#[allow(clippy::cast_possible_truncation)]
for cp in &body {
let cp = *cp;
if cp <= 0xFFFF {
body_utf16.push(cp as u16);
} else {
let cp = cp - 0x1_0000;
let high = 0xD800 | ((cp >> 10) as u16);
let low = 0xDC00 | ((cp as u16) & 0x3FF);
body_utf16.push(high);
body_utf16.push(low);
}
}
// Only try to parse and validate, do not optimize/compile.
drop(
regress::backends::try_parse(body.into_iter(), flags.into()).map_err(|error| {
Error::syntax(
format!("Invalid regular expression literal: {error}"),
start_pos,
)
})?,
);
Ok(Token::new_by_position_group(
TokenKind::regular_expression_literal(
interner.get_or_intern(body_utf16.as_slice()),
interner.get_or_intern(flags.to_string().as_str()),
),
start_pos,
cursor.pos_group(),
))
}
}
bitflags! {
    /// Set of flags attached to a regular expression, one bit per flag.
    #[derive(Debug, Default, Copy, Clone)]
    pub struct RegExpFlags: u8 {
        /// `g`: test the regular expression against all possible matches in a string
        /// instead of only the first one.
        const GLOBAL = 1 << 0;
        /// `i`: ignore case while attempting a match.
        const IGNORE_CASE = 1 << 1;
        /// `m`: search in strings across multiple lines.
        const MULTILINE = 1 << 2;
        /// `s`: let `.` match newlines as well.
        const DOT_ALL = 1 << 3;
        /// `u`: enable Unicode features.
        const UNICODE = 1 << 4;
        /// `y`: make the search sticky.
        const STICKY = 1 << 5;
        /// `d`: expose the start and end indices of captured substrings in the result.
        const HAS_INDICES = 1 << 6;
        /// `v`: enable UnicodeSets features.
        const UNICODE_SETS = 1 << 7;
    }
}
impl TryFrom<&[u32]> for RegExpFlags {
    type Error = String;

    /// Builds a flag set from a slice of code points, one flag letter per element.
    ///
    /// # Errors
    ///
    /// Returns a message when an element is not a valid `char`, is not a known flag
    /// letter, repeats an earlier flag, or when both `u` and `v` are present.
    fn try_from(value: &[u32]) -> Result<Self, Self::Error> {
        let mut seen = Self::default();

        for &code_point in value {
            let letter = match char::from_u32(code_point) {
                Some(letter) => letter,
                None => {
                    return Err(format!(
                        "Invalid regular expression flag: {code_point}"
                    ));
                }
            };

            let flag = match letter {
                'd' => Self::HAS_INDICES,
                'g' => Self::GLOBAL,
                'i' => Self::IGNORE_CASE,
                'm' => Self::MULTILINE,
                's' => Self::DOT_ALL,
                'u' => Self::UNICODE,
                'v' => Self::UNICODE_SETS,
                'y' => Self::STICKY,
                unknown => return Err(format!("invalid regular expression flag {unknown}")),
            };

            // Each flag may appear at most once.
            if seen.contains(flag) {
                return Err(format!("repeated regular expression flag {letter}"));
            }
            seen.insert(flag);
        }

        // `u` and `v` are mutually exclusive; `contains` requires both bits to be set.
        if seen.contains(Self::UNICODE | Self::UNICODE_SETS) {
            return Err("cannot use both 'u' and 'v' flags".into());
        }

        Ok(seen)
    }
}
impl FromStr for RegExpFlags {
    type Err = String;

    /// Parses a flag set from a string of flag letters (e.g. `"gi"`).
    ///
    /// # Errors
    ///
    /// Returns a message when a character is not a known flag letter, when a flag
    /// is repeated, or when both `u` and `v` are present.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let mut flags = Self::default();
        // Iterate by `char`, not by byte: with bytes, a non-ASCII flag would be reported
        // as the Latin-1 interpretation of its first UTF-8 byte instead of the character
        // that was actually written.
        for c in s.chars() {
            let new_flag = match c {
                'g' => Self::GLOBAL,
                'i' => Self::IGNORE_CASE,
                'm' => Self::MULTILINE,
                's' => Self::DOT_ALL,
                'u' => Self::UNICODE,
                'y' => Self::STICKY,
                'd' => Self::HAS_INDICES,
                'v' => Self::UNICODE_SETS,
                _ => return Err(format!("invalid regular expression flag {c}")),
            };
            // Each flag may appear at most once.
            if flags.contains(new_flag) {
                return Err(format!("repeated regular expression flag {c}"));
            }
            flags.insert(new_flag);
        }
        // `u` and `v` are mutually exclusive.
        if flags.contains(Self::UNICODE) && flags.contains(Self::UNICODE_SETS) {
            return Err("cannot use both 'u' and 'v' flags".into());
        }
        Ok(flags)
    }
}
impl Display for RegExpFlags {
    /// Writes the letter of every set flag, in the fixed order `d g i m s u y v`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // NOTE(review): the ECMAScript `flags` getter lists `v` before `y`; this table keeps
        // the existing order - confirm whether any consumer depends on it before changing.
        let letters = [
            (Self::HAS_INDICES, 'd'),
            (Self::GLOBAL, 'g'),
            (Self::IGNORE_CASE, 'i'),
            (Self::MULTILINE, 'm'),
            (Self::DOT_ALL, 's'),
            (Self::UNICODE, 'u'),
            (Self::STICKY, 'y'),
            (Self::UNICODE_SETS, 'v'),
        ];

        for &(flag, letter) in &letters {
            if self.contains(flag) {
                f.write_char(letter)?;
            }
        }
        Ok(())
    }
}
impl From<RegExpFlags> for Flags {
    /// Converts to the `regress` flag struct.
    ///
    /// Only the ignore-case, multiline, dot-all, unicode and unicode-sets flags are
    /// carried over; every other field keeps its default value.
    fn from(value: RegExpFlags) -> Self {
        let icase = value.contains(RegExpFlags::IGNORE_CASE);
        let multiline = value.contains(RegExpFlags::MULTILINE);
        let dot_all = value.contains(RegExpFlags::DOT_ALL);
        let unicode = value.contains(RegExpFlags::UNICODE);
        let unicode_sets = value.contains(RegExpFlags::UNICODE_SETS);

        Self {
            icase,
            multiline,
            dot_all,
            unicode,
            unicode_sets,
            ..Self::default()
        }
    }
}