1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
use super::{Cursor, Error, Span, Tokenizer};
use crate::syntax::{
ast::Position,
lexer::{Token, TokenKind},
};
use bitflags::bitflags;
use boa_interner::{Interner, Sym};
use boa_profiler::Profiler;
use std::{
io::{self, ErrorKind, Read},
str::{self, FromStr},
};
/// Lexer for regular expression literals.
///
/// A field-less marker type: the lexing logic lives in its `Tokenizer` implementation,
/// which reads the pattern body up to the closing `/` and then the trailing flag letters.
#[derive(Debug, Clone, Copy)]
pub(super) struct RegexLiteral;
impl<R> Tokenizer<R> for RegexLiteral {
fn lex(
&mut self,
cursor: &mut Cursor<R>,
start_pos: Position,
interner: &mut Interner,
) -> Result<Token, Error>
where
R: Read,
{
let _timer = Profiler::global().start_event("RegexLiteral", "Lexing");
let mut body = Vec::new();
loop {
match cursor.next_byte()? {
None => {
return Err(Error::syntax(
"abrupt end on regular expression",
cursor.pos(),
));
}
Some(b) => {
match b {
b'/' => break,
b'\n' | b'\r' => {
return Err(Error::syntax(
"new lines are not allowed in regular expressions",
cursor.pos(),
));
}
0xE2 if (cursor.peek_n(2)? == 0xA8_80 || cursor.peek_n(2)? == 0xA9_80) => {
return Err(Error::syntax(
"new lines are not allowed in regular expressions",
cursor.pos(),
));
}
b'\\' => {
body.push(b'\\');
if let Some(sc) = cursor.next_byte()? {
match sc {
b'\n' | b'\r' => {
return Err(Error::syntax(
"new lines are not allowed in regular expressions",
cursor.pos(),
));
}
0xE2 if (cursor.peek_n(2)? == 0xA8_80
|| cursor.peek_n(2)? == 0xA9_80) =>
{
return Err(Error::syntax(
"new lines are not allowed in regular expressions",
cursor.pos(),
));
}
b => body.push(b),
}
} else {
return Err(Error::syntax(
"abrupt end on regular expression",
cursor.pos(),
));
}
}
_ => body.push(b),
}
}
}
}
let mut flags = Vec::new();
let flags_start = cursor.pos();
cursor.take_while_ascii_pred(&mut flags, &char::is_alphabetic)?;
let flags_str = unsafe { str::from_utf8_unchecked(flags.as_slice()) };
if let Ok(body_str) = str::from_utf8(body.as_slice()) {
Ok(Token::new(
TokenKind::regular_expression_literal(
interner.get_or_intern(body_str),
parse_regex_flags(flags_str, flags_start, interner)?,
),
Span::new(start_pos, cursor.pos()),
))
} else {
Err(Error::from(io::Error::new(
ErrorKind::InvalidData,
"Invalid UTF-8 character in regular expressions",
)))
}
}
}
bitflags! {
    /// Set of flags attached to a regular expression.
    ///
    /// Every flag occupies one bit of the underlying `u8`. The letter noted on each
    /// constant is the one the `FromStr` implementation of this type maps to it.
    #[derive(Default)]
    pub struct RegExpFlags: u8 {
        /// The `g` flag.
        const GLOBAL = 1 << 0;
        /// The `i` flag.
        const IGNORE_CASE = 1 << 1;
        /// The `m` flag.
        const MULTILINE = 1 << 2;
        /// The `s` flag.
        const DOT_ALL = 1 << 3;
        /// The `u` flag.
        const UNICODE = 1 << 4;
        /// The `y` flag.
        const STICKY = 1 << 5;
        /// The `d` flag.
        const HAS_INDICES = 1 << 6;
    }
}
impl FromStr for RegExpFlags {
    type Err = String;

    /// Parses a string of regular expression flag letters.
    ///
    /// Accepted letters are `g`, `i`, `m`, `s`, `u`, `y` and `d`, in any order, each at
    /// most once. An empty string yields the empty flag set.
    ///
    /// # Errors
    ///
    /// Returns a message naming the offending character when it is not a known flag
    /// (`invalid regular expression flag …`) or when it appears a second time
    /// (`repeated regular expression flag …`).
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let mut flags = Self::default();

        // Walk characters rather than bytes so that a non-ASCII character is reported
        // as itself instead of as its first UTF-8 byte reinterpreted as Latin-1.
        for c in s.chars() {
            let new_flag = match c {
                'g' => Self::GLOBAL,
                'i' => Self::IGNORE_CASE,
                'm' => Self::MULTILINE,
                's' => Self::DOT_ALL,
                'u' => Self::UNICODE,
                'y' => Self::STICKY,
                'd' => Self::HAS_INDICES,
                _ => return Err(format!("invalid regular expression flag {}", c)),
            };

            if flags.contains(new_flag) {
                return Err(format!("repeated regular expression flag {}", c));
            }
            flags.insert(new_flag);
        }

        Ok(flags)
    }
}
fn parse_regex_flags(s: &str, start: Position, interner: &mut Interner) -> Result<Sym, Error> {
match RegExpFlags::from_str(s) {
Err(message) => Err(Error::Syntax(message.into(), start)),
Ok(flags) => Ok(interner.get_or_intern(flags.to_string())),
}
}
/// Renders the set flags as their letters, always in the fixed order `d`, `g`, `i`, `m`,
/// `s`, `u`, `y`; an empty set renders as the empty string.
///
/// `Display` is implemented instead of `ToString` directly: the standard library's
/// blanket implementation still provides `to_string()` with the same output, and the
/// flags additionally become usable in `format!`-style macros.
impl std::fmt::Display for RegExpFlags {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Listed in output order, which is independent of bit order.
        let letters = [
            (Self::HAS_INDICES, "d"),
            (Self::GLOBAL, "g"),
            (Self::IGNORE_CASE, "i"),
            (Self::MULTILINE, "m"),
            (Self::DOT_ALL, "s"),
            (Self::UNICODE, "u"),
            (Self::STICKY, "y"),
        ];

        for &(flag, letter) in letters.iter() {
            if self.contains(flag) {
                f.write_str(letter)?;
            }
        }

        Ok(())
    }
}