1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
use memchr::memmem::Finder;
use oxc_ast::CommentKind;
use oxc_syntax::line_terminator::is_line_terminator;
use crate::{config::LexerConfig as Config, diagnostics};
use super::{
Kind, Lexer, cold_branch,
search::{SafeByteMatchTable, byte_search, safe_byte_match_table},
source::SourcePosition,
};
// Irregular line breaks - '\u{2028}' (LS) and '\u{2029}' (PS).
// Both encode in UTF-8 as 3 bytes, and share the same first byte `0xE2`.
const LS_OR_PS_FIRST: u8 = 0xE2;
// 2nd and 3rd UTF-8 bytes of LS ('\u{2028}')
const LS_BYTES_2_AND_3: [u8; 2] = [0x80, 0xA8];
// 2nd and 3rd UTF-8 bytes of PS ('\u{2029}')
const PS_BYTES_2_AND_3: [u8; 2] = [0x80, 0xA9];

// Matches any byte which may begin a line break (regular `\r`/`\n`, or `0xE2`
// which may start an irregular LS/PS line break)
static LINE_BREAK_TABLE: SafeByteMatchTable =
    safe_byte_match_table!(|b| matches!(b, b'\r' | b'\n' | LS_OR_PS_FIRST));

// As `LINE_BREAK_TABLE`, plus `*` (potential start of a `*/` comment terminator)
static MULTILINE_COMMENT_START_TABLE: SafeByteMatchTable =
    safe_byte_match_table!(|b| matches!(b, b'*' | b'\r' | b'\n' | LS_OR_PS_FIRST));
impl<'a, C: Config> Lexer<'a, C> {
    /// Section 12.4 Single Line Comment
    ///
    /// Skip the rest of a `//` comment, record it as line-comment trivia, and return
    /// [`Kind::Skip`]. The terminating line break (regular `\r`/`\n`, or irregular
    /// LS/PS) is consumed and the token's `is_on_new_line` flag is set. If the comment
    /// runs to end of file, the trivia is still recorded but the flag is left unchanged.
    pub(super) fn skip_single_line_comment(&mut self) -> Kind {
        byte_search! {
            lexer: self,
            table: LINE_BREAK_TABLE,
            continue_if: (next_byte, pos) {
                // Match found. Decide whether to continue searching.
                // If this is end of comment, create trivia, and advance `pos` to after line break.
                // Do that here rather than in `handle_match`, to avoid branching twice on value of
                // the matched byte.
                #[expect(clippy::if_not_else)]
                if next_byte != LS_OR_PS_FIRST {
                    // `\r` or `\n`
                    self.trivia_builder
                        .add_line_comment(self.token.start(), self.source.offset_of(pos), self.source.whole());
                    // SAFETY: Safe to consume `\r` or `\n` as both are ASCII
                    pos = unsafe { pos.add(1) };
                    // We've found the end. Do not continue searching.
                    false
                } else {
                    // `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char.
                    // Either way, Unicode is uncommon, so make this a cold branch.
                    cold_branch(|| {
                        // SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char.
                        // So safe to advance `pos` by 1 and read 2 bytes.
                        let next2 = unsafe { pos.add(1).read2() };
                        if matches!(next2, LS_BYTES_2_AND_3 | PS_BYTES_2_AND_3) {
                            // Irregular line break (LS or PS) — this ends the comment too
                            self.trivia_builder
                                .add_line_comment(self.token.start(), self.source.offset_of(pos), self.source.whole());
                            // Advance `pos` to after this char.
                            // SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char,
                            // so consuming 3 bytes will place `pos` on next UTF-8 char boundary.
                            pos = unsafe { pos.add(3) };
                            // We've found the end. Do not continue searching.
                            false
                        } else {
                            // Some other Unicode char beginning with `0xE2`.
                            // Skip 3 bytes (macro skips 1 already, so skip 2 here), and continue searching.
                            // SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char,
                            // so consuming 3 bytes will place `pos` on next UTF-8 char boundary.
                            pos = unsafe { pos.add(2) };
                            true
                        }
                    })
                }
            },
            handle_eof: {
                // Comment runs to end of file — still record it as trivia.
                // No line break was consumed, so `is_on_new_line` is not set here.
                self.trivia_builder.add_line_comment(self.token.start(), self.offset(), self.source.whole());
                return Kind::Skip;
            },
        };

        // A line break ended the comment and was consumed above,
        // so the next token starts on a new line.
        self.token.set_is_on_new_line(true);
        Kind::Skip
    }
    /// Section 12.4 Multi Line Comment
    ///
    /// Skip the body of a `/* ... */` comment, record it as block-comment trivia, and
    /// return [`Kind::Skip`] — or report an error and return [`Kind::Eof`] if the
    /// comment is unterminated.
    pub(super) fn skip_multi_line_comment(&mut self) -> Kind {
        // We need to identify whether the comment contains line breaks or not
        // (`CommentKind::SingleLineBlock` vs `CommentKind::MultiLineBlock`).
        // So we have to use the loop below for the first line of the comment even if
        // `Token`'s `is_on_new_line` flag is already set.
        // If the loop finds a line break before end of the comment, we then switch to
        // the faster `skip_multi_line_comment_after_line_break` which searches
        // for the end of the comment using `memchr`.
        byte_search! {
            lexer: self,
            table: MULTILINE_COMMENT_START_TABLE,
            continue_if: (next_byte, pos) {
                // Match found. Decide whether to continue searching.
                if next_byte == b'*' {
                    // SAFETY: Next byte is `*` (ASCII) so after it is UTF-8 char boundary
                    let after_star = unsafe { pos.add(1) };
                    if after_star.is_not_end_of(&self.source) {
                        // If next byte isn't `/`, continue
                        // SAFETY: Have checked there's at least 1 further byte to read
                        if unsafe { after_star.read() } == b'/' {
                            // Consume `*/`
                            // SAFETY: Consuming `*/` leaves `pos` on a UTF-8 char boundary
                            pos = unsafe { pos.add(2) };
                            false
                        } else {
                            true
                        }
                    } else {
                        // This is last byte in file. Continue to `handle_eof`.
                        // This is illegal in valid JS, so mark this branch cold.
                        cold_branch(|| true)
                    }
                } else if next_byte == LS_OR_PS_FIRST {
                    // `0xE2`. Could be first byte of LS/PS, or could be some other Unicode char.
                    // Either way, Unicode is uncommon, so make this a cold branch.
                    cold_branch(|| {
                        // SAFETY: Next byte is `0xE2` which is always 1st byte of a 3-byte UTF-8 char.
                        // So safe to advance `pos` by 1 and read 2 bytes.
                        let next2 = unsafe { pos.add(1).read2() };
                        if matches!(next2, LS_BYTES_2_AND_3 | PS_BYTES_2_AND_3) {
                            // Irregular line break
                            self.token.set_is_on_new_line(true);
                            // Ideally we'd go on to `skip_multi_line_comment_after_line_break` here
                            // but can't do that easily because can't use `return` in a closure.
                            // But irregular line breaks are rare anyway.
                        }
                        // Either way, continue searching.
                        // Skip 3 bytes (macro skips 1 already, so skip 2 here), and continue searching.
                        // SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char,
                        // so consuming 3 bytes will place `pos` on next UTF-8 char boundary.
                        pos = unsafe { pos.add(2) };
                        true
                    })
                } else {
                    // Regular line break.
                    // No need to look for more line breaks, so switch to faster search just for `*/`.
                    self.token.set_is_on_new_line(true);
                    // SAFETY: Regular line breaks are ASCII, so skipping 1 byte is a UTF-8 char boundary.
                    let after_line_break = unsafe { pos.add(1) };
                    return self.skip_multi_line_comment_after_line_break(after_line_break);
                }
            },
            handle_eof: {
                self.error(diagnostics::unterminated_multi_line_comment(self.unterminated_range()));
                return Kind::Eof;
            },
        };

        // `*/` was found with no regular line break before it.
        // NOTE(review): if an irregular line break (LS/PS) was matched above, we set
        // `is_on_new_line` but still fall through to here and record `SingleLineBlock` —
        // confirm that comment kind is intended in that (rare) case.
        self.trivia_builder.add_block_comment(
            self.token.start(),
            self.offset(),
            CommentKind::SingleLineBlock,
            self.source.whole(),
        );
        Kind::Skip
    }
fn skip_multi_line_comment_after_line_break(&mut self, pos: SourcePosition<'a>) -> Kind {
// Can use `memchr` here as only searching for 1 pattern.
// Cache `Finder` instance on `Lexer` as there's a significant cost to creating it.
// `Finder::new` isn't a const function, so can't make it a `static`, and `lazy_static!`
// has a cost each time it's deref-ed. Creating `Finder` unconditionally in `Lexer::new`
// would be efficient for files containing multi-line comments, but would impose pointless
// cost on files which don't. So this is the fastest solution.
let finder = self.multi_line_comment_end_finder.get_or_insert_with(|| Finder::new("*/"));
let remaining = self.source.str_from_pos_to_end(pos).as_bytes();
if let Some(index) = finder.find(remaining) {
// SAFETY: `pos + index + 2` is end of `*/`, so a valid `SourcePosition`
self.source.set_position(unsafe { pos.add(index + 2) });
self.trivia_builder.add_block_comment(
self.token.start(),
self.offset(),
CommentKind::MultiLineBlock,
self.source.whole(),
);
Kind::Skip
} else {
self.source.advance_to_end();
self.error(diagnostics::unterminated_multi_line_comment(self.unterminated_range()));
Kind::Eof
}
}
/// Section 12.5 Hashbang Comments.
///
/// # SAFETY
/// Next 2 bytes must be `#!`.
pub(super) unsafe fn read_hashbang_comment(&mut self) -> Kind {
debug_assert!(self.peek_2_bytes() == Some([b'#', b'!']));
// SAFETY: Caller guarantees next 2 bytes are `#!`
unsafe {
self.source.next_byte_unchecked();
self.source.next_byte_unchecked();
}
while let Some(c) = self.peek_char() {
if is_line_terminator(c) {
break;
}
self.consume_char();
}
Kind::HashbangComment
}
}