solar_parse/lexer/cursor/
mod.rs1use memchr::memmem;
6use solar_ast::{Base, StrKind};
7use solar_data_structures::hint::unlikely;
8use std::sync::OnceLock;
9
10pub mod token;
11use token::{RawLiteralKind, RawToken, RawTokenKind};
12
13#[cfg(test)]
14mod tests;
15
16#[inline]
18pub const fn is_whitespace(c: char) -> bool {
19 is_whitespace_byte(ch2u8(c))
20}
/// Returns `true` if `c` is one of the four whitespace bytes: space, horizontal tab, line feed
/// or carriage return.
#[inline]
pub const fn is_whitespace_byte(c: u8) -> bool {
    c == b' ' || c == b'\t' || c == b'\n' || c == b'\r'
}
26
27#[inline]
29pub const fn is_id_start(c: char) -> bool {
30 is_id_start_byte(ch2u8(c))
31}
/// Returns `true` if `c` may begin an identifier: an ASCII letter, `_` or `$`.
#[inline]
pub const fn is_id_start_byte(c: u8) -> bool {
    c.is_ascii_alphabetic() || c == b'_' || c == b'$'
}
37
38#[inline]
40pub const fn is_id_continue(c: char) -> bool {
41 is_id_continue_byte(ch2u8(c))
42}
/// Returns `true` if `c` may appear after the first byte of an identifier: an ASCII letter, an
/// ASCII digit, `_` or `$`.
#[inline]
pub const fn is_id_continue_byte(c: u8) -> bool {
    c.is_ascii_alphanumeric() || c == b'_' || c == b'$'
}
48
49#[inline]
56pub const fn is_ident(s: &str) -> bool {
57 is_ident_bytes(s.as_bytes())
58}
59
60pub const fn is_ident_bytes(s: &[u8]) -> bool {
64 let [first, ref rest @ ..] = *s else {
67 return false;
68 };
69
70 if !is_id_start_byte(first) {
71 return false;
72 }
73
74 let mut i = 0;
75 while i < rest.len() {
76 if !is_id_continue_byte(rest[i]) {
77 return false;
78 }
79 i += 1;
80 }
81
82 true
83}
84
/// Narrows a `char` to a single byte for use with the ASCII-only byte classifiers.
///
/// ASCII characters map to their own byte value. Every non-ASCII character maps to `0xFF`, a
/// byte that is not ASCII and therefore is rejected by `is_whitespace_byte`, `is_id_start_byte`
/// and `is_id_continue_byte`.
///
/// A plain `c as u32 as u8` must not be used here: it keeps only the low 8 bits of the code
/// point, so e.g. U+0120 would alias `b' '` and U+0161 would alias `b'a'`, making the
/// `char`-based classifiers report non-ASCII characters as whitespace or identifier characters.
#[inline(always)]
const fn ch2u8(c: char) -> u8 {
    /// Stand-in for "not an ASCII character"; matched by none of the byte classifiers.
    const NON_ASCII: u8 = 0xFF;

    if c.is_ascii() {
        // Lossless: ASCII code points are < 0x80.
        c as u8
    } else {
        NON_ASCII
    }
}
90
/// Sentinel byte (NUL) that stands for "end of input" when peeking past the remaining bytes.
const EOF: u8 = 0;
92
/// Byte-level cursor over source text.
///
/// The cursor owns an iterator over the not-yet-consumed bytes of the input; consuming input
/// means advancing that iterator.
#[derive(Debug, Clone)]
pub struct Cursor<'a> {
    /// Remaining (unconsumed) input bytes.
    bytes: std::slice::Iter<'a, u8>,
}
101
102impl<'a> Cursor<'a> {
103 pub fn new(input: &'a str) -> Self {
105 Cursor { bytes: input.as_bytes().iter() }
106 }
107
108 pub fn advance_token(&mut self) -> RawToken {
110 let start = self.as_ptr();
113
114 let first_char = match self.bump_ret() {
115 Some(c) => c,
116 None => return RawToken::EOF,
117 };
118 let token_kind = self.advance_token_kind(first_char);
119
120 let len = unsafe { self.as_ptr().offset_from_unsigned(start) };
122
123 RawToken::new(token_kind, len as u32)
124 }
125
126 #[inline]
127 fn advance_token_kind(&mut self, first_char: u8) -> RawTokenKind {
128 match first_char {
129 b'/' => match self.first() {
131 b'/' => self.line_comment(),
132 b'*' => self.block_comment(),
133 _ => RawTokenKind::Slash,
134 },
135
136 c if is_whitespace_byte(c) => self.whitespace(),
138
139 c if is_id_start_byte(c) => self.ident_or_prefixed_literal(c),
141
142 b'0'..=b'9' => {
144 let kind = self.number(first_char);
145 RawTokenKind::Literal { kind }
146 }
147 b'.' if self.first().is_ascii_digit() => {
148 let kind = self.rational_number_after_dot(Base::Decimal);
149 RawTokenKind::Literal { kind }
150 }
151
152 b';' => RawTokenKind::Semi,
154 b',' => RawTokenKind::Comma,
155 b'.' => RawTokenKind::Dot,
156 b'(' => RawTokenKind::OpenParen,
157 b')' => RawTokenKind::CloseParen,
158 b'{' => RawTokenKind::OpenBrace,
159 b'}' => RawTokenKind::CloseBrace,
160 b'[' => RawTokenKind::OpenBracket,
161 b']' => RawTokenKind::CloseBracket,
162 b'~' => RawTokenKind::Tilde,
163 b'?' => RawTokenKind::Question,
164 b':' => RawTokenKind::Colon,
165 b'=' => RawTokenKind::Eq,
166 b'!' => RawTokenKind::Bang,
167 b'<' => RawTokenKind::Lt,
168 b'>' => RawTokenKind::Gt,
169 b'-' => RawTokenKind::Minus,
170 b'&' => RawTokenKind::And,
171 b'|' => RawTokenKind::Or,
172 b'+' => RawTokenKind::Plus,
173 b'*' => RawTokenKind::Star,
174 b'^' => RawTokenKind::Caret,
175 b'%' => RawTokenKind::Percent,
176
177 b'\'' | b'"' => {
179 let terminated = self.eat_string(first_char);
180 let kind = RawLiteralKind::Str { kind: StrKind::Str, terminated };
181 RawTokenKind::Literal { kind }
182 }
183
184 _ => {
185 if unlikely(!first_char.is_ascii()) {
186 self.bump_utf8_with(first_char);
187 }
188 RawTokenKind::Unknown
189 }
190 }
191 }
192
193 #[inline(never)]
194 fn line_comment(&mut self) -> RawTokenKind {
195 debug_assert!(self.prev() == b'/' && self.first() == b'/');
196 self.bump();
197
198 let is_doc = matches!(self.first(), b'/' if self.second() != b'/');
200
201 self.eat_until_either(b'\n', b'\r');
203 RawTokenKind::LineComment { is_doc }
204 }
205
206 #[inline(never)]
207 fn block_comment(&mut self) -> RawTokenKind {
208 debug_assert!(self.prev() == b'/' && self.first() == b'*');
209 self.bump();
210
211 let is_doc = matches!(self.first(), b'*' if !matches!(self.second(), b'*' | b'/'));
214
215 let b = self.as_bytes();
216 static FINDER: OnceLock<memmem::Finder<'static>> = OnceLock::new();
217 let (terminated, n) = FINDER
218 .get_or_init(|| memmem::Finder::new(b"*/"))
219 .find(b)
220 .map_or((false, b.len()), |pos| (true, pos + 2));
221 self.ignore_bytes(n);
222
223 RawTokenKind::BlockComment { is_doc, terminated }
224 }
225
226 fn whitespace(&mut self) -> RawTokenKind {
227 debug_assert!(is_whitespace_byte(self.prev()));
228 self.eat_while(is_whitespace_byte);
229 RawTokenKind::Whitespace
230 }
231
232 fn ident_or_prefixed_literal(&mut self, first: u8) -> RawTokenKind {
233 debug_assert!(is_id_start_byte(self.prev()));
234
235 let start = self.as_ptr();
237 self.eat_while(is_id_continue_byte);
238
239 if unlikely(matches!(first, b'h' | b'u')) {
241 let id = unsafe {
243 let start = start.sub(1);
244 std::slice::from_raw_parts(start, self.as_ptr().offset_from_unsigned(start))
245 };
246 let is_hex = id == b"hex";
247 if is_hex || id == b"unicode" {
248 if let quote @ (b'\'' | b'"') = self.first() {
249 self.bump();
250 let terminated = self.eat_string(quote);
251 let kind = if is_hex { StrKind::Hex } else { StrKind::Unicode };
252 return RawTokenKind::Literal {
253 kind: RawLiteralKind::Str { kind, terminated },
254 };
255 }
256 }
257 }
258
259 RawTokenKind::Ident
260 }
261
262 fn number(&mut self, first_digit: u8) -> RawLiteralKind {
263 debug_assert!(self.prev().is_ascii_digit());
264 let mut base = Base::Decimal;
265 if first_digit == b'0' {
266 let has_digits = match self.first() {
268 b'b' => {
269 base = Base::Binary;
270 self.bump();
271 self.eat_decimal_digits()
272 }
273 b'o' => {
274 base = Base::Octal;
275 self.bump();
276 self.eat_decimal_digits()
277 }
278 b'x' => {
279 base = Base::Hexadecimal;
280 self.bump();
281 self.eat_hexadecimal_digits()
282 }
283 b'0'..=b'9' | b'_' | b'.' | b'e' | b'E' => {
285 self.eat_decimal_digits();
286 true
287 }
288 _ => return RawLiteralKind::Int { base, empty_int: false },
290 };
291 if !has_digits {
293 return RawLiteralKind::Int { base, empty_int: true };
294 }
295 } else {
296 self.eat_decimal_digits();
298 };
299
300 match self.first() {
301 b'.' if !is_id_start_byte(self.second()) || self.second() == b'_' => {
305 self.bump();
306 self.rational_number_after_dot(base)
307 }
308 b'e' | b'E' => {
309 self.bump();
310 let empty_exponent = !self.eat_exponent();
311 RawLiteralKind::Rational { base, empty_exponent }
312 }
313 _ => RawLiteralKind::Int { base, empty_int: false },
314 }
315 }
316
317 #[cold]
318 fn rational_number_after_dot(&mut self, base: Base) -> RawLiteralKind {
319 self.eat_decimal_digits();
320 let empty_exponent = match self.first() {
321 b'e' | b'E' => {
322 self.bump();
323 !self.eat_exponent()
324 }
325 _ => false,
326 };
327 RawLiteralKind::Rational { base, empty_exponent }
328 }
329
330 fn eat_string(&mut self, quote: u8) -> bool {
332 debug_assert_eq!(self.prev(), quote);
333 while let Some(c) = self.bump_ret() {
334 if c == quote {
335 return true;
336 }
337 if c == b'\\' {
338 let first = self.first();
339 if first == b'\\' || first == quote {
340 self.bump();
342 }
343 }
344 }
345 false
347 }
348
349 fn eat_decimal_digits(&mut self) -> bool {
351 self.eat_digits(|x| x.is_ascii_digit())
352 }
353
354 fn eat_hexadecimal_digits(&mut self) -> bool {
356 self.eat_digits(|x| x.is_ascii_hexdigit())
357 }
358
359 fn eat_digits(&mut self, mut is_digit: impl FnMut(u8) -> bool) -> bool {
360 let mut has_digits = false;
361 loop {
362 match self.first() {
363 b'_' => {
364 self.bump();
365 }
366 c if is_digit(c) => {
367 has_digits = true;
368 self.bump();
369 }
370 _ => break,
371 }
372 }
373 has_digits
374 }
375
376 fn eat_exponent(&mut self) -> bool {
378 debug_assert!(self.prev() == b'e' || self.prev() == b'E');
379 if self.first() == b'-' {
381 self.bump();
382 }
383 self.eat_decimal_digits()
384 }
385
386 #[inline]
388 #[deprecated = "use `as_bytes` instead; utf-8 is not guaranteed anymore"]
389 pub fn as_str(&self) -> &'a str {
390 unsafe { std::str::from_utf8_unchecked(self.bytes.as_slice()) }
392 }
393
394 #[inline]
396 pub fn as_bytes(&self) -> &'a [u8] {
397 self.bytes.as_slice()
398 }
399
400 #[inline]
402 pub fn as_ptr(&self) -> *const u8 {
403 self.bytes.as_slice().as_ptr()
404 }
405
406 #[inline]
408 fn prev(&self) -> u8 {
409 unsafe { *self.as_ptr().sub(1) }
411 }
412
413 #[inline]
418 fn first(&self) -> u8 {
419 self.peek_byte(0)
420 }
421
422 #[inline]
424 fn second(&self) -> u8 {
425 self.peek_byte(1)
428 }
429
430 #[doc(hidden)]
432 #[inline]
433 fn peek_byte(&self, index: usize) -> u8 {
434 self.as_bytes().get(index).copied().unwrap_or(EOF)
435 }
436
437 #[inline]
439 fn is_eof(&self) -> bool {
440 self.as_bytes().is_empty()
441 }
442
443 fn bump(&mut self) {
445 self.bytes.next();
446 }
447
448 #[cold]
452 #[allow(clippy::match_overlapping_arm)]
453 fn bump_utf8_with(&mut self, x: u8) {
454 debug_assert_eq!(self.prev(), x);
455 let skip = match x {
456 ..0x80 => 0,
457 ..0xE0 => 1,
458 ..0xF0 => 2,
459 _ => 3,
460 };
461 self.ignore_bytes(skip);
464 }
465
466 fn bump_ret(&mut self) -> Option<u8> {
468 let c = self.as_bytes().first().copied();
469 self.bytes.next();
470 c
471 }
472
473 #[inline]
475 #[cfg_attr(debug_assertions, track_caller)]
476 fn ignore_bytes(&mut self, n: usize) {
477 debug_assert!(n <= self.as_bytes().len());
478 self.bytes = unsafe { self.as_bytes().get_unchecked(n..) }.iter();
479 }
480
481 #[inline]
485 fn eat_until_either(&mut self, ch1: u8, ch2: u8) -> bool {
486 let b = self.as_bytes();
487 let res = memchr::memchr2(ch1, ch2, b);
488 self.ignore_bytes(res.unwrap_or(b.len()));
489 res.is_some()
490 }
491
492 #[inline]
494 fn eat_while(&mut self, mut predicate: impl FnMut(u8) -> bool) {
495 while predicate(self.first()) && !self.is_eof() {
496 self.bump();
497 }
498 }
499}
500
501impl Iterator for Cursor<'_> {
502 type Item = RawToken;
503
504 #[inline]
505 fn next(&mut self) -> Option<Self::Item> {
506 let token = self.advance_token();
507 if token.kind == RawTokenKind::Eof {
508 None
509 } else {
510 Some(token)
511 }
512 }
513}
514
515impl std::iter::FusedIterator for Cursor<'_> {}