// solar_parse/lexer/cursor/mod.rs
use memchr::memmem;
6use solar_ast::{
7 Base, StrKind,
8 token::{BinOpToken, Delimiter},
9};
10use solar_data_structures::hint::{likely, unlikely};
11use std::sync::OnceLock;
12
13pub mod token;
14use token::{RawLiteralKind, RawToken, RawTokenKind};
15
16mod char_info;
17pub use char_info::*;
18
19#[cfg(test)]
20mod tests;
21
/// Byte-level cursor over a source string.
///
/// Wraps a slice iterator over the input's bytes. The lexing methods peek at the bytes that
/// remain and advance the iterator as tokens are recognized.
#[derive(Debug, Clone)]
pub struct Cursor<'a> {
    /// The part of the input that has not been consumed yet.
    bytes: std::slice::Iter<'a, u8>,
}
30
31impl<'a> Cursor<'a> {
32 #[inline]
34 pub fn new(input: &'a str) -> Self {
35 Cursor { bytes: input.as_bytes().iter() }
36 }
37
38 #[inline]
44 pub fn with_position(self) -> CursorWithPosition<'a> {
45 CursorWithPosition::new(self)
46 }
47
48 pub fn slop(&mut self) -> RawToken {
53 let start = self.as_ptr();
56
57 let Some(first_char) = self.bump_ret() else { return RawToken::EOF };
58 let token_kind = self.advance_token_kind(first_char);
59
60 let len = unsafe { self.as_ptr().offset_from_unsigned(start) };
62
63 RawToken::new(token_kind, len as u32)
64 }
65
66 #[inline]
67 fn advance_token_kind(&mut self, first_char: u8) -> RawTokenKind {
68 match first_char {
69 b'/' => match self.first() {
71 b'/' => self.line_comment(),
72 b'*' => self.block_comment(),
73 b'=' => {
74 self.bump();
75 RawTokenKind::BinOpEq(BinOpToken::Slash)
76 }
77 _ => RawTokenKind::BinOp(BinOpToken::Slash),
78 },
79
80 c if is_whitespace_byte(c) => self.whitespace(),
82
83 c if is_id_start_byte(c) => self.ident_or_prefixed_literal(c),
85
86 b'0'..=b'9' => {
88 let kind = self.number(first_char);
89 RawTokenKind::Literal { kind }
90 }
91 b'.' if self.first().is_ascii_digit() => {
92 let kind = self.rational_number_after_dot(Base::Decimal);
93 RawTokenKind::Literal { kind }
94 }
95
96 b';' => RawTokenKind::Semi,
98 b',' => RawTokenKind::Comma,
99 b'.' => RawTokenKind::Dot,
100 b'(' => RawTokenKind::OpenDelim(Delimiter::Parenthesis),
101 b')' => RawTokenKind::CloseDelim(Delimiter::Parenthesis),
102 b'{' => RawTokenKind::OpenDelim(Delimiter::Brace),
103 b'}' => RawTokenKind::CloseDelim(Delimiter::Brace),
104 b'[' => RawTokenKind::OpenDelim(Delimiter::Bracket),
105 b']' => RawTokenKind::CloseDelim(Delimiter::Bracket),
106 b'~' => RawTokenKind::Tilde,
107 b'?' => RawTokenKind::Question,
108
109 b':' => match self.first() {
112 b'=' => {
113 self.bump();
114 RawTokenKind::Walrus
115 }
116 _ => RawTokenKind::Colon,
117 },
118 b'=' => match self.first() {
120 b'=' => {
121 self.bump();
122 RawTokenKind::EqEq
123 }
124 b'>' => {
125 self.bump();
126 RawTokenKind::FatArrow
127 }
128 _ => RawTokenKind::Eq,
129 },
130 b'!' => match self.first() {
132 b'=' => {
133 self.bump();
134 RawTokenKind::Ne
135 }
136 _ => RawTokenKind::Not,
137 },
138 b'<' => match self.first() {
140 b'=' => {
141 self.bump();
142 RawTokenKind::Le
143 }
144 b'<' => {
145 self.bump();
146 if self.first() == b'=' {
147 self.bump();
148 RawTokenKind::BinOpEq(BinOpToken::Shl)
149 } else {
150 RawTokenKind::BinOp(BinOpToken::Shl)
151 }
152 }
153 _ => RawTokenKind::Lt,
154 },
155 b'>' => match self.first() {
158 b'=' => {
159 self.bump();
160 RawTokenKind::Ge
161 }
162 b'>' => {
163 self.bump();
164 match self.first() {
165 b'>' => {
166 self.bump();
167 if self.first() == b'=' {
168 self.bump();
169 RawTokenKind::BinOpEq(BinOpToken::Sar)
170 } else {
171 RawTokenKind::BinOp(BinOpToken::Sar)
172 }
173 }
174 b'=' => {
175 self.bump();
176 RawTokenKind::BinOpEq(BinOpToken::Shr)
177 }
178 _ => RawTokenKind::BinOp(BinOpToken::Shr),
179 }
180 }
181 _ => RawTokenKind::Gt,
182 },
183 b'-' => match self.first() {
185 b'-' => {
186 self.bump();
187 RawTokenKind::MinusMinus
188 }
189 b'=' => {
190 self.bump();
191 RawTokenKind::BinOpEq(BinOpToken::Minus)
192 }
193 b'>' => {
194 self.bump();
195 RawTokenKind::Arrow
196 }
197 _ => RawTokenKind::BinOp(BinOpToken::Minus),
198 },
199 b'&' => match self.first() {
201 b'&' => {
202 self.bump();
203 RawTokenKind::AndAnd
204 }
205 b'=' => {
206 self.bump();
207 RawTokenKind::BinOpEq(BinOpToken::And)
208 }
209 _ => RawTokenKind::BinOp(BinOpToken::And),
210 },
211 b'|' => match self.first() {
213 b'|' => {
214 self.bump();
215 RawTokenKind::OrOr
216 }
217 b'=' => {
218 self.bump();
219 RawTokenKind::BinOpEq(BinOpToken::Or)
220 }
221 _ => RawTokenKind::BinOp(BinOpToken::Or),
222 },
223 b'+' => match self.first() {
225 b'+' => {
226 self.bump();
227 RawTokenKind::PlusPlus
228 }
229 b'=' => {
230 self.bump();
231 RawTokenKind::BinOpEq(BinOpToken::Plus)
232 }
233 _ => RawTokenKind::BinOp(BinOpToken::Plus),
234 },
235 b'*' => match self.first() {
237 b'*' => {
238 self.bump();
239 RawTokenKind::StarStar
240 }
241 b'=' => {
242 self.bump();
243 RawTokenKind::BinOpEq(BinOpToken::Star)
244 }
245 _ => RawTokenKind::BinOp(BinOpToken::Star),
246 },
247 b'^' => match self.first() {
249 b'=' => {
250 self.bump();
251 RawTokenKind::BinOpEq(BinOpToken::Caret)
252 }
253 _ => RawTokenKind::BinOp(BinOpToken::Caret),
254 },
255 b'%' => match self.first() {
257 b'=' => {
258 self.bump();
259 RawTokenKind::BinOpEq(BinOpToken::Percent)
260 }
261 _ => RawTokenKind::BinOp(BinOpToken::Percent),
262 },
263
264 b'\'' | b'"' => {
266 let terminated = self.eat_string(first_char);
267 let kind = RawLiteralKind::Str { kind: StrKind::Str, terminated };
268 RawTokenKind::Literal { kind }
269 }
270
271 _ => {
272 if unlikely(!first_char.is_ascii()) {
273 self.bump_utf8_with(first_char);
274 }
275 RawTokenKind::Unknown
276 }
277 }
278 }
279
280 #[inline(never)]
281 fn line_comment(&mut self) -> RawTokenKind {
282 debug_assert!(self.prev() == b'/' && self.first() == b'/');
283 self.bump();
284
285 let is_doc = matches!(self.first(), b'/' if self.second() != b'/');
287
288 self.eat_until_either(b'\n', b'\r');
290 RawTokenKind::LineComment { is_doc }
291 }
292
293 #[inline(never)]
294 fn block_comment(&mut self) -> RawTokenKind {
295 debug_assert!(self.prev() == b'/' && self.first() == b'*');
296 self.bump();
297
298 let is_doc = matches!(self.first(), b'*' if !matches!(self.second(), b'*' | b'/'));
301
302 let b = self.as_bytes();
303 static FINDER: OnceLock<memmem::Finder<'static>> = OnceLock::new();
304 let (terminated, n) = FINDER
305 .get_or_init(|| memmem::Finder::new(b"*/"))
306 .find(b)
307 .map_or((false, b.len()), |pos| (true, pos + 2));
308 self.ignore_bytes(n);
309
310 RawTokenKind::BlockComment { is_doc, terminated }
311 }
312
313 fn whitespace(&mut self) -> RawTokenKind {
314 debug_assert!(is_whitespace_byte(self.prev()));
315 self.eat_while(is_whitespace_byte);
316 RawTokenKind::Whitespace
317 }
318
319 fn ident_or_prefixed_literal(&mut self, first: u8) -> RawTokenKind {
320 debug_assert!(is_id_start_byte(self.prev()));
321
322 let start = self.as_ptr();
324 self.eat_while(is_id_continue_byte);
325
326 if unlikely(matches!(first, b'h' | b'u')) {
328 let id = unsafe {
330 let start = start.sub(1);
331 std::slice::from_raw_parts(start, self.as_ptr().offset_from_unsigned(start))
332 };
333 let is_hex = id == b"hex";
334 if (is_hex || id == b"unicode")
335 && let quote @ (b'\'' | b'"') = self.first()
336 {
337 self.bump();
338 let terminated = self.eat_string(quote);
339 let kind = if is_hex { StrKind::Hex } else { StrKind::Unicode };
340 return RawTokenKind::Literal { kind: RawLiteralKind::Str { kind, terminated } };
341 }
342 }
343
344 RawTokenKind::Ident
345 }
346
347 fn number(&mut self, first_digit: u8) -> RawLiteralKind {
348 debug_assert!(self.prev().is_ascii_digit());
349 let mut base = Base::Decimal;
350 if first_digit == b'0' {
351 let has_digits = match self.first() {
353 b'b' => {
354 base = Base::Binary;
355 self.bump();
356 self.eat_decimal_digits()
357 }
358 b'o' => {
359 base = Base::Octal;
360 self.bump();
361 self.eat_decimal_digits()
362 }
363 b'x' => {
364 base = Base::Hexadecimal;
365 self.bump();
366 self.eat_hexadecimal_digits()
367 }
368 b'0'..=b'9' | b'_' | b'.' | b'e' | b'E' => {
370 self.eat_decimal_digits();
371 true
372 }
373 _ => return RawLiteralKind::Int { base, empty_int: false },
375 };
376 if !has_digits {
378 return RawLiteralKind::Int { base, empty_int: true };
379 }
380 } else {
381 self.eat_decimal_digits();
383 };
384
385 match self.first() {
386 b'.' if !is_id_start_byte(self.second()) || self.second() == b'_' => {
390 self.bump();
391 self.rational_number_after_dot(base)
392 }
393 b'e' | b'E' => {
394 self.bump();
395 let empty_exponent = !self.eat_exponent();
396 RawLiteralKind::Rational { base, empty_exponent }
397 }
398 _ => RawLiteralKind::Int { base, empty_int: false },
399 }
400 }
401
402 #[cold]
403 fn rational_number_after_dot(&mut self, base: Base) -> RawLiteralKind {
404 self.eat_decimal_digits();
405 let empty_exponent = match self.first() {
406 b'e' | b'E' => {
407 self.bump();
408 !self.eat_exponent()
409 }
410 _ => false,
411 };
412 RawLiteralKind::Rational { base, empty_exponent }
413 }
414
415 fn eat_string(&mut self, quote: u8) -> bool {
417 debug_assert_eq!(self.prev(), quote);
418 loop {
419 if unlikely(!self.eat_until_either(quote, b'\\')) {
420 return false;
421 }
422 let c = unsafe { self.bump_ret().unwrap_unchecked() };
424 if likely(c == quote) {
425 return true;
426 }
427 debug_assert_eq!(c, b'\\');
429 let next = self.first();
430 if next == b'\\' || next == quote {
431 self.bump();
433 }
434 }
435 }
436
437 fn eat_decimal_digits(&mut self) -> bool {
439 self.eat_digits(is_decimal_digit)
440 }
441
442 fn eat_hexadecimal_digits(&mut self) -> bool {
444 self.eat_digits(is_hex_digit)
445 }
446
447 fn eat_digits(&mut self, mut is_digit: impl FnMut(u8) -> bool) -> bool {
448 let mut has_digits = false;
449 loop {
450 match self.first() {
451 b'_' => {
452 self.bump();
453 }
454 c if is_digit(c) => {
455 has_digits = true;
456 self.bump();
457 }
458 _ => break,
459 }
460 }
461 has_digits
462 }
463
464 fn eat_exponent(&mut self) -> bool {
466 debug_assert!(self.prev() == b'e' || self.prev() == b'E');
467 if self.first() == b'-' {
469 self.bump();
470 }
471 self.eat_decimal_digits()
472 }
473
474 #[inline]
476 pub fn as_bytes(&self) -> &'a [u8] {
477 self.bytes.as_slice()
478 }
479
480 #[inline]
482 pub fn as_ptr(&self) -> *const u8 {
483 self.bytes.as_slice().as_ptr()
484 }
485
486 #[inline]
488 fn prev(&self) -> u8 {
489 unsafe { *self.as_ptr().sub(1) }
491 }
492
493 #[inline]
498 fn first(&self) -> u8 {
499 self.peek_byte(0)
500 }
501
502 #[inline]
504 fn second(&self) -> u8 {
505 self.peek_byte(1)
508 }
509
510 #[doc(hidden)]
512 #[inline]
513 fn peek_byte(&self, index: usize) -> u8 {
514 self.as_bytes().get(index).copied().unwrap_or(EOF)
515 }
516
517 fn bump(&mut self) {
519 self.bytes.next();
520 }
521
522 #[cold]
526 #[allow(clippy::match_overlapping_arm)]
527 fn bump_utf8_with(&mut self, x: u8) {
528 debug_assert_eq!(self.prev(), x);
529 let skip = match x {
530 ..0x80 => 0,
531 ..0xE0 => 1,
532 ..0xF0 => 2,
533 _ => 3,
534 };
535 self.ignore_bytes(skip);
538 }
539
540 fn bump_ret(&mut self) -> Option<u8> {
542 let c = self.as_bytes().first().copied();
543 self.bytes.next();
544 c
545 }
546
547 #[inline]
549 #[cfg_attr(debug_assertions, track_caller)]
550 fn ignore_bytes(&mut self, n: usize) {
551 debug_assert!(n <= self.as_bytes().len());
552 self.bytes = unsafe { self.as_bytes().get_unchecked(n..) }.iter();
553 }
554
555 #[inline]
559 fn eat_until_either(&mut self, ch1: u8, ch2: u8) -> bool {
560 let b = self.as_bytes();
561 let res = memchr::memchr2(ch1, ch2, b);
562 self.ignore_bytes(res.unwrap_or(b.len()));
563 res.is_some()
564 }
565
566 #[inline]
568 fn eat_while(&mut self, mut predicate: impl FnMut(u8) -> bool) {
569 while predicate(self.first()) {
570 self.bump();
571 }
572 }
573}
574
575impl Iterator for Cursor<'_> {
576 type Item = RawToken;
577
578 #[inline]
579 fn next(&mut self) -> Option<Self::Item> {
580 let token = self.slop();
581 if token.kind == RawTokenKind::Eof { None } else { Some(token) }
582 }
583}
584
585impl std::iter::FusedIterator for Cursor<'_> {}
586
587#[derive(Clone, Debug)]
591pub struct CursorWithPosition<'a> {
592 cursor: Cursor<'a>,
593 position: u32,
594}
595
596impl<'a> CursorWithPosition<'a> {
597 #[inline]
599 fn new(cursor: Cursor<'a>) -> Self {
600 CursorWithPosition { cursor, position: 0 }
601 }
602
603 #[inline]
605 pub fn inner(&self) -> &Cursor<'a> {
606 &self.cursor
607 }
608
609 #[inline]
611 pub fn inner_mut(&mut self) -> &mut Cursor<'a> {
612 &mut self.cursor
613 }
614
615 #[inline]
617 pub fn position(&self) -> usize {
618 self.position as usize
619 }
620}
621
622impl Iterator for CursorWithPosition<'_> {
623 type Item = (usize, RawToken);
624
625 #[inline]
626 fn next(&mut self) -> Option<Self::Item> {
627 self.cursor.next().map(|t| {
628 let pos = self.position;
629 self.position = pos + t.len;
630 (pos as usize, t)
631 })
632 }
633
634 #[inline]
635 fn size_hint(&self) -> (usize, Option<usize>) {
636 self.cursor.size_hint()
637 }
638}
639
640impl std::iter::FusedIterator for CursorWithPosition<'_> {}