solar_parse/lexer/cursor/
mod.rs1use memchr::memmem;
6use solar_ast::{
7 Base, StrKind,
8 token::{BinOpToken, Delimiter},
9};
10use solar_data_structures::hint::unlikely;
11use std::sync::OnceLock;
12
13pub mod token;
14use token::{RawLiteralKind, RawToken, RawTokenKind};
15
16#[cfg(test)]
17mod tests;
18
19#[inline]
21pub const fn is_whitespace(c: char) -> bool {
22 is_whitespace_byte(ch2u8(c))
23}
/// Returns `true` if the byte is a space, horizontal tab, line feed or carriage return.
#[inline]
pub const fn is_whitespace_byte(c: u8) -> bool {
    c == b' ' || c == b'\t' || c == b'\n' || c == b'\r'
}
29
30#[inline]
32pub const fn is_id_start(c: char) -> bool {
33 is_id_start_byte(ch2u8(c))
34}
/// Returns `true` if the byte may begin an identifier: an ASCII letter, `_` or `$`.
#[inline]
pub const fn is_id_start_byte(c: u8) -> bool {
    c.is_ascii_alphabetic() || c == b'_' || c == b'$'
}
40
41#[inline]
43pub const fn is_id_continue(c: char) -> bool {
44 is_id_continue_byte(ch2u8(c))
45}
46#[inline]
48pub const fn is_id_continue_byte(c: u8) -> bool {
49 let is_number = (c >= b'0') & (c <= b'9');
50 is_id_start_byte(c) || is_number
51}
52
53#[inline]
60pub const fn is_ident(s: &str) -> bool {
61 is_ident_bytes(s.as_bytes())
62}
63
64pub const fn is_ident_bytes(s: &[u8]) -> bool {
68 let [first, ref rest @ ..] = *s else {
69 return false;
70 };
71
72 if !is_id_start_byte(first) {
73 return false;
74 }
75
76 let mut i = 0;
77 while i < rest.len() {
78 if !is_id_continue_byte(rest[i]) {
79 return false;
80 }
81 i += 1;
82 }
83
84 true
85}
86
/// Narrows a `char` to the single byte used by the `*_byte` classifiers.
///
/// ASCII characters map to their byte value. Every non-ASCII character maps to `0xFF`, a byte
/// that is not ASCII and never occurs in valid UTF-8, so none of the byte classifiers in this
/// module match it. A plain `as u8` cast would instead keep only the low 8 bits and alias wide
/// scalar values onto unrelated ASCII bytes (e.g. U+0120 onto a space, U+0161 onto `a`).
#[inline(always)]
const fn ch2u8(c: char) -> u8 {
    if c.is_ascii() { c as u8 } else { 0xFF }
}
92
/// Sentinel byte that `peek_byte` returns when peeking past the end of the remaining input.
93const EOF: u8 = b'\0';
94
/// Byte cursor over the input text.
///
/// Upcoming bytes can be peeked without consuming them (`first`, `second`), and bytes are
/// consumed from the front as tokens are recognized.
95#[derive(Clone, Debug)]
100pub struct Cursor<'a> {
/// Iterator over the input bytes that have not been consumed yet.
101 bytes: std::slice::Iter<'a, u8>,
102}
103
104impl<'a> Cursor<'a> {
105 #[inline]
107 pub fn new(input: &'a str) -> Self {
108 Cursor { bytes: input.as_bytes().iter() }
109 }
110
111 #[inline]
117 pub fn with_position(self) -> CursorWithPosition<'a> {
118 CursorWithPosition::new(self)
119 }
120
121 pub fn slop(&mut self) -> RawToken {
126 let start = self.as_ptr();
129
130 let Some(first_char) = self.bump_ret() else { return RawToken::EOF };
131 let token_kind = self.advance_token_kind(first_char);
132
133 let len = unsafe { self.as_ptr().offset_from_unsigned(start) };
135
136 RawToken::new(token_kind, len as u32)
137 }
138
139 #[inline]
140 fn advance_token_kind(&mut self, first_char: u8) -> RawTokenKind {
141 match first_char {
142 b'/' => match self.first() {
144 b'/' => self.line_comment(),
145 b'*' => self.block_comment(),
146 b'=' => {
147 self.bump();
148 RawTokenKind::BinOpEq(BinOpToken::Slash)
149 }
150 _ => RawTokenKind::BinOp(BinOpToken::Slash),
151 },
152
153 c if is_whitespace_byte(c) => self.whitespace(),
155
156 c if is_id_start_byte(c) => self.ident_or_prefixed_literal(c),
158
159 b'0'..=b'9' => {
161 let kind = self.number(first_char);
162 RawTokenKind::Literal { kind }
163 }
164 b'.' if self.first().is_ascii_digit() => {
165 let kind = self.rational_number_after_dot(Base::Decimal);
166 RawTokenKind::Literal { kind }
167 }
168
169 b';' => RawTokenKind::Semi,
171 b',' => RawTokenKind::Comma,
172 b'.' => RawTokenKind::Dot,
173 b'(' => RawTokenKind::OpenDelim(Delimiter::Parenthesis),
174 b')' => RawTokenKind::CloseDelim(Delimiter::Parenthesis),
175 b'{' => RawTokenKind::OpenDelim(Delimiter::Brace),
176 b'}' => RawTokenKind::CloseDelim(Delimiter::Brace),
177 b'[' => RawTokenKind::OpenDelim(Delimiter::Bracket),
178 b']' => RawTokenKind::CloseDelim(Delimiter::Bracket),
179 b'~' => RawTokenKind::Tilde,
180 b'?' => RawTokenKind::Question,
181
182 b':' => match self.first() {
184 b'=' => {
185 self.bump();
186 RawTokenKind::Walrus
187 }
188 _ => RawTokenKind::Colon,
189 },
190 b'=' => match self.first() {
191 b'=' => {
192 self.bump();
193 RawTokenKind::EqEq
194 }
195 b'>' => {
196 self.bump();
197 RawTokenKind::FatArrow
198 }
199 _ => RawTokenKind::Eq,
200 },
201 b'!' => match self.first() {
202 b'=' => {
203 self.bump();
204 RawTokenKind::Ne
205 }
206 _ => RawTokenKind::Not,
207 },
208 b'<' => match self.first() {
209 b'=' => {
210 self.bump();
211 RawTokenKind::Le
212 }
213 b'<' => {
214 self.bump();
215 if self.first() == b'=' {
217 self.bump();
218 RawTokenKind::BinOpEq(BinOpToken::Shl)
219 } else {
220 RawTokenKind::BinOp(BinOpToken::Shl)
221 }
222 }
223 _ => RawTokenKind::Lt,
224 },
225 b'>' => match self.first() {
226 b'=' => {
227 self.bump();
228 RawTokenKind::Ge
229 }
230 b'>' => {
231 self.bump();
232 match self.first() {
233 b'>' => {
234 self.bump();
236 if self.first() == b'=' {
237 self.bump();
238 RawTokenKind::BinOpEq(BinOpToken::Sar)
239 } else {
240 RawTokenKind::BinOp(BinOpToken::Sar)
241 }
242 }
243 b'=' => {
244 self.bump();
245 RawTokenKind::BinOpEq(BinOpToken::Shr)
246 }
247 _ => RawTokenKind::BinOp(BinOpToken::Shr),
248 }
249 }
250 _ => RawTokenKind::Gt,
251 },
252 b'-' => match self.first() {
253 b'-' => {
254 self.bump();
255 RawTokenKind::MinusMinus
256 }
257 b'=' => {
258 self.bump();
259 RawTokenKind::BinOpEq(BinOpToken::Minus)
260 }
261 b'>' => {
262 self.bump();
263 RawTokenKind::Arrow
264 }
265 _ => RawTokenKind::BinOp(BinOpToken::Minus),
266 },
267 b'&' => match self.first() {
268 b'&' => {
269 self.bump();
270 RawTokenKind::AndAnd
271 }
272 b'=' => {
273 self.bump();
274 RawTokenKind::BinOpEq(BinOpToken::And)
275 }
276 _ => RawTokenKind::BinOp(BinOpToken::And),
277 },
278 b'|' => match self.first() {
279 b'|' => {
280 self.bump();
281 RawTokenKind::OrOr
282 }
283 b'=' => {
284 self.bump();
285 RawTokenKind::BinOpEq(BinOpToken::Or)
286 }
287 _ => RawTokenKind::BinOp(BinOpToken::Or),
288 },
289 b'+' => match self.first() {
290 b'+' => {
291 self.bump();
292 RawTokenKind::PlusPlus
293 }
294 b'=' => {
295 self.bump();
296 RawTokenKind::BinOpEq(BinOpToken::Plus)
297 }
298 _ => RawTokenKind::BinOp(BinOpToken::Plus),
299 },
300 b'*' => match self.first() {
301 b'*' => {
302 self.bump();
303 RawTokenKind::StarStar
304 }
305 b'=' => {
306 self.bump();
307 RawTokenKind::BinOpEq(BinOpToken::Star)
308 }
309 _ => RawTokenKind::BinOp(BinOpToken::Star),
310 },
311 b'^' => match self.first() {
312 b'=' => {
313 self.bump();
314 RawTokenKind::BinOpEq(BinOpToken::Caret)
315 }
316 _ => RawTokenKind::BinOp(BinOpToken::Caret),
317 },
318 b'%' => match self.first() {
319 b'=' => {
320 self.bump();
321 RawTokenKind::BinOpEq(BinOpToken::Percent)
322 }
323 _ => RawTokenKind::BinOp(BinOpToken::Percent),
324 },
325
326 b'\'' | b'"' => {
328 let terminated = self.eat_string(first_char);
329 let kind = RawLiteralKind::Str { kind: StrKind::Str, terminated };
330 RawTokenKind::Literal { kind }
331 }
332
333 _ => {
334 if unlikely(!first_char.is_ascii()) {
335 self.bump_utf8_with(first_char);
336 }
337 RawTokenKind::Unknown
338 }
339 }
340 }
341
342 #[inline(never)]
343 fn line_comment(&mut self) -> RawTokenKind {
344 debug_assert!(self.prev() == b'/' && self.first() == b'/');
345 self.bump();
346
347 let is_doc = matches!(self.first(), b'/' if self.second() != b'/');
349
350 self.eat_until_either(b'\n', b'\r');
352 RawTokenKind::LineComment { is_doc }
353 }
354
355 #[inline(never)]
356 fn block_comment(&mut self) -> RawTokenKind {
357 debug_assert!(self.prev() == b'/' && self.first() == b'*');
358 self.bump();
359
360 let is_doc = matches!(self.first(), b'*' if !matches!(self.second(), b'*' | b'/'));
363
364 let b = self.as_bytes();
365 static FINDER: OnceLock<memmem::Finder<'static>> = OnceLock::new();
366 let (terminated, n) = FINDER
367 .get_or_init(|| memmem::Finder::new(b"*/"))
368 .find(b)
369 .map_or((false, b.len()), |pos| (true, pos + 2));
370 self.ignore_bytes(n);
371
372 RawTokenKind::BlockComment { is_doc, terminated }
373 }
374
375 fn whitespace(&mut self) -> RawTokenKind {
376 debug_assert!(is_whitespace_byte(self.prev()));
377 self.eat_while(is_whitespace_byte);
378 RawTokenKind::Whitespace
379 }
380
381 fn ident_or_prefixed_literal(&mut self, first: u8) -> RawTokenKind {
382 debug_assert!(is_id_start_byte(self.prev()));
383
384 let start = self.as_ptr();
386 self.eat_while(is_id_continue_byte);
387
388 if unlikely(matches!(first, b'h' | b'u')) {
390 let id = unsafe {
392 let start = start.sub(1);
393 std::slice::from_raw_parts(start, self.as_ptr().offset_from_unsigned(start))
394 };
395 let is_hex = id == b"hex";
396 if (is_hex || id == b"unicode")
397 && let quote @ (b'\'' | b'"') = self.first()
398 {
399 self.bump();
400 let terminated = self.eat_string(quote);
401 let kind = if is_hex { StrKind::Hex } else { StrKind::Unicode };
402 return RawTokenKind::Literal { kind: RawLiteralKind::Str { kind, terminated } };
403 }
404 }
405
406 RawTokenKind::Ident
407 }
408
409 fn number(&mut self, first_digit: u8) -> RawLiteralKind {
410 debug_assert!(self.prev().is_ascii_digit());
411 let mut base = Base::Decimal;
412 if first_digit == b'0' {
413 let has_digits = match self.first() {
415 b'b' => {
416 base = Base::Binary;
417 self.bump();
418 self.eat_decimal_digits()
419 }
420 b'o' => {
421 base = Base::Octal;
422 self.bump();
423 self.eat_decimal_digits()
424 }
425 b'x' => {
426 base = Base::Hexadecimal;
427 self.bump();
428 self.eat_hexadecimal_digits()
429 }
430 b'0'..=b'9' | b'_' | b'.' | b'e' | b'E' => {
432 self.eat_decimal_digits();
433 true
434 }
435 _ => return RawLiteralKind::Int { base, empty_int: false },
437 };
438 if !has_digits {
440 return RawLiteralKind::Int { base, empty_int: true };
441 }
442 } else {
443 self.eat_decimal_digits();
445 };
446
447 match self.first() {
448 b'.' if !is_id_start_byte(self.second()) || self.second() == b'_' => {
452 self.bump();
453 self.rational_number_after_dot(base)
454 }
455 b'e' | b'E' => {
456 self.bump();
457 let empty_exponent = !self.eat_exponent();
458 RawLiteralKind::Rational { base, empty_exponent }
459 }
460 _ => RawLiteralKind::Int { base, empty_int: false },
461 }
462 }
463
464 #[cold]
465 fn rational_number_after_dot(&mut self, base: Base) -> RawLiteralKind {
466 self.eat_decimal_digits();
467 let empty_exponent = match self.first() {
468 b'e' | b'E' => {
469 self.bump();
470 !self.eat_exponent()
471 }
472 _ => false,
473 };
474 RawLiteralKind::Rational { base, empty_exponent }
475 }
476
477 fn eat_string(&mut self, quote: u8) -> bool {
479 debug_assert_eq!(self.prev(), quote);
480 while let Some(c) = self.bump_ret() {
481 if c == quote {
482 return true;
483 }
484 if c == b'\\' {
485 let first = self.first();
486 if first == b'\\' || first == quote {
487 self.bump();
489 }
490 }
491 }
492 false
494 }
495
496 fn eat_decimal_digits(&mut self) -> bool {
498 self.eat_digits(|x| x.is_ascii_digit())
499 }
500
501 fn eat_hexadecimal_digits(&mut self) -> bool {
503 self.eat_digits(|x| x.is_ascii_hexdigit())
504 }
505
506 fn eat_digits(&mut self, mut is_digit: impl FnMut(u8) -> bool) -> bool {
507 let mut has_digits = false;
508 loop {
509 match self.first() {
510 b'_' => {
511 self.bump();
512 }
513 c if is_digit(c) => {
514 has_digits = true;
515 self.bump();
516 }
517 _ => break,
518 }
519 }
520 has_digits
521 }
522
523 fn eat_exponent(&mut self) -> bool {
525 debug_assert!(self.prev() == b'e' || self.prev() == b'E');
526 if self.first() == b'-' {
528 self.bump();
529 }
530 self.eat_decimal_digits()
531 }
532
533 #[inline]
535 pub fn as_bytes(&self) -> &'a [u8] {
536 self.bytes.as_slice()
537 }
538
539 #[inline]
541 pub fn as_ptr(&self) -> *const u8 {
542 self.bytes.as_slice().as_ptr()
543 }
544
545 #[inline]
547 fn prev(&self) -> u8 {
548 unsafe { *self.as_ptr().sub(1) }
550 }
551
552 #[inline]
557 fn first(&self) -> u8 {
558 self.peek_byte(0)
559 }
560
561 #[inline]
563 fn second(&self) -> u8 {
564 self.peek_byte(1)
567 }
568
569 #[doc(hidden)]
571 #[inline]
572 fn peek_byte(&self, index: usize) -> u8 {
573 self.as_bytes().get(index).copied().unwrap_or(EOF)
574 }
575
576 fn bump(&mut self) {
578 self.bytes.next();
579 }
580
581 #[cold]
585 #[allow(clippy::match_overlapping_arm)]
586 fn bump_utf8_with(&mut self, x: u8) {
587 debug_assert_eq!(self.prev(), x);
588 let skip = match x {
589 ..0x80 => 0,
590 ..0xE0 => 1,
591 ..0xF0 => 2,
592 _ => 3,
593 };
594 self.ignore_bytes(skip);
597 }
598
599 fn bump_ret(&mut self) -> Option<u8> {
601 let c = self.as_bytes().first().copied();
602 self.bytes.next();
603 c
604 }
605
606 #[inline]
608 #[cfg_attr(debug_assertions, track_caller)]
609 fn ignore_bytes(&mut self, n: usize) {
610 debug_assert!(n <= self.as_bytes().len());
611 self.bytes = unsafe { self.as_bytes().get_unchecked(n..) }.iter();
612 }
613
614 #[inline]
618 fn eat_until_either(&mut self, ch1: u8, ch2: u8) -> bool {
619 let b = self.as_bytes();
620 let res = memchr::memchr2(ch1, ch2, b);
621 self.ignore_bytes(res.unwrap_or(b.len()));
622 res.is_some()
623 }
624
625 #[inline]
627 fn eat_while(&mut self, mut predicate: impl FnMut(u8) -> bool) {
628 while predicate(self.first()) {
629 self.bump();
630 }
631 }
632}
633
634impl Iterator for Cursor<'_> {
635 type Item = RawToken;
636
637 #[inline]
638 fn next(&mut self) -> Option<Self::Item> {
639 let token = self.slop();
640 if token.kind == RawTokenKind::Eof { None } else { Some(token) }
641 }
642}
643
// Once the input is exhausted, `slop` keeps returning `RawToken::EOF`, so `next` keeps
// yielding `None` (assuming `RawToken::EOF` carries `RawTokenKind::Eof` — defined in `token`).
644impl std::iter::FusedIterator for Cursor<'_> {}
645
/// A [`Cursor`] that also tracks the byte offset of each token it yields.
646#[derive(Clone, Debug)]
650pub struct CursorWithPosition<'a> {
/// The wrapped cursor that does the actual lexing.
651 cursor: Cursor<'a>,
/// Sum of the lengths of all tokens yielded so far; starts at 0.
652 position: u32,
653}
654
655impl<'a> CursorWithPosition<'a> {
656 #[inline]
658 fn new(cursor: Cursor<'a>) -> Self {
659 CursorWithPosition { cursor, position: 0 }
660 }
661
662 #[inline]
664 pub fn inner(&self) -> &Cursor<'a> {
665 &self.cursor
666 }
667
668 #[inline]
670 pub fn inner_mut(&mut self) -> &mut Cursor<'a> {
671 &mut self.cursor
672 }
673
674 #[inline]
676 pub fn position(&self) -> usize {
677 self.position as usize
678 }
679}
680
681impl Iterator for CursorWithPosition<'_> {
682 type Item = (usize, RawToken);
683
684 #[inline]
685 fn next(&mut self) -> Option<Self::Item> {
686 self.cursor.next().map(|t| {
687 let pos = self.position;
688 self.position = pos + t.len;
689 (pos as usize, t)
690 })
691 }
692
693 #[inline]
694 fn size_hint(&self) -> (usize, Option<usize>) {
695 self.cursor.size_hint()
696 }
697}
698
// `next` only yields an item when the inner `Cursor` does, and `Cursor` is itself declared
// fused, so this iterator also keeps returning `None` once exhausted.
699impl std::iter::FusedIterator for CursorWithPosition<'_> {}