solar_parse/lexer/cursor/
mod.rs1use memchr::memmem;
6use solar_ast::{Base, StrKind};
7use solar_data_structures::hint::unlikely;
8use std::sync::OnceLock;
9
10pub mod token;
11use token::{RawLiteralKind, RawToken, RawTokenKind};
12
13#[cfg(test)]
14mod tests;
15
16#[inline]
18pub const fn is_whitespace(c: char) -> bool {
19 is_whitespace_byte(ch2u8(c))
20}
/// Returns `true` if the byte is a space, horizontal tab, line feed or carriage return.
#[inline]
pub const fn is_whitespace_byte(c: u8) -> bool {
    c == b' ' || c == b'\t' || c == b'\n' || c == b'\r'
}
26
27#[inline]
29pub const fn is_id_start(c: char) -> bool {
30 is_id_start_byte(ch2u8(c))
31}
/// Returns `true` if the byte may begin an identifier: an ASCII letter, `_` or `$`.
#[inline]
pub const fn is_id_start_byte(c: u8) -> bool {
    c.is_ascii_alphabetic() || c == b'_' || c == b'$'
}
37
38#[inline]
40pub const fn is_id_continue(c: char) -> bool {
41 is_id_continue_byte(ch2u8(c))
42}
43#[inline]
45pub const fn is_id_continue_byte(c: u8) -> bool {
46 let is_number = (c >= b'0') & (c <= b'9');
47 is_id_start_byte(c) || is_number
48}
49
50#[inline]
57pub const fn is_ident(s: &str) -> bool {
58 is_ident_bytes(s.as_bytes())
59}
60
61pub const fn is_ident_bytes(s: &[u8]) -> bool {
65 let [first, ref rest @ ..] = *s else {
66 return false;
67 };
68
69 if !is_id_start_byte(first) {
70 return false;
71 }
72
73 let mut i = 0;
74 while i < rest.len() {
75 if !is_id_continue_byte(rest[i]) {
76 return false;
77 }
78 i += 1;
79 }
80
81 true
82}
83
/// Narrows a `char` to a byte for the ASCII-only `*_byte` classifiers in this module.
///
/// ASCII characters map to their byte value. Every non-ASCII character maps to `0xFF`, a byte
/// that is neither whitespace nor an identifier byte, so it is rejected by all of the byte
/// classifiers.
///
/// A plain `c as u32 as u8` cast must not be used here: it keeps only the low 8 bits of the code
/// point, so for example `'\u{120}'` would alias `b' '` and `'\u{161}'` would alias `b'a'`,
/// making the `char`-based predicates accept non-ASCII characters.
#[inline(always)]
const fn ch2u8(c: char) -> u8 {
    if c.is_ascii() {
        // Lossless: ASCII code points are all below 0x80.
        c as u8
    } else {
        0xFF
    }
}
89
/// Sentinel byte returned when peeking past the end of the input (the NUL byte).
const EOF: u8 = 0x00;
91
/// A byte-oriented cursor over source text that produces [`RawToken`]s.
///
/// The cursor only moves forward; the not-yet-consumed input is whatever is left in `bytes`.
#[derive(Debug, Clone)]
pub struct Cursor<'a> {
    /// Remaining, not yet consumed, input bytes.
    bytes: std::slice::Iter<'a, u8>,
}
100
101impl<'a> Cursor<'a> {
102 #[inline]
104 pub fn new(input: &'a str) -> Self {
105 Cursor { bytes: input.as_bytes().iter() }
106 }
107
108 #[inline]
114 pub fn with_position(self) -> CursorWithPosition<'a> {
115 CursorWithPosition::new(self)
116 }
117
118 pub fn advance_token(&mut self) -> RawToken {
120 let start = self.as_ptr();
123
124 let Some(first_char) = self.bump_ret() else { return RawToken::EOF };
125 let token_kind = self.advance_token_kind(first_char);
126
127 let len = unsafe { self.as_ptr().offset_from_unsigned(start) };
129
130 RawToken::new(token_kind, len as u32)
131 }
132
133 #[inline]
134 fn advance_token_kind(&mut self, first_char: u8) -> RawTokenKind {
135 match first_char {
136 b'/' => match self.first() {
138 b'/' => self.line_comment(),
139 b'*' => self.block_comment(),
140 _ => RawTokenKind::Slash,
141 },
142
143 c if is_whitespace_byte(c) => self.whitespace(),
145
146 c if is_id_start_byte(c) => self.ident_or_prefixed_literal(c),
148
149 b'0'..=b'9' => {
151 let kind = self.number(first_char);
152 RawTokenKind::Literal { kind }
153 }
154 b'.' if self.first().is_ascii_digit() => {
155 let kind = self.rational_number_after_dot(Base::Decimal);
156 RawTokenKind::Literal { kind }
157 }
158
159 b';' => RawTokenKind::Semi,
161 b',' => RawTokenKind::Comma,
162 b'.' => RawTokenKind::Dot,
163 b'(' => RawTokenKind::OpenParen,
164 b')' => RawTokenKind::CloseParen,
165 b'{' => RawTokenKind::OpenBrace,
166 b'}' => RawTokenKind::CloseBrace,
167 b'[' => RawTokenKind::OpenBracket,
168 b']' => RawTokenKind::CloseBracket,
169 b'~' => RawTokenKind::Tilde,
170 b'?' => RawTokenKind::Question,
171 b':' => RawTokenKind::Colon,
172 b'=' => RawTokenKind::Eq,
173 b'!' => RawTokenKind::Bang,
174 b'<' => RawTokenKind::Lt,
175 b'>' => RawTokenKind::Gt,
176 b'-' => RawTokenKind::Minus,
177 b'&' => RawTokenKind::And,
178 b'|' => RawTokenKind::Or,
179 b'+' => RawTokenKind::Plus,
180 b'*' => RawTokenKind::Star,
181 b'^' => RawTokenKind::Caret,
182 b'%' => RawTokenKind::Percent,
183
184 b'\'' | b'"' => {
186 let terminated = self.eat_string(first_char);
187 let kind = RawLiteralKind::Str { kind: StrKind::Str, terminated };
188 RawTokenKind::Literal { kind }
189 }
190
191 _ => {
192 if unlikely(!first_char.is_ascii()) {
193 self.bump_utf8_with(first_char);
194 }
195 RawTokenKind::Unknown
196 }
197 }
198 }
199
200 #[inline(never)]
201 fn line_comment(&mut self) -> RawTokenKind {
202 debug_assert!(self.prev() == b'/' && self.first() == b'/');
203 self.bump();
204
205 let is_doc = matches!(self.first(), b'/' if self.second() != b'/');
207
208 self.eat_until_either(b'\n', b'\r');
210 RawTokenKind::LineComment { is_doc }
211 }
212
213 #[inline(never)]
214 fn block_comment(&mut self) -> RawTokenKind {
215 debug_assert!(self.prev() == b'/' && self.first() == b'*');
216 self.bump();
217
218 let is_doc = matches!(self.first(), b'*' if !matches!(self.second(), b'*' | b'/'));
221
222 let b = self.as_bytes();
223 static FINDER: OnceLock<memmem::Finder<'static>> = OnceLock::new();
224 let (terminated, n) = FINDER
225 .get_or_init(|| memmem::Finder::new(b"*/"))
226 .find(b)
227 .map_or((false, b.len()), |pos| (true, pos + 2));
228 self.ignore_bytes(n);
229
230 RawTokenKind::BlockComment { is_doc, terminated }
231 }
232
233 fn whitespace(&mut self) -> RawTokenKind {
234 debug_assert!(is_whitespace_byte(self.prev()));
235 self.eat_while(is_whitespace_byte);
236 RawTokenKind::Whitespace
237 }
238
239 fn ident_or_prefixed_literal(&mut self, first: u8) -> RawTokenKind {
240 debug_assert!(is_id_start_byte(self.prev()));
241
242 let start = self.as_ptr();
244 self.eat_while(is_id_continue_byte);
245
246 if unlikely(matches!(first, b'h' | b'u')) {
248 let id = unsafe {
250 let start = start.sub(1);
251 std::slice::from_raw_parts(start, self.as_ptr().offset_from_unsigned(start))
252 };
253 let is_hex = id == b"hex";
254 if (is_hex || id == b"unicode")
255 && let quote @ (b'\'' | b'"') = self.first()
256 {
257 self.bump();
258 let terminated = self.eat_string(quote);
259 let kind = if is_hex { StrKind::Hex } else { StrKind::Unicode };
260 return RawTokenKind::Literal { kind: RawLiteralKind::Str { kind, terminated } };
261 }
262 }
263
264 RawTokenKind::Ident
265 }
266
267 fn number(&mut self, first_digit: u8) -> RawLiteralKind {
268 debug_assert!(self.prev().is_ascii_digit());
269 let mut base = Base::Decimal;
270 if first_digit == b'0' {
271 let has_digits = match self.first() {
273 b'b' => {
274 base = Base::Binary;
275 self.bump();
276 self.eat_decimal_digits()
277 }
278 b'o' => {
279 base = Base::Octal;
280 self.bump();
281 self.eat_decimal_digits()
282 }
283 b'x' => {
284 base = Base::Hexadecimal;
285 self.bump();
286 self.eat_hexadecimal_digits()
287 }
288 b'0'..=b'9' | b'_' | b'.' | b'e' | b'E' => {
290 self.eat_decimal_digits();
291 true
292 }
293 _ => return RawLiteralKind::Int { base, empty_int: false },
295 };
296 if !has_digits {
298 return RawLiteralKind::Int { base, empty_int: true };
299 }
300 } else {
301 self.eat_decimal_digits();
303 };
304
305 match self.first() {
306 b'.' if !is_id_start_byte(self.second()) || self.second() == b'_' => {
310 self.bump();
311 self.rational_number_after_dot(base)
312 }
313 b'e' | b'E' => {
314 self.bump();
315 let empty_exponent = !self.eat_exponent();
316 RawLiteralKind::Rational { base, empty_exponent }
317 }
318 _ => RawLiteralKind::Int { base, empty_int: false },
319 }
320 }
321
322 #[cold]
323 fn rational_number_after_dot(&mut self, base: Base) -> RawLiteralKind {
324 self.eat_decimal_digits();
325 let empty_exponent = match self.first() {
326 b'e' | b'E' => {
327 self.bump();
328 !self.eat_exponent()
329 }
330 _ => false,
331 };
332 RawLiteralKind::Rational { base, empty_exponent }
333 }
334
335 fn eat_string(&mut self, quote: u8) -> bool {
337 debug_assert_eq!(self.prev(), quote);
338 while let Some(c) = self.bump_ret() {
339 if c == quote {
340 return true;
341 }
342 if c == b'\\' {
343 let first = self.first();
344 if first == b'\\' || first == quote {
345 self.bump();
347 }
348 }
349 }
350 false
352 }
353
354 fn eat_decimal_digits(&mut self) -> bool {
356 self.eat_digits(|x| x.is_ascii_digit())
357 }
358
359 fn eat_hexadecimal_digits(&mut self) -> bool {
361 self.eat_digits(|x| x.is_ascii_hexdigit())
362 }
363
364 fn eat_digits(&mut self, mut is_digit: impl FnMut(u8) -> bool) -> bool {
365 let mut has_digits = false;
366 loop {
367 match self.first() {
368 b'_' => {
369 self.bump();
370 }
371 c if is_digit(c) => {
372 has_digits = true;
373 self.bump();
374 }
375 _ => break,
376 }
377 }
378 has_digits
379 }
380
381 fn eat_exponent(&mut self) -> bool {
383 debug_assert!(self.prev() == b'e' || self.prev() == b'E');
384 if self.first() == b'-' {
386 self.bump();
387 }
388 self.eat_decimal_digits()
389 }
390
391 #[inline]
393 #[deprecated = "use `as_bytes` instead; utf-8 is not guaranteed anymore"]
394 pub fn as_str(&self) -> &'a str {
395 unsafe { std::str::from_utf8_unchecked(self.bytes.as_slice()) }
397 }
398
399 #[inline]
401 pub fn as_bytes(&self) -> &'a [u8] {
402 self.bytes.as_slice()
403 }
404
405 #[inline]
407 pub fn as_ptr(&self) -> *const u8 {
408 self.bytes.as_slice().as_ptr()
409 }
410
411 #[inline]
413 fn prev(&self) -> u8 {
414 unsafe { *self.as_ptr().sub(1) }
416 }
417
418 #[inline]
423 fn first(&self) -> u8 {
424 self.peek_byte(0)
425 }
426
427 #[inline]
429 fn second(&self) -> u8 {
430 self.peek_byte(1)
433 }
434
435 #[doc(hidden)]
437 #[inline]
438 fn peek_byte(&self, index: usize) -> u8 {
439 self.as_bytes().get(index).copied().unwrap_or(EOF)
440 }
441
442 fn bump(&mut self) {
444 self.bytes.next();
445 }
446
447 #[cold]
451 #[allow(clippy::match_overlapping_arm)]
452 fn bump_utf8_with(&mut self, x: u8) {
453 debug_assert_eq!(self.prev(), x);
454 let skip = match x {
455 ..0x80 => 0,
456 ..0xE0 => 1,
457 ..0xF0 => 2,
458 _ => 3,
459 };
460 self.ignore_bytes(skip);
463 }
464
465 fn bump_ret(&mut self) -> Option<u8> {
467 let c = self.as_bytes().first().copied();
468 self.bytes.next();
469 c
470 }
471
472 #[inline]
474 #[cfg_attr(debug_assertions, track_caller)]
475 fn ignore_bytes(&mut self, n: usize) {
476 debug_assert!(n <= self.as_bytes().len());
477 self.bytes = unsafe { self.as_bytes().get_unchecked(n..) }.iter();
478 }
479
480 #[inline]
484 fn eat_until_either(&mut self, ch1: u8, ch2: u8) -> bool {
485 let b = self.as_bytes();
486 let res = memchr::memchr2(ch1, ch2, b);
487 self.ignore_bytes(res.unwrap_or(b.len()));
488 res.is_some()
489 }
490
491 #[inline]
493 fn eat_while(&mut self, mut predicate: impl FnMut(u8) -> bool) {
494 while predicate(self.first()) {
495 self.bump();
496 }
497 }
498}
499
500impl Iterator for Cursor<'_> {
501 type Item = RawToken;
502
503 #[inline]
504 fn next(&mut self) -> Option<Self::Item> {
505 let token = self.advance_token();
506 if token.kind == RawTokenKind::Eof { None } else { Some(token) }
507 }
508}
509
510impl std::iter::FusedIterator for Cursor<'_> {}
511
512#[derive(Clone, Debug)]
516pub struct CursorWithPosition<'a> {
517 cursor: Cursor<'a>,
518 position: u32,
519}
520
521impl<'a> CursorWithPosition<'a> {
522 #[inline]
524 fn new(cursor: Cursor<'a>) -> Self {
525 CursorWithPosition { cursor, position: 0 }
526 }
527
528 #[inline]
530 pub fn inner(&self) -> &Cursor<'a> {
531 &self.cursor
532 }
533
534 #[inline]
536 pub fn inner_mut(&mut self) -> &mut Cursor<'a> {
537 &mut self.cursor
538 }
539
540 #[inline]
542 pub fn position(&self) -> usize {
543 self.position as usize
544 }
545}
546
547impl Iterator for CursorWithPosition<'_> {
548 type Item = (usize, RawToken);
549
550 #[inline]
551 fn next(&mut self) -> Option<Self::Item> {
552 self.cursor.next().map(|t| {
553 let pos = self.position;
554 self.position = pos + t.len;
555 (pos as usize, t)
556 })
557 }
558
559 #[inline]
560 fn size_hint(&self) -> (usize, Option<usize>) {
561 self.cursor.size_hint()
562 }
563}
564
565impl std::iter::FusedIterator for CursorWithPosition<'_> {}