solar_parse/lexer/cursor/
mod.rs1use solar_ast::Base;
6use std::str::Chars;
7
8pub mod token;
9use token::{RawLiteralKind, RawToken, RawTokenKind};
10
11#[cfg(test)]
12mod tests;
13
14#[inline]
16pub const fn is_whitespace(c: char) -> bool {
17 is_whitespace_byte(ch2u8(c))
18}
/// Reports whether the byte `c` is whitespace for the lexer.
///
/// Exactly four bytes qualify: space, horizontal tab, line feed and carriage return.
/// Other ASCII whitespace such as form feed or vertical tab is *not* included.
#[inline]
pub const fn is_whitespace_byte(c: u8) -> bool {
    c == b' ' || c == b'\t' || c == b'\n' || c == b'\r'
}
24
25#[inline]
27pub const fn is_id_start(c: char) -> bool {
28 is_id_start_byte(ch2u8(c))
29}
/// Reports whether the byte `c` may begin an identifier.
///
/// Accepted bytes are the ASCII letters `a-z` / `A-Z`, the underscore `_` and the dollar
/// sign `$`. Digits are rejected here; see [`is_id_continue_byte`] for the follow-up set.
#[inline]
pub const fn is_id_start_byte(c: u8) -> bool {
    c.is_ascii_alphabetic() || c == b'_' || c == b'$'
}
35
36#[inline]
38pub const fn is_id_continue(c: char) -> bool {
39 is_id_continue_byte(ch2u8(c))
40}
/// Reports whether the byte `c` may appear after the first byte of an identifier.
///
/// Accepted bytes are the ASCII letters `a-z` / `A-Z`, the ASCII digits `0-9`, the
/// underscore `_` and the dollar sign `$`.
#[inline]
pub const fn is_id_continue_byte(c: u8) -> bool {
    c.is_ascii_alphanumeric() || c == b'_' || c == b'$'
}
46
47#[inline]
54pub const fn is_ident(s: &str) -> bool {
55 is_ident_bytes(s.as_bytes())
56}
57
58pub const fn is_ident_bytes(s: &[u8]) -> bool {
62 let [first, ref rest @ ..] = *s else {
65 return false;
66 };
67
68 if !is_id_start_byte(first) {
69 return false;
70 }
71
72 let mut i = 0;
73 while i < rest.len() {
74 if !is_id_continue_byte(rest[i]) {
75 return false;
76 }
77 i += 1;
78 }
79
80 true
81}
82
/// Narrows a `char` to a byte for use with the byte-based classifiers in this module.
///
/// ASCII characters map to their own byte value. Every non-ASCII character maps to `0xFF`,
/// a byte that none of the `*_byte` predicates accept and that never occurs in valid UTF-8.
///
/// A plain `c as u32 as u8` cast would keep only the low eight bits, so for example
/// U+0120 would be mistaken for a space and U+0141 for the letter `A`.
#[inline(always)]
const fn ch2u8(c: char) -> u8 {
    /// Stand-in for any character outside the ASCII range.
    const NON_ASCII: u8 = 0xFF;

    if c.is_ascii() {
        c as u8
    } else {
        NON_ASCII
    }
}
88
/// Sentinel byte used where no real input byte is available: `peek_byte` yields it when
/// looking past the end of the input, and `prev` yields it in builds without debug assertions.
///
/// A literal NUL byte in the input has the same value, which is why `eat_while` additionally
/// checks `is_eof` instead of relying on this value alone.
const EOF: u8 = b'\0';
90
/// Byte-oriented peekable cursor over a string slice that produces raw tokens.
///
/// Tokens are obtained one at a time with `advance_token`, or by using the cursor as an
/// iterator.
#[derive(Clone, Debug)]
pub struct Cursor<'a> {
    /// Number of input bytes that were still unread when the current token started.
    /// `pos_within_token` subtracts the current remaining length from this to obtain the
    /// token length, and `reset_pos_within_token` refreshes it.
    len_remaining: usize,
    /// Iterator over the not-yet-consumed part of the input.
    chars: Chars<'a>,
    /// Last character consumed by `bump_inlined`, cast to a byte. Only present when debug
    /// assertions are enabled; it is read solely through `prev()` in `debug_assert!` checks.
    #[cfg(debug_assertions)]
    prev: u8,
}
102
103impl<'a> Cursor<'a> {
104 pub fn new(input: &'a str) -> Self {
106 Cursor {
107 len_remaining: input.len(),
108 chars: input.chars(),
109 #[cfg(debug_assertions)]
110 prev: EOF,
111 }
112 }
113
114 pub fn advance_token(&mut self) -> RawToken {
116 let first_char = match self.bump_ret() {
117 Some(c) => c,
118 None => return RawToken::EOF,
119 };
120 let token_kind = if first_char.is_ascii() {
121 self.advance_token_kind(first_char)
122 } else {
123 RawTokenKind::Unknown
124 };
125 let len = self.pos_within_token();
126 self.reset_pos_within_token();
127 RawToken::new(token_kind, len)
128 }
129
130 #[inline]
131 fn advance_token_kind(&mut self, first_char: u8) -> RawTokenKind {
132 match first_char {
133 b'/' => match self.first() {
135 b'/' => self.line_comment(),
136 b'*' => self.block_comment(),
137 _ => RawTokenKind::Slash,
138 },
139
140 c if is_whitespace_byte(c) => self.whitespace(),
142
143 c if is_id_start_byte(c) => self.ident_or_prefixed_literal(c),
145
146 b'0'..=b'9' => {
148 let kind = self.number(first_char);
149 RawTokenKind::Literal { kind }
150 }
151 b'.' if self.first().is_ascii_digit() => {
152 let kind = self.rational_number_after_dot(Base::Decimal);
153 RawTokenKind::Literal { kind }
154 }
155
156 b';' => RawTokenKind::Semi,
158 b',' => RawTokenKind::Comma,
159 b'.' => RawTokenKind::Dot,
160 b'(' => RawTokenKind::OpenParen,
161 b')' => RawTokenKind::CloseParen,
162 b'{' => RawTokenKind::OpenBrace,
163 b'}' => RawTokenKind::CloseBrace,
164 b'[' => RawTokenKind::OpenBracket,
165 b']' => RawTokenKind::CloseBracket,
166 b'~' => RawTokenKind::Tilde,
167 b'?' => RawTokenKind::Question,
168 b':' => RawTokenKind::Colon,
169 b'=' => RawTokenKind::Eq,
170 b'!' => RawTokenKind::Bang,
171 b'<' => RawTokenKind::Lt,
172 b'>' => RawTokenKind::Gt,
173 b'-' => RawTokenKind::Minus,
174 b'&' => RawTokenKind::And,
175 b'|' => RawTokenKind::Or,
176 b'+' => RawTokenKind::Plus,
177 b'*' => RawTokenKind::Star,
178 b'^' => RawTokenKind::Caret,
179 b'%' => RawTokenKind::Percent,
180
181 b'\'' | b'"' => {
183 let terminated = self.eat_string(first_char);
184 let kind = RawLiteralKind::Str { terminated, unicode: false };
185 RawTokenKind::Literal { kind }
186 }
187
188 _ => RawTokenKind::Unknown,
193 }
194 }
195
196 fn line_comment(&mut self) -> RawTokenKind {
197 debug_assert!(self.prev() == b'/' && self.first() == b'/');
198 self.bump();
199
200 let is_doc = matches!(self.first(), b'/' if self.second() != b'/');
202
203 self.eat_while(|c| c != b'\n');
204 RawTokenKind::LineComment { is_doc }
205 }
206
207 fn block_comment(&mut self) -> RawTokenKind {
208 debug_assert!(self.prev() == b'/' && self.first() == b'*');
209 self.bump();
210
211 let is_doc = matches!(self.first(), b'*' if !matches!(self.second(), b'*' | b'/'));
214
215 let mut terminated = false;
216 while let Some(c) = self.bump_ret() {
217 if c == b'*' && self.first() == b'/' {
218 terminated = true;
219 self.bump();
220 break;
221 }
222 }
223
224 RawTokenKind::BlockComment { is_doc, terminated }
225 }
226
227 fn whitespace(&mut self) -> RawTokenKind {
228 debug_assert!(is_whitespace_byte(self.prev()));
229 self.eat_while(is_whitespace_byte);
230 RawTokenKind::Whitespace
231 }
232
233 fn ident_or_prefixed_literal(&mut self, first: u8) -> RawTokenKind {
234 debug_assert!(is_id_start_byte(self.prev()));
235
236 match first {
238 b'h' => {
240 if let Some(terminated) = self.maybe_string_prefix("hex") {
241 let kind = RawLiteralKind::HexStr { terminated };
242 return RawTokenKind::Literal { kind };
243 }
244 }
245 b'u' => {
247 if let Some(terminated) = self.maybe_string_prefix("unicode") {
248 let kind = RawLiteralKind::Str { terminated, unicode: true };
249 return RawTokenKind::Literal { kind };
250 }
251 }
252 _ => {}
253 }
254
255 self.eat_while(is_id_continue_byte);
257 RawTokenKind::Ident
258 }
259
260 fn number(&mut self, first_digit: u8) -> RawLiteralKind {
261 debug_assert!(self.prev().is_ascii_digit());
262 let mut base = Base::Decimal;
263 if first_digit == b'0' {
264 let has_digits = match self.first() {
266 b'b' => {
267 base = Base::Binary;
268 self.bump();
269 self.eat_decimal_digits()
270 }
271 b'o' => {
272 base = Base::Octal;
273 self.bump();
274 self.eat_decimal_digits()
275 }
276 b'x' => {
277 base = Base::Hexadecimal;
278 self.bump();
279 self.eat_hexadecimal_digits()
280 }
281 b'0'..=b'9' | b'_' | b'.' | b'e' | b'E' => {
283 self.eat_decimal_digits();
284 true
285 }
286 _ => return RawLiteralKind::Int { base, empty_int: false },
288 };
289 if !has_digits {
291 return RawLiteralKind::Int { base, empty_int: true };
292 }
293 } else {
294 self.eat_decimal_digits();
296 };
297
298 match self.first() {
299 b'.' if !is_id_start_byte(self.second()) || self.second() == b'_' => {
303 self.bump();
304 self.rational_number_after_dot(base)
305 }
306 b'e' | b'E' => {
307 self.bump();
308 let empty_exponent = !self.eat_exponent();
309 RawLiteralKind::Rational { base, empty_exponent }
310 }
311 _ => RawLiteralKind::Int { base, empty_int: false },
312 }
313 }
314
315 #[cold]
316 fn rational_number_after_dot(&mut self, base: Base) -> RawLiteralKind {
317 self.eat_decimal_digits();
318 let empty_exponent = match self.first() {
319 b'e' | b'E' => {
320 self.bump();
321 !self.eat_exponent()
322 }
323 _ => false,
324 };
325 RawLiteralKind::Rational { base, empty_exponent }
326 }
327
328 fn maybe_string_prefix(&mut self, prefix: &str) -> Option<bool> {
329 debug_assert_eq!(self.prev(), prefix.bytes().next().unwrap());
330 let prefix = &prefix[1..];
331 let s = self.as_str();
332 if s.starts_with(prefix) {
333 let skip = prefix.len();
334 let Some(quote @ (b'"' | b'\'')) = s.as_bytes().get(skip).copied() else { return None };
335 self.ignore_bytes(skip);
336 self.bump();
337 let terminated = self.eat_string(quote);
338 Some(terminated)
339 } else {
340 None
341 }
342 }
343
344 fn eat_string(&mut self, quote: u8) -> bool {
346 debug_assert_eq!(self.prev(), quote);
347 while let Some(c) = self.bump_ret() {
348 if c == quote {
349 return true;
350 }
351 if c == b'\\' {
352 let first = self.first();
353 if first == b'\\' || first == quote {
354 self.bump();
356 }
357 }
358 }
359 false
361 }
362
363 fn eat_decimal_digits(&mut self) -> bool {
365 self.eat_digits(|x| x.is_ascii_digit())
366 }
367
368 fn eat_hexadecimal_digits(&mut self) -> bool {
370 self.eat_digits(|x| x.is_ascii_hexdigit())
371 }
372
373 fn eat_digits(&mut self, mut is_digit: impl FnMut(u8) -> bool) -> bool {
374 let mut has_digits = false;
375 loop {
376 match self.first() {
377 b'_' => {
378 self.bump();
379 }
380 c if is_digit(c) => {
381 has_digits = true;
382 self.bump();
383 }
384 _ => break,
385 }
386 }
387 has_digits
388 }
389
390 fn eat_exponent(&mut self) -> bool {
392 debug_assert!(self.prev() == b'e' || self.prev() == b'E');
393 if self.first() == b'-' {
395 self.bump();
396 }
397 self.eat_decimal_digits()
398 }
399
400 #[inline]
402 pub fn as_str(&self) -> &'a str {
403 self.chars.as_str()
404 }
405
406 #[inline]
408 fn prev(&self) -> u8 {
409 #[cfg(debug_assertions)]
410 return self.prev;
411 #[cfg(not(debug_assertions))]
412 return EOF;
413 }
414
415 #[inline]
420 fn first(&self) -> u8 {
421 self.peek_byte(0)
422 }
423
424 #[inline]
426 fn second(&self) -> u8 {
427 self.peek_byte(1)
430 }
431
432 #[doc(hidden)]
434 #[inline]
435 fn peek_byte(&self, index: usize) -> u8 {
436 self.as_str().as_bytes().get(index).copied().unwrap_or(EOF)
437 }
438
439 #[inline]
441 fn is_eof(&self) -> bool {
442 self.as_str().is_empty()
443 }
444
445 #[inline]
447 fn pos_within_token(&self) -> u32 {
448 (self.len_remaining - self.as_str().len()) as u32
449 }
450
451 #[inline]
453 fn reset_pos_within_token(&mut self) {
454 self.len_remaining = self.as_str().len();
455 }
456
457 fn bump(&mut self) {
459 self.bump_inlined();
460 }
461
462 fn bump_ret(&mut self) -> Option<u8> {
464 let c = self.as_str().as_bytes().first().copied();
465 self.bump_inlined();
466 c
467 }
468
469 #[inline]
470 fn bump_inlined(&mut self) {
471 #[cfg(not(debug_assertions))]
474 self.chars.next();
475 #[cfg(debug_assertions)]
476 if let Some(c) = self.chars.next() {
477 self.prev = c as u8;
478 }
479 }
480
481 #[inline]
483 #[cfg_attr(debug_assertions, track_caller)]
484 fn ignore_bytes(&mut self, n: usize) {
485 self.chars = self.chars.as_str()[n..].chars();
486 }
487
488 #[inline]
490 fn eat_while(&mut self, mut predicate: impl FnMut(u8) -> bool) {
491 while predicate(self.first()) && !self.is_eof() {
492 self.bump();
493 }
494 }
495}
496
497impl Iterator for Cursor<'_> {
498 type Item = RawToken;
499
500 #[inline]
501 fn next(&mut self) -> Option<Self::Item> {
502 let token = self.advance_token();
503 if token.kind == RawTokenKind::Eof {
504 None
505 } else {
506 Some(token)
507 }
508 }
509}
510
// Once the input is exhausted `bump_ret` keeps returning `None`, so `advance_token` keeps
// returning the EOF token and `next` keeps returning `None`: the iterator is fused.
impl std::iter::FusedIterator for Cursor<'_> {}