solar_parse/lexer/cursor/
mod.rs1use solar_ast::Base;
6use std::str::Chars;
7
8pub mod token;
9use token::{RawLiteralKind, RawToken, RawTokenKind};
10
11#[cfg(test)]
12mod tests;
13
/// Returns `true` if `c` is one of the four characters the lexer treats as whitespace:
/// space, horizontal tab, line feed or carriage return.
#[inline]
pub const fn is_whitespace(c: char) -> bool {
    c == ' ' || c == '\t' || c == '\n' || c == '\r'
}
19
/// Returns `true` if `c` may begin an identifier: an ASCII letter, `_` or `$`.
#[inline]
pub const fn is_id_start(c: char) -> bool {
    c.is_ascii_alphabetic() || c == '_' || c == '$'
}
25
/// Returns `true` if `c` may appear after the first character of an identifier:
/// an ASCII letter, an ASCII digit, `_` or `$`.
#[inline]
pub const fn is_id_continue(c: char) -> bool {
    c.is_ascii_alphanumeric() || c == '_' || c == '$'
}
31
32pub const fn is_ident(s: &str) -> bool {
39 let [first, rest @ ..] = s.as_bytes() else {
42 return false;
43 };
44
45 if !is_id_start(*first as char) {
46 return false;
47 }
48
49 let mut i = 0;
50 while i < rest.len() {
51 if !is_id_continue(rest[i] as char) {
52 return false;
53 }
54 i += 1;
55 }
56
57 true
58}
59
/// Sentinel returned by the cursor's peeking helpers when no more input is available.
///
/// A literal NUL in the input yields the same value, so callers that must tell the two
/// apart also check whether the remaining input is empty.
const EOF_CHAR: char = '\0';
61
/// Character-level cursor over a string slice, used to split the input into raw tokens.
#[derive(Clone, Debug)]
pub struct Cursor<'a> {
    /// Byte length of the input that was still unread when the current token started.
    /// The length of the token lexed so far is this value minus what `chars` still holds.
    len_remaining: usize,
    /// Iterator over the characters that have not been consumed yet.
    chars: Chars<'a>,
    /// The most recently consumed character; only tracked in builds with debug
    /// assertions, where it backs the lexer's internal sanity checks.
    #[cfg(debug_assertions)]
    prev: char,
}
74
75impl<'a> Cursor<'a> {
76 pub fn new(input: &'a str) -> Self {
78 Cursor {
79 len_remaining: input.len(),
80 chars: input.chars(),
81 #[cfg(debug_assertions)]
82 prev: EOF_CHAR,
83 }
84 }
85
86 pub fn advance_token(&mut self) -> RawToken {
88 let first_char = match self.bump() {
89 Some(c) => c,
90 None => return RawToken::EOF,
91 };
92
93 let token_kind = match first_char {
94 '/' => match self.first() {
96 '/' => self.line_comment(),
97 '*' => self.block_comment(),
98 _ => RawTokenKind::Slash,
99 },
100
101 c if is_whitespace(c) => self.whitespace(),
103
104 c if is_id_start(c) => self.ident_or_prefixed_literal(c),
106
107 c @ '0'..='9' => {
109 let kind = self.number(c);
110 RawTokenKind::Literal { kind }
111 }
112 '.' if self.first().is_ascii_digit() => {
113 let kind = self.rational_number_after_dot(Base::Decimal);
114 RawTokenKind::Literal { kind }
115 }
116
117 ';' => RawTokenKind::Semi,
119 ',' => RawTokenKind::Comma,
120 '.' => RawTokenKind::Dot,
121 '(' => RawTokenKind::OpenParen,
122 ')' => RawTokenKind::CloseParen,
123 '{' => RawTokenKind::OpenBrace,
124 '}' => RawTokenKind::CloseBrace,
125 '[' => RawTokenKind::OpenBracket,
126 ']' => RawTokenKind::CloseBracket,
127 '~' => RawTokenKind::Tilde,
128 '?' => RawTokenKind::Question,
129 ':' => RawTokenKind::Colon,
130 '=' => RawTokenKind::Eq,
131 '!' => RawTokenKind::Bang,
132 '<' => RawTokenKind::Lt,
133 '>' => RawTokenKind::Gt,
134 '-' => RawTokenKind::Minus,
135 '&' => RawTokenKind::And,
136 '|' => RawTokenKind::Or,
137 '+' => RawTokenKind::Plus,
138 '*' => RawTokenKind::Star,
139 '^' => RawTokenKind::Caret,
140 '%' => RawTokenKind::Percent,
141
142 c @ ('\'' | '"') => {
144 let terminated = self.eat_string(c);
145 let kind = RawLiteralKind::Str { terminated, unicode: false };
146 RawTokenKind::Literal { kind }
147 }
148
149 _ => RawTokenKind::Unknown,
154 };
155 let res = RawToken::new(token_kind, self.pos_within_token());
156 self.reset_pos_within_token();
157 res
158 }
159
160 fn line_comment(&mut self) -> RawTokenKind {
161 debug_assert!(self.prev() == '/' && self.first() == '/');
162 self.bump();
163
164 let is_doc = matches!(self.first(), '/' if self.second() != '/');
166
167 self.eat_while(|c| c != '\n');
168 RawTokenKind::LineComment { is_doc }
169 }
170
171 fn block_comment(&mut self) -> RawTokenKind {
172 debug_assert!(self.prev() == '/' && self.first() == '*');
173 self.bump();
174
175 let is_doc = matches!(self.first(), '*' if !matches!(self.second(), '*' | '/'));
178
179 let mut terminated = false;
180 while let Some(c) = self.bump() {
181 if c == '*' && self.first() == '/' {
182 terminated = true;
183 self.bump();
184 break;
185 }
186 }
187
188 RawTokenKind::BlockComment { is_doc, terminated }
189 }
190
191 fn whitespace(&mut self) -> RawTokenKind {
192 debug_assert!(is_whitespace(self.prev()));
193 self.eat_while(is_whitespace);
194 RawTokenKind::Whitespace
195 }
196
197 fn ident_or_prefixed_literal(&mut self, first_char: char) -> RawTokenKind {
198 debug_assert!(is_id_start(self.prev()));
199
200 match first_char {
202 'h' => {
204 if let Some(terminated) = self.maybe_string_prefix("hex") {
205 let kind = RawLiteralKind::HexStr { terminated };
206 return RawTokenKind::Literal { kind };
207 }
208 }
209 'u' => {
211 if let Some(terminated) = self.maybe_string_prefix("unicode") {
212 let kind = RawLiteralKind::Str { terminated, unicode: true };
213 return RawTokenKind::Literal { kind };
214 }
215 }
216 _ => {}
217 }
218
219 self.eat_while(is_id_continue);
221 match self.first() {
224 '"' | '\'' => RawTokenKind::UnknownPrefix,
225 _ => RawTokenKind::Ident,
226 }
227 }
228
229 fn number(&mut self, first_digit: char) -> RawLiteralKind {
230 debug_assert!('0' <= self.prev() && self.prev() <= '9');
231 let mut base = Base::Decimal;
232 if first_digit == '0' {
233 let has_digits = match self.first() {
235 'b' => {
236 base = Base::Binary;
237 self.bump();
238 self.eat_decimal_digits()
239 }
240 'o' => {
241 base = Base::Octal;
242 self.bump();
243 self.eat_decimal_digits()
244 }
245 'x' => {
246 base = Base::Hexadecimal;
247 self.bump();
248 self.eat_hexadecimal_digits()
249 }
250 '0'..='9' | '_' | '.' | 'e' | 'E' => {
252 self.eat_decimal_digits();
253 true
254 }
255 _ => return RawLiteralKind::Int { base, empty_int: false },
257 };
258 if !has_digits {
260 return RawLiteralKind::Int { base, empty_int: true };
261 }
262 } else {
263 self.eat_decimal_digits();
265 };
266
267 match self.first() {
268 '.' if !is_id_start(self.second()) => {
271 self.bump();
272 self.rational_number_after_dot(base)
273 }
274 'e' | 'E' => {
275 self.bump();
276 let empty_exponent = !self.eat_exponent();
277 RawLiteralKind::Rational { base, empty_exponent }
278 }
279 _ => RawLiteralKind::Int { base, empty_int: false },
280 }
281 }
282
283 fn rational_number_after_dot(&mut self, base: Base) -> RawLiteralKind {
284 self.eat_decimal_digits();
285 let empty_exponent = match self.first() {
286 'e' | 'E' => {
287 self.bump();
288 !self.eat_exponent()
289 }
290 _ => false,
291 };
292 RawLiteralKind::Rational { base, empty_exponent }
293 }
294
295 fn maybe_string_prefix(&mut self, prefix: &str) -> Option<bool> {
296 debug_assert_eq!(self.prev(), prefix.chars().next().unwrap());
297 let prefix = &prefix[1..];
298 let s = self.as_str();
299 if s.starts_with(prefix) {
300 let skip = prefix.len();
301 let Some(quote @ ('"' | '\'')) = s.chars().nth(skip) else { return None };
302 self.ignore(skip);
303 self.bump();
304 let terminated = self.eat_string(quote);
305 Some(terminated)
306 } else {
307 None
308 }
309 }
310
311 fn eat_string(&mut self, quote: char) -> bool {
313 debug_assert_eq!(self.prev(), quote);
314 while let Some(c) = self.bump() {
315 if c == quote {
316 return true;
317 }
318 if c == '\\' {
319 let first = self.first();
320 if first == '\\' || first == quote {
321 self.bump();
323 }
324 }
325 }
326 false
328 }
329
330 fn eat_decimal_digits(&mut self) -> bool {
332 let mut has_digits = false;
333 loop {
334 match self.first() {
335 '_' => {
336 self.bump();
337 }
338 '0'..='9' => {
339 has_digits = true;
340 self.bump();
341 }
342 _ => break,
343 }
344 }
345 has_digits
346 }
347
348 fn eat_hexadecimal_digits(&mut self) -> bool {
350 let mut has_digits = false;
351 loop {
352 match self.first() {
353 '_' => {
354 self.bump();
355 }
356 '0'..='9' | 'a'..='f' | 'A'..='F' => {
357 has_digits = true;
358 self.bump();
359 }
360 _ => break,
361 }
362 }
363 has_digits
364 }
365
366 fn eat_exponent(&mut self) -> bool {
368 debug_assert!(self.prev() == 'e' || self.prev() == 'E');
369 if self.first() == '-' {
371 self.bump();
372 }
373 self.eat_decimal_digits()
374 }
375
376 pub fn as_str(&self) -> &'a str {
378 self.chars.as_str()
379 }
380
381 fn prev(&self) -> char {
383 #[cfg(debug_assertions)]
384 return self.prev;
385 #[cfg(not(debug_assertions))]
386 return EOF_CHAR;
387 }
388
389 fn first(&self) -> char {
394 self.chars.clone().next().unwrap_or(EOF_CHAR)
396 }
397
398 fn second(&self) -> char {
400 let mut iter = self.chars.clone();
402 iter.next();
403 iter.next().unwrap_or(EOF_CHAR)
404 }
405
406 fn is_eof(&self) -> bool {
408 self.chars.as_str().is_empty()
409 }
410
411 fn pos_within_token(&self) -> u32 {
413 (self.len_remaining - self.chars.as_str().len()) as u32
414 }
415
416 fn reset_pos_within_token(&mut self) {
418 self.len_remaining = self.chars.as_str().len();
419 }
420
421 fn bump(&mut self) -> Option<char> {
423 #[cfg(not(debug_assertions))]
424 {
425 self.chars.next()
426 }
427
428 #[cfg(debug_assertions)]
429 {
430 let c = self.chars.next();
431 if let Some(c) = c {
432 self.prev = c;
433 }
434 c
435 }
436 }
437
438 fn ignore(&mut self, n: usize) {
440 for _ in 0..n {
441 self.chars.next();
442 }
443 }
444
445 fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
447 while predicate(self.first()) && !self.is_eof() {
450 self.bump();
451 }
452 }
453}
454
455impl Iterator for Cursor<'_> {
456 type Item = RawToken;
457
458 fn next(&mut self) -> Option<Self::Item> {
459 let token = self.advance_token();
460 if token.kind == RawTokenKind::Eof {
461 None
462 } else {
463 Some(token)
464 }
465 }
466}
467
468impl std::iter::FusedIterator for Cursor<'_> {}