solar_parse/lexer/cursor/
mod.rs1use solar_ast::Base;
6use std::str::Chars;
7
8pub mod token;
9use token::{RawLiteralKind, RawToken, RawTokenKind};
10
11#[cfg(test)]
12mod tests;
13
/// Returns `true` if `c` is one of the four characters this lexer treats as
/// whitespace: space, horizontal tab, line feed, or carriage return.
///
/// Other Unicode whitespace (for example vertical tab or form feed) is not accepted.
#[inline]
pub const fn is_whitespace(c: char) -> bool {
    c == ' ' || c == '\t' || c == '\n' || c == '\r'
}
19
/// Returns `true` if `c` may begin an identifier: an ASCII letter, `_`, or `$`.
#[inline]
pub const fn is_id_start(c: char) -> bool {
    c.is_ascii_alphabetic() || c == '_' || c == '$'
}
25
/// Returns `true` if `c` may appear after the first character of an identifier:
/// an ASCII letter, an ASCII digit, `_`, or `$`.
#[inline]
pub const fn is_id_continue(c: char) -> bool {
    c.is_ascii_alphanumeric() || c == '_' || c == '$'
}
31
32pub const fn is_ident(s: &str) -> bool {
39    let [first, rest @ ..] = s.as_bytes() else {
42        return false;
43    };
44
45    if !is_id_start(*first as char) {
46        return false;
47    }
48
49    let mut i = 0;
50    while i < rest.len() {
51        if !is_id_continue(rest[i] as char) {
52            return false;
53        }
54        i += 1;
55    }
56
57    true
58}
59
/// Sentinel character returned by the cursor's peek helpers when no input is left.
///
/// A literal NUL in the input is indistinguishable from this value by character
/// alone, so callers that need certainty must also consult `is_eof`.
const EOF_CHAR: char = '\0';
61
/// Peekable character cursor over a string slice, used to produce raw tokens.
///
/// The cursor only moves forward; peeking is done by cloning the underlying
/// [`Chars`] iterator.
#[derive(Clone, Debug)]
pub struct Cursor<'a> {
    /// Byte length of the input that was still unread when the current token started.
    len_remaining: usize,
    /// Iterator over the not-yet-consumed part of the input.
    chars: Chars<'a>,
    /// Most recently bumped character; tracked only in debug builds for assertions.
    #[cfg(debug_assertions)]
    prev: char,
}
74
75impl<'a> Cursor<'a> {
76    pub fn new(input: &'a str) -> Self {
78        Cursor {
79            len_remaining: input.len(),
80            chars: input.chars(),
81            #[cfg(debug_assertions)]
82            prev: EOF_CHAR,
83        }
84    }
85
86    pub fn advance_token(&mut self) -> RawToken {
88        let first_char = match self.bump() {
89            Some(c) => c,
90            None => return RawToken::EOF,
91        };
92
93        let token_kind = match first_char {
94            '/' => match self.first() {
96                '/' => self.line_comment(),
97                '*' => self.block_comment(),
98                _ => RawTokenKind::Slash,
99            },
100
101            c if is_whitespace(c) => self.whitespace(),
103
104            c if is_id_start(c) => self.ident_or_prefixed_literal(c),
106
107            c @ '0'..='9' => {
109                let kind = self.number(c);
110                RawTokenKind::Literal { kind }
111            }
112            '.' if self.first().is_ascii_digit() => {
113                let kind = self.rational_number_after_dot(Base::Decimal);
114                RawTokenKind::Literal { kind }
115            }
116
117            ';' => RawTokenKind::Semi,
119            ',' => RawTokenKind::Comma,
120            '.' => RawTokenKind::Dot,
121            '(' => RawTokenKind::OpenParen,
122            ')' => RawTokenKind::CloseParen,
123            '{' => RawTokenKind::OpenBrace,
124            '}' => RawTokenKind::CloseBrace,
125            '[' => RawTokenKind::OpenBracket,
126            ']' => RawTokenKind::CloseBracket,
127            '~' => RawTokenKind::Tilde,
128            '?' => RawTokenKind::Question,
129            ':' => RawTokenKind::Colon,
130            '=' => RawTokenKind::Eq,
131            '!' => RawTokenKind::Bang,
132            '<' => RawTokenKind::Lt,
133            '>' => RawTokenKind::Gt,
134            '-' => RawTokenKind::Minus,
135            '&' => RawTokenKind::And,
136            '|' => RawTokenKind::Or,
137            '+' => RawTokenKind::Plus,
138            '*' => RawTokenKind::Star,
139            '^' => RawTokenKind::Caret,
140            '%' => RawTokenKind::Percent,
141
142            c @ ('\'' | '"') => {
144                let terminated = self.eat_string(c);
145                let kind = RawLiteralKind::Str { terminated, unicode: false };
146                RawTokenKind::Literal { kind }
147            }
148
149            _ => RawTokenKind::Unknown,
154        };
155        let res = RawToken::new(token_kind, self.pos_within_token());
156        self.reset_pos_within_token();
157        res
158    }
159
160    fn line_comment(&mut self) -> RawTokenKind {
161        debug_assert!(self.prev() == '/' && self.first() == '/');
162        self.bump();
163
164        let is_doc = matches!(self.first(), '/' if self.second() != '/');
166
167        self.eat_while(|c| c != '\n');
168        RawTokenKind::LineComment { is_doc }
169    }
170
171    fn block_comment(&mut self) -> RawTokenKind {
172        debug_assert!(self.prev() == '/' && self.first() == '*');
173        self.bump();
174
175        let is_doc = matches!(self.first(), '*' if !matches!(self.second(), '*' | '/'));
178
179        let mut terminated = false;
180        while let Some(c) = self.bump() {
181            if c == '*' && self.first() == '/' {
182                terminated = true;
183                self.bump();
184                break;
185            }
186        }
187
188        RawTokenKind::BlockComment { is_doc, terminated }
189    }
190
191    fn whitespace(&mut self) -> RawTokenKind {
192        debug_assert!(is_whitespace(self.prev()));
193        self.eat_while(is_whitespace);
194        RawTokenKind::Whitespace
195    }
196
197    fn ident_or_prefixed_literal(&mut self, first_char: char) -> RawTokenKind {
198        debug_assert!(is_id_start(self.prev()));
199
200        match first_char {
202            'h' => {
204                if let Some(terminated) = self.maybe_string_prefix("hex") {
205                    let kind = RawLiteralKind::HexStr { terminated };
206                    return RawTokenKind::Literal { kind };
207                }
208            }
209            'u' => {
211                if let Some(terminated) = self.maybe_string_prefix("unicode") {
212                    let kind = RawLiteralKind::Str { terminated, unicode: true };
213                    return RawTokenKind::Literal { kind };
214                }
215            }
216            _ => {}
217        }
218
219        self.eat_while(is_id_continue);
221        match self.first() {
224            '"' | '\'' => RawTokenKind::UnknownPrefix,
225            _ => RawTokenKind::Ident,
226        }
227    }
228
229    fn number(&mut self, first_digit: char) -> RawLiteralKind {
230        debug_assert!('0' <= self.prev() && self.prev() <= '9');
231        let mut base = Base::Decimal;
232        if first_digit == '0' {
233            let has_digits = match self.first() {
235                'b' => {
236                    base = Base::Binary;
237                    self.bump();
238                    self.eat_decimal_digits()
239                }
240                'o' => {
241                    base = Base::Octal;
242                    self.bump();
243                    self.eat_decimal_digits()
244                }
245                'x' => {
246                    base = Base::Hexadecimal;
247                    self.bump();
248                    self.eat_hexadecimal_digits()
249                }
250                '0'..='9' | '_' | '.' | 'e' | 'E' => {
252                    self.eat_decimal_digits();
253                    true
254                }
255                _ => return RawLiteralKind::Int { base, empty_int: false },
257            };
258            if !has_digits {
260                return RawLiteralKind::Int { base, empty_int: true };
261            }
262        } else {
263            self.eat_decimal_digits();
265        };
266
267        match self.first() {
268            '.' if !is_id_start(self.second()) => {
271                self.bump();
272                self.rational_number_after_dot(base)
273            }
274            'e' | 'E' => {
275                self.bump();
276                let empty_exponent = !self.eat_exponent();
277                RawLiteralKind::Rational { base, empty_exponent }
278            }
279            _ => RawLiteralKind::Int { base, empty_int: false },
280        }
281    }
282
283    fn rational_number_after_dot(&mut self, base: Base) -> RawLiteralKind {
284        self.eat_decimal_digits();
285        let empty_exponent = match self.first() {
286            'e' | 'E' => {
287                self.bump();
288                !self.eat_exponent()
289            }
290            _ => false,
291        };
292        RawLiteralKind::Rational { base, empty_exponent }
293    }
294
295    fn maybe_string_prefix(&mut self, prefix: &str) -> Option<bool> {
296        debug_assert_eq!(self.prev(), prefix.chars().next().unwrap());
297        let prefix = &prefix[1..];
298        let s = self.as_str();
299        if s.starts_with(prefix) {
300            let skip = prefix.len();
301            let Some(quote @ ('"' | '\'')) = s.chars().nth(skip) else { return None };
302            self.ignore(skip);
303            self.bump();
304            let terminated = self.eat_string(quote);
305            Some(terminated)
306        } else {
307            None
308        }
309    }
310
311    fn eat_string(&mut self, quote: char) -> bool {
313        debug_assert_eq!(self.prev(), quote);
314        while let Some(c) = self.bump() {
315            if c == quote {
316                return true;
317            }
318            if c == '\\' {
319                let first = self.first();
320                if first == '\\' || first == quote {
321                    self.bump();
323                }
324            }
325        }
326        false
328    }
329
330    fn eat_decimal_digits(&mut self) -> bool {
332        let mut has_digits = false;
333        loop {
334            match self.first() {
335                '_' => {
336                    self.bump();
337                }
338                '0'..='9' => {
339                    has_digits = true;
340                    self.bump();
341                }
342                _ => break,
343            }
344        }
345        has_digits
346    }
347
348    fn eat_hexadecimal_digits(&mut self) -> bool {
350        let mut has_digits = false;
351        loop {
352            match self.first() {
353                '_' => {
354                    self.bump();
355                }
356                '0'..='9' | 'a'..='f' | 'A'..='F' => {
357                    has_digits = true;
358                    self.bump();
359                }
360                _ => break,
361            }
362        }
363        has_digits
364    }
365
366    fn eat_exponent(&mut self) -> bool {
368        debug_assert!(self.prev() == 'e' || self.prev() == 'E');
369        if self.first() == '-' {
371            self.bump();
372        }
373        self.eat_decimal_digits()
374    }
375
376    pub fn as_str(&self) -> &'a str {
378        self.chars.as_str()
379    }
380
381    fn prev(&self) -> char {
383        #[cfg(debug_assertions)]
384        return self.prev;
385        #[cfg(not(debug_assertions))]
386        return EOF_CHAR;
387    }
388
389    fn first(&self) -> char {
394        self.chars.clone().next().unwrap_or(EOF_CHAR)
396    }
397
398    fn second(&self) -> char {
400        let mut iter = self.chars.clone();
402        iter.next();
403        iter.next().unwrap_or(EOF_CHAR)
404    }
405
406    fn is_eof(&self) -> bool {
408        self.chars.as_str().is_empty()
409    }
410
411    fn pos_within_token(&self) -> u32 {
413        (self.len_remaining - self.chars.as_str().len()) as u32
414    }
415
416    fn reset_pos_within_token(&mut self) {
418        self.len_remaining = self.chars.as_str().len();
419    }
420
421    fn bump(&mut self) -> Option<char> {
423        #[cfg(not(debug_assertions))]
424        {
425            self.chars.next()
426        }
427
428        #[cfg(debug_assertions)]
429        {
430            let c = self.chars.next();
431            if let Some(c) = c {
432                self.prev = c;
433            }
434            c
435        }
436    }
437
438    fn ignore(&mut self, n: usize) {
440        for _ in 0..n {
441            self.chars.next();
442        }
443    }
444
445    fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
447        while predicate(self.first()) && !self.is_eof() {
450            self.bump();
451        }
452    }
453}
454
455impl Iterator for Cursor<'_> {
456    type Item = RawToken;
457
458    fn next(&mut self) -> Option<Self::Item> {
459        let token = self.advance_token();
460        if token.kind == RawTokenKind::Eof {
461            None
462        } else {
463            Some(token)
464        }
465    }
466}
467
468impl std::iter::FusedIterator for Cursor<'_> {}