solar_parse/lexer/cursor/
mod.rs

1//! Low-level Solidity lexer.
2//!
3//! Modified from Rust's [`rustc_lexer`](https://github.com/rust-lang/rust/blob/45749b21b7fd836f6c4f11dd40376f7c83e2791b/compiler/rustc_lexer/src/lib.rs).
4
5use solar_ast::Base;
6use std::str::Chars;
7
8pub mod token;
9use token::{RawLiteralKind, RawToken, RawTokenKind};
10
11#[cfg(test)]
12mod tests;
13
14/// Returns `true` if the given character is considered a whitespace.
15#[inline]
16pub const fn is_whitespace(c: char) -> bool {
17    is_whitespace_byte(ch2u8(c))
18}
/// Returns `true` if the given byte is considered whitespace.
///
/// The whitespace bytes are space, horizontal tab, line feed and carriage return.
#[inline]
pub const fn is_whitespace_byte(c: u8) -> bool {
    c == b' ' || c == b'\t' || c == b'\n' || c == b'\r'
}
24
25/// Returns `true` if the given character is valid at the start of a Solidity identifier.
26#[inline]
27pub const fn is_id_start(c: char) -> bool {
28    is_id_start_byte(ch2u8(c))
29}
/// Returns `true` if the given byte is valid at the start of a Solidity identifier.
///
/// Accepted bytes are the ASCII letters, `_` and `$`.
#[inline]
pub const fn is_id_start_byte(c: u8) -> bool {
    c.is_ascii_alphabetic() || c == b'_' || c == b'$'
}
35
36/// Returns `true` if the given character is valid in a Solidity identifier.
37#[inline]
38pub const fn is_id_continue(c: char) -> bool {
39    is_id_continue_byte(ch2u8(c))
40}
/// Returns `true` if the given byte is valid in a Solidity identifier.
///
/// Accepted bytes are the ASCII letters, the ASCII digits, `_` and `$`.
#[inline]
pub const fn is_id_continue_byte(c: u8) -> bool {
    c.is_ascii_alphanumeric() || c == b'_' || c == b'$'
}
46
47/// Returns `true` if the given string is a valid Solidity identifier.
48///
49/// An identifier in Solidity has to start with a letter, a dollar-sign or an underscore and may
50/// additionally contain numbers after the first symbol.
51///
52/// Reference: <https://docs.soliditylang.org/en/latest/grammar.html#a4.SolidityLexer.Identifier>
53#[inline]
54pub const fn is_ident(s: &str) -> bool {
55    is_ident_bytes(s.as_bytes())
56}
57
58/// Returns `true` if the given byte slice is a valid Solidity identifier.
59///
60/// See [`is_ident`] for more details.
61pub const fn is_ident_bytes(s: &[u8]) -> bool {
62    // Note: valid idents can only contain ASCII characters, so we can use the byte representation
63    // here.
64    let [first, ref rest @ ..] = *s else {
65        return false;
66    };
67
68    if !is_id_start_byte(first) {
69        return false;
70    }
71
72    let mut i = 0;
73    while i < rest.len() {
74        if !is_id_continue_byte(rest[i]) {
75            return false;
76        }
77        i += 1;
78    }
79
80    true
81}
82
/// Converts a `char` to a `u8`.
///
/// ASCII characters map to their byte value. Any non-ASCII character maps to `0xFF`, a byte that
/// is outside the ASCII range, so it can never be mistaken for an ASCII character by the
/// byte-based classifiers.
#[inline(always)]
const fn ch2u8(c: char) -> u8 {
    /// Placeholder for characters that do not fit in a single ASCII byte.
    const NON_ASCII: u8 = 0xFF;

    // A plain `c as u32 as u8` keeps only the low eight bits, which would turn e.g. U+0120 into
    // an ASCII space.
    if c.is_ascii() {
        c as u8
    } else {
        NON_ASCII
    }
}
88
/// Sentinel byte used where no real byte is available: it is the initial value of the cursor's
/// debug-only `prev` field and the value returned when peeking past the end of the input.
///
/// A NUL byte that is actually present in the input has the same value, so reaching the end of
/// the input must be checked separately rather than by comparing against this constant.
const EOF: u8 = b'\0';
90
91/// Peekable iterator over a char sequence.
92///
93/// Next characters can be peeked via `first` method,
94/// and position can be shifted forward via `bump` method.
95#[derive(Clone, Debug)]
96pub struct Cursor<'a> {
97    len_remaining: usize,
98    chars: Chars<'a>,
99    #[cfg(debug_assertions)]
100    prev: u8,
101}
102
103impl<'a> Cursor<'a> {
104    /// Creates a new cursor over the given input string slice.
105    pub fn new(input: &'a str) -> Self {
106        Cursor {
107            len_remaining: input.len(),
108            chars: input.chars(),
109            #[cfg(debug_assertions)]
110            prev: EOF,
111        }
112    }
113
114    /// Parses a token from the input string.
115    pub fn advance_token(&mut self) -> RawToken {
116        let first_char = match self.bump_ret() {
117            Some(c) => c,
118            None => return RawToken::EOF,
119        };
120        let token_kind = if first_char.is_ascii() {
121            self.advance_token_kind(first_char)
122        } else {
123            RawTokenKind::Unknown
124        };
125        let len = self.pos_within_token();
126        self.reset_pos_within_token();
127        RawToken::new(token_kind, len)
128    }
129
130    #[inline]
131    fn advance_token_kind(&mut self, first_char: u8) -> RawTokenKind {
132        match first_char {
133            // Slash, comment or block comment.
134            b'/' => match self.first() {
135                b'/' => self.line_comment(),
136                b'*' => self.block_comment(),
137                _ => RawTokenKind::Slash,
138            },
139
140            // Whitespace sequence.
141            c if is_whitespace_byte(c) => self.whitespace(),
142
143            // Identifier (this should be checked after other variant that can start as identifier).
144            c if is_id_start_byte(c) => self.ident_or_prefixed_literal(c),
145
146            // Numeric literal.
147            b'0'..=b'9' => {
148                let kind = self.number(first_char);
149                RawTokenKind::Literal { kind }
150            }
151            b'.' if self.first().is_ascii_digit() => {
152                let kind = self.rational_number_after_dot(Base::Decimal);
153                RawTokenKind::Literal { kind }
154            }
155
156            // One-symbol tokens.
157            b';' => RawTokenKind::Semi,
158            b',' => RawTokenKind::Comma,
159            b'.' => RawTokenKind::Dot,
160            b'(' => RawTokenKind::OpenParen,
161            b')' => RawTokenKind::CloseParen,
162            b'{' => RawTokenKind::OpenBrace,
163            b'}' => RawTokenKind::CloseBrace,
164            b'[' => RawTokenKind::OpenBracket,
165            b']' => RawTokenKind::CloseBracket,
166            b'~' => RawTokenKind::Tilde,
167            b'?' => RawTokenKind::Question,
168            b':' => RawTokenKind::Colon,
169            b'=' => RawTokenKind::Eq,
170            b'!' => RawTokenKind::Bang,
171            b'<' => RawTokenKind::Lt,
172            b'>' => RawTokenKind::Gt,
173            b'-' => RawTokenKind::Minus,
174            b'&' => RawTokenKind::And,
175            b'|' => RawTokenKind::Or,
176            b'+' => RawTokenKind::Plus,
177            b'*' => RawTokenKind::Star,
178            b'^' => RawTokenKind::Caret,
179            b'%' => RawTokenKind::Percent,
180
181            // String literal.
182            b'\'' | b'"' => {
183                let terminated = self.eat_string(first_char);
184                let kind = RawLiteralKind::Str { terminated, unicode: false };
185                RawTokenKind::Literal { kind }
186            }
187
188            // Identifier starting with an emoji. Only lexed for graceful error recovery.
189            // c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
190            //     self.fake_ident_or_unknown_prefix()
191            // }
192            _ => RawTokenKind::Unknown,
193        }
194    }
195
196    fn line_comment(&mut self) -> RawTokenKind {
197        debug_assert!(self.prev() == b'/' && self.first() == b'/');
198        self.bump();
199
200        // `////` (more than 3 slashes) is not considered a doc comment.
201        let is_doc = matches!(self.first(), b'/' if self.second() != b'/');
202
203        self.eat_while(|c| c != b'\n');
204        RawTokenKind::LineComment { is_doc }
205    }
206
207    fn block_comment(&mut self) -> RawTokenKind {
208        debug_assert!(self.prev() == b'/' && self.first() == b'*');
209        self.bump();
210
211        // `/***` (more than 2 stars) is not considered a doc comment.
212        // `/**/` is not considered a doc comment.
213        let is_doc = matches!(self.first(), b'*' if !matches!(self.second(), b'*' | b'/'));
214
215        let mut terminated = false;
216        while let Some(c) = self.bump_ret() {
217            if c == b'*' && self.first() == b'/' {
218                terminated = true;
219                self.bump();
220                break;
221            }
222        }
223
224        RawTokenKind::BlockComment { is_doc, terminated }
225    }
226
227    fn whitespace(&mut self) -> RawTokenKind {
228        debug_assert!(is_whitespace_byte(self.prev()));
229        self.eat_while(is_whitespace_byte);
230        RawTokenKind::Whitespace
231    }
232
233    fn ident_or_prefixed_literal(&mut self, first: u8) -> RawTokenKind {
234        debug_assert!(is_id_start_byte(self.prev()));
235
236        // Check for potential prefixed literals.
237        match first {
238            // `hex"01234"`
239            b'h' => {
240                if let Some(terminated) = self.maybe_string_prefix("hex") {
241                    let kind = RawLiteralKind::HexStr { terminated };
242                    return RawTokenKind::Literal { kind };
243                }
244            }
245            // `unicode"abc"`
246            b'u' => {
247                if let Some(terminated) = self.maybe_string_prefix("unicode") {
248                    let kind = RawLiteralKind::Str { terminated, unicode: true };
249                    return RawTokenKind::Literal { kind };
250                }
251            }
252            _ => {}
253        }
254
255        // Start is already eaten, eat the rest of identifier.
256        self.eat_while(is_id_continue_byte);
257        RawTokenKind::Ident
258    }
259
260    fn number(&mut self, first_digit: u8) -> RawLiteralKind {
261        debug_assert!(self.prev().is_ascii_digit());
262        let mut base = Base::Decimal;
263        if first_digit == b'0' {
264            // Attempt to parse encoding base.
265            let has_digits = match self.first() {
266                b'b' => {
267                    base = Base::Binary;
268                    self.bump();
269                    self.eat_decimal_digits()
270                }
271                b'o' => {
272                    base = Base::Octal;
273                    self.bump();
274                    self.eat_decimal_digits()
275                }
276                b'x' => {
277                    base = Base::Hexadecimal;
278                    self.bump();
279                    self.eat_hexadecimal_digits()
280                }
281                // Not a base prefix.
282                b'0'..=b'9' | b'_' | b'.' | b'e' | b'E' => {
283                    self.eat_decimal_digits();
284                    true
285                }
286                // Just a 0.
287                _ => return RawLiteralKind::Int { base, empty_int: false },
288            };
289            // Base prefix was provided, but there were no digits after it, e.g. "0x".
290            if !has_digits {
291                return RawLiteralKind::Int { base, empty_int: true };
292            }
293        } else {
294            // No base prefix, parse number in the usual way.
295            self.eat_decimal_digits();
296        };
297
298        match self.first() {
299            // Don't be greedy if this is actually an integer literal followed by field/method
300            // access (`12.foo()`).
301            // `_` is special cased, we assume it's always an invalid rational: https://github.com/ethereum/solidity/blob/c012b725bb8ce755b93ce0dd05e83c34c499acd6/liblangutil/Scanner.cpp#L979
302            b'.' if !is_id_start_byte(self.second()) || self.second() == b'_' => {
303                self.bump();
304                self.rational_number_after_dot(base)
305            }
306            b'e' | b'E' => {
307                self.bump();
308                let empty_exponent = !self.eat_exponent();
309                RawLiteralKind::Rational { base, empty_exponent }
310            }
311            _ => RawLiteralKind::Int { base, empty_int: false },
312        }
313    }
314
315    #[cold]
316    fn rational_number_after_dot(&mut self, base: Base) -> RawLiteralKind {
317        self.eat_decimal_digits();
318        let empty_exponent = match self.first() {
319            b'e' | b'E' => {
320                self.bump();
321                !self.eat_exponent()
322            }
323            _ => false,
324        };
325        RawLiteralKind::Rational { base, empty_exponent }
326    }
327
328    fn maybe_string_prefix(&mut self, prefix: &str) -> Option<bool> {
329        debug_assert_eq!(self.prev(), prefix.bytes().next().unwrap());
330        let prefix = &prefix[1..];
331        let s = self.as_str();
332        if s.starts_with(prefix) {
333            let skip = prefix.len();
334            let Some(quote @ (b'"' | b'\'')) = s.as_bytes().get(skip).copied() else { return None };
335            self.ignore_bytes(skip);
336            self.bump();
337            let terminated = self.eat_string(quote);
338            Some(terminated)
339        } else {
340            None
341        }
342    }
343
344    /// Eats a string until the given quote character. Returns `true` if the string was terminated.
345    fn eat_string(&mut self, quote: u8) -> bool {
346        debug_assert_eq!(self.prev(), quote);
347        while let Some(c) = self.bump_ret() {
348            if c == quote {
349                return true;
350            }
351            if c == b'\\' {
352                let first = self.first();
353                if first == b'\\' || first == quote {
354                    // Bump again to skip escaped character.
355                    self.bump();
356                }
357            }
358        }
359        // End of file reached.
360        false
361    }
362
363    /// Eats characters for a decimal number. Returns `true` if any digits were encountered.
364    fn eat_decimal_digits(&mut self) -> bool {
365        self.eat_digits(|x| x.is_ascii_digit())
366    }
367
368    /// Eats characters for a hexadecimal number. Returns `true` if any digits were encountered.
369    fn eat_hexadecimal_digits(&mut self) -> bool {
370        self.eat_digits(|x| x.is_ascii_hexdigit())
371    }
372
373    fn eat_digits(&mut self, mut is_digit: impl FnMut(u8) -> bool) -> bool {
374        let mut has_digits = false;
375        loop {
376            match self.first() {
377                b'_' => {
378                    self.bump();
379                }
380                c if is_digit(c) => {
381                    has_digits = true;
382                    self.bump();
383                }
384                _ => break,
385            }
386        }
387        has_digits
388    }
389
390    /// Eats the exponent. Returns `true` if any digits were encountered.
391    fn eat_exponent(&mut self) -> bool {
392        debug_assert!(self.prev() == b'e' || self.prev() == b'E');
393        // b'+' is not a valid prefix for an exponent.
394        if self.first() == b'-' {
395            self.bump();
396        }
397        self.eat_decimal_digits()
398    }
399
400    /// Returns the remaining input as a string slice.
401    #[inline]
402    pub fn as_str(&self) -> &'a str {
403        self.chars.as_str()
404    }
405
406    /// Returns the last eaten symbol. Only available with `debug_assertions` enabled.
407    #[inline]
408    fn prev(&self) -> u8 {
409        #[cfg(debug_assertions)]
410        return self.prev;
411        #[cfg(not(debug_assertions))]
412        return EOF;
413    }
414
415    /// Peeks the next symbol from the input stream without consuming it.
416    /// If requested position doesn't exist, `EOF` is returned.
417    /// However, getting `EOF` doesn't always mean actual end of file,
418    /// it should be checked with `is_eof` method.
419    #[inline]
420    fn first(&self) -> u8 {
421        self.peek_byte(0)
422    }
423
424    /// Peeks the second symbol from the input stream without consuming it.
425    #[inline]
426    fn second(&self) -> u8 {
427        // This function is only called after `first` was called and checked, so in practice it
428        // doesn't matter if it's part of the first UTF-8 character.
429        self.peek_byte(1)
430    }
431
432    // Do not use directly.
433    #[doc(hidden)]
434    #[inline]
435    fn peek_byte(&self, index: usize) -> u8 {
436        self.as_str().as_bytes().get(index).copied().unwrap_or(EOF)
437    }
438
439    /// Checks if there is nothing more to consume.
440    #[inline]
441    fn is_eof(&self) -> bool {
442        self.as_str().is_empty()
443    }
444
445    /// Returns amount of already consumed symbols.
446    #[inline]
447    fn pos_within_token(&self) -> u32 {
448        (self.len_remaining - self.as_str().len()) as u32
449    }
450
451    /// Resets the number of bytes consumed to 0.
452    #[inline]
453    fn reset_pos_within_token(&mut self) {
454        self.len_remaining = self.as_str().len();
455    }
456
457    /// Moves to the next character.
458    fn bump(&mut self) {
459        self.bump_inlined();
460    }
461
462    /// Moves to the next character, returning the current one.
463    fn bump_ret(&mut self) -> Option<u8> {
464        let c = self.as_str().as_bytes().first().copied();
465        self.bump_inlined();
466        c
467    }
468
469    #[inline]
470    fn bump_inlined(&mut self) {
471        // NOTE: This intentionally does not assign `_c` in the next line, as rustc currently emit a
472        // lot more LLVM IR (for an `assume`), which messes with the optimizations and inling costs.
473        #[cfg(not(debug_assertions))]
474        self.chars.next();
475        #[cfg(debug_assertions)]
476        if let Some(c) = self.chars.next() {
477            self.prev = c as u8;
478        }
479    }
480
481    /// Advances `n` bytes, without setting `prev`.
482    #[inline]
483    #[cfg_attr(debug_assertions, track_caller)]
484    fn ignore_bytes(&mut self, n: usize) {
485        self.chars = self.chars.as_str()[n..].chars();
486    }
487
488    /// Eats symbols while predicate returns true or until the end of file is reached.
489    #[inline]
490    fn eat_while(&mut self, mut predicate: impl FnMut(u8) -> bool) {
491        while predicate(self.first()) && !self.is_eof() {
492            self.bump();
493        }
494    }
495}
496
497impl Iterator for Cursor<'_> {
498    type Item = RawToken;
499
500    #[inline]
501    fn next(&mut self) -> Option<Self::Item> {
502        let token = self.advance_token();
503        if token.kind == RawTokenKind::Eof {
504            None
505        } else {
506            Some(token)
507        }
508    }
509}
510
// Once the input is exhausted nothing is left to consume, so `advance_token` keeps returning
// `RawToken::EOF` on every further call. Presumably that constant's kind is
// `RawTokenKind::Eof`, in which case `next` keeps returning `None` — confirm against the
// `token` module.
impl std::iter::FusedIterator for Cursor<'_> {}