solar_parse/lexer/cursor/
mod.rs

1//! Low-level Solidity lexer.
2//!
3//! Modified from Rust's [`rustc_lexer`](https://github.com/rust-lang/rust/blob/45749b21b7fd836f6c4f11dd40376f7c83e2791b/compiler/rustc_lexer/src/lib.rs).
4
5use memchr::memmem;
6use solar_ast::{Base, StrKind};
7use solar_data_structures::hint::unlikely;
8use std::sync::OnceLock;
9
10pub mod token;
11use token::{RawLiteralKind, RawToken, RawTokenKind};
12
13#[cfg(test)]
14mod tests;
15
16/// Returns `true` if the given character is considered a whitespace.
17#[inline]
18pub const fn is_whitespace(c: char) -> bool {
19    is_whitespace_byte(ch2u8(c))
20}
/// Returns `true` if the given byte is considered whitespace.
///
/// Whitespace bytes are the ASCII space, horizontal tab, line feed and carriage return.
#[inline]
pub const fn is_whitespace_byte(c: u8) -> bool {
    c == b' ' || c == b'\t' || c == b'\n' || c == b'\r'
}
26
27/// Returns `true` if the given character is valid at the start of a Solidity identifier.
28#[inline]
29pub const fn is_id_start(c: char) -> bool {
30    is_id_start_byte(ch2u8(c))
31}
/// Returns `true` if the given byte is valid at the start of a Solidity identifier.
///
/// Accepted bytes are the ASCII letters, `_` and `$`.
#[inline]
pub const fn is_id_start_byte(c: u8) -> bool {
    c.is_ascii_alphabetic() || c == b'_' || c == b'$'
}
37
38/// Returns `true` if the given character is valid in a Solidity identifier.
39#[inline]
40pub const fn is_id_continue(c: char) -> bool {
41    is_id_continue_byte(ch2u8(c))
42}
/// Returns `true` if the given byte is valid inside a Solidity identifier.
///
/// Accepted bytes are the ASCII letters, the ASCII digits, `_` and `$`.
#[inline]
pub const fn is_id_continue_byte(c: u8) -> bool {
    c.is_ascii_alphanumeric() || c == b'_' || c == b'$'
}
48
49/// Returns `true` if the given string is a valid Solidity identifier.
50///
51/// An identifier in Solidity has to start with a letter, a dollar-sign or an underscore and may
52/// additionally contain numbers after the first symbol.
53///
54/// Reference: <https://docs.soliditylang.org/en/latest/grammar.html#a4.SolidityLexer.Identifier>
55#[inline]
56pub const fn is_ident(s: &str) -> bool {
57    is_ident_bytes(s.as_bytes())
58}
59
60/// Returns `true` if the given byte slice is a valid Solidity identifier.
61///
62/// See [`is_ident`] for more details.
63pub const fn is_ident_bytes(s: &[u8]) -> bool {
64    // Note: valid idents can only contain ASCII characters, so we can use the byte representation
65    // here.
66    let [first, ref rest @ ..] = *s else {
67        return false;
68    };
69
70    if !is_id_start_byte(first) {
71        return false;
72    }
73
74    let mut i = 0;
75    while i < rest.len() {
76        if !is_id_continue_byte(rest[i]) {
77            return false;
78        }
79        i += 1;
80    }
81
82    true
83}
84
/// Converts a `char` to a `u8` for use with the ASCII-only byte classifiers.
///
/// ASCII characters map to their own byte value. Every non-ASCII character maps to `0xFF`,
/// a byte that none of `is_whitespace_byte`, `is_id_start_byte` and `is_id_continue_byte`
/// accepts.
///
/// A plain truncating cast must not be used here: it keeps only the low 8 bits of the code
/// point, so e.g. U+0161 (`š`) would become `b'a'` and U+0120 (`Ġ`) would become `b' '`,
/// making the `char` classifiers accept characters that are not valid in Solidity source.
#[inline(always)]
const fn ch2u8(c: char) -> u8 {
    if c.is_ascii() {
        c as u8
    } else {
        // Never matched by any classifier, and never a valid UTF-8 byte either.
        0xFF
    }
}
90
// Sentinel byte returned by the peeking helpers when the requested position is past the end
// of the input. A NUL byte can also occur in the input itself, so seeing this value is not
// proof of end-of-input; `is_eof` must be used for that.
const EOF: u8 = b'\0';
92
/// Peekable iterator over the bytes of a source string.
///
/// The next bytes can be peeked via the `first` and `second` methods,
/// and the position can be shifted forward via the `bump` method.
///
/// Iterating over a `Cursor` yields raw tokens until the end of input is reached.
#[derive(Clone, Debug)]
pub struct Cursor<'a> {
    /// The input bytes that have not been consumed yet.
    bytes: std::slice::Iter<'a, u8>,
}
101
102impl<'a> Cursor<'a> {
103    /// Creates a new cursor over the given input string slice.
104    pub fn new(input: &'a str) -> Self {
105        Cursor { bytes: input.as_bytes().iter() }
106    }
107
108    /// Parses a token from the input string.
109    pub fn advance_token(&mut self) -> RawToken {
110        // Use the pointer instead of the length to track how many bytes were consumed, since
111        // internally the iterator is a pair of `start` and `end` pointers.
112        let start = self.as_ptr();
113
114        let first_char = match self.bump_ret() {
115            Some(c) => c,
116            None => return RawToken::EOF,
117        };
118        let token_kind = self.advance_token_kind(first_char);
119
120        // SAFETY: `start` points to the same string.
121        let len = unsafe { self.as_ptr().offset_from_unsigned(start) };
122
123        RawToken::new(token_kind, len as u32)
124    }
125
    /// Determines the kind of the token that starts with `first_char`, consuming the rest of
    /// the token from the input.
    ///
    /// `first_char` is the byte that was just consumed by the caller. Arm order matters: the
    /// guarded `b'.'` arm (rational literal such as `.5`) must come before the plain `b'.'`
    /// arm.
    #[inline]
    fn advance_token_kind(&mut self, first_char: u8) -> RawTokenKind {
        match first_char {
            // Slash, line comment or block comment, decided by the byte after the `/`.
            b'/' => match self.first() {
                b'/' => self.line_comment(),
                b'*' => self.block_comment(),
                _ => RawTokenKind::Slash,
            },

            // Whitespace sequence.
            c if is_whitespace_byte(c) => self.whitespace(),

            // Identifier, or a `hex`/`unicode` prefixed string literal, which starts like an
            // identifier and is told apart inside `ident_or_prefixed_literal`.
            c if is_id_start_byte(c) => self.ident_or_prefixed_literal(c),

            // Numeric literal.
            b'0'..=b'9' => {
                let kind = self.number(first_char);
                RawTokenKind::Literal { kind }
            }
            // A dot directly followed by a digit starts a rational literal (`.5`).
            b'.' if self.first().is_ascii_digit() => {
                let kind = self.rational_number_after_dot(Base::Decimal);
                RawTokenKind::Literal { kind }
            }

            // One-symbol tokens. Multi-byte operators are not formed here; each byte becomes
            // its own token.
            b';' => RawTokenKind::Semi,
            b',' => RawTokenKind::Comma,
            b'.' => RawTokenKind::Dot,
            b'(' => RawTokenKind::OpenParen,
            b')' => RawTokenKind::CloseParen,
            b'{' => RawTokenKind::OpenBrace,
            b'}' => RawTokenKind::CloseBrace,
            b'[' => RawTokenKind::OpenBracket,
            b']' => RawTokenKind::CloseBracket,
            b'~' => RawTokenKind::Tilde,
            b'?' => RawTokenKind::Question,
            b':' => RawTokenKind::Colon,
            b'=' => RawTokenKind::Eq,
            b'!' => RawTokenKind::Bang,
            b'<' => RawTokenKind::Lt,
            b'>' => RawTokenKind::Gt,
            b'-' => RawTokenKind::Minus,
            b'&' => RawTokenKind::And,
            b'|' => RawTokenKind::Or,
            b'+' => RawTokenKind::Plus,
            b'*' => RawTokenKind::Star,
            b'^' => RawTokenKind::Caret,
            b'%' => RawTokenKind::Percent,

            // String literal; the opening quote decides which quote terminates it.
            b'\'' | b'"' => {
                let terminated = self.eat_string(first_char);
                let kind = RawLiteralKind::Str { kind: StrKind::Str, terminated };
                RawTokenKind::Literal { kind }
            }

            // Anything else is unknown. For a non-ASCII lead byte, the remaining bytes of the
            // UTF-8 sequence are skipped so the whole character forms a single token.
            _ => {
                if unlikely(!first_char.is_ascii()) {
                    self.bump_utf8_with(first_char);
                }
                RawTokenKind::Unknown
            }
        }
    }
192
193    #[inline(never)]
194    fn line_comment(&mut self) -> RawTokenKind {
195        debug_assert!(self.prev() == b'/' && self.first() == b'/');
196        self.bump();
197
198        // `////` (more than 3 slashes) is not considered a doc comment.
199        let is_doc = matches!(self.first(), b'/' if self.second() != b'/');
200
201        // Take into account Windows line ending (CRLF)
202        self.eat_until_either(b'\n', b'\r');
203        RawTokenKind::LineComment { is_doc }
204    }
205
206    #[inline(never)]
207    fn block_comment(&mut self) -> RawTokenKind {
208        debug_assert!(self.prev() == b'/' && self.first() == b'*');
209        self.bump();
210
211        // `/***` (more than 2 stars) is not considered a doc comment.
212        // `/**/` is not considered a doc comment.
213        let is_doc = matches!(self.first(), b'*' if !matches!(self.second(), b'*' | b'/'));
214
215        let b = self.as_bytes();
216        static FINDER: OnceLock<memmem::Finder<'static>> = OnceLock::new();
217        let (terminated, n) = FINDER
218            .get_or_init(|| memmem::Finder::new(b"*/"))
219            .find(b)
220            .map_or((false, b.len()), |pos| (true, pos + 2));
221        self.ignore_bytes(n);
222
223        RawTokenKind::BlockComment { is_doc, terminated }
224    }
225
226    fn whitespace(&mut self) -> RawTokenKind {
227        debug_assert!(is_whitespace_byte(self.prev()));
228        self.eat_while(is_whitespace_byte);
229        RawTokenKind::Whitespace
230    }
231
232    fn ident_or_prefixed_literal(&mut self, first: u8) -> RawTokenKind {
233        debug_assert!(is_id_start_byte(self.prev()));
234
235        // Start is already eaten, eat the rest of identifier.
236        let start = self.as_ptr();
237        self.eat_while(is_id_continue_byte);
238
239        // Check if the identifier is a string literal prefix.
240        if unlikely(matches!(first, b'h' | b'u')) {
241            // SAFETY: within bounds and lifetime of `self.chars`.
242            let id = unsafe {
243                let start = start.sub(1);
244                std::slice::from_raw_parts(start, self.as_ptr().offset_from_unsigned(start))
245            };
246            let is_hex = id == b"hex";
247            if is_hex || id == b"unicode" {
248                if let quote @ (b'\'' | b'"') = self.first() {
249                    self.bump();
250                    let terminated = self.eat_string(quote);
251                    let kind = if is_hex { StrKind::Hex } else { StrKind::Unicode };
252                    return RawTokenKind::Literal {
253                        kind: RawLiteralKind::Str { kind, terminated },
254                    };
255                }
256            }
257        }
258
259        RawTokenKind::Ident
260    }
261
262    fn number(&mut self, first_digit: u8) -> RawLiteralKind {
263        debug_assert!(self.prev().is_ascii_digit());
264        let mut base = Base::Decimal;
265        if first_digit == b'0' {
266            // Attempt to parse encoding base.
267            let has_digits = match self.first() {
268                b'b' => {
269                    base = Base::Binary;
270                    self.bump();
271                    self.eat_decimal_digits()
272                }
273                b'o' => {
274                    base = Base::Octal;
275                    self.bump();
276                    self.eat_decimal_digits()
277                }
278                b'x' => {
279                    base = Base::Hexadecimal;
280                    self.bump();
281                    self.eat_hexadecimal_digits()
282                }
283                // Not a base prefix.
284                b'0'..=b'9' | b'_' | b'.' | b'e' | b'E' => {
285                    self.eat_decimal_digits();
286                    true
287                }
288                // Just a 0.
289                _ => return RawLiteralKind::Int { base, empty_int: false },
290            };
291            // Base prefix was provided, but there were no digits after it, e.g. "0x".
292            if !has_digits {
293                return RawLiteralKind::Int { base, empty_int: true };
294            }
295        } else {
296            // No base prefix, parse number in the usual way.
297            self.eat_decimal_digits();
298        };
299
300        match self.first() {
301            // Don't be greedy if this is actually an integer literal followed by field/method
302            // access (`12.foo()`).
303            // `_` is special cased, we assume it's always an invalid rational: https://github.com/ethereum/solidity/blob/c012b725bb8ce755b93ce0dd05e83c34c499acd6/liblangutil/Scanner.cpp#L979
304            b'.' if !is_id_start_byte(self.second()) || self.second() == b'_' => {
305                self.bump();
306                self.rational_number_after_dot(base)
307            }
308            b'e' | b'E' => {
309                self.bump();
310                let empty_exponent = !self.eat_exponent();
311                RawLiteralKind::Rational { base, empty_exponent }
312            }
313            _ => RawLiteralKind::Int { base, empty_int: false },
314        }
315    }
316
317    #[cold]
318    fn rational_number_after_dot(&mut self, base: Base) -> RawLiteralKind {
319        self.eat_decimal_digits();
320        let empty_exponent = match self.first() {
321            b'e' | b'E' => {
322                self.bump();
323                !self.eat_exponent()
324            }
325            _ => false,
326        };
327        RawLiteralKind::Rational { base, empty_exponent }
328    }
329
330    /// Eats a string until the given quote character. Returns `true` if the string was terminated.
331    fn eat_string(&mut self, quote: u8) -> bool {
332        debug_assert_eq!(self.prev(), quote);
333        while let Some(c) = self.bump_ret() {
334            if c == quote {
335                return true;
336            }
337            if c == b'\\' {
338                let first = self.first();
339                if first == b'\\' || first == quote {
340                    // Bump again to skip escaped character.
341                    self.bump();
342                }
343            }
344        }
345        // End of file reached.
346        false
347    }
348
349    /// Eats characters for a decimal number. Returns `true` if any digits were encountered.
350    fn eat_decimal_digits(&mut self) -> bool {
351        self.eat_digits(|x| x.is_ascii_digit())
352    }
353
354    /// Eats characters for a hexadecimal number. Returns `true` if any digits were encountered.
355    fn eat_hexadecimal_digits(&mut self) -> bool {
356        self.eat_digits(|x| x.is_ascii_hexdigit())
357    }
358
359    fn eat_digits(&mut self, mut is_digit: impl FnMut(u8) -> bool) -> bool {
360        let mut has_digits = false;
361        loop {
362            match self.first() {
363                b'_' => {
364                    self.bump();
365                }
366                c if is_digit(c) => {
367                    has_digits = true;
368                    self.bump();
369                }
370                _ => break,
371            }
372        }
373        has_digits
374    }
375
376    /// Eats the exponent. Returns `true` if any digits were encountered.
377    fn eat_exponent(&mut self) -> bool {
378        debug_assert!(self.prev() == b'e' || self.prev() == b'E');
379        // b'+' is not a valid prefix for an exponent.
380        if self.first() == b'-' {
381            self.bump();
382        }
383        self.eat_decimal_digits()
384    }
385
386    /// Returns the remaining input as a string slice.
387    #[inline]
388    #[deprecated = "use `as_bytes` instead; utf-8 is not guaranteed anymore"]
389    pub fn as_str(&self) -> &'a str {
390        // SAFETY: Not safe.
391        unsafe { std::str::from_utf8_unchecked(self.bytes.as_slice()) }
392    }
393
394    /// Returns the remaining input as a byte slice.
395    #[inline]
396    pub fn as_bytes(&self) -> &'a [u8] {
397        self.bytes.as_slice()
398    }
399
400    /// Returns the pointer to the first byte of the remaining input.
401    #[inline]
402    pub fn as_ptr(&self) -> *const u8 {
403        self.bytes.as_slice().as_ptr()
404    }
405
406    /// Returns the last eaten byte.
407    #[inline]
408    fn prev(&self) -> u8 {
409        // SAFETY: We always bump at least one character before calling this method.
410        unsafe { *self.as_ptr().sub(1) }
411    }
412
413    /// Peeks the next byte from the input stream without consuming it.
414    /// If requested position doesn't exist, `EOF` is returned.
415    /// However, getting `EOF` doesn't always mean actual end of file,
416    /// it should be checked with `is_eof` method.
417    #[inline]
418    fn first(&self) -> u8 {
419        self.peek_byte(0)
420    }
421
422    /// Peeks the second byte from the input stream without consuming it.
423    #[inline]
424    fn second(&self) -> u8 {
425        // This function is only called after `first` was called and checked, so in practice it
426        // doesn't matter if it's part of the first UTF-8 character.
427        self.peek_byte(1)
428    }
429
430    // Do not use directly.
431    #[doc(hidden)]
432    #[inline]
433    fn peek_byte(&self, index: usize) -> u8 {
434        self.as_bytes().get(index).copied().unwrap_or(EOF)
435    }
436
437    /// Checks if there is nothing more to consume.
438    #[inline]
439    fn is_eof(&self) -> bool {
440        self.as_bytes().is_empty()
441    }
442
443    /// Moves to the next character.
444    fn bump(&mut self) {
445        self.bytes.next();
446    }
447
448    /// Skips to the end of the current UTF-8 character sequence, with `x` as the first byte.
449    ///
450    /// Assumes that `x` is the previously consumed byte.
451    #[cold]
452    #[allow(clippy::match_overlapping_arm)]
453    fn bump_utf8_with(&mut self, x: u8) {
454        debug_assert_eq!(self.prev(), x);
455        let skip = match x {
456            ..0x80 => 0,
457            ..0xE0 => 1,
458            ..0xF0 => 2,
459            _ => 3,
460        };
461        // NOTE: The internal iterator was created with from valid UTF-8 string, so we can freely
462        // skip bytes here without checking bounds.
463        self.ignore_bytes(skip);
464    }
465
466    /// Moves to the next character, returning the current one.
467    fn bump_ret(&mut self) -> Option<u8> {
468        let c = self.as_bytes().first().copied();
469        self.bytes.next();
470        c
471    }
472
473    /// Advances `n` bytes.
474    #[inline]
475    #[cfg_attr(debug_assertions, track_caller)]
476    fn ignore_bytes(&mut self, n: usize) {
477        debug_assert!(n <= self.as_bytes().len());
478        self.bytes = unsafe { self.as_bytes().get_unchecked(n..) }.iter();
479    }
480
481    /// Eats symbols until `ch1` or `ch2` is found or until the end of file is reached.
482    ///
483    /// Returns `true` if `ch1` or `ch2` was found, `false` if the end of file was reached.
484    #[inline]
485    fn eat_until_either(&mut self, ch1: u8, ch2: u8) -> bool {
486        let b = self.as_bytes();
487        let res = memchr::memchr2(ch1, ch2, b);
488        self.ignore_bytes(res.unwrap_or(b.len()));
489        res.is_some()
490    }
491
492    /// Eats symbols while predicate returns true or until the end of file is reached.
493    #[inline]
494    fn eat_while(&mut self, mut predicate: impl FnMut(u8) -> bool) {
495        while predicate(self.first()) && !self.is_eof() {
496            self.bump();
497        }
498    }
499}
500
501impl Iterator for Cursor<'_> {
502    type Item = RawToken;
503
504    #[inline]
505    fn next(&mut self) -> Option<Self::Item> {
506        let token = self.advance_token();
507        if token.kind == RawTokenKind::Eof {
508            None
509        } else {
510            Some(token)
511        }
512    }
513}
514
// Once the input is exhausted, `advance_token` keeps returning the end-of-file token, so
// `next` keeps returning `None`.
impl std::iter::FusedIterator for Cursor<'_> {}