solar_parse/lexer/cursor/
mod.rs

1//! Low-level Solidity lexer.
2//!
3//! Modified from Rust's [`rustc_lexer`](https://github.com/rust-lang/rust/blob/45749b21b7fd836f6c4f11dd40376f7c83e2791b/compiler/rustc_lexer/src/lib.rs).
4
5use memchr::memmem;
6use solar_ast::{Base, StrKind};
7use solar_data_structures::hint::unlikely;
8use std::sync::OnceLock;
9
10pub mod token;
11use token::{RawLiteralKind, RawToken, RawTokenKind};
12
13#[cfg(test)]
14mod tests;
15
16/// Returns `true` if the given character is considered a whitespace.
17#[inline]
18pub const fn is_whitespace(c: char) -> bool {
19    is_whitespace_byte(ch2u8(c))
20}
/// Reports whether the byte `c` is whitespace: space, horizontal tab, line feed or carriage
/// return.
#[inline]
pub const fn is_whitespace_byte(c: u8) -> bool {
    c == b' ' || c == b'\t' || c == b'\n' || c == b'\r'
}
26
27/// Returns `true` if the given character is valid at the start of a Solidity identifier.
28#[inline]
29pub const fn is_id_start(c: char) -> bool {
30    is_id_start_byte(ch2u8(c))
31}
/// Reports whether the byte `c` may begin a Solidity identifier: an ASCII letter, `_` or `$`.
#[inline]
pub const fn is_id_start_byte(c: u8) -> bool {
    c.is_ascii_alphabetic() || matches!(c, b'_' | b'$')
}
37
38/// Returns `true` if the given character is valid in a Solidity identifier.
39#[inline]
40pub const fn is_id_continue(c: char) -> bool {
41    is_id_continue_byte(ch2u8(c))
42}
43/// Returns `true` if the given character is valid in a Solidity identifier.
44#[inline]
45pub const fn is_id_continue_byte(c: u8) -> bool {
46    let is_number = (c >= b'0') & (c <= b'9');
47    is_id_start_byte(c) || is_number
48}
49
50/// Returns `true` if the given string is a valid Solidity identifier.
51///
52/// An identifier in Solidity has to start with a letter, a dollar-sign or an underscore and may
53/// additionally contain numbers after the first symbol.
54///
55/// Reference: <https://docs.soliditylang.org/en/latest/grammar.html#a4.SolidityLexer.Identifier>
56#[inline]
57pub const fn is_ident(s: &str) -> bool {
58    is_ident_bytes(s.as_bytes())
59}
60
61/// Returns `true` if the given byte slice is a valid Solidity identifier.
62///
63/// See [`is_ident`] for more details.
64pub const fn is_ident_bytes(s: &[u8]) -> bool {
65    let [first, ref rest @ ..] = *s else {
66        return false;
67    };
68
69    if !is_id_start_byte(first) {
70        return false;
71    }
72
73    let mut i = 0;
74    while i < rest.len() {
75        if !is_id_continue_byte(rest[i]) {
76            return false;
77        }
78        i += 1;
79    }
80
81    true
82}
83
/// Converts a `char` to a `u8` for use with the byte-based classifiers.
///
/// ASCII characters map to their own byte value. Every non-ASCII character maps to `0xFF`, a
/// non-ASCII byte, so it can never be mistaken for an ASCII character.
///
/// A plain truncating cast (`c as u32 as u8`) must not be used here: it keeps only the low eight
/// bits of the code point, so e.g. U+0120 would alias to `b' '` and U+0141 to `b'A'`.
#[inline(always)]
const fn ch2u8(c: char) -> u8 {
    if c.is_ascii() { c as u8 } else { 0xFF }
}
89
/// Sentinel byte produced by `Cursor::peek_byte` (and so by `first`/`second`) when the requested
/// position lies past the end of the input. A literal NUL byte in the input has the same value.
const EOF: u8 = b'\0';
91
/// Byte-oriented cursor over the source text being lexed.
///
/// Upcoming bytes can be inspected without consuming them through the `first` method, and the
/// cursor is moved forward through the `bump` method.
#[derive(Clone, Debug)]
pub struct Cursor<'a> {
    /// The input that has not been consumed yet.
    bytes: std::slice::Iter<'a, u8>,
}
100
101impl<'a> Cursor<'a> {
102    /// Creates a new cursor over the given input string slice.
103    #[inline]
104    pub fn new(input: &'a str) -> Self {
105        Cursor { bytes: input.as_bytes().iter() }
106    }
107
108    /// Creates a new iterator that also returns the position of each token in the input string.
109    ///
110    /// Note that the position currently always starts at 0 when this method is called, so if called
111    /// after tokens are parsed the position will be relative to when this method is called, not the
112    /// beginning of the string.
113    #[inline]
114    pub fn with_position(self) -> CursorWithPosition<'a> {
115        CursorWithPosition::new(self)
116    }
117
118    /// Parses a token from the input string.
119    pub fn advance_token(&mut self) -> RawToken {
120        // Use the pointer instead of the length to track how many bytes were consumed, since
121        // internally the iterator is a pair of `start` and `end` pointers.
122        let start = self.as_ptr();
123
124        let Some(first_char) = self.bump_ret() else { return RawToken::EOF };
125        let token_kind = self.advance_token_kind(first_char);
126
127        // SAFETY: `start` points to the same string.
128        let len = unsafe { self.as_ptr().offset_from_unsigned(start) };
129
130        RawToken::new(token_kind, len as u32)
131    }
132
133    #[inline]
134    fn advance_token_kind(&mut self, first_char: u8) -> RawTokenKind {
135        match first_char {
136            // Slash, comment or block comment.
137            b'/' => match self.first() {
138                b'/' => self.line_comment(),
139                b'*' => self.block_comment(),
140                _ => RawTokenKind::Slash,
141            },
142
143            // Whitespace sequence.
144            c if is_whitespace_byte(c) => self.whitespace(),
145
146            // Identifier (this should be checked after other variant that can start as identifier).
147            c if is_id_start_byte(c) => self.ident_or_prefixed_literal(c),
148
149            // Numeric literal.
150            b'0'..=b'9' => {
151                let kind = self.number(first_char);
152                RawTokenKind::Literal { kind }
153            }
154            b'.' if self.first().is_ascii_digit() => {
155                let kind = self.rational_number_after_dot(Base::Decimal);
156                RawTokenKind::Literal { kind }
157            }
158
159            // One-symbol tokens.
160            b';' => RawTokenKind::Semi,
161            b',' => RawTokenKind::Comma,
162            b'.' => RawTokenKind::Dot,
163            b'(' => RawTokenKind::OpenParen,
164            b')' => RawTokenKind::CloseParen,
165            b'{' => RawTokenKind::OpenBrace,
166            b'}' => RawTokenKind::CloseBrace,
167            b'[' => RawTokenKind::OpenBracket,
168            b']' => RawTokenKind::CloseBracket,
169            b'~' => RawTokenKind::Tilde,
170            b'?' => RawTokenKind::Question,
171            b':' => RawTokenKind::Colon,
172            b'=' => RawTokenKind::Eq,
173            b'!' => RawTokenKind::Bang,
174            b'<' => RawTokenKind::Lt,
175            b'>' => RawTokenKind::Gt,
176            b'-' => RawTokenKind::Minus,
177            b'&' => RawTokenKind::And,
178            b'|' => RawTokenKind::Or,
179            b'+' => RawTokenKind::Plus,
180            b'*' => RawTokenKind::Star,
181            b'^' => RawTokenKind::Caret,
182            b'%' => RawTokenKind::Percent,
183
184            // String literal.
185            b'\'' | b'"' => {
186                let terminated = self.eat_string(first_char);
187                let kind = RawLiteralKind::Str { kind: StrKind::Str, terminated };
188                RawTokenKind::Literal { kind }
189            }
190
191            _ => {
192                if unlikely(!first_char.is_ascii()) {
193                    self.bump_utf8_with(first_char);
194                }
195                RawTokenKind::Unknown
196            }
197        }
198    }
199
200    #[inline(never)]
201    fn line_comment(&mut self) -> RawTokenKind {
202        debug_assert!(self.prev() == b'/' && self.first() == b'/');
203        self.bump();
204
205        // `////` (more than 3 slashes) is not considered a doc comment.
206        let is_doc = matches!(self.first(), b'/' if self.second() != b'/');
207
208        // Take into account Windows line ending (CRLF)
209        self.eat_until_either(b'\n', b'\r');
210        RawTokenKind::LineComment { is_doc }
211    }
212
213    #[inline(never)]
214    fn block_comment(&mut self) -> RawTokenKind {
215        debug_assert!(self.prev() == b'/' && self.first() == b'*');
216        self.bump();
217
218        // `/***` (more than 2 stars) is not considered a doc comment.
219        // `/**/` is not considered a doc comment.
220        let is_doc = matches!(self.first(), b'*' if !matches!(self.second(), b'*' | b'/'));
221
222        let b = self.as_bytes();
223        static FINDER: OnceLock<memmem::Finder<'static>> = OnceLock::new();
224        let (terminated, n) = FINDER
225            .get_or_init(|| memmem::Finder::new(b"*/"))
226            .find(b)
227            .map_or((false, b.len()), |pos| (true, pos + 2));
228        self.ignore_bytes(n);
229
230        RawTokenKind::BlockComment { is_doc, terminated }
231    }
232
233    fn whitespace(&mut self) -> RawTokenKind {
234        debug_assert!(is_whitespace_byte(self.prev()));
235        self.eat_while(is_whitespace_byte);
236        RawTokenKind::Whitespace
237    }
238
239    fn ident_or_prefixed_literal(&mut self, first: u8) -> RawTokenKind {
240        debug_assert!(is_id_start_byte(self.prev()));
241
242        // Start is already eaten, eat the rest of identifier.
243        let start = self.as_ptr();
244        self.eat_while(is_id_continue_byte);
245
246        // Check if the identifier is a string literal prefix.
247        if unlikely(matches!(first, b'h' | b'u')) {
248            // SAFETY: within bounds and lifetime of `self.chars`.
249            let id = unsafe {
250                let start = start.sub(1);
251                std::slice::from_raw_parts(start, self.as_ptr().offset_from_unsigned(start))
252            };
253            let is_hex = id == b"hex";
254            if (is_hex || id == b"unicode")
255                && let quote @ (b'\'' | b'"') = self.first()
256            {
257                self.bump();
258                let terminated = self.eat_string(quote);
259                let kind = if is_hex { StrKind::Hex } else { StrKind::Unicode };
260                return RawTokenKind::Literal { kind: RawLiteralKind::Str { kind, terminated } };
261            }
262        }
263
264        RawTokenKind::Ident
265    }
266
267    fn number(&mut self, first_digit: u8) -> RawLiteralKind {
268        debug_assert!(self.prev().is_ascii_digit());
269        let mut base = Base::Decimal;
270        if first_digit == b'0' {
271            // Attempt to parse encoding base.
272            let has_digits = match self.first() {
273                b'b' => {
274                    base = Base::Binary;
275                    self.bump();
276                    self.eat_decimal_digits()
277                }
278                b'o' => {
279                    base = Base::Octal;
280                    self.bump();
281                    self.eat_decimal_digits()
282                }
283                b'x' => {
284                    base = Base::Hexadecimal;
285                    self.bump();
286                    self.eat_hexadecimal_digits()
287                }
288                // Not a base prefix.
289                b'0'..=b'9' | b'_' | b'.' | b'e' | b'E' => {
290                    self.eat_decimal_digits();
291                    true
292                }
293                // Just a 0.
294                _ => return RawLiteralKind::Int { base, empty_int: false },
295            };
296            // Base prefix was provided, but there were no digits after it, e.g. "0x".
297            if !has_digits {
298                return RawLiteralKind::Int { base, empty_int: true };
299            }
300        } else {
301            // No base prefix, parse number in the usual way.
302            self.eat_decimal_digits();
303        };
304
305        match self.first() {
306            // Don't be greedy if this is actually an integer literal followed by field/method
307            // access (`12.foo()`).
308            // `_` is special cased, we assume it's always an invalid rational: https://github.com/ethereum/solidity/blob/c012b725bb8ce755b93ce0dd05e83c34c499acd6/liblangutil/Scanner.cpp#L979
309            b'.' if !is_id_start_byte(self.second()) || self.second() == b'_' => {
310                self.bump();
311                self.rational_number_after_dot(base)
312            }
313            b'e' | b'E' => {
314                self.bump();
315                let empty_exponent = !self.eat_exponent();
316                RawLiteralKind::Rational { base, empty_exponent }
317            }
318            _ => RawLiteralKind::Int { base, empty_int: false },
319        }
320    }
321
322    #[cold]
323    fn rational_number_after_dot(&mut self, base: Base) -> RawLiteralKind {
324        self.eat_decimal_digits();
325        let empty_exponent = match self.first() {
326            b'e' | b'E' => {
327                self.bump();
328                !self.eat_exponent()
329            }
330            _ => false,
331        };
332        RawLiteralKind::Rational { base, empty_exponent }
333    }
334
335    /// Eats a string until the given quote character. Returns `true` if the string was terminated.
336    fn eat_string(&mut self, quote: u8) -> bool {
337        debug_assert_eq!(self.prev(), quote);
338        while let Some(c) = self.bump_ret() {
339            if c == quote {
340                return true;
341            }
342            if c == b'\\' {
343                let first = self.first();
344                if first == b'\\' || first == quote {
345                    // Bump again to skip escaped character.
346                    self.bump();
347                }
348            }
349        }
350        // End of file reached.
351        false
352    }
353
354    /// Eats characters for a decimal number. Returns `true` if any digits were encountered.
355    fn eat_decimal_digits(&mut self) -> bool {
356        self.eat_digits(|x| x.is_ascii_digit())
357    }
358
359    /// Eats characters for a hexadecimal number. Returns `true` if any digits were encountered.
360    fn eat_hexadecimal_digits(&mut self) -> bool {
361        self.eat_digits(|x| x.is_ascii_hexdigit())
362    }
363
364    fn eat_digits(&mut self, mut is_digit: impl FnMut(u8) -> bool) -> bool {
365        let mut has_digits = false;
366        loop {
367            match self.first() {
368                b'_' => {
369                    self.bump();
370                }
371                c if is_digit(c) => {
372                    has_digits = true;
373                    self.bump();
374                }
375                _ => break,
376            }
377        }
378        has_digits
379    }
380
381    /// Eats the exponent. Returns `true` if any digits were encountered.
382    fn eat_exponent(&mut self) -> bool {
383        debug_assert!(self.prev() == b'e' || self.prev() == b'E');
384        // b'+' is not a valid prefix for an exponent.
385        if self.first() == b'-' {
386            self.bump();
387        }
388        self.eat_decimal_digits()
389    }
390
391    /// Returns the remaining input as a string slice.
392    #[inline]
393    #[deprecated = "use `as_bytes` instead; utf-8 is not guaranteed anymore"]
394    pub fn as_str(&self) -> &'a str {
395        // SAFETY: Not safe.
396        unsafe { std::str::from_utf8_unchecked(self.bytes.as_slice()) }
397    }
398
399    /// Returns the remaining input as a byte slice.
400    #[inline]
401    pub fn as_bytes(&self) -> &'a [u8] {
402        self.bytes.as_slice()
403    }
404
405    /// Returns the pointer to the first byte of the remaining input.
406    #[inline]
407    pub fn as_ptr(&self) -> *const u8 {
408        self.bytes.as_slice().as_ptr()
409    }
410
411    /// Returns the last eaten byte.
412    #[inline]
413    fn prev(&self) -> u8 {
414        // SAFETY: We always bump at least one character before calling this method.
415        unsafe { *self.as_ptr().sub(1) }
416    }
417
418    /// Peeks the next byte from the input stream without consuming it.
419    /// If requested position doesn't exist, `EOF` is returned.
420    /// However, getting `EOF` doesn't always mean actual end of file,
421    /// it should be checked with `is_eof` method.
422    #[inline]
423    fn first(&self) -> u8 {
424        self.peek_byte(0)
425    }
426
427    /// Peeks the second byte from the input stream without consuming it.
428    #[inline]
429    fn second(&self) -> u8 {
430        // This function is only called after `first` was called and checked, so in practice it
431        // doesn't matter if it's part of the first UTF-8 character.
432        self.peek_byte(1)
433    }
434
435    // Do not use directly.
436    #[doc(hidden)]
437    #[inline]
438    fn peek_byte(&self, index: usize) -> u8 {
439        self.as_bytes().get(index).copied().unwrap_or(EOF)
440    }
441
442    /// Moves to the next character.
443    fn bump(&mut self) {
444        self.bytes.next();
445    }
446
447    /// Skips to the end of the current UTF-8 character sequence, with `x` as the first byte.
448    ///
449    /// Assumes that `x` is the previously consumed byte.
450    #[cold]
451    #[allow(clippy::match_overlapping_arm)]
452    fn bump_utf8_with(&mut self, x: u8) {
453        debug_assert_eq!(self.prev(), x);
454        let skip = match x {
455            ..0x80 => 0,
456            ..0xE0 => 1,
457            ..0xF0 => 2,
458            _ => 3,
459        };
460        // NOTE: The internal iterator was created with from valid UTF-8 string, so we can freely
461        // skip bytes here without checking bounds.
462        self.ignore_bytes(skip);
463    }
464
465    /// Moves to the next character, returning the current one.
466    fn bump_ret(&mut self) -> Option<u8> {
467        let c = self.as_bytes().first().copied();
468        self.bytes.next();
469        c
470    }
471
472    /// Advances `n` bytes.
473    #[inline]
474    #[cfg_attr(debug_assertions, track_caller)]
475    fn ignore_bytes(&mut self, n: usize) {
476        debug_assert!(n <= self.as_bytes().len());
477        self.bytes = unsafe { self.as_bytes().get_unchecked(n..) }.iter();
478    }
479
480    /// Eats symbols until `ch1` or `ch2` is found or until the end of file is reached.
481    ///
482    /// Returns `true` if `ch1` or `ch2` was found, `false` if the end of file was reached.
483    #[inline]
484    fn eat_until_either(&mut self, ch1: u8, ch2: u8) -> bool {
485        let b = self.as_bytes();
486        let res = memchr::memchr2(ch1, ch2, b);
487        self.ignore_bytes(res.unwrap_or(b.len()));
488        res.is_some()
489    }
490
491    /// Eats symbols while predicate returns true or until the end of file is reached.
492    #[inline]
493    fn eat_while(&mut self, mut predicate: impl FnMut(u8) -> bool) {
494        while predicate(self.first()) {
495            self.bump();
496        }
497    }
498}
499
500impl Iterator for Cursor<'_> {
501    type Item = RawToken;
502
503    #[inline]
504    fn next(&mut self) -> Option<Self::Item> {
505        let token = self.advance_token();
506        if token.kind == RawTokenKind::Eof { None } else { Some(token) }
507    }
508}
509
// `next` returns `None` whenever `advance_token` reports end of file, which it does every time the
// remaining input is empty.
impl std::iter::FusedIterator for Cursor<'_> {}
511
512/// [`Cursor`] that also tracks the position of each token in the input string.
513///
514/// Created by calling [`Cursor::with_position`]. See that method and [`Cursor`] for more details.
515#[derive(Clone, Debug)]
516pub struct CursorWithPosition<'a> {
517    cursor: Cursor<'a>,
518    position: u32,
519}
520
521impl<'a> CursorWithPosition<'a> {
522    /// Creates a new cursor with position tracking from the given cursor.
523    #[inline]
524    fn new(cursor: Cursor<'a>) -> Self {
525        CursorWithPosition { cursor, position: 0 }
526    }
527
528    /// Returns a reference to the inner cursor.
529    #[inline]
530    pub fn inner(&self) -> &Cursor<'a> {
531        &self.cursor
532    }
533
534    /// Returns a mutable reference to the inner cursor.
535    #[inline]
536    pub fn inner_mut(&mut self) -> &mut Cursor<'a> {
537        &mut self.cursor
538    }
539
540    /// Returns the current position in the input string.
541    #[inline]
542    pub fn position(&self) -> usize {
543        self.position as usize
544    }
545}
546
547impl Iterator for CursorWithPosition<'_> {
548    type Item = (usize, RawToken);
549
550    #[inline]
551    fn next(&mut self) -> Option<Self::Item> {
552        self.cursor.next().map(|t| {
553            let pos = self.position;
554            self.position = pos + t.len;
555            (pos as usize, t)
556        })
557    }
558
559    #[inline]
560    fn size_hint(&self) -> (usize, Option<usize>) {
561        self.cursor.size_hint()
562    }
563}
564
// `next` yields an item only when the wrapped cursor does, and the wrapped `Cursor` is itself
// declared fused.
impl std::iter::FusedIterator for CursorWithPosition<'_> {}