solar_parse/lexer/cursor/
mod.rs

1//! Low-level Solidity lexer.
2//!
3//! Modified from Rust's [`rustc_lexer`](https://github.com/rust-lang/rust/blob/45749b21b7fd836f6c4f11dd40376f7c83e2791b/compiler/rustc_lexer/src/lib.rs).
4
5use memchr::memmem;
6use solar_ast::{
7    Base, StrKind,
8    token::{BinOpToken, Delimiter},
9};
10use solar_data_structures::hint::unlikely;
11use std::sync::OnceLock;
12
13pub mod token;
14use token::{RawLiteralKind, RawToken, RawTokenKind};
15
16#[cfg(test)]
17mod tests;
18
19/// Returns `true` if the given character is considered a whitespace.
20#[inline]
21pub const fn is_whitespace(c: char) -> bool {
22    is_whitespace_byte(ch2u8(c))
23}
/// Returns `true` if the given byte is whitespace: a space, a horizontal tab, a line feed or a
/// carriage return.
#[inline]
pub const fn is_whitespace_byte(c: u8) -> bool {
    c == b' ' || c == b'\t' || c == b'\n' || c == b'\r'
}
29
30/// Returns `true` if the given character is valid at the start of a Solidity identifier.
31#[inline]
32pub const fn is_id_start(c: char) -> bool {
33    is_id_start_byte(ch2u8(c))
34}
/// Returns `true` if the given byte may begin a Solidity identifier: an ASCII letter, an
/// underscore or a dollar sign.
#[inline]
pub const fn is_id_start_byte(c: u8) -> bool {
    c.is_ascii_alphabetic() || c == b'_' || c == b'$'
}
40
41/// Returns `true` if the given character is valid in a Solidity identifier.
42#[inline]
43pub const fn is_id_continue(c: char) -> bool {
44    is_id_continue_byte(ch2u8(c))
45}
46/// Returns `true` if the given character is valid in a Solidity identifier.
47#[inline]
48pub const fn is_id_continue_byte(c: u8) -> bool {
49    let is_number = (c >= b'0') & (c <= b'9');
50    is_id_start_byte(c) || is_number
51}
52
53/// Returns `true` if the given string is a valid Solidity identifier.
54///
55/// An identifier in Solidity has to start with a letter, a dollar-sign or an underscore and may
56/// additionally contain numbers after the first symbol.
57///
58/// Reference: <https://docs.soliditylang.org/en/latest/grammar.html#a4.SolidityLexer.Identifier>
59#[inline]
60pub const fn is_ident(s: &str) -> bool {
61    is_ident_bytes(s.as_bytes())
62}
63
64/// Returns `true` if the given byte slice is a valid Solidity identifier.
65///
66/// See [`is_ident`] for more details.
67pub const fn is_ident_bytes(s: &[u8]) -> bool {
68    let [first, ref rest @ ..] = *s else {
69        return false;
70    };
71
72    if !is_id_start_byte(first) {
73        return false;
74    }
75
76    let mut i = 0;
77    while i < rest.len() {
78        if !is_id_continue_byte(rest[i]) {
79            return false;
80        }
81        i += 1;
82    }
83
84    true
85}
86
/// Converts a `char` to a `u8`, saturating at `0xFF`.
///
/// Code points up to U+00FF map to the byte with the same value. Every larger code point maps
/// to `0xFF` instead of being truncated modulo 256: a plain `as u8` cast would turn e.g. U+0120
/// into `b' '` and U+0161 into `b'a'`, making the `char` classifiers in this module accept
/// non-ASCII characters. None of the byte classifiers accept a byte `>= 0x80`, so the saturated
/// value is always rejected.
#[inline(always)]
const fn ch2u8(c: char) -> u8 {
    let code_point = c as u32;
    if code_point > 0xFF { 0xFF } else { code_point as u8 }
}
92
/// Sentinel byte returned by the peeking helpers when the requested position lies past the end
/// of the input. A literal NUL byte in the input has the same value.
const EOF: u8 = b'\0';
94
/// Peekable iterator over the bytes of a source string.
///
/// Upcoming bytes can be peeked via the private `first` and `second` methods, and the position
/// is shifted forward via `bump`. As an [`Iterator`], the cursor yields one [`RawToken`] per
/// call until the input is exhausted.
#[derive(Clone, Debug)]
pub struct Cursor<'a> {
    /// The input bytes that have not been consumed yet.
    bytes: std::slice::Iter<'a, u8>,
}
103
104impl<'a> Cursor<'a> {
105    /// Creates a new cursor over the given input string slice.
106    #[inline]
107    pub fn new(input: &'a str) -> Self {
108        Cursor { bytes: input.as_bytes().iter() }
109    }
110
111    /// Creates a new iterator that also returns the position of each token in the input string.
112    ///
113    /// Note that the position currently always starts at 0 when this method is called, so if called
114    /// after tokens are parsed the position will be relative to when this method is called, not the
115    /// beginning of the string.
116    #[inline]
117    pub fn with_position(self) -> CursorWithPosition<'a> {
118        CursorWithPosition::new(self)
119    }
120
121    /// Slops up a token from the input string.
122    ///
123    /// Advances the cursor by the length of the token.
124    /// Prefer using `Cursor::with_position`, or using it as an iterator instead.
125    pub fn slop(&mut self) -> RawToken {
126        // Use the pointer instead of the length to track how many bytes were consumed, since
127        // internally the iterator is a pair of `start` and `end` pointers.
128        let start = self.as_ptr();
129
130        let Some(first_char) = self.bump_ret() else { return RawToken::EOF };
131        let token_kind = self.advance_token_kind(first_char);
132
133        // SAFETY: `start` points to the same string.
134        let len = unsafe { self.as_ptr().offset_from_unsigned(start) };
135
136        RawToken::new(token_kind, len as u32)
137    }
138
    /// Determines the kind of the token that starts with `first_char`, consuming the rest of
    /// the token from the input.
    ///
    /// `first_char` must already have been consumed by the caller. Arms are tried in order, so
    /// the guarded arms (whitespace, identifier start, `.` followed by a digit) take precedence
    /// over the plain single-byte arms below them.
    #[inline]
    fn advance_token_kind(&mut self, first_char: u8) -> RawTokenKind {
        match first_char {
            // Slash, comment or block comment.
            b'/' => match self.first() {
                b'/' => self.line_comment(),
                b'*' => self.block_comment(),
                b'=' => {
                    self.bump();
                    RawTokenKind::BinOpEq(BinOpToken::Slash)
                }
                _ => RawTokenKind::BinOp(BinOpToken::Slash),
            },

            // Whitespace sequence.
            c if is_whitespace_byte(c) => self.whitespace(),

            // Identifier (this should be checked after other variant that can start as identifier).
            c if is_id_start_byte(c) => self.ident_or_prefixed_literal(c),

            // Numeric literal.
            b'0'..=b'9' => {
                let kind = self.number(first_char);
                RawTokenKind::Literal { kind }
            }
            // A dot directly followed by a digit starts a rational literal such as `.5`.
            // This arm must stay above the plain `Dot` arm.
            b'.' if self.first().is_ascii_digit() => {
                let kind = self.rational_number_after_dot(Base::Decimal);
                RawTokenKind::Literal { kind }
            }

            // One-symbol tokens.
            b';' => RawTokenKind::Semi,
            b',' => RawTokenKind::Comma,
            b'.' => RawTokenKind::Dot,
            b'(' => RawTokenKind::OpenDelim(Delimiter::Parenthesis),
            b')' => RawTokenKind::CloseDelim(Delimiter::Parenthesis),
            b'{' => RawTokenKind::OpenDelim(Delimiter::Brace),
            b'}' => RawTokenKind::CloseDelim(Delimiter::Brace),
            b'[' => RawTokenKind::OpenDelim(Delimiter::Bracket),
            b']' => RawTokenKind::CloseDelim(Delimiter::Bracket),
            b'~' => RawTokenKind::Tilde,
            b'?' => RawTokenKind::Question,

            // Multi-character tokens.
            b':' => match self.first() {
                b'=' => {
                    self.bump();
                    RawTokenKind::Walrus
                }
                _ => RawTokenKind::Colon,
            },
            b'=' => match self.first() {
                b'=' => {
                    self.bump();
                    RawTokenKind::EqEq
                }
                b'>' => {
                    self.bump();
                    RawTokenKind::FatArrow
                }
                _ => RawTokenKind::Eq,
            },
            b'!' => match self.first() {
                b'=' => {
                    self.bump();
                    RawTokenKind::Ne
                }
                _ => RawTokenKind::Not,
            },
            b'<' => match self.first() {
                b'=' => {
                    self.bump();
                    RawTokenKind::Le
                }
                b'<' => {
                    self.bump();
                    // Now check for <<= or <<
                    if self.first() == b'=' {
                        self.bump();
                        RawTokenKind::BinOpEq(BinOpToken::Shl)
                    } else {
                        RawTokenKind::BinOp(BinOpToken::Shl)
                    }
                }
                _ => RawTokenKind::Lt,
            },
            b'>' => match self.first() {
                b'=' => {
                    self.bump();
                    RawTokenKind::Ge
                }
                b'>' => {
                    self.bump();
                    match self.first() {
                        b'>' => {
                            // >>> or >>>=
                            self.bump();
                            if self.first() == b'=' {
                                self.bump();
                                RawTokenKind::BinOpEq(BinOpToken::Sar)
                            } else {
                                RawTokenKind::BinOp(BinOpToken::Sar)
                            }
                        }
                        // >>=
                        b'=' => {
                            self.bump();
                            RawTokenKind::BinOpEq(BinOpToken::Shr)
                        }
                        // >>
                        _ => RawTokenKind::BinOp(BinOpToken::Shr),
                    }
                }
                _ => RawTokenKind::Gt,
            },
            b'-' => match self.first() {
                b'-' => {
                    self.bump();
                    RawTokenKind::MinusMinus
                }
                b'=' => {
                    self.bump();
                    RawTokenKind::BinOpEq(BinOpToken::Minus)
                }
                b'>' => {
                    self.bump();
                    RawTokenKind::Arrow
                }
                _ => RawTokenKind::BinOp(BinOpToken::Minus),
            },
            b'&' => match self.first() {
                b'&' => {
                    self.bump();
                    RawTokenKind::AndAnd
                }
                b'=' => {
                    self.bump();
                    RawTokenKind::BinOpEq(BinOpToken::And)
                }
                _ => RawTokenKind::BinOp(BinOpToken::And),
            },
            b'|' => match self.first() {
                b'|' => {
                    self.bump();
                    RawTokenKind::OrOr
                }
                b'=' => {
                    self.bump();
                    RawTokenKind::BinOpEq(BinOpToken::Or)
                }
                _ => RawTokenKind::BinOp(BinOpToken::Or),
            },
            b'+' => match self.first() {
                b'+' => {
                    self.bump();
                    RawTokenKind::PlusPlus
                }
                b'=' => {
                    self.bump();
                    RawTokenKind::BinOpEq(BinOpToken::Plus)
                }
                _ => RawTokenKind::BinOp(BinOpToken::Plus),
            },
            b'*' => match self.first() {
                b'*' => {
                    self.bump();
                    RawTokenKind::StarStar
                }
                b'=' => {
                    self.bump();
                    RawTokenKind::BinOpEq(BinOpToken::Star)
                }
                _ => RawTokenKind::BinOp(BinOpToken::Star),
            },
            b'^' => match self.first() {
                b'=' => {
                    self.bump();
                    RawTokenKind::BinOpEq(BinOpToken::Caret)
                }
                _ => RawTokenKind::BinOp(BinOpToken::Caret),
            },
            b'%' => match self.first() {
                b'=' => {
                    self.bump();
                    RawTokenKind::BinOpEq(BinOpToken::Percent)
                }
                _ => RawTokenKind::BinOp(BinOpToken::Percent),
            },

            // String literal.
            b'\'' | b'"' => {
                let terminated = self.eat_string(first_char);
                let kind = RawLiteralKind::Str { kind: StrKind::Str, terminated };
                RawTokenKind::Literal { kind }
            }

            // Anything else is an unknown token. For a non-ASCII lead byte, also consume the
            // remaining bytes of its UTF-8 sequence so that the token covers the whole character.
            _ => {
                if unlikely(!first_char.is_ascii()) {
                    self.bump_utf8_with(first_char);
                }
                RawTokenKind::Unknown
            }
        }
    }
341
342    #[inline(never)]
343    fn line_comment(&mut self) -> RawTokenKind {
344        debug_assert!(self.prev() == b'/' && self.first() == b'/');
345        self.bump();
346
347        // `////` (more than 3 slashes) is not considered a doc comment.
348        let is_doc = matches!(self.first(), b'/' if self.second() != b'/');
349
350        // Take into account Windows line ending (CRLF)
351        self.eat_until_either(b'\n', b'\r');
352        RawTokenKind::LineComment { is_doc }
353    }
354
355    #[inline(never)]
356    fn block_comment(&mut self) -> RawTokenKind {
357        debug_assert!(self.prev() == b'/' && self.first() == b'*');
358        self.bump();
359
360        // `/***` (more than 2 stars) is not considered a doc comment.
361        // `/**/` is not considered a doc comment.
362        let is_doc = matches!(self.first(), b'*' if !matches!(self.second(), b'*' | b'/'));
363
364        let b = self.as_bytes();
365        static FINDER: OnceLock<memmem::Finder<'static>> = OnceLock::new();
366        let (terminated, n) = FINDER
367            .get_or_init(|| memmem::Finder::new(b"*/"))
368            .find(b)
369            .map_or((false, b.len()), |pos| (true, pos + 2));
370        self.ignore_bytes(n);
371
372        RawTokenKind::BlockComment { is_doc, terminated }
373    }
374
375    fn whitespace(&mut self) -> RawTokenKind {
376        debug_assert!(is_whitespace_byte(self.prev()));
377        self.eat_while(is_whitespace_byte);
378        RawTokenKind::Whitespace
379    }
380
    /// Lexes an identifier, or a `hex`/`unicode` prefixed string literal.
    ///
    /// `first` is the identifier's first byte, which has already been consumed. If the whole
    /// identifier is exactly `hex` or `unicode` and is directly followed by a quote, the string
    /// that follows is consumed too and a string literal is returned instead of an identifier.
    fn ident_or_prefixed_literal(&mut self, first: u8) -> RawTokenKind {
        debug_assert!(is_id_start_byte(self.prev()));

        // Start is already eaten, eat the rest of identifier.
        let start = self.as_ptr();
        self.eat_while(is_id_continue_byte);

        // Check if the identifier is a string literal prefix.
        // Only identifiers starting with `h` or `u` can be `hex` or `unicode`.
        if unlikely(matches!(first, b'h' | b'u')) {
            // SAFETY: `start` was taken after the first byte was consumed, so `start - 1` is
            // that first byte; the range up to the current position is within bounds and
            // lifetime of the input behind `self.bytes`.
            let id = unsafe {
                let start = start.sub(1);
                std::slice::from_raw_parts(start, self.as_ptr().offset_from_unsigned(start))
            };
            let is_hex = id == b"hex";
            if (is_hex || id == b"unicode")
                && let quote @ (b'\'' | b'"') = self.first()
            {
                // Consume the opening quote, then the string body up to the matching quote.
                self.bump();
                let terminated = self.eat_string(quote);
                let kind = if is_hex { StrKind::Hex } else { StrKind::Unicode };
                return RawTokenKind::Literal { kind: RawLiteralKind::Str { kind, terminated } };
            }
        }

        RawTokenKind::Ident
    }
408
409    fn number(&mut self, first_digit: u8) -> RawLiteralKind {
410        debug_assert!(self.prev().is_ascii_digit());
411        let mut base = Base::Decimal;
412        if first_digit == b'0' {
413            // Attempt to parse encoding base.
414            let has_digits = match self.first() {
415                b'b' => {
416                    base = Base::Binary;
417                    self.bump();
418                    self.eat_decimal_digits()
419                }
420                b'o' => {
421                    base = Base::Octal;
422                    self.bump();
423                    self.eat_decimal_digits()
424                }
425                b'x' => {
426                    base = Base::Hexadecimal;
427                    self.bump();
428                    self.eat_hexadecimal_digits()
429                }
430                // Not a base prefix.
431                b'0'..=b'9' | b'_' | b'.' | b'e' | b'E' => {
432                    self.eat_decimal_digits();
433                    true
434                }
435                // Just a 0.
436                _ => return RawLiteralKind::Int { base, empty_int: false },
437            };
438            // Base prefix was provided, but there were no digits after it, e.g. "0x".
439            if !has_digits {
440                return RawLiteralKind::Int { base, empty_int: true };
441            }
442        } else {
443            // No base prefix, parse number in the usual way.
444            self.eat_decimal_digits();
445        };
446
447        match self.first() {
448            // Don't be greedy if this is actually an integer literal followed by field/method
449            // access (`12.foo()`).
450            // `_` is special cased, we assume it's always an invalid rational: https://github.com/argotorg/solidity/blob/c012b725bb8ce755b93ce0dd05e83c34c499acd6/liblangutil/Scanner.cpp#L979
451            b'.' if !is_id_start_byte(self.second()) || self.second() == b'_' => {
452                self.bump();
453                self.rational_number_after_dot(base)
454            }
455            b'e' | b'E' => {
456                self.bump();
457                let empty_exponent = !self.eat_exponent();
458                RawLiteralKind::Rational { base, empty_exponent }
459            }
460            _ => RawLiteralKind::Int { base, empty_int: false },
461        }
462    }
463
464    #[cold]
465    fn rational_number_after_dot(&mut self, base: Base) -> RawLiteralKind {
466        self.eat_decimal_digits();
467        let empty_exponent = match self.first() {
468            b'e' | b'E' => {
469                self.bump();
470                !self.eat_exponent()
471            }
472            _ => false,
473        };
474        RawLiteralKind::Rational { base, empty_exponent }
475    }
476
477    /// Eats a string until the given quote character. Returns `true` if the string was terminated.
478    fn eat_string(&mut self, quote: u8) -> bool {
479        debug_assert_eq!(self.prev(), quote);
480        while let Some(c) = self.bump_ret() {
481            if c == quote {
482                return true;
483            }
484            if c == b'\\' {
485                let first = self.first();
486                if first == b'\\' || first == quote {
487                    // Bump again to skip escaped character.
488                    self.bump();
489                }
490            }
491        }
492        // End of file reached.
493        false
494    }
495
496    /// Eats characters for a decimal number. Returns `true` if any digits were encountered.
497    fn eat_decimal_digits(&mut self) -> bool {
498        self.eat_digits(|x| x.is_ascii_digit())
499    }
500
501    /// Eats characters for a hexadecimal number. Returns `true` if any digits were encountered.
502    fn eat_hexadecimal_digits(&mut self) -> bool {
503        self.eat_digits(|x| x.is_ascii_hexdigit())
504    }
505
506    fn eat_digits(&mut self, mut is_digit: impl FnMut(u8) -> bool) -> bool {
507        let mut has_digits = false;
508        loop {
509            match self.first() {
510                b'_' => {
511                    self.bump();
512                }
513                c if is_digit(c) => {
514                    has_digits = true;
515                    self.bump();
516                }
517                _ => break,
518            }
519        }
520        has_digits
521    }
522
523    /// Eats the exponent. Returns `true` if any digits were encountered.
524    fn eat_exponent(&mut self) -> bool {
525        debug_assert!(self.prev() == b'e' || self.prev() == b'E');
526        // b'+' is not a valid prefix for an exponent.
527        if self.first() == b'-' {
528            self.bump();
529        }
530        self.eat_decimal_digits()
531    }
532
533    /// Returns the remaining input as a byte slice.
534    #[inline]
535    pub fn as_bytes(&self) -> &'a [u8] {
536        self.bytes.as_slice()
537    }
538
539    /// Returns the pointer to the first byte of the remaining input.
540    #[inline]
541    pub fn as_ptr(&self) -> *const u8 {
542        self.bytes.as_slice().as_ptr()
543    }
544
    /// Returns the last eaten byte.
    ///
    /// Must only be called after at least one byte has been consumed from this cursor.
    #[inline]
    fn prev(&self) -> u8 {
        // SAFETY: We always bump at least one character before calling this method, so the
        // byte directly before the remaining input belongs to the same input slice.
        unsafe { *self.as_ptr().sub(1) }
    }
551
552    /// Peeks the next byte from the input stream without consuming it.
553    /// If requested position doesn't exist, `EOF` is returned.
554    /// However, getting `EOF` doesn't always mean actual end of file,
555    /// it should be checked with `is_eof` method.
556    #[inline]
557    fn first(&self) -> u8 {
558        self.peek_byte(0)
559    }
560
561    /// Peeks the second byte from the input stream without consuming it.
562    #[inline]
563    fn second(&self) -> u8 {
564        // This function is only called after `first` was called and checked, so in practice it
565        // doesn't matter if it's part of the first UTF-8 character.
566        self.peek_byte(1)
567    }
568
569    // Do not use directly.
570    #[doc(hidden)]
571    #[inline]
572    fn peek_byte(&self, index: usize) -> u8 {
573        self.as_bytes().get(index).copied().unwrap_or(EOF)
574    }
575
576    /// Moves to the next character.
577    fn bump(&mut self) {
578        self.bytes.next();
579    }
580
581    /// Skips to the end of the current UTF-8 character sequence, with `x` as the first byte.
582    ///
583    /// Assumes that `x` is the previously consumed byte.
584    #[cold]
585    #[allow(clippy::match_overlapping_arm)]
586    fn bump_utf8_with(&mut self, x: u8) {
587        debug_assert_eq!(self.prev(), x);
588        let skip = match x {
589            ..0x80 => 0,
590            ..0xE0 => 1,
591            ..0xF0 => 2,
592            _ => 3,
593        };
594        // NOTE: The internal iterator was created with from valid UTF-8 string, so we can freely
595        // skip bytes here without checking bounds.
596        self.ignore_bytes(skip);
597    }
598
599    /// Moves to the next character, returning the current one.
600    fn bump_ret(&mut self) -> Option<u8> {
601        let c = self.as_bytes().first().copied();
602        self.bytes.next();
603        c
604    }
605
    /// Advances `n` bytes.
    ///
    /// `n` must not exceed the number of remaining bytes. This is only verified by a
    /// `debug_assert!`; the slicing itself is unchecked.
    #[inline]
    #[cfg_attr(debug_assertions, track_caller)]
    fn ignore_bytes(&mut self, n: usize) {
        debug_assert!(n <= self.as_bytes().len());
        // SAFETY: callers guarantee that `n <= self.as_bytes().len()`, which is asserted above
        // in debug builds.
        self.bytes = unsafe { self.as_bytes().get_unchecked(n..) }.iter();
    }
613
614    /// Eats symbols until `ch1` or `ch2` is found or until the end of file is reached.
615    ///
616    /// Returns `true` if `ch1` or `ch2` was found, `false` if the end of file was reached.
617    #[inline]
618    fn eat_until_either(&mut self, ch1: u8, ch2: u8) -> bool {
619        let b = self.as_bytes();
620        let res = memchr::memchr2(ch1, ch2, b);
621        self.ignore_bytes(res.unwrap_or(b.len()));
622        res.is_some()
623    }
624
625    /// Eats symbols while predicate returns true or until the end of file is reached.
626    #[inline]
627    fn eat_while(&mut self, mut predicate: impl FnMut(u8) -> bool) {
628        while predicate(self.first()) {
629            self.bump();
630        }
631    }
632}
633
634impl Iterator for Cursor<'_> {
635    type Item = RawToken;
636
637    #[inline]
638    fn next(&mut self) -> Option<Self::Item> {
639        let token = self.slop();
640        if token.kind == RawTokenKind::Eof { None } else { Some(token) }
641    }
642}
643
// Once the input is exhausted, `slop` keeps returning the end-of-file token, so `next` keeps
// returning `None`.
impl std::iter::FusedIterator for Cursor<'_> {}
645
/// [`Cursor`] that also tracks the position of each token in the input string.
///
/// Created by calling [`Cursor::with_position`]. See that method and [`Cursor`] for more details.
#[derive(Clone, Debug)]
pub struct CursorWithPosition<'a> {
    /// The wrapped cursor that produces the tokens.
    cursor: Cursor<'a>,
    /// Sum of the lengths of all tokens yielded so far; this is the position reported for the
    /// next token.
    position: u32,
}
654
655impl<'a> CursorWithPosition<'a> {
656    /// Creates a new cursor with position tracking from the given cursor.
657    #[inline]
658    fn new(cursor: Cursor<'a>) -> Self {
659        CursorWithPosition { cursor, position: 0 }
660    }
661
662    /// Returns a reference to the inner cursor.
663    #[inline]
664    pub fn inner(&self) -> &Cursor<'a> {
665        &self.cursor
666    }
667
668    /// Returns a mutable reference to the inner cursor.
669    #[inline]
670    pub fn inner_mut(&mut self) -> &mut Cursor<'a> {
671        &mut self.cursor
672    }
673
674    /// Returns the current position in the input string.
675    #[inline]
676    pub fn position(&self) -> usize {
677        self.position as usize
678    }
679}
680
681impl Iterator for CursorWithPosition<'_> {
682    type Item = (usize, RawToken);
683
684    #[inline]
685    fn next(&mut self) -> Option<Self::Item> {
686        self.cursor.next().map(|t| {
687            let pos = self.position;
688            self.position = pos + t.len;
689            (pos as usize, t)
690        })
691    }
692
693    #[inline]
694    fn size_hint(&self) -> (usize, Option<usize>) {
695        self.cursor.size_hint()
696    }
697}
698
// `next` only forwards to the wrapped `Cursor`, which is itself declared fused.
impl std::iter::FusedIterator for CursorWithPosition<'_> {}