solar_parse/lexer/cursor/
mod.rs

//! Low-level Solidity lexer.
//!
//! Modified from Rust's [`rustc_lexer`](https://github.com/rust-lang/rust/blob/45749b21b7fd836f6c4f11dd40376f7c83e2791b/compiler/rustc_lexer/src/lib.rs).
4
5use memchr::memmem;
6use solar_ast::{
7    Base, StrKind,
8    token::{BinOpToken, Delimiter},
9};
10use solar_data_structures::hint::{likely, unlikely};
11use std::sync::OnceLock;
12
13pub mod token;
14use token::{RawLiteralKind, RawToken, RawTokenKind};
15
16mod char_info;
17pub use char_info::*;
18
19#[cfg(test)]
20mod tests;
21
/// Forward-only, peekable cursor over the bytes of a source string.
///
/// Upcoming bytes are inspected with the `first` method without consuming them, and the `bump`
/// method moves the cursor forward.
#[derive(Debug, Clone)]
pub struct Cursor<'a> {
    /// The input bytes that have not been consumed yet.
    bytes: std::slice::Iter<'a, u8>,
}
30
31impl<'a> Cursor<'a> {
32    /// Creates a new cursor over the given input string slice.
33    #[inline]
34    pub fn new(input: &'a str) -> Self {
35        Cursor { bytes: input.as_bytes().iter() }
36    }
37
38    /// Creates a new iterator that also returns the position of each token in the input string.
39    ///
40    /// Note that the position currently always starts at 0 when this method is called, so if called
41    /// after tokens are parsed the position will be relative to when this method is called, not the
42    /// beginning of the string.
43    #[inline]
44    pub fn with_position(self) -> CursorWithPosition<'a> {
45        CursorWithPosition::new(self)
46    }
47
48    /// Slops up a token from the input string.
49    ///
50    /// Advances the cursor by the length of the token.
51    /// Prefer using `Cursor::with_position`, or using it as an iterator instead.
52    pub fn slop(&mut self) -> RawToken {
53        // Use the pointer instead of the length to track how many bytes were consumed, since
54        // internally the iterator is a pair of `start` and `end` pointers.
55        let start = self.as_ptr();
56
57        let Some(first_char) = self.bump_ret() else { return RawToken::EOF };
58        let token_kind = self.advance_token_kind(first_char);
59
60        // SAFETY: `start` points to the same string.
61        let len = unsafe { self.as_ptr().offset_from_unsigned(start) };
62
63        RawToken::new(token_kind, len as u32)
64    }
65
66    #[inline]
67    fn advance_token_kind(&mut self, first_char: u8) -> RawTokenKind {
68        match first_char {
69            // Slash, comment or block comment.
70            b'/' => match self.first() {
71                b'/' => self.line_comment(),
72                b'*' => self.block_comment(),
73                b'=' => {
74                    self.bump();
75                    RawTokenKind::BinOpEq(BinOpToken::Slash)
76                }
77                _ => RawTokenKind::BinOp(BinOpToken::Slash),
78            },
79
80            // Whitespace sequence.
81            c if is_whitespace_byte(c) => self.whitespace(),
82
83            // Identifier. This should be checked after other variants that can start as identifier.
84            c if is_id_start_byte(c) => self.ident_or_prefixed_literal(c),
85
86            // Numeric literal.
87            b'0'..=b'9' => {
88                let kind = self.number(first_char);
89                RawTokenKind::Literal { kind }
90            }
91            b'.' if self.first().is_ascii_digit() => {
92                let kind = self.rational_number_after_dot(Base::Decimal);
93                RawTokenKind::Literal { kind }
94            }
95
96            // One-symbol tokens.
97            b';' => RawTokenKind::Semi,
98            b',' => RawTokenKind::Comma,
99            b'.' => RawTokenKind::Dot,
100            b'(' => RawTokenKind::OpenDelim(Delimiter::Parenthesis),
101            b')' => RawTokenKind::CloseDelim(Delimiter::Parenthesis),
102            b'{' => RawTokenKind::OpenDelim(Delimiter::Brace),
103            b'}' => RawTokenKind::CloseDelim(Delimiter::Brace),
104            b'[' => RawTokenKind::OpenDelim(Delimiter::Bracket),
105            b']' => RawTokenKind::CloseDelim(Delimiter::Bracket),
106            b'~' => RawTokenKind::Tilde,
107            b'?' => RawTokenKind::Question,
108
109            // Multi-character tokens.
110            // : :=
111            b':' => match self.first() {
112                b'=' => {
113                    self.bump();
114                    RawTokenKind::Walrus
115                }
116                _ => RawTokenKind::Colon,
117            },
118            // = == =>
119            b'=' => match self.first() {
120                b'=' => {
121                    self.bump();
122                    RawTokenKind::EqEq
123                }
124                b'>' => {
125                    self.bump();
126                    RawTokenKind::FatArrow
127                }
128                _ => RawTokenKind::Eq,
129            },
130            // ! !=
131            b'!' => match self.first() {
132                b'=' => {
133                    self.bump();
134                    RawTokenKind::Ne
135                }
136                _ => RawTokenKind::Not,
137            },
138            // < <= << <<=
139            b'<' => match self.first() {
140                b'=' => {
141                    self.bump();
142                    RawTokenKind::Le
143                }
144                b'<' => {
145                    self.bump();
146                    if self.first() == b'=' {
147                        self.bump();
148                        RawTokenKind::BinOpEq(BinOpToken::Shl)
149                    } else {
150                        RawTokenKind::BinOp(BinOpToken::Shl)
151                    }
152                }
153                _ => RawTokenKind::Lt,
154            },
155            // https://github.com/rust-lang/rustfmt/issues/6660
156            // `> >= >> >>= >>> >>>=`
157            b'>' => match self.first() {
158                b'=' => {
159                    self.bump();
160                    RawTokenKind::Ge
161                }
162                b'>' => {
163                    self.bump();
164                    match self.first() {
165                        b'>' => {
166                            self.bump();
167                            if self.first() == b'=' {
168                                self.bump();
169                                RawTokenKind::BinOpEq(BinOpToken::Sar)
170                            } else {
171                                RawTokenKind::BinOp(BinOpToken::Sar)
172                            }
173                        }
174                        b'=' => {
175                            self.bump();
176                            RawTokenKind::BinOpEq(BinOpToken::Shr)
177                        }
178                        _ => RawTokenKind::BinOp(BinOpToken::Shr),
179                    }
180                }
181                _ => RawTokenKind::Gt,
182            },
183            // - -- -= ->
184            b'-' => match self.first() {
185                b'-' => {
186                    self.bump();
187                    RawTokenKind::MinusMinus
188                }
189                b'=' => {
190                    self.bump();
191                    RawTokenKind::BinOpEq(BinOpToken::Minus)
192                }
193                b'>' => {
194                    self.bump();
195                    RawTokenKind::Arrow
196                }
197                _ => RawTokenKind::BinOp(BinOpToken::Minus),
198            },
199            // & && &=
200            b'&' => match self.first() {
201                b'&' => {
202                    self.bump();
203                    RawTokenKind::AndAnd
204                }
205                b'=' => {
206                    self.bump();
207                    RawTokenKind::BinOpEq(BinOpToken::And)
208                }
209                _ => RawTokenKind::BinOp(BinOpToken::And),
210            },
211            // | || |=
212            b'|' => match self.first() {
213                b'|' => {
214                    self.bump();
215                    RawTokenKind::OrOr
216                }
217                b'=' => {
218                    self.bump();
219                    RawTokenKind::BinOpEq(BinOpToken::Or)
220                }
221                _ => RawTokenKind::BinOp(BinOpToken::Or),
222            },
223            // + ++ +=
224            b'+' => match self.first() {
225                b'+' => {
226                    self.bump();
227                    RawTokenKind::PlusPlus
228                }
229                b'=' => {
230                    self.bump();
231                    RawTokenKind::BinOpEq(BinOpToken::Plus)
232                }
233                _ => RawTokenKind::BinOp(BinOpToken::Plus),
234            },
235            // * ** *=
236            b'*' => match self.first() {
237                b'*' => {
238                    self.bump();
239                    RawTokenKind::StarStar
240                }
241                b'=' => {
242                    self.bump();
243                    RawTokenKind::BinOpEq(BinOpToken::Star)
244                }
245                _ => RawTokenKind::BinOp(BinOpToken::Star),
246            },
247            // ^ ^=
248            b'^' => match self.first() {
249                b'=' => {
250                    self.bump();
251                    RawTokenKind::BinOpEq(BinOpToken::Caret)
252                }
253                _ => RawTokenKind::BinOp(BinOpToken::Caret),
254            },
255            // % %=
256            b'%' => match self.first() {
257                b'=' => {
258                    self.bump();
259                    RawTokenKind::BinOpEq(BinOpToken::Percent)
260                }
261                _ => RawTokenKind::BinOp(BinOpToken::Percent),
262            },
263
264            // String literal.
265            b'\'' | b'"' => {
266                let terminated = self.eat_string(first_char);
267                let kind = RawLiteralKind::Str { kind: StrKind::Str, terminated };
268                RawTokenKind::Literal { kind }
269            }
270
271            _ => {
272                if unlikely(!first_char.is_ascii()) {
273                    self.bump_utf8_with(first_char);
274                }
275                RawTokenKind::Unknown
276            }
277        }
278    }
279
280    #[inline(never)]
281    fn line_comment(&mut self) -> RawTokenKind {
282        debug_assert!(self.prev() == b'/' && self.first() == b'/');
283        self.bump();
284
285        // `////` (more than 3 slashes) is not considered a doc comment.
286        let is_doc = matches!(self.first(), b'/' if self.second() != b'/');
287
288        // Take into account Windows line ending (CRLF)
289        self.eat_until_either(b'\n', b'\r');
290        RawTokenKind::LineComment { is_doc }
291    }
292
293    #[inline(never)]
294    fn block_comment(&mut self) -> RawTokenKind {
295        debug_assert!(self.prev() == b'/' && self.first() == b'*');
296        self.bump();
297
298        // `/***` (more than 2 stars) is not considered a doc comment.
299        // `/**/` is not considered a doc comment.
300        let is_doc = matches!(self.first(), b'*' if !matches!(self.second(), b'*' | b'/'));
301
302        let b = self.as_bytes();
303        static FINDER: OnceLock<memmem::Finder<'static>> = OnceLock::new();
304        let (terminated, n) = FINDER
305            .get_or_init(|| memmem::Finder::new(b"*/"))
306            .find(b)
307            .map_or((false, b.len()), |pos| (true, pos + 2));
308        self.ignore_bytes(n);
309
310        RawTokenKind::BlockComment { is_doc, terminated }
311    }
312
313    fn whitespace(&mut self) -> RawTokenKind {
314        debug_assert!(is_whitespace_byte(self.prev()));
315        self.eat_while(is_whitespace_byte);
316        RawTokenKind::Whitespace
317    }
318
319    fn ident_or_prefixed_literal(&mut self, first: u8) -> RawTokenKind {
320        debug_assert!(is_id_start_byte(self.prev()));
321
322        // Start is already eaten, eat the rest of identifier.
323        let start = self.as_ptr();
324        self.eat_while(is_id_continue_byte);
325
326        // Check if the identifier is a string literal prefix.
327        if unlikely(matches!(first, b'h' | b'u')) {
328            // SAFETY: within bounds and lifetime of `self.chars`.
329            let id = unsafe {
330                let start = start.sub(1);
331                std::slice::from_raw_parts(start, self.as_ptr().offset_from_unsigned(start))
332            };
333            let is_hex = id == b"hex";
334            if (is_hex || id == b"unicode")
335                && let quote @ (b'\'' | b'"') = self.first()
336            {
337                self.bump();
338                let terminated = self.eat_string(quote);
339                let kind = if is_hex { StrKind::Hex } else { StrKind::Unicode };
340                return RawTokenKind::Literal { kind: RawLiteralKind::Str { kind, terminated } };
341            }
342        }
343
344        RawTokenKind::Ident
345    }
346
347    fn number(&mut self, first_digit: u8) -> RawLiteralKind {
348        debug_assert!(self.prev().is_ascii_digit());
349        let mut base = Base::Decimal;
350        if first_digit == b'0' {
351            // Attempt to parse encoding base.
352            let has_digits = match self.first() {
353                b'b' => {
354                    base = Base::Binary;
355                    self.bump();
356                    self.eat_decimal_digits()
357                }
358                b'o' => {
359                    base = Base::Octal;
360                    self.bump();
361                    self.eat_decimal_digits()
362                }
363                b'x' => {
364                    base = Base::Hexadecimal;
365                    self.bump();
366                    self.eat_hexadecimal_digits()
367                }
368                // Not a base prefix.
369                b'0'..=b'9' | b'_' | b'.' | b'e' | b'E' => {
370                    self.eat_decimal_digits();
371                    true
372                }
373                // Just a 0.
374                _ => return RawLiteralKind::Int { base, empty_int: false },
375            };
376            // Base prefix was provided, but there were no digits after it, e.g. "0x".
377            if !has_digits {
378                return RawLiteralKind::Int { base, empty_int: true };
379            }
380        } else {
381            // No base prefix, parse number in the usual way.
382            self.eat_decimal_digits();
383        };
384
385        match self.first() {
386            // Don't be greedy if this is actually an integer literal followed by field/method
387            // access (`12.foo()`).
388            // `_` is special cased, we assume it's always an invalid rational: https://github.com/argotorg/solidity/blob/c012b725bb8ce755b93ce0dd05e83c34c499acd6/liblangutil/Scanner.cpp#L979
389            b'.' if !is_id_start_byte(self.second()) || self.second() == b'_' => {
390                self.bump();
391                self.rational_number_after_dot(base)
392            }
393            b'e' | b'E' => {
394                self.bump();
395                let empty_exponent = !self.eat_exponent();
396                RawLiteralKind::Rational { base, empty_exponent }
397            }
398            _ => RawLiteralKind::Int { base, empty_int: false },
399        }
400    }
401
402    #[cold]
403    fn rational_number_after_dot(&mut self, base: Base) -> RawLiteralKind {
404        self.eat_decimal_digits();
405        let empty_exponent = match self.first() {
406            b'e' | b'E' => {
407                self.bump();
408                !self.eat_exponent()
409            }
410            _ => false,
411        };
412        RawLiteralKind::Rational { base, empty_exponent }
413    }
414
415    /// Eats a string until the given quote character. Returns `true` if the string was terminated.
416    fn eat_string(&mut self, quote: u8) -> bool {
417        debug_assert_eq!(self.prev(), quote);
418        loop {
419            if unlikely(!self.eat_until_either(quote, b'\\')) {
420                return false;
421            }
422            // SAFETY: `eat_until_either` returns `true` if `quote` or `b'\\'` was found.
423            let c = unsafe { self.bump_ret().unwrap_unchecked() };
424            if likely(c == quote) {
425                return true;
426            }
427            // `\\` or `\"`: skip the escaped character.
428            debug_assert_eq!(c, b'\\');
429            let next = self.first();
430            if next == b'\\' || next == quote {
431                // Bump again to skip escaped character.
432                self.bump();
433            }
434        }
435    }
436
437    /// Eats characters for a decimal number. Returns `true` if any digits were encountered.
438    fn eat_decimal_digits(&mut self) -> bool {
439        self.eat_digits(is_decimal_digit)
440    }
441
442    /// Eats characters for a hexadecimal number. Returns `true` if any digits were encountered.
443    fn eat_hexadecimal_digits(&mut self) -> bool {
444        self.eat_digits(is_hex_digit)
445    }
446
447    fn eat_digits(&mut self, mut is_digit: impl FnMut(u8) -> bool) -> bool {
448        let mut has_digits = false;
449        loop {
450            match self.first() {
451                b'_' => {
452                    self.bump();
453                }
454                c if is_digit(c) => {
455                    has_digits = true;
456                    self.bump();
457                }
458                _ => break,
459            }
460        }
461        has_digits
462    }
463
464    /// Eats the exponent. Returns `true` if any digits were encountered.
465    fn eat_exponent(&mut self) -> bool {
466        debug_assert!(self.prev() == b'e' || self.prev() == b'E');
467        // b'+' is not a valid prefix for an exponent.
468        if self.first() == b'-' {
469            self.bump();
470        }
471        self.eat_decimal_digits()
472    }
473
474    /// Returns the remaining input as a byte slice.
475    #[inline]
476    pub fn as_bytes(&self) -> &'a [u8] {
477        self.bytes.as_slice()
478    }
479
480    /// Returns the pointer to the first byte of the remaining input.
481    #[inline]
482    pub fn as_ptr(&self) -> *const u8 {
483        self.bytes.as_slice().as_ptr()
484    }
485
486    /// Returns the last eaten byte.
487    #[inline]
488    fn prev(&self) -> u8 {
489        // SAFETY: We always bump at least one character before calling this method.
490        unsafe { *self.as_ptr().sub(1) }
491    }
492
493    /// Peeks the next byte from the input stream without consuming it.
494    /// If requested position doesn't exist, `EOF` is returned.
495    /// However, getting `EOF` doesn't always mean actual end of file,
496    /// it should be checked with `is_eof` method.
497    #[inline]
498    fn first(&self) -> u8 {
499        self.peek_byte(0)
500    }
501
502    /// Peeks the second byte from the input stream without consuming it.
503    #[inline]
504    fn second(&self) -> u8 {
505        // This function is only called after `first` was called and checked, so in practice it
506        // doesn't matter if it's part of the first UTF-8 character.
507        self.peek_byte(1)
508    }
509
510    // Do not use directly.
511    #[doc(hidden)]
512    #[inline]
513    fn peek_byte(&self, index: usize) -> u8 {
514        self.as_bytes().get(index).copied().unwrap_or(EOF)
515    }
516
517    /// Moves to the next character.
518    fn bump(&mut self) {
519        self.bytes.next();
520    }
521
522    /// Skips to the end of the current UTF-8 character sequence, with `x` as the first byte.
523    ///
524    /// Assumes that `x` is the previously consumed byte.
525    #[cold]
526    #[allow(clippy::match_overlapping_arm)]
527    fn bump_utf8_with(&mut self, x: u8) {
528        debug_assert_eq!(self.prev(), x);
529        let skip = match x {
530            ..0x80 => 0,
531            ..0xE0 => 1,
532            ..0xF0 => 2,
533            _ => 3,
534        };
535        // NOTE: The internal iterator was created with from valid UTF-8 string, so we can freely
536        // skip bytes here without checking bounds.
537        self.ignore_bytes(skip);
538    }
539
540    /// Moves to the next character, returning the current one.
541    fn bump_ret(&mut self) -> Option<u8> {
542        let c = self.as_bytes().first().copied();
543        self.bytes.next();
544        c
545    }
546
547    /// Advances `n` bytes.
548    #[inline]
549    #[cfg_attr(debug_assertions, track_caller)]
550    fn ignore_bytes(&mut self, n: usize) {
551        debug_assert!(n <= self.as_bytes().len());
552        self.bytes = unsafe { self.as_bytes().get_unchecked(n..) }.iter();
553    }
554
555    /// Eats symbols until `ch1` or `ch2` is found or until the end of file is reached.
556    ///
557    /// Returns `true` if `ch1` or `ch2` was found, `false` if the end of file was reached.
558    #[inline]
559    fn eat_until_either(&mut self, ch1: u8, ch2: u8) -> bool {
560        let b = self.as_bytes();
561        let res = memchr::memchr2(ch1, ch2, b);
562        self.ignore_bytes(res.unwrap_or(b.len()));
563        res.is_some()
564    }
565
566    /// Eats symbols while predicate returns true or until the end of file is reached.
567    #[inline]
568    fn eat_while(&mut self, mut predicate: impl FnMut(u8) -> bool) {
569        while predicate(self.first()) {
570            self.bump();
571        }
572    }
573}
574
575impl Iterator for Cursor<'_> {
576    type Item = RawToken;
577
578    #[inline]
579    fn next(&mut self) -> Option<Self::Item> {
580        let token = self.slop();
581        if token.kind == RawTokenKind::Eof { None } else { Some(token) }
582    }
583}
584
585impl std::iter::FusedIterator for Cursor<'_> {}
586
587/// [`Cursor`] that also tracks the position of each token in the input string.
588///
589/// Created by calling [`Cursor::with_position`]. See that method and [`Cursor`] for more details.
590#[derive(Clone, Debug)]
591pub struct CursorWithPosition<'a> {
592    cursor: Cursor<'a>,
593    position: u32,
594}
595
596impl<'a> CursorWithPosition<'a> {
597    /// Creates a new cursor with position tracking from the given cursor.
598    #[inline]
599    fn new(cursor: Cursor<'a>) -> Self {
600        CursorWithPosition { cursor, position: 0 }
601    }
602
603    /// Returns a reference to the inner cursor.
604    #[inline]
605    pub fn inner(&self) -> &Cursor<'a> {
606        &self.cursor
607    }
608
609    /// Returns a mutable reference to the inner cursor.
610    #[inline]
611    pub fn inner_mut(&mut self) -> &mut Cursor<'a> {
612        &mut self.cursor
613    }
614
615    /// Returns the current position in the input string.
616    #[inline]
617    pub fn position(&self) -> usize {
618        self.position as usize
619    }
620}
621
622impl Iterator for CursorWithPosition<'_> {
623    type Item = (usize, RawToken);
624
625    #[inline]
626    fn next(&mut self) -> Option<Self::Item> {
627        self.cursor.next().map(|t| {
628            let pos = self.position;
629            self.position = pos + t.len;
630            (pos as usize, t)
631        })
632    }
633
634    #[inline]
635    fn size_hint(&self) -> (usize, Option<usize>) {
636        self.cursor.size_hint()
637    }
638}
639
640impl std::iter::FusedIterator for CursorWithPosition<'_> {}