// solar_parse/lexer/mod.rs

1//! Solidity and Yul lexer.
2
3use solar_ast::{
4    token::{BinOpToken, CommentKind, Delimiter, Token, TokenKind, TokenLitKind},
5    Base,
6};
7use solar_interface::{
8    diagnostics::DiagCtxt, source_map::SourceFile, sym, BytePos, Session, Span, Symbol,
9};
10
11mod cursor;
12use cursor::token::{RawLiteralKind, RawToken, RawTokenKind};
13pub use cursor::{is_id_continue, is_id_start, is_ident, is_whitespace, token, Cursor};
14
15pub mod unescape;
16
17mod unicode_chars;
18
19mod utf8;
20
21/// Solidity and Yul lexer.
22///
23/// Converts a [`Cursor`]'s output from simple [`RawTokenKind`]s into rich [`TokenKind`]s, by
24/// converting strings into interned symbols, concatenating tokens together, and running additional
25/// validation.
pub struct Lexer<'sess, 'src> {
    /// The parsing context, used for interning and emitting diagnostics.
    pub(crate) sess: &'sess Session,

    /// Initial position, read-only. Used to translate absolute positions into
    /// indices into `src`.
    start_pos: BytePos,

    /// The absolute offset within the source_map of the current character.
    pos: BytePos,

    /// Source text to tokenize.
    src: &'src str,

    /// Cursor for getting lexer tokens.
    cursor: Cursor<'src>,

    /// The current token which has not been processed by `next_token` yet.
    /// Acts as a one-token lookahead buffer for token gluing.
    token: Token,

    /// When an "unknown start of token: \u{a0}" error has already been emitted
    /// earlier in this file, it's safe to treat further occurrences of the
    /// non-breaking space character as whitespace.
    nbsp_is_whitespace: bool,
}
50
51impl<'sess, 'src> Lexer<'sess, 'src> {
    /// Creates a new `Lexer` for the given source string.
    ///
    /// Positions start at byte offset 0.
    pub fn new(sess: &'sess Session, src: &'src str) -> Self {
        Self::with_start_pos(sess, src, BytePos(0))
    }

    /// Creates a new `Lexer` for the given source file, starting at the file's
    /// offset within the source map.
    ///
    /// Note that the source file must be added to the source map before calling this function.
    pub fn from_source_file(sess: &'sess Session, file: &'src SourceFile) -> Self {
        Self::with_start_pos(sess, &file.src, file.start_pos)
    }
63
64    /// Creates a new `Lexer` for the given source string and starting position.
65    pub fn with_start_pos(sess: &'sess Session, src: &'src str, start_pos: BytePos) -> Self {
66        let mut lexer = Self {
67            sess,
68            start_pos,
69            pos: start_pos,
70            src,
71            cursor: Cursor::new(src),
72            token: Token::DUMMY,
73            nbsp_is_whitespace: false,
74        };
75        (lexer.token, _) = lexer.bump();
76        lexer
77    }
78
    /// Returns a reference to the diagnostic context, shorthand for `&self.sess.dcx`.
    #[inline]
    pub fn dcx(&self) -> &'sess DiagCtxt {
        &self.sess.dcx
    }
84
85    /// Consumes the lexer and collects the remaining tokens into a vector.
86    ///
87    /// Note that this skips comments, as [required by the parser](crate::Parser::new).
88    ///
89    /// Prefer using this method instead of manually collecting tokens using [`Iterator`].
90    #[instrument(name = "lex", level = "debug", skip_all)]
91    pub fn into_tokens(mut self) -> Vec<Token> {
92        // `src.len() / 8` is an estimate of the number of tokens in the source.
93        let mut tokens = Vec::with_capacity(self.src.len() / 8);
94        loop {
95            let token = self.next_token();
96            if token.is_eof() {
97                break;
98            }
99            tokens.push(token);
100        }
101        trace!(
102            src.len = self.src.len(),
103            tokens.len = tokens.len(),
104            ratio = %format!("{:.2}", self.src.len() as f64 / tokens.len() as f64),
105            "lexed"
106        );
107        tokens
108    }
109
110    /// Returns the next token, advancing the lexer.
111    pub fn next_token(&mut self) -> Token {
112        let mut next_token;
113        loop {
114            let preceded_by_whitespace;
115            (next_token, preceded_by_whitespace) = self.bump();
116            if preceded_by_whitespace {
117                break;
118            } else if let Some(glued) = self.token.glue(next_token) {
119                self.token = glued;
120            } else {
121                break;
122            }
123        }
124        std::mem::replace(&mut self.token, next_token)
125    }
126
    /// Lexes the next token, returning it and whether it was preceded by whitespace.
    ///
    /// "Cooks" the [`RawTokenKind`] produced by the [`Cursor`] into a rich [`TokenKind`]:
    /// interns identifier/literal/comment text and emits diagnostics for invalid input.
    /// Comments are returned as tokens but still count as "whitespace" for the purposes
    /// of token gluing in `next_token`.
    fn bump(&mut self) -> (Token, bool) {
        let mut preceded_by_whitespace = false;
        // Number of upcoming identical invalid characters to skip without re-reporting;
        // see the `RawTokenKind::Unknown` arm below.
        let mut swallow_next_invalid = 0;
        loop {
            let RawToken { kind: raw_kind, len } = self.cursor.advance_token();
            let start = self.pos;
            self.pos += len;

            // Now "cook" the token, converting the simple `RawTokenKind` into a rich `TokenKind`.
            // This turns strings into interned symbols and runs additional validation.
            let kind = match raw_kind {
                RawTokenKind::LineComment { is_doc } => {
                    preceded_by_whitespace = true;

                    // Opening delimiter (`//` or `///`) is not included into the symbol.
                    let content_start = start + BytePos(if is_doc { 3 } else { 2 });
                    let content = self.str_from(content_start);
                    self.cook_doc_comment(content_start, content, is_doc, CommentKind::Line)
                }
                RawTokenKind::BlockComment { is_doc, terminated } => {
                    preceded_by_whitespace = true;

                    if !terminated {
                        let msg = if is_doc {
                            "unterminated block doc-comment"
                        } else {
                            "unterminated block comment"
                        };
                        self.dcx().err(msg).span(self.new_span(start, self.pos)).emit();
                    }

                    // Opening delimiter and closing delimiter are not included into the symbol.
                    // If unterminated, there is no closing `*/` to strip.
                    let content_start = start + BytePos(if is_doc { 3 } else { 2 });
                    let content_end = self.pos - (terminated as u32) * 2;
                    let content = self.str_from_to(content_start, content_end);
                    self.cook_doc_comment(content_start, content, is_doc, CommentKind::Block)
                }
                RawTokenKind::Whitespace => {
                    preceded_by_whitespace = true;
                    continue;
                }
                RawTokenKind::Ident => {
                    let sym = self.symbol_from(start);
                    TokenKind::Ident(sym)
                }
                RawTokenKind::UnknownPrefix => {
                    // E.g. `foo"bar"`: report the unknown prefix, then recover by
                    // lexing the prefix as a plain identifier.
                    self.report_unknown_prefix(start);
                    let sym = self.symbol_from(start);
                    TokenKind::Ident(sym)
                }
                RawTokenKind::Literal { kind } => {
                    let (kind, symbol) = self.cook_literal(start, self.pos, kind);
                    TokenKind::Literal(kind, symbol)
                }

                RawTokenKind::Semi => TokenKind::Semi,
                RawTokenKind::Comma => TokenKind::Comma,
                RawTokenKind::Dot => TokenKind::Dot,
                RawTokenKind::OpenParen => TokenKind::OpenDelim(Delimiter::Parenthesis),
                RawTokenKind::CloseParen => TokenKind::CloseDelim(Delimiter::Parenthesis),
                RawTokenKind::OpenBrace => TokenKind::OpenDelim(Delimiter::Brace),
                RawTokenKind::CloseBrace => TokenKind::CloseDelim(Delimiter::Brace),
                RawTokenKind::OpenBracket => TokenKind::OpenDelim(Delimiter::Bracket),
                RawTokenKind::CloseBracket => TokenKind::CloseDelim(Delimiter::Bracket),
                RawTokenKind::Tilde => TokenKind::Tilde,
                RawTokenKind::Question => TokenKind::Question,
                RawTokenKind::Colon => TokenKind::Colon,
                RawTokenKind::Eq => TokenKind::Eq,
                RawTokenKind::Bang => TokenKind::Not,
                RawTokenKind::Lt => TokenKind::Lt,
                RawTokenKind::Gt => TokenKind::Gt,
                RawTokenKind::Minus => TokenKind::BinOp(BinOpToken::Minus),
                RawTokenKind::And => TokenKind::BinOp(BinOpToken::And),
                RawTokenKind::Or => TokenKind::BinOp(BinOpToken::Or),
                RawTokenKind::Plus => TokenKind::BinOp(BinOpToken::Plus),
                RawTokenKind::Star => TokenKind::BinOp(BinOpToken::Star),
                RawTokenKind::Slash => TokenKind::BinOp(BinOpToken::Slash),
                RawTokenKind::Caret => TokenKind::BinOp(BinOpToken::Caret),
                RawTokenKind::Percent => TokenKind::BinOp(BinOpToken::Percent),

                RawTokenKind::Unknown => {
                    // Don't emit diagnostics for sequences of the same invalid token
                    if swallow_next_invalid > 0 {
                        swallow_next_invalid -= 1;
                        continue;
                    }
                    let mut it = self.str_from_to_end(start).chars();
                    let c = it.next().unwrap();
                    if c == '\u{00a0}' {
                        // If an error has already been reported on non-breaking
                        // space characters earlier in the file, treat all
                        // subsequent occurrences as whitespace.
                        if self.nbsp_is_whitespace {
                            preceded_by_whitespace = true;
                            continue;
                        }
                        self.nbsp_is_whitespace = true;
                    }

                    // Count immediately-following identical characters so they can be
                    // reported once here and then swallowed on later iterations.
                    let repeats = it.take_while(|c1| *c1 == c).count();
                    swallow_next_invalid = repeats;

                    let (token, sugg) =
                        unicode_chars::check_for_substitution(self, start, c, repeats + 1);

                    // Span covers the whole run of repeated characters.
                    let span = self
                        .new_span(start, self.pos + BytePos::from_usize(repeats * c.len_utf8()));
                    let msg = format!("unknown start of token: {}", escaped_char(c));
                    let mut err = self.dcx().err(msg).span(span);
                    if let Some(sugg) = sugg {
                        match sugg {
                            unicode_chars::TokenSubstitution::DirectedQuotes {
                                span,
                                suggestion: _,
                                ascii_str,
                                ascii_name,
                            } => {
                                let msg = format!("Unicode characters '“' (Left Double Quotation Mark) and '”' (Right Double Quotation Mark) look like '{ascii_str}' ({ascii_name}), but are not");
                                err = err.span_help(span, msg);
                            }
                            unicode_chars::TokenSubstitution::Other {
                                span,
                                suggestion: _,
                                ch,
                                u_name,
                                ascii_str,
                                ascii_name,
                            } => {
                                let msg = format!("Unicode character '{ch}' ({u_name}) looks like '{ascii_str}' ({ascii_name}), but it is not");
                                err = err.span_help(span, msg);
                            }
                        }
                    }
                    if c == '\0' {
                        let help = "source files must contain UTF-8 encoded text, unexpected null bytes might occur when a different encoding is used";
                        err = err.help(help);
                    }
                    if repeats > 0 {
                        let note = match repeats {
                            1 => "once more".to_string(),
                            _ => format!("{repeats} more times"),
                        };
                        err = err.note(format!("character repeats {note}"));
                    }
                    err.emit();

                    // If a substitution token was found, use it; otherwise skip the
                    // character entirely, treating it like whitespace.
                    if let Some(token) = token {
                        token
                    } else {
                        preceded_by_whitespace = true;
                        continue;
                    }
                }

                RawTokenKind::Eof => TokenKind::Eof,
            };
            let span = self.new_span(start, self.pos);
            return (Token::new(kind, span), preceded_by_whitespace);
        }
    }
287
288    fn cook_doc_comment(
289        &self,
290        _content_start: BytePos,
291        content: &str,
292        is_doc: bool,
293        comment_kind: CommentKind,
294    ) -> TokenKind {
295        TokenKind::Comment(is_doc, comment_kind, Symbol::intern(content))
296    }
297
298    fn cook_literal(
299        &self,
300        start: BytePos,
301        end: BytePos,
302        kind: RawLiteralKind,
303    ) -> (TokenLitKind, Symbol) {
304        match kind {
305            RawLiteralKind::Str { terminated, unicode } => {
306                if !terminated {
307                    let span = self.new_span(start, end);
308                    let guar = self.dcx().err("unterminated string").span(span).emit();
309                    (TokenLitKind::Err(guar), self.symbol_from_to(start, end))
310                } else {
311                    let kind = if unicode { TokenLitKind::UnicodeStr } else { TokenLitKind::Str };
312                    let prefix_len = if unicode { 7 } else { 0 }; // `unicode`
313                    self.cook_quoted(kind, start, end, prefix_len)
314                }
315            }
316            RawLiteralKind::HexStr { terminated } => {
317                if !terminated {
318                    let span = self.new_span(start, end);
319                    let guar = self.dcx().err("unterminated hex string").span(span).emit();
320                    (TokenLitKind::Err(guar), self.symbol_from_to(start, end))
321                } else {
322                    let prefix_len = 3; // `hex`
323                    self.cook_quoted(TokenLitKind::HexStr, start, end, prefix_len)
324                }
325            }
326            RawLiteralKind::Int { base, empty_int } => {
327                if empty_int {
328                    let span = self.new_span(start, end);
329                    self.dcx().err("no valid digits found for number").span(span).emit();
330                    (TokenLitKind::Integer, sym::integer(0))
331                } else {
332                    if matches!(base, Base::Binary | Base::Octal) {
333                        let start = start + 2;
334                        // To uncomment if binary and octal literals are ever supported.
335                        /*
336                        let base = base as u32;
337                        let s = self.str_from_to(start, end);
338                        for (i, c) in s.char_indices() {
339                            if c != '_' && c.to_digit(base).is_none() {
340                                let msg = format!("invalid digit for a base {base} literal");
341                                let lo = start + BytePos::from_usize(i);
342                                let hi = lo + BytePos::from_usize(c.len_utf8());
343                                let span = self.new_span(lo, hi);
344                                self.dcx().err(msg).span(span).emit();
345                            }
346                        }
347                        */
348                        let msg = format!("integers in base {base} are not supported");
349                        self.dcx().err(msg).span(self.new_span(start, end)).emit();
350                    }
351                    (TokenLitKind::Integer, self.symbol_from_to(start, end))
352                }
353            }
354            RawLiteralKind::Rational { base, empty_exponent } => {
355                if empty_exponent {
356                    let span = self.new_span(start, self.pos);
357                    self.dcx().err("expected at least one digit in exponent").span(span).emit();
358                }
359
360                let unsupported_base =
361                    matches!(base, Base::Binary | Base::Octal | Base::Hexadecimal);
362                if unsupported_base {
363                    let msg = format!("{base} rational numbers are not supported");
364                    self.dcx().err(msg).span(self.new_span(start, end)).emit();
365                }
366
367                (TokenLitKind::Rational, self.symbol_from_to(start, end))
368            }
369        }
370    }
371
372    fn cook_quoted(
373        &self,
374        kind: TokenLitKind,
375        start: BytePos,
376        end: BytePos,
377        prefix_len: u32,
378    ) -> (TokenLitKind, Symbol) {
379        let mode = match kind {
380            TokenLitKind::Str => unescape::Mode::Str,
381            TokenLitKind::UnicodeStr => unescape::Mode::UnicodeStr,
382            TokenLitKind::HexStr => unescape::Mode::HexStr,
383            _ => unreachable!(),
384        };
385
386        // Account for quote (`"` or `'`) and prefix.
387        let content_start = start + 1 + BytePos(prefix_len);
388        let content_end = end - 1;
389        let lit_content = self.str_from_to(content_start, content_end);
390
391        let mut has_err = false;
392        unescape::unescape_literal(lit_content, mode, |range, result| {
393            // Here we only check for errors. The actual unescaping is done later.
394            if let Err(err) = result {
395                has_err = true;
396                let (start, end) = (range.start as u32, range.end as u32);
397                let lo = content_start + BytePos(start);
398                let hi = lo + BytePos(end - start);
399                let span = self.new_span(lo, hi);
400                unescape::emit_unescape_error(self.dcx(), lit_content, span, range, err);
401            }
402        });
403
404        // We normally exclude the quotes for the symbol, but for errors we
405        // include it because it results in clearer error messages.
406        let symbol =
407            if has_err { self.symbol_from_to(start, end) } else { Symbol::intern(lit_content) };
408        (kind, symbol)
409    }
410
    /// Creates a new span from the given absolute positions.
    #[inline]
    fn new_span(&self, lo: BytePos, hi: BytePos) -> Span {
        Span::new(lo, hi)
    }

    /// Converts an absolute position into an index into `self.src`.
    #[inline]
    fn src_index(&self, pos: BytePos) -> usize {
        (pos - self.start_pos).to_usize()
    }

    /// Interns the source text from `start` up to but excluding the current
    /// position `self.pos`.
    fn symbol_from(&self, start: BytePos) -> Symbol {
        self.symbol_from_to(start, self.pos)
    }

    /// Slice of the source text from `start` up to but excluding the current
    /// position `self.pos`.
    fn str_from(&self, start: BytePos) -> &'src str {
        self.str_from_to(start, self.pos)
    }

    /// Same as `symbol_from`, with an explicit endpoint.
    fn symbol_from_to(&self, start: BytePos, end: BytePos) -> Symbol {
        Symbol::intern(self.str_from_to(start, end))
    }

    /// Slice of the source text spanning from `start` up to but excluding `end`.
    #[track_caller]
    fn str_from_to(&self, start: BytePos, end: BytePos) -> &'src str {
        &self.src[self.src_index(start)..self.src_index(end)]
    }

    /// Slice of the source text spanning from `start` until the end.
    fn str_from_to_end(&self, start: BytePos) -> &'src str {
        &self.src[self.src_index(start)..]
    }

    /// Emits an error for a literal with an unknown prefix, e.g. `foo"bar"`.
    fn report_unknown_prefix(&self, start: BytePos) {
        let prefix = self.str_from_to(start, self.pos);
        let msg = format!("literal prefix {prefix} is unknown");
        self.dcx().err(msg).span(self.new_span(start, self.pos)).emit();
    }
454}
455
456impl Iterator for Lexer<'_, '_> {
457    type Item = Token;
458
459    #[inline]
460    fn next(&mut self) -> Option<Token> {
461        let token = self.next_token();
462        if token.is_eof() {
463            None
464        } else {
465            Some(token)
466        }
467    }
468}
469
// NOTE(review): fusedness assumes `next_token` keeps returning an EOF token once
// the cursor is exhausted — confirm against `Cursor::advance_token`.
impl std::iter::FusedIterator for Lexer<'_, '_> {}
471
/// Renders a character for use in an error message, escaping it when it is not
/// printable ASCII.
fn escaped_char(c: char) -> String {
    // Printable ASCII — including `\`, `'` and `"` — is shown verbatim for
    // user-facing messages; everything else goes through `char::escape_default`.
    if matches!(c, '\u{20}'..='\u{7e}') {
        c.to_string()
    } else {
        c.escape_default().to_string()
    }
}
482
#[cfg(test)]
mod tests {
    use super::*;
    use std::ops::Range;
    use BinOpToken::*;
    use TokenKind::*;

    /// Expected output of a lexing test: `(byte range, token kind)` pairs.
    type Expected<'a> = &'a [(Range<usize>, TokenKind)];

    /// Lexes `src` and asserts that the non-comment tokens and their spans
    /// match `expected`.
    // NOTE(review): assumes `has_errors()` returns `Ok` when no errors were
    // emitted, so `unwrap` asserts a clean lex — confirm in `DiagCtxt`.
    fn check(src: &str, expected: Expected<'_>) {
        let sess = Session::builder().with_test_emitter().build();
        let tokens: Vec<_> = Lexer::new(&sess, src)
            .filter(|t| !t.is_comment())
            .map(|t| (t.span.lo().to_usize()..t.span.hi().to_usize(), t.kind))
            .collect();
        sess.dcx.has_errors().unwrap();
        assert_eq!(tokens, expected, "{src:?}");
    }

    /// Runs [`check`] on each `(source, expected)` pair.
    fn checks(tests: &[(&str, Expected<'_>)]) {
        for &(src, expected) in tests {
            check(src, expected);
        }
    }

    /// Shorthand for a literal token with interned text.
    fn lit(kind: TokenLitKind, symbol: &str) -> TokenKind {
        Literal(kind, sym(symbol))
    }

    /// Shorthand for an identifier token.
    fn id(symbol: &str) -> TokenKind {
        Ident(sym(symbol))
    }

    // Shadows the `sym` module imported at the top of the file within this test module.
    fn sym(s: &str) -> Symbol {
        Symbol::intern(s)
    }

    /// Whitespace-only sources must produce no tokens.
    #[test]
    fn empty() {
        checks(&[
            ("", &[]),
            (" ", &[]),
            (" \n", &[]),
            ("\n", &[]),
            ("\n\t", &[]),
            ("\n \t", &[]),
            ("\n \t ", &[]),
            (" \n \t \t", &[]),
        ]);
    }

    /// String and number literals, including `unicode`/`hex` prefixes.
    /// Wrapped in `SessionGlobals` because symbols are interned.
    #[test]
    fn literals() {
        use TokenLitKind::*;
        solar_interface::SessionGlobals::new().set(|| {
            checks(&[
                ("\"\"", &[(0..2, lit(Str, ""))]),
                ("\"\"\"\"", &[(0..2, lit(Str, "")), (2..4, lit(Str, ""))]),
                ("\"\" \"\"", &[(0..2, lit(Str, "")), (3..5, lit(Str, ""))]),
                ("\"\\\"\"", &[(0..4, lit(Str, "\\\""))]),
                ("unicode\"\"", &[(0..9, lit(UnicodeStr, ""))]),
                ("unicode \"\"", &[(0..7, id("unicode")), (8..10, lit(Str, ""))]),
                ("hex\"\"", &[(0..5, lit(HexStr, ""))]),
                ("hex \"\"", &[(0..3, id("hex")), (4..6, lit(Str, ""))]),
                //
                ("0", &[(0..1, lit(Integer, "0"))]),
                ("0a", &[(0..1, lit(Integer, "0")), (1..2, id("a"))]),
                ("0xa", &[(0..3, lit(Integer, "0xa"))]),
                ("0.e1", &[(0..1, lit(Integer, "0")), (1..2, Dot), (2..4, id("e1"))]),
                (
                    "0.e-1",
                    &[
                        (0..1, lit(Integer, "0")),
                        (1..2, Dot),
                        (2..3, id("e")),
                        (3..4, BinOp(Minus)),
                        (4..5, lit(Integer, "1")),
                    ],
                ),
                ("0.0", &[(0..3, lit(Rational, "0.0"))]),
                ("0.", &[(0..2, lit(Rational, "0."))]),
                (".0", &[(0..2, lit(Rational, ".0"))]),
                ("0.0e1", &[(0..5, lit(Rational, "0.0e1"))]),
                ("0.0e-1", &[(0..6, lit(Rational, "0.0e-1"))]),
                ("0e1", &[(0..3, lit(Rational, "0e1"))]),
                ("0e1.", &[(0..3, lit(Rational, "0e1")), (3..4, Dot)]),
            ]);
        });
    }

    /// Identifiers (including `$`) and surrounding-whitespace span handling.
    #[test]
    fn idents() {
        solar_interface::SessionGlobals::new().set(|| {
            checks(&[
                ("$", &[(0..1, id("$"))]),
                ("a$", &[(0..2, id("a$"))]),
                ("a_$123_", &[(0..7, id("a_$123_"))]),
                ("   b", &[(3..4, id("b"))]),
                (" c\t ", &[(1..2, id("c"))]),
                (" \td ", &[(2..3, id("d"))]),
                (" \t\nef ", &[(3..5, id("ef"))]),
                (" \t\n\tghi ", &[(4..7, id("ghi"))]),
            ]);
        });
    }

    /// Line and block comments; only doc-comments survive the `is_comment` filter check
    /// setup here since regular comments are expected to lex but compare as filtered-out.
    #[test]
    fn doc_comments() {
        use CommentKind::*;

        fn doc(kind: CommentKind, symbol: &str) -> TokenKind {
            Comment(true, kind, sym(symbol))
        }

        solar_interface::SessionGlobals::new().set(|| {
            checks(&[
                ("// line comment", &[]),
                ("// / line comment", &[]),
                ("// ! line comment", &[]),
                ("// /* line comment", &[]), // */ <-- aaron-bond.better-comments doesn't like this
                ("/// line doc-comment", &[(0..20, doc(Line, " line doc-comment"))]),
                ("//// invalid doc-comment", &[]),
                ("///// invalid doc-comment", &[]),
                //
                ("/**/", &[]),
                ("/***/", &[]),
                ("/****/", &[]),
                ("/*/*/", &[]),
                ("/* /*/", &[]),
                ("/*/**/", &[]),
                ("/* /**/", &[]),
                ("/* normal block comment */", &[]),
                ("/* /* normal block comment */", &[]),
                ("/** block doc-comment */", &[(0..24, doc(Block, " block doc-comment "))]),
                ("/** /* block doc-comment */", &[(0..27, doc(Block, " /* block doc-comment "))]),
                ("/** block doc-comment /*/", &[(0..25, doc(Block, " block doc-comment /"))]),
            ]);
        });
    }

    /// Every punctuation/operator token, in isolation.
    #[test]
    fn operators() {
        use Delimiter::*;
        // From Solc `TOKEN_LIST`: https://github.com/ethereum/solidity/blob/194b114664c7daebc2ff68af3c573272f5d28913/liblangutil/Token.h#L67
        checks(&[
            (")", &[(0..1, CloseDelim(Parenthesis))]),
            ("(", &[(0..1, OpenDelim(Parenthesis))]),
            ("[", &[(0..1, OpenDelim(Bracket))]),
            ("]", &[(0..1, CloseDelim(Bracket))]),
            ("{", &[(0..1, OpenDelim(Brace))]),
            ("}", &[(0..1, CloseDelim(Brace))]),
            (":", &[(0..1, Colon)]),
            (";", &[(0..1, Semi)]),
            (".", &[(0..1, Dot)]),
            ("?", &[(0..1, Question)]),
            ("=>", &[(0..2, FatArrow)]),
            ("->", &[(0..2, Arrow)]),
            ("=", &[(0..1, Eq)]),
            ("|=", &[(0..2, BinOpEq(Or))]),
            ("^=", &[(0..2, BinOpEq(Caret))]),
            ("&=", &[(0..2, BinOpEq(And))]),
            ("<<=", &[(0..3, BinOpEq(Shl))]),
            (">>=", &[(0..3, BinOpEq(Shr))]),
            (">>>=", &[(0..4, BinOpEq(Sar))]),
            ("+=", &[(0..2, BinOpEq(Plus))]),
            ("-=", &[(0..2, BinOpEq(Minus))]),
            ("*=", &[(0..2, BinOpEq(Star))]),
            ("/=", &[(0..2, BinOpEq(Slash))]),
            ("%=", &[(0..2, BinOpEq(Percent))]),
            (",", &[(0..1, Comma)]),
            ("||", &[(0..2, OrOr)]),
            ("&&", &[(0..2, AndAnd)]),
            ("|", &[(0..1, BinOp(Or))]),
            ("^", &[(0..1, BinOp(Caret))]),
            ("&", &[(0..1, BinOp(And))]),
            ("<<", &[(0..2, BinOp(Shl))]),
            (">>", &[(0..2, BinOp(Shr))]),
            (">>>", &[(0..3, BinOp(Sar))]),
            ("+", &[(0..1, BinOp(Plus))]),
            ("-", &[(0..1, BinOp(Minus))]),
            ("*", &[(0..1, BinOp(Star))]),
            ("/", &[(0..1, BinOp(Slash))]),
            ("%", &[(0..1, BinOp(Percent))]),
            ("**", &[(0..2, StarStar)]),
            ("==", &[(0..2, EqEq)]),
            ("!=", &[(0..2, Ne)]),
            ("<", &[(0..1, Lt)]),
            (">", &[(0..1, Gt)]),
            ("<=", &[(0..2, Le)]),
            (">=", &[(0..2, Ge)]),
            ("!", &[(0..1, Not)]),
            ("~", &[(0..1, Tilde)]),
            ("++", &[(0..2, PlusPlus)]),
            ("--", &[(0..2, MinusMinus)]),
            (":=", &[(0..2, Walrus)]),
        ]);
    }

    /// Token gluing: adjacent tokens merge (`=` `=` -> `==`) unless separated
    /// by whitespace.
    #[test]
    fn glueing() {
        checks(&[
            ("=", &[(0..1, Eq)]),
            ("==", &[(0..2, EqEq)]),
            ("= =", &[(0..1, Eq), (2..3, Eq)]),
            ("===", &[(0..2, EqEq), (2..3, Eq)]),
            ("== =", &[(0..2, EqEq), (3..4, Eq)]),
            ("= ==", &[(0..1, Eq), (2..4, EqEq)]),
            ("====", &[(0..2, EqEq), (2..4, EqEq)]),
            ("== ==", &[(0..2, EqEq), (3..5, EqEq)]),
            ("= ===", &[(0..1, Eq), (2..4, EqEq), (4..5, Eq)]),
            ("=====", &[(0..2, EqEq), (2..4, EqEq), (4..5, Eq)]),
            //
            (" <", &[(1..2, Lt)]),
            (" <=", &[(1..3, Le)]),
            (" < =", &[(1..2, Lt), (3..4, Eq)]),
            (" <<", &[(1..3, BinOp(Shl))]),
            (" <<=", &[(1..4, BinOpEq(Shl))]),
            //
            (" >", &[(1..2, Gt)]),
            (" >=", &[(1..3, Ge)]),
            (" > =", &[(1..2, Gt), (3..4, Eq)]),
            (" >>", &[(1..3, BinOp(Shr))]),
            (" >>>", &[(1..4, BinOp(Sar))]),
            (" >>>=", &[(1..5, BinOpEq(Sar))]),
            //
            ("+", &[(0..1, BinOp(Plus))]),
            ("++", &[(0..2, PlusPlus)]),
            ("+++", &[(0..2, PlusPlus), (2..3, BinOp(Plus))]),
            ("+ =", &[(0..1, BinOp(Plus)), (2..3, Eq)]),
            ("+ +=", &[(0..1, BinOp(Plus)), (2..4, BinOpEq(Plus))]),
            ("+++=", &[(0..2, PlusPlus), (2..4, BinOpEq(Plus))]),
            ("+ +", &[(0..1, BinOp(Plus)), (2..3, BinOp(Plus))]),
            //
            ("-", &[(0..1, BinOp(Minus))]),
            ("--", &[(0..2, MinusMinus)]),
            ("---", &[(0..2, MinusMinus), (2..3, BinOp(Minus))]),
            ("- =", &[(0..1, BinOp(Minus)), (2..3, Eq)]),
            ("- -=", &[(0..1, BinOp(Minus)), (2..4, BinOpEq(Minus))]),
            ("---=", &[(0..2, MinusMinus), (2..4, BinOpEq(Minus))]),
            ("- -", &[(0..1, BinOp(Minus)), (2..3, BinOp(Minus))]),
        ]);
    }
}