// foundry_solang_parser/lexer.rs

1// SPDX-License-Identifier: Apache-2.0
2
3//! Custom Solidity lexer.
4//!
5//! Solidity needs a custom lexer for two reasons:
6//!  - comments and doc comments
7//!  - pragma value is [^;]+
8
9use crate::pt::{Comment, Loc};
10use itertools::{peek_nth, PeekNth};
11use phf::phf_map;
12use std::{fmt, str::CharIndices};
13use thiserror::Error;
14use unicode_xid::UnicodeXID;
15
/// A spanned [Token]: `(start, token, end)`, where `start` and `end` are byte
/// offsets into the source text.
pub type Spanned<'a> = (usize, Token<'a>, usize);

/// [Lexer]'s Result type.
pub type Result<'a, T = Spanned<'a>, E = LexicalError> = std::result::Result<T, E>;
21
22/// A Solidity lexical token. Produced by [Lexer].
/// A Solidity lexical token. Produced by [Lexer].
///
/// Variants that carry `&'input str` borrow directly from the source text, so
/// tokens are cheap to copy and never allocate.
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
#[allow(missing_docs)]
pub enum Token<'input> {
    Identifier(&'input str),
    /// `(unicode, literal)`
    StringLiteral(bool, &'input str),
    AddressLiteral(&'input str),
    HexLiteral(&'input str),
    /// `(number, exponent)`
    Number(&'input str, &'input str),
    /// `(number, fraction, exponent)`
    RationalNumber(&'input str, &'input str, &'input str),
    HexNumber(&'input str),
    Divide,

    // Top-level declaration keywords.
    Contract,
    Library,
    Interface,
    Function,
    Pragma,
    Import,

    Struct,
    Event,
    Enum,
    Type,

    // `layout` / `at` keywords (storage layout specifiers).
    Layout,
    KwAt,

    // Data locations.
    Memory,
    Storage,
    Calldata,
    Transient,

    // Visibility.
    Public,
    Private,
    Internal,
    External,

    Constant,

    New,
    Delete,

    // Mutability.
    Pure,
    View,
    Payable,

    Do,
    Continue,
    Break,

    Throw,
    Emit,
    Return,
    Returns,
    Revert,

    // Elementary type names; the `u16`/`u8` payload is the declared width.
    Uint(u16),
    Int(u16),
    Bytes(u8),
    // prior to 0.8.0 `byte` used to be an alias for `bytes1`
    Byte,
    DynamicBytes,
    Bool,
    Address,
    String,

    // Punctuation.
    Semicolon,
    Comma,
    OpenParenthesis,
    CloseParenthesis,
    OpenCurlyBrace,
    CloseCurlyBrace,

    // Operators.
    BitwiseOr,
    BitwiseOrAssign,
    Or,

    BitwiseXor,
    BitwiseXorAssign,

    BitwiseAnd,
    BitwiseAndAssign,
    And,

    AddAssign,
    Increment,
    Add,

    SubtractAssign,
    Decrement,
    Subtract,

    MulAssign,
    Mul,
    Power,
    DivideAssign,
    ModuloAssign,
    Modulo,

    Equal,
    Assign,
    /// `:=`, used in Yul assembly blocks.
    ColonAssign,

    NotEqual,
    Not,

    True,
    False,
    Else,
    Anonymous,
    For,
    While,
    If,

    ShiftRight,
    ShiftRightAssign,
    Less,
    LessEqual,

    ShiftLeft,
    ShiftLeftAssign,
    More,
    MoreEqual,

    Constructor,
    Indexed,

    Member,
    Colon,
    OpenBracket,
    CloseBracket,
    BitwiseNot,
    Question,

    Mapping,
    /// `=>`, used in mapping types.
    Arrow,

    Try,
    Catch,

    Receive,
    Fallback,

    As,
    Is,
    Abstract,
    Virtual,
    Override,
    Using,
    Modifier,
    Immutable,
    Unchecked,

    // Yul / inline assembly keywords.
    Assembly,
    Let,
    Leave,
    Switch,
    Case,
    Default,
    /// `->`, used in Yul function signatures.
    YulArrow,

    // Storage types for Soroban
    Persistent,
    Temporary,
    Instance,

    /// `@name`-style annotation; payload is the name without the `@`.
    Annotation(&'input str),
}
193
194impl fmt::Display for Token<'_> {
195    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
196        match self {
197            Token::Identifier(id) => write!(f, "{id}"),
198            Token::StringLiteral(false, s) => write!(f, "\"{s}\""),
199            Token::StringLiteral(true, s) => write!(f, "unicode\"{s}\""),
200            Token::HexLiteral(hex) => write!(f, "{hex}"),
201            Token::AddressLiteral(address) => write!(f, "{address}"),
202            Token::Number(integer, "") => write!(f, "{integer}"),
203            Token::Number(integer, exp) => write!(f, "{integer}e{exp}"),
204            Token::RationalNumber(integer, fraction, "") => {
205                write!(f, "{integer}.{fraction}")
206            }
207            Token::RationalNumber(integer, fraction, exp) => {
208                write!(f, "{integer}.{fraction}e{exp}")
209            }
210            Token::HexNumber(n) => write!(f, "{n}"),
211            Token::Uint(w) => write!(f, "uint{w}"),
212            Token::Int(w) => write!(f, "int{w}"),
213            Token::Bytes(w) => write!(f, "bytes{w}"),
214            Token::Byte => write!(f, "byte"),
215            Token::DynamicBytes => write!(f, "bytes"),
216            Token::Semicolon => write!(f, ";"),
217            Token::Comma => write!(f, ","),
218            Token::OpenParenthesis => write!(f, "("),
219            Token::CloseParenthesis => write!(f, ")"),
220            Token::OpenCurlyBrace => write!(f, "{{"),
221            Token::CloseCurlyBrace => write!(f, "}}"),
222            Token::BitwiseOr => write!(f, "|"),
223            Token::BitwiseOrAssign => write!(f, "|="),
224            Token::Or => write!(f, "||"),
225            Token::BitwiseXor => write!(f, "^"),
226            Token::BitwiseXorAssign => write!(f, "^="),
227            Token::BitwiseAnd => write!(f, "&"),
228            Token::BitwiseAndAssign => write!(f, "&="),
229            Token::And => write!(f, "&&"),
230            Token::AddAssign => write!(f, "+="),
231            Token::Increment => write!(f, "++"),
232            Token::Add => write!(f, "+"),
233            Token::SubtractAssign => write!(f, "-="),
234            Token::Decrement => write!(f, "--"),
235            Token::Subtract => write!(f, "-"),
236            Token::MulAssign => write!(f, "*="),
237            Token::Mul => write!(f, "*"),
238            Token::Power => write!(f, "**"),
239            Token::Divide => write!(f, "/"),
240            Token::DivideAssign => write!(f, "/="),
241            Token::ModuloAssign => write!(f, "%="),
242            Token::Modulo => write!(f, "%"),
243            Token::Equal => write!(f, "=="),
244            Token::Assign => write!(f, "="),
245            Token::ColonAssign => write!(f, ":="),
246            Token::NotEqual => write!(f, "!="),
247            Token::Not => write!(f, "!"),
248            Token::ShiftLeft => write!(f, "<<"),
249            Token::ShiftLeftAssign => write!(f, "<<="),
250            Token::More => write!(f, ">"),
251            Token::MoreEqual => write!(f, ">="),
252            Token::Member => write!(f, "."),
253            Token::Colon => write!(f, ":"),
254            Token::OpenBracket => write!(f, "["),
255            Token::CloseBracket => write!(f, "]"),
256            Token::BitwiseNot => write!(f, "~"),
257            Token::Question => write!(f, "?"),
258            Token::ShiftRightAssign => write!(f, ">>="),
259            Token::ShiftRight => write!(f, ">>"),
260            Token::Less => write!(f, "<"),
261            Token::LessEqual => write!(f, "<="),
262            Token::Bool => write!(f, "bool"),
263            Token::Address => write!(f, "address"),
264            Token::String => write!(f, "string"),
265            Token::Contract => write!(f, "contract"),
266            Token::Library => write!(f, "library"),
267            Token::Interface => write!(f, "interface"),
268            Token::Function => write!(f, "function"),
269            Token::Pragma => write!(f, "pragma"),
270            Token::Import => write!(f, "import"),
271            Token::Struct => write!(f, "struct"),
272            Token::Event => write!(f, "event"),
273            Token::Enum => write!(f, "enum"),
274            Token::Type => write!(f, "type"),
275            Token::Memory => write!(f, "memory"),
276            Token::Storage => write!(f, "storage"),
277            Token::Calldata => write!(f, "calldata"),
278            Token::Public => write!(f, "public"),
279            Token::Private => write!(f, "private"),
280            Token::Internal => write!(f, "internal"),
281            Token::External => write!(f, "external"),
282            Token::Constant => write!(f, "constant"),
283            Token::New => write!(f, "new"),
284            Token::Delete => write!(f, "delete"),
285            Token::Pure => write!(f, "pure"),
286            Token::View => write!(f, "view"),
287            Token::Payable => write!(f, "payable"),
288            Token::Do => write!(f, "do"),
289            Token::Continue => write!(f, "continue"),
290            Token::Break => write!(f, "break"),
291            Token::Throw => write!(f, "throw"),
292            Token::Emit => write!(f, "emit"),
293            Token::Return => write!(f, "return"),
294            Token::Returns => write!(f, "returns"),
295            Token::Revert => write!(f, "revert"),
296            Token::True => write!(f, "true"),
297            Token::False => write!(f, "false"),
298            Token::Else => write!(f, "else"),
299            Token::Anonymous => write!(f, "anonymous"),
300            Token::For => write!(f, "for"),
301            Token::While => write!(f, "while"),
302            Token::If => write!(f, "if"),
303            Token::Constructor => write!(f, "constructor"),
304            Token::Indexed => write!(f, "indexed"),
305            Token::Mapping => write!(f, "mapping"),
306            Token::Arrow => write!(f, "=>"),
307            Token::Try => write!(f, "try"),
308            Token::Catch => write!(f, "catch"),
309            Token::Receive => write!(f, "receive"),
310            Token::Fallback => write!(f, "fallback"),
311            Token::As => write!(f, "as"),
312            Token::Is => write!(f, "is"),
313            Token::Abstract => write!(f, "abstract"),
314            Token::Virtual => write!(f, "virtual"),
315            Token::Override => write!(f, "override"),
316            Token::Using => write!(f, "using"),
317            Token::Modifier => write!(f, "modifier"),
318            Token::Immutable => write!(f, "immutable"),
319            Token::Unchecked => write!(f, "unchecked"),
320            Token::Assembly => write!(f, "assembly"),
321            Token::Let => write!(f, "let"),
322            Token::Leave => write!(f, "leave"),
323            Token::Switch => write!(f, "switch"),
324            Token::Case => write!(f, "case"),
325            Token::Default => write!(f, "default"),
326            Token::YulArrow => write!(f, "->"),
327            Token::Annotation(name) => write!(f, "@{name}"),
328            Token::Persistent => write!(f, "persistent"),
329            Token::Temporary => write!(f, "temporary"),
330            Token::Instance => write!(f, "instance"),
331            Token::Transient => write!(f, "transient"),
332            Token::Layout => write!(f, "layout"),
333            Token::KwAt => write!(f, "at"),
334        }
335    }
336}
337
/// Custom Solidity lexer.
///
/// # Examples
///
/// ```
/// use solang_parser::lexer::{Lexer, Token};
///
/// let source = "uint256 number = 0;";
/// let mut comments = Vec::new();
/// let mut errors = Vec::new();
/// let mut lexer = Lexer::new(source, 0, &mut comments, &mut errors);
///
/// let mut next_token = || lexer.next().map(|(_, token, _)| token);
/// assert_eq!(next_token(), Some(Token::Uint(256)));
/// assert_eq!(next_token(), Some(Token::Identifier("number")));
/// assert_eq!(next_token(), Some(Token::Assign));
/// assert_eq!(next_token(), Some(Token::Number("0", "")));
/// assert_eq!(next_token(), Some(Token::Semicolon));
/// assert_eq!(next_token(), None);
/// assert!(errors.is_empty());
/// assert!(comments.is_empty());
/// ```
#[derive(Debug)]
pub struct Lexer<'input> {
    // The source text being lexed; all tokens borrow from it.
    input: &'input str,
    // Char iterator over `input` with multi-character lookahead
    // (`peek`, `peek_nth`).
    chars: PeekNth<CharIndices<'input>>,
    // Comments and doc-comments collected as they are encountered.
    comments: &'input mut Vec<Comment>,
    // File number used when constructing `Loc`s for tokens and errors.
    file_no: usize,
    /// While parsing version semver, do not parse rational numbers
    parse_semver: bool,
    // The two most recently produced tokens, for context-sensitive lexing.
    // NOTE(review): the update/read logic is not visible in this chunk —
    // confirm ordering against `next` before relying on it.
    last_tokens: [Option<Token<'input>>; 2],
    /// The mutable reference to the error vector.
    pub errors: &'input mut Vec<LexicalError>,
}
372
373/// An error thrown by [Lexer].
374#[derive(Debug, Clone, PartialEq, Eq, Error)]
375#[allow(missing_docs)]
376pub enum LexicalError {
377    #[error("end of file found in comment")]
378    EndOfFileInComment(Loc),
379
380    #[error("end of file found in string literal")]
381    EndOfFileInString(Loc),
382
383    #[error("end of file found in hex literal string")]
384    EndofFileInHex(Loc),
385
386    #[error("missing number")]
387    MissingNumber(Loc),
388
389    #[error("invalid character '{1}' in hex literal string")]
390    InvalidCharacterInHexLiteral(Loc, char),
391
392    #[error("unrecognised token '{1}'")]
393    UnrecognisedToken(Loc, String),
394
395    #[error("missing exponent")]
396    MissingExponent(Loc),
397
398    #[error("'{1}' found where 'from' expected")]
399    ExpectedFrom(Loc, String),
400}
401
402/// Returns whether `word` is a keyword in Solidity.
403pub fn is_keyword(word: &str) -> bool {
404    KEYWORDS.contains_key(word)
405}
406
// Perfect-hash map (built at compile time by `phf`) from every Solidity
// keyword to its token. Widthed integer types (`intN`/`uintN`/`bytesN`) are
// enumerated explicitly since `phf_map!` entries must be literals; bare
// `int`/`uint` alias their 256-bit forms.
static KEYWORDS: phf::Map<&'static str, Token> = phf_map! {
    "address" => Token::Address,
    "anonymous" => Token::Anonymous,
    "bool" => Token::Bool,
    "break" => Token::Break,
    "bytes1" => Token::Bytes(1),
    "bytes2" => Token::Bytes(2),
    "bytes3" => Token::Bytes(3),
    "bytes4" => Token::Bytes(4),
    "bytes5" => Token::Bytes(5),
    "bytes6" => Token::Bytes(6),
    "bytes7" => Token::Bytes(7),
    "bytes8" => Token::Bytes(8),
    "bytes9" => Token::Bytes(9),
    "bytes10" => Token::Bytes(10),
    "bytes11" => Token::Bytes(11),
    "bytes12" => Token::Bytes(12),
    "bytes13" => Token::Bytes(13),
    "bytes14" => Token::Bytes(14),
    "bytes15" => Token::Bytes(15),
    "bytes16" => Token::Bytes(16),
    "bytes17" => Token::Bytes(17),
    "bytes18" => Token::Bytes(18),
    "bytes19" => Token::Bytes(19),
    "bytes20" => Token::Bytes(20),
    "bytes21" => Token::Bytes(21),
    "bytes22" => Token::Bytes(22),
    "bytes23" => Token::Bytes(23),
    "bytes24" => Token::Bytes(24),
    "bytes25" => Token::Bytes(25),
    "bytes26" => Token::Bytes(26),
    "bytes27" => Token::Bytes(27),
    "bytes28" => Token::Bytes(28),
    "bytes29" => Token::Bytes(29),
    "bytes30" => Token::Bytes(30),
    "bytes31" => Token::Bytes(31),
    "bytes32" => Token::Bytes(32),
    "bytes" => Token::DynamicBytes,
    "byte" => Token::Byte,
    "calldata" => Token::Calldata,
    "case" => Token::Case,
    "constant" => Token::Constant,
    "constructor" => Token::Constructor,
    "continue" => Token::Continue,
    "contract" => Token::Contract,
    "default" => Token::Default,
    "delete" => Token::Delete,
    "do" => Token::Do,
    "else" => Token::Else,
    "emit" => Token::Emit,
    "enum" => Token::Enum,
    "event" => Token::Event,
    "external" => Token::External,
    "false" => Token::False,
    "for" => Token::For,
    "function" => Token::Function,
    "if" => Token::If,
    "import" => Token::Import,
    "indexed" => Token::Indexed,
    "int8" => Token::Int(8),
    "int16" => Token::Int(16),
    "int24" => Token::Int(24),
    "int32" => Token::Int(32),
    "int40" => Token::Int(40),
    "int48" => Token::Int(48),
    "int56" => Token::Int(56),
    "int64" => Token::Int(64),
    "int72" => Token::Int(72),
    "int80" => Token::Int(80),
    "int88" => Token::Int(88),
    "int96" => Token::Int(96),
    "int104" => Token::Int(104),
    "int112" => Token::Int(112),
    "int120" => Token::Int(120),
    "int128" => Token::Int(128),
    "int136" => Token::Int(136),
    "int144" => Token::Int(144),
    "int152" => Token::Int(152),
    "int160" => Token::Int(160),
    "int168" => Token::Int(168),
    "int176" => Token::Int(176),
    "int184" => Token::Int(184),
    "int192" => Token::Int(192),
    "int200" => Token::Int(200),
    "int208" => Token::Int(208),
    "int216" => Token::Int(216),
    "int224" => Token::Int(224),
    "int232" => Token::Int(232),
    "int240" => Token::Int(240),
    "int248" => Token::Int(248),
    "int256" => Token::Int(256),
    "interface" => Token::Interface,
    "internal" => Token::Internal,
    // `int` defaults to 256 bits.
    "int" => Token::Int(256),
    "leave" => Token::Leave,
    "library" => Token::Library,
    "mapping" => Token::Mapping,
    "memory" => Token::Memory,
    "new" => Token::New,
    "payable" => Token::Payable,
    "pragma" => Token::Pragma,
    "private" => Token::Private,
    "public" => Token::Public,
    "pure" => Token::Pure,
    "returns" => Token::Returns,
    "return" => Token::Return,
    "revert" => Token::Revert,
    "storage" => Token::Storage,
    "string" => Token::String,
    "struct" => Token::Struct,
    "switch" => Token::Switch,
    "throw" => Token::Throw,
    "true" => Token::True,
    "type" => Token::Type,
    "uint8" => Token::Uint(8),
    "uint16" => Token::Uint(16),
    "uint24" => Token::Uint(24),
    "uint32" => Token::Uint(32),
    "uint40" => Token::Uint(40),
    "uint48" => Token::Uint(48),
    "uint56" => Token::Uint(56),
    "uint64" => Token::Uint(64),
    "uint72" => Token::Uint(72),
    "uint80" => Token::Uint(80),
    "uint88" => Token::Uint(88),
    "uint96" => Token::Uint(96),
    "uint104" => Token::Uint(104),
    "uint112" => Token::Uint(112),
    "uint120" => Token::Uint(120),
    "uint128" => Token::Uint(128),
    "uint136" => Token::Uint(136),
    "uint144" => Token::Uint(144),
    "uint152" => Token::Uint(152),
    "uint160" => Token::Uint(160),
    "uint168" => Token::Uint(168),
    "uint176" => Token::Uint(176),
    "uint184" => Token::Uint(184),
    "uint192" => Token::Uint(192),
    "uint200" => Token::Uint(200),
    "uint208" => Token::Uint(208),
    "uint216" => Token::Uint(216),
    "uint224" => Token::Uint(224),
    "uint232" => Token::Uint(232),
    "uint240" => Token::Uint(240),
    "uint248" => Token::Uint(248),
    "uint256" => Token::Uint(256),
    // `uint` defaults to 256 bits.
    "uint" => Token::Uint(256),
    "view" => Token::View,
    "while" => Token::While,
    "try" => Token::Try,
    "catch" => Token::Catch,
    "receive" => Token::Receive,
    "fallback" => Token::Fallback,
    "as" => Token::As,
    "is" => Token::Is,
    "layout" => Token::Layout,
    "at" => Token::KwAt,
    "abstract" => Token::Abstract,
    "virtual" => Token::Virtual,
    "override" => Token::Override,
    "using" => Token::Using,
    "modifier" => Token::Modifier,
    "immutable" => Token::Immutable,
    "unchecked" => Token::Unchecked,
    "assembly" => Token::Assembly,
    "let" => Token::Let,
    // Soroban storage types.
    "persistent" => Token::Persistent,
    "temporary" => Token::Temporary,
    "instance" => Token::Instance,
    "transient" => Token::Transient,
};
578
579impl<'input> Lexer<'input> {
580    /// Instantiates a new Lexer.
581    ///
582    /// # Examples
583    ///
584    /// ```
585    /// use solang_parser::lexer::Lexer;
586    ///
587    /// let source = "uint256 number = 0;";
588    /// let mut comments = Vec::new();
589    /// let mut errors = Vec::new();
590    /// let mut lexer = Lexer::new(source, 0, &mut comments, &mut errors);
591    /// ```
592    pub fn new(
593        input: &'input str,
594        file_no: usize,
595        comments: &'input mut Vec<Comment>,
596        errors: &'input mut Vec<LexicalError>,
597    ) -> Self {
598        Lexer {
599            input,
600            chars: peek_nth(input.char_indices()),
601            comments,
602            file_no,
603            parse_semver: false,
604            last_tokens: [None, None],
605            errors,
606        }
607    }
608
    /// Lexes a numeric literal whose first character `ch` sits at byte
    /// offset `start`.
    ///
    /// Recognises hex numbers (`0x...`), decimal integers, rational numbers
    /// (`1.5`, `.5`) and exponents (`1e10`, `1e-10`). Underscores are
    /// accepted as digit separators throughout. While `parse_semver` is set,
    /// only a plain integer is consumed so `0.8.0` lexes as three numbers.
    ///
    /// Returns the spanned token, or a [LexicalError] for a malformed hex
    /// number or a missing exponent.
    fn parse_number(&mut self, mut start: usize, ch: char) -> Result<'input> {
        let mut is_rational = false;
        if ch == '0' {
            if let Some((_, 'x')) = self.chars.peek() {
                // hex number
                self.chars.next();

                // At least one hex digit must follow "0x".
                let mut end = match self.chars.next() {
                    Some((end, ch)) if ch.is_ascii_hexdigit() => end,
                    Some((..)) => {
                        return Err(LexicalError::MissingNumber(Loc::File(
                            self.file_no,
                            start,
                            start + 1,
                        )));
                    }
                    None => {
                        return Err(LexicalError::EndofFileInHex(Loc::File(
                            self.file_no,
                            start,
                            self.input.len(),
                        )));
                    }
                };

                // Consume the remaining hex digits / separators.
                while let Some((i, ch)) = self.chars.peek() {
                    if !ch.is_ascii_hexdigit() && *ch != '_' {
                        break;
                    }
                    end = *i;
                    self.chars.next();
                }

                return Ok((start, Token::HexNumber(&self.input[start..=end]), end + 1));
            }
        }

        if ch == '.' {
            // A literal like `.5` with no integer part.
            // NOTE(review): stepping `start` back assumes the caller passed
            // the offset *after* the dot here — confirm against `next`.
            is_rational = true;
            start -= 1;
        }

        // Consume the integer part (digits and `_` separators).
        let mut end = start;
        while let Some((i, ch)) = self.chars.peek() {
            if !ch.is_ascii_digit() && *ch != '_' {
                break;
            }
            end = *i;
            self.chars.next();
        }

        if self.parse_semver {
            // In semver mode, stop here: no fraction, no exponent.
            let integer = &self.input[start..=end];
            let exp = &self.input[0..0]; // deliberately empty slice

            return Ok((start, Token::Number(integer, exp), end + 1));
        }

        // Pre-position the rational sub-spans. `end_before_rational` is an
        // *exclusive* bound on the integer part; when the literal started
        // with `.` the integer part is empty.
        let mut rational_end = end;
        let mut end_before_rational = end + 1;
        let mut rational_start = end;
        if is_rational {
            end_before_rational = start;
            rational_start = start + 1;
        }

        // A `.` followed by a digit begins the fractional part. Two-character
        // lookahead keeps `1.call()`-style member access out of the number.
        if let Some((_, '.')) = self.chars.peek() {
            if let Some((i, ch)) = self.chars.peek_nth(1) {
                if ch.is_ascii_digit() && !is_rational {
                    rational_start = *i;
                    rational_end = *i;
                    is_rational = true;
                    self.chars.next(); // advance over '.'
                    while let Some((i, ch)) = self.chars.peek() {
                        if !ch.is_ascii_digit() && *ch != '_' {
                            break;
                        }
                        rational_end = *i;
                        end = *i;
                        self.chars.next();
                    }
                }
            }
        }

        let old_end = end;
        let mut exp_start = end + 1;

        if let Some((i, 'e' | 'E')) = self.chars.peek() {
            exp_start = *i + 1;
            self.chars.next();
            // Negative exponent
            // NOTE(review): this loop accepts a run of '-' signs (e.g.
            // `1e--2`), all of which end up in the exponent slice —
            // presumably lenient by design; confirm.
            while matches!(self.chars.peek(), Some((_, '-'))) {
                self.chars.next();
            }
            while let Some((i, ch)) = self.chars.peek() {
                if !ch.is_ascii_digit() && *ch != '_' {
                    break;
                }
                end = *i;
                self.chars.next();
            }

            // `end` never advanced past `exp_start`: `e` had no digits.
            if exp_start > end {
                return Err(LexicalError::MissingExponent(Loc::File(
                    self.file_no,
                    start,
                    self.input.len(),
                )));
            }
        }

        if is_rational {
            let integer = &self.input[start..end_before_rational];
            let fraction = &self.input[rational_start..=rational_end];
            let exp = &self.input[exp_start..=end];

            return Ok((
                start,
                Token::RationalNumber(integer, fraction, exp),
                end + 1,
            ));
        }

        let integer = &self.input[start..=old_end];
        let exp = &self.input[exp_start..=end];

        Ok((start, Token::Number(integer, exp), end + 1))
    }
738
739    fn string(
740        &mut self,
741        unicode: bool,
742        token_start: usize,
743        string_start: usize,
744        quote_char: char,
745    ) -> Result<'input> {
746        let mut end;
747
748        let mut last_was_escape = false;
749
750        loop {
751            if let Some((i, ch)) = self.chars.next() {
752                end = i;
753                if !last_was_escape {
754                    if ch == quote_char {
755                        break;
756                    }
757                    last_was_escape = ch == '\\';
758                } else {
759                    last_was_escape = false;
760                }
761            } else {
762                return Err(LexicalError::EndOfFileInString(Loc::File(
763                    self.file_no,
764                    token_start,
765                    self.input.len(),
766                )));
767            }
768        }
769
770        Ok((
771            token_start,
772            Token::StringLiteral(unicode, &self.input[string_start..end]),
773            end + 1,
774        ))
775    }
776
777    fn next(&mut self) -> Option<Spanned<'input>> {
778        'toplevel: loop {
779            match self.chars.next() {
780                Some((start, ch)) if ch == '_' || ch == '$' || UnicodeXID::is_xid_start(ch) => {
781                    let (id, end) = self.match_identifier(start);
782
783                    if id == "unicode" {
784                        match self.chars.peek() {
785                            Some((_, quote_char @ '"')) | Some((_, quote_char @ '\'')) => {
786                                let quote_char = *quote_char;
787
788                                self.chars.next();
789                                let str_res = self.string(true, start, start + 8, quote_char);
790                                match str_res {
791                                    Err(lex_err) => self.errors.push(lex_err),
792                                    Ok(val) => return Some(val),
793                                }
794                            }
795                            _ => (),
796                        }
797                    }
798
799                    if id == "hex" {
800                        match self.chars.peek() {
801                            Some((_, quote_char @ '"')) | Some((_, quote_char @ '\'')) => {
802                                let quote_char = *quote_char;
803
804                                self.chars.next();
805
806                                for (i, ch) in &mut self.chars {
807                                    if ch == quote_char {
808                                        return Some((
809                                            start,
810                                            Token::HexLiteral(&self.input[start..=i]),
811                                            i + 1,
812                                        ));
813                                    }
814
815                                    if !ch.is_ascii_hexdigit() && ch != '_' {
816                                        // Eat up the remainer of the string
817                                        for (_, ch) in &mut self.chars {
818                                            if ch == quote_char {
819                                                break;
820                                            }
821                                        }
822
823                                        self.errors.push(
824                                            LexicalError::InvalidCharacterInHexLiteral(
825                                                Loc::File(self.file_no, i, i + 1),
826                                                ch,
827                                            ),
828                                        );
829                                        continue 'toplevel;
830                                    }
831                                }
832
833                                self.errors.push(LexicalError::EndOfFileInString(Loc::File(
834                                    self.file_no,
835                                    start,
836                                    self.input.len(),
837                                )));
838                                return None;
839                            }
840                            _ => (),
841                        }
842                    }
843
844                    if id == "address" {
845                        match self.chars.peek() {
846                            Some((_, quote_char @ '"')) | Some((_, quote_char @ '\'')) => {
847                                let quote_char = *quote_char;
848
849                                self.chars.next();
850
851                                for (i, ch) in &mut self.chars {
852                                    if ch == quote_char {
853                                        return Some((
854                                            start,
855                                            Token::AddressLiteral(&self.input[start..=i]),
856                                            i + 1,
857                                        ));
858                                    }
859                                }
860
861                                self.errors.push(LexicalError::EndOfFileInString(Loc::File(
862                                    self.file_no,
863                                    start,
864                                    self.input.len(),
865                                )));
866                                return None;
867                            }
868                            _ => (),
869                        }
870                    }
871
872                    return if let Some(w) = KEYWORDS.get(id) {
873                        Some((start, *w, end))
874                    } else {
875                        Some((start, Token::Identifier(id), end))
876                    };
877                }
878                Some((start, quote_char @ '"')) | Some((start, quote_char @ '\'')) => {
879                    let str_res = self.string(false, start, start + 1, quote_char);
880                    match str_res {
881                        Err(lex_err) => self.errors.push(lex_err),
882                        Ok(val) => return Some(val),
883                    }
884                }
885                Some((start, '/')) => {
886                    match self.chars.peek() {
887                        Some((_, '=')) => {
888                            self.chars.next();
889                            return Some((start, Token::DivideAssign, start + 2));
890                        }
891                        Some((_, '/')) => {
892                            // line comment
893                            self.chars.next();
894
895                            let mut newline = false;
896
897                            let doc_comment = match self.chars.next() {
898                                Some((_, '/')) => {
899                                    // ///(/)+ is still a line comment
900                                    !matches!(self.chars.peek(), Some((_, '/')))
901                                }
902                                Some((_, ch)) if ch == '\n' || ch == '\r' => {
903                                    newline = true;
904                                    false
905                                }
906                                _ => false,
907                            };
908
909                            let mut last = start + 3;
910
911                            if !newline {
912                                loop {
913                                    match self.chars.next() {
914                                        None => {
915                                            last = self.input.len();
916                                            break;
917                                        }
918                                        Some((offset, '\n' | '\r')) => {
919                                            last = offset;
920                                            break;
921                                        }
922                                        Some(_) => (),
923                                    }
924                                }
925                            }
926
927                            if doc_comment {
928                                self.comments.push(Comment::DocLine(
929                                    Loc::File(self.file_no, start, last),
930                                    self.input[start..last].to_owned(),
931                                ));
932                            } else {
933                                self.comments.push(Comment::Line(
934                                    Loc::File(self.file_no, start, last),
935                                    self.input[start..last].to_owned(),
936                                ));
937                            }
938                        }
939                        Some((_, '*')) => {
940                            // multiline comment
941                            self.chars.next();
942
943                            let doc_comment_start = matches!(self.chars.peek(), Some((_, '*')));
944
945                            let mut last = start + 3;
946                            let mut seen_star = false;
947
948                            loop {
949                                if let Some((i, ch)) = self.chars.next() {
950                                    if seen_star && ch == '/' {
951                                        break;
952                                    }
953                                    seen_star = ch == '*';
954                                    last = i;
955                                } else {
956                                    self.errors.push(LexicalError::EndOfFileInComment(Loc::File(
957                                        self.file_no,
958                                        start,
959                                        self.input.len(),
960                                    )));
961                                    return None;
962                                }
963                            }
964
965                            // `/**/` is not a doc comment
966                            if doc_comment_start && last > start + 2 {
967                                self.comments.push(Comment::DocBlock(
968                                    Loc::File(self.file_no, start, last + 2),
969                                    self.input[start..last + 2].to_owned(),
970                                ));
971                            } else {
972                                self.comments.push(Comment::Block(
973                                    Loc::File(self.file_no, start, last + 2),
974                                    self.input[start..last + 2].to_owned(),
975                                ));
976                            }
977                        }
978                        _ => {
979                            return Some((start, Token::Divide, start + 1));
980                        }
981                    }
982                }
983                Some((start, ch)) if ch.is_ascii_digit() => {
984                    let parse_result = self.parse_number(start, ch);
985                    match parse_result {
986                        Err(lex_err) => {
987                            self.errors.push(lex_err.clone());
988                            if matches!(lex_err, LexicalError::EndofFileInHex(_)) {
989                                return None;
990                            }
991                        }
992                        Ok(parse_result) => return Some(parse_result),
993                    }
994                }
995                Some((start, '@')) => {
996                    let (id, end) = self.match_identifier(start);
997                    if id.len() == 1 {
998                        self.errors.push(LexicalError::UnrecognisedToken(
999                            Loc::File(self.file_no, start, start + 1),
1000                            id.to_owned(),
1001                        ));
1002                    } else {
1003                        return Some((start, Token::Annotation(&id[1..]), end));
1004                    };
1005                }
1006                Some((i, ';')) => {
1007                    self.parse_semver = false;
1008                    return Some((i, Token::Semicolon, i + 1));
1009                }
1010                Some((i, ',')) => return Some((i, Token::Comma, i + 1)),
1011                Some((i, '(')) => return Some((i, Token::OpenParenthesis, i + 1)),
1012                Some((i, ')')) => return Some((i, Token::CloseParenthesis, i + 1)),
1013                Some((i, '{')) => return Some((i, Token::OpenCurlyBrace, i + 1)),
1014                Some((i, '}')) => return Some((i, Token::CloseCurlyBrace, i + 1)),
1015                Some((i, '~')) => return Some((i, Token::BitwiseNot, i + 1)),
1016                Some((i, '=')) => {
1017                    return match self.chars.peek() {
1018                        Some((_, '=')) => {
1019                            self.chars.next();
1020                            Some((i, Token::Equal, i + 2))
1021                        }
1022                        Some((_, '>')) => {
1023                            self.chars.next();
1024                            Some((i, Token::Arrow, i + 2))
1025                        }
1026                        _ => Some((i, Token::Assign, i + 1)),
1027                    }
1028                }
1029                Some((i, '!')) => {
1030                    return if let Some((_, '=')) = self.chars.peek() {
1031                        self.chars.next();
1032                        Some((i, Token::NotEqual, i + 2))
1033                    } else {
1034                        Some((i, Token::Not, i + 1))
1035                    }
1036                }
1037                Some((i, '|')) => {
1038                    return match self.chars.peek() {
1039                        Some((_, '=')) => {
1040                            self.chars.next();
1041                            Some((i, Token::BitwiseOrAssign, i + 2))
1042                        }
1043                        Some((_, '|')) => {
1044                            self.chars.next();
1045                            Some((i, Token::Or, i + 2))
1046                        }
1047                        _ => Some((i, Token::BitwiseOr, i + 1)),
1048                    };
1049                }
1050                Some((i, '&')) => {
1051                    return match self.chars.peek() {
1052                        Some((_, '=')) => {
1053                            self.chars.next();
1054                            Some((i, Token::BitwiseAndAssign, i + 2))
1055                        }
1056                        Some((_, '&')) => {
1057                            self.chars.next();
1058                            Some((i, Token::And, i + 2))
1059                        }
1060                        _ => Some((i, Token::BitwiseAnd, i + 1)),
1061                    };
1062                }
1063                Some((i, '^')) => {
1064                    return match self.chars.peek() {
1065                        Some((_, '=')) => {
1066                            self.chars.next();
1067                            Some((i, Token::BitwiseXorAssign, i + 2))
1068                        }
1069                        _ => Some((i, Token::BitwiseXor, i + 1)),
1070                    };
1071                }
1072                Some((i, '+')) => {
1073                    return match self.chars.peek() {
1074                        Some((_, '=')) => {
1075                            self.chars.next();
1076                            Some((i, Token::AddAssign, i + 2))
1077                        }
1078                        Some((_, '+')) => {
1079                            self.chars.next();
1080                            Some((i, Token::Increment, i + 2))
1081                        }
1082                        _ => Some((i, Token::Add, i + 1)),
1083                    };
1084                }
1085                Some((i, '-')) => {
1086                    return match self.chars.peek() {
1087                        Some((_, '=')) => {
1088                            self.chars.next();
1089                            Some((i, Token::SubtractAssign, i + 2))
1090                        }
1091                        Some((_, '-')) => {
1092                            self.chars.next();
1093                            Some((i, Token::Decrement, i + 2))
1094                        }
1095                        Some((_, '>')) => {
1096                            self.chars.next();
1097                            Some((i, Token::YulArrow, i + 2))
1098                        }
1099                        _ => Some((i, Token::Subtract, i + 1)),
1100                    };
1101                }
1102                Some((i, '*')) => {
1103                    return match self.chars.peek() {
1104                        Some((_, '=')) => {
1105                            self.chars.next();
1106                            Some((i, Token::MulAssign, i + 2))
1107                        }
1108                        Some((_, '*')) => {
1109                            self.chars.next();
1110                            Some((i, Token::Power, i + 2))
1111                        }
1112                        _ => Some((i, Token::Mul, i + 1)),
1113                    };
1114                }
1115                Some((i, '%')) => {
1116                    return match self.chars.peek() {
1117                        Some((_, '=')) => {
1118                            self.chars.next();
1119                            Some((i, Token::ModuloAssign, i + 2))
1120                        }
1121                        _ => Some((i, Token::Modulo, i + 1)),
1122                    };
1123                }
1124                Some((i, '<')) => {
1125                    return match self.chars.peek() {
1126                        Some((_, '<')) => {
1127                            self.chars.next();
1128                            if let Some((_, '=')) = self.chars.peek() {
1129                                self.chars.next();
1130                                Some((i, Token::ShiftLeftAssign, i + 3))
1131                            } else {
1132                                Some((i, Token::ShiftLeft, i + 2))
1133                            }
1134                        }
1135                        Some((_, '=')) => {
1136                            self.chars.next();
1137                            Some((i, Token::LessEqual, i + 2))
1138                        }
1139                        _ => Some((i, Token::Less, i + 1)),
1140                    };
1141                }
1142                Some((i, '>')) => {
1143                    return match self.chars.peek() {
1144                        Some((_, '>')) => {
1145                            self.chars.next();
1146                            if let Some((_, '=')) = self.chars.peek() {
1147                                self.chars.next();
1148                                Some((i, Token::ShiftRightAssign, i + 3))
1149                            } else {
1150                                Some((i, Token::ShiftRight, i + 2))
1151                            }
1152                        }
1153                        Some((_, '=')) => {
1154                            self.chars.next();
1155                            Some((i, Token::MoreEqual, i + 2))
1156                        }
1157                        _ => Some((i, Token::More, i + 1)),
1158                    };
1159                }
1160                Some((i, '.')) => {
1161                    if let Some((_, a)) = self.chars.peek() {
1162                        if a.is_ascii_digit() && !self.parse_semver {
1163                            return match self.parse_number(i + 1, '.') {
1164                                Err(lex_error) => {
1165                                    self.errors.push(lex_error);
1166                                    None
1167                                }
1168                                Ok(parse_result) => Some(parse_result),
1169                            };
1170                        }
1171                    }
1172                    return Some((i, Token::Member, i + 1));
1173                }
1174                Some((i, '[')) => return Some((i, Token::OpenBracket, i + 1)),
1175                Some((i, ']')) => return Some((i, Token::CloseBracket, i + 1)),
1176                Some((i, ':')) => {
1177                    return match self.chars.peek() {
1178                        Some((_, '=')) => {
1179                            self.chars.next();
1180                            Some((i, Token::ColonAssign, i + 2))
1181                        }
1182                        _ => Some((i, Token::Colon, i + 1)),
1183                    };
1184                }
1185                Some((i, '?')) => return Some((i, Token::Question, i + 1)),
1186                Some((_, ch)) if ch.is_whitespace() => (),
1187                Some((start, _)) => {
1188                    let mut end;
1189
1190                    loop {
1191                        if let Some((i, ch)) = self.chars.next() {
1192                            end = i;
1193
1194                            if ch.is_whitespace() {
1195                                break;
1196                            }
1197                        } else {
1198                            end = self.input.len();
1199                            break;
1200                        }
1201                    }
1202
1203                    self.errors.push(LexicalError::UnrecognisedToken(
1204                        Loc::File(self.file_no, start, end),
1205                        self.input[start..end].to_owned(),
1206                    ));
1207                }
1208                None => return None, // End of file
1209            }
1210        }
1211    }
1212
1213    fn match_identifier(&mut self, start: usize) -> (&'input str, usize) {
1214        let end;
1215        loop {
1216            if let Some((i, ch)) = self.chars.peek() {
1217                if !UnicodeXID::is_xid_continue(*ch) && *ch != '$' {
1218                    end = *i;
1219                    break;
1220                }
1221                self.chars.next();
1222            } else {
1223                end = self.input.len();
1224                break;
1225            }
1226        }
1227
1228        (&self.input[start..end], end)
1229    }
1230}
1231
1232impl<'input> Iterator for Lexer<'input> {
1233    type Item = Spanned<'input>;
1234
1235    fn next(&mut self) -> Option<Self::Item> {
1236        // Lexer should be aware of whether the last two tokens were
1237        // pragma followed by identifier. If this is true, then special parsing should be
1238        // done for the pragma value
1239        if let [Some(Token::Pragma), Some(Token::Identifier(_))] = self.last_tokens {
1240            self.parse_semver = true;
1241        }
1242
1243        let token = self.next();
1244
1245        self.last_tokens = [
1246            self.last_tokens[1],
1247            match token {
1248                Some((_, n, _)) => Some(n),
1249                _ => None,
1250            },
1251        ];
1252
1253        token
1254    }
1255}
1256
1257#[cfg(test)]
1258mod tests {
1259    use super::*;
1260
1261    #[test]
1262    fn test_lexer() {
1263        let mut comments = Vec::new();
1264        let mut errors = Vec::new();
1265
1266        let multiple_errors = r#" 9ea -9e € bool hex uint8 hex"g"   /**  "#;
1267        let tokens = Lexer::new(multiple_errors, 0, &mut comments, &mut errors).collect::<Vec<_>>();
1268        assert_eq!(
1269            tokens,
1270            vec![
1271                (3, Token::Identifier("a"), 4),
1272                (5, Token::Subtract, 6),
1273                (13, Token::Bool, 17),
1274                (18, Token::Identifier("hex"), 21),
1275                (22, Token::Uint(8), 27),
1276            ]
1277        );
1278
1279        assert_eq!(
1280            errors,
1281            vec![
1282                LexicalError::MissingExponent(Loc::File(0, 1, 42)),
1283                LexicalError::MissingExponent(Loc::File(0, 6, 42)),
1284                LexicalError::UnrecognisedToken(Loc::File(0, 9, 12), '€'.to_string()),
1285                LexicalError::InvalidCharacterInHexLiteral(Loc::File(0, 32, 33), 'g'),
1286                LexicalError::EndOfFileInComment(Loc::File(0, 37, 42)),
1287            ]
1288        );
1289
1290        let mut errors = Vec::new();
1291        let tokens = Lexer::new("bool", 0, &mut comments, &mut errors).collect::<Vec<_>>();
1292
1293        assert_eq!(tokens, vec!((0, Token::Bool, 4)));
1294
1295        let tokens = Lexer::new("uint8", 0, &mut comments, &mut errors).collect::<Vec<_>>();
1296
1297        assert_eq!(tokens, vec!((0, Token::Uint(8), 5)));
1298
1299        let tokens = Lexer::new("hex", 0, &mut comments, &mut errors).collect::<Vec<_>>();
1300
1301        assert_eq!(tokens, vec!((0, Token::Identifier("hex"), 3)));
1302
1303        let tokens = Lexer::new(
1304            "hex\"cafe_dead\" /* adad*** */",
1305            0,
1306            &mut comments,
1307            &mut errors,
1308        )
1309        .collect::<Vec<_>>();
1310
1311        assert_eq!(tokens, vec!((0, Token::HexLiteral("hex\"cafe_dead\""), 14)));
1312
1313        let tokens = Lexer::new(
1314            "// foo bar\n0x00fead0_12 00090 0_0",
1315            0,
1316            &mut comments,
1317            &mut errors,
1318        )
1319        .collect::<Vec<_>>();
1320
1321        assert_eq!(
1322            tokens,
1323            vec!(
1324                (11, Token::HexNumber("0x00fead0_12"), 23),
1325                (24, Token::Number("00090", ""), 29),
1326                (30, Token::Number("0_0", ""), 33)
1327            )
1328        );
1329
1330        let tokens = Lexer::new(
1331            "// foo bar\n0x00fead0_12 9.0008 0_0",
1332            0,
1333            &mut comments,
1334            &mut errors,
1335        )
1336        .collect::<Vec<_>>();
1337
1338        assert_eq!(
1339            tokens,
1340            vec!(
1341                (11, Token::HexNumber("0x00fead0_12"), 23),
1342                (24, Token::RationalNumber("9", "0008", ""), 30),
1343                (31, Token::Number("0_0", ""), 34)
1344            )
1345        );
1346
1347        let tokens = Lexer::new(
1348            "// foo bar\n0x00fead0_12 .0008 0.9e2",
1349            0,
1350            &mut comments,
1351            &mut errors,
1352        )
1353        .collect::<Vec<_>>();
1354
1355        assert_eq!(
1356            tokens,
1357            vec!(
1358                (11, Token::HexNumber("0x00fead0_12"), 23),
1359                (24, Token::RationalNumber("", "0008", ""), 29),
1360                (30, Token::RationalNumber("0", "9", "2"), 35)
1361            )
1362        );
1363
1364        let tokens = Lexer::new(
1365            "// foo bar\n0x00fead0_12 .0008 0.9e-2-2",
1366            0,
1367            &mut comments,
1368            &mut errors,
1369        )
1370        .collect::<Vec<_>>();
1371
1372        assert_eq!(
1373            tokens,
1374            vec!(
1375                (11, Token::HexNumber("0x00fead0_12"), 23),
1376                (24, Token::RationalNumber("", "0008", ""), 29),
1377                (30, Token::RationalNumber("0", "9", "-2"), 36),
1378                (36, Token::Subtract, 37),
1379                (37, Token::Number("2", ""), 38)
1380            )
1381        );
1382
1383        let tokens = Lexer::new("1.2_3e2-", 0, &mut comments, &mut errors).collect::<Vec<_>>();
1384
1385        assert_eq!(
1386            tokens,
1387            vec!(
1388                (0, Token::RationalNumber("1", "2_3", "2"), 7),
1389                (7, Token::Subtract, 8)
1390            )
1391        );
1392
1393        let tokens = Lexer::new("\"foo\"", 0, &mut comments, &mut errors).collect::<Vec<_>>();
1394
1395        assert_eq!(tokens, vec!((0, Token::StringLiteral(false, "foo"), 5)));
1396
1397        let tokens = Lexer::new(
1398            "pragma solidity >=0.5.0 <0.7.0;",
1399            0,
1400            &mut comments,
1401            &mut errors,
1402        )
1403        .collect::<Vec<_>>();
1404
1405        assert_eq!(
1406            tokens,
1407            vec!(
1408                (0, Token::Pragma, 6),
1409                (7, Token::Identifier("solidity"), 15),
1410                (16, Token::MoreEqual, 18),
1411                (18, Token::Number("0", ""), 19),
1412                (19, Token::Member, 20),
1413                (20, Token::Number("5", ""), 21),
1414                (21, Token::Member, 22),
1415                (22, Token::Number("0", ""), 23),
1416                (24, Token::Less, 25),
1417                (25, Token::Number("0", ""), 26),
1418                (26, Token::Member, 27),
1419                (27, Token::Number("7", ""), 28),
1420                (28, Token::Member, 29),
1421                (29, Token::Number("0", ""), 30),
1422                (30, Token::Semicolon, 31),
1423            )
1424        );
1425
1426        let tokens = Lexer::new(
1427            "pragma solidity \t>=0.5.0 <0.7.0 \n ;",
1428            0,
1429            &mut comments,
1430            &mut errors,
1431        )
1432        .collect::<Vec<_>>();
1433
1434        assert_eq!(
1435            tokens,
1436            vec!(
1437                (0, Token::Pragma, 6),
1438                (7, Token::Identifier("solidity"), 15),
1439                (17, Token::MoreEqual, 19),
1440                (19, Token::Number("0", ""), 20),
1441                (20, Token::Member, 21),
1442                (21, Token::Number("5", ""), 22),
1443                (22, Token::Member, 23),
1444                (23, Token::Number("0", ""), 24),
1445                (25, Token::Less, 26),
1446                (26, Token::Number("0", ""), 27),
1447                (27, Token::Member, 28),
1448                (28, Token::Number("7", ""), 29),
1449                (29, Token::Member, 30),
1450                (30, Token::Number("0", ""), 31),
1451                (34, Token::Semicolon, 35),
1452            )
1453        );
1454
1455        let tokens =
1456            Lexer::new("pragma solidity 赤;", 0, &mut comments, &mut errors).collect::<Vec<_>>();
1457
1458        assert_eq!(
1459            tokens,
1460            vec!(
1461                (0, Token::Pragma, 6),
1462                (7, Token::Identifier("solidity"), 15),
1463                (16, Token::Identifier("赤"), 19),
1464                (19, Token::Semicolon, 20)
1465            )
1466        );
1467
1468        let tokens = Lexer::new(">>= >> >= >", 0, &mut comments, &mut errors).collect::<Vec<_>>();
1469
1470        assert_eq!(
1471            tokens,
1472            vec!(
1473                (0, Token::ShiftRightAssign, 3),
1474                (4, Token::ShiftRight, 6),
1475                (7, Token::MoreEqual, 9),
1476                (10, Token::More, 11),
1477            )
1478        );
1479
1480        let tokens = Lexer::new("<<= << <= <", 0, &mut comments, &mut errors).collect::<Vec<_>>();
1481
1482        assert_eq!(
1483            tokens,
1484            vec!(
1485                (0, Token::ShiftLeftAssign, 3),
1486                (4, Token::ShiftLeft, 6),
1487                (7, Token::LessEqual, 9),
1488                (10, Token::Less, 11),
1489            )
1490        );
1491
1492        let tokens = Lexer::new("-16 -- - -=", 0, &mut comments, &mut errors).collect::<Vec<_>>();
1493
1494        assert_eq!(
1495            tokens,
1496            vec!(
1497                (0, Token::Subtract, 1),
1498                (1, Token::Number("16", ""), 3),
1499                (4, Token::Decrement, 6),
1500                (7, Token::Subtract, 8),
1501                (9, Token::SubtractAssign, 11),
1502            )
1503        );
1504
1505        let tokens = Lexer::new("-4 ", 0, &mut comments, &mut errors).collect::<Vec<_>>();
1506
1507        assert_eq!(
1508            tokens,
1509            vec!((0, Token::Subtract, 1), (1, Token::Number("4", ""), 2),)
1510        );
1511
1512        let mut errors = Vec::new();
1513        let _ = Lexer::new(r#"hex"abcdefg""#, 0, &mut comments, &mut errors).collect::<Vec<_>>();
1514
1515        assert_eq!(
1516            errors,
1517            vec![LexicalError::InvalidCharacterInHexLiteral(
1518                Loc::File(0, 10, 11),
1519                'g'
1520            )]
1521        );
1522
1523        let mut errors = Vec::new();
1524        let _ = Lexer::new(r#" € "#, 0, &mut comments, &mut errors).collect::<Vec<_>>();
1525
1526        assert_eq!(
1527            errors,
1528            vec!(LexicalError::UnrecognisedToken(
1529                Loc::File(0, 1, 4),
1530                "€".to_owned()
1531            ))
1532        );
1533
1534        let mut errors = Vec::new();
1535        let _ = Lexer::new(r#"€"#, 0, &mut comments, &mut errors).collect::<Vec<_>>();
1536
1537        assert_eq!(
1538            errors,
1539            vec!(LexicalError::UnrecognisedToken(
1540                Loc::File(0, 0, 3),
1541                "€".to_owned()
1542            ))
1543        );
1544
1545        let tokens =
1546            Lexer::new(r#"pragma foo bar"#, 0, &mut comments, &mut errors).collect::<Vec<_>>();
1547
1548        assert_eq!(
1549            tokens,
1550            vec!(
1551                (0, Token::Pragma, 6),
1552                (7, Token::Identifier("foo"), 10),
1553                (11, Token::Identifier("bar"), 14),
1554            )
1555        );
1556
1557        comments.truncate(0);
1558
1559        let tokens = Lexer::new(r#"/// foo"#, 0, &mut comments, &mut errors).count();
1560
1561        assert_eq!(tokens, 0);
1562        assert_eq!(
1563            comments,
1564            vec![Comment::DocLine(Loc::File(0, 0, 7), "/// foo".to_owned())],
1565        );
1566
1567        comments.truncate(0);
1568
1569        let tokens = Lexer::new("/// jadajadadjada\n// bar", 0, &mut comments, &mut errors).count();
1570
1571        assert_eq!(tokens, 0);
1572        assert_eq!(
1573            comments,
1574            vec!(
1575                Comment::DocLine(Loc::File(0, 0, 17), "/// jadajadadjada".to_owned()),
1576                Comment::Line(Loc::File(0, 18, 24), "// bar".to_owned())
1577            )
1578        );
1579
1580        comments.truncate(0);
1581
1582        let tokens = Lexer::new("/**/", 0, &mut comments, &mut errors).count();
1583
1584        assert_eq!(tokens, 0);
1585        assert_eq!(
1586            comments,
1587            vec!(Comment::Block(Loc::File(0, 0, 4), "/**/".to_owned()))
1588        );
1589
1590        comments.truncate(0);
1591
1592        let tokens = Lexer::new(r#"/** foo */"#, 0, &mut comments, &mut errors).count();
1593
1594        assert_eq!(tokens, 0);
1595        assert_eq!(
1596            comments,
1597            vec!(Comment::DocBlock(
1598                Loc::File(0, 0, 10),
1599                "/** foo */".to_owned()
1600            ))
1601        );
1602
1603        comments.truncate(0);
1604
1605        let tokens = Lexer::new(
1606            "/** jadajadadjada */\n/* bar */",
1607            0,
1608            &mut comments,
1609            &mut errors,
1610        )
1611        .count();
1612
1613        assert_eq!(tokens, 0);
1614        assert_eq!(
1615            comments,
1616            vec!(
1617                Comment::DocBlock(Loc::File(0, 0, 20), "/** jadajadadjada */".to_owned()),
1618                Comment::Block(Loc::File(0, 21, 30), "/* bar */".to_owned())
1619            )
1620        );
1621
1622        let tokens = Lexer::new("/************/", 0, &mut comments, &mut errors).next();
1623        assert_eq!(tokens, None);
1624
1625        let mut errors = Vec::new();
1626        let _ = Lexer::new("/**", 0, &mut comments, &mut errors).next();
1627        assert_eq!(
1628            errors,
1629            vec!(LexicalError::EndOfFileInComment(Loc::File(0, 0, 3)))
1630        );
1631
1632        let mut errors = Vec::new();
1633        let tokens = Lexer::new("//////////////", 0, &mut comments, &mut errors).next();
1634        assert_eq!(tokens, None);
1635
1636        // some unicode tests
1637        let tokens = Lexer::new(
1638            ">=\u{a0} . très\u{2028}αβγδεζηθικλμνξοπρστυφχψω\u{85}カラス",
1639            0,
1640            &mut comments,
1641            &mut errors,
1642        )
1643        .collect::<Vec<_>>();
1644
1645        assert_eq!(
1646            tokens,
1647            vec!(
1648                (0, Token::MoreEqual, 2),
1649                (5, Token::Member, 6),
1650                (7, Token::Identifier("très"), 12),
1651                (15, Token::Identifier("αβγδεζηθικλμνξοπρστυφχψω"), 63),
1652                (65, Token::Identifier("カラス"), 74)
1653            )
1654        );
1655
1656        let tokens = Lexer::new(r#"unicode"€""#, 0, &mut comments, &mut errors).collect::<Vec<_>>();
1657
1658        assert_eq!(tokens, vec!((0, Token::StringLiteral(true, "€"), 12)));
1659
1660        let tokens =
1661            Lexer::new(r#"unicode "€""#, 0, &mut comments, &mut errors).collect::<Vec<_>>();
1662
1663        assert_eq!(
1664            tokens,
1665            vec!(
1666                (0, Token::Identifier("unicode"), 7),
1667                (8, Token::StringLiteral(false, "€"), 13),
1668            )
1669        );
1670
1671        // scientific notation
1672        let tokens = Lexer::new(r#" 1e0 "#, 0, &mut comments, &mut errors).collect::<Vec<_>>();
1673
1674        assert_eq!(tokens, vec!((1, Token::Number("1", "0"), 4)));
1675
1676        let tokens = Lexer::new(r#" -9e0123"#, 0, &mut comments, &mut errors).collect::<Vec<_>>();
1677
1678        assert_eq!(
1679            tokens,
1680            vec!((1, Token::Subtract, 2), (2, Token::Number("9", "0123"), 8),)
1681        );
1682
1683        let mut errors = Vec::new();
1684        let tokens = Lexer::new(r#" -9e"#, 0, &mut comments, &mut errors).collect::<Vec<_>>();
1685
1686        assert_eq!(tokens, vec!((1, Token::Subtract, 2)));
1687        assert_eq!(
1688            errors,
1689            vec!(LexicalError::MissingExponent(Loc::File(0, 2, 4)))
1690        );
1691
1692        let mut errors = Vec::new();
1693        let tokens = Lexer::new(r#"9ea"#, 0, &mut comments, &mut errors).collect::<Vec<_>>();
1694
1695        assert_eq!(tokens, vec!((2, Token::Identifier("a"), 3)));
1696        assert_eq!(
1697            errors,
1698            vec!(LexicalError::MissingExponent(Loc::File(0, 0, 3)))
1699        );
1700
1701        let mut errors = Vec::new();
1702        let tokens = Lexer::new(r#"42.a"#, 0, &mut comments, &mut errors).collect::<Vec<_>>();
1703
1704        assert_eq!(
1705            tokens,
1706            vec!(
1707                (0, Token::Number("42", ""), 2),
1708                (2, Token::Member, 3),
1709                (3, Token::Identifier("a"), 4)
1710            )
1711        );
1712
1713        let tokens = Lexer::new(r#"42..a"#, 0, &mut comments, &mut errors).collect::<Vec<_>>();
1714
1715        assert_eq!(
1716            tokens,
1717            vec!(
1718                (0, Token::Number("42", ""), 2),
1719                (2, Token::Member, 3),
1720                (3, Token::Member, 4),
1721                (4, Token::Identifier("a"), 5)
1722            )
1723        );
1724
1725        comments.truncate(0);
1726
1727        let tokens = Lexer::new("/// jadajadadjada\n// bar", 0, &mut comments, &mut errors).count();
1728
1729        assert_eq!(tokens, 0);
1730        assert_eq!(
1731            comments,
1732            vec!(
1733                Comment::DocLine(Loc::File(0, 0, 17), "/// jadajadadjada".to_owned()),
1734                Comment::Line(Loc::File(0, 18, 24), "// bar".to_owned())
1735            )
1736        );
1737
1738        comments.truncate(0);
1739
1740        let tokens = Lexer::new("/**/", 0, &mut comments, &mut errors).count();
1741
1742        assert_eq!(tokens, 0);
1743        assert_eq!(
1744            comments,
1745            vec!(Comment::Block(Loc::File(0, 0, 4), "/**/".to_owned()))
1746        );
1747
1748        comments.truncate(0);
1749
1750        let tokens = Lexer::new(r#"/** foo */"#, 0, &mut comments, &mut errors).count();
1751
1752        assert_eq!(tokens, 0);
1753        assert_eq!(
1754            comments,
1755            vec!(Comment::DocBlock(
1756                Loc::File(0, 0, 10),
1757                "/** foo */".to_owned()
1758            ))
1759        );
1760
1761        comments.truncate(0);
1762
1763        let tokens = Lexer::new(
1764            "/** jadajadadjada */\n/* bar */",
1765            0,
1766            &mut comments,
1767            &mut errors,
1768        )
1769        .count();
1770
1771        assert_eq!(tokens, 0);
1772        assert_eq!(
1773            comments,
1774            vec!(
1775                Comment::DocBlock(Loc::File(0, 0, 20), "/** jadajadadjada */".to_owned()),
1776                Comment::Block(Loc::File(0, 21, 30), "/* bar */".to_owned())
1777            )
1778        );
1779
1780        let tokens = Lexer::new("/************/", 0, &mut comments, &mut errors).next();
1781        assert_eq!(tokens, None);
1782
1783        let mut errors = Vec::new();
1784        let _ = Lexer::new("/**", 0, &mut comments, &mut errors).next();
1785        assert_eq!(
1786            errors,
1787            vec!(LexicalError::EndOfFileInComment(Loc::File(0, 0, 3)))
1788        );
1789
1790        let mut errors = Vec::new();
1791        let tokens = Lexer::new("//////////////", 0, &mut comments, &mut errors).next();
1792        assert_eq!(tokens, None);
1793
1794        // some unicode tests
1795        let tokens = Lexer::new(
1796            ">=\u{a0} . très\u{2028}αβγδεζηθικλμνξοπρστυφχψω\u{85}カラス",
1797            0,
1798            &mut comments,
1799            &mut errors,
1800        )
1801        .collect::<Vec<(usize, Token, usize)>>();
1802
1803        assert_eq!(
1804            tokens,
1805            vec!(
1806                (0, Token::MoreEqual, 2),
1807                (5, Token::Member, 6),
1808                (7, Token::Identifier("très"), 12),
1809                (15, Token::Identifier("αβγδεζηθικλμνξοπρστυφχψω"), 63),
1810                (65, Token::Identifier("カラス"), 74)
1811            )
1812        );
1813
1814        let tokens =
1815            Lexer::new(r#"unicode"€""#, 0, &mut comments, &mut errors)
1816                .collect::<Vec<(usize, Token, usize)>>();
1817
1818        assert_eq!(tokens, vec!((0, Token::StringLiteral(true, "€"), 12)));
1819
1820        let tokens =
1821            Lexer::new(r#"unicode "€""#, 0, &mut comments, &mut errors)
1822                .collect::<Vec<(usize, Token, usize)>>();
1823
1824        assert_eq!(
1825            tokens,
1826            vec!(
1827                (0, Token::Identifier("unicode"), 7),
1828                (8, Token::StringLiteral(false, "€"), 13),
1829            )
1830        );
1831
1832        // scientific notation
1833        let tokens =
1834            Lexer::new(r#" 1e0 "#, 0, &mut comments, &mut errors)
1835                .collect::<Vec<(usize, Token, usize)>>();
1836
1837        assert_eq!(tokens, vec!((1, Token::Number("1", "0"), 4)));
1838
1839        let tokens =
1840            Lexer::new(r#" -9e0123"#, 0, &mut comments, &mut errors)
1841                .collect::<Vec<(usize, Token, usize)>>();
1842
1843        assert_eq!(
1844            tokens,
1845            vec!((1, Token::Subtract, 2), (2, Token::Number("9", "0123"), 8),)
1846        );
1847
1848        let mut errors = Vec::new();
1849        let tokens = Lexer::new(r#" -9e"#, 0, &mut comments, &mut errors)
1850            .collect::<Vec<(usize, Token, usize)>>();
1851
1852        assert_eq!(tokens, vec!((1, Token::Subtract, 2)));
1853        assert_eq!(
1854            errors,
1855            vec!(LexicalError::MissingExponent(Loc::File(0, 2, 4)))
1856        );
1857
1858        let mut errors = Vec::new();
1859        let tokens = Lexer::new(r#"9ea"#, 0, &mut comments, &mut errors)
1860            .collect::<Vec<(usize, Token, usize)>>();
1861
1862        assert_eq!(tokens, vec!((2, Token::Identifier("a"), 3)));
1863        assert_eq!(
1864            errors,
1865            vec!(LexicalError::MissingExponent(Loc::File(0, 0, 3)))
1866        );
1867
1868        let mut errors = Vec::new();
1869        let tokens = Lexer::new(r#"42.a"#, 0, &mut comments, &mut errors)
1870            .collect::<Vec<(usize, Token, usize)>>();
1871
1872        assert_eq!(
1873            tokens,
1874            vec!(
1875                (0, Token::Number("42", ""), 2),
1876                (2, Token::Member, 3),
1877                (3, Token::Identifier("a"), 4)
1878            )
1879        );
1880
1881        let tokens =
1882            Lexer::new(r#"42..a"#, 0, &mut comments, &mut errors)
1883                .collect::<Vec<(usize, Token, usize)>>();
1884
1885        assert_eq!(
1886            tokens,
1887            vec!(
1888                (0, Token::Number("42", ""), 2),
1889                (2, Token::Member, 3),
1890                (3, Token::Member, 4),
1891                (4, Token::Identifier("a"), 5)
1892            )
1893        );
1894
1895        let mut errors = Vec::new();
1896        let _ = Lexer::new(r#"hex"g""#, 0, &mut comments, &mut errors)
1897            .collect::<Vec<(usize, Token, usize)>>();
1898        assert_eq!(
1899            errors,
1900            vec!(LexicalError::InvalidCharacterInHexLiteral(
1901                Loc::File(0, 4, 5),
1902                'g'
1903            ),)
1904        );
1905
1906        let mut errors = Vec::new();
1907        let tokens =
1908            Lexer::new(".9", 0, &mut comments, &mut errors).collect::<Vec<(usize, Token, usize)>>();
1909
1910        assert_eq!(tokens, vec!((0, Token::RationalNumber("", "9", ""), 2)));
1911
1912        let mut errors = Vec::new();
1913        let tokens = Lexer::new(".9e10", 0, &mut comments, &mut errors)
1914            .collect::<Vec<(usize, Token, usize)>>();
1915
1916        assert_eq!(tokens, vec!((0, Token::RationalNumber("", "9", "10"), 5)));
1917
1918        let mut errors = Vec::new();
1919        let tokens = Lexer::new(".9", 0, &mut comments, &mut errors).collect::<Vec<_>>();
1920
1921        assert_eq!(tokens, vec!((0, Token::RationalNumber("", "9", ""), 2)));
1922
1923        let mut errors = Vec::new();
1924        let tokens = Lexer::new(".9e10", 0, &mut comments, &mut errors).collect::<Vec<_>>();
1925
1926        assert_eq!(tokens, vec!((0, Token::RationalNumber("", "9", "10"), 5)));
1927
1928        errors.clear();
1929        comments.clear();
1930        let tokens =
1931            Lexer::new("@my_annotation", 0, &mut comments, &mut errors).collect::<Vec<_>>();
1932        assert_eq!(tokens, vec![(0, Token::Annotation("my_annotation"), 14)]);
1933        assert!(errors.is_empty());
1934        assert!(comments.is_empty());
1935
1936        errors.clear();
1937        comments.clear();
1938        let tokens =
1939            Lexer::new("@ my_annotation", 0, &mut comments, &mut errors).collect::<Vec<_>>();
1940        assert_eq!(tokens, vec![(2, Token::Identifier("my_annotation"), 15)]);
1941        assert_eq!(
1942            errors,
1943            vec![LexicalError::UnrecognisedToken(
1944                Loc::File(0, 0, 1),
1945                "@".to_string()
1946            )]
1947        );
1948        assert!(comments.is_empty());
1949    }
1950}