cairo_lang_parser/
lexer.rs

1#[cfg(test)]
2#[path = "lexer_test.rs"]
3mod test;
4
5use cairo_lang_filesystem::span::{TextOffset, TextSpan, TextWidth};
6use cairo_lang_syntax::node::Token;
7use cairo_lang_syntax::node::ast::{
8    TokenNewline, TokenSingleLineComment, TokenSingleLineDocComment, TokenSingleLineInnerComment,
9    TokenWhitespace, TriviumGreen,
10};
11use cairo_lang_syntax::node::db::SyntaxGroup;
12use cairo_lang_syntax::node::kind::SyntaxKind;
13use cairo_lang_utils::require;
14use smol_str::SmolStr;
15
16pub struct Lexer<'a> {
17    db: &'a dyn SyntaxGroup,
18    text: &'a str,
19    previous_position: TextOffset,
20    current_position: TextOffset,
21    done: bool,
22}
23
24impl<'a> Lexer<'a> {
25    // Ctors.
26    pub fn from_text(db: &'a dyn SyntaxGroup, text: &'a str) -> Lexer<'a> {
27        Lexer {
28            db,
29            text,
30            previous_position: TextOffset::START,
31            current_position: TextOffset::START,
32            done: false,
33        }
34    }
35
36    pub fn position(&self) -> TextOffset {
37        self.current_position
38    }
39
40    // Helpers.
41    fn peek(&self) -> Option<char> {
42        self.current_position.take_from(self.text).chars().next()
43    }
44
45    fn peek_nth(&self, n: usize) -> Option<char> {
46        self.current_position.take_from(self.text).chars().nth(n)
47    }
48
49    fn take(&mut self) -> Option<char> {
50        let res = self.peek()?;
51        self.current_position = self.current_position.add_width(TextWidth::from_char(res));
52        Some(res)
53    }
54
55    /// Takes a character while the given function returns true.
56    fn take_while<F>(&mut self, f: F)
57    where
58        F: Fn(char) -> bool,
59    {
60        while self.peek().map(&f).unwrap_or(false) {
61            self.take();
62        }
63    }
64
65    fn peek_span_text(&self) -> &'a str {
66        let span = TextSpan { start: self.previous_position, end: self.current_position };
67        span.take(self.text)
68    }
69
70    fn consume_span(&mut self) -> &str {
71        let val = self.peek_span_text();
72        self.previous_position = self.current_position;
73        val
74    }
75
76    // Trivia matchers.
77    fn match_trivia(&mut self, leading: bool) -> Vec<TriviumGreen> {
78        let mut res: Vec<TriviumGreen> = Vec::new();
79        while let Some(current) = self.peek() {
80            let trivium = match current {
81                ' ' | '\r' | '\t' => self.match_trivium_whitespace(),
82                '\n' => self.match_trivium_newline(),
83                '/' if self.peek_nth(1) == Some('/') => self.match_trivium_single_line_comment(),
84                _ => break,
85            };
86            res.push(trivium);
87            if current == '\n' && !leading {
88                break;
89            }
90        }
91        res
92    }
93
94    /// Assumes the next character is one of [' ', '\r', '\t'].
95    fn match_trivium_whitespace(&mut self) -> TriviumGreen {
96        self.take_while(|s| matches!(s, ' ' | '\r' | '\t'));
97        TokenWhitespace::new_green(self.db, SmolStr::from(self.consume_span())).into()
98    }
99
100    /// Assumes the next character '\n'.
101    fn match_trivium_newline(&mut self) -> TriviumGreen {
102        self.take();
103        TokenNewline::new_green(self.db, SmolStr::from(self.consume_span())).into()
104    }
105
106    /// Assumes the next 2 characters are "//".
107    fn match_trivium_single_line_comment(&mut self) -> TriviumGreen {
108        match self.peek_nth(2) {
109            Some('/') => {
110                self.take_while(|c| c != '\n');
111                TokenSingleLineDocComment::new_green(self.db, SmolStr::from(self.consume_span()))
112                    .into()
113            }
114            Some('!') => {
115                self.take_while(|c| c != '\n');
116                TokenSingleLineInnerComment::new_green(self.db, SmolStr::from(self.consume_span()))
117                    .into()
118            }
119            _ => {
120                self.take_while(|c| c != '\n');
121                TokenSingleLineComment::new_green(self.db, SmolStr::from(self.consume_span()))
122                    .into()
123            }
124        }
125    }
126
127    // Token matchers.
128    // =================================================================================
129
130    /// Takes a number. May be decimal, hex, oct or bin.
131    fn take_token_literal_number(&mut self) -> TokenKind {
132        let special = if self.peek() == Some('0') {
133            self.take();
134            match self.peek() {
135                Some('x' | 'o' | 'b') => {
136                    match self.take() {
137                        Some('x') => self.take_while(|c| c.is_ascii_hexdigit()),
138                        Some('o') => self.take_while(|c| matches!(c, '0'..='7')),
139                        Some('b') => self.take_while(|c| matches!(c, '0'..='1')),
140                        _ => unreachable!(),
141                    }
142                    true
143                }
144                _ => false,
145            }
146        } else {
147            false
148        };
149        // Not a special case - so just reading the rest of the digits.
150        if !special {
151            self.take_while(|c| c.is_ascii_digit());
152        }
153
154        // Parse _type suffix.
155        if self.peek() == Some('_') {
156            self.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
157        }
158        TokenKind::LiteralNumber
159    }
160
161    /// Takes a short string.
162    fn take_token_short_string(&mut self) -> TokenKind {
163        self.take_token_string_helper('\'');
164
165        // Parse _type suffix.
166        if self.peek() == Some('_') {
167            self.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
168        }
169        TokenKind::ShortString
170    }
171
172    /// Takes a string.
173    fn take_token_string(&mut self) -> TokenKind {
174        self.take_token_string_helper('"');
175        TokenKind::String
176    }
177
178    fn take_token_string_helper(&mut self, delimiter: char) {
179        self.take();
180        let mut escaped = false;
181        while let Some(token) = self.peek() {
182            self.take();
183            match token {
184                _ if escaped => escaped = false,
185                '\\' => escaped = true,
186                _ if token == delimiter => {
187                    break;
188                }
189                _ => {}
190            };
191        }
192    }
193
194    /// Assumes the next character is [a-zA-Z_].
195    fn take_token_identifier(&mut self) -> TokenKind {
196        // TODO(spapini): Support or explicitly report general unicode characters.
197        self.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
198
199        match self.peek_span_text() {
200            "as" => TokenKind::As,
201            "const" => TokenKind::Const,
202            "false" => TokenKind::False,
203            "true" => TokenKind::True,
204            "extern" => TokenKind::Extern,
205            "type" => TokenKind::Type,
206            "fn" => TokenKind::Function,
207            "trait" => TokenKind::Trait,
208            "impl" => TokenKind::Impl,
209            "of" => TokenKind::Of,
210            "mod" => TokenKind::Module,
211            "struct" => TokenKind::Struct,
212            "enum" => TokenKind::Enum,
213            "let" => TokenKind::Let,
214            "return" => TokenKind::Return,
215            "match" => TokenKind::Match,
216            "macro" => TokenKind::Macro,
217            "if" => TokenKind::If,
218            "loop" => TokenKind::Loop,
219            "continue" => TokenKind::Continue,
220            "break" => TokenKind::Break,
221            "else" => TokenKind::Else,
222            "while" => TokenKind::While,
223            "use" => TokenKind::Use,
224            "implicits" => TokenKind::Implicits,
225            "ref" => TokenKind::Ref,
226            "mut" => TokenKind::Mut,
227            "for" => TokenKind::For,
228            "nopanic" => TokenKind::NoPanic,
229            "pub" => TokenKind::Pub,
230            "_" => TokenKind::Underscore,
231            _ => TokenKind::Identifier,
232        }
233    }
234
235    /// Takes a single character and returns the given kind.
236    fn take_token_of_kind(&mut self, kind: TokenKind) -> TokenKind {
237        self.take();
238        kind
239    }
240
241    /// If the next character is `second_char`, returns `long_kind`, otherwise returns `short_kind`.
242    fn pick_kind(
243        &mut self,
244        second_char: char,
245        long_kind: TokenKind,
246        short_kind: TokenKind,
247    ) -> TokenKind {
248        self.take();
249        if self.peek() == Some(second_char) {
250            self.take();
251            long_kind
252        } else {
253            short_kind
254        }
255    }
256
257    fn match_terminal(&mut self) -> LexerTerminal {
258        let leading_trivia = self.match_trivia(true);
259
260        let kind = if let Some(current) = self.peek() {
261            match current {
262                '0'..='9' => self.take_token_literal_number(),
263                '\'' => self.take_token_short_string(),
264                '"' => self.take_token_string(),
265                ',' => self.take_token_of_kind(TokenKind::Comma),
266                ';' => self.take_token_of_kind(TokenKind::Semicolon),
267                '?' => self.take_token_of_kind(TokenKind::QuestionMark),
268                '{' => self.take_token_of_kind(TokenKind::LBrace),
269                '}' => self.take_token_of_kind(TokenKind::RBrace),
270                '[' => self.take_token_of_kind(TokenKind::LBrack),
271                ']' => self.take_token_of_kind(TokenKind::RBrack),
272                '(' => self.take_token_of_kind(TokenKind::LParen),
273                ')' => self.take_token_of_kind(TokenKind::RParen),
274                '.' => {
275                    self.take();
276                    match self.peek() {
277                        Some('.') => self.pick_kind('=', TokenKind::DotDotEq, TokenKind::DotDot),
278                        _ => TokenKind::Dot,
279                    }
280                }
281                '*' => self.pick_kind('=', TokenKind::MulEq, TokenKind::Mul),
282                '/' => self.pick_kind('=', TokenKind::DivEq, TokenKind::Div),
283                '%' => self.pick_kind('=', TokenKind::ModEq, TokenKind::Mod),
284                '+' => self.pick_kind('=', TokenKind::PlusEq, TokenKind::Plus),
285                '#' => self.take_token_of_kind(TokenKind::Hash),
286                '$' => self.take_token_of_kind(TokenKind::Dollar),
287                '-' => {
288                    self.take();
289                    match self.peek() {
290                        Some('>') => self.take_token_of_kind(TokenKind::Arrow),
291                        Some('=') => self.take_token_of_kind(TokenKind::MinusEq),
292                        _ => TokenKind::Minus,
293                    }
294                }
295                '<' => self.pick_kind('=', TokenKind::LE, TokenKind::LT),
296                '>' => self.pick_kind('=', TokenKind::GE, TokenKind::GT),
297                'a'..='z' | 'A'..='Z' | '_' => self.take_token_identifier(),
298                ':' => self.pick_kind(':', TokenKind::ColonColon, TokenKind::Colon),
299                '!' => self.pick_kind('=', TokenKind::Neq, TokenKind::Not),
300                '~' => self.take_token_of_kind(TokenKind::BitNot),
301                '=' => {
302                    self.take();
303                    match self.peek() {
304                        Some('=') => self.take_token_of_kind(TokenKind::EqEq),
305                        Some('>') => self.take_token_of_kind(TokenKind::MatchArrow),
306                        _ => TokenKind::Eq,
307                    }
308                }
309                '&' => self.pick_kind('&', TokenKind::AndAnd, TokenKind::And),
310                '|' => self.pick_kind('|', TokenKind::OrOr, TokenKind::Or),
311                '^' => self.take_token_of_kind(TokenKind::Xor),
312                '@' => self.take_token_of_kind(TokenKind::At),
313                _ => self.take_token_of_kind(TokenKind::BadCharacters),
314            }
315        } else {
316            TokenKind::EndOfFile
317        };
318
319        let text = SmolStr::from(self.consume_span());
320        let trailing_trivia = self.match_trivia(false);
321        let terminal_kind = token_kind_to_terminal_syntax_kind(kind);
322
323        // TODO(yuval): log(verbose) "consumed text: ..."
324        LexerTerminal { text, kind: terminal_kind, leading_trivia, trailing_trivia }
325    }
326}
327
328/// Output terminal emitted by the lexer.
329#[derive(Clone, PartialEq, Eq, Debug)]
330pub struct LexerTerminal {
331    pub text: SmolStr,
332    /// The kind of the inner token of this terminal.
333    pub kind: SyntaxKind,
334    pub leading_trivia: Vec<TriviumGreen>,
335    pub trailing_trivia: Vec<TriviumGreen>,
336}
337impl LexerTerminal {
338    pub fn width(&self, db: &dyn SyntaxGroup) -> TextWidth {
339        self.leading_trivia.iter().map(|t| t.0.width(db)).sum::<TextWidth>()
340            + TextWidth::from_str(&self.text)
341            + self.trailing_trivia.iter().map(|t| t.0.width(db)).sum::<TextWidth>()
342    }
343}
344
345impl Iterator for Lexer<'_> {
346    type Item = LexerTerminal;
347
348    /// Returns the next token. Once there are no more tokens left, returns token EOF.
349    /// One should not call this after EOF was returned. If one does, None is returned.
350    fn next(&mut self) -> Option<Self::Item> {
351        require(!self.done)?;
352        let lexer_terminal = self.match_terminal();
353        if lexer_terminal.kind == SyntaxKind::TerminalEndOfFile {
354            self.done = true;
355        };
356        Some(lexer_terminal)
357    }
358}
359
/// The kind of token recognized by the lexer, before it is translated into a terminal
/// `SyntaxKind`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
enum TokenKind {
    /// A word made of ASCII alphanumerics and underscores that is not a keyword.
    Identifier,

    // Literals.
    /// A decimal, `0x`, `0o` or `0b` number, with an optional `_type` suffix.
    LiteralNumber,
    /// A `'`-delimited string, with an optional `_type` suffix.
    ShortString,
    /// A `"`-delimited string.
    String,

    // Keywords.
    /// `as`
    As,
    /// `const`
    Const,
    /// `false`
    False,
    /// `true`
    True,
    /// `extern`
    Extern,
    /// `type`
    Type,
    /// `fn`
    Function,
    /// `trait`
    Trait,
    /// `impl`
    Impl,
    /// `of`
    Of,
    /// `mod`
    Module,
    /// `struct`
    Struct,
    /// `enum`
    Enum,
    /// `let`
    Let,
    /// `return`
    Return,
    /// `match`
    Match,
    /// `macro`
    Macro,
    /// `if`
    If,
    /// `while`
    While,
    /// `for`
    For,
    /// `loop`
    Loop,
    /// `continue`
    Continue,
    /// `break`
    Break,
    /// `else`
    Else,
    /// `use`
    Use,
    /// `implicits`
    Implicits,
    /// `nopanic`
    NoPanic,
    /// `pub`
    Pub,

    // Modifiers.
    /// `ref`
    Ref,
    /// `mut`
    Mut,

    // Punctuation.
    /// `&`
    And,
    /// `&&`
    AndAnd,
    /// `@`
    At,
    /// `|`
    Or,
    /// `||`
    OrOr,
    /// `^`
    Xor,
    /// `==`
    EqEq,
    /// `!=`
    Neq,
    /// `>=`
    GE,
    /// `>`
    GT,
    /// `<=`
    LE,
    /// `<`
    LT,
    /// `!`
    Not,
    /// `~`
    BitNot,
    /// `+`
    Plus,
    /// `+=`
    PlusEq,
    /// `-`
    Minus,
    /// `-=`
    MinusEq,
    /// `*`
    Mul,
    /// `*=`
    MulEq,
    /// `/`
    Div,
    /// `/=`
    DivEq,
    /// `%`
    Mod,
    /// `%=`
    ModEq,

    /// `:`
    Colon,
    /// `::`
    ColonColon,
    /// `,`
    Comma,
    /// `$`
    Dollar,
    /// `.`
    Dot,
    /// `..`
    DotDot,
    /// `..=`
    DotDotEq,
    /// `=`
    Eq,
    /// `#`
    Hash,
    /// `;`
    Semicolon,
    /// `?`
    QuestionMark,
    /// A lone `_`.
    Underscore,
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `[`
    LBrack,
    /// `]`
    RBrack,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `->`
    Arrow,
    /// `=>`
    MatchArrow,

    // Meta.
    /// The end of the text was reached.
    EndOfFile,
    /// A single character that does not start any known token.
    BadCharacters,
}
454
455fn token_kind_to_terminal_syntax_kind(kind: TokenKind) -> SyntaxKind {
456    match kind {
457        TokenKind::As => SyntaxKind::TerminalAs,
458        TokenKind::Const => SyntaxKind::TerminalConst,
459        TokenKind::Identifier => SyntaxKind::TerminalIdentifier,
460        TokenKind::LiteralNumber => SyntaxKind::TerminalLiteralNumber,
461        TokenKind::ShortString => SyntaxKind::TerminalShortString,
462        TokenKind::String => SyntaxKind::TerminalString,
463        TokenKind::False => SyntaxKind::TerminalFalse,
464        TokenKind::True => SyntaxKind::TerminalTrue,
465        TokenKind::Extern => SyntaxKind::TerminalExtern,
466        TokenKind::Type => SyntaxKind::TerminalType,
467        TokenKind::Function => SyntaxKind::TerminalFunction,
468        TokenKind::Trait => SyntaxKind::TerminalTrait,
469        TokenKind::Impl => SyntaxKind::TerminalImpl,
470        TokenKind::Of => SyntaxKind::TerminalOf,
471        TokenKind::Module => SyntaxKind::TerminalModule,
472        TokenKind::Struct => SyntaxKind::TerminalStruct,
473        TokenKind::Enum => SyntaxKind::TerminalEnum,
474        TokenKind::Let => SyntaxKind::TerminalLet,
475        TokenKind::Return => SyntaxKind::TerminalReturn,
476        TokenKind::Match => SyntaxKind::TerminalMatch,
477        TokenKind::If => SyntaxKind::TerminalIf,
478        TokenKind::While => SyntaxKind::TerminalWhile,
479        TokenKind::For => SyntaxKind::TerminalFor,
480        TokenKind::Loop => SyntaxKind::TerminalLoop,
481        TokenKind::Continue => SyntaxKind::TerminalContinue,
482        TokenKind::Break => SyntaxKind::TerminalBreak,
483        TokenKind::Else => SyntaxKind::TerminalElse,
484        TokenKind::Use => SyntaxKind::TerminalUse,
485        TokenKind::Implicits => SyntaxKind::TerminalImplicits,
486        TokenKind::NoPanic => SyntaxKind::TerminalNoPanic,
487        TokenKind::Pub => SyntaxKind::TerminalPub,
488        TokenKind::Macro => SyntaxKind::TerminalMacro,
489        TokenKind::And => SyntaxKind::TerminalAnd,
490        TokenKind::AndAnd => SyntaxKind::TerminalAndAnd,
491        TokenKind::At => SyntaxKind::TerminalAt,
492        TokenKind::Or => SyntaxKind::TerminalOr,
493        TokenKind::OrOr => SyntaxKind::TerminalOrOr,
494        TokenKind::Xor => SyntaxKind::TerminalXor,
495        TokenKind::EqEq => SyntaxKind::TerminalEqEq,
496        TokenKind::Neq => SyntaxKind::TerminalNeq,
497        TokenKind::GE => SyntaxKind::TerminalGE,
498        TokenKind::GT => SyntaxKind::TerminalGT,
499        TokenKind::LE => SyntaxKind::TerminalLE,
500        TokenKind::LT => SyntaxKind::TerminalLT,
501        TokenKind::Not => SyntaxKind::TerminalNot,
502        TokenKind::BitNot => SyntaxKind::TerminalBitNot,
503        TokenKind::Plus => SyntaxKind::TerminalPlus,
504        TokenKind::PlusEq => SyntaxKind::TerminalPlusEq,
505        TokenKind::Minus => SyntaxKind::TerminalMinus,
506        TokenKind::MinusEq => SyntaxKind::TerminalMinusEq,
507        TokenKind::Mul => SyntaxKind::TerminalMul,
508        TokenKind::MulEq => SyntaxKind::TerminalMulEq,
509        TokenKind::Div => SyntaxKind::TerminalDiv,
510        TokenKind::DivEq => SyntaxKind::TerminalDivEq,
511        TokenKind::Mod => SyntaxKind::TerminalMod,
512        TokenKind::ModEq => SyntaxKind::TerminalModEq,
513        TokenKind::Colon => SyntaxKind::TerminalColon,
514        TokenKind::ColonColon => SyntaxKind::TerminalColonColon,
515        TokenKind::Comma => SyntaxKind::TerminalComma,
516        TokenKind::Dollar => SyntaxKind::TerminalDollar,
517        TokenKind::Dot => SyntaxKind::TerminalDot,
518        TokenKind::DotDot => SyntaxKind::TerminalDotDot,
519        TokenKind::DotDotEq => SyntaxKind::TerminalDotDotEq,
520        TokenKind::Eq => SyntaxKind::TerminalEq,
521        TokenKind::Hash => SyntaxKind::TerminalHash,
522        TokenKind::Semicolon => SyntaxKind::TerminalSemicolon,
523        TokenKind::QuestionMark => SyntaxKind::TerminalQuestionMark,
524        TokenKind::Underscore => SyntaxKind::TerminalUnderscore,
525        TokenKind::LBrace => SyntaxKind::TerminalLBrace,
526        TokenKind::RBrace => SyntaxKind::TerminalRBrace,
527        TokenKind::LBrack => SyntaxKind::TerminalLBrack,
528        TokenKind::RBrack => SyntaxKind::TerminalRBrack,
529        TokenKind::LParen => SyntaxKind::TerminalLParen,
530        TokenKind::RParen => SyntaxKind::TerminalRParen,
531        TokenKind::Ref => SyntaxKind::TerminalRef,
532        TokenKind::Mut => SyntaxKind::TerminalMut,
533        TokenKind::Arrow => SyntaxKind::TerminalArrow,
534        TokenKind::MatchArrow => SyntaxKind::TerminalMatchArrow,
535        TokenKind::BadCharacters => SyntaxKind::TerminalBadCharacters,
536        TokenKind::EndOfFile => SyntaxKind::TerminalEndOfFile,
537    }
538}