Skip to main content

cairo_lang_parser/
lexer.rs

1#[cfg(test)]
2#[path = "lexer_test.rs"]
3mod test;
4
5use std::sync::Arc;
6
7use cairo_lang_filesystem::ids::{SmolStrId, Tracked};
8use cairo_lang_filesystem::span::{TextOffset, TextSpan, TextWidth};
9use cairo_lang_syntax::node::Token;
10use cairo_lang_syntax::node::ast::{
11    TokenNewline, TokenSingleLineComment, TokenSingleLineDocComment, TokenSingleLineInnerComment,
12    TokenWhitespace, TriviumGreen,
13};
14use cairo_lang_syntax::node::kind::SyntaxKind;
15use cairo_lang_utils::deque::Deque;
16use salsa::Database;
17
18#[derive(Clone, PartialEq, Eq, Hash)]
19pub struct Lexer {
20    text: Arc<str>,
21    previous_position: TextOffset,
22    current_position: TextOffset,
23}
24
25impl Lexer {
26    pub fn position(&self) -> TextOffset {
27        self.current_position
28    }
29
30    // Helpers.
31    fn peek(&self) -> Option<char> {
32        self.current_position.take_from(&self.text).chars().next()
33    }
34
35    fn peek_nth(&self, n: usize) -> Option<char> {
36        self.current_position.take_from(&self.text).chars().nth(n)
37    }
38
39    fn take(&mut self) -> Option<char> {
40        let res = self.peek()?;
41        self.current_position = self.current_position.add_width(TextWidth::from_char(res));
42        Some(res)
43    }
44
45    /// Takes a character while the given function returns true.
46    fn take_while<F>(&mut self, f: F)
47    where
48        F: Fn(char) -> bool,
49    {
50        while self.peek().map(&f).unwrap_or(false) {
51            self.take();
52        }
53    }
54
55    fn peek_text_span(&self) -> TextSpan {
56        TextSpan::new(self.previous_position, self.current_position)
57    }
58
59    fn consume_text_span(&mut self) -> TextSpan {
60        let val = self.peek_text_span();
61        self.previous_position = self.current_position;
62        val
63    }
64
65    // Trivia matchers.
66    fn match_trivia<'a>(&mut self, db: &'a dyn Database, leading: bool) -> Vec<TriviumGreen<'a>> {
67        let mut res: Vec<TriviumGreen<'a>> = Vec::new();
68        while let Some(current) = self.peek() {
69            let trivium = match current {
70                ' ' | '\r' | '\t' => self.match_trivium_whitespace(db),
71                '\n' => self.match_trivium_newline(db),
72                '/' if self.peek_nth(1) == Some('/') => self.match_trivium_single_line_comment(db),
73                _ => break,
74            };
75            res.push(trivium);
76            if current == '\n' && !leading {
77                break;
78            }
79        }
80        res
81    }
82
83    /// Assumes the next character is one of [' ', '\r', '\t'].
84    fn match_trivium_whitespace<'a>(&mut self, db: &'a dyn Database) -> TriviumGreen<'a> {
85        self.take_while(|s| matches!(s, ' ' | '\r' | '\t'));
86        let span = self.consume_text_span();
87        let text = span.take(&self.text);
88        TokenWhitespace::new_green(db, SmolStrId::from(db, text)).into()
89    }
90
91    /// Assumes the next character is '\n'.
92    fn match_trivium_newline<'a>(&mut self, db: &'a dyn Database) -> TriviumGreen<'a> {
93        self.take();
94        let span = self.consume_text_span();
95        let text = span.take(&self.text);
96        TokenNewline::new_green(db, SmolStrId::from(db, text)).into()
97    }
98
99    /// Assumes the next 2 characters are "//".
100    fn match_trivium_single_line_comment<'a>(&mut self, db: &'a dyn Database) -> TriviumGreen<'a> {
101        match self.peek_nth(2) {
102            Some('/') => {
103                self.take_while(|c| c != '\n');
104                let span = self.consume_text_span();
105                let text = span.take(&self.text);
106                TokenSingleLineDocComment::new_green(db, SmolStrId::from(db, text)).into()
107            }
108            Some('!') => {
109                self.take_while(|c| c != '\n');
110                let span = self.consume_text_span();
111                let text = span.take(&self.text);
112                TokenSingleLineInnerComment::new_green(db, SmolStrId::from(db, text)).into()
113            }
114            _ => {
115                self.take_while(|c| c != '\n');
116                let span = self.consume_text_span();
117                let text = span.take(&self.text);
118                TokenSingleLineComment::new_green(db, SmolStrId::from(db, text)).into()
119            }
120        }
121    }
122
123    // Token matchers.
124    // =================================================================================
125
126    /// Takes a number. May be decimal, hex, oct or bin.
127    fn take_token_literal_number(&mut self) -> TokenKind {
128        let special = if self.peek() == Some('0') {
129            self.take();
130            match self.peek() {
131                Some('x' | 'o' | 'b') => {
132                    match self.take() {
133                        Some('x') => self.take_while(|c| c.is_ascii_hexdigit()),
134                        Some('o') => self.take_while(|c| matches!(c, '0'..='7')),
135                        Some('b') => self.take_while(|c| matches!(c, '0'..='1')),
136                        _ => unreachable!(),
137                    }
138                    true
139                }
140                _ => false,
141            }
142        } else {
143            false
144        };
145        // Not a special case - so just reading the rest of the digits.
146        if !special {
147            self.take_while(|c| c.is_ascii_digit());
148        }
149
150        // Parse _type suffix.
151        if self.peek() == Some('_') {
152            self.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
153        }
154        TokenKind::LiteralNumber
155    }
156
157    /// Takes a short string.
158    fn take_token_short_string(&mut self) -> TokenKind {
159        self.take_token_string_helper('\'');
160
161        // Parse _type suffix.
162        if self.peek() == Some('_') {
163            self.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
164        }
165        TokenKind::ShortString
166    }
167
168    /// Takes a string.
169    fn take_token_string(&mut self) -> TokenKind {
170        self.take_token_string_helper('"');
171        TokenKind::String
172    }
173
174    fn take_token_string_helper(&mut self, delimiter: char) {
175        self.take();
176        let mut escaped = false;
177        while let Some(token) = self.peek() {
178            self.take();
179            match token {
180                _ if escaped => escaped = false,
181                '\\' => escaped = true,
182                _ if token == delimiter => {
183                    break;
184                }
185                _ => {}
186            };
187        }
188    }
189
190    /// Assumes the next character is [a-zA-Z_].
191    fn take_token_identifier(&mut self) -> TokenKind {
192        // TODO(spapini): Support or explicitly report general unicode characters.
193        self.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
194
195        let span = self.peek_text_span();
196        match span.take(&self.text) {
197            "as" => TokenKind::As,
198            "const" => TokenKind::Const,
199            "false" => TokenKind::False,
200            "true" => TokenKind::True,
201            "extern" => TokenKind::Extern,
202            "type" => TokenKind::Type,
203            "fn" => TokenKind::Function,
204            "trait" => TokenKind::Trait,
205            "impl" => TokenKind::Impl,
206            "of" => TokenKind::Of,
207            "mod" => TokenKind::Module,
208            "struct" => TokenKind::Struct,
209            "enum" => TokenKind::Enum,
210            "let" => TokenKind::Let,
211            "return" => TokenKind::Return,
212            "match" => TokenKind::Match,
213            "macro" => TokenKind::Macro,
214            "if" => TokenKind::If,
215            "loop" => TokenKind::Loop,
216            "continue" => TokenKind::Continue,
217            "break" => TokenKind::Break,
218            "else" => TokenKind::Else,
219            "while" => TokenKind::While,
220            "use" => TokenKind::Use,
221            "implicits" => TokenKind::Implicits,
222            "ref" => TokenKind::Ref,
223            "mut" => TokenKind::Mut,
224            "for" => TokenKind::For,
225            "nopanic" => TokenKind::NoPanic,
226            "pub" => TokenKind::Pub,
227            "_" => TokenKind::Underscore,
228            _ => TokenKind::Identifier,
229        }
230    }
231
232    /// Takes a single character and returns the given kind.
233    fn take_token_of_kind(&mut self, kind: TokenKind) -> TokenKind {
234        self.take();
235        kind
236    }
237
238    /// If the next character is `second_char`, returns `long_kind`, otherwise returns `short_kind`.
239    fn pick_kind(
240        &mut self,
241        second_char: char,
242        long_kind: TokenKind,
243        short_kind: TokenKind,
244    ) -> TokenKind {
245        self.take();
246        if self.peek() == Some(second_char) {
247            self.take();
248            long_kind
249        } else {
250            short_kind
251        }
252    }
253
254    fn match_terminal<'a>(&mut self, db: &'a dyn Database) -> LexerTerminal<'a> {
255        let leading_trivia = self.match_trivia(db, true);
256
257        let kind = if let Some(current) = self.peek() {
258            match current {
259                '0'..='9' => self.take_token_literal_number(),
260                '\'' => self.take_token_short_string(),
261                '"' => self.take_token_string(),
262                ',' => self.take_token_of_kind(TokenKind::Comma),
263                ';' => self.take_token_of_kind(TokenKind::Semicolon),
264                '?' => self.take_token_of_kind(TokenKind::QuestionMark),
265                '{' => self.take_token_of_kind(TokenKind::LBrace),
266                '}' => self.take_token_of_kind(TokenKind::RBrace),
267                '[' => self.take_token_of_kind(TokenKind::LBrack),
268                ']' => self.take_token_of_kind(TokenKind::RBrack),
269                '(' => self.take_token_of_kind(TokenKind::LParen),
270                ')' => self.take_token_of_kind(TokenKind::RParen),
271                '.' => {
272                    self.take();
273                    match self.peek() {
274                        Some('.') => self.pick_kind('=', TokenKind::DotDotEq, TokenKind::DotDot),
275                        _ => TokenKind::Dot,
276                    }
277                }
278                '*' => self.pick_kind('=', TokenKind::MulEq, TokenKind::Mul),
279                '/' => self.pick_kind('=', TokenKind::DivEq, TokenKind::Div),
280                '%' => self.pick_kind('=', TokenKind::ModEq, TokenKind::Mod),
281                '+' => self.pick_kind('=', TokenKind::PlusEq, TokenKind::Plus),
282                '#' => self.take_token_of_kind(TokenKind::Hash),
283                '$' => self.take_token_of_kind(TokenKind::Dollar),
284                '-' => {
285                    self.take();
286                    match self.peek() {
287                        Some('>') => self.take_token_of_kind(TokenKind::Arrow),
288                        Some('=') => self.take_token_of_kind(TokenKind::MinusEq),
289                        _ => TokenKind::Minus,
290                    }
291                }
292                '<' => self.pick_kind('=', TokenKind::LE, TokenKind::LT),
293                '>' => self.pick_kind('=', TokenKind::GE, TokenKind::GT),
294                'a'..='z' | 'A'..='Z' | '_' => self.take_token_identifier(),
295                ':' => self.pick_kind(':', TokenKind::ColonColon, TokenKind::Colon),
296                '!' => self.pick_kind('=', TokenKind::Neq, TokenKind::Not),
297                '~' => self.take_token_of_kind(TokenKind::BitNot),
298                '=' => {
299                    self.take();
300                    match self.peek() {
301                        Some('=') => self.take_token_of_kind(TokenKind::EqEq),
302                        Some('>') => self.take_token_of_kind(TokenKind::MatchArrow),
303                        _ => TokenKind::Eq,
304                    }
305                }
306                '&' => self.pick_kind('&', TokenKind::AndAnd, TokenKind::And),
307                '|' => self.pick_kind('|', TokenKind::OrOr, TokenKind::Or),
308                '^' => self.take_token_of_kind(TokenKind::Xor),
309                '@' => self.take_token_of_kind(TokenKind::At),
310                _ => self.take_token_of_kind(TokenKind::BadCharacters),
311            }
312        } else {
313            TokenKind::EndOfFile
314        };
315
316        let span = self.consume_text_span();
317        let text = SmolStrId::from(db, span.take(&self.text));
318        let trailing_trivia = self.match_trivia(db, false);
319        let terminal_kind = token_kind_to_terminal_syntax_kind(kind);
320
321        // TODO(yuval): log(verbose) "consumed text: ..."
322        LexerTerminal { text, kind: terminal_kind, leading_trivia, trailing_trivia }
323    }
324}
325
326/// Tokenizes the entire text and returns a deque of terminals.
327#[salsa::tracked]
328pub fn tokenize_all<'a>(
329    db: &'a dyn Database,
330    _tracked: Tracked,
331    text: Arc<str>,
332) -> cairo_lang_utils::deque::Deque<LexerTerminal<'a>> {
333    let mut lexer =
334        Lexer { text, previous_position: TextOffset::START, current_position: TextOffset::START };
335    let mut result: Deque<LexerTerminal<'a>> = Default::default();
336    loop {
337        let terminal = lexer.match_terminal(db);
338        let is_eof = terminal.kind == SyntaxKind::TerminalEndOfFile;
339        result.push_back(terminal);
340        if is_eof {
341            break;
342        }
343    }
344    result
345}
346
347/// Output terminal emitted by the lexer.
348#[derive(Clone, PartialEq, Eq, Debug, salsa::Update)]
349pub struct LexerTerminal<'a> {
350    pub text: SmolStrId<'a>,
351    /// The kind of the inner token of this terminal.
352    pub kind: SyntaxKind,
353    pub leading_trivia: Vec<TriviumGreen<'a>>,
354    pub trailing_trivia: Vec<TriviumGreen<'a>>,
355}
356impl<'a> LexerTerminal<'a> {
357    pub fn width(&self, db: &dyn Database) -> TextWidth {
358        self.leading_trivia.iter().map(|t| t.0.width(db)).sum::<TextWidth>()
359            + TextWidth::from_str(self.text.long(db))
360            + self.trailing_trivia.iter().map(|t| t.0.width(db)).sum::<TextWidth>()
361    }
362
363    pub fn text(&self, db: &'a dyn Database) -> &'a str {
364        self.text.long(db)
365    }
366}
367
368#[derive(Clone, Copy, PartialEq, Debug, Eq, Hash)]
369enum TokenKind {
370    Identifier,
371
372    // Literals.
373    LiteralNumber,
374    ShortString,
375    String,
376
377    // Keywords.
378    As,
379    Const,
380    False,
381    True,
382    Extern,
383    Type,
384    Function,
385    Trait,
386    Impl,
387    Of,
388    Module,
389    Struct,
390    Enum,
391    Let,
392    Return,
393    Match,
394    Macro,
395    If,
396    While,
397    For,
398    Loop,
399    Continue,
400    Break,
401    Else,
402    Use,
403    Implicits,
404    NoPanic,
405    Pub,
406
407    // Modifiers.
408    Ref,
409    Mut,
410
411    // Punctuation.
412    And,
413    AndAnd,
414    At,
415    Or,
416    OrOr,
417    Xor,
418    EqEq,
419    Neq,
420    GE,
421    GT,
422    LE,
423    LT,
424    Not,
425    BitNot,
426    Plus,
427    PlusEq,
428    Minus,
429    MinusEq,
430    Mul,
431    MulEq,
432    Div,
433    DivEq,
434    Mod,
435    ModEq,
436
437    Colon,
438    ColonColon,
439    Comma,
440    Dollar,
441    Dot,
442    DotDot,
443    DotDotEq,
444    Eq,
445    Hash,
446    Semicolon,
447    QuestionMark,
448    Underscore,
449    LBrace,
450    RBrace,
451    LBrack,
452    RBrack,
453    LParen,
454    RParen,
455    Arrow,
456    MatchArrow,
457
458    // Meta.
459    EndOfFile,
460    BadCharacters,
461}
462
463fn token_kind_to_terminal_syntax_kind(kind: TokenKind) -> SyntaxKind {
464    match kind {
465        TokenKind::As => SyntaxKind::TerminalAs,
466        TokenKind::Const => SyntaxKind::TerminalConst,
467        TokenKind::Identifier => SyntaxKind::TerminalIdentifier,
468        TokenKind::LiteralNumber => SyntaxKind::TerminalLiteralNumber,
469        TokenKind::ShortString => SyntaxKind::TerminalShortString,
470        TokenKind::String => SyntaxKind::TerminalString,
471        TokenKind::False => SyntaxKind::TerminalFalse,
472        TokenKind::True => SyntaxKind::TerminalTrue,
473        TokenKind::Extern => SyntaxKind::TerminalExtern,
474        TokenKind::Type => SyntaxKind::TerminalType,
475        TokenKind::Function => SyntaxKind::TerminalFunction,
476        TokenKind::Trait => SyntaxKind::TerminalTrait,
477        TokenKind::Impl => SyntaxKind::TerminalImpl,
478        TokenKind::Of => SyntaxKind::TerminalOf,
479        TokenKind::Module => SyntaxKind::TerminalModule,
480        TokenKind::Struct => SyntaxKind::TerminalStruct,
481        TokenKind::Enum => SyntaxKind::TerminalEnum,
482        TokenKind::Let => SyntaxKind::TerminalLet,
483        TokenKind::Return => SyntaxKind::TerminalReturn,
484        TokenKind::Match => SyntaxKind::TerminalMatch,
485        TokenKind::If => SyntaxKind::TerminalIf,
486        TokenKind::While => SyntaxKind::TerminalWhile,
487        TokenKind::For => SyntaxKind::TerminalFor,
488        TokenKind::Loop => SyntaxKind::TerminalLoop,
489        TokenKind::Continue => SyntaxKind::TerminalContinue,
490        TokenKind::Break => SyntaxKind::TerminalBreak,
491        TokenKind::Else => SyntaxKind::TerminalElse,
492        TokenKind::Use => SyntaxKind::TerminalUse,
493        TokenKind::Implicits => SyntaxKind::TerminalImplicits,
494        TokenKind::NoPanic => SyntaxKind::TerminalNoPanic,
495        TokenKind::Pub => SyntaxKind::TerminalPub,
496        TokenKind::Macro => SyntaxKind::TerminalMacro,
497        TokenKind::And => SyntaxKind::TerminalAnd,
498        TokenKind::AndAnd => SyntaxKind::TerminalAndAnd,
499        TokenKind::At => SyntaxKind::TerminalAt,
500        TokenKind::Or => SyntaxKind::TerminalOr,
501        TokenKind::OrOr => SyntaxKind::TerminalOrOr,
502        TokenKind::Xor => SyntaxKind::TerminalXor,
503        TokenKind::EqEq => SyntaxKind::TerminalEqEq,
504        TokenKind::Neq => SyntaxKind::TerminalNeq,
505        TokenKind::GE => SyntaxKind::TerminalGE,
506        TokenKind::GT => SyntaxKind::TerminalGT,
507        TokenKind::LE => SyntaxKind::TerminalLE,
508        TokenKind::LT => SyntaxKind::TerminalLT,
509        TokenKind::Not => SyntaxKind::TerminalNot,
510        TokenKind::BitNot => SyntaxKind::TerminalBitNot,
511        TokenKind::Plus => SyntaxKind::TerminalPlus,
512        TokenKind::PlusEq => SyntaxKind::TerminalPlusEq,
513        TokenKind::Minus => SyntaxKind::TerminalMinus,
514        TokenKind::MinusEq => SyntaxKind::TerminalMinusEq,
515        TokenKind::Mul => SyntaxKind::TerminalMul,
516        TokenKind::MulEq => SyntaxKind::TerminalMulEq,
517        TokenKind::Div => SyntaxKind::TerminalDiv,
518        TokenKind::DivEq => SyntaxKind::TerminalDivEq,
519        TokenKind::Mod => SyntaxKind::TerminalMod,
520        TokenKind::ModEq => SyntaxKind::TerminalModEq,
521        TokenKind::Colon => SyntaxKind::TerminalColon,
522        TokenKind::ColonColon => SyntaxKind::TerminalColonColon,
523        TokenKind::Comma => SyntaxKind::TerminalComma,
524        TokenKind::Dollar => SyntaxKind::TerminalDollar,
525        TokenKind::Dot => SyntaxKind::TerminalDot,
526        TokenKind::DotDot => SyntaxKind::TerminalDotDot,
527        TokenKind::DotDotEq => SyntaxKind::TerminalDotDotEq,
528        TokenKind::Eq => SyntaxKind::TerminalEq,
529        TokenKind::Hash => SyntaxKind::TerminalHash,
530        TokenKind::Semicolon => SyntaxKind::TerminalSemicolon,
531        TokenKind::QuestionMark => SyntaxKind::TerminalQuestionMark,
532        TokenKind::Underscore => SyntaxKind::TerminalUnderscore,
533        TokenKind::LBrace => SyntaxKind::TerminalLBrace,
534        TokenKind::RBrace => SyntaxKind::TerminalRBrace,
535        TokenKind::LBrack => SyntaxKind::TerminalLBrack,
536        TokenKind::RBrack => SyntaxKind::TerminalRBrack,
537        TokenKind::LParen => SyntaxKind::TerminalLParen,
538        TokenKind::RParen => SyntaxKind::TerminalRParen,
539        TokenKind::Ref => SyntaxKind::TerminalRef,
540        TokenKind::Mut => SyntaxKind::TerminalMut,
541        TokenKind::Arrow => SyntaxKind::TerminalArrow,
542        TokenKind::MatchArrow => SyntaxKind::TerminalMatchArrow,
543        TokenKind::BadCharacters => SyntaxKind::TerminalBadCharacters,
544        TokenKind::EndOfFile => SyntaxKind::TerminalEndOfFile,
545    }
546}