cairo_lang_parser/
lexer.rs

1#[cfg(test)]
2#[path = "lexer_test.rs"]
3mod test;
4
5use std::sync::Arc;
6
7use cairo_lang_filesystem::ids::{SmolStrId, Tracked};
8use cairo_lang_filesystem::span::{TextOffset, TextSpan, TextWidth};
9use cairo_lang_syntax::node::Token;
10use cairo_lang_syntax::node::ast::{
11    TokenNewline, TokenSingleLineComment, TokenSingleLineDocComment, TokenSingleLineInnerComment,
12    TokenWhitespace, TriviumGreen,
13};
14use cairo_lang_syntax::node::kind::SyntaxKind;
15use cairo_lang_utils::deque::Deque;
16use salsa::Database;
17
18#[derive(Clone, PartialEq, Eq, Hash)]
19pub struct Lexer {
20    text: Arc<str>,
21    previous_position: TextOffset,
22    current_position: TextOffset,
23}
24
25impl Lexer {
26    pub fn position(&self) -> TextOffset {
27        self.current_position
28    }
29
30    // Helpers.
31    fn peek(&self) -> Option<char> {
32        self.current_position.take_from(&self.text).chars().next()
33    }
34
35    fn peek_nth(&self, n: usize) -> Option<char> {
36        self.current_position.take_from(&self.text).chars().nth(n)
37    }
38
39    fn take(&mut self) -> Option<char> {
40        let res = self.peek()?;
41        self.current_position = self.current_position.add_width(TextWidth::from_char(res));
42        Some(res)
43    }
44
45    /// Takes a character while the given function returns true.
46    fn take_while<F>(&mut self, f: F)
47    where
48        F: Fn(char) -> bool,
49    {
50        while self.peek().map(&f).unwrap_or(false) {
51            self.take();
52        }
53    }
54
55    fn peek_text_span(&self) -> TextSpan {
56        TextSpan::new(self.previous_position, self.current_position)
57    }
58
59    fn consume_text_span(&mut self) -> TextSpan {
60        let val = self.peek_text_span();
61        self.previous_position = self.current_position;
62        val
63    }
64
65    // Trivia matchers.
66    fn match_trivia<'a>(&mut self, db: &'a dyn Database, leading: bool) -> Vec<TriviumGreen<'a>> {
67        let mut res: Vec<TriviumGreen<'a>> = Vec::new();
68        while let Some(current) = self.peek() {
69            let trivium = match current {
70                ' ' | '\r' | '\t' => self.match_trivium_whitespace(db),
71                '\n' => self.match_trivium_newline(db),
72                '/' if self.peek_nth(1) == Some('/') => self.match_trivium_single_line_comment(db),
73                _ => break,
74            };
75            res.push(trivium);
76            if current == '\n' && !leading {
77                break;
78            }
79        }
80        res
81    }
82
83    /// Assumes the next character is one of [' ', '\r', '\t'].
84    fn match_trivium_whitespace<'a>(&mut self, db: &'a dyn Database) -> TriviumGreen<'a> {
85        self.take_while(|s| matches!(s, ' ' | '\r' | '\t'));
86        let span = self.consume_text_span();
87        let text = span.take(&self.text);
88        TokenWhitespace::new_green(db, SmolStrId::from(db, text)).into()
89    }
90
91    /// Assumes the next character '\n'.
92    fn match_trivium_newline<'a>(&mut self, db: &'a dyn Database) -> TriviumGreen<'a> {
93        self.take();
94        let span = self.consume_text_span();
95        let text = span.take(&self.text);
96        TokenNewline::new_green(db, SmolStrId::from(db, text)).into()
97    }
98
99    /// Assumes the next 2 characters are "//".
100    fn match_trivium_single_line_comment<'a>(&mut self, db: &'a dyn Database) -> TriviumGreen<'a> {
101        match self.peek_nth(2) {
102            Some('/') => {
103                self.take_while(|c| c != '\n');
104                let span = self.consume_text_span();
105                let text = span.take(&self.text);
106                TokenSingleLineDocComment::new_green(db, SmolStrId::from(db, text)).into()
107            }
108            Some('!') => {
109                self.take_while(|c| c != '\n');
110                let span = self.consume_text_span();
111                let text = span.take(&self.text);
112                TokenSingleLineInnerComment::new_green(db, SmolStrId::from(db, text)).into()
113            }
114            _ => {
115                self.take_while(|c| c != '\n');
116                let span = self.consume_text_span();
117                let text = span.take(&self.text);
118                TokenSingleLineComment::new_green(db, SmolStrId::from(db, text)).into()
119            }
120        }
121    }
122
123    // Token matchers.
124    // =================================================================================
125
126    /// Takes a number. May be decimal, hex, oct or bin.
127    fn take_token_literal_number(&mut self) -> TokenKind {
128        let special = if self.peek() == Some('0') {
129            self.take();
130            match self.peek() {
131                Some('x' | 'o' | 'b') => {
132                    match self.take() {
133                        Some('x') => self.take_while(|c| c.is_ascii_hexdigit()),
134                        Some('o') => self.take_while(|c| matches!(c, '0'..='7')),
135                        Some('b') => self.take_while(|c| matches!(c, '0'..='1')),
136                        _ => unreachable!(),
137                    }
138                    true
139                }
140                _ => false,
141            }
142        } else {
143            false
144        };
145        // Not a special case - so just reading the rest of the digits.
146        if !special {
147            self.take_while(|c| c.is_ascii_digit());
148        }
149
150        // Parse _type suffix.
151        if self.peek() == Some('_') {
152            self.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
153        }
154        TokenKind::LiteralNumber
155    }
156
157    /// Takes a short string.
158    fn take_token_short_string(&mut self) -> TokenKind {
159        self.take_token_string_helper('\'');
160
161        // Parse _type suffix.
162        if self.peek() == Some('_') {
163            self.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
164        }
165        TokenKind::ShortString
166    }
167
168    /// Takes a string.
169    fn take_token_string(&mut self) -> TokenKind {
170        self.take_token_string_helper('"');
171        TokenKind::String
172    }
173
174    fn take_token_string_helper(&mut self, delimiter: char) {
175        self.take();
176        let mut escaped = false;
177        while let Some(token) = self.peek() {
178            self.take();
179            match token {
180                _ if escaped => escaped = false,
181                '\\' => escaped = true,
182                _ if token == delimiter => {
183                    break;
184                }
185                _ => {}
186            };
187        }
188    }
189
190    /// Assumes the next character is [a-zA-Z_].
191    fn take_token_identifier(&mut self) -> TokenKind {
192        // TODO(spapini): Support or explicitly report general unicode characters.
193        self.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
194
195        let span = self.peek_text_span();
196        match span.take(&self.text) {
197            "as" => TokenKind::As,
198            "const" => TokenKind::Const,
199            "false" => TokenKind::False,
200            "true" => TokenKind::True,
201            "extern" => TokenKind::Extern,
202            "type" => TokenKind::Type,
203            "fn" => TokenKind::Function,
204            "trait" => TokenKind::Trait,
205            "impl" => TokenKind::Impl,
206            "of" => TokenKind::Of,
207            "mod" => TokenKind::Module,
208            "struct" => TokenKind::Struct,
209            "enum" => TokenKind::Enum,
210            "let" => TokenKind::Let,
211            "return" => TokenKind::Return,
212            "match" => TokenKind::Match,
213            "macro" => TokenKind::Macro,
214            "if" => TokenKind::If,
215            "loop" => TokenKind::Loop,
216            "continue" => TokenKind::Continue,
217            "break" => TokenKind::Break,
218            "else" => TokenKind::Else,
219            "while" => TokenKind::While,
220            "use" => TokenKind::Use,
221            "implicits" => TokenKind::Implicits,
222            "ref" => TokenKind::Ref,
223            "mut" => TokenKind::Mut,
224            "for" => TokenKind::For,
225            "nopanic" => TokenKind::NoPanic,
226            "pub" => TokenKind::Pub,
227            "_" => TokenKind::Underscore,
228            _ => TokenKind::Identifier,
229        }
230    }
231
232    /// Takes a single character and returns the given kind.
233    fn take_token_of_kind(&mut self, kind: TokenKind) -> TokenKind {
234        self.take();
235        kind
236    }
237
238    /// If the next character is `second_char`, returns `long_kind`, otherwise returns `short_kind`.
239    fn pick_kind(
240        &mut self,
241        second_char: char,
242        long_kind: TokenKind,
243        short_kind: TokenKind,
244    ) -> TokenKind {
245        self.take();
246        if self.peek() == Some(second_char) {
247            self.take();
248            long_kind
249        } else {
250            short_kind
251        }
252    }
253
254    fn match_terminal<'a>(&mut self, db: &'a dyn Database) -> LexerTerminal<'a> {
255        let leading_trivia = self.match_trivia(db, true);
256
257        let kind = if let Some(current) = self.peek() {
258            match current {
259                '0'..='9' => self.take_token_literal_number(),
260                '\'' => self.take_token_short_string(),
261                '"' => self.take_token_string(),
262                ',' => self.take_token_of_kind(TokenKind::Comma),
263                ';' => self.take_token_of_kind(TokenKind::Semicolon),
264                '?' => self.take_token_of_kind(TokenKind::QuestionMark),
265                '{' => self.take_token_of_kind(TokenKind::LBrace),
266                '}' => self.take_token_of_kind(TokenKind::RBrace),
267                '[' => self.take_token_of_kind(TokenKind::LBrack),
268                ']' => self.take_token_of_kind(TokenKind::RBrack),
269                '(' => self.take_token_of_kind(TokenKind::LParen),
270                ')' => self.take_token_of_kind(TokenKind::RParen),
271                '.' => {
272                    self.take();
273                    match self.peek() {
274                        Some('.') => self.pick_kind('=', TokenKind::DotDotEq, TokenKind::DotDot),
275                        _ => TokenKind::Dot,
276                    }
277                }
278                '*' => self.pick_kind('=', TokenKind::MulEq, TokenKind::Mul),
279                '/' => self.pick_kind('=', TokenKind::DivEq, TokenKind::Div),
280                '%' => self.pick_kind('=', TokenKind::ModEq, TokenKind::Mod),
281                '+' => self.pick_kind('=', TokenKind::PlusEq, TokenKind::Plus),
282                '#' => self.take_token_of_kind(TokenKind::Hash),
283                '$' => self.take_token_of_kind(TokenKind::Dollar),
284                '-' => {
285                    self.take();
286                    match self.peek() {
287                        Some('>') => self.take_token_of_kind(TokenKind::Arrow),
288                        Some('=') => self.take_token_of_kind(TokenKind::MinusEq),
289                        _ => TokenKind::Minus,
290                    }
291                }
292                '<' => self.pick_kind('=', TokenKind::LE, TokenKind::LT),
293                '>' => self.pick_kind('=', TokenKind::GE, TokenKind::GT),
294                'a'..='z' | 'A'..='Z' | '_' => self.take_token_identifier(),
295                ':' => self.pick_kind(':', TokenKind::ColonColon, TokenKind::Colon),
296                '!' => self.pick_kind('=', TokenKind::Neq, TokenKind::Not),
297                '~' => self.take_token_of_kind(TokenKind::BitNot),
298                '=' => {
299                    self.take();
300                    match self.peek() {
301                        Some('=') => self.take_token_of_kind(TokenKind::EqEq),
302                        Some('>') => self.take_token_of_kind(TokenKind::MatchArrow),
303                        _ => TokenKind::Eq,
304                    }
305                }
306                '&' => self.pick_kind('&', TokenKind::AndAnd, TokenKind::And),
307                '|' => self.pick_kind('|', TokenKind::OrOr, TokenKind::Or),
308                '^' => self.take_token_of_kind(TokenKind::Xor),
309                '@' => self.take_token_of_kind(TokenKind::At),
310                _ => self.take_token_of_kind(TokenKind::BadCharacters),
311            }
312        } else {
313            TokenKind::EndOfFile
314        };
315
316        let span = self.consume_text_span();
317        let text_arc = self.text.clone();
318        let text = span.take(&text_arc);
319        let trailing_trivia = self.match_trivia(db, false);
320        let terminal_kind = token_kind_to_terminal_syntax_kind(kind);
321
322        // TODO(yuval): log(verbose) "consumed text: ..."
323        LexerTerminal {
324            text: SmolStrId::from(db, text),
325            kind: terminal_kind,
326            leading_trivia,
327            trailing_trivia,
328        }
329    }
330}
331
332/// Tokenizes the entire text and returns a deque of terminals.
333#[salsa::tracked]
334pub fn tokenize_all<'a>(
335    db: &'a dyn Database,
336    _tracked: Tracked,
337    text: Arc<str>,
338) -> cairo_lang_utils::deque::Deque<LexerTerminal<'a>> {
339    let mut lexer =
340        Lexer { text, previous_position: TextOffset::START, current_position: TextOffset::START };
341    let mut result: Deque<LexerTerminal<'a>> = Default::default();
342    loop {
343        let terminal = lexer.match_terminal(db);
344        let is_eof = terminal.kind == SyntaxKind::TerminalEndOfFile;
345        result.push_back(terminal);
346        if is_eof {
347            break;
348        }
349    }
350    result
351}
352
353/// Output terminal emitted by the lexer.
354#[derive(Clone, PartialEq, Eq, Debug, salsa::Update)]
355pub struct LexerTerminal<'a> {
356    pub text: SmolStrId<'a>,
357    /// The kind of the inner token of this terminal.
358    pub kind: SyntaxKind,
359    pub leading_trivia: Vec<TriviumGreen<'a>>,
360    pub trailing_trivia: Vec<TriviumGreen<'a>>,
361}
362impl<'a> LexerTerminal<'a> {
363    pub fn width(&self, db: &dyn Database) -> TextWidth {
364        self.leading_trivia.iter().map(|t| t.0.width(db)).sum::<TextWidth>()
365            + TextWidth::from_str(self.text.long(db))
366            + self.trailing_trivia.iter().map(|t| t.0.width(db)).sum::<TextWidth>()
367    }
368
369    pub fn text(&self, db: &'a dyn Database) -> &'a str {
370        self.text.long(db)
371    }
372}
373
/// Lexer-internal classification of a token, before it is mapped to a terminal `SyntaxKind`.
#[derive(Clone, Copy, PartialEq, Debug, Eq, Hash)]
enum TokenKind {
    /// Any identifier that is not one of the keywords below.
    Identifier,

    // Literals.
    LiteralNumber,
    ShortString,
    String,

    // Keywords.
    As,
    Const,
    False,
    True,
    Extern,
    Type,
    /// The `fn` keyword.
    Function,
    Trait,
    Impl,
    Of,
    /// The `mod` keyword.
    Module,
    Struct,
    Enum,
    Let,
    Return,
    Match,
    Macro,
    If,
    While,
    For,
    Loop,
    Continue,
    Break,
    Else,
    Use,
    Implicits,
    /// The `nopanic` keyword.
    NoPanic,
    Pub,

    // Modifiers.
    Ref,
    Mut,

    // Punctuation: operators.
    /// `&`
    And,
    /// `&&`
    AndAnd,
    /// `@`
    At,
    /// `|`
    Or,
    /// `||`
    OrOr,
    /// `^`
    Xor,
    /// `==`
    EqEq,
    /// `!=`
    Neq,
    /// `>=`
    GE,
    /// `>`
    GT,
    /// `<=`
    LE,
    /// `<`
    LT,
    /// `!`
    Not,
    /// `~`
    BitNot,
    /// `+`
    Plus,
    /// `+=`
    PlusEq,
    /// `-`
    Minus,
    /// `-=`
    MinusEq,
    /// `*`
    Mul,
    /// `*=`
    MulEq,
    /// `/`
    Div,
    /// `/=`
    DivEq,
    /// `%`
    Mod,
    /// `%=`
    ModEq,

    // Punctuation: separators, brackets and arrows.
    /// `:`
    Colon,
    /// `::`
    ColonColon,
    /// `,`
    Comma,
    /// `$`
    Dollar,
    /// `.`
    Dot,
    /// `..`
    DotDot,
    /// `..=`
    DotDotEq,
    /// `=`
    Eq,
    /// `#`
    Hash,
    /// `;`
    Semicolon,
    /// `?`
    QuestionMark,
    /// A lone `_`.
    Underscore,
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `[`
    LBrack,
    /// `]`
    RBrack,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `->`
    Arrow,
    /// `=>`
    MatchArrow,

    // Meta.
    /// Emitted once the end of the text is reached.
    EndOfFile,
    /// A character that does not start any known token.
    BadCharacters,
}
468
469fn token_kind_to_terminal_syntax_kind(kind: TokenKind) -> SyntaxKind {
470    match kind {
471        TokenKind::As => SyntaxKind::TerminalAs,
472        TokenKind::Const => SyntaxKind::TerminalConst,
473        TokenKind::Identifier => SyntaxKind::TerminalIdentifier,
474        TokenKind::LiteralNumber => SyntaxKind::TerminalLiteralNumber,
475        TokenKind::ShortString => SyntaxKind::TerminalShortString,
476        TokenKind::String => SyntaxKind::TerminalString,
477        TokenKind::False => SyntaxKind::TerminalFalse,
478        TokenKind::True => SyntaxKind::TerminalTrue,
479        TokenKind::Extern => SyntaxKind::TerminalExtern,
480        TokenKind::Type => SyntaxKind::TerminalType,
481        TokenKind::Function => SyntaxKind::TerminalFunction,
482        TokenKind::Trait => SyntaxKind::TerminalTrait,
483        TokenKind::Impl => SyntaxKind::TerminalImpl,
484        TokenKind::Of => SyntaxKind::TerminalOf,
485        TokenKind::Module => SyntaxKind::TerminalModule,
486        TokenKind::Struct => SyntaxKind::TerminalStruct,
487        TokenKind::Enum => SyntaxKind::TerminalEnum,
488        TokenKind::Let => SyntaxKind::TerminalLet,
489        TokenKind::Return => SyntaxKind::TerminalReturn,
490        TokenKind::Match => SyntaxKind::TerminalMatch,
491        TokenKind::If => SyntaxKind::TerminalIf,
492        TokenKind::While => SyntaxKind::TerminalWhile,
493        TokenKind::For => SyntaxKind::TerminalFor,
494        TokenKind::Loop => SyntaxKind::TerminalLoop,
495        TokenKind::Continue => SyntaxKind::TerminalContinue,
496        TokenKind::Break => SyntaxKind::TerminalBreak,
497        TokenKind::Else => SyntaxKind::TerminalElse,
498        TokenKind::Use => SyntaxKind::TerminalUse,
499        TokenKind::Implicits => SyntaxKind::TerminalImplicits,
500        TokenKind::NoPanic => SyntaxKind::TerminalNoPanic,
501        TokenKind::Pub => SyntaxKind::TerminalPub,
502        TokenKind::Macro => SyntaxKind::TerminalMacro,
503        TokenKind::And => SyntaxKind::TerminalAnd,
504        TokenKind::AndAnd => SyntaxKind::TerminalAndAnd,
505        TokenKind::At => SyntaxKind::TerminalAt,
506        TokenKind::Or => SyntaxKind::TerminalOr,
507        TokenKind::OrOr => SyntaxKind::TerminalOrOr,
508        TokenKind::Xor => SyntaxKind::TerminalXor,
509        TokenKind::EqEq => SyntaxKind::TerminalEqEq,
510        TokenKind::Neq => SyntaxKind::TerminalNeq,
511        TokenKind::GE => SyntaxKind::TerminalGE,
512        TokenKind::GT => SyntaxKind::TerminalGT,
513        TokenKind::LE => SyntaxKind::TerminalLE,
514        TokenKind::LT => SyntaxKind::TerminalLT,
515        TokenKind::Not => SyntaxKind::TerminalNot,
516        TokenKind::BitNot => SyntaxKind::TerminalBitNot,
517        TokenKind::Plus => SyntaxKind::TerminalPlus,
518        TokenKind::PlusEq => SyntaxKind::TerminalPlusEq,
519        TokenKind::Minus => SyntaxKind::TerminalMinus,
520        TokenKind::MinusEq => SyntaxKind::TerminalMinusEq,
521        TokenKind::Mul => SyntaxKind::TerminalMul,
522        TokenKind::MulEq => SyntaxKind::TerminalMulEq,
523        TokenKind::Div => SyntaxKind::TerminalDiv,
524        TokenKind::DivEq => SyntaxKind::TerminalDivEq,
525        TokenKind::Mod => SyntaxKind::TerminalMod,
526        TokenKind::ModEq => SyntaxKind::TerminalModEq,
527        TokenKind::Colon => SyntaxKind::TerminalColon,
528        TokenKind::ColonColon => SyntaxKind::TerminalColonColon,
529        TokenKind::Comma => SyntaxKind::TerminalComma,
530        TokenKind::Dollar => SyntaxKind::TerminalDollar,
531        TokenKind::Dot => SyntaxKind::TerminalDot,
532        TokenKind::DotDot => SyntaxKind::TerminalDotDot,
533        TokenKind::DotDotEq => SyntaxKind::TerminalDotDotEq,
534        TokenKind::Eq => SyntaxKind::TerminalEq,
535        TokenKind::Hash => SyntaxKind::TerminalHash,
536        TokenKind::Semicolon => SyntaxKind::TerminalSemicolon,
537        TokenKind::QuestionMark => SyntaxKind::TerminalQuestionMark,
538        TokenKind::Underscore => SyntaxKind::TerminalUnderscore,
539        TokenKind::LBrace => SyntaxKind::TerminalLBrace,
540        TokenKind::RBrace => SyntaxKind::TerminalRBrace,
541        TokenKind::LBrack => SyntaxKind::TerminalLBrack,
542        TokenKind::RBrack => SyntaxKind::TerminalRBrack,
543        TokenKind::LParen => SyntaxKind::TerminalLParen,
544        TokenKind::RParen => SyntaxKind::TerminalRParen,
545        TokenKind::Ref => SyntaxKind::TerminalRef,
546        TokenKind::Mut => SyntaxKind::TerminalMut,
547        TokenKind::Arrow => SyntaxKind::TerminalArrow,
548        TokenKind::MatchArrow => SyntaxKind::TerminalMatchArrow,
549        TokenKind::BadCharacters => SyntaxKind::TerminalBadCharacters,
550        TokenKind::EndOfFile => SyntaxKind::TerminalEndOfFile,
551    }
552}