1#[cfg(test)]
2#[path = "lexer_test.rs"]
3mod test;
4
5use std::sync::Arc;
6
7use cairo_lang_filesystem::ids::{SmolStrId, Tracked};
8use cairo_lang_filesystem::span::{TextOffset, TextSpan, TextWidth};
9use cairo_lang_syntax::node::Token;
10use cairo_lang_syntax::node::ast::{
11 TokenNewline, TokenSingleLineComment, TokenSingleLineDocComment, TokenSingleLineInnerComment,
12 TokenWhitespace, TriviumGreen,
13};
14use cairo_lang_syntax::node::kind::SyntaxKind;
15use cairo_lang_utils::deque::Deque;
16use salsa::Database;
17
18#[derive(Clone, PartialEq, Eq, Hash)]
19pub struct Lexer {
20 text: Arc<str>,
21 previous_position: TextOffset,
22 current_position: TextOffset,
23}
24
25impl Lexer {
26 pub fn position(&self) -> TextOffset {
27 self.current_position
28 }
29
30 fn peek(&self) -> Option<char> {
32 self.current_position.take_from(&self.text).chars().next()
33 }
34
35 fn peek_nth(&self, n: usize) -> Option<char> {
36 self.current_position.take_from(&self.text).chars().nth(n)
37 }
38
39 fn take(&mut self) -> Option<char> {
40 let res = self.peek()?;
41 self.current_position = self.current_position.add_width(TextWidth::from_char(res));
42 Some(res)
43 }
44
45 fn take_while<F>(&mut self, f: F)
47 where
48 F: Fn(char) -> bool,
49 {
50 while self.peek().map(&f).unwrap_or(false) {
51 self.take();
52 }
53 }
54
55 fn peek_text_span(&self) -> TextSpan {
56 TextSpan::new(self.previous_position, self.current_position)
57 }
58
59 fn consume_text_span(&mut self) -> TextSpan {
60 let val = self.peek_text_span();
61 self.previous_position = self.current_position;
62 val
63 }
64
65 fn match_trivia<'a>(&mut self, db: &'a dyn Database, leading: bool) -> Vec<TriviumGreen<'a>> {
67 let mut res: Vec<TriviumGreen<'a>> = Vec::new();
68 while let Some(current) = self.peek() {
69 let trivium = match current {
70 ' ' | '\r' | '\t' => self.match_trivium_whitespace(db),
71 '\n' => self.match_trivium_newline(db),
72 '/' if self.peek_nth(1) == Some('/') => self.match_trivium_single_line_comment(db),
73 _ => break,
74 };
75 res.push(trivium);
76 if current == '\n' && !leading {
77 break;
78 }
79 }
80 res
81 }
82
83 fn match_trivium_whitespace<'a>(&mut self, db: &'a dyn Database) -> TriviumGreen<'a> {
85 self.take_while(|s| matches!(s, ' ' | '\r' | '\t'));
86 let span = self.consume_text_span();
87 let text = span.take(&self.text);
88 TokenWhitespace::new_green(db, SmolStrId::from(db, text)).into()
89 }
90
91 fn match_trivium_newline<'a>(&mut self, db: &'a dyn Database) -> TriviumGreen<'a> {
93 self.take();
94 let span = self.consume_text_span();
95 let text = span.take(&self.text);
96 TokenNewline::new_green(db, SmolStrId::from(db, text)).into()
97 }
98
99 fn match_trivium_single_line_comment<'a>(&mut self, db: &'a dyn Database) -> TriviumGreen<'a> {
101 match self.peek_nth(2) {
102 Some('/') => {
103 self.take_while(|c| c != '\n');
104 let span = self.consume_text_span();
105 let text = span.take(&self.text);
106 TokenSingleLineDocComment::new_green(db, SmolStrId::from(db, text)).into()
107 }
108 Some('!') => {
109 self.take_while(|c| c != '\n');
110 let span = self.consume_text_span();
111 let text = span.take(&self.text);
112 TokenSingleLineInnerComment::new_green(db, SmolStrId::from(db, text)).into()
113 }
114 _ => {
115 self.take_while(|c| c != '\n');
116 let span = self.consume_text_span();
117 let text = span.take(&self.text);
118 TokenSingleLineComment::new_green(db, SmolStrId::from(db, text)).into()
119 }
120 }
121 }
122
123 fn take_token_literal_number(&mut self) -> TokenKind {
128 let special = if self.peek() == Some('0') {
129 self.take();
130 match self.peek() {
131 Some('x' | 'o' | 'b') => {
132 match self.take() {
133 Some('x') => self.take_while(|c| c.is_ascii_hexdigit()),
134 Some('o') => self.take_while(|c| matches!(c, '0'..='7')),
135 Some('b') => self.take_while(|c| matches!(c, '0'..='1')),
136 _ => unreachable!(),
137 }
138 true
139 }
140 _ => false,
141 }
142 } else {
143 false
144 };
145 if !special {
147 self.take_while(|c| c.is_ascii_digit());
148 }
149
150 if self.peek() == Some('_') {
152 self.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
153 }
154 TokenKind::LiteralNumber
155 }
156
157 fn take_token_short_string(&mut self) -> TokenKind {
159 self.take_token_string_helper('\'');
160
161 if self.peek() == Some('_') {
163 self.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
164 }
165 TokenKind::ShortString
166 }
167
168 fn take_token_string(&mut self) -> TokenKind {
170 self.take_token_string_helper('"');
171 TokenKind::String
172 }
173
174 fn take_token_string_helper(&mut self, delimiter: char) {
175 self.take();
176 let mut escaped = false;
177 while let Some(token) = self.peek() {
178 self.take();
179 match token {
180 _ if escaped => escaped = false,
181 '\\' => escaped = true,
182 _ if token == delimiter => {
183 break;
184 }
185 _ => {}
186 };
187 }
188 }
189
190 fn take_token_identifier(&mut self) -> TokenKind {
192 self.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
194
195 let span = self.peek_text_span();
196 match span.take(&self.text) {
197 "as" => TokenKind::As,
198 "const" => TokenKind::Const,
199 "false" => TokenKind::False,
200 "true" => TokenKind::True,
201 "extern" => TokenKind::Extern,
202 "type" => TokenKind::Type,
203 "fn" => TokenKind::Function,
204 "trait" => TokenKind::Trait,
205 "impl" => TokenKind::Impl,
206 "of" => TokenKind::Of,
207 "mod" => TokenKind::Module,
208 "struct" => TokenKind::Struct,
209 "enum" => TokenKind::Enum,
210 "let" => TokenKind::Let,
211 "return" => TokenKind::Return,
212 "match" => TokenKind::Match,
213 "macro" => TokenKind::Macro,
214 "if" => TokenKind::If,
215 "loop" => TokenKind::Loop,
216 "continue" => TokenKind::Continue,
217 "break" => TokenKind::Break,
218 "else" => TokenKind::Else,
219 "while" => TokenKind::While,
220 "use" => TokenKind::Use,
221 "implicits" => TokenKind::Implicits,
222 "ref" => TokenKind::Ref,
223 "mut" => TokenKind::Mut,
224 "for" => TokenKind::For,
225 "nopanic" => TokenKind::NoPanic,
226 "pub" => TokenKind::Pub,
227 "_" => TokenKind::Underscore,
228 _ => TokenKind::Identifier,
229 }
230 }
231
232 fn take_token_of_kind(&mut self, kind: TokenKind) -> TokenKind {
234 self.take();
235 kind
236 }
237
238 fn pick_kind(
240 &mut self,
241 second_char: char,
242 long_kind: TokenKind,
243 short_kind: TokenKind,
244 ) -> TokenKind {
245 self.take();
246 if self.peek() == Some(second_char) {
247 self.take();
248 long_kind
249 } else {
250 short_kind
251 }
252 }
253
254 fn match_terminal<'a>(&mut self, db: &'a dyn Database) -> LexerTerminal<'a> {
255 let leading_trivia = self.match_trivia(db, true);
256
257 let kind = if let Some(current) = self.peek() {
258 match current {
259 '0'..='9' => self.take_token_literal_number(),
260 '\'' => self.take_token_short_string(),
261 '"' => self.take_token_string(),
262 ',' => self.take_token_of_kind(TokenKind::Comma),
263 ';' => self.take_token_of_kind(TokenKind::Semicolon),
264 '?' => self.take_token_of_kind(TokenKind::QuestionMark),
265 '{' => self.take_token_of_kind(TokenKind::LBrace),
266 '}' => self.take_token_of_kind(TokenKind::RBrace),
267 '[' => self.take_token_of_kind(TokenKind::LBrack),
268 ']' => self.take_token_of_kind(TokenKind::RBrack),
269 '(' => self.take_token_of_kind(TokenKind::LParen),
270 ')' => self.take_token_of_kind(TokenKind::RParen),
271 '.' => {
272 self.take();
273 match self.peek() {
274 Some('.') => self.pick_kind('=', TokenKind::DotDotEq, TokenKind::DotDot),
275 _ => TokenKind::Dot,
276 }
277 }
278 '*' => self.pick_kind('=', TokenKind::MulEq, TokenKind::Mul),
279 '/' => self.pick_kind('=', TokenKind::DivEq, TokenKind::Div),
280 '%' => self.pick_kind('=', TokenKind::ModEq, TokenKind::Mod),
281 '+' => self.pick_kind('=', TokenKind::PlusEq, TokenKind::Plus),
282 '#' => self.take_token_of_kind(TokenKind::Hash),
283 '$' => self.take_token_of_kind(TokenKind::Dollar),
284 '-' => {
285 self.take();
286 match self.peek() {
287 Some('>') => self.take_token_of_kind(TokenKind::Arrow),
288 Some('=') => self.take_token_of_kind(TokenKind::MinusEq),
289 _ => TokenKind::Minus,
290 }
291 }
292 '<' => self.pick_kind('=', TokenKind::LE, TokenKind::LT),
293 '>' => self.pick_kind('=', TokenKind::GE, TokenKind::GT),
294 'a'..='z' | 'A'..='Z' | '_' => self.take_token_identifier(),
295 ':' => self.pick_kind(':', TokenKind::ColonColon, TokenKind::Colon),
296 '!' => self.pick_kind('=', TokenKind::Neq, TokenKind::Not),
297 '~' => self.take_token_of_kind(TokenKind::BitNot),
298 '=' => {
299 self.take();
300 match self.peek() {
301 Some('=') => self.take_token_of_kind(TokenKind::EqEq),
302 Some('>') => self.take_token_of_kind(TokenKind::MatchArrow),
303 _ => TokenKind::Eq,
304 }
305 }
306 '&' => self.pick_kind('&', TokenKind::AndAnd, TokenKind::And),
307 '|' => self.pick_kind('|', TokenKind::OrOr, TokenKind::Or),
308 '^' => self.take_token_of_kind(TokenKind::Xor),
309 '@' => self.take_token_of_kind(TokenKind::At),
310 _ => self.take_token_of_kind(TokenKind::BadCharacters),
311 }
312 } else {
313 TokenKind::EndOfFile
314 };
315
316 let span = self.consume_text_span();
317 let text_arc = self.text.clone();
318 let text = span.take(&text_arc);
319 let trailing_trivia = self.match_trivia(db, false);
320 let terminal_kind = token_kind_to_terminal_syntax_kind(kind);
321
322 LexerTerminal {
324 text: SmolStrId::from(db, text),
325 kind: terminal_kind,
326 leading_trivia,
327 trailing_trivia,
328 }
329 }
330}
331
332#[salsa::tracked]
334pub fn tokenize_all<'a>(
335 db: &'a dyn Database,
336 _tracked: Tracked,
337 text: Arc<str>,
338) -> cairo_lang_utils::deque::Deque<LexerTerminal<'a>> {
339 let mut lexer =
340 Lexer { text, previous_position: TextOffset::START, current_position: TextOffset::START };
341 let mut result: Deque<LexerTerminal<'a>> = Default::default();
342 loop {
343 let terminal = lexer.match_terminal(db);
344 let is_eof = terminal.kind == SyntaxKind::TerminalEndOfFile;
345 result.push_back(terminal);
346 if is_eof {
347 break;
348 }
349 }
350 result
351}
352
353#[derive(Clone, PartialEq, Eq, Debug, salsa::Update)]
355pub struct LexerTerminal<'a> {
356 pub text: SmolStrId<'a>,
357 pub kind: SyntaxKind,
359 pub leading_trivia: Vec<TriviumGreen<'a>>,
360 pub trailing_trivia: Vec<TriviumGreen<'a>>,
361}
362impl<'a> LexerTerminal<'a> {
363 pub fn width(&self, db: &dyn Database) -> TextWidth {
364 self.leading_trivia.iter().map(|t| t.0.width(db)).sum::<TextWidth>()
365 + TextWidth::from_str(self.text.long(db))
366 + self.trailing_trivia.iter().map(|t| t.0.width(db)).sum::<TextWidth>()
367 }
368
369 pub fn text(&self, db: &'a dyn Database) -> &'a str {
370 self.text.long(db)
371 }
372}
373
/// Lexer-internal classification of a token, assigned while scanning and converted to a
/// `SyntaxKind` by `token_kind_to_terminal_syntax_kind` before it leaves the lexer.
#[derive(Clone, Copy, PartialEq, Debug, Eq, Hash)]
enum TokenKind {
    /// A word made of ASCII alphanumerics and underscores that is not a keyword or a lone `_`.
    Identifier,

    // Literals.
    /// A numeric literal, including any radix prefix and suffix.
    LiteralNumber,
    /// A `'`-delimited string, including any suffix.
    ShortString,
    /// A `"`-delimited string.
    String,

    // Keywords.
    /// `as`
    As,
    /// `const`
    Const,
    /// `false`
    False,
    /// `true`
    True,
    /// `extern`
    Extern,
    /// `type`
    Type,
    /// `fn`
    Function,
    /// `trait`
    Trait,
    /// `impl`
    Impl,
    /// `of`
    Of,
    /// `mod`
    Module,
    /// `struct`
    Struct,
    /// `enum`
    Enum,
    /// `let`
    Let,
    /// `return`
    Return,
    /// `match`
    Match,
    /// `macro`
    Macro,
    /// `if`
    If,
    /// `while`
    While,
    /// `for`
    For,
    /// `loop`
    Loop,
    /// `continue`
    Continue,
    /// `break`
    Break,
    /// `else`
    Else,
    /// `use`
    Use,
    /// `implicits`
    Implicits,
    /// `nopanic`
    NoPanic,
    /// `pub`
    Pub,

    // Modifiers.
    /// `ref`
    Ref,
    /// `mut`
    Mut,

    // Operators.
    /// `&`
    And,
    /// `&&`
    AndAnd,
    /// `@`
    At,
    /// `|`
    Or,
    /// `||`
    OrOr,
    /// `^`
    Xor,
    /// `==`
    EqEq,
    /// `!=`
    Neq,
    /// `>=`
    GE,
    /// `>`
    GT,
    /// `<=`
    LE,
    /// `<`
    LT,
    /// `!`
    Not,
    /// `~`
    BitNot,
    /// `+`
    Plus,
    /// `+=`
    PlusEq,
    /// `-`
    Minus,
    /// `-=`
    MinusEq,
    /// `*`
    Mul,
    /// `*=`
    MulEq,
    /// `/`
    Div,
    /// `/=`
    DivEq,
    /// `%`
    Mod,
    /// `%=`
    ModEq,

    // Punctuation and delimiters.
    /// `:`
    Colon,
    /// `::`
    ColonColon,
    /// `,`
    Comma,
    /// `$`
    Dollar,
    /// `.`
    Dot,
    /// `..`
    DotDot,
    /// `..=`
    DotDotEq,
    /// `=`
    Eq,
    /// `#`
    Hash,
    /// `;`
    Semicolon,
    /// `?`
    QuestionMark,
    /// A lone `_`.
    Underscore,
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `[`
    LBrack,
    /// `]`
    RBrack,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `->`
    Arrow,
    /// `=>`
    MatchArrow,

    // Meta.
    /// No characters remain in the text.
    EndOfFile,
    /// A single character that starts no known token.
    BadCharacters,
}
468
469fn token_kind_to_terminal_syntax_kind(kind: TokenKind) -> SyntaxKind {
470 match kind {
471 TokenKind::As => SyntaxKind::TerminalAs,
472 TokenKind::Const => SyntaxKind::TerminalConst,
473 TokenKind::Identifier => SyntaxKind::TerminalIdentifier,
474 TokenKind::LiteralNumber => SyntaxKind::TerminalLiteralNumber,
475 TokenKind::ShortString => SyntaxKind::TerminalShortString,
476 TokenKind::String => SyntaxKind::TerminalString,
477 TokenKind::False => SyntaxKind::TerminalFalse,
478 TokenKind::True => SyntaxKind::TerminalTrue,
479 TokenKind::Extern => SyntaxKind::TerminalExtern,
480 TokenKind::Type => SyntaxKind::TerminalType,
481 TokenKind::Function => SyntaxKind::TerminalFunction,
482 TokenKind::Trait => SyntaxKind::TerminalTrait,
483 TokenKind::Impl => SyntaxKind::TerminalImpl,
484 TokenKind::Of => SyntaxKind::TerminalOf,
485 TokenKind::Module => SyntaxKind::TerminalModule,
486 TokenKind::Struct => SyntaxKind::TerminalStruct,
487 TokenKind::Enum => SyntaxKind::TerminalEnum,
488 TokenKind::Let => SyntaxKind::TerminalLet,
489 TokenKind::Return => SyntaxKind::TerminalReturn,
490 TokenKind::Match => SyntaxKind::TerminalMatch,
491 TokenKind::If => SyntaxKind::TerminalIf,
492 TokenKind::While => SyntaxKind::TerminalWhile,
493 TokenKind::For => SyntaxKind::TerminalFor,
494 TokenKind::Loop => SyntaxKind::TerminalLoop,
495 TokenKind::Continue => SyntaxKind::TerminalContinue,
496 TokenKind::Break => SyntaxKind::TerminalBreak,
497 TokenKind::Else => SyntaxKind::TerminalElse,
498 TokenKind::Use => SyntaxKind::TerminalUse,
499 TokenKind::Implicits => SyntaxKind::TerminalImplicits,
500 TokenKind::NoPanic => SyntaxKind::TerminalNoPanic,
501 TokenKind::Pub => SyntaxKind::TerminalPub,
502 TokenKind::Macro => SyntaxKind::TerminalMacro,
503 TokenKind::And => SyntaxKind::TerminalAnd,
504 TokenKind::AndAnd => SyntaxKind::TerminalAndAnd,
505 TokenKind::At => SyntaxKind::TerminalAt,
506 TokenKind::Or => SyntaxKind::TerminalOr,
507 TokenKind::OrOr => SyntaxKind::TerminalOrOr,
508 TokenKind::Xor => SyntaxKind::TerminalXor,
509 TokenKind::EqEq => SyntaxKind::TerminalEqEq,
510 TokenKind::Neq => SyntaxKind::TerminalNeq,
511 TokenKind::GE => SyntaxKind::TerminalGE,
512 TokenKind::GT => SyntaxKind::TerminalGT,
513 TokenKind::LE => SyntaxKind::TerminalLE,
514 TokenKind::LT => SyntaxKind::TerminalLT,
515 TokenKind::Not => SyntaxKind::TerminalNot,
516 TokenKind::BitNot => SyntaxKind::TerminalBitNot,
517 TokenKind::Plus => SyntaxKind::TerminalPlus,
518 TokenKind::PlusEq => SyntaxKind::TerminalPlusEq,
519 TokenKind::Minus => SyntaxKind::TerminalMinus,
520 TokenKind::MinusEq => SyntaxKind::TerminalMinusEq,
521 TokenKind::Mul => SyntaxKind::TerminalMul,
522 TokenKind::MulEq => SyntaxKind::TerminalMulEq,
523 TokenKind::Div => SyntaxKind::TerminalDiv,
524 TokenKind::DivEq => SyntaxKind::TerminalDivEq,
525 TokenKind::Mod => SyntaxKind::TerminalMod,
526 TokenKind::ModEq => SyntaxKind::TerminalModEq,
527 TokenKind::Colon => SyntaxKind::TerminalColon,
528 TokenKind::ColonColon => SyntaxKind::TerminalColonColon,
529 TokenKind::Comma => SyntaxKind::TerminalComma,
530 TokenKind::Dollar => SyntaxKind::TerminalDollar,
531 TokenKind::Dot => SyntaxKind::TerminalDot,
532 TokenKind::DotDot => SyntaxKind::TerminalDotDot,
533 TokenKind::DotDotEq => SyntaxKind::TerminalDotDotEq,
534 TokenKind::Eq => SyntaxKind::TerminalEq,
535 TokenKind::Hash => SyntaxKind::TerminalHash,
536 TokenKind::Semicolon => SyntaxKind::TerminalSemicolon,
537 TokenKind::QuestionMark => SyntaxKind::TerminalQuestionMark,
538 TokenKind::Underscore => SyntaxKind::TerminalUnderscore,
539 TokenKind::LBrace => SyntaxKind::TerminalLBrace,
540 TokenKind::RBrace => SyntaxKind::TerminalRBrace,
541 TokenKind::LBrack => SyntaxKind::TerminalLBrack,
542 TokenKind::RBrack => SyntaxKind::TerminalRBrack,
543 TokenKind::LParen => SyntaxKind::TerminalLParen,
544 TokenKind::RParen => SyntaxKind::TerminalRParen,
545 TokenKind::Ref => SyntaxKind::TerminalRef,
546 TokenKind::Mut => SyntaxKind::TerminalMut,
547 TokenKind::Arrow => SyntaxKind::TerminalArrow,
548 TokenKind::MatchArrow => SyntaxKind::TerminalMatchArrow,
549 TokenKind::BadCharacters => SyntaxKind::TerminalBadCharacters,
550 TokenKind::EndOfFile => SyntaxKind::TerminalEndOfFile,
551 }
552}