1#[cfg(test)]
2#[path = "lexer_test.rs"]
3mod test;
4
5use std::sync::Arc;
6
7use cairo_lang_filesystem::ids::{SmolStrId, Tracked};
8use cairo_lang_filesystem::span::{TextOffset, TextSpan, TextWidth};
9use cairo_lang_syntax::node::Token;
10use cairo_lang_syntax::node::ast::{
11 TokenNewline, TokenSingleLineComment, TokenSingleLineDocComment, TokenSingleLineInnerComment,
12 TokenWhitespace, TriviumGreen,
13};
14use cairo_lang_syntax::node::kind::SyntaxKind;
15use cairo_lang_utils::deque::Deque;
16use salsa::Database;
17
18#[derive(Clone, PartialEq, Eq, Hash)]
19pub struct Lexer {
20 text: Arc<str>,
21 previous_position: TextOffset,
22 current_position: TextOffset,
23}
24
25impl Lexer {
26 pub fn position(&self) -> TextOffset {
27 self.current_position
28 }
29
30 fn peek(&self) -> Option<char> {
32 self.current_position.take_from(&self.text).chars().next()
33 }
34
35 fn peek_nth(&self, n: usize) -> Option<char> {
36 self.current_position.take_from(&self.text).chars().nth(n)
37 }
38
39 fn take(&mut self) -> Option<char> {
40 let res = self.peek()?;
41 self.current_position = self.current_position.add_width(TextWidth::from_char(res));
42 Some(res)
43 }
44
45 fn take_while<F>(&mut self, f: F)
47 where
48 F: Fn(char) -> bool,
49 {
50 while self.peek().map(&f).unwrap_or(false) {
51 self.take();
52 }
53 }
54
55 fn peek_text_span(&self) -> TextSpan {
56 TextSpan::new(self.previous_position, self.current_position)
57 }
58
59 fn consume_text_span(&mut self) -> TextSpan {
60 let val = self.peek_text_span();
61 self.previous_position = self.current_position;
62 val
63 }
64
65 fn match_trivia<'a>(&mut self, db: &'a dyn Database, leading: bool) -> Vec<TriviumGreen<'a>> {
67 let mut res: Vec<TriviumGreen<'a>> = Vec::new();
68 while let Some(current) = self.peek() {
69 let trivium = match current {
70 ' ' | '\r' | '\t' => self.match_trivium_whitespace(db),
71 '\n' => self.match_trivium_newline(db),
72 '/' if self.peek_nth(1) == Some('/') => self.match_trivium_single_line_comment(db),
73 _ => break,
74 };
75 res.push(trivium);
76 if current == '\n' && !leading {
77 break;
78 }
79 }
80 res
81 }
82
83 fn match_trivium_whitespace<'a>(&mut self, db: &'a dyn Database) -> TriviumGreen<'a> {
85 self.take_while(|s| matches!(s, ' ' | '\r' | '\t'));
86 let span = self.consume_text_span();
87 let text = span.take(&self.text);
88 TokenWhitespace::new_green(db, SmolStrId::from(db, text)).into()
89 }
90
91 fn match_trivium_newline<'a>(&mut self, db: &'a dyn Database) -> TriviumGreen<'a> {
93 self.take();
94 let span = self.consume_text_span();
95 let text = span.take(&self.text);
96 TokenNewline::new_green(db, SmolStrId::from(db, text)).into()
97 }
98
99 fn match_trivium_single_line_comment<'a>(&mut self, db: &'a dyn Database) -> TriviumGreen<'a> {
101 match self.peek_nth(2) {
102 Some('/') => {
103 self.take_while(|c| c != '\n');
104 let span = self.consume_text_span();
105 let text = span.take(&self.text);
106 TokenSingleLineDocComment::new_green(db, SmolStrId::from(db, text)).into()
107 }
108 Some('!') => {
109 self.take_while(|c| c != '\n');
110 let span = self.consume_text_span();
111 let text = span.take(&self.text);
112 TokenSingleLineInnerComment::new_green(db, SmolStrId::from(db, text)).into()
113 }
114 _ => {
115 self.take_while(|c| c != '\n');
116 let span = self.consume_text_span();
117 let text = span.take(&self.text);
118 TokenSingleLineComment::new_green(db, SmolStrId::from(db, text)).into()
119 }
120 }
121 }
122
123 fn take_token_literal_number(&mut self) -> TokenKind {
128 let special = if self.peek() == Some('0') {
129 self.take();
130 match self.peek() {
131 Some('x' | 'o' | 'b') => {
132 match self.take() {
133 Some('x') => self.take_while(|c| c.is_ascii_hexdigit()),
134 Some('o') => self.take_while(|c| matches!(c, '0'..='7')),
135 Some('b') => self.take_while(|c| matches!(c, '0'..='1')),
136 _ => unreachable!(),
137 }
138 true
139 }
140 _ => false,
141 }
142 } else {
143 false
144 };
145 if !special {
147 self.take_while(|c| c.is_ascii_digit());
148 }
149
150 if self.peek() == Some('_') {
152 self.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
153 }
154 TokenKind::LiteralNumber
155 }
156
157 fn take_token_short_string(&mut self) -> TokenKind {
159 self.take_token_string_helper('\'');
160
161 if self.peek() == Some('_') {
163 self.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
164 }
165 TokenKind::ShortString
166 }
167
168 fn take_token_string(&mut self) -> TokenKind {
170 self.take_token_string_helper('"');
171 TokenKind::String
172 }
173
174 fn take_token_string_helper(&mut self, delimiter: char) {
175 self.take();
176 let mut escaped = false;
177 while let Some(token) = self.peek() {
178 self.take();
179 match token {
180 _ if escaped => escaped = false,
181 '\\' => escaped = true,
182 _ if token == delimiter => {
183 break;
184 }
185 _ => {}
186 };
187 }
188 }
189
190 fn take_token_identifier(&mut self) -> TokenKind {
192 self.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
194
195 let span = self.peek_text_span();
196 match span.take(&self.text) {
197 "as" => TokenKind::As,
198 "const" => TokenKind::Const,
199 "false" => TokenKind::False,
200 "true" => TokenKind::True,
201 "extern" => TokenKind::Extern,
202 "type" => TokenKind::Type,
203 "fn" => TokenKind::Function,
204 "trait" => TokenKind::Trait,
205 "impl" => TokenKind::Impl,
206 "of" => TokenKind::Of,
207 "mod" => TokenKind::Module,
208 "struct" => TokenKind::Struct,
209 "enum" => TokenKind::Enum,
210 "let" => TokenKind::Let,
211 "return" => TokenKind::Return,
212 "match" => TokenKind::Match,
213 "macro" => TokenKind::Macro,
214 "if" => TokenKind::If,
215 "loop" => TokenKind::Loop,
216 "continue" => TokenKind::Continue,
217 "break" => TokenKind::Break,
218 "else" => TokenKind::Else,
219 "while" => TokenKind::While,
220 "use" => TokenKind::Use,
221 "implicits" => TokenKind::Implicits,
222 "ref" => TokenKind::Ref,
223 "mut" => TokenKind::Mut,
224 "for" => TokenKind::For,
225 "nopanic" => TokenKind::NoPanic,
226 "pub" => TokenKind::Pub,
227 "_" => TokenKind::Underscore,
228 _ => TokenKind::Identifier,
229 }
230 }
231
232 fn take_token_of_kind(&mut self, kind: TokenKind) -> TokenKind {
234 self.take();
235 kind
236 }
237
238 fn pick_kind(
240 &mut self,
241 second_char: char,
242 long_kind: TokenKind,
243 short_kind: TokenKind,
244 ) -> TokenKind {
245 self.take();
246 if self.peek() == Some(second_char) {
247 self.take();
248 long_kind
249 } else {
250 short_kind
251 }
252 }
253
254 fn match_terminal<'a>(&mut self, db: &'a dyn Database) -> LexerTerminal<'a> {
255 let leading_trivia = self.match_trivia(db, true);
256
257 let kind = if let Some(current) = self.peek() {
258 match current {
259 '0'..='9' => self.take_token_literal_number(),
260 '\'' => self.take_token_short_string(),
261 '"' => self.take_token_string(),
262 ',' => self.take_token_of_kind(TokenKind::Comma),
263 ';' => self.take_token_of_kind(TokenKind::Semicolon),
264 '?' => self.take_token_of_kind(TokenKind::QuestionMark),
265 '{' => self.take_token_of_kind(TokenKind::LBrace),
266 '}' => self.take_token_of_kind(TokenKind::RBrace),
267 '[' => self.take_token_of_kind(TokenKind::LBrack),
268 ']' => self.take_token_of_kind(TokenKind::RBrack),
269 '(' => self.take_token_of_kind(TokenKind::LParen),
270 ')' => self.take_token_of_kind(TokenKind::RParen),
271 '.' => {
272 self.take();
273 match self.peek() {
274 Some('.') => self.pick_kind('=', TokenKind::DotDotEq, TokenKind::DotDot),
275 _ => TokenKind::Dot,
276 }
277 }
278 '*' => self.pick_kind('=', TokenKind::MulEq, TokenKind::Mul),
279 '/' => self.pick_kind('=', TokenKind::DivEq, TokenKind::Div),
280 '%' => self.pick_kind('=', TokenKind::ModEq, TokenKind::Mod),
281 '+' => self.pick_kind('=', TokenKind::PlusEq, TokenKind::Plus),
282 '#' => self.take_token_of_kind(TokenKind::Hash),
283 '$' => self.take_token_of_kind(TokenKind::Dollar),
284 '-' => {
285 self.take();
286 match self.peek() {
287 Some('>') => self.take_token_of_kind(TokenKind::Arrow),
288 Some('=') => self.take_token_of_kind(TokenKind::MinusEq),
289 _ => TokenKind::Minus,
290 }
291 }
292 '<' => self.pick_kind('=', TokenKind::LE, TokenKind::LT),
293 '>' => self.pick_kind('=', TokenKind::GE, TokenKind::GT),
294 'a'..='z' | 'A'..='Z' | '_' => self.take_token_identifier(),
295 ':' => self.pick_kind(':', TokenKind::ColonColon, TokenKind::Colon),
296 '!' => self.pick_kind('=', TokenKind::Neq, TokenKind::Not),
297 '~' => self.take_token_of_kind(TokenKind::BitNot),
298 '=' => {
299 self.take();
300 match self.peek() {
301 Some('=') => self.take_token_of_kind(TokenKind::EqEq),
302 Some('>') => self.take_token_of_kind(TokenKind::MatchArrow),
303 _ => TokenKind::Eq,
304 }
305 }
306 '&' => self.pick_kind('&', TokenKind::AndAnd, TokenKind::And),
307 '|' => self.pick_kind('|', TokenKind::OrOr, TokenKind::Or),
308 '^' => self.take_token_of_kind(TokenKind::Xor),
309 '@' => self.take_token_of_kind(TokenKind::At),
310 _ => self.take_token_of_kind(TokenKind::BadCharacters),
311 }
312 } else {
313 TokenKind::EndOfFile
314 };
315
316 let span = self.consume_text_span();
317 let text = SmolStrId::from(db, span.take(&self.text));
318 let trailing_trivia = self.match_trivia(db, false);
319 let terminal_kind = token_kind_to_terminal_syntax_kind(kind);
320
321 LexerTerminal { text, kind: terminal_kind, leading_trivia, trailing_trivia }
323 }
324}
325
326#[salsa::tracked]
328pub fn tokenize_all<'a>(
329 db: &'a dyn Database,
330 _tracked: Tracked,
331 text: Arc<str>,
332) -> cairo_lang_utils::deque::Deque<LexerTerminal<'a>> {
333 let mut lexer =
334 Lexer { text, previous_position: TextOffset::START, current_position: TextOffset::START };
335 let mut result: Deque<LexerTerminal<'a>> = Default::default();
336 loop {
337 let terminal = lexer.match_terminal(db);
338 let is_eof = terminal.kind == SyntaxKind::TerminalEndOfFile;
339 result.push_back(terminal);
340 if is_eof {
341 break;
342 }
343 }
344 result
345}
346
347#[derive(Clone, PartialEq, Eq, Debug, salsa::Update)]
349pub struct LexerTerminal<'a> {
350 pub text: SmolStrId<'a>,
351 pub kind: SyntaxKind,
353 pub leading_trivia: Vec<TriviumGreen<'a>>,
354 pub trailing_trivia: Vec<TriviumGreen<'a>>,
355}
356impl<'a> LexerTerminal<'a> {
357 pub fn width(&self, db: &dyn Database) -> TextWidth {
358 self.leading_trivia.iter().map(|t| t.0.width(db)).sum::<TextWidth>()
359 + TextWidth::from_str(self.text.long(db))
360 + self.trailing_trivia.iter().map(|t| t.0.width(db)).sum::<TextWidth>()
361 }
362
363 pub fn text(&self, db: &'a dyn Database) -> &'a str {
364 self.text.long(db)
365 }
366}
367
368#[derive(Clone, Copy, PartialEq, Debug, Eq, Hash)]
369enum TokenKind {
370 Identifier,
371
372 LiteralNumber,
374 ShortString,
375 String,
376
377 As,
379 Const,
380 False,
381 True,
382 Extern,
383 Type,
384 Function,
385 Trait,
386 Impl,
387 Of,
388 Module,
389 Struct,
390 Enum,
391 Let,
392 Return,
393 Match,
394 Macro,
395 If,
396 While,
397 For,
398 Loop,
399 Continue,
400 Break,
401 Else,
402 Use,
403 Implicits,
404 NoPanic,
405 Pub,
406
407 Ref,
409 Mut,
410
411 And,
413 AndAnd,
414 At,
415 Or,
416 OrOr,
417 Xor,
418 EqEq,
419 Neq,
420 GE,
421 GT,
422 LE,
423 LT,
424 Not,
425 BitNot,
426 Plus,
427 PlusEq,
428 Minus,
429 MinusEq,
430 Mul,
431 MulEq,
432 Div,
433 DivEq,
434 Mod,
435 ModEq,
436
437 Colon,
438 ColonColon,
439 Comma,
440 Dollar,
441 Dot,
442 DotDot,
443 DotDotEq,
444 Eq,
445 Hash,
446 Semicolon,
447 QuestionMark,
448 Underscore,
449 LBrace,
450 RBrace,
451 LBrack,
452 RBrack,
453 LParen,
454 RParen,
455 Arrow,
456 MatchArrow,
457
458 EndOfFile,
460 BadCharacters,
461}
462
463fn token_kind_to_terminal_syntax_kind(kind: TokenKind) -> SyntaxKind {
464 match kind {
465 TokenKind::As => SyntaxKind::TerminalAs,
466 TokenKind::Const => SyntaxKind::TerminalConst,
467 TokenKind::Identifier => SyntaxKind::TerminalIdentifier,
468 TokenKind::LiteralNumber => SyntaxKind::TerminalLiteralNumber,
469 TokenKind::ShortString => SyntaxKind::TerminalShortString,
470 TokenKind::String => SyntaxKind::TerminalString,
471 TokenKind::False => SyntaxKind::TerminalFalse,
472 TokenKind::True => SyntaxKind::TerminalTrue,
473 TokenKind::Extern => SyntaxKind::TerminalExtern,
474 TokenKind::Type => SyntaxKind::TerminalType,
475 TokenKind::Function => SyntaxKind::TerminalFunction,
476 TokenKind::Trait => SyntaxKind::TerminalTrait,
477 TokenKind::Impl => SyntaxKind::TerminalImpl,
478 TokenKind::Of => SyntaxKind::TerminalOf,
479 TokenKind::Module => SyntaxKind::TerminalModule,
480 TokenKind::Struct => SyntaxKind::TerminalStruct,
481 TokenKind::Enum => SyntaxKind::TerminalEnum,
482 TokenKind::Let => SyntaxKind::TerminalLet,
483 TokenKind::Return => SyntaxKind::TerminalReturn,
484 TokenKind::Match => SyntaxKind::TerminalMatch,
485 TokenKind::If => SyntaxKind::TerminalIf,
486 TokenKind::While => SyntaxKind::TerminalWhile,
487 TokenKind::For => SyntaxKind::TerminalFor,
488 TokenKind::Loop => SyntaxKind::TerminalLoop,
489 TokenKind::Continue => SyntaxKind::TerminalContinue,
490 TokenKind::Break => SyntaxKind::TerminalBreak,
491 TokenKind::Else => SyntaxKind::TerminalElse,
492 TokenKind::Use => SyntaxKind::TerminalUse,
493 TokenKind::Implicits => SyntaxKind::TerminalImplicits,
494 TokenKind::NoPanic => SyntaxKind::TerminalNoPanic,
495 TokenKind::Pub => SyntaxKind::TerminalPub,
496 TokenKind::Macro => SyntaxKind::TerminalMacro,
497 TokenKind::And => SyntaxKind::TerminalAnd,
498 TokenKind::AndAnd => SyntaxKind::TerminalAndAnd,
499 TokenKind::At => SyntaxKind::TerminalAt,
500 TokenKind::Or => SyntaxKind::TerminalOr,
501 TokenKind::OrOr => SyntaxKind::TerminalOrOr,
502 TokenKind::Xor => SyntaxKind::TerminalXor,
503 TokenKind::EqEq => SyntaxKind::TerminalEqEq,
504 TokenKind::Neq => SyntaxKind::TerminalNeq,
505 TokenKind::GE => SyntaxKind::TerminalGE,
506 TokenKind::GT => SyntaxKind::TerminalGT,
507 TokenKind::LE => SyntaxKind::TerminalLE,
508 TokenKind::LT => SyntaxKind::TerminalLT,
509 TokenKind::Not => SyntaxKind::TerminalNot,
510 TokenKind::BitNot => SyntaxKind::TerminalBitNot,
511 TokenKind::Plus => SyntaxKind::TerminalPlus,
512 TokenKind::PlusEq => SyntaxKind::TerminalPlusEq,
513 TokenKind::Minus => SyntaxKind::TerminalMinus,
514 TokenKind::MinusEq => SyntaxKind::TerminalMinusEq,
515 TokenKind::Mul => SyntaxKind::TerminalMul,
516 TokenKind::MulEq => SyntaxKind::TerminalMulEq,
517 TokenKind::Div => SyntaxKind::TerminalDiv,
518 TokenKind::DivEq => SyntaxKind::TerminalDivEq,
519 TokenKind::Mod => SyntaxKind::TerminalMod,
520 TokenKind::ModEq => SyntaxKind::TerminalModEq,
521 TokenKind::Colon => SyntaxKind::TerminalColon,
522 TokenKind::ColonColon => SyntaxKind::TerminalColonColon,
523 TokenKind::Comma => SyntaxKind::TerminalComma,
524 TokenKind::Dollar => SyntaxKind::TerminalDollar,
525 TokenKind::Dot => SyntaxKind::TerminalDot,
526 TokenKind::DotDot => SyntaxKind::TerminalDotDot,
527 TokenKind::DotDotEq => SyntaxKind::TerminalDotDotEq,
528 TokenKind::Eq => SyntaxKind::TerminalEq,
529 TokenKind::Hash => SyntaxKind::TerminalHash,
530 TokenKind::Semicolon => SyntaxKind::TerminalSemicolon,
531 TokenKind::QuestionMark => SyntaxKind::TerminalQuestionMark,
532 TokenKind::Underscore => SyntaxKind::TerminalUnderscore,
533 TokenKind::LBrace => SyntaxKind::TerminalLBrace,
534 TokenKind::RBrace => SyntaxKind::TerminalRBrace,
535 TokenKind::LBrack => SyntaxKind::TerminalLBrack,
536 TokenKind::RBrack => SyntaxKind::TerminalRBrack,
537 TokenKind::LParen => SyntaxKind::TerminalLParen,
538 TokenKind::RParen => SyntaxKind::TerminalRParen,
539 TokenKind::Ref => SyntaxKind::TerminalRef,
540 TokenKind::Mut => SyntaxKind::TerminalMut,
541 TokenKind::Arrow => SyntaxKind::TerminalArrow,
542 TokenKind::MatchArrow => SyntaxKind::TerminalMatchArrow,
543 TokenKind::BadCharacters => SyntaxKind::TerminalBadCharacters,
544 TokenKind::EndOfFile => SyntaxKind::TerminalEndOfFile,
545 }
546}