1#[cfg(test)]
2#[path = "lexer_test.rs"]
3mod test;
4
5use cairo_lang_filesystem::span::{TextOffset, TextSpan, TextWidth};
6use cairo_lang_syntax::node::Token;
7use cairo_lang_syntax::node::ast::{
8 TokenNewline, TokenSingleLineComment, TokenSingleLineDocComment, TokenSingleLineInnerComment,
9 TokenWhitespace, TriviumGreen,
10};
11use cairo_lang_syntax::node::db::SyntaxGroup;
12use cairo_lang_syntax::node::kind::SyntaxKind;
13use cairo_lang_utils::require;
14use smol_str::SmolStr;
15
16pub struct Lexer<'a> {
17 db: &'a dyn SyntaxGroup,
18 text: &'a str,
19 previous_position: TextOffset,
20 current_position: TextOffset,
21 done: bool,
22}
23
24impl<'a> Lexer<'a> {
25 pub fn from_text(db: &'a dyn SyntaxGroup, text: &'a str) -> Lexer<'a> {
27 Lexer {
28 db,
29 text,
30 previous_position: TextOffset::START,
31 current_position: TextOffset::START,
32 done: false,
33 }
34 }
35
36 pub fn position(&self) -> TextOffset {
37 self.current_position
38 }
39
40 fn peek(&self) -> Option<char> {
42 self.current_position.take_from(self.text).chars().next()
43 }
44
45 fn peek_nth(&self, n: usize) -> Option<char> {
46 self.current_position.take_from(self.text).chars().nth(n)
47 }
48
49 fn take(&mut self) -> Option<char> {
50 let res = self.peek()?;
51 self.current_position = self.current_position.add_width(TextWidth::from_char(res));
52 Some(res)
53 }
54
55 fn take_while<F>(&mut self, f: F)
57 where
58 F: Fn(char) -> bool,
59 {
60 while self.peek().map(&f).unwrap_or(false) {
61 self.take();
62 }
63 }
64
65 fn peek_span_text(&self) -> &'a str {
66 let span = TextSpan { start: self.previous_position, end: self.current_position };
67 span.take(self.text)
68 }
69
70 fn consume_span(&mut self) -> &str {
71 let val = self.peek_span_text();
72 self.previous_position = self.current_position;
73 val
74 }
75
76 fn match_trivia(&mut self, leading: bool) -> Vec<TriviumGreen> {
78 let mut res: Vec<TriviumGreen> = Vec::new();
79 while let Some(current) = self.peek() {
80 let trivium = match current {
81 ' ' | '\r' | '\t' => self.match_trivium_whitespace(),
82 '\n' => self.match_trivium_newline(),
83 '/' if self.peek_nth(1) == Some('/') => self.match_trivium_single_line_comment(),
84 _ => break,
85 };
86 res.push(trivium);
87 if current == '\n' && !leading {
88 break;
89 }
90 }
91 res
92 }
93
94 fn match_trivium_whitespace(&mut self) -> TriviumGreen {
96 self.take_while(|s| matches!(s, ' ' | '\r' | '\t'));
97 TokenWhitespace::new_green(self.db, SmolStr::from(self.consume_span())).into()
98 }
99
100 fn match_trivium_newline(&mut self) -> TriviumGreen {
102 self.take();
103 TokenNewline::new_green(self.db, SmolStr::from(self.consume_span())).into()
104 }
105
106 fn match_trivium_single_line_comment(&mut self) -> TriviumGreen {
108 match self.peek_nth(2) {
109 Some('/') => {
110 self.take_while(|c| c != '\n');
111 TokenSingleLineDocComment::new_green(self.db, SmolStr::from(self.consume_span()))
112 .into()
113 }
114 Some('!') => {
115 self.take_while(|c| c != '\n');
116 TokenSingleLineInnerComment::new_green(self.db, SmolStr::from(self.consume_span()))
117 .into()
118 }
119 _ => {
120 self.take_while(|c| c != '\n');
121 TokenSingleLineComment::new_green(self.db, SmolStr::from(self.consume_span()))
122 .into()
123 }
124 }
125 }
126
127 fn take_token_literal_number(&mut self) -> TokenKind {
132 let special = if self.peek() == Some('0') {
133 self.take();
134 match self.peek() {
135 Some('x' | 'o' | 'b') => {
136 match self.take() {
137 Some('x') => self.take_while(|c| c.is_ascii_hexdigit()),
138 Some('o') => self.take_while(|c| matches!(c, '0'..='7')),
139 Some('b') => self.take_while(|c| matches!(c, '0'..='1')),
140 _ => unreachable!(),
141 }
142 true
143 }
144 _ => false,
145 }
146 } else {
147 false
148 };
149 if !special {
151 self.take_while(|c| c.is_ascii_digit());
152 }
153
154 if self.peek() == Some('_') {
156 self.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
157 }
158 TokenKind::LiteralNumber
159 }
160
161 fn take_token_short_string(&mut self) -> TokenKind {
163 self.take_token_string_helper('\'');
164
165 if self.peek() == Some('_') {
167 self.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
168 }
169 TokenKind::ShortString
170 }
171
172 fn take_token_string(&mut self) -> TokenKind {
174 self.take_token_string_helper('"');
175 TokenKind::String
176 }
177
178 fn take_token_string_helper(&mut self, delimiter: char) {
179 self.take();
180 let mut escaped = false;
181 while let Some(token) = self.peek() {
182 self.take();
183 match token {
184 _ if escaped => escaped = false,
185 '\\' => escaped = true,
186 _ if token == delimiter => {
187 break;
188 }
189 _ => {}
190 };
191 }
192 }
193
194 fn take_token_identifier(&mut self) -> TokenKind {
196 self.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
198
199 match self.peek_span_text() {
200 "as" => TokenKind::As,
201 "const" => TokenKind::Const,
202 "false" => TokenKind::False,
203 "true" => TokenKind::True,
204 "extern" => TokenKind::Extern,
205 "type" => TokenKind::Type,
206 "fn" => TokenKind::Function,
207 "trait" => TokenKind::Trait,
208 "impl" => TokenKind::Impl,
209 "of" => TokenKind::Of,
210 "mod" => TokenKind::Module,
211 "struct" => TokenKind::Struct,
212 "enum" => TokenKind::Enum,
213 "let" => TokenKind::Let,
214 "return" => TokenKind::Return,
215 "match" => TokenKind::Match,
216 "macro" => TokenKind::Macro,
217 "if" => TokenKind::If,
218 "loop" => TokenKind::Loop,
219 "continue" => TokenKind::Continue,
220 "break" => TokenKind::Break,
221 "else" => TokenKind::Else,
222 "while" => TokenKind::While,
223 "use" => TokenKind::Use,
224 "implicits" => TokenKind::Implicits,
225 "ref" => TokenKind::Ref,
226 "mut" => TokenKind::Mut,
227 "for" => TokenKind::For,
228 "nopanic" => TokenKind::NoPanic,
229 "pub" => TokenKind::Pub,
230 "_" => TokenKind::Underscore,
231 _ => TokenKind::Identifier,
232 }
233 }
234
235 fn take_token_of_kind(&mut self, kind: TokenKind) -> TokenKind {
237 self.take();
238 kind
239 }
240
241 fn pick_kind(
243 &mut self,
244 second_char: char,
245 long_kind: TokenKind,
246 short_kind: TokenKind,
247 ) -> TokenKind {
248 self.take();
249 if self.peek() == Some(second_char) {
250 self.take();
251 long_kind
252 } else {
253 short_kind
254 }
255 }
256
257 fn match_terminal(&mut self) -> LexerTerminal {
258 let leading_trivia = self.match_trivia(true);
259
260 let kind = if let Some(current) = self.peek() {
261 match current {
262 '0'..='9' => self.take_token_literal_number(),
263 '\'' => self.take_token_short_string(),
264 '"' => self.take_token_string(),
265 ',' => self.take_token_of_kind(TokenKind::Comma),
266 ';' => self.take_token_of_kind(TokenKind::Semicolon),
267 '?' => self.take_token_of_kind(TokenKind::QuestionMark),
268 '{' => self.take_token_of_kind(TokenKind::LBrace),
269 '}' => self.take_token_of_kind(TokenKind::RBrace),
270 '[' => self.take_token_of_kind(TokenKind::LBrack),
271 ']' => self.take_token_of_kind(TokenKind::RBrack),
272 '(' => self.take_token_of_kind(TokenKind::LParen),
273 ')' => self.take_token_of_kind(TokenKind::RParen),
274 '.' => {
275 self.take();
276 match self.peek() {
277 Some('.') => self.pick_kind('=', TokenKind::DotDotEq, TokenKind::DotDot),
278 _ => TokenKind::Dot,
279 }
280 }
281 '*' => self.pick_kind('=', TokenKind::MulEq, TokenKind::Mul),
282 '/' => self.pick_kind('=', TokenKind::DivEq, TokenKind::Div),
283 '%' => self.pick_kind('=', TokenKind::ModEq, TokenKind::Mod),
284 '+' => self.pick_kind('=', TokenKind::PlusEq, TokenKind::Plus),
285 '#' => self.take_token_of_kind(TokenKind::Hash),
286 '$' => self.take_token_of_kind(TokenKind::Dollar),
287 '-' => {
288 self.take();
289 match self.peek() {
290 Some('>') => self.take_token_of_kind(TokenKind::Arrow),
291 Some('=') => self.take_token_of_kind(TokenKind::MinusEq),
292 _ => TokenKind::Minus,
293 }
294 }
295 '<' => self.pick_kind('=', TokenKind::LE, TokenKind::LT),
296 '>' => self.pick_kind('=', TokenKind::GE, TokenKind::GT),
297 'a'..='z' | 'A'..='Z' | '_' => self.take_token_identifier(),
298 ':' => self.pick_kind(':', TokenKind::ColonColon, TokenKind::Colon),
299 '!' => self.pick_kind('=', TokenKind::Neq, TokenKind::Not),
300 '~' => self.take_token_of_kind(TokenKind::BitNot),
301 '=' => {
302 self.take();
303 match self.peek() {
304 Some('=') => self.take_token_of_kind(TokenKind::EqEq),
305 Some('>') => self.take_token_of_kind(TokenKind::MatchArrow),
306 _ => TokenKind::Eq,
307 }
308 }
309 '&' => self.pick_kind('&', TokenKind::AndAnd, TokenKind::And),
310 '|' => self.pick_kind('|', TokenKind::OrOr, TokenKind::Or),
311 '^' => self.take_token_of_kind(TokenKind::Xor),
312 '@' => self.take_token_of_kind(TokenKind::At),
313 _ => self.take_token_of_kind(TokenKind::BadCharacters),
314 }
315 } else {
316 TokenKind::EndOfFile
317 };
318
319 let text = SmolStr::from(self.consume_span());
320 let trailing_trivia = self.match_trivia(false);
321 let terminal_kind = token_kind_to_terminal_syntax_kind(kind);
322
323 LexerTerminal { text, kind: terminal_kind, leading_trivia, trailing_trivia }
325 }
326}
327
328#[derive(Clone, PartialEq, Eq, Debug)]
330pub struct LexerTerminal {
331 pub text: SmolStr,
332 pub kind: SyntaxKind,
334 pub leading_trivia: Vec<TriviumGreen>,
335 pub trailing_trivia: Vec<TriviumGreen>,
336}
337impl LexerTerminal {
338 pub fn width(&self, db: &dyn SyntaxGroup) -> TextWidth {
339 self.leading_trivia.iter().map(|t| t.0.width(db)).sum::<TextWidth>()
340 + TextWidth::from_str(&self.text)
341 + self.trailing_trivia.iter().map(|t| t.0.width(db)).sum::<TextWidth>()
342 }
343}
344
345impl Iterator for Lexer<'_> {
346 type Item = LexerTerminal;
347
348 fn next(&mut self) -> Option<Self::Item> {
351 require(!self.done)?;
352 let lexer_terminal = self.match_terminal();
353 if lexer_terminal.kind == SyntaxKind::TerminalEndOfFile {
354 self.done = true;
355 };
356 Some(lexer_terminal)
357 }
358}
359
/// Lexer-internal classification of a token.
///
/// Each variant is mapped to its terminal `SyntaxKind` by
/// `token_kind_to_terminal_syntax_kind`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
enum TokenKind {
    Identifier,

    // Literals.
    LiteralNumber,
    ShortString,
    String,

    // Keywords.
    As,
    Const,
    False,
    True,
    Extern,
    Type,
    Function,
    Trait,
    Impl,
    Of,
    Module,
    Struct,
    Enum,
    Let,
    Return,
    Match,
    Macro,
    If,
    While,
    For,
    Loop,
    Continue,
    Break,
    Else,
    Use,
    Implicits,
    NoPanic,
    Pub,

    // Modifiers.
    Ref,
    Mut,

    // Operators.
    And,
    AndAnd,
    At,
    Or,
    OrOr,
    Xor,
    EqEq,
    Neq,
    GE,
    GT,
    LE,
    LT,
    Not,
    BitNot,
    Plus,
    PlusEq,
    Minus,
    MinusEq,
    Mul,
    MulEq,
    Div,
    DivEq,
    Mod,
    ModEq,

    // Punctuation and delimiters.
    Colon,
    ColonColon,
    Comma,
    Dollar,
    Dot,
    DotDot,
    DotDotEq,
    Eq,
    Hash,
    Semicolon,
    QuestionMark,
    Underscore,
    LBrace,
    RBrace,
    LBrack,
    RBrack,
    LParen,
    RParen,
    Arrow,
    MatchArrow,

    // Meta: end of input and characters that start no known token.
    EndOfFile,
    BadCharacters,
}
454
455fn token_kind_to_terminal_syntax_kind(kind: TokenKind) -> SyntaxKind {
456 match kind {
457 TokenKind::As => SyntaxKind::TerminalAs,
458 TokenKind::Const => SyntaxKind::TerminalConst,
459 TokenKind::Identifier => SyntaxKind::TerminalIdentifier,
460 TokenKind::LiteralNumber => SyntaxKind::TerminalLiteralNumber,
461 TokenKind::ShortString => SyntaxKind::TerminalShortString,
462 TokenKind::String => SyntaxKind::TerminalString,
463 TokenKind::False => SyntaxKind::TerminalFalse,
464 TokenKind::True => SyntaxKind::TerminalTrue,
465 TokenKind::Extern => SyntaxKind::TerminalExtern,
466 TokenKind::Type => SyntaxKind::TerminalType,
467 TokenKind::Function => SyntaxKind::TerminalFunction,
468 TokenKind::Trait => SyntaxKind::TerminalTrait,
469 TokenKind::Impl => SyntaxKind::TerminalImpl,
470 TokenKind::Of => SyntaxKind::TerminalOf,
471 TokenKind::Module => SyntaxKind::TerminalModule,
472 TokenKind::Struct => SyntaxKind::TerminalStruct,
473 TokenKind::Enum => SyntaxKind::TerminalEnum,
474 TokenKind::Let => SyntaxKind::TerminalLet,
475 TokenKind::Return => SyntaxKind::TerminalReturn,
476 TokenKind::Match => SyntaxKind::TerminalMatch,
477 TokenKind::If => SyntaxKind::TerminalIf,
478 TokenKind::While => SyntaxKind::TerminalWhile,
479 TokenKind::For => SyntaxKind::TerminalFor,
480 TokenKind::Loop => SyntaxKind::TerminalLoop,
481 TokenKind::Continue => SyntaxKind::TerminalContinue,
482 TokenKind::Break => SyntaxKind::TerminalBreak,
483 TokenKind::Else => SyntaxKind::TerminalElse,
484 TokenKind::Use => SyntaxKind::TerminalUse,
485 TokenKind::Implicits => SyntaxKind::TerminalImplicits,
486 TokenKind::NoPanic => SyntaxKind::TerminalNoPanic,
487 TokenKind::Pub => SyntaxKind::TerminalPub,
488 TokenKind::Macro => SyntaxKind::TerminalMacro,
489 TokenKind::And => SyntaxKind::TerminalAnd,
490 TokenKind::AndAnd => SyntaxKind::TerminalAndAnd,
491 TokenKind::At => SyntaxKind::TerminalAt,
492 TokenKind::Or => SyntaxKind::TerminalOr,
493 TokenKind::OrOr => SyntaxKind::TerminalOrOr,
494 TokenKind::Xor => SyntaxKind::TerminalXor,
495 TokenKind::EqEq => SyntaxKind::TerminalEqEq,
496 TokenKind::Neq => SyntaxKind::TerminalNeq,
497 TokenKind::GE => SyntaxKind::TerminalGE,
498 TokenKind::GT => SyntaxKind::TerminalGT,
499 TokenKind::LE => SyntaxKind::TerminalLE,
500 TokenKind::LT => SyntaxKind::TerminalLT,
501 TokenKind::Not => SyntaxKind::TerminalNot,
502 TokenKind::BitNot => SyntaxKind::TerminalBitNot,
503 TokenKind::Plus => SyntaxKind::TerminalPlus,
504 TokenKind::PlusEq => SyntaxKind::TerminalPlusEq,
505 TokenKind::Minus => SyntaxKind::TerminalMinus,
506 TokenKind::MinusEq => SyntaxKind::TerminalMinusEq,
507 TokenKind::Mul => SyntaxKind::TerminalMul,
508 TokenKind::MulEq => SyntaxKind::TerminalMulEq,
509 TokenKind::Div => SyntaxKind::TerminalDiv,
510 TokenKind::DivEq => SyntaxKind::TerminalDivEq,
511 TokenKind::Mod => SyntaxKind::TerminalMod,
512 TokenKind::ModEq => SyntaxKind::TerminalModEq,
513 TokenKind::Colon => SyntaxKind::TerminalColon,
514 TokenKind::ColonColon => SyntaxKind::TerminalColonColon,
515 TokenKind::Comma => SyntaxKind::TerminalComma,
516 TokenKind::Dollar => SyntaxKind::TerminalDollar,
517 TokenKind::Dot => SyntaxKind::TerminalDot,
518 TokenKind::DotDot => SyntaxKind::TerminalDotDot,
519 TokenKind::DotDotEq => SyntaxKind::TerminalDotDotEq,
520 TokenKind::Eq => SyntaxKind::TerminalEq,
521 TokenKind::Hash => SyntaxKind::TerminalHash,
522 TokenKind::Semicolon => SyntaxKind::TerminalSemicolon,
523 TokenKind::QuestionMark => SyntaxKind::TerminalQuestionMark,
524 TokenKind::Underscore => SyntaxKind::TerminalUnderscore,
525 TokenKind::LBrace => SyntaxKind::TerminalLBrace,
526 TokenKind::RBrace => SyntaxKind::TerminalRBrace,
527 TokenKind::LBrack => SyntaxKind::TerminalLBrack,
528 TokenKind::RBrack => SyntaxKind::TerminalRBrack,
529 TokenKind::LParen => SyntaxKind::TerminalLParen,
530 TokenKind::RParen => SyntaxKind::TerminalRParen,
531 TokenKind::Ref => SyntaxKind::TerminalRef,
532 TokenKind::Mut => SyntaxKind::TerminalMut,
533 TokenKind::Arrow => SyntaxKind::TerminalArrow,
534 TokenKind::MatchArrow => SyntaxKind::TerminalMatchArrow,
535 TokenKind::BadCharacters => SyntaxKind::TerminalBadCharacters,
536 TokenKind::EndOfFile => SyntaxKind::TerminalEndOfFile,
537 }
538}