1use crate::{SyntaxKind, SyntaxKind::*};
24use logos::Logos;
25
26#[derive(Debug, Clone, Copy, PartialEq, Eq)]
28pub struct Token {
29 pub kind: SyntaxKind,
31 pub len: u32,
33}
34
/// A problem encountered while lexing.
///
/// Errors are collected alongside the token stream rather than aborting the
/// lex; the offending text is still emitted as a token.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct LexError {
    /// Start offset of the offending span within the source text.
    pub offset: usize,
    /// Human-readable description of the problem.
    pub message: String,
}
43
44fn comment_block(lex: &mut logos::Lexer<LogosToken>) -> bool {
49 let mut last_asterisk = false;
50 for (index, c) in lex.remainder().char_indices() {
51 if c == '*' {
52 last_asterisk = true;
53 } else if c == '/' && last_asterisk {
54 lex.bump(index + 1);
55 return true;
56 } else if matches!(c, '\u{202A}'..='\u{202E}' | '\u{2066}'..='\u{2069}') {
57 lex.bump(index);
60 return true;
61 } else {
62 last_asterisk = false;
63 }
64 }
65 let remaining = lex.remainder().len();
67 lex.bump(remaining);
68 true
69}
70
71#[derive(Clone, Copy, Debug, PartialEq, Eq, Logos)]
76#[logos(skip r"")] enum LogosToken {
78 #[regex(r"[ \t\f]+")]
82 Whitespace,
83
84 #[regex(r"\r?\n")]
85 Linebreak,
86
87 #[regex(r"//[^\r\n\u{202A}-\u{202E}\u{2066}-\u{2069}]*")]
89 CommentLine,
90
91 #[token(r"/*", comment_block)]
92 CommentBlock,
93
94 #[regex(r"aleo1[a-z0-9]*")]
100 AddressLiteral,
101
102 #[regex(r"0x[0-9A-F_]+([ui](8|16|32|64|128)|field|group|scalar)?")]
106 #[regex(r"0o[0-7_]+([ui](8|16|32|64|128)|field|group|scalar)?")]
107 #[regex(r"0b[01_]+([ui](8|16|32|64|128)|field|group|scalar)?")]
108 #[regex(r"[0-9][0-9_]*([ui](8|16|32|64|128)|field|group|scalar)?")]
109 Integer,
110
111 #[regex(r#""[^"]*""#)]
112 StaticString,
113
114 #[regex(r"group::[a-zA-Z][a-zA-Z0-9_]*")]
124 #[regex(r"signature::[a-zA-Z][a-zA-Z0-9_]*")]
125 #[regex(r"Future::[a-zA-Z][a-zA-Z0-9_]*")]
126 PathSpecial,
127
128 #[regex(r"_[a-zA-Z][a-zA-Z0-9_]*")]
130 IdentIntrinsic,
131
132 #[regex(r"[a-zA-Z][a-zA-Z0-9_]*")]
134 Ident,
135
136 #[token("**=")]
140 PowAssign,
141 #[token("&&=")]
142 AndAssign,
143 #[token("||=")]
144 OrAssign,
145 #[token("<<=")]
146 ShlAssign,
147 #[token(">>=")]
148 ShrAssign,
149
150 #[token("**")]
151 Pow,
152 #[token("&&")]
153 And,
154 #[token("||")]
155 Or,
156 #[token("<<")]
157 Shl,
158 #[token(">>")]
159 Shr,
160 #[token("==")]
161 EqEq,
162 #[token("!=")]
163 NotEq,
164 #[token("<=")]
165 LtEq,
166 #[token(">=")]
167 GtEq,
168 #[token("+=")]
169 AddAssign,
170 #[token("-=")]
171 SubAssign,
172 #[token("*=")]
173 MulAssign,
174 #[token("/=")]
175 DivAssign,
176 #[token("%=")]
177 RemAssign,
178 #[token("&=")]
179 BitAndAssign,
180 #[token("|=")]
181 BitOrAssign,
182 #[token("^=")]
183 BitXorAssign,
184
185 #[token("->")]
186 Arrow,
187 #[token("=>")]
188 FatArrow,
189 #[token("..")]
190 DotDot,
191 #[token("::")]
192 ColonColon,
193
194 #[token("=")]
196 Eq,
197 #[token("!")]
198 Bang,
199 #[token("<")]
200 Lt,
201 #[token(">")]
202 Gt,
203 #[token("+")]
204 Plus,
205 #[token("-")]
206 Minus,
207 #[token("*")]
208 Star,
209 #[token("/")]
210 Slash,
211 #[token("%")]
212 Percent,
213 #[token("&")]
214 Amp,
215 #[token("|")]
216 Pipe,
217 #[token("^")]
218 Caret,
219
220 #[token("(")]
224 LParen,
225 #[token(")")]
226 RParen,
227 #[token("[")]
228 LBracket,
229 #[token("]")]
230 RBracket,
231 #[token("{")]
232 LBrace,
233 #[token("}")]
234 RBrace,
235 #[token(",")]
236 Comma,
237 #[token(".")]
238 Dot,
239 #[token(";")]
240 Semicolon,
241 #[token(":")]
242 Colon,
243 #[token("?")]
244 Question,
245 #[token("_")]
246 Underscore,
247 #[token("@")]
248 At,
249
250 #[regex(r"[\u{202A}-\u{202E}\u{2066}-\u{2069}]")]
256 Bidi,
257}
258
259fn ident_to_kind(s: &str) -> SyntaxKind {
261 match s {
262 "true" => KW_TRUE,
264 "false" => KW_FALSE,
265 "none" => KW_NONE,
266 "address" => KW_ADDRESS,
268 "bool" => KW_BOOL,
269 "field" => KW_FIELD,
270 "group" => KW_GROUP,
271 "scalar" => KW_SCALAR,
272 "signature" => KW_SIGNATURE,
273 "string" => KW_STRING,
274 "record" => KW_RECORD,
275 "Future" => KW_FUTURE,
276 "i8" => KW_I8,
277 "i16" => KW_I16,
278 "i32" => KW_I32,
279 "i64" => KW_I64,
280 "i128" => KW_I128,
281 "u8" => KW_U8,
282 "u16" => KW_U16,
283 "u32" => KW_U32,
284 "u64" => KW_U64,
285 "u128" => KW_U128,
286 "if" => KW_IF,
288 "else" => KW_ELSE,
289 "for" => KW_FOR,
290 "in" => KW_IN,
291 "return" => KW_RETURN,
292 "let" => KW_LET,
294 "const" => KW_CONST,
295 "constant" => KW_CONSTANT,
296 "function" => KW_FUNCTION,
297 "transition" => KW_TRANSITION,
298 "inline" => KW_INLINE,
299 "async" => KW_ASYNC,
300 "Fn" => KW_FN,
301 "struct" => KW_STRUCT,
302 "constructor" => KW_CONSTRUCTOR,
303 "program" => KW_PROGRAM,
305 "import" => KW_IMPORT,
306 "mapping" => KW_MAPPING,
307 "storage" => KW_STORAGE,
308 "network" => KW_NETWORK,
309 "aleo" => KW_ALEO,
310 "script" => KW_SCRIPT,
311 "block" => KW_BLOCK,
312 "public" => KW_PUBLIC,
314 "private" => KW_PRIVATE,
315 "as" => KW_AS,
316 "self" => KW_SELF,
317 "assert" => KW_ASSERT,
318 "assert_eq" => KW_ASSERT_EQ,
319 "assert_neq" => KW_ASSERT_NEQ,
320 _ => IDENT,
322 }
323}
324
325pub fn lex(source: &str) -> (Vec<Token>, Vec<LexError>) {
330 let mut tokens = Vec::new();
331 let mut errors = Vec::new();
332 let mut lexer = LogosToken::lexer(source);
333
334 while let Some(result) = lexer.next() {
335 let span = lexer.span();
336 let len = (span.end - span.start) as u32;
337 let slice = lexer.slice();
338
339 let kind = match result {
340 Ok(token) => match token {
341 LogosToken::Whitespace => WHITESPACE,
343 LogosToken::Linebreak => LINEBREAK,
344 LogosToken::CommentLine => COMMENT_LINE,
345 LogosToken::CommentBlock => COMMENT_BLOCK,
346
347 LogosToken::AddressLiteral => ADDRESS_LIT,
349 LogosToken::Integer => INTEGER,
350 LogosToken::StaticString => STRING,
351
352 LogosToken::Ident => ident_to_kind(slice),
354 LogosToken::IdentIntrinsic => IDENT,
355 LogosToken::PathSpecial => IDENT, LogosToken::PowAssign => STAR2_EQ,
359 LogosToken::AndAssign => AMP2_EQ,
360 LogosToken::OrAssign => PIPE2_EQ,
361 LogosToken::ShlAssign => SHL_EQ,
362 LogosToken::ShrAssign => SHR_EQ,
363 LogosToken::Pow => STAR2,
364 LogosToken::And => AMP2,
365 LogosToken::Or => PIPE2,
366 LogosToken::Shl => SHL,
367 LogosToken::Shr => SHR,
368 LogosToken::EqEq => EQ2,
369 LogosToken::NotEq => BANG_EQ,
370 LogosToken::LtEq => LT_EQ,
371 LogosToken::GtEq => GT_EQ,
372 LogosToken::AddAssign => PLUS_EQ,
373 LogosToken::SubAssign => MINUS_EQ,
374 LogosToken::MulAssign => STAR_EQ,
375 LogosToken::DivAssign => SLASH_EQ,
376 LogosToken::RemAssign => PERCENT_EQ,
377 LogosToken::BitAndAssign => AMP_EQ,
378 LogosToken::BitOrAssign => PIPE_EQ,
379 LogosToken::BitXorAssign => CARET_EQ,
380 LogosToken::Arrow => ARROW,
381 LogosToken::FatArrow => FAT_ARROW,
382 LogosToken::DotDot => DOT_DOT,
383 LogosToken::ColonColon => COLON_COLON,
384
385 LogosToken::Eq => EQ,
387 LogosToken::Bang => BANG,
388 LogosToken::Lt => LT,
389 LogosToken::Gt => GT,
390 LogosToken::Plus => PLUS,
391 LogosToken::Minus => MINUS,
392 LogosToken::Star => STAR,
393 LogosToken::Slash => SLASH,
394 LogosToken::Percent => PERCENT,
395 LogosToken::Amp => AMP,
396 LogosToken::Pipe => PIPE,
397 LogosToken::Caret => CARET,
398
399 LogosToken::LParen => L_PAREN,
401 LogosToken::RParen => R_PAREN,
402 LogosToken::LBracket => L_BRACKET,
403 LogosToken::RBracket => R_BRACKET,
404 LogosToken::LBrace => L_BRACE,
405 LogosToken::RBrace => R_BRACE,
406 LogosToken::Comma => COMMA,
407 LogosToken::Dot => DOT,
408 LogosToken::Semicolon => SEMICOLON,
409 LogosToken::Colon => COLON,
410 LogosToken::Question => QUESTION,
411 LogosToken::Underscore => UNDERSCORE,
412 LogosToken::At => AT,
413
414 LogosToken::Bidi => {
416 errors.push(LexError {
417 offset: span.start,
418 message: "Unicode bidirectional override character detected".to_string(),
419 });
420 ERROR
421 }
422 },
423 Err(()) => {
424 errors.push(LexError { offset: span.start, message: format!("unexpected character: {:?}", slice) });
425 ERROR
426 }
427 };
428
429 tokens.push(Token { kind, len });
430 }
431
432 tokens.push(Token { kind: EOF, len: 0 });
434
435 (tokens, errors)
436}
437
#[cfg(test)]
mod tests {
    use super::*;
    use expect_test::{Expect, expect};

    /// Lexes `input` and checks the token stream against `expect`.
    ///
    /// Each token is rendered as one `KIND "text"` line, where the text is
    /// recovered by slicing `input` with the running sum of token lengths.
    /// Lex errors are ignored here; see `check_lex_errors`.
    fn check_lex(input: &str, expect: Expect) {
        let (tokens, _errors) = lex(input);
        let mut cursor = 0usize;
        let rendered: String = tokens
            .iter()
            .map(|token| {
                let end = cursor + token.len as usize;
                let line = format!("{:?} {:?}\n", token.kind, &input[cursor..end]);
                cursor = end;
                line
            })
            .collect();
        expect.assert_eq(&rendered);
    }

    /// Lexes `input` and checks the reported errors against `expect`.
    ///
    /// Each error is rendered as `offset:message`; errors are separated by a
    /// newline with no trailing newline.
    fn check_lex_errors(input: &str, expect: Expect) {
        let (_tokens, errors) = lex(input);
        let lines: Vec<String> =
            errors.iter().map(|err| format!("{}:{}", err.offset, err.message)).collect();
        expect.assert_eq(&lines.join("\n"));
    }

    // ---- Trivia ----

    #[test]
    fn lex_empty() {
        check_lex("", expect![[r#"
            EOF ""
        "#]]);
    }

    #[test]
    fn lex_whitespace() {
        check_lex(" \t ", expect![[r#"
            WHITESPACE " \t "
            EOF ""
        "#]]);
    }

    #[test]
    fn lex_linebreaks() {
        check_lex("\n\r\n\n", expect![[r#"
            LINEBREAK "\n"
            LINEBREAK "\r\n"
            LINEBREAK "\n"
            EOF ""
        "#]]);
    }

    #[test]
    fn lex_mixed_whitespace() {
        check_lex(" \n \t\n", expect![[r#"
            WHITESPACE " "
            LINEBREAK "\n"
            WHITESPACE " \t"
            LINEBREAK "\n"
            EOF ""
        "#]]);
    }

    #[test]
    fn lex_line_comments() {
        check_lex("// hello\n// world", expect![[r#"
            COMMENT_LINE "// hello"
            LINEBREAK "\n"
            COMMENT_LINE "// world"
            EOF ""
        "#]]);
    }

    #[test]
    fn lex_block_comments() {
        check_lex("/* hello */ /* multi\nline */", expect![[r#"
            COMMENT_BLOCK "/* hello */"
            WHITESPACE " "
            COMMENT_BLOCK "/* multi\nline */"
            EOF ""
        "#]]);
    }

    // ---- Identifiers and keywords ----

    #[test]
    fn lex_identifiers() {
        check_lex("foo Bar _baz x123", expect![[r#"
            IDENT "foo"
            WHITESPACE " "
            IDENT "Bar"
            WHITESPACE " "
            IDENT "_baz"
            WHITESPACE " "
            IDENT "x123"
            EOF ""
        "#]]);
    }

    #[test]
    fn lex_keywords() {
        check_lex("let function if return true false", expect![[r#"
            KW_LET "let"
            WHITESPACE " "
            KW_FUNCTION "function"
            WHITESPACE " "
            KW_IF "if"
            WHITESPACE " "
            KW_RETURN "return"
            WHITESPACE " "
            KW_TRUE "true"
            WHITESPACE " "
            KW_FALSE "false"
            EOF ""
        "#]]);
    }

    #[test]
    fn lex_type_keywords() {
        check_lex("u8 u16 u32 u64 u128 i8 i16 i32 i64 i128", expect![[r#"
            KW_U8 "u8"
            WHITESPACE " "
            KW_U16 "u16"
            WHITESPACE " "
            KW_U32 "u32"
            WHITESPACE " "
            KW_U64 "u64"
            WHITESPACE " "
            KW_U128 "u128"
            WHITESPACE " "
            KW_I8 "i8"
            WHITESPACE " "
            KW_I16 "i16"
            WHITESPACE " "
            KW_I32 "i32"
            WHITESPACE " "
            KW_I64 "i64"
            WHITESPACE " "
            KW_I128 "i128"
            EOF ""
        "#]]);
    }

    #[test]
    fn lex_more_type_keywords() {
        check_lex("bool field group scalar address signature string record", expect![[r#"
            KW_BOOL "bool"
            WHITESPACE " "
            KW_FIELD "field"
            WHITESPACE " "
            KW_GROUP "group"
            WHITESPACE " "
            KW_SCALAR "scalar"
            WHITESPACE " "
            KW_ADDRESS "address"
            WHITESPACE " "
            KW_SIGNATURE "signature"
            WHITESPACE " "
            KW_STRING "string"
            WHITESPACE " "
            KW_RECORD "record"
            EOF ""
        "#]]);
    }

    // ---- Literals ----

    #[test]
    fn lex_integers() {
        check_lex("123 0xFF 0b101 0o77", expect![[r#"
            INTEGER "123"
            WHITESPACE " "
            INTEGER "0xFF"
            WHITESPACE " "
            INTEGER "0b101"
            WHITESPACE " "
            INTEGER "0o77"
            EOF ""
        "#]]);
    }

    #[test]
    fn lex_integers_with_underscores() {
        check_lex("1_000_000 0xFF_FF", expect![[r#"
            INTEGER "1_000_000"
            WHITESPACE " "
            INTEGER "0xFF_FF"
            EOF ""
        "#]]);
    }

    #[test]
    fn lex_address_literal() {
        check_lex("aleo1abc123", expect![[r#"
            ADDRESS_LIT "aleo1abc123"
            EOF ""
        "#]]);
    }

    #[test]
    fn lex_strings() {
        check_lex(r#""hello" "world""#, expect![[r#"
            STRING "\"hello\""
            WHITESPACE " "
            STRING "\"world\""
            EOF ""
        "#]]);
    }

    // ---- Punctuation and operators ----

    #[test]
    fn lex_punctuation() {
        check_lex("( ) [ ] { } , . ; : :: ? -> => _ @", expect![[r#"
            L_PAREN "("
            WHITESPACE " "
            R_PAREN ")"
            WHITESPACE " "
            L_BRACKET "["
            WHITESPACE " "
            R_BRACKET "]"
            WHITESPACE " "
            L_BRACE "{"
            WHITESPACE " "
            R_BRACE "}"
            WHITESPACE " "
            COMMA ","
            WHITESPACE " "
            DOT "."
            WHITESPACE " "
            SEMICOLON ";"
            WHITESPACE " "
            COLON ":"
            WHITESPACE " "
            COLON_COLON "::"
            WHITESPACE " "
            QUESTION "?"
            WHITESPACE " "
            ARROW "->"
            WHITESPACE " "
            FAT_ARROW "=>"
            WHITESPACE " "
            UNDERSCORE "_"
            WHITESPACE " "
            AT "@"
            EOF ""
        "#]]);
    }

    #[test]
    fn lex_arithmetic_operators() {
        check_lex("+ - * / % **", expect![[r#"
            PLUS "+"
            WHITESPACE " "
            MINUS "-"
            WHITESPACE " "
            STAR "*"
            WHITESPACE " "
            SLASH "/"
            WHITESPACE " "
            PERCENT "%"
            WHITESPACE " "
            STAR2 "**"
            EOF ""
        "#]]);
    }

    #[test]
    fn lex_comparison_operators() {
        check_lex("== != < <= > >=", expect![[r#"
            EQ2 "=="
            WHITESPACE " "
            BANG_EQ "!="
            WHITESPACE " "
            LT "<"
            WHITESPACE " "
            LT_EQ "<="
            WHITESPACE " "
            GT ">"
            WHITESPACE " "
            GT_EQ ">="
            EOF ""
        "#]]);
    }

    #[test]
    fn lex_logical_operators() {
        check_lex("&& || !", expect![[r#"
            AMP2 "&&"
            WHITESPACE " "
            PIPE2 "||"
            WHITESPACE " "
            BANG "!"
            EOF ""
        "#]]);
    }

    #[test]
    fn lex_bitwise_operators() {
        check_lex("& | ^ << >>", expect![[r#"
            AMP "&"
            WHITESPACE " "
            PIPE "|"
            WHITESPACE " "
            CARET "^"
            WHITESPACE " "
            SHL "<<"
            WHITESPACE " "
            SHR ">>"
            EOF ""
        "#]]);
    }

    #[test]
    fn lex_assignment_operators() {
        check_lex("= += -= *= /= %= **= &&= ||=", expect![[r#"
            EQ "="
            WHITESPACE " "
            PLUS_EQ "+="
            WHITESPACE " "
            MINUS_EQ "-="
            WHITESPACE " "
            STAR_EQ "*="
            WHITESPACE " "
            SLASH_EQ "/="
            WHITESPACE " "
            PERCENT_EQ "%="
            WHITESPACE " "
            STAR2_EQ "**="
            WHITESPACE " "
            AMP2_EQ "&&="
            WHITESPACE " "
            PIPE2_EQ "||="
            EOF ""
        "#]]);
    }

    #[test]
    fn lex_more_assignment_operators() {
        check_lex("&= |= ^= <<= >>=", expect![[r#"
            AMP_EQ "&="
            WHITESPACE " "
            PIPE_EQ "|="
            WHITESPACE " "
            CARET_EQ "^="
            WHITESPACE " "
            SHL_EQ "<<="
            WHITESPACE " "
            SHR_EQ ">>="
            EOF ""
        "#]]);
    }

    #[test]
    fn lex_dot_dot() {
        check_lex("0..10", expect![[r#"
            INTEGER "0"
            DOT_DOT ".."
            INTEGER "10"
            EOF ""
        "#]]);
    }

    // ---- Small program fragments ----

    #[test]
    fn lex_simple_expression() {
        check_lex("x + y * 2", expect![[r#"
            IDENT "x"
            WHITESPACE " "
            PLUS "+"
            WHITESPACE " "
            IDENT "y"
            WHITESPACE " "
            STAR "*"
            WHITESPACE " "
            INTEGER "2"
            EOF ""
        "#]]);
    }

    #[test]
    fn lex_function_call() {
        check_lex("foo(a, b)", expect![[r#"
            IDENT "foo"
            L_PAREN "("
            IDENT "a"
            COMMA ","
            WHITESPACE " "
            IDENT "b"
            R_PAREN ")"
            EOF ""
        "#]]);
    }

    #[test]
    fn lex_function_definition() {
        check_lex("function add(x: u32) -> u32 {", expect![[r#"
            KW_FUNCTION "function"
            WHITESPACE " "
            IDENT "add"
            L_PAREN "("
            IDENT "x"
            COLON ":"
            WHITESPACE " "
            KW_U32 "u32"
            R_PAREN ")"
            WHITESPACE " "
            ARROW "->"
            WHITESPACE " "
            KW_U32 "u32"
            WHITESPACE " "
            L_BRACE "{"
            EOF ""
        "#]]);
    }

    #[test]
    fn lex_let_statement() {
        check_lex("let x: u32 = 42;", expect![[r#"
            KW_LET "let"
            WHITESPACE " "
            IDENT "x"
            COLON ":"
            WHITESPACE " "
            KW_U32 "u32"
            WHITESPACE " "
            EQ "="
            WHITESPACE " "
            INTEGER "42"
            SEMICOLON ";"
            EOF ""
        "#]]);
    }

    // ---- Suffixed integers and special paths ----

    #[test]
    fn lex_typed_integers() {
        check_lex("1000u32 42i64 0u8 255u128", expect![[r#"
            INTEGER "1000u32"
            WHITESPACE " "
            INTEGER "42i64"
            WHITESPACE " "
            INTEGER "0u8"
            WHITESPACE " "
            INTEGER "255u128"
            EOF ""
        "#]]);
    }

    #[test]
    fn lex_typed_integers_field() {
        check_lex("123field 456group 789scalar", expect![[r#"
            INTEGER "123field"
            WHITESPACE " "
            INTEGER "456group"
            WHITESPACE " "
            INTEGER "789scalar"
            EOF ""
        "#]]);
    }

    #[test]
    fn lex_special_paths() {
        check_lex("group::GEN signature::verify Future::await", expect![[r#"
            IDENT "group::GEN"
            WHITESPACE " "
            IDENT "signature::verify"
            WHITESPACE " "
            IDENT "Future::await"
            EOF ""
        "#]]);
    }

    #[test]
    fn lex_typed_integer_range() {
        check_lex("0u8..STOP", expect![[r#"
            INTEGER "0u8"
            DOT_DOT ".."
            IDENT "STOP"
            EOF ""
        "#]]);
    }

    // ---- Errors ----

    #[test]
    fn lex_error_unknown_char() {
        check_lex_errors("hello $ world", expect![[r#"6:unexpected character: "$""#]]);
    }
}