use crate::manifest::compiler::CompileErrorDiagnosticsStyle;
use crate::manifest::diagnostic_snippets::create_snippet;
use crate::manifest::token::{Position, Span, Token, TokenWithSpan};
use sbor::prelude::*;

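/// The character, or class of characters, that the lexer expected at the point of failure.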
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ExpectedChar {
    Exact(char),
    OneOf(Vec<char>),
    HexDigit,
    DigitLetterQuotePunctuation,
}

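/// The different kinds of errors the lexer can report, without position information.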
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexerErrorKind {
    UnexpectedEof,
    UnexpectedChar(char, ExpectedChar),
    InvalidIntegerLiteral(String),
    InvalidIntegerType(String),
    InvalidInteger(String),
    InvalidUnicode(u32),
    MissingUnicodeSurrogate(u32),
}

#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexerError {
    pub error_kind: LexerErrorKind,
    pub span: Span,
}

impl LexerError {
    fn unexpected_char(position: Position, c: char, expected: ExpectedChar) -> Self {
        Self {
            error_kind: LexerErrorKind::UnexpectedChar(c, expected),
            span: Span {
                start: position,
                end: position.advance(c),
            },
        }
    }

    fn invalid_integer_type(ty: String, start: Position, end: Position) -> Self {
        Self {
            error_kind: LexerErrorKind::InvalidIntegerType(ty),
            span: Span { start, end },
        }
    }
}

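/// A character-based lexer over manifest source text, tracking the current `Position`.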
#[derive(Debug, Clone)]
pub struct Lexer {
    /// The input text, as a sequence of characters
    text: Vec<char>,
    /// The position of the next character to be read
    current: Position,
}

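/// Tokenizes an entire source string, returning each token together with its span,
/// or the first `LexerError` encountered.
///
/// For example, `tokenize("CALL_FUNCTION;")` yields an `Ident` token followed by a
/// `Semicolon` token (see the tests at the bottom of this file).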
pub fn tokenize(s: &str) -> Result<Vec<TokenWithSpan>, LexerError> {
    let mut lexer = Lexer::new(s);
    let mut tokens = Vec::new();
    loop {
        if let Some(token) = lexer.next_token()? {
            tokens.push(token);
        } else {
            break;
        }
    }
    Ok(tokens)
}

impl Lexer {
    pub fn new(text: &str) -> Self {
        Self {
            text: text.chars().collect(),
            current: Position {
                full_index: 0,
                line_idx: 0,
                line_char_index: 0,
            },
        }
    }

    pub fn is_eof(&self) -> bool {
        self.current.full_index == self.text.len()
    }

    fn peek(&self) -> Result<char, LexerError> {
        if self.is_eof() {
            Err(LexerError {
                error_kind: LexerErrorKind::UnexpectedEof,
                span: Span {
                    start: self.current,
                    end: self.current,
                },
            })
        } else {
            Ok(self.text[self.current.full_index])
        }
    }

    fn advance(&mut self) -> Result<char, LexerError> {
        let c = self.peek()?;
        self.current = self.current.advance(c);
        Ok(c)
    }

    fn advance_expected(&mut self, expected: char) -> Result<char, LexerError> {
        self.advance_matching(|c| c == expected, ExpectedChar::Exact(expected))
    }

    fn advance_matching(
        &mut self,
        matcher: impl Fn(char) -> bool,
        expected: ExpectedChar,
    ) -> Result<char, LexerError> {
        let previous = self.current;
        let c = self.advance()?;
        if !matcher(c) {
            Err(LexerError::unexpected_char(previous, c, expected))
        } else {
            Ok(c)
        }
    }

    fn advance_and_append(&mut self, s: &mut String) -> Result<char, LexerError> {
        let c = self.advance()?;
        s.push(c);
        Ok(c)
    }

    fn is_whitespace(c: char) -> bool {
        c == ' ' || c == '\t' || c == '\r' || c == '\n'
    }

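    /// Skips whitespace and `#` line comments, then lexes the next token.
    /// Returns `Ok(None)` once the end of the input is reached.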
    pub fn next_token(&mut self) -> Result<Option<TokenWithSpan>, LexerError> {
        // Skip whitespace and `#` comments, which run to the end of the line
        let mut in_comment = false;
        while !self.is_eof() {
            if in_comment {
                if self.advance()? == '\n' {
                    in_comment = false;
                }
            } else if self.peek()? == '#' {
                in_comment = true;
            } else if Self::is_whitespace(self.peek()?) {
                self.advance()?;
            } else {
                break;
            }
        }

        if self.is_eof() {
            return Ok(None);
        }

        // Dispatch on the first character of the token
        match self.peek()? {
            '-' | '0'..='9' => self.tokenize_number(),
            '"' => self.tokenize_string(),
            'a'..='z' | 'A'..='Z' => self.tokenize_identifier(),
            '{' | '}' | '(' | ')' | '<' | '>' | ',' | ';' | '&' | '=' => {
                self.tokenize_punctuation()
            }
            c => Err(LexerError::unexpected_char(
                self.current,
                c,
                ExpectedChar::DigitLetterQuotePunctuation,
            )),
        }
        .map(Option::from)
    }

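    /// Lexes an integer literal: an optional `-` sign, decimal digits, and a mandatory
    /// type suffix (`i8`..`i128` or `u8`..`u128`), e.g. `17u32` or `-4i128`.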
    fn tokenize_number(&mut self) -> Result<TokenWithSpan, LexerError> {
        let literal_start = self.current;
        let mut s = String::new();

        // Optional leading minus sign
        if self.peek()? == '-' {
            s.push(self.advance()?);
        }

        // The digits: either a single `0`, or a non-zero digit followed by any digits
        match self.advance_and_append(&mut s)? {
            '0' => {}
            '1'..='9' => {
                while self.peek()?.is_ascii_digit() {
                    s.push(self.advance()?);
                }
            }
            _ => {
                return Err(LexerError {
                    error_kind: LexerErrorKind::InvalidIntegerLiteral(s),
                    span: Span {
                        start: literal_start,
                        end: self.current,
                    },
                });
            }
        }

        // The type suffix
        let ty_start = self.current;
        let mut t = String::new();
        match self.advance_and_append(&mut t)? {
            'i' => match self.advance_and_append(&mut t)? {
                '1' => match self.advance_and_append(&mut t)? {
                    '2' => match self.advance_and_append(&mut t)? {
                        '8' => self.parse_int(&s, "i128", Token::I128Literal, literal_start),
                        _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
                    },
                    '6' => self.parse_int(&s, "i16", Token::I16Literal, literal_start),
                    _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
                },
                '3' => match self.advance_and_append(&mut t)? {
                    '2' => self.parse_int(&s, "i32", Token::I32Literal, literal_start),
                    _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
                },
                '6' => match self.advance_and_append(&mut t)? {
                    '4' => self.parse_int(&s, "i64", Token::I64Literal, literal_start),
                    _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
                },
                '8' => self.parse_int(&s, "i8", Token::I8Literal, literal_start),
                _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
            },
            'u' => match self.advance_and_append(&mut t)? {
                '1' => match self.advance_and_append(&mut t)? {
                    '2' => match self.advance_and_append(&mut t)? {
                        '8' => self.parse_int(&s, "u128", Token::U128Literal, literal_start),
                        _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
                    },
                    '6' => self.parse_int(&s, "u16", Token::U16Literal, literal_start),
                    _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
                },
                '3' => match self.advance_and_append(&mut t)? {
                    '2' => self.parse_int(&s, "u32", Token::U32Literal, literal_start),
                    _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
                },
                '6' => match self.advance_and_append(&mut t)? {
                    '4' => self.parse_int(&s, "u64", Token::U64Literal, literal_start),
                    _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
                },
                '8' => self.parse_int(&s, "u8", Token::U8Literal, literal_start),
                _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
            },
            _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
        }
        .map(|token| self.new_token(token, literal_start, self.current))
    }

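    /// Parses the collected digits as `T` and wraps the value in the given token
    /// constructor, reporting `InvalidInteger` if parsing fails (e.g. on overflow).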
    fn parse_int<T>(
        &self,
        int: &str,
        ty: &str,
        map: fn(T) -> Token,
        token_start: Position,
    ) -> Result<Token, LexerError>
    where
        T: FromStr,
        <T as FromStr>::Err: Display,
    {
        int.parse::<T>().map(map).map_err(|err| LexerError {
            error_kind: LexerErrorKind::InvalidInteger(format!("'{}{}' - {}", int, ty, err)),
            span: Span {
                start: token_start,
                end: self.current,
            },
        })
    }

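    /// Lexes a double-quoted string literal with JSON-style escapes
    /// (`\"`, `\\`, `\/`, `\b`, `\f`, `\n`, `\r`, `\t`, `\uXXXX`), combining
    /// UTF-16 surrogate pairs into a single character.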
    fn tokenize_string(&mut self) -> Result<TokenWithSpan, LexerError> {
        let start = self.current;
        assert_eq!(self.advance()?, '"');

        let mut s = String::new();
        while self.peek()? != '"' {
            let c = self.advance()?;
            if c == '\\' {
                let token_start = self.current;

                // Escape sequence
                match self.advance()? {
                    '"' => s.push('\"'),
                    '\\' => s.push('\\'),
                    '/' => s.push('/'),
                    'b' => s.push('\x08'),
                    'f' => s.push('\x0c'),
                    'n' => s.push('\n'),
                    'r' => s.push('\r'),
                    't' => s.push('\t'),
                    'u' => {
                        let mut unicode = self.read_utf16_unit()?;
                        // A code unit in the surrogate range must be followed by a second
                        // `\uXXXX` unit; the pair is combined into a single code point
                        if (0xD800..=0xDFFF).contains(&unicode) {
                            let position = self.current;
                            if self.advance()? == '\\' && self.advance()? == 'u' {
                                unicode = 0x10000
                                    + ((unicode - 0xD800) << 10)
                                    + self.read_utf16_unit()?
                                    - 0xDC00;
                            } else {
                                return Err(LexerError {
                                    error_kind: LexerErrorKind::MissingUnicodeSurrogate(unicode),
                                    span: Span {
                                        start: token_start,
                                        end: position,
                                    },
                                });
                            }
                        }
                        s.push(char::from_u32(unicode).ok_or(LexerError {
                            error_kind: LexerErrorKind::InvalidUnicode(unicode),
                            span: Span {
                                start: token_start,
                                end: self.current,
                            },
                        })?);
                    }
                    c => {
                        return Err(LexerError::unexpected_char(
                            token_start,
                            c,
                            ExpectedChar::OneOf(vec!['"', '\\', '/', 'b', 'f', 'n', 'r', 't', 'u']),
                        ));
                    }
                }
            } else {
                s.push(c);
            }
        }
        self.advance()?;

        Ok(self.new_token(Token::StringLiteral(s), start, self.current))
    }

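    /// Reads exactly four hex digits and returns them as a UTF-16 code unit value.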
    fn read_utf16_unit(&mut self) -> Result<u32, LexerError> {
        let mut code: u32 = 0;

        for _ in 0..4 {
            let c = self.advance_matching(|c| c.is_ascii_hexdigit(), ExpectedChar::HexDigit)?;
            code = code * 16 + c.to_digit(16).unwrap();
        }

        Ok(code)
    }

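    /// Lexes an identifier made of ASCII alphanumerics, `_` and `:`; the identifiers
    /// `true` and `false` become boolean literal tokens.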
    fn tokenize_identifier(&mut self) -> Result<TokenWithSpan, LexerError> {
        let start = self.current;

        let mut id = String::from(self.advance()?);
        while !self.is_eof() {
            let next_char = self.peek()?;
            let next_char_can_be_part_of_ident =
                next_char.is_ascii_alphanumeric() || next_char == '_' || next_char == ':';
            if !next_char_can_be_part_of_ident {
                break;
            }
            id.push(self.advance()?);
        }

        let token = match id.as_str() {
            "true" => Token::BoolLiteral(true),
            "false" => Token::BoolLiteral(false),
            other => Token::Ident(other.to_string()),
        };
        Ok(self.new_token(token, start, self.current))
    }

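    /// Lexes a single punctuation token; `=` must be followed by `>` to form the fat
    /// arrow `=>`. Note that `{`, `}` and `&` are routed here by `next_token` but are
    /// not accepted below, so they currently produce an `UnexpectedChar` error.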
    fn tokenize_punctuation(&mut self) -> Result<TokenWithSpan, LexerError> {
        let token_start = self.current;

        let token = match self.advance()? {
            '(' => Token::OpenParenthesis,
            ')' => Token::CloseParenthesis,
            '<' => Token::LessThan,
            '>' => Token::GreaterThan,
            ',' => Token::Comma,
            ';' => Token::Semicolon,
            '=' => {
                self.advance_expected('>')?;
                Token::FatArrow
            }
            c => {
                return Err(LexerError::unexpected_char(
                    token_start,
                    c,
                    ExpectedChar::OneOf(vec!['(', ')', '<', '>', ',', ';', '=']),
                ))
            }
        };

        Ok(self.new_token(token, token_start, self.current))
    }

    fn new_token(&self, token: Token, start: Position, end: Position) -> TokenWithSpan {
        TokenWithSpan {
            token,
            span: Span { start, end },
        }
    }
}

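/// Renders a `LexerError` against the original source text as a human-readable
/// diagnostic snippet in the requested style.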
pub fn lexer_error_diagnostics(
    s: &str,
    err: LexerError,
    style: CompileErrorDiagnosticsStyle,
) -> String {
    let (title, label) = match err.error_kind {
        LexerErrorKind::UnexpectedEof => (
            "unexpected end of file".to_string(),
            "unexpected end of file".to_string(),
        ),
        LexerErrorKind::UnexpectedChar(c, expected) => {
            let expected = match expected {
                ExpectedChar::Exact(exact) => format!("'{}'", exact),
                ExpectedChar::OneOf(one_of) => {
                    let v: Vec<String> = one_of.iter().map(|c| format!("'{}'", c)).collect();
                    if let Some((last, init)) = v.split_last() {
                        format!("{} or {}", init.join(", "), last)
                    } else {
                        "unknown".to_string()
                    }
                }
                ExpectedChar::HexDigit => "hex digit".to_string(),
                ExpectedChar::DigitLetterQuotePunctuation => "digit, letter, quotation mark or one of the punctuation characters '(', ')', '<', '>', ',', ';', '='".to_string(),
            };
            (
                format!("unexpected character {:?}, expected {}", c, expected),
                "unexpected character".to_string(),
            )
        }
        LexerErrorKind::InvalidIntegerLiteral(string) => (
            format!("invalid integer literal '{}'", string),
            "invalid integer literal".to_string(),
        ),
        LexerErrorKind::InvalidIntegerType(string) => (
            format!("invalid integer type '{}'", string),
            "invalid integer type".to_string(),
        ),
        LexerErrorKind::InvalidInteger(string) => (
            format!("invalid integer value {}", string),
            "invalid integer value".to_string(),
        ),
        LexerErrorKind::InvalidUnicode(value) => (
            format!("invalid unicode code point {}", value),
            "invalid unicode code point".to_string(),
        ),
        LexerErrorKind::MissingUnicodeSurrogate(value) => (
            format!("missing unicode '{:X}' surrogate pair", value),
            "missing unicode surrogate pair".to_string(),
        ),
    };
    create_snippet(s, &err.span, &title, &label, style)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::{position, span};

    #[macro_export]
    macro_rules! lex_ok {
        ( $s:expr, $expected:expr ) => {{
            let mut lexer = Lexer::new($s);
            for i in 0..$expected.len() {
                assert_eq!(
                    lexer.next_token().map(|opt| opt.map(|t| t.token)),
                    Ok(Some($expected[i].clone()))
                );
            }
            assert_eq!(lexer.next_token(), Ok(None));
        }};
    }

    #[macro_export]
    macro_rules! lex_error {
        ( $s:expr, $expected:expr ) => {{
            let mut lexer = Lexer::new($s);
            loop {
                match lexer.next_token() {
                    Ok(Some(_)) => {}
                    Ok(None) => {
                        panic!("Expected {:?} but no error was thrown", $expected);
                    }
                    Err(e) => {
                        assert_eq!(e, $expected);
                        break;
                    }
                }
            }
        }};
    }

    #[test]
    fn test_empty_strings() {
        lex_ok!("", Vec::<Token>::new());
        lex_ok!(" ", Vec::<Token>::new());
        lex_ok!("\r\n\t", Vec::<Token>::new());
    }

    #[test]
    fn test_bool() {
        lex_ok!("true", vec![Token::BoolLiteral(true)]);
        lex_ok!("false", vec![Token::BoolLiteral(false)]);
        lex_ok!("false123u8", vec![Token::Ident("false123u8".into())]);
    }

    #[test]
    fn test_int() {
        lex_ok!(
            "1u82u1283i84i128",
            vec![
                Token::U8Literal(1),
                Token::U128Literal(2),
                Token::I8Literal(3),
                Token::I128Literal(4),
            ]
        );
        lex_ok!("1u8 2u32", vec![Token::U8Literal(1), Token::U32Literal(2)]);
        lex_error!(
            "123",
            LexerError {
                error_kind: LexerErrorKind::UnexpectedEof,
                span: span!(start = (3, 0, 3), end = (3, 0, 3))
            }
        );
    }

    #[test]
    fn test_comment() {
        lex_ok!("# 1u8", Vec::<Token>::new());
        lex_ok!("1u8 # comment", vec![Token::U8Literal(1)]);
        lex_ok!(
            "# multiple\n# line\nCALL_FUNCTION",
            vec![Token::Ident("CALL_FUNCTION".to_string())]
        );
    }

    #[test]
    fn test_string() {
        lex_ok!(
            r#" "" "abc" "abc\r\n\"def\uD83C\uDF0D" "#,
            vec![
                Token::StringLiteral("".into()),
                Token::StringLiteral("abc".into()),
                Token::StringLiteral("abc\r\n\"def🌍".into()),
            ]
        );
        lex_error!(
            "\"",
            LexerError {
                error_kind: LexerErrorKind::UnexpectedEof,
                span: span!(start = (1, 0, 1), end = (1, 0, 1))
            }
        );
    }

    #[test]
    fn test_mixed() {
        lex_ok!(
            r#"CALL_FUNCTION Map<String, Array>("test", Array<String>("abc"));"#,
            vec![
                Token::Ident("CALL_FUNCTION".to_string()),
                Token::Ident("Map".to_string()),
                Token::LessThan,
                Token::Ident("String".to_string()),
                Token::Comma,
                Token::Ident("Array".to_string()),
                Token::GreaterThan,
                Token::OpenParenthesis,
                Token::StringLiteral("test".into()),
                Token::Comma,
                Token::Ident("Array".to_string()),
                Token::LessThan,
                Token::Ident("String".to_string()),
                Token::GreaterThan,
                Token::OpenParenthesis,
                Token::StringLiteral("abc".into()),
                Token::CloseParenthesis,
                Token::CloseParenthesis,
                Token::Semicolon,
            ]
        );
    }

    #[test]
    fn test_precise_decimal() {
        lex_ok!(
            "PreciseDecimal(\"12\")",
            vec![
                Token::Ident("PreciseDecimal".to_string()),
                Token::OpenParenthesis,
                Token::StringLiteral("12".into()),
                Token::CloseParenthesis,
            ]
        );
    }

    #[test]
    fn test_precise_decimal_collection() {
        lex_ok!(
            "Array<PreciseDecimal>(PreciseDecimal(\"12\"), PreciseDecimal(\"212\"), PreciseDecimal(\"1984\"))",
            vec![
                Token::Ident("Array".to_string()),
                Token::LessThan,
                Token::Ident("PreciseDecimal".to_string()),
                Token::GreaterThan,
                Token::OpenParenthesis,
                Token::Ident("PreciseDecimal".to_string()),
                Token::OpenParenthesis,
                Token::StringLiteral("12".into()),
                Token::CloseParenthesis,
                Token::Comma,
                Token::Ident("PreciseDecimal".to_string()),
                Token::OpenParenthesis,
                Token::StringLiteral("212".into()),
                Token::CloseParenthesis,
                Token::Comma,
                Token::Ident("PreciseDecimal".to_string()),
                Token::OpenParenthesis,
                Token::StringLiteral("1984".into()),
                Token::CloseParenthesis,
                Token::CloseParenthesis,
            ]
        );
    }

    #[test]
    fn test_invalid_integer() {
        lex_error!(
            "-_28u32",
            LexerError {
                error_kind: LexerErrorKind::InvalidIntegerLiteral("-_".to_string()),
                span: span!(start = (0, 0, 0), end = (2, 0, 2))
            }
        );

        lex_error!(
            "1i128\n 1u64 \n 1i37",
            LexerError {
                error_kind: LexerErrorKind::InvalidIntegerType("i37".to_string()),
                span: span!(start = (15, 2, 2), end = (18, 2, 5))
            }
        );

        lex_error!(
            "3_0i8",
            LexerError {
                error_kind: LexerErrorKind::InvalidIntegerType("_".to_string()),
                span: span!(start = (1, 0, 1), end = (2, 0, 2))
            }
        );
    }

    #[test]
    fn test_unexpected_char() {
        lex_error!(
            "1u8 +2u32",
            LexerError {
                error_kind: LexerErrorKind::UnexpectedChar(
                    '+',
                    ExpectedChar::DigitLetterQuotePunctuation
                ),
                span: span!(start = (4, 0, 4), end = (5, 0, 5))
            }
        );

        lex_error!(
            "x=7",
            LexerError {
                error_kind: LexerErrorKind::UnexpectedChar('7', ExpectedChar::Exact('>')),
                span: span!(start = (2, 0, 2), end = (3, 0, 3))
            }
        );
    }

    #[test]
    fn test_unicode() {
        lex_ok!(r#""\u2764""#, vec![Token::StringLiteral("❤".to_string())]);
        lex_ok!(r#""\uFA84""#, vec![Token::StringLiteral("\u{FA84}".to_string())]);
        lex_ok!(
            r#""\uD83D\uDC69""#,
            vec![Token::StringLiteral("👩".to_string())]
        );
        lex_ok!(r#""👩""#, vec![Token::StringLiteral("👩".to_string())]);
        lex_error!(
            r#""\uDCAC\u1234""#,
            LexerError {
                error_kind: LexerErrorKind::InvalidUnicode(1238580),
                span: span!(start = (2, 0, 2), end = (13, 0, 13))
            }
        );
    }
}