use crate::manifest::compiler::CompileErrorDiagnosticsStyle;
use crate::manifest::diagnostic_snippets::create_snippet;
use crate::manifest::token::{Position, Span, Token, TokenWithSpan};
use sbor::prelude::*;

#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ExpectedChar {
    Exact(char),
    OneOf(Vec<char>),
    HexDigit,
    DigitLetterQuotePunctuation,
}

#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexerErrorKind {
    UnexpectedEof,
    UnexpectedChar(char, ExpectedChar),
    InvalidIntegerLiteral(String),
    InvalidIntegerType(String),
    InvalidInteger(String),
    InvalidUnicode(u32),
    MissingUnicodeSurrogate(u32),
}

#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexerError {
    pub error_kind: LexerErrorKind,
    pub span: Span,
}

impl LexerError {
    fn unexpected_char(position: Position, c: char, expected: ExpectedChar) -> Self {
        Self {
            error_kind: LexerErrorKind::UnexpectedChar(c, expected),
            span: Span {
                start: position,
                end: position.advance(c),
            },
        }
    }

    fn invalid_integer_type(ty: String, start: Position, end: Position) -> Self {
        Self {
            error_kind: LexerErrorKind::InvalidIntegerType(ty),
            span: Span { start, end },
        }
    }
}

#[derive(Debug, Clone)]
pub struct Lexer {
    /// The input text, as a vector of characters
    text: Vec<char>,
    /// The current position of the lexer within the text
    current: Position,
}

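/// Tokenizes an entire manifest source string, returning every token (with its
/// span) in order, or the first `LexerError` encountered.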
pub fn tokenize(s: &str) -> Result<Vec<TokenWithSpan>, LexerError> {
    let mut lexer = Lexer::new(s);
    let mut tokens = Vec::new();
    while let Some(token) = lexer.next_token()? {
        tokens.push(token);
    }
    Ok(tokens)
}

impl Lexer {
    pub fn new(text: &str) -> Self {
        Self {
            text: text.chars().collect(),
            current: Position {
                full_index: 0,
                line_idx: 0,
                line_char_index: 0,
            },
        }
    }

    pub fn is_eof(&self) -> bool {
        self.current.full_index == self.text.len()
    }

    fn peek(&self) -> Result<char, LexerError> {
        if self.is_eof() {
            Err(LexerError {
                error_kind: LexerErrorKind::UnexpectedEof,
                span: Span {
                    start: self.current,
                    end: self.current,
                },
            })
        } else {
            Ok(self.text[self.current.full_index])
        }
    }

    fn advance(&mut self) -> Result<char, LexerError> {
        let c = self.peek()?;
        self.current = self.current.advance(c);
        Ok(c)
    }

    fn advance_expected(&mut self, expected: char) -> Result<char, LexerError> {
        self.advance_matching(|c| c == expected, ExpectedChar::Exact(expected))
    }

    fn advance_matching(
        &mut self,
        matcher: impl Fn(char) -> bool,
        expected: ExpectedChar,
    ) -> Result<char, LexerError> {
        let previous = self.current;
        let c = self.advance()?;
        if !matcher(c) {
            Err(LexerError::unexpected_char(previous, c, expected))
        } else {
            Ok(c)
        }
    }

    fn advance_and_append(&mut self, s: &mut String) -> Result<char, LexerError> {
        let c = self.advance()?;
        s.push(c);
        Ok(c)
    }

    fn is_whitespace(c: char) -> bool {
        c == ' ' || c == '\t' || c == '\r' || c == '\n'
    }

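    /// Returns the next token (with its span), or `Ok(None)` once the end of
    /// the input is reached. Whitespace and `#` line comments are skipped
    /// before the token is read.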
    pub fn next_token(&mut self) -> Result<Option<TokenWithSpan>, LexerError> {
        let mut in_comment = false;
        // Skip whitespace and `#` line comments until the next token or EOF.
        while !self.is_eof() {
            if in_comment {
                if self.advance()? == '\n' {
                    in_comment = false;
                }
            } else if self.peek()? == '#' {
                in_comment = true;
            } else if Self::is_whitespace(self.peek()?) {
                self.advance()?;
            } else {
                break;
            }
        }

        if self.is_eof() {
            return Ok(None);
        }

        match self.peek()? {
            '-' | '0'..='9' => self.tokenize_number(),
            '"' => self.tokenize_string(),
            'a'..='z' | 'A'..='Z' => self.tokenize_identifier(),
            '{' | '}' | '(' | ')' | '<' | '>' | ',' | ';' | '&' | '=' => {
                self.tokenize_punctuation()
            }
            c => Err(LexerError::unexpected_char(
                self.current,
                c,
                ExpectedChar::DigitLetterQuotePunctuation,
            )),
        }
        .map(Option::from)
    }

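    /// Tokenizes an integer literal: an optional `-`, then either `0` or a
    /// non-zero digit followed by more digits, then a mandatory type suffix
    /// (`i8`..`i128` or `u8`..`u128`), e.g. `123u32` or `-5i8`.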
    fn tokenize_number(&mut self) -> Result<TokenWithSpan, LexerError> {
        let literal_start = self.current;
        let mut s = String::new();

        // Optional leading minus sign
        if self.peek()? == '-' {
            s.push(self.advance()?);
        }

        // Integer part: `0`, or a non-zero digit followed by any further digits
        match self.advance_and_append(&mut s)? {
            '0' => {}
            '1'..='9' => {
                while self.peek()?.is_ascii_digit() {
                    s.push(self.advance()?);
                }
            }
            _ => {
                return Err(LexerError {
                    error_kind: LexerErrorKind::InvalidIntegerLiteral(s),
                    span: Span {
                        start: literal_start,
                        end: self.current,
                    },
                });
            }
        }

        // Mandatory type suffix, e.g. `u8` or `i128`
        let ty_start = self.current;
        let mut t = String::new();
        match self.advance_and_append(&mut t)? {
            'i' => match self.advance_and_append(&mut t)? {
                '1' => match self.advance_and_append(&mut t)? {
                    '2' => match self.advance_and_append(&mut t)? {
                        '8' => self.parse_int(&s, "i128", Token::I128Literal, literal_start),
                        _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
                    },
                    '6' => self.parse_int(&s, "i16", Token::I16Literal, literal_start),
                    _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
                },
                '3' => match self.advance_and_append(&mut t)? {
                    '2' => self.parse_int(&s, "i32", Token::I32Literal, literal_start),
                    _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
                },
                '6' => match self.advance_and_append(&mut t)? {
                    '4' => self.parse_int(&s, "i64", Token::I64Literal, literal_start),
                    _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
                },
                '8' => self.parse_int(&s, "i8", Token::I8Literal, literal_start),
                _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
            },
            'u' => match self.advance_and_append(&mut t)? {
                '1' => match self.advance_and_append(&mut t)? {
                    '2' => match self.advance_and_append(&mut t)? {
                        '8' => self.parse_int(&s, "u128", Token::U128Literal, literal_start),
                        _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
                    },
                    '6' => self.parse_int(&s, "u16", Token::U16Literal, literal_start),
                    _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
                },
                '3' => match self.advance_and_append(&mut t)? {
                    '2' => self.parse_int(&s, "u32", Token::U32Literal, literal_start),
                    _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
                },
                '6' => match self.advance_and_append(&mut t)? {
                    '4' => self.parse_int(&s, "u64", Token::U64Literal, literal_start),
                    _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
                },
                '8' => self.parse_int(&s, "u8", Token::U8Literal, literal_start),
                _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
            },
            _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
        }
        .map(|token| self.new_token(token, literal_start, self.current))
    }

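    /// Parses the collected digit string into the target integer type `T` and
    /// wraps the value with `map`; a failed parse is reported as `InvalidInteger`.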
    fn parse_int<T>(
        &self,
        int: &str,
        ty: &str,
        map: fn(T) -> Token,
        token_start: Position,
    ) -> Result<Token, LexerError>
    where
        T: FromStr,
        <T as FromStr>::Err: Display,
    {
        int.parse::<T>().map(map).map_err(|err| LexerError {
            error_kind: LexerErrorKind::InvalidInteger(format!("'{}{}' - {}", int, ty, err)),
            span: Span {
                start: token_start,
                end: self.current,
            },
        })
    }

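    /// Tokenizes a double-quoted string literal, resolving the escapes `\"`,
    /// `\\`, `\/`, `\b`, `\f`, `\n`, `\r`, `\t` and `\uXXXX` (including UTF-16
    /// surrogate pairs).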
    fn tokenize_string(&mut self) -> Result<TokenWithSpan, LexerError> {
        let start = self.current;
        assert_eq!(self.advance()?, '"');

        let mut s = String::new();
        while self.peek()? != '"' {
            let c = self.advance()?;
            if c == '\\' {
                let token_start = self.current;

                match self.advance()? {
                    '"' => s.push('\"'),
                    '\\' => s.push('\\'),
                    '/' => s.push('/'),
                    'b' => s.push('\x08'),
                    'f' => s.push('\x0c'),
                    'n' => s.push('\n'),
                    'r' => s.push('\r'),
                    't' => s.push('\t'),
                    'u' => {
                        let mut unicode = self.read_utf16_unit()?;
                        // A UTF-16 surrogate value: it must be followed by a
                        // second `\uXXXX` escape, and the two units are
                        // combined into a single code point.
                        if (0xD800..=0xDFFF).contains(&unicode) {
                            let position = self.current;
                            if self.advance()? == '\\' && self.advance()? == 'u' {
                                unicode = 0x10000
                                    + ((unicode - 0xD800) << 10)
                                    + self.read_utf16_unit()?
                                    - 0xDC00;
                            } else {
                                return Err(LexerError {
                                    error_kind: LexerErrorKind::MissingUnicodeSurrogate(unicode),
                                    span: Span {
                                        start: token_start,
                                        end: position,
                                    },
                                });
                            }
                        }
                        s.push(char::from_u32(unicode).ok_or(LexerError {
                            error_kind: LexerErrorKind::InvalidUnicode(unicode),
                            span: Span {
                                start: token_start,
                                end: self.current,
                            },
                        })?);
                    }
                    c => {
                        return Err(LexerError::unexpected_char(
                            token_start,
                            c,
                            ExpectedChar::OneOf(vec!['"', '\\', '/', 'b', 'f', 'n', 'r', 't', 'u']),
                        ));
                    }
                }
            } else {
                s.push(c);
            }
        }
        self.advance()?;

        Ok(self.new_token(Token::StringLiteral(s), start, self.current))
    }

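    /// Reads exactly four hex digits and returns the resulting UTF-16 code unit.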
    fn read_utf16_unit(&mut self) -> Result<u32, LexerError> {
        let mut code: u32 = 0;

        for _ in 0..4 {
            let c = self.advance_matching(|c| c.is_ascii_hexdigit(), ExpectedChar::HexDigit)?;
            code = code * 16 + c.to_digit(16).unwrap();
        }

        Ok(code)
    }

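    /// Tokenizes an identifier: an ASCII letter followed by any mix of ASCII
    /// alphanumerics, `_` and `:`. The keywords `true`/`false` become boolean
    /// literal tokens; everything else becomes `Token::Ident`.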
    fn tokenize_identifier(&mut self) -> Result<TokenWithSpan, LexerError> {
        let start = self.current;

        let mut id = String::from(self.advance()?);
        while !self.is_eof() {
            let next_char = self.peek()?;
            let next_char_can_be_part_of_ident =
                next_char.is_ascii_alphanumeric() || next_char == '_' || next_char == ':';
            if !next_char_can_be_part_of_ident {
                break;
            }
            id.push(self.advance()?);
        }

        let token = match id.as_str() {
            "true" => Token::BoolLiteral(true),
            "false" => Token::BoolLiteral(false),
            other => Token::Ident(other.to_string()),
        };
        Ok(self.new_token(token, start, self.current))
    }

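    /// Tokenizes a punctuation token: a single `(`, `)`, `<`, `>`, `,` or `;`,
    /// or the two-character `=>` fat arrow.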
    fn tokenize_punctuation(&mut self) -> Result<TokenWithSpan, LexerError> {
        let token_start = self.current;

        let token = match self.advance()? {
            '(' => Token::OpenParenthesis,
            ')' => Token::CloseParenthesis,
            '<' => Token::LessThan,
            '>' => Token::GreaterThan,
            ',' => Token::Comma,
            ';' => Token::Semicolon,
            '=' => {
                self.advance_expected('>')?;
                Token::FatArrow
            }
            c => {
                return Err(LexerError::unexpected_char(
                    token_start,
                    c,
                    ExpectedChar::OneOf(vec!['(', ')', '<', '>', ',', ';', '=']),
                ))
            }
        };

        Ok(self.new_token(token, token_start, self.current))
    }

    fn new_token(&self, token: Token, start: Position, end: Position) -> TokenWithSpan {
        TokenWithSpan {
            token,
            span: Span { start, end },
        }
    }
}

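/// Renders a `LexerError` against the original source `s` as a human-readable
/// diagnostic snippet in the requested style, pointing at the offending span.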
pub fn lexer_error_diagnostics(
    s: &str,
    err: LexerError,
    style: CompileErrorDiagnosticsStyle,
) -> String {
    let (title, label) = match err.error_kind {
        LexerErrorKind::UnexpectedEof => (
            "unexpected end of file".to_string(),
            "unexpected end of file".to_string(),
        ),
        LexerErrorKind::UnexpectedChar(c, expected) => {
            let expected = match expected {
                ExpectedChar::Exact(exact) => format!("'{}'", exact),
                ExpectedChar::OneOf(one_of) => {
                    let v: Vec<String> = one_of.iter().map(|c| format!("'{}'", c)).collect();
                    if let Some((last, init)) = v.split_last() {
                        format!("{} or {}", init.join(", "), last)
                    } else {
                        "unknown".to_string()
                    }
                }
                ExpectedChar::HexDigit => "hex digit".to_string(),
                ExpectedChar::DigitLetterQuotePunctuation => {
                    "digit, letter, quotation mark or one of punctuation characters '(', ')', '<', '>', ',', ';', '='"
                        .to_string()
                }
            };
            (
                format!("unexpected character {:?}, expected {}", c, expected),
                "unexpected character".to_string(),
            )
        }
        LexerErrorKind::InvalidIntegerLiteral(string) => (
            format!("invalid integer literal '{}'", string),
            "invalid integer literal".to_string(),
        ),
        LexerErrorKind::InvalidIntegerType(string) => (
            format!("invalid integer type '{}'", string),
            "invalid integer type".to_string(),
        ),
        LexerErrorKind::InvalidInteger(string) => (
            format!("invalid integer value {}", string),
            "invalid integer value".to_string(),
        ),
        LexerErrorKind::InvalidUnicode(value) => (
            format!("invalid unicode code point {}", value),
            "invalid unicode code point".to_string(),
        ),
        LexerErrorKind::MissingUnicodeSurrogate(value) => (
            format!("missing unicode '{:X}' surrogate pair", value),
            "missing unicode surrogate pair".to_string(),
        ),
    };
    create_snippet(s, &err.span, &title, &label, style)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::{position, span};

    #[macro_export]
    macro_rules! lex_ok {
        ( $s:expr, $expected:expr ) => {{
            let mut lexer = Lexer::new($s);
            for i in 0..$expected.len() {
                assert_eq!(
                    lexer.next_token().map(|opt| opt.map(|t| t.token)),
                    Ok(Some($expected[i].clone()))
                );
            }
            assert_eq!(lexer.next_token(), Ok(None));
        }};
    }

    #[macro_export]
    macro_rules! lex_error {
        ( $s:expr, $expected:expr ) => {{
            let mut lexer = Lexer::new($s);
            loop {
                match lexer.next_token() {
                    Ok(Some(_)) => {}
                    Ok(None) => {
                        panic!("Expected error {:?}, but none was thrown", $expected);
                    }
                    Err(e) => {
                        assert_eq!(e, $expected);
                        break;
                    }
                }
            }
        }};
    }

    #[test]
    fn test_empty_strings() {
        lex_ok!("", Vec::<Token>::new());
        lex_ok!(" ", Vec::<Token>::new());
        lex_ok!("\r\n\t", Vec::<Token>::new());
    }

    #[test]
    fn test_bool() {
        lex_ok!("true", vec![Token::BoolLiteral(true)]);
        lex_ok!("false", vec![Token::BoolLiteral(false)]);
        lex_ok!("false123u8", vec![Token::Ident("false123u8".into())]);
    }

    #[test]
    fn test_int() {
        lex_ok!(
            "1u82u1283i84i128",
            vec![
                Token::U8Literal(1),
                Token::U128Literal(2),
                Token::I8Literal(3),
                Token::I128Literal(4),
            ]
        );
        lex_ok!("1u8 2u32", vec![Token::U8Literal(1), Token::U32Literal(2)]);
        lex_error!(
            "123",
            LexerError {
                error_kind: LexerErrorKind::UnexpectedEof,
                span: span!(start = (3, 0, 3), end = (3, 0, 3))
            }
        );
    }

    #[test]
    fn test_comment() {
        lex_ok!("# 1u8", Vec::<Token>::new());
        lex_ok!("1u8 # comment", vec![Token::U8Literal(1),]);
        lex_ok!(
            "# multiple\n# line\nCALL_FUNCTION",
            vec![Token::Ident("CALL_FUNCTION".to_string()),]
        );
    }

    #[test]
    fn test_string() {
        lex_ok!(
            r#" "" "abc" "abc\r\n\"def\uD83C\uDF0D" "#,
            vec![
                Token::StringLiteral("".into()),
                Token::StringLiteral("abc".into()),
                Token::StringLiteral("abc\r\n\"def🌍".into()),
            ]
        );
        lex_error!(
            "\"",
            LexerError {
                error_kind: LexerErrorKind::UnexpectedEof,
                span: span!(start = (1, 0, 1), end = (1, 0, 1))
            }
        );
    }

    #[test]
    fn test_mixed() {
        lex_ok!(
            r#"CALL_FUNCTION Map<String, Array>("test", Array<String>("abc"));"#,
            vec![
                Token::Ident("CALL_FUNCTION".to_string()),
                Token::Ident("Map".to_string()),
                Token::LessThan,
                Token::Ident("String".to_string()),
                Token::Comma,
                Token::Ident("Array".to_string()),
                Token::GreaterThan,
                Token::OpenParenthesis,
                Token::StringLiteral("test".into()),
                Token::Comma,
                Token::Ident("Array".to_string()),
                Token::LessThan,
                Token::Ident("String".to_string()),
                Token::GreaterThan,
                Token::OpenParenthesis,
                Token::StringLiteral("abc".into()),
                Token::CloseParenthesis,
                Token::CloseParenthesis,
                Token::Semicolon,
            ]
        );
    }

    #[test]
    fn test_precise_decimal() {
        lex_ok!(
            "PreciseDecimal(\"12\")",
            vec![
                Token::Ident("PreciseDecimal".to_string()),
                Token::OpenParenthesis,
                Token::StringLiteral("12".into()),
                Token::CloseParenthesis,
            ]
        );
    }

    #[test]
    fn test_precise_decimal_collection() {
        lex_ok!(
            "Array<PreciseDecimal>(PreciseDecimal(\"12\"), PreciseDecimal(\"212\"), PreciseDecimal(\"1984\"))",
            vec![
                Token::Ident("Array".to_string()),
                Token::LessThan,
                Token::Ident("PreciseDecimal".to_string()),
                Token::GreaterThan,
                Token::OpenParenthesis,
                Token::Ident("PreciseDecimal".to_string()),
                Token::OpenParenthesis,
                Token::StringLiteral("12".into()),
                Token::CloseParenthesis,
                Token::Comma,
                Token::Ident("PreciseDecimal".to_string()),
                Token::OpenParenthesis,
                Token::StringLiteral("212".into()),
                Token::CloseParenthesis,
                Token::Comma,
                Token::Ident("PreciseDecimal".to_string()),
                Token::OpenParenthesis,
                Token::StringLiteral("1984".into()),
                Token::CloseParenthesis,
                Token::CloseParenthesis,
            ]
        );
    }

    #[test]
    fn test_invalid_integer() {
        lex_error!(
            "-_28u32",
            LexerError {
                error_kind: LexerErrorKind::InvalidIntegerLiteral("-_".to_string()),
                span: span!(start = (0, 0, 0), end = (2, 0, 2))
            }
        );

        lex_error!(
            "1i128\n 1u64 \n 1i37",
            LexerError {
                error_kind: LexerErrorKind::InvalidIntegerType("i37".to_string()),
                span: span!(start = (15, 2, 2), end = (18, 2, 5))
            }
        );

        lex_error!(
            "3_0i8",
            LexerError {
                error_kind: LexerErrorKind::InvalidIntegerType("_".to_string()),
                span: span!(start = (1, 0, 1), end = (2, 0, 2))
            }
        );
    }

    #[test]
    fn test_unexpected_char() {
        lex_error!(
            "1u8 +2u32",
            LexerError {
                error_kind: LexerErrorKind::UnexpectedChar(
                    '+',
                    ExpectedChar::DigitLetterQuotePunctuation
                ),
                span: span!(start = (4, 0, 4), end = (5, 0, 5))
            }
        );

        lex_error!(
            "x=7",
            LexerError {
                error_kind: LexerErrorKind::UnexpectedChar('7', ExpectedChar::Exact('>')),
                span: span!(start = (2, 0, 2), end = (3, 0, 3))
            }
        );
    }

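    // Illustrative addition: exercises the two-character `=>` fat arrow, which
    // `tokenize_punctuation` reads via `advance_expected('>')`.
    #[test]
    fn test_fat_arrow() {
        lex_ok!("=>", vec![Token::FatArrow]);
        lex_ok!(
            "A=>B",
            vec![
                Token::Ident("A".to_string()),
                Token::FatArrow,
                Token::Ident("B".to_string()),
            ]
        );
    }
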
    #[test]
    fn test_unicode() {
        lex_ok!(r#""\u2764""#, vec![Token::StringLiteral("❤".to_string())]);
        lex_ok!(r#""\uFA84""#, vec![Token::StringLiteral("\u{FA84}".to_string())]);
        lex_ok!(
            r#""\uD83D\uDC69""#,
            vec![Token::StringLiteral("👩".to_string())]
        );
        lex_ok!(r#""👩""#, vec![Token::StringLiteral("👩".to_string())]);
        lex_error!(
            r#""\uDCAC\u1234""#,
            LexerError {
                error_kind: LexerErrorKind::InvalidUnicode(1238580),
                span: span!(start = (2, 0, 2), end = (13, 0, 13))
            }
        );
    }
}