radix_transactions/manifest/
lexer.rs

use crate::manifest::compiler::CompileErrorDiagnosticsStyle;
use crate::manifest::diagnostic_snippets::create_snippet;
use crate::manifest::token::{Position, Span, Token, TokenWithSpan};
use sbor::prelude::*;

#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ExpectedChar {
    Exact(char),
    OneOf(Vec<char>),
    HexDigit,
    DigitLetterQuotePunctuation,
}

#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexerErrorKind {
    UnexpectedEof,
    UnexpectedChar(char, ExpectedChar),
    InvalidIntegerLiteral(String),
    InvalidIntegerType(String),
    InvalidInteger(String),
    InvalidUnicode(u32),
    MissingUnicodeSurrogate(u32),
}

#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexerError {
    pub error_kind: LexerErrorKind,
    pub span: Span,
}

impl LexerError {
    fn unexpected_char(position: Position, c: char, expected: ExpectedChar) -> Self {
        Self {
            error_kind: LexerErrorKind::UnexpectedChar(c, expected),
            span: Span {
                start: position,
                end: position.advance(c),
            },
        }
    }

    fn invalid_integer_type(ty: String, start: Position, end: Position) -> Self {
        Self {
            error_kind: LexerErrorKind::InvalidIntegerType(ty),
            span: Span { start, end },
        }
    }
}

#[derive(Debug, Clone)]
pub struct Lexer {
    /// The input text chars
    text: Vec<char>,
    /// The current position in the text (at end of input, `full_index` equals the text length)
    current: Position,
}

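/// Tokenizes an entire manifest source string into a flat list of tokens with
/// their source spans, stopping at the first lexical error.
///
/// A minimal usage sketch (not compiled as a doctest; see the tests module
/// below for verified cases):
///
/// ```ignore
/// let tokens = tokenize("CALL_FUNCTION 1u8;").unwrap();
/// assert_eq!(tokens.len(), 3); // Ident, U8Literal, Semicolon
/// ```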
pub fn tokenize(s: &str) -> Result<Vec<TokenWithSpan>, LexerError> {
    let mut lexer = Lexer::new(s);
    let mut tokens = Vec::new();
    while let Some(token) = lexer.next_token()? {
        tokens.push(token);
    }
    Ok(tokens)
}

impl Lexer {
    pub fn new(text: &str) -> Self {
        Self {
            text: text.chars().collect(),
            current: Position {
                full_index: 0,
                line_idx: 0,
                line_char_index: 0,
            },
        }
    }

    pub fn is_eof(&self) -> bool {
        self.current.full_index == self.text.len()
    }

    fn peek(&self) -> Result<char, LexerError> {
        if self.is_eof() {
            Err(LexerError {
                error_kind: LexerErrorKind::UnexpectedEof,
                span: Span {
                    start: self.current,
                    end: self.current,
                },
            })
        } else {
            Ok(self.text[self.current.full_index])
        }
    }

    fn advance(&mut self) -> Result<char, LexerError> {
        let c = self.peek()?;
        self.current = self.current.advance(c);
        Ok(c)
    }

    fn advance_expected(&mut self, expected: char) -> Result<char, LexerError> {
        self.advance_matching(|c| c == expected, ExpectedChar::Exact(expected))
    }

    fn advance_matching(
        &mut self,
        matcher: impl Fn(char) -> bool,
        expected: ExpectedChar,
    ) -> Result<char, LexerError> {
        let previous = self.current;
        let c = self.advance()?;
        if !matcher(c) {
            Err(LexerError::unexpected_char(previous, c, expected))
        } else {
            Ok(c)
        }
    }

    fn advance_and_append(&mut self, s: &mut String) -> Result<char, LexerError> {
        let c = self.advance()?;
        s.push(c);
        Ok(c)
    }

    fn is_whitespace(c: char) -> bool {
        // Slightly different from the original spec: we skip `\n`
        // rather than treating it as a terminator.
        c == ' ' || c == '\t' || c == '\r' || c == '\n'
    }

    pub fn next_token(&mut self) -> Result<Option<TokenWithSpan>, LexerError> {
        // skip comment and whitespace
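        // Comments start with `#` and run to the end of the line
        // (see `test_comment` below).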
        let mut in_comment = false;
        while !self.is_eof() {
            if in_comment {
                if self.advance()? == '\n' {
                    in_comment = false;
                }
            } else if self.peek()? == '#' {
                in_comment = true;
            } else if Self::is_whitespace(self.peek()?) {
                self.advance()?;
            } else {
                break;
            }
        }

        // check if it's the end of file
        if self.is_eof() {
            return Ok(None);
        }

        // match next token
        match self.peek()? {
            '-' | '0'..='9' => self.tokenize_number(),
            '"' => self.tokenize_string(),
            'a'..='z' | 'A'..='Z' => self.tokenize_identifier(),
            '{' | '}' | '(' | ')' | '<' | '>' | ',' | ';' | '&' | '=' => {
                self.tokenize_punctuation()
            }
            c => Err(LexerError::unexpected_char(
                self.current,
                c,
                ExpectedChar::DigitLetterQuotePunctuation,
            )),
        }
        .map(Option::from)
    }

    // TODO: consider using DFA
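    // A number literal is an optional '-', a decimal integer, and a mandatory
    // type suffix, e.g. `123u32` or `-5i8`; a bare `123` is rejected
    // (see `test_int` below).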
    fn tokenize_number(&mut self) -> Result<TokenWithSpan, LexerError> {
        let literal_start = self.current;
        let mut s = String::new();

        // negative sign
        if self.peek()? == '-' {
            s.push(self.advance()?);
        }

        // integer
        match self.advance_and_append(&mut s)? {
            '0' => {}
            '1'..='9' => {
                while self.peek()?.is_ascii_digit() {
                    s.push(self.advance()?);
                }
            }
            _ => {
                return Err(LexerError {
                    error_kind: LexerErrorKind::InvalidIntegerLiteral(s),
                    span: Span {
                        start: literal_start,
                        end: self.current,
                    },
                });
            }
        }

        // type
        let ty_start = self.current;
        let mut t = String::new();
        match self.advance_and_append(&mut t)? {
            'i' => match self.advance_and_append(&mut t)? {
                '1' => match self.advance_and_append(&mut t)? {
                    '2' => match self.advance_and_append(&mut t)? {
                        '8' => self.parse_int(&s, "i128", Token::I128Literal, literal_start),
                        _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
                    },
                    '6' => self.parse_int(&s, "i16", Token::I16Literal, literal_start),
                    _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
                },
                '3' => match self.advance_and_append(&mut t)? {
                    '2' => self.parse_int(&s, "i32", Token::I32Literal, literal_start),
                    _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
                },
                '6' => match self.advance_and_append(&mut t)? {
                    '4' => self.parse_int(&s, "i64", Token::I64Literal, literal_start),
                    _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
                },
                '8' => self.parse_int(&s, "i8", Token::I8Literal, literal_start),
                _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
            },
            'u' => match self.advance_and_append(&mut t)? {
                '1' => match self.advance_and_append(&mut t)? {
                    '2' => match self.advance_and_append(&mut t)? {
                        '8' => self.parse_int(&s, "u128", Token::U128Literal, literal_start),
                        _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
                    },
                    '6' => self.parse_int(&s, "u16", Token::U16Literal, literal_start),
                    _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
                },
                '3' => match self.advance_and_append(&mut t)? {
                    '2' => self.parse_int(&s, "u32", Token::U32Literal, literal_start),
                    _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
                },
                '6' => match self.advance_and_append(&mut t)? {
                    '4' => self.parse_int(&s, "u64", Token::U64Literal, literal_start),
                    _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
                },
                '8' => self.parse_int(&s, "u8", Token::U8Literal, literal_start),
                _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
            },
            _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
        }
        .map(|token| self.new_token(token, literal_start, self.current))
    }

    fn parse_int<T>(
        &self,
        int: &str,
        ty: &str,
        map: fn(T) -> Token,
        token_start: Position,
    ) -> Result<Token, LexerError>
    where
        T: FromStr,
        <T as FromStr>::Err: Display,
    {
        int.parse::<T>().map(map).map_err(|err| LexerError {
            error_kind: LexerErrorKind::InvalidInteger(format!("'{}{}' - {}", int, ty, err)),
            span: Span {
                start: token_start,
                end: self.current,
            },
        })
    }

    fn tokenize_string(&mut self) -> Result<TokenWithSpan, LexerError> {
        let start = self.current;
        assert_eq!(self.advance()?, '"');

        let mut s = String::new();
        while self.peek()? != '"' {
            let c = self.advance()?;
            if c == '\\' {
                // Remember the position just after '\\' (start of the escape code)
                let token_start = self.current;

                // See the JSON string specifications
                match self.advance()? {
                    '"' => s.push('\"'),
                    '\\' => s.push('\\'),
                    '/' => s.push('/'),
                    'b' => s.push('\x08'),
                    'f' => s.push('\x0c'),
                    'n' => s.push('\n'),
                    'r' => s.push('\r'),
                    't' => s.push('\t'),
                    'u' => {
                        let mut unicode = self.read_utf16_unit()?;
                        // Check unicode surrogate pair
                        // (see https://unicodebook.readthedocs.io/unicode_encodings.html#surrogates)
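                        // Worked example (matches `test_string` below): "\uD83C\uDF0D"
                        // decodes as 0x10000 + ((0xD83C - 0xD800) << 10) + (0xDF0D - 0xDC00)
                        // = 0x10000 + 0xF000 + 0x30D = 0x1F30D, i.e. '🌍'.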
                        if (0xD800..=0xDFFF).contains(&unicode) {
                            let position = self.current;
                            if self.advance()? == '\\' && self.advance()? == 'u' {
                                unicode = 0x10000
                                    + ((unicode - 0xD800) << 10)
                                    + self.read_utf16_unit()?
                                    - 0xDC00;
                            } else {
                                return Err(LexerError {
                                    error_kind: LexerErrorKind::MissingUnicodeSurrogate(unicode),
                                    span: Span {
                                        start: token_start,
                                        end: position,
                                    },
                                });
                            }
                        }
                        s.push(char::from_u32(unicode).ok_or(LexerError {
                            error_kind: LexerErrorKind::InvalidUnicode(unicode),
                            span: Span {
                                start: token_start,
                                end: self.current,
                            },
                        })?);
                    }
                    c => {
                        return Err(LexerError::unexpected_char(
                            token_start,
                            c,
                            ExpectedChar::OneOf(vec!['"', '\\', '/', 'b', 'f', 'n', 'r', 't', 'u']),
                        ));
                    }
                }
            } else {
                s.push(c);
            }
        }
        self.advance()?;

        Ok(self.new_token(Token::StringLiteral(s), start, self.current))
    }

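    // Reads exactly four ASCII hex digits (e.g. the `D83C` in `\uD83C`) and
    // returns them as a single UTF-16 code unit value.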
    fn read_utf16_unit(&mut self) -> Result<u32, LexerError> {
        let mut code: u32 = 0;

        for _ in 0..4 {
            let c = self.advance_matching(|c| c.is_ascii_hexdigit(), ExpectedChar::HexDigit)?;
            code = code * 16 + c.to_digit(16).unwrap();
        }

        Ok(code)
    }

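    // Identifiers start with an ASCII letter; subsequent characters may be
    // ASCII alphanumerics, `_` or `:` (so `false123u8` lexes as an identifier,
    // see `test_bool` below).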
    fn tokenize_identifier(&mut self) -> Result<TokenWithSpan, LexerError> {
        let start = self.current;

        let mut id = String::from(self.advance()?);
        while !self.is_eof() {
            let next_char = self.peek()?;
            let next_char_can_be_part_of_ident =
                next_char.is_ascii_alphanumeric() || next_char == '_' || next_char == ':';
            if !next_char_can_be_part_of_ident {
                break;
            }
            id.push(self.advance()?);
        }

        let token = match id.as_str() {
            "true" => Token::BoolLiteral(true),
            "false" => Token::BoolLiteral(false),
            other => Token::Ident(other.to_string()),
        };
        Ok(self.new_token(token, start, self.current))
    }

    fn tokenize_punctuation(&mut self) -> Result<TokenWithSpan, LexerError> {
        let token_start = self.current;

        let token = match self.advance()? {
            '(' => Token::OpenParenthesis,
            ')' => Token::CloseParenthesis,
            '<' => Token::LessThan,
            '>' => Token::GreaterThan,
            ',' => Token::Comma,
            ';' => Token::Semicolon,
            '=' => {
                self.advance_expected('>')?;
                Token::FatArrow
            }
            c => {
                return Err(LexerError::unexpected_char(
                    token_start,
                    c,
                    ExpectedChar::OneOf(vec!['(', ')', '<', '>', ',', ';', '=']),
                ))
            }
        };

        Ok(self.new_token(token, token_start, self.current))
    }

    fn new_token(&self, token: Token, start: Position, end: Position) -> TokenWithSpan {
        TokenWithSpan {
            token,
            span: Span { start, end },
        }
    }
}

pub fn lexer_error_diagnostics(
    s: &str,
    err: LexerError,
    style: CompileErrorDiagnosticsStyle,
) -> String {
    let (title, label) = match err.error_kind {
        LexerErrorKind::UnexpectedEof => (
            "unexpected end of file".to_string(),
            "unexpected end of file".to_string(),
        ),
        LexerErrorKind::UnexpectedChar(c, expected) => {
            let expected = match expected {
                ExpectedChar::Exact(exact) => format!("'{}'", exact),
                ExpectedChar::OneOf(one_of) => {
                    let v: Vec<String> = one_of.iter().map(|c| format!("'{}'", c)).collect();
                    if let Some((last, init)) = v.split_last() {
                        format!("{} or {}", init.join(", "), last)
                    } else {
                        "unknown".to_string()
                    }
                }
                ExpectedChar::HexDigit => "hex digit".to_string(),
                ExpectedChar::DigitLetterQuotePunctuation => "digit, letter, quotation mark or one of the punctuation characters '(', ')', '<', '>', ',', ';', '='".to_string(),
            };
            (
                format!("unexpected character {:?}, expected {}", c, expected),
                "unexpected character".to_string(),
            )
        }
        LexerErrorKind::InvalidIntegerLiteral(string) => (
            format!("invalid integer literal '{}'", string),
            "invalid integer literal".to_string(),
        ),
        LexerErrorKind::InvalidIntegerType(string) => (
            format!("invalid integer type '{}'", string),
            "invalid integer type".to_string(),
        ),
        LexerErrorKind::InvalidInteger(string) => (
            format!("invalid integer value {}", string),
            "invalid integer value".to_string(),
        ),
        LexerErrorKind::InvalidUnicode(value) => (
            format!("invalid unicode code point {}", value),
            "invalid unicode code point".to_string(),
        ),
        LexerErrorKind::MissingUnicodeSurrogate(value) => (
            format!("missing unicode '{:X}' surrogate pair", value),
            "missing unicode surrogate pair".to_string(),
        ),
    };
    create_snippet(s, &err.span, &title, &label, style)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::{position, span};
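    // In these tests, `span!(start = (..), end = (..))` takes positions as
    // (full_index, line_idx, line_char_index) tuples, matching the fields
    // of `Position`.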

    #[macro_export]
    macro_rules! lex_ok {
        ( $s:expr, $expected:expr ) => {{
            let mut lexer = Lexer::new($s);
            for i in 0..$expected.len() {
                assert_eq!(
                    lexer.next_token().map(|opt| opt.map(|t| t.token)),
                    Ok(Some($expected[i].clone()))
                );
            }
            assert_eq!(lexer.next_token(), Ok(None));
        }};
    }

    #[macro_export]
    macro_rules! lex_error {
        ( $s:expr, $expected:expr ) => {{
            let mut lexer = Lexer::new($s);
            loop {
                match lexer.next_token() {
                    Ok(Some(_)) => {}
                    Ok(None) => {
                        panic!("Expected {:?} but no error is thrown", $expected);
                    }
                    Err(e) => {
                        assert_eq!(e, $expected);
                        break;
                    }
                }
            }
        }};
    }

    #[test]
    fn test_empty_strings() {
        lex_ok!("", Vec::<Token>::new());
        lex_ok!("  ", Vec::<Token>::new());
        lex_ok!("\r\n\t", Vec::<Token>::new());
    }

    #[test]
    fn test_bool() {
        lex_ok!("true", vec![Token::BoolLiteral(true)]);
        lex_ok!("false", vec![Token::BoolLiteral(false)]);
        lex_ok!("false123u8", vec![Token::Ident("false123u8".into())]);
    }

    #[test]
    fn test_int() {
        lex_ok!(
            "1u82u1283i84i128",
            vec![
                Token::U8Literal(1),
                Token::U128Literal(2),
                Token::I8Literal(3),
                Token::I128Literal(4),
            ]
        );
        lex_ok!("1u8 2u32", vec![Token::U8Literal(1), Token::U32Literal(2)]);
        lex_error!(
            "123",
            LexerError {
                error_kind: LexerErrorKind::UnexpectedEof,
                span: span!(start = (3, 0, 3), end = (3, 0, 3))
            }
        );
    }

    #[test]
    fn test_comment() {
        lex_ok!("# 1u8", Vec::<Token>::new());
        lex_ok!("1u8 # comment", vec![Token::U8Literal(1),]);
        lex_ok!(
            "# multiple\n# line\nCALL_FUNCTION",
            vec![Token::Ident("CALL_FUNCTION".to_string()),]
        );
    }

    #[test]
    fn test_string() {
        lex_ok!(
            r#"  "" "abc" "abc\r\n\"def\uD83C\uDF0D"  "#,
            vec![
                Token::StringLiteral("".into()),
                Token::StringLiteral("abc".into()),
                Token::StringLiteral("abc\r\n\"def🌍".into()),
            ]
        );
        lex_error!(
            "\"",
            LexerError {
                error_kind: LexerErrorKind::UnexpectedEof,
                span: span!(start = (1, 0, 1), end = (1, 0, 1))
            }
        );
    }

    #[test]
    fn test_mixed() {
        lex_ok!(
            r#"CALL_FUNCTION Map<String, Array>("test", Array<String>("abc"));"#,
            vec![
                Token::Ident("CALL_FUNCTION".to_string()),
                Token::Ident("Map".to_string()),
                Token::LessThan,
                Token::Ident("String".to_string()),
                Token::Comma,
                Token::Ident("Array".to_string()),
                Token::GreaterThan,
                Token::OpenParenthesis,
                Token::StringLiteral("test".into()),
                Token::Comma,
                Token::Ident("Array".to_string()),
                Token::LessThan,
                Token::Ident("String".to_string()),
                Token::GreaterThan,
                Token::OpenParenthesis,
                Token::StringLiteral("abc".into()),
                Token::CloseParenthesis,
                Token::CloseParenthesis,
                Token::Semicolon,
            ]
        );
    }

    #[test]
    fn test_precise_decimal() {
        lex_ok!(
            "PreciseDecimal(\"12\")",
            vec![
                Token::Ident("PreciseDecimal".to_string()),
                Token::OpenParenthesis,
                Token::StringLiteral("12".into()),
                Token::CloseParenthesis,
            ]
        );
    }

    #[test]
    fn test_precise_decimal_collection() {
        lex_ok!(
            "Array<PreciseDecimal>(PreciseDecimal(\"12\"), PreciseDecimal(\"212\"), PreciseDecimal(\"1984\"))",
            vec![
                Token::Ident("Array".to_string()),
                Token::LessThan,
                Token::Ident("PreciseDecimal".to_string()),
                Token::GreaterThan,
                Token::OpenParenthesis,
                Token::Ident("PreciseDecimal".to_string()),
                Token::OpenParenthesis,
                Token::StringLiteral("12".into()),
                Token::CloseParenthesis,
                Token::Comma,
                Token::Ident("PreciseDecimal".to_string()),
                Token::OpenParenthesis,
                Token::StringLiteral("212".into()),
                Token::CloseParenthesis,
                Token::Comma,
                Token::Ident("PreciseDecimal".to_string()),
                Token::OpenParenthesis,
                Token::StringLiteral("1984".into()),
                Token::CloseParenthesis,
                Token::CloseParenthesis,
            ]
        );
    }

    #[test]
    fn test_invalid_integer() {
        lex_error!(
            "-_28u32",
            LexerError {
                error_kind: LexerErrorKind::InvalidIntegerLiteral("-_".to_string()),
                span: span!(start = (0, 0, 0), end = (2, 0, 2))
            }
        );

        lex_error!(
            "1i128\n 1u64 \n 1i37",
            LexerError {
                error_kind: LexerErrorKind::InvalidIntegerType("i37".to_string()),
                span: span!(start = (15, 2, 2), end = (18, 2, 5))
            }
        );

        lex_error!(
            "3_0i8",
            LexerError {
                error_kind: LexerErrorKind::InvalidIntegerType("_".to_string()),
                span: span!(start = (1, 0, 1), end = (2, 0, 2))
            }
        );
    }

    #[test]
    fn test_unexpected_char() {
        lex_error!(
            "1u8 +2u32",
            LexerError {
                error_kind: LexerErrorKind::UnexpectedChar(
                    '+',
                    ExpectedChar::DigitLetterQuotePunctuation
                ),
                span: span!(start = (4, 0, 4), end = (5, 0, 5))
            }
        );

        lex_error!(
            "x=7",
            LexerError {
                error_kind: LexerErrorKind::UnexpectedChar('7', ExpectedChar::Exact('>')),
                span: span!(start = (2, 0, 2), end = (3, 0, 3))
            }
        );
    }

    #[test]
    fn test_unicode() {
        lex_ok!(r#""\u2764""#, vec![Token::StringLiteral("❤".to_string())]);
        lex_ok!(r#""\uFA84""#, vec![Token::StringLiteral("\u{FA84}".to_string())]);
        lex_ok!(
            r#""\uD83D\uDC69""#,
            vec![Token::StringLiteral("👩".to_string())]
        );
        lex_ok!(r#""👩""#, vec![Token::StringLiteral("👩".to_string())]);
        lex_error!(
            r#""\uDCAC\u1234""#,
            LexerError {
                error_kind: LexerErrorKind::InvalidUnicode(1238580),
                span: span!(start = (2, 0, 2), end = (13, 0, 13))
            }
        );
    }
}