radix_transactions/manifest/lexer.rs

use crate::manifest::compiler::CompileErrorDiagnosticsStyle;
use crate::manifest::diagnostic_snippets::create_snippet;
use crate::manifest::token::{Position, Span, Token, TokenWithSpan};
use sbor::prelude::*;

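/// The kind of character the lexer expected when it encountered an unexpected one.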
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ExpectedChar {
    Exact(char),
    OneOf(Vec<char>),
    HexDigit,
    DigitLetterQuotePunctuation,
}

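/// The possible kinds of lexing errors.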
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexerErrorKind {
    UnexpectedEof,
    UnexpectedChar(char, ExpectedChar),
    InvalidIntegerLiteral(String),
    InvalidIntegerType(String),
    InvalidInteger(String),
    InvalidUnicode(u32),
    MissingUnicodeSurrogate(u32),
}

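/// A lexing error, together with the source span where it occurred.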
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexerError {
    pub error_kind: LexerErrorKind,
    pub span: Span,
}

impl LexerError {
    fn unexpected_char(position: Position, c: char, expected: ExpectedChar) -> Self {
        Self {
            error_kind: LexerErrorKind::UnexpectedChar(c, expected),
            span: Span {
                start: position,
                end: position.advance(c),
            },
        }
    }

    fn invalid_integer_type(ty: String, start: Position, end: Position) -> Self {
        Self {
            error_kind: LexerErrorKind::InvalidIntegerType(ty),
            span: Span { start, end },
        }
    }
}

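/// Lexer over transaction manifest source text, producing tokens with source spans.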
#[derive(Debug, Clone)]
pub struct Lexer {
    /// The input text as characters
    text: Vec<char>,
    /// The current position in the text (equals the text length at end of file)
    current: Position,
}

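/// Tokenizes an entire manifest string, returning all tokens or the first lexing error.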
pub fn tokenize(s: &str) -> Result<Vec<TokenWithSpan>, LexerError> {
    let mut lexer = Lexer::new(s);
    let mut tokens = Vec::new();
    loop {
        if let Some(token) = lexer.next_token()? {
            tokens.push(token);
        } else {
            break;
        }
    }
    Ok(tokens)
}

impl Lexer {
    pub fn new(text: &str) -> Self {
        Self {
            text: text.chars().collect(),
            current: Position {
                full_index: 0,
                line_idx: 0,
                line_char_index: 0,
            },
        }
    }

    pub fn is_eof(&self) -> bool {
        self.current.full_index == self.text.len()
    }

    fn peek(&self) -> Result<char, LexerError> {
        if self.is_eof() {
            Err(LexerError {
                error_kind: LexerErrorKind::UnexpectedEof,
                span: Span {
                    start: self.current,
                    end: self.current,
                },
            })
        } else {
            Ok(self.text[self.current.full_index])
        }
    }

    fn advance(&mut self) -> Result<char, LexerError> {
        let c = self.peek()?;
        self.current = self.current.advance(c);
        Ok(c)
    }

    fn advance_expected(&mut self, expected: char) -> Result<char, LexerError> {
        self.advance_matching(|c| c == expected, ExpectedChar::Exact(expected))
    }

    fn advance_matching(
        &mut self,
        matcher: impl Fn(char) -> bool,
        expected: ExpectedChar,
    ) -> Result<char, LexerError> {
        let previous = self.current;
        let c = self.advance()?;
        if !matcher(c) {
            Err(LexerError::unexpected_char(previous, c, expected))
        } else {
            Ok(c)
        }
    }

    fn advance_and_append(&mut self, s: &mut String) -> Result<char, LexerError> {
        let c = self.advance()?;
        s.push(c);
        Ok(c)
    }
    fn is_whitespace(c: char) -> bool {
        // Slightly different from the original spec: we skip `\n`
        // rather than treating it as a terminator.
        c == ' ' || c == '\t' || c == '\r' || c == '\n'
    }
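
    /// Skips whitespace and `#` comments, then lexes the next token, returning `None` at end of input.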
    pub fn next_token(&mut self) -> Result<Option<TokenWithSpan>, LexerError> {
        // skip comments and whitespace
        let mut in_comment = false;
        while !self.is_eof() {
            if in_comment {
                if self.advance()? == '\n' {
                    in_comment = false;
                }
            } else if self.peek()? == '#' {
                in_comment = true;
            } else if Self::is_whitespace(self.peek()?) {
                self.advance()?;
            } else {
                break;
            }
        }

        // check if it's the end of file
        if self.is_eof() {
            return Ok(None);
        }

        // match next token
        match self.peek()? {
            '-' | '0'..='9' => self.tokenize_number(),
            '"' => self.tokenize_string(),
            'a'..='z' | 'A'..='Z' => self.tokenize_identifier(),
            '{' | '}' | '(' | ')' | '<' | '>' | ',' | ';' | '&' | '=' => {
                self.tokenize_punctuation()
            }
            c => Err(LexerError::unexpected_char(
                self.current,
                c,
                ExpectedChar::DigitLetterQuotePunctuation,
            )),
        }
        .map(Option::from)
    }

    // TODO: consider using DFA
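    /// Lexes an integer literal such as `123u32` or `-5i8`: an optional sign, digits, then a type suffix.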
    fn tokenize_number(&mut self) -> Result<TokenWithSpan, LexerError> {
        let literal_start = self.current;
        let mut s = String::new();

        // negative sign
        if self.peek()? == '-' {
            s.push(self.advance()?);
        }

        // integer
        match self.advance_and_append(&mut s)? {
            '0' => {}
            '1'..='9' => {
                while self.peek()?.is_ascii_digit() {
                    s.push(self.advance()?);
                }
            }
            _ => {
                return Err(LexerError {
                    error_kind: LexerErrorKind::InvalidIntegerLiteral(s),
                    span: Span {
                        start: literal_start,
                        end: self.current,
                    },
                });
            }
        }

        // type
        let ty_start = self.current;
        let mut t = String::new();
        match self.advance_and_append(&mut t)? {
            'i' => match self.advance_and_append(&mut t)? {
                '1' => match self.advance_and_append(&mut t)? {
                    '2' => match self.advance_and_append(&mut t)? {
                        '8' => self.parse_int(&s, "i128", Token::I128Literal, literal_start),
                        _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
                    },
                    '6' => self.parse_int(&s, "i16", Token::I16Literal, literal_start),
                    _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
                },
                '3' => match self.advance_and_append(&mut t)? {
                    '2' => self.parse_int(&s, "i32", Token::I32Literal, literal_start),
                    _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
                },
                '6' => match self.advance_and_append(&mut t)? {
                    '4' => self.parse_int(&s, "i64", Token::I64Literal, literal_start),
                    _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
                },
                '8' => self.parse_int(&s, "i8", Token::I8Literal, literal_start),
                _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
            },
            'u' => match self.advance_and_append(&mut t)? {
                '1' => match self.advance_and_append(&mut t)? {
                    '2' => match self.advance_and_append(&mut t)? {
                        '8' => self.parse_int(&s, "u128", Token::U128Literal, literal_start),
                        _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
                    },
                    '6' => self.parse_int(&s, "u16", Token::U16Literal, literal_start),
                    _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
                },
                '3' => match self.advance_and_append(&mut t)? {
                    '2' => self.parse_int(&s, "u32", Token::U32Literal, literal_start),
                    _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
                },
                '6' => match self.advance_and_append(&mut t)? {
                    '4' => self.parse_int(&s, "u64", Token::U64Literal, literal_start),
                    _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
                },
                '8' => self.parse_int(&s, "u8", Token::U8Literal, literal_start),
                _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
            },
            _ => Err(LexerError::invalid_integer_type(t, ty_start, self.current)),
        }
        .map(|token| self.new_token(token, literal_start, self.current))
    }

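    /// Parses the collected digits as integer type `T`, mapping failures to `InvalidInteger`.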
    fn parse_int<T>(
        &self,
        int: &str,
        ty: &str,
        map: fn(T) -> Token,
        token_start: Position,
    ) -> Result<Token, LexerError>
    where
        T: FromStr,
        <T as FromStr>::Err: Display,
    {
        int.parse::<T>().map(map).map_err(|err| LexerError {
            error_kind: LexerErrorKind::InvalidInteger(format!(
                "'{}{}' - {}",
                int,
                ty,
                err.to_string()
            )),
            span: Span {
                start: token_start,
                end: self.current,
            },
        })
    }

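    /// Lexes a double-quoted string literal, handling JSON-style escapes including `\u` sequences.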
    fn tokenize_string(&mut self) -> Result<TokenWithSpan, LexerError> {
        let start = self.current;
        assert_eq!(self.advance()?, '"');

        let mut s = String::new();
        while self.peek()? != '"' {
            let c = self.advance()?;
            if c == '\\' {
                // Remember '\\' position
                let token_start = self.current;

                // See the JSON string specifications
                match self.advance()? {
                    '"' => s.push('\"'),
                    '\\' => s.push('\\'),
                    '/' => s.push('/'),
                    'b' => s.push('\x08'),
                    'f' => s.push('\x0c'),
                    'n' => s.push('\n'),
                    'r' => s.push('\r'),
                    't' => s.push('\t'),
                    'u' => {
                        let mut unicode = self.read_utf16_unit()?;
                        // Check unicode surrogate pair
                        // (see https://unicodebook.readthedocs.io/unicode_encodings.html#surrogates)
                        if (0xD800..=0xDFFF).contains(&unicode) {
                            let position = self.current;
                            if self.advance()? == '\\' && self.advance()? == 'u' {
                                unicode = 0x10000
                                    + ((unicode - 0xD800) << 10)
                                    + self.read_utf16_unit()?
                                    - 0xDC00;
                            } else {
                                return Err(LexerError {
                                    error_kind: LexerErrorKind::MissingUnicodeSurrogate(unicode),
                                    span: Span {
                                        start: token_start,
                                        end: position,
                                    },
                                });
                            }
                        }
                        s.push(char::from_u32(unicode).ok_or(LexerError {
                            error_kind: LexerErrorKind::InvalidUnicode(unicode),
                            span: Span {
                                start: token_start,
                                end: self.current,
                            },
                        })?);
                    }
                    c => {
                        return Err(LexerError::unexpected_char(
                            token_start,
                            c,
                            ExpectedChar::OneOf(vec!['"', '\\', '/', 'b', 'f', 'n', 'r', 't', 'u']),
                        ));
                    }
                }
            } else {
                s.push(c);
            }
        }
        self.advance()?;

        Ok(self.new_token(Token::StringLiteral(s), start, self.current))
    }

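    /// Reads four hex digits of a `\u` escape and returns them as a single UTF-16 code unit.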
    fn read_utf16_unit(&mut self) -> Result<u32, LexerError> {
        let mut code: u32 = 0;

        for _ in 0..4 {
            let c = self.advance_matching(|c| c.is_ascii_hexdigit(), ExpectedChar::HexDigit)?;
            code = code * 16 + c.to_digit(16).unwrap();
        }

        Ok(code)
    }

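    /// Lexes an identifier (a letter followed by letters, digits, `_` or `:`), mapping `true`/`false` to boolean literals.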
    fn tokenize_identifier(&mut self) -> Result<TokenWithSpan, LexerError> {
        let start = self.current;

        let mut id = String::from(self.advance()?);
        while !self.is_eof() {
            let next_char = self.peek()?;
            let next_char_can_be_part_of_ident =
                next_char.is_ascii_alphanumeric() || next_char == '_' || next_char == ':';
            if !next_char_can_be_part_of_ident {
                break;
            }
            id.push(self.advance()?);
        }

        let token = match id.as_str() {
            "true" => Token::BoolLiteral(true),
            "false" => Token::BoolLiteral(false),
            other => Token::Ident(other.to_string()),
        };
        Ok(self.new_token(token, start, self.current))
    }

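    /// Lexes a single punctuation token, including the two-character `=>` fat arrow.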
    fn tokenize_punctuation(&mut self) -> Result<TokenWithSpan, LexerError> {
        let token_start = self.current;

        let token = match self.advance()? {
            '(' => Token::OpenParenthesis,
            ')' => Token::CloseParenthesis,
            '<' => Token::LessThan,
            '>' => Token::GreaterThan,
            ',' => Token::Comma,
            ';' => Token::Semicolon,
            '=' => {
                self.advance_expected('>')?;
                Token::FatArrow
            }
            c => {
                return Err(LexerError::unexpected_char(
                    token_start,
                    c,
                    ExpectedChar::OneOf(vec!['(', ')', '<', '>', ',', ';', '=']),
                ))
            }
        };

        Ok(self.new_token(token, token_start, self.current))
    }

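    /// Wraps a token with the span covering `start..end`.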
    fn new_token(&self, token: Token, start: Position, end: Position) -> TokenWithSpan {
        TokenWithSpan {
            token,
            span: Span { start, end },
        }
    }
}

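/// Renders a `LexerError` as a human-readable diagnostic snippet over the original source.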
pub fn lexer_error_diagnostics(
    s: &str,
    err: LexerError,
    style: CompileErrorDiagnosticsStyle,
) -> String {
    let (title, label) = match err.error_kind {
        LexerErrorKind::UnexpectedEof => (
            "unexpected end of file".to_string(),
            "unexpected end of file".to_string(),
        ),
        LexerErrorKind::UnexpectedChar(c, expected) => {
            let expected = match expected {
                ExpectedChar::Exact(exact) => format!("'{}'", exact),
                ExpectedChar::OneOf(one_of) => {
                    let v: Vec<String> = one_of.iter().map(|c| format!("'{}'", c)).collect();
                    if let Some((last, init)) = v.split_last() {
                        format!("{} or {}", init.join(", "), last)
                    } else {
                        "unknown".to_string()
                    }
                }
                ExpectedChar::HexDigit => "hex digit".to_string(),
                ExpectedChar::DigitLetterQuotePunctuation => "digit, letter, quotation mark or one of punctuation characters '(', ')', '<', '>', ',', ';', '='".to_string(),
            };
            (
                format!("unexpected character {:?}, expected {}", c, expected),
                "unexpected character".to_string(),
            )
        }
        LexerErrorKind::InvalidIntegerLiteral(string) => (
            format!("invalid integer literal '{}'", string),
            "invalid integer literal".to_string(),
        ),
        LexerErrorKind::InvalidIntegerType(string) => (
            format!("invalid integer type '{}'", string),
            "invalid integer type".to_string(),
        ),
        LexerErrorKind::InvalidInteger(string) => (
            format!("invalid integer value {}", string),
            "invalid integer value".to_string(),
        ),
        LexerErrorKind::InvalidUnicode(value) => (
            format!("invalid unicode code point {}", value),
            "invalid unicode code point".to_string(),
        ),
        LexerErrorKind::MissingUnicodeSurrogate(value) => (
            format!("missing unicode '{:X}' surrogate pair", value),
            "missing unicode surrogate pair".to_string(),
        ),
    };
    create_snippet(s, &err.span, &title, &label, style)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::{position, span};

    #[macro_export]
    macro_rules! lex_ok {
        ( $s:expr, $expected:expr ) => {{
            let mut lexer = Lexer::new($s);
            for i in 0..$expected.len() {
                assert_eq!(
                    lexer.next_token().map(|opt| opt.map(|t| t.token)),
                    Ok(Some($expected[i].clone()))
                );
            }
            assert_eq!(lexer.next_token(), Ok(None));
        }};
    }

    #[macro_export]
    macro_rules! lex_error {
        ( $s:expr, $expected:expr ) => {{
            let mut lexer = Lexer::new($s);
            loop {
                match lexer.next_token() {
                    Ok(Some(_)) => {}
                    Ok(None) => {
                        panic!("Expected {:?} but no error was thrown", $expected);
                    }
                    Err(e) => {
                        assert_eq!(e, $expected);
                        break;
                    }
                }
            }
        }};
    }

    #[test]
    fn test_empty_strings() {
        lex_ok!("", Vec::<Token>::new());
        lex_ok!("  ", Vec::<Token>::new());
        lex_ok!("\r\n\t", Vec::<Token>::new());
    }

    #[test]
    fn test_bool() {
        lex_ok!("true", vec![Token::BoolLiteral(true)]);
        lex_ok!("false", vec![Token::BoolLiteral(false)]);
        lex_ok!("false123u8", vec![Token::Ident("false123u8".into())]);
    }

    #[test]
    fn test_int() {
        lex_ok!(
            "1u82u1283i84i128",
            vec![
                Token::U8Literal(1),
                Token::U128Literal(2),
                Token::I8Literal(3),
                Token::I128Literal(4),
            ]
        );
        lex_ok!("1u8 2u32", vec![Token::U8Literal(1), Token::U32Literal(2)]);
        lex_error!(
            "123",
            LexerError {
                error_kind: LexerErrorKind::UnexpectedEof,
                span: span!(start = (3, 0, 3), end = (3, 0, 3))
            }
        );
    }

    #[test]
    fn test_comment() {
        lex_ok!("# 1u8", Vec::<Token>::new());
        lex_ok!("1u8 # comment", vec![Token::U8Literal(1),]);
        lex_ok!(
            "# multiple\n# line\nCALL_FUNCTION",
            vec![Token::Ident("CALL_FUNCTION".to_string()),]
        );
    }

    #[test]
    fn test_string() {
        lex_ok!(
            r#"  "" "abc" "abc\r\n\"def\uD83C\uDF0D"  "#,
            vec![
                Token::StringLiteral("".into()),
                Token::StringLiteral("abc".into()),
                Token::StringLiteral("abc\r\n\"def🌍".into()),
            ]
        );
        lex_error!(
            "\"",
            LexerError {
                error_kind: LexerErrorKind::UnexpectedEof,
                span: span!(start = (1, 0, 1), end = (1, 0, 1))
            }
        );
    }

    #[test]
    fn test_mixed() {
        lex_ok!(
            r#"CALL_FUNCTION Map<String, Array>("test", Array<String>("abc"));"#,
            vec![
                Token::Ident("CALL_FUNCTION".to_string()),
                Token::Ident("Map".to_string()),
                Token::LessThan,
                Token::Ident("String".to_string()),
                Token::Comma,
                Token::Ident("Array".to_string()),
                Token::GreaterThan,
                Token::OpenParenthesis,
                Token::StringLiteral("test".into()),
                Token::Comma,
                Token::Ident("Array".to_string()),
                Token::LessThan,
                Token::Ident("String".to_string()),
                Token::GreaterThan,
                Token::OpenParenthesis,
                Token::StringLiteral("abc".into()),
                Token::CloseParenthesis,
                Token::CloseParenthesis,
                Token::Semicolon,
            ]
        );
    }

    #[test]
    fn test_precise_decimal() {
        lex_ok!(
            "PreciseDecimal(\"12\")",
            vec![
                Token::Ident("PreciseDecimal".to_string()),
                Token::OpenParenthesis,
                Token::StringLiteral("12".into()),
                Token::CloseParenthesis,
            ]
        );
    }

    #[test]
    fn test_precise_decimal_collection() {
        lex_ok!(
            "Array<PreciseDecimal>(PreciseDecimal(\"12\"), PreciseDecimal(\"212\"), PreciseDecimal(\"1984\"))",
            vec![
                Token::Ident("Array".to_string()),
                Token::LessThan,
                Token::Ident("PreciseDecimal".to_string()),
                Token::GreaterThan,
                Token::OpenParenthesis,
                Token::Ident("PreciseDecimal".to_string()),
                Token::OpenParenthesis,
                Token::StringLiteral("12".into()),
                Token::CloseParenthesis,
                Token::Comma,
                Token::Ident("PreciseDecimal".to_string()),
                Token::OpenParenthesis,
                Token::StringLiteral("212".into()),
                Token::CloseParenthesis,
                Token::Comma,
                Token::Ident("PreciseDecimal".to_string()),
                Token::OpenParenthesis,
                Token::StringLiteral("1984".into()),
                Token::CloseParenthesis,
                Token::CloseParenthesis,
            ]
        );
    }

    #[test]
    fn test_invalid_integer() {
        lex_error!(
            "-_28u32",
            LexerError {
                error_kind: LexerErrorKind::InvalidIntegerLiteral("-_".to_string()),
                span: span!(start = (0, 0, 0), end = (2, 0, 2))
            }
        );

        lex_error!(
            "1i128\n 1u64 \n 1i37",
            LexerError {
                error_kind: LexerErrorKind::InvalidIntegerType("i37".to_string()),
                span: span!(start = (15, 2, 2), end = (18, 2, 5))
            }
        );

        lex_error!(
            "3_0i8",
            LexerError {
                error_kind: LexerErrorKind::InvalidIntegerType("_".to_string()),
                span: span!(start = (1, 0, 1), end = (2, 0, 2))
            }
        );
    }

    #[test]
    fn test_unexpected_char() {
        lex_error!(
            "1u8 +2u32",
            LexerError {
                error_kind: LexerErrorKind::UnexpectedChar(
                    '+',
                    ExpectedChar::DigitLetterQuotePunctuation
                ),
                span: span!(start = (4, 0, 4), end = (5, 0, 5))
            }
        );

        lex_error!(
            "x=7",
            LexerError {
                error_kind: LexerErrorKind::UnexpectedChar('7', ExpectedChar::Exact('>')),
                span: span!(start = (2, 0, 2), end = (3, 0, 3))
            }
        );
    }

    #[test]
    fn test_unicode() {
        lex_ok!(r#""\u2764""#, vec![Token::StringLiteral("❀".to_string())]);
        lex_ok!(r#""\uFA84""#, vec![Token::StringLiteral("醙".to_string())]);
        lex_ok!(
            r#""\uD83D\uDC69""#,
            vec![Token::StringLiteral("πŸ‘©".to_string())]
        );
        lex_ok!(r#""πŸ‘©""#, vec![Token::StringLiteral("πŸ‘©".to_string())]);
        lex_error!(
            r#""\uDCAC\u1234""#,
            LexerError {
                error_kind: LexerErrorKind::InvalidUnicode(1238580),
                span: span!(start = (2, 0, 2), end = (13, 0, 13))
            }
        );
    }
}