// miden_assembly_syntax/parser/lexer.rs

1use alloc::string::String;
2use core::{num::IntErrorKind, ops::Range};
3
4use miden_debug_types::{ByteOffset, SourceId, SourceSpan};
5
6use super::{
7    BinEncodedValue, BinErrorKind, DocumentationType, HexErrorKind, IntValue, LiteralErrorKind,
8    ParsingError, Scanner, Token, WordValue,
9};
10use crate::Felt;
11
/// The value produced by the [Lexer] when iterated
///
/// A successfully lexed token is wrapped in a tuple with the start and end byte offsets, where
/// the end offset is exclusive. We explicitly use a tuple here, and not something like Span<T>,
/// because this "triple" is the structure expected by the LALRPOP parser generator when used with
/// a custom lexer like ours.
///
/// Lexical errors are surfaced as [ParsingError] values in place of a token triple.
pub type Lexed<'input> = Result<(u32, Token<'input>, u32), ParsingError>;
19
/// Pops a single token from the [Lexer]
///
/// The one-argument form simply consumes the current character; the two-argument form
/// additionally evaluates to `Ok($token)`, for use as the result of a `tokenize` match arm.
macro_rules! pop {
    ($lex:ident) => {{
        $lex.skip();
    }};
    ($lex:ident, $token:expr) => {{
        $lex.skip();
        Ok($token)
    }};
}
30
/// Pops two tokens from the [Lexer]
///
/// More precisely, this consumes two characters from the scanner — it is used for
/// two-character tokens such as `::`, `..`, `->` and `//`. The two-argument form
/// additionally evaluates to `Ok($token)`.
macro_rules! pop2 {
    ($lex:ident) => {{
        $lex.skip();
        $lex.skip();
    }};
    ($lex:ident, $token:expr) => {{
        $lex.skip();
        $lex.skip();
        Ok($token)
    }};
}
43
44// LEXER
45// ================================================================================================
46
47/// The lexer that is used to perform lexical analysis Miden Assembly grammar. The lexer implements
48/// the `Iterator` trait, so in order to retrieve the tokens, you simply have to iterate over it.
49///
50/// # Errors
51///
52/// Because the lexer is implemented as an iterator over tokens, this means that you can continue
53/// to get tokens even if a lexical error occurs. The lexer will attempt to recover from an error
54/// by injecting tokens it expects.
55///
56/// If an error is unrecoverable, the lexer will continue to produce tokens, but there is no
57/// guarantee that parsing them will produce meaningful results, it is primarily to assist in
58/// gathering as many errors as possible.
pub struct Lexer<'input> {
    /// The [SourceId] of the file being lexed, for use in producing spans in lexer diagnostics
    source_id: SourceId,

    /// The scanner produces a sequence of chars + location, and can be controlled
    /// The location type is usize
    scanner: Scanner<'input>,

    /// The most recent token to be lexed.
    /// At the start and end, this should be Token::Eof
    token: Token<'input>,

    /// The position in the input where the current token starts
    /// At the start this will be the byte index of the beginning of the input
    token_start: usize,

    /// The position in the input where the current token ends
    /// At the start this will be the byte index of the beginning of the input
    token_end: usize,

    /// The current line number
    line_num: usize,

    /// When we have reached true Eof, this gets set to true, and the only token
    /// produced after that point is Token::Eof, or None, depending on how you are
    /// consuming the lexer
    eof: bool,
    /// Set once the trailing Token::Eof has been emitted by `lex`; after that,
    /// `lex` returns None
    empty: bool,

    /// A DFA for keyword matching
    keywords: aho_corasick::AhoCorasick,

    /// If an error occurs during tokenization, it is held here
    error: Option<ParsingError>,
}
94
impl<'input> Lexer<'input> {
    /// Produces an instance of the lexer with the lexical analysis to be performed on the `input`
    /// string. Note that no lexical analysis occurs until the lexer has been iterated over.
    pub fn new(source_id: SourceId, scanner: Scanner<'input>) -> Self {
        let start = scanner.start();
        let keywords = Token::keyword_searcher();
        let mut lexer = Self {
            source_id,
            scanner,
            token: Token::Eof,
            token_start: start,
            token_end: start,
            line_num: 0,
            eof: false,
            empty: false,
            keywords,
            error: None,
        };
        // Prime the lexer: after this, `token` holds the first token (or `error` holds
        // the first lexical error), ready to be handed out by `lex`
        lexer.advance();
        lexer
    }

    /// Returns the next lexed token with its start/end byte offsets, a buffered error,
    /// or `None` once the end of input has been fully consumed.
    ///
    /// A single `Token::Eof` triple is emitted when end-of-input is first reached;
    /// every call after that returns `None`.
    pub fn lex(&mut self) -> Option<<Self as Iterator>::Item> {
        // Surface any error recorded by the previous `advance` before producing more tokens
        if let Some(err) = self.error.take() {
            return Some(Err(err));
        }

        if self.eof && matches!(self.token, Token::Eof) {
            // Emit a single Eof token at the end, then None after
            if self.empty {
                return None;
            } else {
                self.empty = true;
                let end = self.token_end as u32;
                return Some(Ok((end, Token::Eof, end)));
            }
        }

        // Hand out the buffered token, then lex the next one so it is ready for the
        // following call
        let token = core::mem::replace(&mut self.token, Token::Eof);
        let start = self.token_start;
        let end = self.token_end;
        self.advance();
        Some(Ok((start as u32, token, end as u32)))
    }

    /// Lexes the next token into `self.token`, or records a lexical error in `self.error`
    fn advance(&mut self) {
        self.advance_start();
        match self.tokenize() {
            Ok(tok) => {
                self.token = tok;
            },
            Err(err) => {
                self.error = Some(err);
            },
        }
    }

    /// Skips over leading whitespace (tracking line numbers) and records the byte offset
    /// at which the next token starts.
    ///
    /// On end-of-input this only sets `self.eof` and returns, leaving `token_start`
    /// untouched; `tokenize` will then observe `'\0'` and produce `Token::Eof`.
    #[inline]
    fn advance_start(&mut self) {
        let mut position: usize;
        loop {
            let (pos, c) = self.scanner.read();

            position = pos;

            if c == '\0' {
                self.eof = true;
                return;
            }

            if c.is_whitespace() {
                if c == '\n' {
                    self.line_num += 1;
                }
                self.scanner.advance();
                continue;
            }

            break;
        }

        self.token_start = position;
    }

    /// Consumes the current character, extending `token_end` past it
    #[inline]
    fn pop(&mut self) -> char {
        let (pos, c) = self.scanner.pop();
        self.token_end = pos + c.len_utf8();
        c
    }

    /// Returns the next character without consuming anything
    #[inline]
    fn peek(&mut self) -> char {
        let (_, c) = self.scanner.peek();
        c
    }

    /// Returns the character after next without consuming anything
    #[inline]
    #[allow(unused)]
    fn peek_next(&mut self) -> char {
        let (_, c) = self.scanner.peek_next();
        c
    }

    /// Returns the current character without consuming it
    #[inline]
    fn read(&mut self) -> char {
        let (_, c) = self.scanner.read();
        c
    }

    /// Consumes the current character, discarding it
    #[inline]
    fn skip(&mut self) {
        self.pop();
    }

    /// Get the span for the current token in `Source`.
    #[inline]
    fn span(&self) -> SourceSpan {
        assert!(self.token_start <= self.token_end, "invalid range");
        assert!(self.token_end <= u32::MAX as usize, "file too large");
        SourceSpan::new(self.source_id, (self.token_start as u32)..(self.token_end as u32))
    }

    /// Get a string slice of the input covered by `span` (byte offsets)
    #[inline]
    fn slice_span(&self, span: impl Into<Range<u32>>) -> &'input str {
        let range = span.into();
        self.scanner.slice((range.start as usize)..(range.end as usize))
    }

    /// Get a string slice of the current token.
    #[inline]
    fn slice(&self) -> &'input str {
        self.slice_span(self.span())
    }

    /// Consumes whitespace characters (tracking line numbers) until a
    /// non-whitespace character is the current character
    #[inline]
    fn skip_whitespace(&mut self) {
        let mut c: char;
        loop {
            c = self.read();

            if !c.is_whitespace() {
                break;
            }

            if c == '\n' {
                self.line_num += 1;
            }

            self.skip();
        }
    }

    /// Dispatches on the current character to lex the next token from the input
    fn tokenize(&mut self) -> Result<Token<'input>, ParsingError> {
        let c = self.read();

        // `#!` starts a doc comment, any other `#` starts a line comment
        if c == '#' {
            match self.peek() {
                '!' => {
                    self.skip();
                    self.skip();
                    return self.lex_docs();
                },
                _ => {
                    self.skip();
                    self.skip_comment();
                    return Ok(Token::Comment);
                },
            }
        }

        if c == '\0' {
            self.eof = true;
            return Ok(Token::Eof);
        }

        if c.is_whitespace() {
            self.skip_whitespace();
        }

        match self.read() {
            '@' => pop!(self, Token::At),
            '!' => pop!(self, Token::Bang),
            ':' => match self.peek() {
                ':' => pop2!(self, Token::ColonColon),
                _ => pop!(self, Token::Colon),
            },
            ';' => pop!(self, Token::Semicolon),
            '.' => match self.peek() {
                '.' => pop2!(self, Token::Range),
                _ => pop!(self, Token::Dot),
            },
            ',' => pop!(self, Token::Comma),
            '=' => pop!(self, Token::Equal),
            '<' => pop!(self, Token::Langle),
            '{' => pop!(self, Token::Lbrace),
            '[' => pop!(self, Token::Lbracket),
            '(' => pop!(self, Token::Lparen),
            '>' => pop!(self, Token::Rangle),
            '}' => pop!(self, Token::Rbrace),
            ']' => pop!(self, Token::Rbracket),
            ')' => pop!(self, Token::Rparen),
            '-' => match self.peek() {
                '>' => pop2!(self, Token::Rstab),
                _ => pop!(self, Token::Minus),
            },
            '+' => pop!(self, Token::Plus),
            '/' => match self.peek() {
                '/' => pop2!(self, Token::SlashSlash),
                _ => pop!(self, Token::Slash),
            },
            '*' => pop!(self, Token::Star),
            '$' => self.lex_special_identifier(),
            '"' => self.lex_quoted_identifier_or_string(),
            // `0x`/`0b` select hex and binary literals; any other digit is decimal
            '0' => match self.peek() {
                'x' => {
                    self.skip();
                    self.skip();
                    self.lex_hex()
                },
                'b' => {
                    self.skip();
                    self.skip();
                    self.lex_bin()
                },
                '0'..='9' => self.lex_number(),
                _ => pop!(self, Token::Int(0)),
            },
            '1'..='9' => self.lex_number(),
            'a'..='z' => self.lex_keyword_or_ident(),
            'A'..='Z' => self.lex_identifier(),
            // A leading underscore starts an identifier only if followed by an
            // alphanumeric character
            '_' => match self.peek() {
                c if c.is_ascii_alphanumeric() => self.lex_identifier(),
                _ => Err(ParsingError::InvalidToken { span: self.span() }),
            },
            _ => Err(ParsingError::InvalidToken { span: self.span() }),
        }
    }

    /// Lexes a documentation comment (consecutive `#!`-prefixed lines), accumulating the
    /// trimmed text of each line into one buffer.
    ///
    /// A doc comment starting on the first line of the file is classified as module
    /// documentation; otherwise it documents the form that follows it.
    fn lex_docs(&mut self) -> Result<Token<'input>, ParsingError> {
        let mut buf = String::new();

        let mut c;
        // `token_start + 2` skips past the `#!` prefix already consumed by `tokenize`
        let mut line_start = self.token_start + 2;
        let is_module_doc = self.line_num == 0;
        loop {
            c = self.read();

            if c == '\0' {
                self.eof = true;
                buf.push_str(self.slice_span((line_start as u32)..(self.token_end as u32)).trim());

                // NOTE(review): this recomputes `line_num == 0` instead of reusing
                // `is_module_doc` (captured before the loop, as the newline branch below
                // does), so a multi-line module doc terminated by EOF is classified as
                // Form docs — confirm whether this asymmetry is intentional.
                let is_first_line = self.line_num == 0;
                break Ok(Token::DocComment(if is_first_line {
                    DocumentationType::Module(buf)
                } else {
                    DocumentationType::Form(buf)
                }));
            }

            if c == '\n' {
                self.line_num += 1;

                // Flush the current line into the buffer, trimmed, one line per '\n'
                buf.push_str(self.slice_span((line_start as u32)..(self.token_end as u32)).trim());
                buf.push('\n');

                self.skip();
                c = self.read();
                match c {
                    // The doc comment continues on the next line
                    '#' if self.peek() == '!' => {
                        self.skip();
                        self.skip();
                        line_start = self.token_end;
                        continue;
                    },
                    _ if is_module_doc => {
                        break Ok(Token::DocComment(DocumentationType::Module(buf)));
                    },
                    _ => {
                        break Ok(Token::DocComment(DocumentationType::Form(buf)));
                    },
                }
            }

            self.skip();
        }
    }

    /// Consumes characters until the end of the current line (or end of input)
    fn skip_comment(&mut self) {
        let mut c;
        loop {
            c = self.read();

            if c == '\n' {
                self.skip();
                self.line_num += 1;
                break;
            }

            if c == '\0' {
                self.eof = true;
                break;
            }

            self.skip();
        }
    }

    /// Lexes a lowercase-initial word, resolving it to a keyword token via the
    /// Aho-Corasick searcher, or to `Token::Ident` if it is not a keyword
    fn lex_keyword_or_ident(&mut self) -> Result<Token<'input>, ParsingError> {
        let c = self.pop();
        debug_assert!(c.is_ascii_alphabetic() && c.is_lowercase());

        // Consume the rest of the word: ASCII letters, digits, and underscores
        loop {
            match self.read() {
                '_' | '0'..='9' => self.skip(),
                c if c.is_ascii_alphabetic() => self.skip(),
                _ => break,
            }
        }

        let name = self.slice();
        match name {
            "exp" => {
                // Special handling for the `exp.uXX` tokenization
                if self.read() == '.' && self.peek() == 'u' {
                    pop2!(self, Token::ExpU)
                } else {
                    Ok(Token::Exp)
                }
            },
            _ => Ok(Token::from_keyword_or_ident_with_searcher(name, &self.keywords)),
        }
    }

    /// Lexes a double-quoted token, producing `Token::QuotedIdent` when the contents are
    /// identifier-like (no escapes, only alphanumeric/ASCII-graphic characters), and
    /// `Token::QuotedString` otherwise. The emitted slice excludes both quotes.
    fn lex_quoted_identifier_or_string(&mut self) -> Result<Token<'input>, ParsingError> {
        // Skip quotation mark
        self.skip();

        let mut is_identifier = true;
        let quote_size = ByteOffset::from_char_len('"');
        loop {
            match self.read() {
                // Quotes may not span lines, and must be closed before end of input
                '\0' | '\n' => {
                    break Err(ParsingError::UnclosedQuote {
                        start: SourceSpan::at(self.source_id, self.span().start()),
                    });
                },
                // A backslash escape demotes the token to a quoted string; an escaped
                // quote or newline is consumed so it does not terminate the token
                '\\' => {
                    is_identifier = false;
                    self.skip();
                    match self.read() {
                        '"' | '\n' => {
                            self.skip();
                        },
                        _ => (),
                    }
                },
                '"' => {
                    // Compute the content span before consuming the closing quote, so
                    // that both quotes are excluded from the emitted slice
                    let span = self.span();
                    let start = span.start() + quote_size;
                    let span = SourceSpan::new(self.source_id, start..span.end());

                    self.skip();
                    break Ok(if is_identifier {
                        Token::QuotedIdent(self.slice_span(span))
                    } else {
                        Token::QuotedString(self.slice_span(span))
                    });
                },
                c if c.is_alphanumeric() || c.is_ascii_graphic() => {
                    self.skip();
                },
                _ => {
                    is_identifier = false;
                    self.skip();
                },
            }
        }
    }

    /// Lexes a bare identifier, producing `Token::ConstantIdent` when every letter is an
    /// ASCII uppercase letter (underscores and digits allowed), and `Token::Ident` otherwise
    fn lex_identifier(&mut self) -> Result<Token<'input>, ParsingError> {
        let c = self.pop();
        debug_assert!(c.is_ascii_alphabetic() || c == '_');

        let mut is_constant_ident = c.is_ascii_uppercase() || c == '_';

        loop {
            match self.read() {
                '_' | '0'..='9' => self.skip(),
                c if c.is_ascii_alphabetic() => {
                    // Any lowercase letter demotes this from a constant identifier
                    is_constant_ident &= c.is_ascii_uppercase();
                    self.skip();
                },
                _ => break,
            }
        }

        if is_constant_ident {
            Ok(Token::ConstantIdent(self.slice()))
        } else {
            Ok(Token::Ident(self.slice()))
        }
    }

    /// Lexes a `$`-prefixed identifier; only the reserved names `$kernel`, `$exec` and
    /// `$anon` are valid, anything else is an invalid token
    fn lex_special_identifier(&mut self) -> Result<Token<'input>, ParsingError> {
        let c = self.pop();
        debug_assert_eq!(c, '$');

        loop {
            match self.read() {
                '_' | '0'..='9' => self.skip(),
                c if c.is_ascii_lowercase() => self.skip(),
                _ => break,
            }
        }

        match self.slice() {
            id @ ("$kernel" | "$exec" | "$anon") => Ok(Token::Ident(id)),
            _ => {
                // Point the diagnostic at the `$` that introduced the identifier
                let start = self.span().start();
                let span = SourceSpan::at(self.span().source_id(), start);
                Err(ParsingError::InvalidToken { span })
            },
        }
    }

    /// Lexes a decimal integer literal into `Token::Int`, erroring on u64 overflow
    fn lex_number(&mut self) -> Result<Token<'input>, ParsingError> {
        // Expect the first character to be a digit
        let c = self.read();
        debug_assert!(c.is_ascii_digit());

        while let '0'..='9' = self.read() {
            self.skip();
        }

        self.slice()
            .parse::<u64>()
            .map_err(|error| ParsingError::InvalidLiteral {
                span: self.span(),
                kind: int_error_kind_to_literal_error_kind(
                    error.kind(),
                    LiteralErrorKind::FeltOverflow,
                ),
            })
            .map(Token::Int)
    }

    /// Lexes the digits of a hex literal (the `0x` prefix has already been consumed) and
    /// delegates to [parse_hex] for conversion. Digits must come in pairs (whole bytes).
    fn lex_hex(&mut self) -> Result<Token<'input>, ParsingError> {
        // Expect the first character to be a valid hexadecimal digit
        debug_assert!(self.read().is_ascii_hexdigit());

        loop {
            // If we hit a non-hex digit, we're done
            let c1 = self.read();
            if !c1.is_ascii_hexdigit() {
                break;
            }
            self.skip();

            // All hex-encoded bytes are zero-padded, and thus occur
            // in pairs, if we observe a non-hex digit at this point,
            // it is invalid
            let c2 = self.read();
            if !c2.is_ascii_hexdigit() {
                return Err(ParsingError::InvalidHexLiteral {
                    span: self.span(),
                    kind: HexErrorKind::Invalid,
                });
            }
            self.skip();
        }

        let span = self.span();
        let start = span.start();
        let end = span.end();
        // `start + 2` skips the `0x` prefix, which is included in the token span
        let digit_start = start.to_u32() + 2;
        let span = SourceSpan::new(span.source_id(), start..end);
        parse_hex(span, self.slice_span(digit_start..end.to_u32()))
    }

    /// Lexes the digits of a binary literal (the `0b` prefix has already been consumed)
    /// and delegates to [parse_bin] for conversion to a `Token::BinValue`
    fn lex_bin(&mut self) -> Result<Token<'input>, ParsingError> {
        // Expect the first character to be a valid binary digit
        debug_assert!(is_ascii_binary(self.read()));

        loop {
            // If we hit a non-binary digit, we're done
            let c1 = self.read();
            if !is_ascii_binary(c1) {
                break;
            }
            self.skip();
        }

        let span = self.span();
        let start = span.start();
        // `start + 2` skips the `0b` prefix, which is included in the token span
        let digit_start = start.to_u32() + 2;
        let end = span.end();
        let span = SourceSpan::new(span.source_id(), start..end);
        let value = parse_bin(span, self.slice_span(digit_start..end.to_u32()))?;
        Ok(Token::BinValue(value))
    }
}
597
598impl<'input> Iterator for Lexer<'input> {
599    type Item = Lexed<'input>;
600
601    fn next(&mut self) -> Option<Self::Item> {
602        let mut res = self.lex();
603        while let Some(Ok((_, Token::Comment, _))) = res {
604            res = self.lex();
605        }
606        res
607    }
608}
609
610// HELPER FUNCTIONS
611// ================================================================================================
612
/// Parses the digits of a hex-encoded literal (without its `0x` prefix) into a [Token].
///
/// Up to 16 digits (an even count, i.e. whole bytes) are parsed as a single field element,
/// emitted via `Token::HexValue` in its smallest integer representation. Exactly 64 digits
/// are parsed as a word of 4 field elements, emitted via `Token::HexWord`. All other digit
/// counts are invalid.
fn parse_hex<'input>(
    span: SourceSpan,
    hex_digits: &'input str,
) -> Result<Token<'input>, ParsingError> {
    use miden_core::{FieldElement, StarkField};
    match hex_digits.len() {
        // Felt
        n if n <= 16 && n.is_multiple_of(2) => {
            let value = u64::from_str_radix(hex_digits, 16).map_err(|error| {
                ParsingError::InvalidLiteral {
                    span,
                    kind: int_error_kind_to_literal_error_kind(
                        error.kind(),
                        LiteralErrorKind::FeltOverflow,
                    ),
                }
            })?;
            // The parsed value must be a canonical field element
            if value >= Felt::MODULUS {
                return Err(ParsingError::InvalidLiteral {
                    span,
                    kind: LiteralErrorKind::FeltOverflow,
                });
            }
            Ok(Token::HexValue(shrink_u64_hex(value)))
        },
        // Word
        64 => {
            let mut word = [Felt::ZERO; 4];
            for (index, element) in word.iter_mut().enumerate() {
                // Each element is encoded as 16 hex digits (8 bytes), and the bytes are
                // interpreted as a little-endian u64
                let offset = index * 16;
                let mut felt_bytes = [0u8; 8];
                let digits = &hex_digits[offset..(offset + 16)];
                for (byte_idx, byte) in felt_bytes.iter_mut().enumerate() {
                    let byte_str = &digits[(byte_idx * 2)..((byte_idx * 2) + 2)];
                    *byte = u8::from_str_radix(byte_str, 16).map_err(|error| {
                        ParsingError::InvalidLiteral {
                            span,
                            kind: int_error_kind_to_literal_error_kind(
                                error.kind(),
                                LiteralErrorKind::FeltOverflow,
                            ),
                        }
                    })?;
                }
                let value = u64::from_le_bytes(felt_bytes);
                // Each of the word's elements must be a canonical field element
                if value >= Felt::MODULUS {
                    return Err(ParsingError::InvalidLiteral {
                        span,
                        kind: LiteralErrorKind::FeltOverflow,
                    });
                }
                *element = Felt::new(value);
            }
            Ok(Token::HexWord(WordValue(word)))
        },
        // Invalid
        n if n > 64 => Err(ParsingError::InvalidHexLiteral { span, kind: HexErrorKind::TooLong }),
        n if !n.is_multiple_of(2) && n < 64 => {
            Err(ParsingError::InvalidHexLiteral { span, kind: HexErrorKind::MissingDigits })
        },
        // Even lengths between 16 and 64 (exclusive) fall through to here
        _ => Err(ParsingError::InvalidHexLiteral { span, kind: HexErrorKind::Invalid }),
    }
}
676
677fn parse_bin(span: SourceSpan, bin_digits: &str) -> Result<BinEncodedValue, ParsingError> {
678    if bin_digits.len() <= 32 {
679        let value =
680            u32::from_str_radix(bin_digits, 2).map_err(|error| ParsingError::InvalidLiteral {
681                span,
682                kind: int_error_kind_to_literal_error_kind(
683                    error.kind(),
684                    LiteralErrorKind::U32Overflow,
685                ),
686            })?;
687        Ok(shrink_u32_bin(value))
688    } else {
689        Err(ParsingError::InvalidBinaryLiteral { span, kind: BinErrorKind::TooLong })
690    }
691}
692
/// Returns true if `c` is a binary digit, i.e. `'0'` or `'1'`
#[inline(always)]
fn is_ascii_binary(c: char) -> bool {
    c == '0' || c == '1'
}
697
698#[inline]
699pub fn shrink_u64_hex(n: u64) -> IntValue {
700    if n <= (u8::MAX as u64) {
701        IntValue::U8(n as u8)
702    } else if n <= (u16::MAX as u64) {
703        IntValue::U16(n as u16)
704    } else if n <= (u32::MAX as u64) {
705        IntValue::U32(n as u32)
706    } else {
707        IntValue::Felt(Felt::new(n))
708    }
709}
710
711#[inline]
712fn shrink_u32_bin(n: u32) -> BinEncodedValue {
713    if n <= (u8::MAX as u32) {
714        BinEncodedValue::U8(n as u8)
715    } else if n <= (u16::MAX as u32) {
716        BinEncodedValue::U16(n as u16)
717    } else {
718        BinEncodedValue::U32(n)
719    }
720}
721
722#[inline]
723fn int_error_kind_to_literal_error_kind(
724    kind: &IntErrorKind,
725    overflow: LiteralErrorKind,
726) -> LiteralErrorKind {
727    match kind {
728        IntErrorKind::Empty => LiteralErrorKind::Empty,
729        IntErrorKind::InvalidDigit => LiteralErrorKind::InvalidDigit,
730        IntErrorKind::PosOverflow | IntErrorKind::NegOverflow => overflow,
731        _ => unreachable!(),
732    }
733}