// miden_assembly_syntax/parser/lexer.rs
1use alloc::{borrow::Cow, string::String};
2use core::{num::IntErrorKind, ops::Range};
3
4use miden_debug_types::{ByteOffset, SourceId, SourceSpan};
5
6use super::{
7    BinEncodedValue, BinErrorKind, DocumentationType, HexErrorKind, IntValue, LiteralErrorKind,
8    ParsingError, Scanner, Token, WordValue,
9};
10use crate::Felt;
11
/// The value produced by the [Lexer] when iterated
///
/// A successfully lexed token is wrapped in a tuple with the start and end byte offsets, where
/// the end offset is exclusive. We explicitly use a tuple here, and not something like Span<T>,
/// because this "triple" is the structure expected by the LALRPOP parser generator when used with
/// a custom lexer like ours. A failed lexing step yields the [ParsingError] describing the
/// problem instead.
pub type Lexed<'input> = Result<(u32, Token<'input>, u32), ParsingError>;
19
/// Pops a single token from the [Lexer]
///
/// The one-argument form simply consumes the current character; the two-argument
/// form consumes it and evaluates to `Ok($token)`, for use as the value of a
/// match arm in `tokenize`.
macro_rules! pop {
    ($lex:ident) => {{
        $lex.skip();
    }};
    ($lex:ident, $token:expr) => {{
        $lex.skip();
        Ok($token)
    }};
}
30
/// Pops two tokens from the [Lexer]
///
/// Like [pop!], but consumes two characters — used for two-character tokens
/// such as `::`, `..`, `->`, and `//`. The two-argument form evaluates to
/// `Ok($token)`.
macro_rules! pop2 {
    ($lex:ident) => {{
        $lex.skip();
        $lex.skip();
    }};
    ($lex:ident, $token:expr) => {{
        $lex.skip();
        $lex.skip();
        Ok($token)
    }};
}
43
44// LEXER
45// ================================================================================================
46
47/// The lexer that is used to perform lexical analysis Miden Assembly grammar. The lexer implements
48/// the `Iterator` trait, so in order to retrieve the tokens, you simply have to iterate over it.
49///
50/// # Errors
51///
52/// Because the lexer is implemented as an iterator over tokens, this means that you can continue
53/// to get tokens even if a lexical error occurs. The lexer will attempt to recover from an error
54/// by injecting tokens it expects.
55///
56/// If an error is unrecoverable, the lexer will continue to produce tokens, but there is no
57/// guarantee that parsing them will produce meaningful results, it is primarily to assist in
58/// gathering as many errors as possible.
pub struct Lexer<'input> {
    /// The [SourceId] of the file being lexed, for use in producing spans in lexer diagnostics
    source_id: SourceId,

    /// The scanner produces a sequence of chars + location, and can be controlled
    /// The location type is usize
    scanner: Scanner<'input>,

    /// The most recent token to be lexed.
    /// At the start and end, this should be Token::Eof
    token: Token<'input>,

    /// The position in the input where the current token starts
    /// At the start this will be the byte index of the beginning of the input
    token_start: usize,

    /// The position in the input where the current token ends
    /// At the start this will be the byte index of the beginning of the input
    token_end: usize,

    /// The current line number
    line_num: usize,

    /// When we have reached true Eof, this gets set to true, and the only token
    /// produced after that point is Token::Eof, or None, depending on how you are
    /// consuming the lexer
    eof: bool,
    /// Set once the trailing Token::Eof has been emitted by `lex`, after which
    /// `lex` returns None
    empty: bool,

    // A DFA for keyword matching
    keywords: aho_corasick::AhoCorasick,

    /// If an error occurs during tokenization, it is held here
    error: Option<ParsingError>,
}
94
95impl<'input> Lexer<'input> {
96    /// Produces an instance of the lexer with the lexical analysis to be performed on the `input`
97    /// string. Note that no lexical analysis occurs until the lexer has been iterated over.
98    pub fn new(source_id: SourceId, scanner: Scanner<'input>) -> Self {
99        let start = scanner.start();
100        let keywords = Token::keyword_searcher();
101        let mut lexer = Self {
102            source_id,
103            scanner,
104            token: Token::Eof,
105            token_start: start,
106            token_end: start,
107            line_num: 0,
108            eof: false,
109            empty: false,
110            keywords,
111            error: None,
112        };
113        lexer.advance();
114        lexer
115    }
116
    /// Produce the next lexed item, advancing the lexer.
    ///
    /// A pending error recorded by `advance` is returned first. Once true
    /// end-of-input has been reached, a single `Token::Eof` is emitted, and
    /// every call after that returns `None`.
    pub fn lex(&mut self) -> Option<<Self as Iterator>::Item> {
        if let Some(err) = self.error.take() {
            return Some(Err(err));
        }

        if self.eof && matches!(self.token, Token::Eof) {
            // Emit a single Eof token at the end, then None after
            if self.empty {
                return None;
            } else {
                self.empty = true;
                let end = self.token_end as u32;
                return Some(Ok((end, Token::Eof, end)));
            }
        }

        // Yield the buffered token, and eagerly lex its successor
        let token = core::mem::replace(&mut self.token, Token::Eof);
        let start = self.token_start;
        let end = self.token_end;
        self.advance();
        Some(Ok((start as u32, token, end as u32)))
    }
139
140    fn advance(&mut self) {
141        self.advance_start();
142        match self.tokenize() {
143            Ok(tok) => {
144                self.token = tok;
145            },
146            Err(err) => {
147                self.error = Some(err);
148            },
149        }
150    }
151
    /// Skip over whitespace (tracking newlines for line numbering) until the
    /// start of the next token, recording that position as both the start and
    /// end of the current token.
    ///
    /// If end-of-input is reached while skipping, `eof` is set and the token
    /// positions are left untouched.
    #[inline]
    fn advance_start(&mut self) {
        let mut position: usize;
        loop {
            let (pos, c) = self.scanner.read();

            position = pos;

            // The scanner yields '\0' at end-of-input
            if c == '\0' {
                self.eof = true;
                return;
            }

            if c.is_whitespace() {
                if c == '\n' {
                    self.line_num += 1;
                }
                self.scanner.advance();
                continue;
            }

            break;
        }

        self.token_start = position;
        self.token_end = position;
    }
179
180    #[inline]
181    fn pop(&mut self) -> char {
182        let (pos, c) = self.scanner.pop();
183        self.token_end = pos + c.len_utf8();
184        c
185    }
186
187    #[inline]
188    fn peek(&mut self) -> char {
189        let (_, c) = self.scanner.peek();
190        c
191    }
192
193    #[inline]
194    #[expect(unused)]
195    fn peek_next(&mut self) -> char {
196        let (_, c) = self.scanner.peek_next();
197        c
198    }
199
200    #[inline]
201    fn read(&mut self) -> char {
202        let (_, c) = self.scanner.read();
203        c
204    }
205
206    #[inline]
207    fn skip(&mut self) {
208        self.pop();
209    }
210
211    /// Get the span for the current token in `Source`.
212    #[inline]
213    fn span(&self) -> SourceSpan {
214        assert!(self.token_start <= self.token_end, "invalid range");
215        assert!(self.token_end <= u32::MAX as usize, "file too large");
216        SourceSpan::new(self.source_id, (self.token_start as u32)..(self.token_end as u32))
217    }
218
219    #[inline]
220    fn char_span(&self, pos: usize, c: char) -> SourceSpan {
221        let end = pos + c.len_utf8();
222        SourceSpan::new(self.source_id, (pos as u32)..(end as u32))
223    }
224
225    #[inline]
226    fn slice_span(&self, span: impl Into<Range<u32>>) -> &'input str {
227        let range = span.into();
228        self.scanner.slice((range.start as usize)..(range.end as usize))
229    }
230
231    /// Get a string slice of the current token.
232    #[inline]
233    fn slice(&self) -> &'input str {
234        self.slice_span(self.span())
235    }
236
237    #[inline]
238    fn skip_whitespace(&mut self) {
239        let mut c: char;
240        loop {
241            c = self.read();
242
243            if !c.is_whitespace() {
244                break;
245            }
246
247            if c == '\n' {
248                self.line_num += 1;
249            }
250
251            self.skip();
252        }
253    }
254
    /// Lex the next token from the input, or return an error describing why a
    /// token could not be produced.
    ///
    /// Line comments are lexed as [Token::Comment] (filtered out by the
    /// `Iterator` impl), and `#!` doc comments as [Token::DocComment].
    fn tokenize(&mut self) -> Result<Token<'input>, ParsingError> {
        let c = self.read();

        // '#' starts either a doc comment ("#!") or a regular line comment
        if c == '#' {
            match self.peek() {
                '!' => {
                    self.skip();
                    self.skip();
                    return self.lex_docs();
                },
                _ => {
                    self.skip();
                    self.skip_comment();
                    return Ok(Token::Comment);
                },
            }
        }

        // The scanner yields '\0' at end-of-input
        if c == '\0' {
            self.eof = true;
            return Ok(Token::Eof);
        }

        if c.is_whitespace() {
            self.skip_whitespace();
        }

        let pos = self.token_start;
        let c = self.read();
        match c {
            '@' => pop!(self, Token::At),
            '!' => pop!(self, Token::Bang),
            ':' => match self.peek() {
                ':' => pop2!(self, Token::ColonColon),
                _ => pop!(self, Token::Colon),
            },
            ';' => pop!(self, Token::Semicolon),
            '.' => match self.peek() {
                '.' => pop2!(self, Token::Range),
                _ => pop!(self, Token::Dot),
            },
            ',' => pop!(self, Token::Comma),
            '=' => pop!(self, Token::Equal),
            '<' => pop!(self, Token::Langle),
            '{' => pop!(self, Token::Lbrace),
            '[' => pop!(self, Token::Lbracket),
            '(' => pop!(self, Token::Lparen),
            '>' => pop!(self, Token::Rangle),
            '}' => pop!(self, Token::Rbrace),
            ']' => pop!(self, Token::Rbracket),
            ')' => pop!(self, Token::Rparen),
            '-' => match self.peek() {
                '>' => pop2!(self, Token::Rstab),
                _ => pop!(self, Token::Minus),
            },
            '+' => pop!(self, Token::Plus),
            '/' => match self.peek() {
                '/' => pop2!(self, Token::SlashSlash),
                _ => pop!(self, Token::Slash),
            },
            '*' => pop!(self, Token::Star),
            '$' => self.lex_special_identifier(),
            '"' => self.lex_quoted_identifier_or_string(),
            // A leading '0' may start a hex/binary literal, a multi-digit
            // number, or stand alone as the literal zero
            '0' => match self.peek() {
                'x' => {
                    self.skip();
                    self.skip();
                    self.lex_hex()
                },
                'b' => {
                    self.skip();
                    self.skip();
                    self.lex_bin()
                },
                '0'..='9' => self.lex_number(),
                _ => pop!(self, Token::Int(0)),
            },
            '1'..='9' => self.lex_number(),
            'a'..='z' => self.lex_keyword_or_ident(),
            'A'..='Z' => self.lex_identifier(),
            // A bare '_' is only valid when followed by an alphanumeric tail
            '_' => match self.peek() {
                c if c.is_ascii_alphanumeric() => self.lex_identifier(),
                _ => Err(ParsingError::InvalidToken { span: self.char_span(pos, c) }),
            },
            _ => Err(ParsingError::InvalidToken { span: self.char_span(pos, c) }),
        }
    }
342
    /// Lex a documentation comment, i.e. one or more consecutive lines prefixed
    /// with `#!`.
    ///
    /// Assumes the leading `#!` has already been consumed. All consecutive `#!`
    /// lines are accumulated into a single [Token::DocComment]; a doc comment
    /// starting on the first line of the file is treated as module docs.
    fn lex_docs(&mut self) -> Result<Token<'input>, ParsingError> {
        let mut buf = String::new();

        let mut c;
        // Offset of the first content byte of the current line (just past "#!")
        let mut line_start = self.token_start + 2;
        let is_module_doc = self.line_num == 0;
        loop {
            c = self.read();

            if c == '\0' {
                self.eof = true;
                // Flush the final (unterminated) line before classifying
                buf.push_str(self.slice_span((line_start as u32)..(self.token_end as u32)).trim());

                let is_first_line = self.line_num == 0;
                break Ok(Token::DocComment(if is_first_line {
                    DocumentationType::Module(buf)
                } else {
                    DocumentationType::Form(buf)
                }));
            }

            if c == '\n' {
                self.line_num += 1;

                // Flush the completed line, trimming surrounding whitespace
                buf.push_str(self.slice_span((line_start as u32)..(self.token_end as u32)).trim());
                buf.push('\n');

                self.skip();
                c = self.read();
                match c {
                    // Another "#!" line: the doc comment continues
                    '#' if self.peek() == '!' => {
                        self.skip();
                        self.skip();
                        line_start = self.token_end;
                        continue;
                    },
                    _ if is_module_doc => {
                        break Ok(Token::DocComment(DocumentationType::Module(buf)));
                    },
                    _ => {
                        break Ok(Token::DocComment(DocumentationType::Form(buf)));
                    },
                }
            }

            self.skip();
        }
    }
391
392    fn skip_comment(&mut self) {
393        let mut c;
394        loop {
395            c = self.read();
396
397            if c == '\n' {
398                self.skip();
399                self.line_num += 1;
400                break;
401            }
402
403            if c == '\0' {
404                self.eof = true;
405                break;
406            }
407
408            self.skip();
409        }
410    }
411
    /// Lex a keyword or identifier starting with a lowercase ASCII letter.
    ///
    /// Assumes the lexer is positioned at that first character. Keyword
    /// recognition is delegated to the precomputed Aho-Corasick searcher, with
    /// `exp` handled specially so that `exp.uXX` lexes as [Token::ExpU].
    fn lex_keyword_or_ident(&mut self) -> Result<Token<'input>, ParsingError> {
        let c = self.pop();
        debug_assert!(c.is_ascii_alphabetic() && c.is_lowercase());

        // Consume the remainder of the identifier: letters, digits, underscores
        loop {
            match self.read() {
                '_' | '0'..='9' => self.skip(),
                c if c.is_ascii_alphabetic() => self.skip(),
                _ => break,
            }
        }

        let name = self.slice();
        match name {
            "exp" => {
                // Special handling for the `exp.uXX` tokenization
                if self.read() == '.' && self.peek() == 'u' {
                    pop2!(self, Token::ExpU)
                } else {
                    Ok(Token::Exp)
                }
            },
            _ => Ok(Token::from_keyword_or_ident_with_searcher(name, &self.keywords)),
        }
    }
437
    /// Lex a double-quoted identifier or string literal.
    ///
    /// Assumes the lexer is positioned at the opening quote. The token is a
    /// [Token::QuotedIdent] while every character is alphanumeric or ASCII
    /// graphic and no escape sequences appear; otherwise it becomes a
    /// [Token::QuotedString]. Quoted tokens may not span lines.
    fn lex_quoted_identifier_or_string(&mut self) -> Result<Token<'input>, ParsingError> {
        // Skip quotation mark
        self.skip();

        let mut is_identifier = true;
        let quote_size = ByteOffset::from_char_len('"');
        loop {
            match self.read() {
                // Unterminated quote: end-of-input or newline before the closing '"'
                '\0' | '\n' => {
                    break Err(ParsingError::UnclosedQuote {
                        start: SourceSpan::at(self.source_id, self.span().start()),
                    });
                },
                '\\' => {
                    // Escapes are only valid in strings, not identifiers
                    is_identifier = false;
                    self.skip();
                    // Consume an escaped quote or newline so it does not
                    // terminate the token
                    match self.read() {
                        '"' | '\n' => {
                            self.skip();
                        },
                        _ => (),
                    }
                },
                '"' => {
                    // Closing quote: slice out the content between the quotes
                    let span = self.span();
                    let start = span.start() + quote_size;
                    let span = SourceSpan::new(self.source_id, start..span.end());

                    self.skip();
                    break Ok(if is_identifier {
                        Token::QuotedIdent(self.slice_span(span))
                    } else {
                        Token::QuotedString(self.slice_span(span))
                    });
                },
                c if c.is_alphanumeric() || c.is_ascii_graphic() => {
                    self.skip();
                },
                _ => {
                    // Any other character demotes the token to a string
                    is_identifier = false;
                    self.skip();
                },
            }
        }
    }
483
484    fn lex_identifier(&mut self) -> Result<Token<'input>, ParsingError> {
485        let c = self.pop();
486        debug_assert!(c.is_ascii_alphabetic() || c == '_');
487
488        let mut is_constant_ident = c.is_ascii_uppercase() || c == '_';
489
490        loop {
491            match self.read() {
492                '_' | '0'..='9' => self.skip(),
493                c if c.is_ascii_alphabetic() => {
494                    is_constant_ident &= c.is_ascii_uppercase();
495                    self.skip();
496                },
497                _ => break,
498            }
499        }
500
501        if is_constant_ident {
502            Ok(Token::ConstantIdent(self.slice()))
503        } else {
504            Ok(Token::Ident(self.slice()))
505        }
506    }
507
508    fn lex_special_identifier(&mut self) -> Result<Token<'input>, ParsingError> {
509        let c = self.pop();
510        debug_assert_eq!(c, '$');
511
512        loop {
513            match self.read() {
514                '_' | '0'..='9' => self.skip(),
515                c if c.is_ascii_lowercase() => self.skip(),
516                _ => break,
517            }
518        }
519
520        match self.slice() {
521            id @ ("$kernel" | "$exec") => Ok(Token::Ident(id)),
522            _ => {
523                let start = self.span().start();
524                let span = SourceSpan::at(self.span().source_id(), start);
525                Err(ParsingError::InvalidToken { span })
526            },
527        }
528    }
529
530    fn lex_number(&mut self) -> Result<Token<'input>, ParsingError> {
531        // Expect the first character to be a digit
532        let c = self.read();
533        debug_assert!(c.is_ascii_digit());
534
535        while let '0'..='9' = self.read() {
536            self.skip();
537        }
538
539        self.slice()
540            .parse::<u64>()
541            .map_err(|error| ParsingError::InvalidLiteral {
542                span: self.span(),
543                kind: int_error_kind_to_literal_error_kind(
544                    error.kind(),
545                    LiteralErrorKind::FeltOverflow,
546                ),
547            })
548            .map(Token::Int)
549    }
550
    /// Lex a hex-encoded literal, e.g. `0xdeadbeef`.
    ///
    /// Assumes the `0x` prefix has already been consumed and the lexer is
    /// positioned at the first hex digit. Digits are consumed in pairs since
    /// hex-encoded bytes are zero-padded; an unpaired trailing digit is left
    /// out of the consumed pair and handled by `parse_hex` via padding.
    fn lex_hex(&mut self) -> Result<Token<'input>, ParsingError> {
        // Expect the first character to be a valid hexadecimal digit
        debug_assert!(self.read().is_ascii_hexdigit());

        loop {
            // If we hit a non-hex digit, we're done
            let c1 = self.read();
            if !c1.is_ascii_hexdigit() {
                break;
            }
            self.skip();

            // All hex-encoded bytes are zero-padded, and thus occur
            // in pairs, if we observe a non-hex digit at this point,
            // it is invalid
            let c2 = self.read();
            if !c2.is_ascii_hexdigit() {
                // For odd-length hex strings, we need to handle this by
                // adjusting the span to not include the unpaired character
                // and let parse_hex handle the padding
                break;
            }
            self.skip();
        }

        // The token span includes the "0x" prefix, but only the digits
        // (two bytes in) are passed to parse_hex
        let span = self.span();
        let start = span.start();
        let end = span.end();
        let digit_start = start.to_u32() + 2;
        let span = SourceSpan::new(span.source_id(), start..end);
        parse_hex(span, self.slice_span(digit_start..end.to_u32()))
    }
583
584    fn lex_bin(&mut self) -> Result<Token<'input>, ParsingError> {
585        // Expect the first character to be a valid binary digit
586        debug_assert!(is_ascii_binary(self.read()));
587
588        loop {
589            // If we hit a non-binary digit, we're done
590            let c1 = self.read();
591            if !is_ascii_binary(c1) {
592                break;
593            }
594            self.skip();
595        }
596
597        let span = self.span();
598        let start = span.start();
599        let digit_start = start.to_u32() + 2;
600        let end = span.end();
601        let span = SourceSpan::new(span.source_id(), start..end);
602        let value = parse_bin(span, self.slice_span(digit_start..end.to_u32()))?;
603        Ok(Token::BinValue(value))
604    }
605}
606
607impl<'input> Iterator for Lexer<'input> {
608    type Item = Lexed<'input>;
609
610    fn next(&mut self) -> Option<Self::Item> {
611        let mut res = self.lex();
612        while let Some(Ok((_, Token::Comment, _))) = res {
613            res = self.lex();
614        }
615        res
616    }
617}
618
619// HELPER FUNCTIONS
620// ================================================================================================
621
/// Ensure `hex` has an even number of digits by prepending a `0` when needed.
///
/// Even-length input is borrowed unchanged; odd-length input is copied into a
/// new string with a single leading zero.
fn pad_hex_if_needed<'a>(hex: &'a str) -> Cow<'a, str> {
    if hex.len() % 2 != 0 {
        // Odd length: allocate once, with exact capacity, and zero-pad
        let mut padded = String::with_capacity(hex.len() + 1);
        padded.push('0');
        padded.push_str(hex);
        Cow::Owned(padded)
    } else {
        Cow::Borrowed(hex)
    }
}
633
/// Parse the digit portion of a hex-encoded literal into a [Token].
///
/// `span` covers the entire literal (including the `0x` prefix) and is used
/// only for error reporting. Up to 16 digits parse as a single field element
/// ([Token::HexValue]); exactly 64 digits parse as a word of four field
/// elements, each encoded as 16 little-endian hex digits ([Token::HexWord]).
/// Any other length is rejected as an invalid hex literal.
fn parse_hex<'input>(
    span: SourceSpan,
    hex_digits: &'input str,
) -> Result<Token<'input>, ParsingError> {
    use miden_core::{Felt, field::PrimeField64};

    // Handle odd-length hex strings by padding with a leading zero
    let hex_digits = pad_hex_if_needed(hex_digits);

    match hex_digits.len() {
        // Felt - allow all even lengths up to 16
        n if n <= 16 && n.is_multiple_of(2) => {
            let value = u64::from_str_radix(&hex_digits, 16).map_err(|error| {
                ParsingError::InvalidLiteral {
                    span,
                    kind: int_error_kind_to_literal_error_kind(
                        error.kind(),
                        LiteralErrorKind::FeltOverflow,
                    ),
                }
            })?;
            // The value must be a canonical field element
            if value >= Felt::ORDER_U64 {
                return Err(ParsingError::InvalidLiteral {
                    span,
                    kind: LiteralErrorKind::FeltOverflow,
                });
            }
            Ok(Token::HexValue(shrink_u64_hex(value)))
        },
        // Word
        64 => {
            let mut word = [Felt::ZERO; 4];
            for (index, element) in word.iter_mut().enumerate() {
                // Each element occupies 16 hex digits (8 bytes)
                let offset = index * 16;
                let mut felt_bytes = [0u8; 8];
                let digits = &hex_digits[offset..(offset + 16)];
                for (byte_idx, byte) in felt_bytes.iter_mut().enumerate() {
                    // Decode one zero-padded byte (two hex digits) at a time
                    let byte_str = &digits[(byte_idx * 2)..((byte_idx * 2) + 2)];
                    *byte = u8::from_str_radix(byte_str, 16).map_err(|error| {
                        ParsingError::InvalidLiteral {
                            span,
                            kind: int_error_kind_to_literal_error_kind(
                                error.kind(),
                                LiteralErrorKind::FeltOverflow,
                            ),
                        }
                    })?;
                }
                // Bytes are interpreted little-endian, and each element must be
                // a canonical field element
                let value = u64::from_le_bytes(felt_bytes);
                if value >= Felt::ORDER_U64 {
                    return Err(ParsingError::InvalidLiteral {
                        span,
                        kind: LiteralErrorKind::FeltOverflow,
                    });
                }
                *element = Felt::new(value);
            }
            Ok(Token::HexWord(WordValue(word)))
        },
        // Invalid
        n if n > 64 => Err(ParsingError::InvalidHexLiteral { span, kind: HexErrorKind::TooLong }),
        _ => Err(ParsingError::InvalidHexLiteral { span, kind: HexErrorKind::Invalid }),
    }
}
698
699fn parse_bin(span: SourceSpan, bin_digits: &str) -> Result<BinEncodedValue, ParsingError> {
700    if bin_digits.len() <= 32 {
701        let value =
702            u32::from_str_radix(bin_digits, 2).map_err(|error| ParsingError::InvalidLiteral {
703                span,
704                kind: int_error_kind_to_literal_error_kind(
705                    error.kind(),
706                    LiteralErrorKind::U32Overflow,
707                ),
708            })?;
709        Ok(shrink_u32_bin(value))
710    } else {
711        Err(ParsingError::InvalidBinaryLiteral { span, kind: BinErrorKind::TooLong })
712    }
713}
714
/// Returns true if `c` is an ASCII binary digit, i.e. `0` or `1`.
#[inline(always)]
fn is_ascii_binary(c: char) -> bool {
    c == '0' || c == '1'
}
719
720#[inline]
721pub fn shrink_u64_hex(n: u64) -> IntValue {
722    if n <= (u8::MAX as u64) {
723        IntValue::U8(n as u8)
724    } else if n <= (u16::MAX as u64) {
725        IntValue::U16(n as u16)
726    } else if n <= (u32::MAX as u64) {
727        IntValue::U32(n as u32)
728    } else {
729        IntValue::Felt(Felt::new(n))
730    }
731}
732
733#[inline]
734fn shrink_u32_bin(n: u32) -> BinEncodedValue {
735    if n <= (u8::MAX as u32) {
736        BinEncodedValue::U8(n as u8)
737    } else if n <= (u16::MAX as u32) {
738        BinEncodedValue::U16(n as u16)
739    } else {
740        BinEncodedValue::U32(n)
741    }
742}
743
744#[inline]
745fn int_error_kind_to_literal_error_kind(
746    kind: &IntErrorKind,
747    overflow: LiteralErrorKind,
748) -> LiteralErrorKind {
749    match kind {
750        IntErrorKind::Empty => LiteralErrorKind::Empty,
751        IntErrorKind::InvalidDigit => LiteralErrorKind::InvalidDigit,
752        IntErrorKind::PosOverflow | IntErrorKind::NegOverflow => overflow,
753        _ => unreachable!(),
754    }
755}