// miden_assembly_syntax/parser/lexer.rs
1use alloc::{borrow::Cow, string::String};
2use core::{num::IntErrorKind, ops::Range};
3
4use miden_core::field::PrimeCharacteristicRing;
5use miden_debug_types::{ByteOffset, SourceId, SourceSpan};
6
7use super::{
8    BinEncodedValue, BinErrorKind, DocumentationType, HexErrorKind, IntValue, LiteralErrorKind,
9    ParsingError, Scanner, Token, WordValue,
10};
11use crate::Felt;
12
/// The value produced by the [Lexer] when iterated
///
/// A successfully lexed token is wrapped in a tuple with the start and end byte offsets, where
/// the end offset is exclusive. We explicitly use a tuple here, and not something like Span<T>,
/// because this "triple" is the structure expected by the LALRPOP parser generator when used with
/// a custom lexer like ours.
///
/// Tokenization errors are surfaced inline in the iterator stream rather than terminating it,
/// so consumers can gather multiple diagnostics in one pass (see the [Lexer] docs).
pub type Lexed<'input> = Result<(u32, Token<'input>, u32), ParsingError>;
20
/// Pops a single token from the [Lexer]
///
/// The single-argument form just consumes one character; the two-argument
/// form additionally yields `Ok($token)` so a match arm can produce the
/// lexed token directly.
macro_rules! pop {
    ($lex:ident) => {{
        $lex.skip();
    }};
    ($lex:ident, $token:expr) => {{
        $lex.skip();
        Ok($token)
    }};
}
31
/// Pops two tokens from the [Lexer]
///
/// Used for two-character tokens (e.g. `::`, `..`, `->`, `//`): both
/// characters are consumed, and the two-argument form yields `Ok($token)`.
macro_rules! pop2 {
    ($lex:ident) => {{
        $lex.skip();
        $lex.skip();
    }};
    ($lex:ident, $token:expr) => {{
        $lex.skip();
        $lex.skip();
        Ok($token)
    }};
}
44
45// LEXER
46// ================================================================================================
47
/// The lexer that is used to perform lexical analysis Miden Assembly grammar. The lexer implements
/// the `Iterator` trait, so in order to retrieve the tokens, you simply have to iterate over it.
///
/// # Errors
///
/// Because the lexer is implemented as an iterator over tokens, this means that you can continue
/// to get tokens even if a lexical error occurs. The lexer will attempt to recover from an error
/// by injecting tokens it expects.
///
/// If an error is unrecoverable, the lexer will continue to produce tokens, but there is no
/// guarantee that parsing them will produce meaningful results, it is primarily to assist in
/// gathering as many errors as possible.
pub struct Lexer<'input> {
    /// The [SourceId] of the file being lexed, for use in producing spans in lexer diagnostics
    source_id: SourceId,

    /// The scanner produces a sequence of chars + location, and can be controlled
    /// The location type is usize
    scanner: Scanner<'input>,

    /// The most recent token to be lexed.
    /// At the start and end, this should be Token::Eof
    token: Token<'input>,

    /// The position in the input where the current token starts
    /// At the start this will be the byte index of the beginning of the input
    token_start: usize,

    /// The position in the input where the current token ends
    /// At the start this will be the byte index of the beginning of the input
    token_end: usize,

    /// The current line number (zero-based; used to detect module-level doc comments)
    line_num: usize,

    /// When we have reached true Eof, this gets set to true, and the only token
    /// produced after that point is Token::Eof, or None, depending on how you are
    /// consuming the lexer
    eof: bool,
    /// Set once the final Token::Eof has been emitted, so `lex` returns None thereafter
    empty: bool,

    // A DFA for keyword matching
    keywords: aho_corasick::AhoCorasick,

    /// If an error occurs during tokenization, it is held here
    error: Option<ParsingError>,
}
95
96impl<'input> Lexer<'input> {
97    /// Produces an instance of the lexer with the lexical analysis to be performed on the `input`
98    /// string. Note that no lexical analysis occurs until the lexer has been iterated over.
99    pub fn new(source_id: SourceId, scanner: Scanner<'input>) -> Self {
100        let start = scanner.start();
101        let keywords = Token::keyword_searcher();
102        let mut lexer = Self {
103            source_id,
104            scanner,
105            token: Token::Eof,
106            token_start: start,
107            token_end: start,
108            line_num: 0,
109            eof: false,
110            empty: false,
111            keywords,
112            error: None,
113        };
114        lexer.advance();
115        lexer
116    }
117
118    pub fn lex(&mut self) -> Option<<Self as Iterator>::Item> {
119        if let Some(err) = self.error.take() {
120            return Some(Err(err));
121        }
122
123        if self.eof && matches!(self.token, Token::Eof) {
124            // Emit a single Eof token at the end, then None after
125            if self.empty {
126                return None;
127            } else {
128                self.empty = true;
129                let end = self.token_end as u32;
130                return Some(Ok((end, Token::Eof, end)));
131            }
132        }
133
134        let token = core::mem::replace(&mut self.token, Token::Eof);
135        let start = self.token_start;
136        let end = self.token_end;
137        self.advance();
138        Some(Ok((start as u32, token, end as u32)))
139    }
140
141    fn advance(&mut self) {
142        self.advance_start();
143        match self.tokenize() {
144            Ok(tok) => {
145                self.token = tok;
146            },
147            Err(err) => {
148                self.error = Some(err);
149            },
150        }
151    }
152
153    #[inline]
154    fn advance_start(&mut self) {
155        let mut position: usize;
156        loop {
157            let (pos, c) = self.scanner.read();
158
159            position = pos;
160
161            if c == '\0' {
162                self.eof = true;
163                return;
164            }
165
166            if c.is_whitespace() {
167                if c == '\n' {
168                    self.line_num += 1;
169                }
170                self.scanner.advance();
171                continue;
172            }
173
174            break;
175        }
176
177        self.token_start = position;
178    }
179
    /// Consumes the current character, extending the token's end offset past it.
    #[inline]
    fn pop(&mut self) -> char {
        let (pos, c) = self.scanner.pop();
        // End offset is exclusive, hence position + encoded length
        self.token_end = pos + c.len_utf8();
        c
    }

    /// Returns the next character without consuming anything.
    #[inline]
    fn peek(&mut self) -> char {
        let (_, c) = self.scanner.peek();
        c
    }

    /// Returns the character after next without consuming anything.
    #[inline]
    #[expect(unused)]
    fn peek_next(&mut self) -> char {
        let (_, c) = self.scanner.peek_next();
        c
    }

    /// Returns the current character without consuming it.
    #[inline]
    fn read(&mut self) -> char {
        let (_, c) = self.scanner.read();
        c
    }

    /// Consumes the current character, discarding it.
    #[inline]
    fn skip(&mut self) {
        self.pop();
    }
210
    /// Get the span for the current token in `Source`.
    #[inline]
    fn span(&self) -> SourceSpan {
        assert!(self.token_start <= self.token_end, "invalid range");
        // Spans store byte offsets as u32, so the input must fit in 4 GiB
        assert!(self.token_end <= u32::MAX as usize, "file too large");
        SourceSpan::new(self.source_id, (self.token_start as u32)..(self.token_end as u32))
    }

    /// Get the slice of the input covered by `span`.
    #[inline]
    fn slice_span(&self, span: impl Into<Range<u32>>) -> &'input str {
        let range = span.into();
        self.scanner.slice((range.start as usize)..(range.end as usize))
    }

    /// Get a string slice of the current token.
    #[inline]
    fn slice(&self) -> &'input str {
        self.slice_span(self.span())
    }
230
231    #[inline]
232    fn skip_whitespace(&mut self) {
233        let mut c: char;
234        loop {
235            c = self.read();
236
237            if !c.is_whitespace() {
238                break;
239            }
240
241            if c == '\n' {
242                self.line_num += 1;
243            }
244
245            self.skip();
246        }
247    }
248
    /// Produces the next token from the input, dispatching on the current
    /// character: comments first, then punctuation, literals, and identifiers.
    ///
    /// Returns `Token::Eof` (and sets `eof`) when the scanner reports the
    /// end-of-input sentinel `'\0'`.
    fn tokenize(&mut self) -> Result<Token<'input>, ParsingError> {
        let c = self.read();

        // `#` introduces either a doc comment (`#!`) or a line comment
        if c == '#' {
            match self.peek() {
                '!' => {
                    self.skip();
                    self.skip();
                    return self.lex_docs();
                },
                _ => {
                    self.skip();
                    self.skip_comment();
                    return Ok(Token::Comment);
                },
            }
        }

        if c == '\0' {
            self.eof = true;
            return Ok(Token::Eof);
        }

        if c.is_whitespace() {
            self.skip_whitespace();
        }

        match self.read() {
            '@' => pop!(self, Token::At),
            '!' => pop!(self, Token::Bang),
            ':' => match self.peek() {
                ':' => pop2!(self, Token::ColonColon),
                _ => pop!(self, Token::Colon),
            },
            ';' => pop!(self, Token::Semicolon),
            '.' => match self.peek() {
                '.' => pop2!(self, Token::Range),
                _ => pop!(self, Token::Dot),
            },
            ',' => pop!(self, Token::Comma),
            '=' => pop!(self, Token::Equal),
            '<' => pop!(self, Token::Langle),
            '{' => pop!(self, Token::Lbrace),
            '[' => pop!(self, Token::Lbracket),
            '(' => pop!(self, Token::Lparen),
            '>' => pop!(self, Token::Rangle),
            '}' => pop!(self, Token::Rbrace),
            ']' => pop!(self, Token::Rbracket),
            ')' => pop!(self, Token::Rparen),
            '-' => match self.peek() {
                '>' => pop2!(self, Token::Rstab),
                _ => pop!(self, Token::Minus),
            },
            '+' => pop!(self, Token::Plus),
            '/' => match self.peek() {
                '/' => pop2!(self, Token::SlashSlash),
                _ => pop!(self, Token::Slash),
            },
            '*' => pop!(self, Token::Star),
            '$' => self.lex_special_identifier(),
            '"' => self.lex_quoted_identifier_or_string(),
            // A leading zero may start a hex (`0x`), binary (`0b`), or decimal literal
            '0' => match self.peek() {
                'x' => {
                    self.skip();
                    self.skip();
                    self.lex_hex()
                },
                'b' => {
                    self.skip();
                    self.skip();
                    self.lex_bin()
                },
                '0'..='9' => self.lex_number(),
                _ => pop!(self, Token::Int(0)),
            },
            '1'..='9' => self.lex_number(),
            'a'..='z' => self.lex_keyword_or_ident(),
            'A'..='Z' => self.lex_identifier(),
            // A leading underscore is only valid when followed by an alphanumeric
            '_' => match self.peek() {
                c if c.is_ascii_alphanumeric() => self.lex_identifier(),
                _ => Err(ParsingError::InvalidToken { span: self.span() }),
            },
            _ => Err(ParsingError::InvalidToken { span: self.span() }),
        }
    }
334
335    fn lex_docs(&mut self) -> Result<Token<'input>, ParsingError> {
336        let mut buf = String::new();
337
338        let mut c;
339        let mut line_start = self.token_start + 2;
340        let is_module_doc = self.line_num == 0;
341        loop {
342            c = self.read();
343
344            if c == '\0' {
345                self.eof = true;
346                buf.push_str(self.slice_span((line_start as u32)..(self.token_end as u32)).trim());
347
348                let is_first_line = self.line_num == 0;
349                break Ok(Token::DocComment(if is_first_line {
350                    DocumentationType::Module(buf)
351                } else {
352                    DocumentationType::Form(buf)
353                }));
354            }
355
356            if c == '\n' {
357                self.line_num += 1;
358
359                buf.push_str(self.slice_span((line_start as u32)..(self.token_end as u32)).trim());
360                buf.push('\n');
361
362                self.skip();
363                c = self.read();
364                match c {
365                    '#' if self.peek() == '!' => {
366                        self.skip();
367                        self.skip();
368                        line_start = self.token_end;
369                        continue;
370                    },
371                    _ if is_module_doc => {
372                        break Ok(Token::DocComment(DocumentationType::Module(buf)));
373                    },
374                    _ => {
375                        break Ok(Token::DocComment(DocumentationType::Form(buf)));
376                    },
377                }
378            }
379
380            self.skip();
381        }
382    }
383
384    fn skip_comment(&mut self) {
385        let mut c;
386        loop {
387            c = self.read();
388
389            if c == '\n' {
390                self.skip();
391                self.line_num += 1;
392                break;
393            }
394
395            if c == '\0' {
396                self.eof = true;
397                break;
398            }
399
400            self.skip();
401        }
402    }
403
404    fn lex_keyword_or_ident(&mut self) -> Result<Token<'input>, ParsingError> {
405        let c = self.pop();
406        debug_assert!(c.is_ascii_alphabetic() && c.is_lowercase());
407
408        loop {
409            match self.read() {
410                '_' | '0'..='9' => self.skip(),
411                c if c.is_ascii_alphabetic() => self.skip(),
412                _ => break,
413            }
414        }
415
416        let name = self.slice();
417        match name {
418            "exp" => {
419                // Special handling for the `exp.uXX` tokenization
420                if self.read() == '.' && self.peek() == 'u' {
421                    pop2!(self, Token::ExpU)
422                } else {
423                    Ok(Token::Exp)
424                }
425            },
426            _ => Ok(Token::from_keyword_or_ident_with_searcher(name, &self.keywords)),
427        }
428    }
429
    /// Lexes a double-quoted token, classifying it as a quoted identifier when
    /// it contains only printable/alphanumeric characters and no escapes, or
    /// as a quoted string otherwise.
    ///
    /// The returned slice excludes the surrounding quotation marks.
    fn lex_quoted_identifier_or_string(&mut self) -> Result<Token<'input>, ParsingError> {
        // Skip quotation mark
        self.skip();

        let mut is_identifier = true;
        let quote_size = ByteOffset::from_char_len('"');
        loop {
            match self.read() {
                // Quotes may not span lines and must be closed before EOF
                '\0' | '\n' => {
                    break Err(ParsingError::UnclosedQuote {
                        start: SourceSpan::at(self.source_id, self.span().start()),
                    });
                },
                // A backslash escape demotes the token to a quoted string
                '\\' => {
                    is_identifier = false;
                    self.skip();
                    match self.read() {
                        '"' | '\n' => {
                            self.skip();
                        },
                        _ => (),
                    }
                },
                '"' => {
                    // Trim the opening quote from the token's span
                    let span = self.span();
                    let start = span.start() + quote_size;
                    let span = SourceSpan::new(self.source_id, start..span.end());

                    self.skip();
                    break Ok(if is_identifier {
                        Token::QuotedIdent(self.slice_span(span))
                    } else {
                        Token::QuotedString(self.slice_span(span))
                    });
                },
                c if c.is_alphanumeric() || c.is_ascii_graphic() => {
                    self.skip();
                },
                // Any other character demotes the token to a quoted string
                _ => {
                    is_identifier = false;
                    self.skip();
                },
            }
        }
    }
475
476    fn lex_identifier(&mut self) -> Result<Token<'input>, ParsingError> {
477        let c = self.pop();
478        debug_assert!(c.is_ascii_alphabetic() || c == '_');
479
480        let mut is_constant_ident = c.is_ascii_uppercase() || c == '_';
481
482        loop {
483            match self.read() {
484                '_' | '0'..='9' => self.skip(),
485                c if c.is_ascii_alphabetic() => {
486                    is_constant_ident &= c.is_ascii_uppercase();
487                    self.skip();
488                },
489                _ => break,
490            }
491        }
492
493        if is_constant_ident {
494            Ok(Token::ConstantIdent(self.slice()))
495        } else {
496            Ok(Token::Ident(self.slice()))
497        }
498    }
499
500    fn lex_special_identifier(&mut self) -> Result<Token<'input>, ParsingError> {
501        let c = self.pop();
502        debug_assert_eq!(c, '$');
503
504        loop {
505            match self.read() {
506                '_' | '0'..='9' => self.skip(),
507                c if c.is_ascii_lowercase() => self.skip(),
508                _ => break,
509            }
510        }
511
512        match self.slice() {
513            id @ ("$kernel" | "$exec") => Ok(Token::Ident(id)),
514            _ => {
515                let start = self.span().start();
516                let span = SourceSpan::at(self.span().source_id(), start);
517                Err(ParsingError::InvalidToken { span })
518            },
519        }
520    }
521
522    fn lex_number(&mut self) -> Result<Token<'input>, ParsingError> {
523        // Expect the first character to be a digit
524        let c = self.read();
525        debug_assert!(c.is_ascii_digit());
526
527        while let '0'..='9' = self.read() {
528            self.skip();
529        }
530
531        self.slice()
532            .parse::<u64>()
533            .map_err(|error| ParsingError::InvalidLiteral {
534                span: self.span(),
535                kind: int_error_kind_to_literal_error_kind(
536                    error.kind(),
537                    LiteralErrorKind::FeltOverflow,
538                ),
539            })
540            .map(Token::Int)
541    }
542
    /// Lexes a `0x`-prefixed hexadecimal literal (the `0x` has already been
    /// consumed), delegating value construction to [parse_hex].
    fn lex_hex(&mut self) -> Result<Token<'input>, ParsingError> {
        // Expect the first character to be a valid hexadecimal digit
        debug_assert!(self.read().is_ascii_hexdigit());

        loop {
            // If we hit a non-hex digit, we're done
            let c1 = self.read();
            if !c1.is_ascii_hexdigit() {
                break;
            }
            self.skip();

            // All hex-encoded bytes are zero-padded, and thus occur
            // in pairs, if we observe a non-hex digit at this point,
            // it is invalid
            let c2 = self.read();
            if !c2.is_ascii_hexdigit() {
                // For odd-length hex strings, we need to handle this by
                // adjusting the span to not include the unpaired character
                // and let parse_hex handle the padding
                break;
            }
            self.skip();
        }

        let span = self.span();
        let start = span.start();
        let end = span.end();
        // Exclude the `0x` prefix from the digits handed to the parser
        let digit_start = start.to_u32() + 2;
        let span = SourceSpan::new(span.source_id(), start..end);
        parse_hex(span, self.slice_span(digit_start..end.to_u32()))
    }
575
576    fn lex_bin(&mut self) -> Result<Token<'input>, ParsingError> {
577        // Expect the first character to be a valid binary digit
578        debug_assert!(is_ascii_binary(self.read()));
579
580        loop {
581            // If we hit a non-binary digit, we're done
582            let c1 = self.read();
583            if !is_ascii_binary(c1) {
584                break;
585            }
586            self.skip();
587        }
588
589        let span = self.span();
590        let start = span.start();
591        let digit_start = start.to_u32() + 2;
592        let end = span.end();
593        let span = SourceSpan::new(span.source_id(), start..end);
594        let value = parse_bin(span, self.slice_span(digit_start..end.to_u32()))?;
595        Ok(Token::BinValue(value))
596    }
597}
598
599impl<'input> Iterator for Lexer<'input> {
600    type Item = Lexed<'input>;
601
602    fn next(&mut self) -> Option<Self::Item> {
603        let mut res = self.lex();
604        while let Some(Ok((_, Token::Comment, _))) = res {
605            res = self.lex();
606        }
607        res
608    }
609}
610
611// HELPER FUNCTIONS
612// ================================================================================================
613
/// Zero-pads `hex` to an even number of digits.
///
/// Returns the input unchanged when its length is already even; otherwise
/// allocates a copy with a single leading `'0'`, so that every byte is
/// represented by exactly two hex digits.
///
/// Changes: the explicit `<'a>` lifetime is elided (clippy::needless_lifetimes)
/// and the redundant `alloc::string::String` path is dropped since `String`
/// is already imported at the top of the file.
fn pad_hex_if_needed(hex: &str) -> Cow<'_, str> {
    if hex.len() % 2 == 0 {
        Cow::Borrowed(hex)
    } else {
        // allocate once, with exact capacity
        let mut s = String::with_capacity(hex.len() + 1);
        s.push('0');
        s.push_str(hex);
        Cow::Owned(s)
    }
}
625
/// Parses a hex digit string (without the `0x` prefix) into a token.
///
/// Even-length strings of up to 16 digits (after padding) become a single
/// field element ([Token::HexValue]); exactly 64 digits become a 4-element
/// word ([Token::HexWord]), where each 16-digit chunk is decoded as a
/// little-endian `u64`. Anything else is rejected with an appropriate
/// [ParsingError].
fn parse_hex<'input>(
    span: SourceSpan,
    hex_digits: &'input str,
) -> Result<Token<'input>, ParsingError> {
    use miden_core::{Felt, field::PrimeField64};

    // Handle odd-length hex strings by padding with a leading zero
    let hex_digits = pad_hex_if_needed(hex_digits);

    match hex_digits.len() {
        // Felt - allow all even lengths up to 16
        n if n <= 16 && n.is_multiple_of(2) => {
            let value = u64::from_str_radix(&hex_digits, 16).map_err(|error| {
                ParsingError::InvalidLiteral {
                    span,
                    kind: int_error_kind_to_literal_error_kind(
                        error.kind(),
                        LiteralErrorKind::FeltOverflow,
                    ),
                }
            })?;
            // The value must be a canonical field element
            if value >= Felt::ORDER_U64 {
                return Err(ParsingError::InvalidLiteral {
                    span,
                    kind: LiteralErrorKind::FeltOverflow,
                });
            }
            Ok(Token::HexValue(shrink_u64_hex(value)))
        },
        // Word
        64 => {
            let mut word = [Felt::ZERO; 4];
            for (index, element) in word.iter_mut().enumerate() {
                // Each word element is encoded by 16 hex digits (8 bytes)
                let offset = index * 16;
                let mut felt_bytes = [0u8; 8];
                let digits = &hex_digits[offset..(offset + 16)];
                // Decode digit pairs into bytes, then assemble little-endian
                for (byte_idx, byte) in felt_bytes.iter_mut().enumerate() {
                    let byte_str = &digits[(byte_idx * 2)..((byte_idx * 2) + 2)];
                    *byte = u8::from_str_radix(byte_str, 16).map_err(|error| {
                        ParsingError::InvalidLiteral {
                            span,
                            kind: int_error_kind_to_literal_error_kind(
                                error.kind(),
                                LiteralErrorKind::FeltOverflow,
                            ),
                        }
                    })?;
                }
                let value = u64::from_le_bytes(felt_bytes);
                // Each element must be a canonical field element
                if value >= Felt::ORDER_U64 {
                    return Err(ParsingError::InvalidLiteral {
                        span,
                        kind: LiteralErrorKind::FeltOverflow,
                    });
                }
                *element = Felt::new(value);
            }
            Ok(Token::HexWord(WordValue(word)))
        },
        // Invalid
        n if n > 64 => Err(ParsingError::InvalidHexLiteral { span, kind: HexErrorKind::TooLong }),
        _ => Err(ParsingError::InvalidHexLiteral { span, kind: HexErrorKind::Invalid }),
    }
}
690
691fn parse_bin(span: SourceSpan, bin_digits: &str) -> Result<BinEncodedValue, ParsingError> {
692    if bin_digits.len() <= 32 {
693        let value =
694            u32::from_str_radix(bin_digits, 2).map_err(|error| ParsingError::InvalidLiteral {
695                span,
696                kind: int_error_kind_to_literal_error_kind(
697                    error.kind(),
698                    LiteralErrorKind::U32Overflow,
699                ),
700            })?;
701        Ok(shrink_u32_bin(value))
702    } else {
703        Err(ParsingError::InvalidBinaryLiteral { span, kind: BinErrorKind::TooLong })
704    }
705}
706
/// Returns true if `c` is a binary digit (`'0'` or `'1'`).
#[inline(always)]
fn is_ascii_binary(c: char) -> bool {
    c == '0' || c == '1'
}
711
712#[inline]
713pub fn shrink_u64_hex(n: u64) -> IntValue {
714    if n <= (u8::MAX as u64) {
715        IntValue::U8(n as u8)
716    } else if n <= (u16::MAX as u64) {
717        IntValue::U16(n as u16)
718    } else if n <= (u32::MAX as u64) {
719        IntValue::U32(n as u32)
720    } else {
721        IntValue::Felt(Felt::new(n))
722    }
723}
724
725#[inline]
726fn shrink_u32_bin(n: u32) -> BinEncodedValue {
727    if n <= (u8::MAX as u32) {
728        BinEncodedValue::U8(n as u8)
729    } else if n <= (u16::MAX as u32) {
730        BinEncodedValue::U16(n as u16)
731    } else {
732        BinEncodedValue::U32(n)
733    }
734}
735
/// Maps a [IntErrorKind] produced by integer parsing to the corresponding
/// [LiteralErrorKind] for diagnostics.
///
/// `overflow` supplies the error kind to report for overflow, since the
/// appropriate variant depends on the kind of literal being parsed.
#[inline]
fn int_error_kind_to_literal_error_kind(
    kind: &IntErrorKind,
    overflow: LiteralErrorKind,
) -> LiteralErrorKind {
    match kind {
        IntErrorKind::Empty => LiteralErrorKind::Empty,
        IntErrorKind::InvalidDigit => LiteralErrorKind::InvalidDigit,
        IntErrorKind::PosOverflow | IntErrorKind::NegOverflow => overflow,
        // NOTE(review): `IntErrorKind` is non-exhaustive; the remaining known
        // variant (`Zero`) presumably cannot arise here since the lexer never
        // parses NonZero* types — confirm before relying on this panic path
        _ => unreachable!(),
    }
}