1use alloc::string::String;
2use core::{num::IntErrorKind, ops::Range};
3
4use miden_debug_types::{ByteOffset, SourceId, SourceSpan};
5
6use super::{
7 BinEncodedValue, BinErrorKind, DocumentationType, HexErrorKind, IntValue, LiteralErrorKind,
8 ParsingError, Scanner, Token, WordValue,
9};
10use crate::Felt;
11
/// The item type produced by the [Lexer]: a `(start, token, end)` triple of byte
/// offsets plus the lexed token, or a [ParsingError] if lexing failed.
pub type Lexed<'input> = Result<(u32, Token<'input>, u32), ParsingError>;
19
/// Consumes a single character from the lexer's scanner; the two-argument form
/// additionally evaluates to `Ok($token)` so it can be used as a match-arm result.
macro_rules! pop {
    ($lex:ident) => {{
        $lex.skip();
    }};
    ($lex:ident, $token:expr) => {{
        $lex.skip();
        Ok($token)
    }};
}
30
/// Consumes two characters from the lexer's scanner (used for two-character
/// tokens such as `::` or `->`); the two-argument form additionally evaluates
/// to `Ok($token)`.
macro_rules! pop2 {
    ($lex:ident) => {{
        $lex.skip();
        $lex.skip();
    }};
    ($lex:ident, $token:expr) => {{
        $lex.skip();
        $lex.skip();
        Ok($token)
    }};
}
43
/// A hand-written lexer over a [Scanner], producing [Token]s one at a time
/// with their byte offsets, buffering one token of lookahead.
pub struct Lexer<'input> {
    /// The id of the source being lexed, used when constructing [SourceSpan]s.
    source_id: SourceId,

    /// The underlying character scanner over the input.
    scanner: Scanner<'input>,

    /// The most recently lexed token, yielded by the next call to [Lexer::lex].
    token: Token<'input>,

    /// Byte offset at which `token` starts.
    token_start: usize,

    /// Byte offset one past the last byte of `token`.
    token_end: usize,

    /// Zero-based line counter, incremented each time a `'\n'` is consumed.
    line_num: usize,

    /// Set once the end-of-input sentinel (`'\0'`) has been observed.
    eof: bool,
    /// Set once the final synthetic [Token::Eof] has been emitted.
    empty: bool,

    /// Pre-built keyword searcher used to distinguish keywords from plain
    /// identifiers without a per-token comparison against every keyword.
    keywords: aho_corasick::AhoCorasick,

    /// An error produced while pre-lexing the next token; reported by the
    /// next call to [Lexer::lex] before any further tokens.
    error: Option<ParsingError>,
}
94
impl<'input> Lexer<'input> {
    /// Constructs a new [Lexer] over the input held by `scanner`, tagging all
    /// emitted spans with `source_id`.
    ///
    /// The first token is lexed eagerly so that `lex`/`next` always has a
    /// buffered token (or a pending error) ready to yield.
    pub fn new(source_id: SourceId, scanner: Scanner<'input>) -> Self {
        let start = scanner.start();
        let keywords = Token::keyword_searcher();
        let mut lexer = Self {
            source_id,
            scanner,
            token: Token::Eof,
            token_start: start,
            token_end: start,
            line_num: 0,
            eof: false,
            empty: false,
            keywords,
            error: None,
        };
        lexer.advance();
        lexer
    }

    /// Yields the buffered token with its `(start, end)` byte offsets and
    /// pre-lexes the one that follows.
    ///
    /// Ordering rules:
    /// * an error recorded while pre-lexing is returned before anything else;
    /// * once input is exhausted, a single synthetic [Token::Eof] is emitted,
    ///   after which `None` is returned forever.
    pub fn lex(&mut self) -> Option<<Self as Iterator>::Item> {
        if let Some(err) = self.error.take() {
            return Some(Err(err));
        }

        if self.eof && matches!(self.token, Token::Eof) {
            if self.empty {
                // The trailing Eof token was already emitted; iteration is over
                return None;
            } else {
                // Emit the trailing Eof token exactly once
                self.empty = true;
                let end = self.token_end as u32;
                return Some(Ok((end, Token::Eof, end)));
            }
        }

        // Hand out the buffered token and immediately lex its successor
        let token = core::mem::replace(&mut self.token, Token::Eof);
        let start = self.token_start;
        let end = self.token_end;
        self.advance();
        Some(Ok((start as u32, token, end as u32)))
    }

    /// Lexes the next token into `self.token`, or stashes the failure in
    /// `self.error` for the next call to [Lexer::lex] to report.
    fn advance(&mut self) {
        self.advance_start();
        match self.tokenize() {
            Ok(tok) => {
                self.token = tok;
            },
            Err(err) => {
                self.error = Some(err);
            },
        }
    }

    /// Skips leading whitespace (counting newlines) and records the offset of
    /// the first non-whitespace character as the start of the next token.
    /// Sets the `eof` flag — without touching `token_start` — when the
    /// end-of-input sentinel `'\0'` is reached.
    #[inline]
    fn advance_start(&mut self) {
        let mut position: usize;
        loop {
            let (pos, c) = self.scanner.read();

            position = pos;

            if c == '\0' {
                self.eof = true;
                return;
            }

            if c.is_whitespace() {
                if c == '\n' {
                    self.line_num += 1;
                }
                self.scanner.advance();
                continue;
            }

            break;
        }

        self.token_start = position;
    }

    /// Consumes the next character, extending the current token's end offset
    /// to cover it, and returns it.
    #[inline]
    fn pop(&mut self) -> char {
        let (pos, c) = self.scanner.pop();
        self.token_end = pos + c.len_utf8();
        c
    }

    /// Returns the character after the current one, without consuming anything.
    #[inline]
    fn peek(&mut self) -> char {
        let (_, c) = self.scanner.peek();
        c
    }

    /// Returns the character two positions ahead, without consuming anything.
    #[inline]
    #[allow(unused)]
    fn peek_next(&mut self) -> char {
        let (_, c) = self.scanner.peek_next();
        c
    }

    /// Returns the current character without consuming it.
    #[inline]
    fn read(&mut self) -> char {
        let (_, c) = self.scanner.read();
        c
    }

    /// Consumes the current character, discarding it (the token span still
    /// grows to cover it, via `pop`).
    #[inline]
    fn skip(&mut self) {
        self.pop();
    }

    /// Returns the span of the token currently being lexed.
    ///
    /// Asserts that the offsets are consistent and fit in a `u32`.
    #[inline]
    fn span(&self) -> SourceSpan {
        assert!(self.token_start <= self.token_end, "invalid range");
        assert!(self.token_end <= u32::MAX as usize, "file too large");
        SourceSpan::new(self.source_id, (self.token_start as u32)..(self.token_end as u32))
    }

    /// Returns the source text covered by `span`.
    #[inline]
    fn slice_span(&self, span: impl Into<Range<u32>>) -> &'input str {
        let range = span.into();
        self.scanner.slice((range.start as usize)..(range.end as usize))
    }

    /// Returns the source text of the token currently being lexed.
    #[inline]
    fn slice(&self) -> &'input str {
        self.slice_span(self.span())
    }

    /// Consumes characters up to the next non-whitespace character, keeping
    /// the line counter up to date.
    #[inline]
    fn skip_whitespace(&mut self) {
        let mut c: char;
        loop {
            c = self.read();

            if !c.is_whitespace() {
                break;
            }

            if c == '\n' {
                self.line_num += 1;
            }

            self.skip();
        }
    }

    /// Lexes a single token at the current position.
    ///
    /// Dispatches on the first character: `#!` starts a documentation comment
    /// and `#` a plain comment; then punctuation/operators, numeric literals
    /// (decimal, `0x` hex, `0b` binary), and the identifier forms.
    fn tokenize(&mut self) -> Result<Token<'input>, ParsingError> {
        let c = self.read();

        if c == '#' {
            match self.peek() {
                '!' => {
                    // `#!` introduces a documentation comment
                    self.skip();
                    self.skip();
                    return self.lex_docs();
                },
                _ => {
                    // `#` introduces a plain line comment
                    self.skip();
                    self.skip_comment();
                    return Ok(Token::Comment);
                },
            }
        }

        if c == '\0' {
            self.eof = true;
            return Ok(Token::Eof);
        }

        if c.is_whitespace() {
            self.skip_whitespace();
        }

        match self.read() {
            '@' => pop!(self, Token::At),
            '!' => pop!(self, Token::Bang),
            ':' => match self.peek() {
                ':' => pop2!(self, Token::ColonColon),
                _ => pop!(self, Token::Colon),
            },
            ';' => pop!(self, Token::Semicolon),
            '.' => match self.peek() {
                '.' => pop2!(self, Token::Range),
                _ => pop!(self, Token::Dot),
            },
            ',' => pop!(self, Token::Comma),
            '=' => pop!(self, Token::Equal),
            '<' => pop!(self, Token::Langle),
            '{' => pop!(self, Token::Lbrace),
            '[' => pop!(self, Token::Lbracket),
            '(' => pop!(self, Token::Lparen),
            '>' => pop!(self, Token::Rangle),
            '}' => pop!(self, Token::Rbrace),
            ']' => pop!(self, Token::Rbracket),
            ')' => pop!(self, Token::Rparen),
            '-' => match self.peek() {
                '>' => pop2!(self, Token::Rstab),
                _ => pop!(self, Token::Minus),
            },
            '+' => pop!(self, Token::Plus),
            '/' => match self.peek() {
                '/' => pop2!(self, Token::SlashSlash),
                _ => pop!(self, Token::Slash),
            },
            '*' => pop!(self, Token::Star),
            '$' => self.lex_special_identifier(),
            '"' => self.lex_quoted_identifier_or_string(),
            '0' => match self.peek() {
                'x' => {
                    // `0x` prefix: hex-encoded literal
                    self.skip();
                    self.skip();
                    self.lex_hex()
                },
                'b' => {
                    // `0b` prefix: binary-encoded literal
                    self.skip();
                    self.skip();
                    self.lex_bin()
                },
                '0'..='9' => self.lex_number(),
                _ => pop!(self, Token::Int(0)),
            },
            '1'..='9' => self.lex_number(),
            'a'..='z' => self.lex_keyword_or_ident(),
            'A'..='Z' => self.lex_identifier(),
            '_' => match self.peek() {
                // A leading underscore must be followed by an alphanumeric
                // character to start an identifier
                c if c.is_ascii_alphanumeric() => self.lex_identifier(),
                _ => Err(ParsingError::InvalidToken { span: self.span() }),
            },
            _ => Err(ParsingError::InvalidToken { span: self.span() }),
        }
    }

    /// Lexes a `#!` documentation comment, accumulating the text of all
    /// consecutive `#!` lines (each trimmed, newline-separated) into a single
    /// [Token::DocComment].
    ///
    /// A doc comment starting on the first line of the file is classified as
    /// module documentation; otherwise it documents the next form.
    fn lex_docs(&mut self) -> Result<Token<'input>, ParsingError> {
        let mut buf = String::new();

        let mut c;
        // Slice each line's text starting just past the `#!` prefix
        let mut line_start = self.token_start + 2;
        let is_module_doc = self.line_num == 0;
        loop {
            c = self.read();

            if c == '\0' {
                self.eof = true;
                buf.push_str(self.slice_span((line_start as u32)..(self.token_end as u32)).trim());

                // NOTE(review): this re-checks `line_num` rather than reusing
                // `is_module_doc`, so a multi-line doc comment ending at EOF is
                // classified as form docs even when it began on line 0 —
                // confirm this asymmetry with the newline path is intentional
                let is_first_line = self.line_num == 0;
                break Ok(Token::DocComment(if is_first_line {
                    DocumentationType::Module(buf)
                } else {
                    DocumentationType::Form(buf)
                }));
            }

            if c == '\n' {
                self.line_num += 1;

                // Flush the completed line into the accumulated doc text
                buf.push_str(self.slice_span((line_start as u32)..(self.token_end as u32)).trim());
                buf.push('\n');

                self.skip();
                c = self.read();
                match c {
                    '#' if self.peek() == '!' => {
                        // The doc comment continues on the next line
                        self.skip();
                        self.skip();
                        line_start = self.token_end;
                        continue;
                    },
                    _ if is_module_doc => {
                        break Ok(Token::DocComment(DocumentationType::Module(buf)));
                    },
                    _ => {
                        break Ok(Token::DocComment(DocumentationType::Form(buf)));
                    },
                }
            }

            self.skip();
        }
    }

    /// Consumes the remainder of a plain `#` comment, up to and including the
    /// terminating newline (or to end of input).
    fn skip_comment(&mut self) {
        let mut c;
        loop {
            c = self.read();

            if c == '\n' {
                self.skip();
                self.line_num += 1;
                break;
            }

            if c == '\0' {
                self.eof = true;
                break;
            }

            self.skip();
        }
    }

    /// Lexes a lowercase-initial word and resolves it to a keyword token or a
    /// bare identifier via the pre-built keyword searcher.
    ///
    /// `exp` is special-cased: when immediately followed by `.u` it is lexed
    /// as [Token::ExpU] (consuming the `.u`), otherwise as [Token::Exp].
    fn lex_keyword_or_ident(&mut self) -> Result<Token<'input>, ParsingError> {
        let c = self.pop();
        debug_assert!(c.is_ascii_alphabetic() && c.is_lowercase());

        loop {
            match self.read() {
                '_' | '0'..='9' => self.skip(),
                c if c.is_ascii_alphabetic() => self.skip(),
                _ => break,
            }
        }

        let name = self.slice();
        match name {
            "exp" => {
                if self.read() == '.' && self.peek() == 'u' {
                    pop2!(self, Token::ExpU)
                } else {
                    Ok(Token::Exp)
                }
            },
            _ => Ok(Token::from_keyword_or_ident_with_searcher(name, &self.keywords)),
        }
    }

    /// Lexes a double-quoted token, producing [Token::QuotedIdent] when the
    /// content is exclusively escape-free printable/alphanumeric characters,
    /// and [Token::QuotedString] otherwise. The token's slice excludes the
    /// surrounding quotes.
    ///
    /// A newline or end of input before the closing `"` is an
    /// [ParsingError::UnclosedQuote] error.
    fn lex_quoted_identifier_or_string(&mut self) -> Result<Token<'input>, ParsingError> {
        // Consume the opening quote
        self.skip();

        let mut is_identifier = true;
        let quote_size = ByteOffset::from_char_len('"');
        loop {
            match self.read() {
                '\0' | '\n' => {
                    break Err(ParsingError::UnclosedQuote {
                        start: SourceSpan::at(self.source_id, self.span().start()),
                    });
                },
                '\\' => {
                    // Escape sequences demote the token to a quoted string
                    is_identifier = false;
                    self.skip();
                    match self.read() {
                        '"' | '\n' => {
                            self.skip();
                        },
                        _ => (),
                    }
                },
                '"' => {
                    // Trim the opening quote from the span; the closing quote
                    // has not been consumed yet, so it is excluded already
                    let span = self.span();
                    let start = span.start() + quote_size;
                    let span = SourceSpan::new(self.source_id, start..span.end());

                    self.skip();
                    break Ok(if is_identifier {
                        Token::QuotedIdent(self.slice_span(span))
                    } else {
                        Token::QuotedString(self.slice_span(span))
                    });
                },
                c if c.is_alphanumeric() || c.is_ascii_graphic() => {
                    self.skip();
                },
                _ => {
                    is_identifier = false;
                    self.skip();
                },
            }
        }
    }

    /// Lexes an identifier starting with an uppercase letter or underscore.
    ///
    /// If every alphabetic character is uppercase the result is a
    /// [Token::ConstantIdent], otherwise a plain [Token::Ident].
    fn lex_identifier(&mut self) -> Result<Token<'input>, ParsingError> {
        let c = self.pop();
        debug_assert!(c.is_ascii_alphabetic() || c == '_');

        let mut is_constant_ident = c.is_ascii_uppercase() || c == '_';

        loop {
            match self.read() {
                '_' | '0'..='9' => self.skip(),
                c if c.is_ascii_alphabetic() => {
                    // Any lowercase letter demotes this to a plain identifier
                    is_constant_ident &= c.is_ascii_uppercase();
                    self.skip();
                },
                _ => break,
            }
        }

        if is_constant_ident {
            Ok(Token::ConstantIdent(self.slice()))
        } else {
            Ok(Token::Ident(self.slice()))
        }
    }

    /// Lexes a `$`-prefixed identifier; only the special identifiers
    /// `$kernel`, `$exec`, and `$anon` are valid, anything else is an
    /// [ParsingError::InvalidToken].
    fn lex_special_identifier(&mut self) -> Result<Token<'input>, ParsingError> {
        let c = self.pop();
        debug_assert_eq!(c, '$');

        loop {
            match self.read() {
                '_' | '0'..='9' => self.skip(),
                c if c.is_ascii_lowercase() => self.skip(),
                _ => break,
            }
        }

        match self.slice() {
            id @ ("$kernel" | "$exec" | "$anon") => Ok(Token::Ident(id)),
            _ => {
                let start = self.span().start();
                let span = SourceSpan::at(self.span().source_id(), start);
                Err(ParsingError::InvalidToken { span })
            },
        }
    }

    /// Lexes a decimal integer literal into a [Token::Int], reporting overflow
    /// of `u64` as a felt-overflow literal error.
    fn lex_number(&mut self) -> Result<Token<'input>, ParsingError> {
        let c = self.read();
        debug_assert!(c.is_ascii_digit());

        while let '0'..='9' = self.read() {
            self.skip();
        }

        self.slice()
            .parse::<u64>()
            .map_err(|error| ParsingError::InvalidLiteral {
                span: self.span(),
                kind: int_error_kind_to_literal_error_kind(
                    error.kind(),
                    LiteralErrorKind::FeltOverflow,
                ),
            })
            .map(Token::Int)
    }

    /// Lexes the digits of a `0x`-prefixed hex literal (the prefix is already
    /// consumed) and delegates value construction to [parse_hex].
    ///
    /// Digits are consumed two at a time, so a literal with an odd number of
    /// hex digits fails immediately with [HexErrorKind::Invalid].
    fn lex_hex(&mut self) -> Result<Token<'input>, ParsingError> {
        debug_assert!(self.read().is_ascii_hexdigit());

        loop {
            let c1 = self.read();
            if !c1.is_ascii_hexdigit() {
                break;
            }
            self.skip();

            let c2 = self.read();
            if !c2.is_ascii_hexdigit() {
                return Err(ParsingError::InvalidHexLiteral {
                    span: self.span(),
                    kind: HexErrorKind::Invalid,
                });
            }
            self.skip();
        }

        let span = self.span();
        let start = span.start();
        let end = span.end();
        // Skip the `0x` prefix when slicing out the digits
        let digit_start = start.to_u32() + 2;
        let span = SourceSpan::new(span.source_id(), start..end);
        parse_hex(span, self.slice_span(digit_start..end.to_u32()))
    }

    /// Lexes the digits of a `0b`-prefixed binary literal (the prefix is
    /// already consumed) and delegates value construction to [parse_bin].
    fn lex_bin(&mut self) -> Result<Token<'input>, ParsingError> {
        debug_assert!(is_ascii_binary(self.read()));

        loop {
            let c1 = self.read();
            if !is_ascii_binary(c1) {
                break;
            }
            self.skip();
        }

        let span = self.span();
        let start = span.start();
        // Skip the `0b` prefix when slicing out the digits
        let digit_start = start.to_u32() + 2;
        let end = span.end();
        let span = SourceSpan::new(span.source_id(), start..end);
        let value = parse_bin(span, self.slice_span(digit_start..end.to_u32()))?;
        Ok(Token::BinValue(value))
    }
}
597
598impl<'input> Iterator for Lexer<'input> {
599 type Item = Lexed<'input>;
600
601 fn next(&mut self) -> Option<Self::Item> {
602 let mut res = self.lex();
603 while let Some(Ok((_, Token::Comment, _))) = res {
604 res = self.lex();
605 }
606 res
607 }
608}
609
610fn parse_hex<'input>(
614 span: SourceSpan,
615 hex_digits: &'input str,
616) -> Result<Token<'input>, ParsingError> {
617 use miden_core::{FieldElement, StarkField};
618 match hex_digits.len() {
619 n if n <= 16 && n.is_multiple_of(2) => {
621 let value = u64::from_str_radix(hex_digits, 16).map_err(|error| {
622 ParsingError::InvalidLiteral {
623 span,
624 kind: int_error_kind_to_literal_error_kind(
625 error.kind(),
626 LiteralErrorKind::FeltOverflow,
627 ),
628 }
629 })?;
630 if value >= Felt::MODULUS {
631 return Err(ParsingError::InvalidLiteral {
632 span,
633 kind: LiteralErrorKind::FeltOverflow,
634 });
635 }
636 Ok(Token::HexValue(shrink_u64_hex(value)))
637 },
638 64 => {
640 let mut word = [Felt::ZERO; 4];
641 for (index, element) in word.iter_mut().enumerate() {
642 let offset = index * 16;
643 let mut felt_bytes = [0u8; 8];
644 let digits = &hex_digits[offset..(offset + 16)];
645 for (byte_idx, byte) in felt_bytes.iter_mut().enumerate() {
646 let byte_str = &digits[(byte_idx * 2)..((byte_idx * 2) + 2)];
647 *byte = u8::from_str_radix(byte_str, 16).map_err(|error| {
648 ParsingError::InvalidLiteral {
649 span,
650 kind: int_error_kind_to_literal_error_kind(
651 error.kind(),
652 LiteralErrorKind::FeltOverflow,
653 ),
654 }
655 })?;
656 }
657 let value = u64::from_le_bytes(felt_bytes);
658 if value >= Felt::MODULUS {
659 return Err(ParsingError::InvalidLiteral {
660 span,
661 kind: LiteralErrorKind::FeltOverflow,
662 });
663 }
664 *element = Felt::new(value);
665 }
666 Ok(Token::HexWord(WordValue(word)))
667 },
668 n if n > 64 => Err(ParsingError::InvalidHexLiteral { span, kind: HexErrorKind::TooLong }),
670 n if !n.is_multiple_of(2) && n < 64 => {
671 Err(ParsingError::InvalidHexLiteral { span, kind: HexErrorKind::MissingDigits })
672 },
673 _ => Err(ParsingError::InvalidHexLiteral { span, kind: HexErrorKind::Invalid }),
674 }
675}
676
677fn parse_bin(span: SourceSpan, bin_digits: &str) -> Result<BinEncodedValue, ParsingError> {
678 if bin_digits.len() <= 32 {
679 let value =
680 u32::from_str_radix(bin_digits, 2).map_err(|error| ParsingError::InvalidLiteral {
681 span,
682 kind: int_error_kind_to_literal_error_kind(
683 error.kind(),
684 LiteralErrorKind::U32Overflow,
685 ),
686 })?;
687 Ok(shrink_u32_bin(value))
688 } else {
689 Err(ParsingError::InvalidBinaryLiteral { span, kind: BinErrorKind::TooLong })
690 }
691}
692
/// Returns true if `c` is a binary digit (`0` or `1`).
#[inline(always)]
fn is_ascii_binary(c: char) -> bool {
    c == '0' || c == '1'
}
697
698#[inline]
699pub fn shrink_u64_hex(n: u64) -> IntValue {
700 if n <= (u8::MAX as u64) {
701 IntValue::U8(n as u8)
702 } else if n <= (u16::MAX as u64) {
703 IntValue::U16(n as u16)
704 } else if n <= (u32::MAX as u64) {
705 IntValue::U32(n as u32)
706 } else {
707 IntValue::Felt(Felt::new(n))
708 }
709}
710
711#[inline]
712fn shrink_u32_bin(n: u32) -> BinEncodedValue {
713 if n <= (u8::MAX as u32) {
714 BinEncodedValue::U8(n as u8)
715 } else if n <= (u16::MAX as u32) {
716 BinEncodedValue::U16(n as u16)
717 } else {
718 BinEncodedValue::U32(n)
719 }
720}
721
722#[inline]
723fn int_error_kind_to_literal_error_kind(
724 kind: &IntErrorKind,
725 overflow: LiteralErrorKind,
726) -> LiteralErrorKind {
727 match kind {
728 IntErrorKind::Empty => LiteralErrorKind::Empty,
729 IntErrorKind::InvalidDigit => LiteralErrorKind::InvalidDigit,
730 IntErrorKind::PosOverflow | IntErrorKind::NegOverflow => overflow,
731 _ => unreachable!(),
732 }
733}