1use alloc::string::String;
2use core::{num::IntErrorKind, ops::Range};
3
4use miden_debug_types::{ByteOffset, SourceId, SourceSpan};
5
6use super::{
7 BinEncodedValue, BinErrorKind, DocumentationType, HexErrorKind, IntValue, LiteralErrorKind,
8 ParsingError, Scanner, Token, WordValue,
9};
10use crate::Felt;
11
12pub type Lexed<'input> = Result<(u32, Token<'input>, u32), ParsingError>;
19
20macro_rules! pop {
22 ($lex:ident) => {{
23 $lex.skip();
24 }};
25 ($lex:ident, $token:expr) => {{
26 $lex.skip();
27 Ok($token)
28 }};
29}
30
31macro_rules! pop2 {
33 ($lex:ident) => {{
34 $lex.skip();
35 $lex.skip();
36 }};
37 ($lex:ident, $token:expr) => {{
38 $lex.skip();
39 $lex.skip();
40 Ok($token)
41 }};
42}
43
44pub struct Lexer<'input> {
60 source_id: SourceId,
62
63 scanner: Scanner<'input>,
66
67 token: Token<'input>,
70
71 token_start: usize,
74
75 token_end: usize,
78
79 line_num: usize,
81
82 eof: bool,
86 empty: bool,
87
88 keywords: aho_corasick::AhoCorasick,
90
91 error: Option<ParsingError>,
93}
94
95impl<'input> Lexer<'input> {
96 pub fn new(source_id: SourceId, scanner: Scanner<'input>) -> Self {
99 let start = scanner.start();
100 let keywords = Token::keyword_searcher();
101 let mut lexer = Self {
102 source_id,
103 scanner,
104 token: Token::Eof,
105 token_start: start,
106 token_end: start,
107 line_num: 0,
108 eof: false,
109 empty: false,
110 keywords,
111 error: None,
112 };
113 lexer.advance();
114 lexer
115 }
116
117 pub fn lex(&mut self) -> Option<<Self as Iterator>::Item> {
118 if let Some(err) = self.error.take() {
119 return Some(Err(err));
120 }
121
122 if self.eof && matches!(self.token, Token::Eof) {
123 if self.empty {
125 return None;
126 } else {
127 self.empty = true;
128 let end = self.token_end as u32;
129 return Some(Ok((end, Token::Eof, end)));
130 }
131 }
132
133 let token = core::mem::replace(&mut self.token, Token::Eof);
134 let start = self.token_start;
135 let end = self.token_end;
136 self.advance();
137 Some(Ok((start as u32, token, end as u32)))
138 }
139
140 fn advance(&mut self) {
141 self.advance_start();
142 match self.tokenize() {
143 Ok(tok) => {
144 self.token = tok;
145 },
146 Err(err) => {
147 self.error = Some(err);
148 },
149 }
150 }
151
152 #[inline]
153 fn advance_start(&mut self) {
154 let mut position: usize;
155 loop {
156 let (pos, c) = self.scanner.read();
157
158 position = pos;
159
160 if c == '\0' {
161 self.eof = true;
162 return;
163 }
164
165 if c.is_whitespace() {
166 if c == '\n' {
167 self.line_num += 1;
168 }
169 self.scanner.advance();
170 continue;
171 }
172
173 break;
174 }
175
176 self.token_start = position;
177 }
178
179 #[inline]
180 fn pop(&mut self) -> char {
181 let (pos, c) = self.scanner.pop();
182 self.token_end = pos + c.len_utf8();
183 c
184 }
185
186 #[inline]
187 fn peek(&mut self) -> char {
188 let (_, c) = self.scanner.peek();
189 c
190 }
191
192 #[inline]
193 #[allow(unused)]
194 fn peek_next(&mut self) -> char {
195 let (_, c) = self.scanner.peek_next();
196 c
197 }
198
199 #[inline]
200 fn read(&mut self) -> char {
201 let (_, c) = self.scanner.read();
202 c
203 }
204
205 #[inline]
206 fn skip(&mut self) {
207 self.pop();
208 }
209
210 #[inline]
212 fn span(&self) -> SourceSpan {
213 assert!(self.token_start <= self.token_end, "invalid range");
214 assert!(self.token_end <= u32::MAX as usize, "file too large");
215 SourceSpan::new(self.source_id, (self.token_start as u32)..(self.token_end as u32))
216 }
217
218 #[inline]
219 fn slice_span(&self, span: impl Into<Range<u32>>) -> &'input str {
220 let range = span.into();
221 self.scanner.slice((range.start as usize)..(range.end as usize))
222 }
223
224 #[inline]
226 fn slice(&self) -> &'input str {
227 self.slice_span(self.span())
228 }
229
230 #[inline]
231 fn skip_whitespace(&mut self) {
232 let mut c: char;
233 loop {
234 c = self.read();
235
236 if !c.is_whitespace() {
237 break;
238 }
239
240 if c == '\n' {
241 self.line_num += 1;
242 }
243
244 self.skip();
245 }
246 }
247
248 fn tokenize(&mut self) -> Result<Token<'input>, ParsingError> {
249 let c = self.read();
250
251 if c == '#' {
252 match self.peek() {
253 '!' => {
254 self.skip();
255 self.skip();
256 return self.lex_docs();
257 },
258 _ => {
259 self.skip();
260 self.skip_comment();
261 return Ok(Token::Comment);
262 },
263 }
264 }
265
266 if c == '\0' {
267 self.eof = true;
268 return Ok(Token::Eof);
269 }
270
271 if c.is_whitespace() {
272 self.skip_whitespace();
273 }
274
275 match self.read() {
276 '@' => pop!(self, Token::At),
277 '!' => pop!(self, Token::Bang),
278 ':' => match self.peek() {
279 ':' => pop2!(self, Token::ColonColon),
280 _ => Err(ParsingError::InvalidToken { span: self.span() }),
281 },
282 '.' => pop!(self, Token::Dot),
283 ',' => pop!(self, Token::Comma),
284 '=' => pop!(self, Token::Equal),
285 '(' => pop!(self, Token::Lparen),
286 '[' => pop!(self, Token::Lbracket),
287 ')' => pop!(self, Token::Rparen),
288 ']' => pop!(self, Token::Rbracket),
289 '-' => match self.peek() {
290 '>' => pop2!(self, Token::Rstab),
291 _ => pop!(self, Token::Minus),
292 },
293 '+' => pop!(self, Token::Plus),
294 '/' => match self.peek() {
295 '/' => pop2!(self, Token::SlashSlash),
296 _ => pop!(self, Token::Slash),
297 },
298 '*' => pop!(self, Token::Star),
299 '$' => self.lex_special_identifier(),
300 '"' => self.lex_quoted_identifier_or_string(),
301 '0' => match self.peek() {
302 'x' => {
303 self.skip();
304 self.skip();
305 self.lex_hex()
306 },
307 'b' => {
308 self.skip();
309 self.skip();
310 self.lex_bin()
311 },
312 '0'..='9' => self.lex_number(),
313 _ => pop!(self, Token::Int(0)),
314 },
315 '1'..='9' => self.lex_number(),
316 'a'..='z' => self.lex_keyword_or_ident(),
317 'A'..='Z' => self.lex_const_identifier(),
318 '_' => match self.peek() {
319 c if c.is_ascii_alphanumeric() => self.lex_identifier(),
320 _ => Err(ParsingError::InvalidToken { span: self.span() }),
321 },
322 _ => Err(ParsingError::InvalidToken { span: self.span() }),
323 }
324 }
325
326 fn lex_docs(&mut self) -> Result<Token<'input>, ParsingError> {
327 let mut buf = String::new();
328
329 let mut c;
330 let mut line_start = self.token_start + 2;
331 let is_module_doc = self.line_num == 0;
332 loop {
333 c = self.read();
334
335 if c == '\0' {
336 self.eof = true;
337 buf.push_str(self.slice_span((line_start as u32)..(self.token_end as u32)).trim());
338
339 let is_first_line = self.line_num == 0;
340 break Ok(Token::DocComment(if is_first_line {
341 DocumentationType::Module(buf)
342 } else {
343 DocumentationType::Form(buf)
344 }));
345 }
346
347 if c == '\n' {
348 self.line_num += 1;
349
350 buf.push_str(self.slice_span((line_start as u32)..(self.token_end as u32)).trim());
351 buf.push('\n');
352
353 self.skip();
354 c = self.read();
355 match c {
356 '#' if self.peek() == '!' => {
357 self.skip();
358 self.skip();
359 line_start = self.token_end;
360 continue;
361 },
362 _ if is_module_doc => {
363 break Ok(Token::DocComment(DocumentationType::Module(buf)));
364 },
365 _ => {
366 break Ok(Token::DocComment(DocumentationType::Form(buf)));
367 },
368 }
369 }
370
371 self.skip();
372 }
373 }
374
375 fn skip_comment(&mut self) {
376 let mut c;
377 loop {
378 c = self.read();
379
380 if c == '\n' {
381 self.skip();
382 self.line_num += 1;
383 break;
384 }
385
386 if c == '\0' {
387 self.eof = true;
388 break;
389 }
390
391 self.skip();
392 }
393 }
394
395 fn lex_keyword_or_ident(&mut self) -> Result<Token<'input>, ParsingError> {
396 let c = self.pop();
397 debug_assert!(c.is_ascii_alphabetic() && c.is_lowercase());
398
399 loop {
400 match self.read() {
401 '_' | '0'..='9' => self.skip(),
402 c if c.is_ascii_alphabetic() => self.skip(),
403 _ => break,
404 }
405 }
406
407 let name = self.slice();
408 match name {
409 "exp" => {
410 if self.read() == '.' && self.peek() == 'u' {
412 pop2!(self, Token::ExpU)
413 } else {
414 Ok(Token::Exp)
415 }
416 },
417 _ => Ok(Token::from_keyword_or_ident_with_searcher(name, &self.keywords)),
418 }
419 }
420
421 fn lex_quoted_identifier_or_string(&mut self) -> Result<Token<'input>, ParsingError> {
422 self.skip();
424
425 let mut is_identifier = true;
426 let quote_size = ByteOffset::from_char_len('"');
427 loop {
428 match self.read() {
429 '\0' | '\n' => {
430 break Err(ParsingError::UnclosedQuote {
431 start: SourceSpan::at(self.source_id, self.span().start()),
432 });
433 },
434 '\\' => {
435 is_identifier = false;
436 self.skip();
437 match self.read() {
438 '"' | '\n' => {
439 self.skip();
440 },
441 _ => (),
442 }
443 },
444 '"' => {
445 let span = self.span();
446 let start = span.start() + quote_size;
447 let span = SourceSpan::new(self.source_id, start..span.end());
448
449 self.skip();
450 break Ok(if is_identifier {
451 Token::QuotedIdent(self.slice_span(span))
452 } else {
453 Token::QuotedString(self.slice_span(span))
454 });
455 },
456 c if c.is_alphanumeric() || c.is_ascii_graphic() => {
457 self.skip();
458 },
459 _ => {
460 is_identifier = false;
461 self.skip();
462 },
463 }
464 }
465 }
466
467 fn lex_identifier(&mut self) -> Result<Token<'input>, ParsingError> {
468 let c = self.pop();
469 debug_assert!(c.is_ascii_lowercase() || c == '_');
470
471 loop {
472 match self.read() {
473 '_' | '0'..='9' => self.skip(),
474 c if c.is_ascii_lowercase() => self.skip(),
475 _ => break,
476 }
477 }
478
479 Ok(Token::Ident(self.slice()))
480 }
481
482 fn lex_special_identifier(&mut self) -> Result<Token<'input>, ParsingError> {
483 let c = self.pop();
484 debug_assert_eq!(c, '$');
485
486 loop {
487 match self.read() {
488 '_' | '0'..='9' => self.skip(),
489 c if c.is_ascii_lowercase() => self.skip(),
490 _ => break,
491 }
492 }
493
494 match self.slice() {
495 id @ ("$kernel" | "$exec" | "$anon") => Ok(Token::Ident(id)),
496 _ => {
497 let start = self.span().start();
498 let span = SourceSpan::at(self.span().source_id(), start);
499 Err(ParsingError::InvalidToken { span })
500 },
501 }
502 }
503
504 fn lex_const_identifier(&mut self) -> Result<Token<'input>, ParsingError> {
505 let c = self.pop();
506 debug_assert!(c.is_ascii_uppercase() || c == '_');
507
508 loop {
509 match self.read() {
510 '_' | '0'..='9' => self.skip(),
511 c if c.is_ascii_uppercase() => self.skip(),
512 _ => break,
513 }
514 }
515
516 Ok(Token::ConstantIdent(self.slice()))
517 }
518
519 fn lex_number(&mut self) -> Result<Token<'input>, ParsingError> {
520 let c = self.read();
522 debug_assert!(c.is_ascii_digit());
523
524 while let '0'..='9' = self.read() {
525 self.skip();
526 }
527
528 self.slice()
529 .parse::<u64>()
530 .map_err(|error| ParsingError::InvalidLiteral {
531 span: self.span(),
532 kind: int_error_kind_to_literal_error_kind(
533 error.kind(),
534 LiteralErrorKind::FeltOverflow,
535 ),
536 })
537 .map(Token::Int)
538 }
539
540 fn lex_hex(&mut self) -> Result<Token<'input>, ParsingError> {
541 debug_assert!(self.read().is_ascii_hexdigit());
543
544 loop {
545 let c1 = self.read();
547 if !c1.is_ascii_hexdigit() {
548 break;
549 }
550 self.skip();
551
552 let c2 = self.read();
556 if !c2.is_ascii_hexdigit() {
557 return Err(ParsingError::InvalidHexLiteral {
558 span: self.span(),
559 kind: HexErrorKind::Invalid,
560 });
561 }
562 self.skip();
563 }
564
565 let span = self.span();
566 let start = span.start();
567 let end = span.end();
568 let digit_start = start.to_u32() + 2;
569 let span = SourceSpan::new(span.source_id(), start..end);
570 let value = parse_hex(span, self.slice_span(digit_start..end.to_u32()))?;
571 Ok(Token::HexValue(value))
572 }
573
574 fn lex_bin(&mut self) -> Result<Token<'input>, ParsingError> {
575 debug_assert!(is_ascii_binary(self.read()));
577
578 loop {
579 let c1 = self.read();
581 if !is_ascii_binary(c1) {
582 break;
583 }
584 self.skip();
585 }
586
587 let span = self.span();
588 let start = span.start();
589 let digit_start = start.to_u32() + 2;
590 let end = span.end();
591 let span = SourceSpan::new(span.source_id(), start..end);
592 let value = parse_bin(span, self.slice_span(digit_start..end.to_u32()))?;
593 Ok(Token::BinValue(value))
594 }
595}
596
597impl<'input> Iterator for Lexer<'input> {
598 type Item = Lexed<'input>;
599
600 fn next(&mut self) -> Option<Self::Item> {
601 let mut res = self.lex();
602 while let Some(Ok((_, Token::Comment, _))) = res {
603 res = self.lex();
604 }
605 res
606 }
607}
608
609fn parse_hex(span: SourceSpan, hex_digits: &str) -> Result<IntValue, ParsingError> {
613 use miden_core::{FieldElement, StarkField};
614 match hex_digits.len() {
615 n if n <= 16 && n.is_multiple_of(2) => {
617 let value = u64::from_str_radix(hex_digits, 16).map_err(|error| {
618 ParsingError::InvalidLiteral {
619 span,
620 kind: int_error_kind_to_literal_error_kind(
621 error.kind(),
622 LiteralErrorKind::FeltOverflow,
623 ),
624 }
625 })?;
626 if value >= Felt::MODULUS {
627 return Err(ParsingError::InvalidLiteral {
628 span,
629 kind: LiteralErrorKind::FeltOverflow,
630 });
631 }
632 Ok(shrink_u64_hex(value))
633 },
634 64 => {
636 let mut word = [Felt::ZERO; 4];
637 for (index, element) in word.iter_mut().enumerate() {
638 let offset = index * 16;
639 let mut felt_bytes = [0u8; 8];
640 let digits = &hex_digits[offset..(offset + 16)];
641 for (byte_idx, byte) in felt_bytes.iter_mut().enumerate() {
642 let byte_str = &digits[(byte_idx * 2)..((byte_idx * 2) + 2)];
643 *byte = u8::from_str_radix(byte_str, 16).map_err(|error| {
644 ParsingError::InvalidLiteral {
645 span,
646 kind: int_error_kind_to_literal_error_kind(
647 error.kind(),
648 LiteralErrorKind::FeltOverflow,
649 ),
650 }
651 })?;
652 }
653 let value = u64::from_le_bytes(felt_bytes);
654 if value >= Felt::MODULUS {
655 return Err(ParsingError::InvalidLiteral {
656 span,
657 kind: LiteralErrorKind::FeltOverflow,
658 });
659 }
660 *element = Felt::new(value);
661 }
662 Ok(IntValue::Word(WordValue(word)))
663 },
664 n if n > 64 => Err(ParsingError::InvalidHexLiteral { span, kind: HexErrorKind::TooLong }),
666 n if !n.is_multiple_of(2) && n < 64 => {
667 Err(ParsingError::InvalidHexLiteral { span, kind: HexErrorKind::MissingDigits })
668 },
669 _ => Err(ParsingError::InvalidHexLiteral { span, kind: HexErrorKind::Invalid }),
670 }
671}
672
673fn parse_bin(span: SourceSpan, bin_digits: &str) -> Result<BinEncodedValue, ParsingError> {
674 if bin_digits.len() <= 32 {
675 let value =
676 u32::from_str_radix(bin_digits, 2).map_err(|error| ParsingError::InvalidLiteral {
677 span,
678 kind: int_error_kind_to_literal_error_kind(
679 error.kind(),
680 LiteralErrorKind::U32Overflow,
681 ),
682 })?;
683 Ok(shrink_u32_bin(value))
684 } else {
685 Err(ParsingError::InvalidBinaryLiteral { span, kind: BinErrorKind::TooLong })
686 }
687}
688
689#[inline(always)]
690fn is_ascii_binary(c: char) -> bool {
691 matches!(c, '0'..='1')
692}
693
694#[inline]
695fn shrink_u64_hex(n: u64) -> IntValue {
696 if n <= (u8::MAX as u64) {
697 IntValue::U8(n as u8)
698 } else if n <= (u16::MAX as u64) {
699 IntValue::U16(n as u16)
700 } else if n <= (u32::MAX as u64) {
701 IntValue::U32(n as u32)
702 } else {
703 IntValue::Felt(Felt::new(n))
704 }
705}
706
707#[inline]
708fn shrink_u32_bin(n: u32) -> BinEncodedValue {
709 if n <= (u8::MAX as u32) {
710 BinEncodedValue::U8(n as u8)
711 } else if n <= (u16::MAX as u32) {
712 BinEncodedValue::U16(n as u16)
713 } else {
714 BinEncodedValue::U32(n)
715 }
716}
717
718#[inline]
719fn int_error_kind_to_literal_error_kind(
720 kind: &IntErrorKind,
721 overflow: LiteralErrorKind,
722) -> LiteralErrorKind {
723 match kind {
724 IntErrorKind::Empty => LiteralErrorKind::Empty,
725 IntErrorKind::InvalidDigit => LiteralErrorKind::InvalidDigit,
726 IntErrorKind::PosOverflow | IntErrorKind::NegOverflow => overflow,
727 _ => unreachable!(),
728 }
729}