1use alloc::{borrow::Cow, string::String};
2use core::{num::IntErrorKind, ops::Range};
3
4use miden_core::field::PrimeCharacteristicRing;
5use miden_debug_types::{ByteOffset, SourceId, SourceSpan};
6
7use super::{
8 BinEncodedValue, BinErrorKind, DocumentationType, HexErrorKind, IntValue, LiteralErrorKind,
9 ParsingError, Scanner, Token, WordValue,
10};
11use crate::Felt;
12
13pub type Lexed<'input> = Result<(u32, Token<'input>, u32), ParsingError>;
20
21macro_rules! pop {
23 ($lex:ident) => {{
24 $lex.skip();
25 }};
26 ($lex:ident, $token:expr) => {{
27 $lex.skip();
28 Ok($token)
29 }};
30}
31
32macro_rules! pop2 {
34 ($lex:ident) => {{
35 $lex.skip();
36 $lex.skip();
37 }};
38 ($lex:ident, $token:expr) => {{
39 $lex.skip();
40 $lex.skip();
41 Ok($token)
42 }};
43}
44
45pub struct Lexer<'input> {
61 source_id: SourceId,
63
64 scanner: Scanner<'input>,
67
68 token: Token<'input>,
71
72 token_start: usize,
75
76 token_end: usize,
79
80 line_num: usize,
82
83 eof: bool,
87 empty: bool,
88
89 keywords: aho_corasick::AhoCorasick,
91
92 error: Option<ParsingError>,
94}
95
96impl<'input> Lexer<'input> {
97 pub fn new(source_id: SourceId, scanner: Scanner<'input>) -> Self {
100 let start = scanner.start();
101 let keywords = Token::keyword_searcher();
102 let mut lexer = Self {
103 source_id,
104 scanner,
105 token: Token::Eof,
106 token_start: start,
107 token_end: start,
108 line_num: 0,
109 eof: false,
110 empty: false,
111 keywords,
112 error: None,
113 };
114 lexer.advance();
115 lexer
116 }
117
118 pub fn lex(&mut self) -> Option<<Self as Iterator>::Item> {
119 if let Some(err) = self.error.take() {
120 return Some(Err(err));
121 }
122
123 if self.eof && matches!(self.token, Token::Eof) {
124 if self.empty {
126 return None;
127 } else {
128 self.empty = true;
129 let end = self.token_end as u32;
130 return Some(Ok((end, Token::Eof, end)));
131 }
132 }
133
134 let token = core::mem::replace(&mut self.token, Token::Eof);
135 let start = self.token_start;
136 let end = self.token_end;
137 self.advance();
138 Some(Ok((start as u32, token, end as u32)))
139 }
140
141 fn advance(&mut self) {
142 self.advance_start();
143 match self.tokenize() {
144 Ok(tok) => {
145 self.token = tok;
146 },
147 Err(err) => {
148 self.error = Some(err);
149 },
150 }
151 }
152
153 #[inline]
154 fn advance_start(&mut self) {
155 let mut position: usize;
156 loop {
157 let (pos, c) = self.scanner.read();
158
159 position = pos;
160
161 if c == '\0' {
162 self.eof = true;
163 return;
164 }
165
166 if c.is_whitespace() {
167 if c == '\n' {
168 self.line_num += 1;
169 }
170 self.scanner.advance();
171 continue;
172 }
173
174 break;
175 }
176
177 self.token_start = position;
178 }
179
180 #[inline]
181 fn pop(&mut self) -> char {
182 let (pos, c) = self.scanner.pop();
183 self.token_end = pos + c.len_utf8();
184 c
185 }
186
187 #[inline]
188 fn peek(&mut self) -> char {
189 let (_, c) = self.scanner.peek();
190 c
191 }
192
193 #[inline]
194 #[expect(unused)]
195 fn peek_next(&mut self) -> char {
196 let (_, c) = self.scanner.peek_next();
197 c
198 }
199
200 #[inline]
201 fn read(&mut self) -> char {
202 let (_, c) = self.scanner.read();
203 c
204 }
205
206 #[inline]
207 fn skip(&mut self) {
208 self.pop();
209 }
210
211 #[inline]
213 fn span(&self) -> SourceSpan {
214 assert!(self.token_start <= self.token_end, "invalid range");
215 assert!(self.token_end <= u32::MAX as usize, "file too large");
216 SourceSpan::new(self.source_id, (self.token_start as u32)..(self.token_end as u32))
217 }
218
219 #[inline]
220 fn slice_span(&self, span: impl Into<Range<u32>>) -> &'input str {
221 let range = span.into();
222 self.scanner.slice((range.start as usize)..(range.end as usize))
223 }
224
225 #[inline]
227 fn slice(&self) -> &'input str {
228 self.slice_span(self.span())
229 }
230
231 #[inline]
232 fn skip_whitespace(&mut self) {
233 let mut c: char;
234 loop {
235 c = self.read();
236
237 if !c.is_whitespace() {
238 break;
239 }
240
241 if c == '\n' {
242 self.line_num += 1;
243 }
244
245 self.skip();
246 }
247 }
248
249 fn tokenize(&mut self) -> Result<Token<'input>, ParsingError> {
250 let c = self.read();
251
252 if c == '#' {
253 match self.peek() {
254 '!' => {
255 self.skip();
256 self.skip();
257 return self.lex_docs();
258 },
259 _ => {
260 self.skip();
261 self.skip_comment();
262 return Ok(Token::Comment);
263 },
264 }
265 }
266
267 if c == '\0' {
268 self.eof = true;
269 return Ok(Token::Eof);
270 }
271
272 if c.is_whitespace() {
273 self.skip_whitespace();
274 }
275
276 match self.read() {
277 '@' => pop!(self, Token::At),
278 '!' => pop!(self, Token::Bang),
279 ':' => match self.peek() {
280 ':' => pop2!(self, Token::ColonColon),
281 _ => pop!(self, Token::Colon),
282 },
283 ';' => pop!(self, Token::Semicolon),
284 '.' => match self.peek() {
285 '.' => pop2!(self, Token::Range),
286 _ => pop!(self, Token::Dot),
287 },
288 ',' => pop!(self, Token::Comma),
289 '=' => pop!(self, Token::Equal),
290 '<' => pop!(self, Token::Langle),
291 '{' => pop!(self, Token::Lbrace),
292 '[' => pop!(self, Token::Lbracket),
293 '(' => pop!(self, Token::Lparen),
294 '>' => pop!(self, Token::Rangle),
295 '}' => pop!(self, Token::Rbrace),
296 ']' => pop!(self, Token::Rbracket),
297 ')' => pop!(self, Token::Rparen),
298 '-' => match self.peek() {
299 '>' => pop2!(self, Token::Rstab),
300 _ => pop!(self, Token::Minus),
301 },
302 '+' => pop!(self, Token::Plus),
303 '/' => match self.peek() {
304 '/' => pop2!(self, Token::SlashSlash),
305 _ => pop!(self, Token::Slash),
306 },
307 '*' => pop!(self, Token::Star),
308 '$' => self.lex_special_identifier(),
309 '"' => self.lex_quoted_identifier_or_string(),
310 '0' => match self.peek() {
311 'x' => {
312 self.skip();
313 self.skip();
314 self.lex_hex()
315 },
316 'b' => {
317 self.skip();
318 self.skip();
319 self.lex_bin()
320 },
321 '0'..='9' => self.lex_number(),
322 _ => pop!(self, Token::Int(0)),
323 },
324 '1'..='9' => self.lex_number(),
325 'a'..='z' => self.lex_keyword_or_ident(),
326 'A'..='Z' => self.lex_identifier(),
327 '_' => match self.peek() {
328 c if c.is_ascii_alphanumeric() => self.lex_identifier(),
329 _ => Err(ParsingError::InvalidToken { span: self.span() }),
330 },
331 _ => Err(ParsingError::InvalidToken { span: self.span() }),
332 }
333 }
334
335 fn lex_docs(&mut self) -> Result<Token<'input>, ParsingError> {
336 let mut buf = String::new();
337
338 let mut c;
339 let mut line_start = self.token_start + 2;
340 let is_module_doc = self.line_num == 0;
341 loop {
342 c = self.read();
343
344 if c == '\0' {
345 self.eof = true;
346 buf.push_str(self.slice_span((line_start as u32)..(self.token_end as u32)).trim());
347
348 let is_first_line = self.line_num == 0;
349 break Ok(Token::DocComment(if is_first_line {
350 DocumentationType::Module(buf)
351 } else {
352 DocumentationType::Form(buf)
353 }));
354 }
355
356 if c == '\n' {
357 self.line_num += 1;
358
359 buf.push_str(self.slice_span((line_start as u32)..(self.token_end as u32)).trim());
360 buf.push('\n');
361
362 self.skip();
363 c = self.read();
364 match c {
365 '#' if self.peek() == '!' => {
366 self.skip();
367 self.skip();
368 line_start = self.token_end;
369 continue;
370 },
371 _ if is_module_doc => {
372 break Ok(Token::DocComment(DocumentationType::Module(buf)));
373 },
374 _ => {
375 break Ok(Token::DocComment(DocumentationType::Form(buf)));
376 },
377 }
378 }
379
380 self.skip();
381 }
382 }
383
384 fn skip_comment(&mut self) {
385 let mut c;
386 loop {
387 c = self.read();
388
389 if c == '\n' {
390 self.skip();
391 self.line_num += 1;
392 break;
393 }
394
395 if c == '\0' {
396 self.eof = true;
397 break;
398 }
399
400 self.skip();
401 }
402 }
403
404 fn lex_keyword_or_ident(&mut self) -> Result<Token<'input>, ParsingError> {
405 let c = self.pop();
406 debug_assert!(c.is_ascii_alphabetic() && c.is_lowercase());
407
408 loop {
409 match self.read() {
410 '_' | '0'..='9' => self.skip(),
411 c if c.is_ascii_alphabetic() => self.skip(),
412 _ => break,
413 }
414 }
415
416 let name = self.slice();
417 match name {
418 "exp" => {
419 if self.read() == '.' && self.peek() == 'u' {
421 pop2!(self, Token::ExpU)
422 } else {
423 Ok(Token::Exp)
424 }
425 },
426 _ => Ok(Token::from_keyword_or_ident_with_searcher(name, &self.keywords)),
427 }
428 }
429
430 fn lex_quoted_identifier_or_string(&mut self) -> Result<Token<'input>, ParsingError> {
431 self.skip();
433
434 let mut is_identifier = true;
435 let quote_size = ByteOffset::from_char_len('"');
436 loop {
437 match self.read() {
438 '\0' | '\n' => {
439 break Err(ParsingError::UnclosedQuote {
440 start: SourceSpan::at(self.source_id, self.span().start()),
441 });
442 },
443 '\\' => {
444 is_identifier = false;
445 self.skip();
446 match self.read() {
447 '"' | '\n' => {
448 self.skip();
449 },
450 _ => (),
451 }
452 },
453 '"' => {
454 let span = self.span();
455 let start = span.start() + quote_size;
456 let span = SourceSpan::new(self.source_id, start..span.end());
457
458 self.skip();
459 break Ok(if is_identifier {
460 Token::QuotedIdent(self.slice_span(span))
461 } else {
462 Token::QuotedString(self.slice_span(span))
463 });
464 },
465 c if c.is_alphanumeric() || c.is_ascii_graphic() => {
466 self.skip();
467 },
468 _ => {
469 is_identifier = false;
470 self.skip();
471 },
472 }
473 }
474 }
475
476 fn lex_identifier(&mut self) -> Result<Token<'input>, ParsingError> {
477 let c = self.pop();
478 debug_assert!(c.is_ascii_alphabetic() || c == '_');
479
480 let mut is_constant_ident = c.is_ascii_uppercase() || c == '_';
481
482 loop {
483 match self.read() {
484 '_' | '0'..='9' => self.skip(),
485 c if c.is_ascii_alphabetic() => {
486 is_constant_ident &= c.is_ascii_uppercase();
487 self.skip();
488 },
489 _ => break,
490 }
491 }
492
493 if is_constant_ident {
494 Ok(Token::ConstantIdent(self.slice()))
495 } else {
496 Ok(Token::Ident(self.slice()))
497 }
498 }
499
500 fn lex_special_identifier(&mut self) -> Result<Token<'input>, ParsingError> {
501 let c = self.pop();
502 debug_assert_eq!(c, '$');
503
504 loop {
505 match self.read() {
506 '_' | '0'..='9' => self.skip(),
507 c if c.is_ascii_lowercase() => self.skip(),
508 _ => break,
509 }
510 }
511
512 match self.slice() {
513 id @ ("$kernel" | "$exec") => Ok(Token::Ident(id)),
514 _ => {
515 let start = self.span().start();
516 let span = SourceSpan::at(self.span().source_id(), start);
517 Err(ParsingError::InvalidToken { span })
518 },
519 }
520 }
521
522 fn lex_number(&mut self) -> Result<Token<'input>, ParsingError> {
523 let c = self.read();
525 debug_assert!(c.is_ascii_digit());
526
527 while let '0'..='9' = self.read() {
528 self.skip();
529 }
530
531 self.slice()
532 .parse::<u64>()
533 .map_err(|error| ParsingError::InvalidLiteral {
534 span: self.span(),
535 kind: int_error_kind_to_literal_error_kind(
536 error.kind(),
537 LiteralErrorKind::FeltOverflow,
538 ),
539 })
540 .map(Token::Int)
541 }
542
543 fn lex_hex(&mut self) -> Result<Token<'input>, ParsingError> {
544 debug_assert!(self.read().is_ascii_hexdigit());
546
547 loop {
548 let c1 = self.read();
550 if !c1.is_ascii_hexdigit() {
551 break;
552 }
553 self.skip();
554
555 let c2 = self.read();
559 if !c2.is_ascii_hexdigit() {
560 break;
564 }
565 self.skip();
566 }
567
568 let span = self.span();
569 let start = span.start();
570 let end = span.end();
571 let digit_start = start.to_u32() + 2;
572 let span = SourceSpan::new(span.source_id(), start..end);
573 parse_hex(span, self.slice_span(digit_start..end.to_u32()))
574 }
575
576 fn lex_bin(&mut self) -> Result<Token<'input>, ParsingError> {
577 debug_assert!(is_ascii_binary(self.read()));
579
580 loop {
581 let c1 = self.read();
583 if !is_ascii_binary(c1) {
584 break;
585 }
586 self.skip();
587 }
588
589 let span = self.span();
590 let start = span.start();
591 let digit_start = start.to_u32() + 2;
592 let end = span.end();
593 let span = SourceSpan::new(span.source_id(), start..end);
594 let value = parse_bin(span, self.slice_span(digit_start..end.to_u32()))?;
595 Ok(Token::BinValue(value))
596 }
597}
598
599impl<'input> Iterator for Lexer<'input> {
600 type Item = Lexed<'input>;
601
602 fn next(&mut self) -> Option<Self::Item> {
603 let mut res = self.lex();
604 while let Some(Ok((_, Token::Comment, _))) = res {
605 res = self.lex();
606 }
607 res
608 }
609}
610
611fn pad_hex_if_needed<'a>(hex: &'a str) -> Cow<'a, str> {
615 if hex.len().is_multiple_of(2) {
616 Cow::Borrowed(hex)
617 } else {
618 let mut s = alloc::string::String::with_capacity(hex.len() + 1);
620 s.push('0');
621 s.push_str(hex);
622 Cow::Owned(s)
623 }
624}
625
626fn parse_hex<'input>(
627 span: SourceSpan,
628 hex_digits: &'input str,
629) -> Result<Token<'input>, ParsingError> {
630 use miden_core::{Felt, field::PrimeField64};
631
632 let hex_digits = pad_hex_if_needed(hex_digits);
634
635 match hex_digits.len() {
636 n if n <= 16 && n.is_multiple_of(2) => {
638 let value = u64::from_str_radix(&hex_digits, 16).map_err(|error| {
639 ParsingError::InvalidLiteral {
640 span,
641 kind: int_error_kind_to_literal_error_kind(
642 error.kind(),
643 LiteralErrorKind::FeltOverflow,
644 ),
645 }
646 })?;
647 if value >= Felt::ORDER_U64 {
648 return Err(ParsingError::InvalidLiteral {
649 span,
650 kind: LiteralErrorKind::FeltOverflow,
651 });
652 }
653 Ok(Token::HexValue(shrink_u64_hex(value)))
654 },
655 64 => {
657 let mut word = [Felt::ZERO; 4];
658 for (index, element) in word.iter_mut().enumerate() {
659 let offset = index * 16;
660 let mut felt_bytes = [0u8; 8];
661 let digits = &hex_digits[offset..(offset + 16)];
662 for (byte_idx, byte) in felt_bytes.iter_mut().enumerate() {
663 let byte_str = &digits[(byte_idx * 2)..((byte_idx * 2) + 2)];
664 *byte = u8::from_str_radix(byte_str, 16).map_err(|error| {
665 ParsingError::InvalidLiteral {
666 span,
667 kind: int_error_kind_to_literal_error_kind(
668 error.kind(),
669 LiteralErrorKind::FeltOverflow,
670 ),
671 }
672 })?;
673 }
674 let value = u64::from_le_bytes(felt_bytes);
675 if value >= Felt::ORDER_U64 {
676 return Err(ParsingError::InvalidLiteral {
677 span,
678 kind: LiteralErrorKind::FeltOverflow,
679 });
680 }
681 *element = Felt::new(value);
682 }
683 Ok(Token::HexWord(WordValue(word)))
684 },
685 n if n > 64 => Err(ParsingError::InvalidHexLiteral { span, kind: HexErrorKind::TooLong }),
687 _ => Err(ParsingError::InvalidHexLiteral { span, kind: HexErrorKind::Invalid }),
688 }
689}
690
691fn parse_bin(span: SourceSpan, bin_digits: &str) -> Result<BinEncodedValue, ParsingError> {
692 if bin_digits.len() <= 32 {
693 let value =
694 u32::from_str_radix(bin_digits, 2).map_err(|error| ParsingError::InvalidLiteral {
695 span,
696 kind: int_error_kind_to_literal_error_kind(
697 error.kind(),
698 LiteralErrorKind::U32Overflow,
699 ),
700 })?;
701 Ok(shrink_u32_bin(value))
702 } else {
703 Err(ParsingError::InvalidBinaryLiteral { span, kind: BinErrorKind::TooLong })
704 }
705}
706
707#[inline(always)]
708fn is_ascii_binary(c: char) -> bool {
709 matches!(c, '0'..='1')
710}
711
712#[inline]
713pub fn shrink_u64_hex(n: u64) -> IntValue {
714 if n <= (u8::MAX as u64) {
715 IntValue::U8(n as u8)
716 } else if n <= (u16::MAX as u64) {
717 IntValue::U16(n as u16)
718 } else if n <= (u32::MAX as u64) {
719 IntValue::U32(n as u32)
720 } else {
721 IntValue::Felt(Felt::new(n))
722 }
723}
724
725#[inline]
726fn shrink_u32_bin(n: u32) -> BinEncodedValue {
727 if n <= (u8::MAX as u32) {
728 BinEncodedValue::U8(n as u8)
729 } else if n <= (u16::MAX as u32) {
730 BinEncodedValue::U16(n as u16)
731 } else {
732 BinEncodedValue::U32(n)
733 }
734}
735
736#[inline]
737fn int_error_kind_to_literal_error_kind(
738 kind: &IntErrorKind,
739 overflow: LiteralErrorKind,
740) -> LiteralErrorKind {
741 match kind {
742 IntErrorKind::Empty => LiteralErrorKind::Empty,
743 IntErrorKind::InvalidDigit => LiteralErrorKind::InvalidDigit,
744 IntErrorKind::PosOverflow | IntErrorKind::NegOverflow => overflow,
745 _ => unreachable!(),
746 }
747}