1use alloc::{borrow::Cow, string::String};
2use core::{num::IntErrorKind, ops::Range};
3
4use miden_debug_types::{ByteOffset, SourceId, SourceSpan};
5
6use super::{
7 BinEncodedValue, BinErrorKind, DocumentationType, HexErrorKind, IntValue, LiteralErrorKind,
8 ParsingError, Scanner, Token, WordValue,
9};
10use crate::Felt;
11
/// The result of lexing a single token: `(start_offset, token, end_offset)`
/// on success, or the error that interrupted tokenization.
pub type Lexed<'input> = Result<(u32, Token<'input>, u32), ParsingError>;
19
/// Consumes the current character via `Lexer::skip`, optionally evaluating to
/// `Ok($token)` so single-character tokens can be produced in one expression.
macro_rules! pop {
    ($lex:ident) => {{
        $lex.skip();
    }};
    ($lex:ident, $token:expr) => {{
        $lex.skip();
        Ok($token)
    }};
}
30
/// Consumes the next two characters via `Lexer::skip`, optionally evaluating
/// to `Ok($token)` — used for two-character tokens such as `::` and `->`.
macro_rules! pop2 {
    ($lex:ident) => {{
        $lex.skip();
        $lex.skip();
    }};
    ($lex:ident, $token:expr) => {{
        $lex.skip();
        $lex.skip();
        Ok($token)
    }};
}
43
/// A hand-written lexer that turns scanner input into a stream of [`Token`]s,
/// buffering one token of lookahead and deferring errors to the next `lex` call.
pub struct Lexer<'input> {
    /// Identifier of the source file being lexed; used when constructing spans.
    source_id: SourceId,

    /// The scanner producing `(byte_position, char)` pairs from the input.
    scanner: Scanner<'input>,

    /// The most recently tokenized token, handed out by the next call to `lex`.
    token: Token<'input>,

    /// Byte offset at which the buffered token starts.
    token_start: usize,

    /// Byte offset just past the end of the buffered token.
    token_end: usize,

    /// Current line number (0-based); incremented on every `'\n'` encountered.
    line_num: usize,

    /// Set once the scanner reaches the `'\0'` end-of-input sentinel.
    eof: bool,

    /// Set after the final `Token::Eof` has been emitted, ending iteration.
    empty: bool,

    /// Searcher used to distinguish keywords from plain identifiers.
    keywords: aho_corasick::AhoCorasick,

    /// An error produced while tokenizing, reported by the next call to `lex`.
    error: Option<ParsingError>,
}
94
impl<'input> Lexer<'input> {
    /// Constructs a lexer over `scanner`, attributing spans to `source_id`,
    /// and primes it by lexing the first token.
    pub fn new(source_id: SourceId, scanner: Scanner<'input>) -> Self {
        let start = scanner.start();
        let keywords = Token::keyword_searcher();
        let mut lexer = Self {
            source_id,
            scanner,
            token: Token::Eof,
            token_start: start,
            token_end: start,
            line_num: 0,
            eof: false,
            empty: false,
            keywords,
            error: None,
        };
        lexer.advance();
        lexer
    }

    /// Returns the buffered token with its byte offsets, a pending error, or
    /// `None` once the final `Token::Eof` has already been emitted.
    pub fn lex(&mut self) -> Option<<Self as Iterator>::Item> {
        // Report an error captured by the previous `advance` before anything else.
        if let Some(err) = self.error.take() {
            return Some(Err(err));
        }

        if self.eof && matches!(self.token, Token::Eof) {
            if self.empty {
                // Eof was already emitted on a previous call; stop iteration.
                return None;
            } else {
                // Emit a single zero-width Eof token at the end of input.
                self.empty = true;
                let end = self.token_end as u32;
                return Some(Ok((end, Token::Eof, end)));
            }
        }

        // Hand out the buffered token, then tokenize the next one.
        let token = core::mem::replace(&mut self.token, Token::Eof);
        let start = self.token_start;
        let end = self.token_end;
        self.advance();
        Some(Ok((start as u32, token, end as u32)))
    }

    /// Tokenizes the next token into `self.token`, or stashes the error in
    /// `self.error` for the next call to `lex`.
    fn advance(&mut self) {
        self.advance_start();
        match self.tokenize() {
            Ok(tok) => {
                self.token = tok;
            },
            Err(err) => {
                self.error = Some(err);
            },
        }
    }

    /// Skips leading whitespace (tracking newlines), then anchors both
    /// `token_start` and `token_end` at the first non-whitespace character.
    #[inline]
    fn advance_start(&mut self) {
        let mut position: usize;
        loop {
            let (pos, c) = self.scanner.read();

            position = pos;

            if c == '\0' {
                // End of input: leave token_start/token_end untouched.
                self.eof = true;
                return;
            }

            if c.is_whitespace() {
                if c == '\n' {
                    self.line_num += 1;
                }
                self.scanner.advance();
                continue;
            }

            break;
        }

        self.token_start = position;
        self.token_end = position;
    }

    /// Consumes the current character, extending `token_end` past it.
    #[inline]
    fn pop(&mut self) -> char {
        let (pos, c) = self.scanner.pop();
        self.token_end = pos + c.len_utf8();
        c
    }

    /// Returns the character following the current one, without consuming.
    #[inline]
    fn peek(&mut self) -> char {
        let (_, c) = self.scanner.peek();
        c
    }

    /// Returns the character two positions ahead, without consuming.
    #[inline]
    #[expect(unused)]
    fn peek_next(&mut self) -> char {
        let (_, c) = self.scanner.peek_next();
        c
    }

    /// Returns the current character without consuming it.
    #[inline]
    fn read(&mut self) -> char {
        let (_, c) = self.scanner.read();
        c
    }

    /// Consumes the current character, discarding it.
    #[inline]
    fn skip(&mut self) {
        self.pop();
    }

    /// The span of the token currently being lexed.
    ///
    /// Panics (debug assertions) if the token range is inverted, or if the
    /// file is larger than `u32::MAX` bytes.
    #[inline]
    fn span(&self) -> SourceSpan {
        assert!(self.token_start <= self.token_end, "invalid range");
        assert!(self.token_end <= u32::MAX as usize, "file too large");
        SourceSpan::new(self.source_id, (self.token_start as u32)..(self.token_end as u32))
    }

    /// The span covering the single character `c` located at byte offset `pos`.
    #[inline]
    fn char_span(&self, pos: usize, c: char) -> SourceSpan {
        let end = pos + c.len_utf8();
        SourceSpan::new(self.source_id, (pos as u32)..(end as u32))
    }

    /// The source text covered by `span`.
    #[inline]
    fn slice_span(&self, span: impl Into<Range<u32>>) -> &'input str {
        let range = span.into();
        self.scanner.slice((range.start as usize)..(range.end as usize))
    }

    /// The source text of the token currently being lexed.
    #[inline]
    fn slice(&self) -> &'input str {
        self.slice_span(self.span())
    }

    /// Consumes whitespace up to the next non-whitespace character,
    /// tracking newlines for line numbering.
    #[inline]
    fn skip_whitespace(&mut self) {
        let mut c: char;
        loop {
            c = self.read();

            if !c.is_whitespace() {
                break;
            }

            if c == '\n' {
                self.line_num += 1;
            }

            self.skip();
        }
    }

    /// Dispatches on the current character to lex a single token.
    fn tokenize(&mut self) -> Result<Token<'input>, ParsingError> {
        let c = self.read();

        // `#` introduces either a doc comment (`#!`) or a line comment.
        if c == '#' {
            match self.peek() {
                '!' => {
                    self.skip();
                    self.skip();
                    return self.lex_docs();
                },
                _ => {
                    self.skip();
                    self.skip_comment();
                    return Ok(Token::Comment);
                },
            }
        }

        if c == '\0' {
            self.eof = true;
            return Ok(Token::Eof);
        }

        if c.is_whitespace() {
            self.skip_whitespace();
        }

        // NOTE(review): `pos` is taken from `token_start`, which is not
        // re-anchored by the `skip_whitespace` call above; if that branch
        // ran, the `InvalidToken` spans below would point at the skipped
        // whitespace rather than the offending character. Confirm whether
        // this path is reachable with leading whitespace (advance_start
        // normally consumes it first).
        let pos = self.token_start;
        let c = self.read();
        match c {
            '@' => pop!(self, Token::At),
            '!' => pop!(self, Token::Bang),
            ':' => match self.peek() {
                ':' => pop2!(self, Token::ColonColon),
                _ => pop!(self, Token::Colon),
            },
            ';' => pop!(self, Token::Semicolon),
            '.' => match self.peek() {
                '.' => pop2!(self, Token::Range),
                _ => pop!(self, Token::Dot),
            },
            ',' => pop!(self, Token::Comma),
            '=' => pop!(self, Token::Equal),
            '<' => pop!(self, Token::Langle),
            '{' => pop!(self, Token::Lbrace),
            '[' => pop!(self, Token::Lbracket),
            '(' => pop!(self, Token::Lparen),
            '>' => pop!(self, Token::Rangle),
            '}' => pop!(self, Token::Rbrace),
            ']' => pop!(self, Token::Rbracket),
            ')' => pop!(self, Token::Rparen),
            '-' => match self.peek() {
                '>' => pop2!(self, Token::Rstab),
                _ => pop!(self, Token::Minus),
            },
            '+' => pop!(self, Token::Plus),
            '/' => match self.peek() {
                '/' => pop2!(self, Token::SlashSlash),
                _ => pop!(self, Token::Slash),
            },
            '*' => pop!(self, Token::Star),
            '$' => self.lex_special_identifier(),
            '"' => self.lex_quoted_identifier_or_string(),
            // Numeric literals: `0x…` hex, `0b…` binary, otherwise decimal.
            '0' => match self.peek() {
                'x' => {
                    self.skip();
                    self.skip();
                    self.lex_hex()
                },
                'b' => {
                    self.skip();
                    self.skip();
                    self.lex_bin()
                },
                '0'..='9' => self.lex_number(),
                _ => pop!(self, Token::Int(0)),
            },
            '1'..='9' => self.lex_number(),
            'a'..='z' => self.lex_keyword_or_ident(),
            'A'..='Z' => self.lex_identifier(),
            // A leading underscore is only valid when followed by an
            // alphanumeric character.
            '_' => match self.peek() {
                c if c.is_ascii_alphanumeric() => self.lex_identifier(),
                _ => Err(ParsingError::InvalidToken { span: self.char_span(pos, c) }),
            },
            _ => Err(ParsingError::InvalidToken { span: self.char_span(pos, c) }),
        }
    }

    /// Lexes a documentation comment (`#!`), concatenating the text of
    /// successive `#!` lines (each trimmed, separated by `'\n'`) into a
    /// single buffer. A doc comment beginning on the first line of the file
    /// is classified as module documentation, otherwise as documentation for
    /// the following form.
    fn lex_docs(&mut self) -> Result<Token<'input>, ParsingError> {
        let mut buf = String::new();

        let mut c;
        // Skip the leading `#!` when slicing each line's text.
        let mut line_start = self.token_start + 2;
        let is_module_doc = self.line_num == 0;
        loop {
            c = self.read();

            if c == '\0' {
                self.eof = true;
                buf.push_str(self.slice_span((line_start as u32)..(self.token_end as u32)).trim());

                // NOTE(review): this re-derives "module doc" from the current
                // line number instead of reusing `is_module_doc` captured
                // above; for a multi-line doc comment starting on line 0 and
                // running to EOF, `line_num` has already been incremented and
                // the two disagree. Confirm which classification is intended.
                let is_first_line = self.line_num == 0;
                break Ok(Token::DocComment(if is_first_line {
                    DocumentationType::Module(buf)
                } else {
                    DocumentationType::Form(buf)
                }));
            }

            if c == '\n' {
                self.line_num += 1;

                // Flush the current line (trimmed) into the buffer.
                buf.push_str(self.slice_span((line_start as u32)..(self.token_end as u32)).trim());
                buf.push('\n');

                self.skip();
                c = self.read();
                match c {
                    // Another `#!` line: the doc comment continues.
                    '#' if self.peek() == '!' => {
                        self.skip();
                        self.skip();
                        line_start = self.token_end;
                        continue;
                    },
                    _ if is_module_doc => {
                        break Ok(Token::DocComment(DocumentationType::Module(buf)));
                    },
                    _ => {
                        break Ok(Token::DocComment(DocumentationType::Form(buf)));
                    },
                }
            }

            self.skip();
        }
    }

    /// Consumes a line comment up to and including the newline (or EOF).
    fn skip_comment(&mut self) {
        let mut c;
        loop {
            c = self.read();

            if c == '\n' {
                self.skip();
                self.line_num += 1;
                break;
            }

            if c == '\0' {
                self.eof = true;
                break;
            }

            self.skip();
        }
    }

    /// Lexes a lowercase-led word and resolves it to either a keyword token
    /// (via the precomputed searcher) or a plain identifier.
    fn lex_keyword_or_ident(&mut self) -> Result<Token<'input>, ParsingError> {
        let c = self.pop();
        debug_assert!(c.is_ascii_alphabetic() && c.is_lowercase());

        loop {
            match self.read() {
                '_' | '0'..='9' => self.skip(),
                c if c.is_ascii_alphabetic() => self.skip(),
                _ => break,
            }
        }

        let name = self.slice();
        match name {
            // `exp` followed by `.u` forms the distinct `exp.u` token.
            "exp" => {
                if self.read() == '.' && self.peek() == 'u' {
                    pop2!(self, Token::ExpU)
                } else {
                    Ok(Token::Exp)
                }
            },
            _ => Ok(Token::from_keyword_or_ident_with_searcher(name, &self.keywords)),
        }
    }

    /// Lexes a double-quoted token, producing `QuotedIdent` when every
    /// character is identifier-safe (alphanumeric/ASCII-graphic, no escapes),
    /// and `QuotedString` otherwise. An unterminated quote (newline or EOF
    /// before the closing `"`) is an error.
    fn lex_quoted_identifier_or_string(&mut self) -> Result<Token<'input>, ParsingError> {
        // Consume the opening quote.
        self.skip();

        let mut is_identifier = true;
        let quote_size = ByteOffset::from_char_len('"');
        loop {
            match self.read() {
                '\0' | '\n' => {
                    break Err(ParsingError::UnclosedQuote {
                        start: SourceSpan::at(self.source_id, self.span().start()),
                    });
                },
                // Escape sequence: consume the backslash and, for `\"` or an
                // escaped newline, the escaped character as well.
                '\\' => {
                    is_identifier = false;
                    self.skip();
                    match self.read() {
                        '"' | '\n' => {
                            self.skip();
                        },
                        _ => (),
                    }
                },
                '"' => {
                    // Slice the content between (not including) the quotes.
                    let span = self.span();
                    let start = span.start() + quote_size;
                    let span = SourceSpan::new(self.source_id, start..span.end());

                    self.skip();
                    break Ok(if is_identifier {
                        Token::QuotedIdent(self.slice_span(span))
                    } else {
                        Token::QuotedString(self.slice_span(span))
                    });
                },
                c if c.is_alphanumeric() || c.is_ascii_graphic() => {
                    self.skip();
                },
                _ => {
                    is_identifier = false;
                    self.skip();
                },
            }
        }
    }

    /// Lexes an identifier starting with an uppercase letter or underscore,
    /// classifying it as `ConstantIdent` when every letter is uppercase.
    fn lex_identifier(&mut self) -> Result<Token<'input>, ParsingError> {
        let c = self.pop();
        debug_assert!(c.is_ascii_alphabetic() || c == '_');

        let mut is_constant_ident = c.is_ascii_uppercase() || c == '_';

        loop {
            match self.read() {
                '_' | '0'..='9' => self.skip(),
                c if c.is_ascii_alphabetic() => {
                    // A single lowercase letter demotes this to a plain ident.
                    is_constant_ident &= c.is_ascii_uppercase();
                    self.skip();
                },
                _ => break,
            }
        }

        if is_constant_ident {
            Ok(Token::ConstantIdent(self.slice()))
        } else {
            Ok(Token::Ident(self.slice()))
        }
    }

    /// Lexes a `$`-prefixed identifier; only `$kernel` and `$exec` are valid.
    fn lex_special_identifier(&mut self) -> Result<Token<'input>, ParsingError> {
        let c = self.pop();
        debug_assert_eq!(c, '$');

        loop {
            match self.read() {
                '_' | '0'..='9' => self.skip(),
                c if c.is_ascii_lowercase() => self.skip(),
                _ => break,
            }
        }

        match self.slice() {
            id @ ("$kernel" | "$exec") => Ok(Token::Ident(id)),
            _ => {
                // Report a zero-width span at the `$` that began the token.
                let start = self.span().start();
                let span = SourceSpan::at(self.span().source_id(), start);
                Err(ParsingError::InvalidToken { span })
            },
        }
    }

    /// Lexes a decimal integer literal into `Token::Int`.
    ///
    /// NOTE(review): unlike hex literals (see `parse_hex`), decimal literals
    /// are not range-checked against the field order here — presumably that
    /// is validated downstream; confirm.
    fn lex_number(&mut self) -> Result<Token<'input>, ParsingError> {
        let c = self.read();
        debug_assert!(c.is_ascii_digit());

        while let '0'..='9' = self.read() {
            self.skip();
        }

        self.slice()
            .parse::<u64>()
            .map_err(|error| ParsingError::InvalidLiteral {
                span: self.span(),
                kind: int_error_kind_to_literal_error_kind(
                    error.kind(),
                    LiteralErrorKind::FeltOverflow,
                ),
            })
            .map(Token::Int)
    }

    /// Lexes the digits of a `0x…` literal (prefix already consumed) and
    /// delegates value construction to `parse_hex`.
    fn lex_hex(&mut self) -> Result<Token<'input>, ParsingError> {
        debug_assert!(self.read().is_ascii_hexdigit());

        // Consume hex digits two at a time; a trailing odd digit is allowed
        // here (the first `break`) and padded later by `parse_hex`.
        loop {
            let c1 = self.read();
            if !c1.is_ascii_hexdigit() {
                break;
            }
            self.skip();

            let c2 = self.read();
            if !c2.is_ascii_hexdigit() {
                break;
            }
            self.skip();
        }

        let span = self.span();
        let start = span.start();
        let end = span.end();
        // Skip the `0x` prefix when slicing the digits.
        let digit_start = start.to_u32() + 2;
        let span = SourceSpan::new(span.source_id(), start..end);
        parse_hex(span, self.slice_span(digit_start..end.to_u32()))
    }

    /// Lexes the digits of a `0b…` literal (prefix already consumed) and
    /// delegates value construction to `parse_bin`.
    fn lex_bin(&mut self) -> Result<Token<'input>, ParsingError> {
        debug_assert!(is_ascii_binary(self.read()));

        loop {
            let c1 = self.read();
            if !is_ascii_binary(c1) {
                break;
            }
            self.skip();
        }

        let span = self.span();
        let start = span.start();
        // Skip the `0b` prefix when slicing the digits.
        let digit_start = start.to_u32() + 2;
        let end = span.end();
        let span = SourceSpan::new(span.source_id(), start..end);
        let value = parse_bin(span, self.slice_span(digit_start..end.to_u32()))?;
        Ok(Token::BinValue(value))
    }
}
606
607impl<'input> Iterator for Lexer<'input> {
608 type Item = Lexed<'input>;
609
610 fn next(&mut self) -> Option<Self::Item> {
611 let mut res = self.lex();
612 while let Some(Ok((_, Token::Comment, _))) = res {
613 res = self.lex();
614 }
615 res
616 }
617}
618
/// Pads `hex` with a single leading `'0'` when it has an odd number of
/// digits, so the result can be consumed two digits (one byte) at a time.
///
/// Returns a borrowed slice in the common even-length case, avoiding an
/// allocation; only odd-length input allocates.
fn pad_hex_if_needed(hex: &str) -> Cow<'_, str> {
    if hex.len() % 2 == 0 {
        Cow::Borrowed(hex)
    } else {
        // Odd digit count: allocate once with room for the pad character.
        let mut padded = String::with_capacity(hex.len() + 1);
        padded.push('0');
        padded.push_str(hex);
        Cow::Owned(padded)
    }
}
633
/// Parses the digits of a `0x…` literal (with the `0x` prefix already
/// stripped) into either a single element value (up to 16 digits) or a
/// 4-element word (exactly 64 digits).
fn parse_hex<'input>(
    span: SourceSpan,
    hex_digits: &'input str,
) -> Result<Token<'input>, ParsingError> {
    use miden_core::{Felt, field::PrimeField64};

    // Ensure an even digit count so the input can be read byte-by-byte.
    let hex_digits = pad_hex_if_needed(hex_digits);

    match hex_digits.len() {
        // Up to 16 hex digits: a single value parsed as one u64.
        n if n <= 16 && n.is_multiple_of(2) => {
            let value = u64::from_str_radix(&hex_digits, 16).map_err(|error| {
                ParsingError::InvalidLiteral {
                    span,
                    kind: int_error_kind_to_literal_error_kind(
                        error.kind(),
                        LiteralErrorKind::FeltOverflow,
                    ),
                }
            })?;
            // The value must fit within the field order.
            if value >= Felt::ORDER_U64 {
                return Err(ParsingError::InvalidLiteral {
                    span,
                    kind: LiteralErrorKind::FeltOverflow,
                });
            }
            Ok(Token::HexValue(shrink_u64_hex(value)))
        },
        // Exactly 64 hex digits: a word of four field elements, where each
        // 16-digit group encodes a u64 in little-endian byte order.
        64 => {
            let mut word = [Felt::ZERO; 4];
            for (index, element) in word.iter_mut().enumerate() {
                let offset = index * 16;
                let mut felt_bytes = [0u8; 8];
                let digits = &hex_digits[offset..(offset + 16)];
                // Decode the group two hex digits (one byte) at a time.
                for (byte_idx, byte) in felt_bytes.iter_mut().enumerate() {
                    let byte_str = &digits[(byte_idx * 2)..((byte_idx * 2) + 2)];
                    *byte = u8::from_str_radix(byte_str, 16).map_err(|error| {
                        ParsingError::InvalidLiteral {
                            span,
                            kind: int_error_kind_to_literal_error_kind(
                                error.kind(),
                                LiteralErrorKind::FeltOverflow,
                            ),
                        }
                    })?;
                }
                let value = u64::from_le_bytes(felt_bytes);
                // Each element must fit within the field order.
                if value >= Felt::ORDER_U64 {
                    return Err(ParsingError::InvalidLiteral {
                        span,
                        kind: LiteralErrorKind::FeltOverflow,
                    });
                }
                *element = Felt::new(value);
            }
            Ok(Token::HexWord(WordValue(word)))
        },
        n if n > 64 => Err(ParsingError::InvalidHexLiteral { span, kind: HexErrorKind::TooLong }),
        // Lengths strictly between 16 and 64 (e.g. 18..=62) are invalid.
        _ => Err(ParsingError::InvalidHexLiteral { span, kind: HexErrorKind::Invalid }),
    }
}
698
699fn parse_bin(span: SourceSpan, bin_digits: &str) -> Result<BinEncodedValue, ParsingError> {
700 if bin_digits.len() <= 32 {
701 let value =
702 u32::from_str_radix(bin_digits, 2).map_err(|error| ParsingError::InvalidLiteral {
703 span,
704 kind: int_error_kind_to_literal_error_kind(
705 error.kind(),
706 LiteralErrorKind::U32Overflow,
707 ),
708 })?;
709 Ok(shrink_u32_bin(value))
710 } else {
711 Err(ParsingError::InvalidBinaryLiteral { span, kind: BinErrorKind::TooLong })
712 }
713}
714
/// Returns true if `c` is a binary digit (`'0'` or `'1'`).
#[inline(always)]
fn is_ascii_binary(c: char) -> bool {
    c == '0' || c == '1'
}
719
720#[inline]
721pub fn shrink_u64_hex(n: u64) -> IntValue {
722 if n <= (u8::MAX as u64) {
723 IntValue::U8(n as u8)
724 } else if n <= (u16::MAX as u64) {
725 IntValue::U16(n as u16)
726 } else if n <= (u32::MAX as u64) {
727 IntValue::U32(n as u32)
728 } else {
729 IntValue::Felt(Felt::new(n))
730 }
731}
732
733#[inline]
734fn shrink_u32_bin(n: u32) -> BinEncodedValue {
735 if n <= (u8::MAX as u32) {
736 BinEncodedValue::U8(n as u8)
737 } else if n <= (u16::MAX as u32) {
738 BinEncodedValue::U16(n as u16)
739 } else {
740 BinEncodedValue::U32(n)
741 }
742}
743
744#[inline]
745fn int_error_kind_to_literal_error_kind(
746 kind: &IntErrorKind,
747 overflow: LiteralErrorKind,
748) -> LiteralErrorKind {
749 match kind {
750 IntErrorKind::Empty => LiteralErrorKind::Empty,
751 IntErrorKind::InvalidDigit => LiteralErrorKind::InvalidDigit,
752 IntErrorKind::PosOverflow | IntErrorKind::NegOverflow => overflow,
753 _ => unreachable!(),
754 }
755}