#[cfg(test)]
mod tests;
use core::{fmt, mem, num::IntErrorKind};
use miden_diagnostics::{Diagnostic, SourceIndex, SourceSpan, ToDiagnostic};
use miden_parsing::{Scanner, Source};
use crate::{Symbol, parser::ParseError};
pub type Lexed = Result<(SourceIndex, Token, SourceIndex), ParseError>;
/// Errors that may occur while tokenizing the input source.
///
/// Equality intentionally ignores source locations (see the `PartialEq` impl),
/// so two errors with the same cause compare equal regardless of where they
/// occurred.
#[derive(Clone, Debug, thiserror::Error)]
pub enum LexicalError {
    /// An integer literal could not be parsed (e.g. it overflowed `u64`).
    #[error("invalid integer value: {}", DisplayIntErrorKind(reason))]
    InvalidInt {
        // Location of the offending literal in the source.
        span: SourceSpan,
        // The underlying parse failure reported by the standard library.
        reason: IntErrorKind,
    },
    /// A character that cannot start any token was encountered.
    #[error("encountered unexpected character '{found}'")]
    UnexpectedCharacter { start: SourceIndex, found: char },
}
impl PartialEq for LexicalError {
fn eq(&self, other: &Self) -> bool {
match (self, other) {
(Self::InvalidInt { reason: lhs, .. }, Self::InvalidInt { reason: rhs, .. }) => {
lhs == rhs
}
(
Self::UnexpectedCharacter { found: lhs, .. },
Self::UnexpectedCharacter { found: rhs, .. },
) => lhs == rhs,
_ => false,
}
}
}
impl ToDiagnostic for LexicalError {
fn to_diagnostic(self) -> Diagnostic {
use miden_diagnostics::Label;
match self {
Self::InvalidInt { span, ref reason } => Diagnostic::error()
.with_message("invalid integer literal")
.with_labels(vec![
Label::primary(span.source_id(), span)
.with_message(format!("{}", DisplayIntErrorKind(reason))),
]),
Self::UnexpectedCharacter { start, .. } => Diagnostic::error()
.with_message("unexpected character")
.with_labels(vec![Label::primary(
start.source_id(),
SourceSpan::new(start, start),
)]),
}
}
}
/// Adapter that renders a [`IntErrorKind`] as a human-readable message for
/// use in error displays and diagnostic labels.
struct DisplayIntErrorKind<'a>(&'a IntErrorKind);
impl fmt::Display for DisplayIntErrorKind<'_> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self.0 {
            IntErrorKind::Empty => write!(f, "unable to parse empty string as integer"),
            IntErrorKind::InvalidDigit => write!(f, "invalid digit"),
            IntErrorKind::PosOverflow => write!(f, "value is too big"),
            // Fix: negative overflow previously reused the positive-overflow
            // message "value is too big", which is misleading for underflow.
            IntErrorKind::NegOverflow => write!(f, "value is too small"),
            IntErrorKind::Zero => write!(f, "zero is not a valid value here"),
            // `IntErrorKind` is #[non_exhaustive]; fall back to Debug output
            // for any variant added in a future standard-library release.
            other => write!(f, "unable to parse integer value: {other:?}"),
        }
    }
}
/// The set of tokens produced by the lexer and consumed by the parser.
#[derive(Debug, Clone)]
pub enum Token {
    /// End of the input stream.
    Eof,
    /// A lexical error encountered while tokenizing; surfaced to the parser
    /// as an `Err` by `Lexer::lex`.
    Error(LexicalError),
    /// A `#`-comment; filtered out by the `Iterator` impl and never reaches
    /// the parser.
    Comment,
    /// An identifier (interned).
    Ident(Symbol),
    /// A `$`-prefixed identifier reference; the `$` is included in the symbol.
    DeclIdentRef(Symbol),
    /// An identifier that is immediately followed by `(`, i.e. call position.
    FunctionIdent(Symbol),
    /// An unsigned integer literal.
    Num(u64),
    // --- Keywords (see `Token::from_keyword_or_ident`) ---
    Def,
    Mod,
    Use,
    Let,
    Const,
    TraceColumns,
    Main,
    PublicInputs,
    PeriodicColumns,
    Ev,
    Fn,
    Buses,
    Multiset,
    Logup,
    Null,
    Unconstrained,
    Insert,
    Remove,
    BoundaryConstraints,
    First,
    Last,
    IntegrityConstraints,
    For,
    In,
    Enf,
    Return,
    Match,
    Case,
    When,
    Felt,
    With,
    // --- Punctuation and operators ---
    Quote,
    Colon,
    ColonColon,
    Comma,
    Dot,
    DotDot,
    LParen,
    RParen,
    LBracket,
    RBracket,
    LBrace,
    RBrace,
    Equal,
    Plus,
    Minus,
    Star,
    Caret,
    Ampersand,
    Bar,
    Bang,
    Arrow,
    SemiColon,
}
impl Token {
    /// Resolve `s` to its reserved-word token, or intern it as a plain
    /// identifier when it is not a keyword.
    pub fn from_keyword_or_ident(s: &str) -> Self {
        Self::keyword(s).unwrap_or_else(|| Self::Ident(Symbol::intern(s)))
    }
    /// Return the keyword token for `word`, or `None` if it is not reserved.
    fn keyword(word: &str) -> Option<Self> {
        let token = match word {
            "def" => Self::Def,
            "mod" => Self::Mod,
            "use" => Self::Use,
            "let" => Self::Let,
            "const" => Self::Const,
            "trace_columns" => Self::TraceColumns,
            "main" => Self::Main,
            "public_inputs" => Self::PublicInputs,
            "periodic_columns" => Self::PeriodicColumns,
            "ev" => Self::Ev,
            "fn" => Self::Fn,
            "felt" => Self::Felt,
            "buses" => Self::Buses,
            "multiset" => Self::Multiset,
            "logup" => Self::Logup,
            "null" => Self::Null,
            "unconstrained" => Self::Unconstrained,
            "insert" => Self::Insert,
            "remove" => Self::Remove,
            "boundary_constraints" => Self::BoundaryConstraints,
            "integrity_constraints" => Self::IntegrityConstraints,
            "first" => Self::First,
            "last" => Self::Last,
            "for" => Self::For,
            "in" => Self::In,
            "enf" => Self::Enf,
            "return" => Self::Return,
            "match" => Self::Match,
            "case" => Self::Case,
            "when" => Self::When,
            "with" => Self::With,
            _ => return None,
        };
        Some(token)
    }
}
impl Eq for Token {}
impl PartialEq for Token {
    /// Tokens compare by payload where they carry one (`Num`, `Ident`,
    /// `DeclIdentRef`, `FunctionIdent`), by variant alone otherwise. Any two
    /// `Error` tokens are considered equal regardless of the underlying error.
    fn eq(&self, other: &Token) -> bool {
        match (self, other) {
            (Self::Num(a), Self::Num(b)) => a == b,
            (Self::Error(_), Self::Error(_)) => true,
            (Self::Ident(a), Self::Ident(b)) => a == b,
            (Self::DeclIdentRef(a), Self::DeclIdentRef(b)) => a == b,
            (Self::FunctionIdent(a), Self::FunctionIdent(b)) => a == b,
            // All payload-carrying pairs are handled above, so discriminant
            // equality correctly decides every remaining combination.
            (lhs, rhs) => mem::discriminant(lhs) == mem::discriminant(rhs),
        }
    }
}
impl fmt::Display for Token {
    /// Render the token as it would appear in source (keywords/punctuation),
    /// or as an uppercase marker for synthetic tokens (`EOF`, `ERROR`,
    /// `COMMENT`).
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Payload-carrying variants format their payload; everything else
        // maps to a fixed string written out once at the end.
        let text = match self {
            Self::Ident(id) | Self::DeclIdentRef(id) | Self::FunctionIdent(id) => {
                return write!(f, "{id}");
            }
            Self::Num(value) => return write!(f, "{value}"),
            Self::Eof => "EOF",
            Self::Error(_) => "ERROR",
            Self::Comment => "COMMENT",
            Self::Def => "def",
            Self::Mod => "mod",
            Self::Use => "use",
            Self::Let => "let",
            Self::Const => "const",
            Self::TraceColumns => "trace_columns",
            Self::Main => "main",
            Self::PublicInputs => "public_inputs",
            Self::PeriodicColumns => "periodic_columns",
            Self::Ev => "ev",
            Self::Fn => "fn",
            Self::Felt => "felt",
            Self::Buses => "buses",
            Self::Multiset => "multiset",
            Self::Logup => "logup",
            Self::Null => "null",
            Self::Unconstrained => "unconstrained",
            Self::Insert => "insert",
            Self::Remove => "remove",
            Self::BoundaryConstraints => "boundary_constraints",
            Self::First => "first",
            Self::Last => "last",
            Self::IntegrityConstraints => "integrity_constraints",
            Self::For => "for",
            Self::In => "in",
            Self::Enf => "enf",
            Self::Return => "return",
            Self::Match => "match",
            Self::Case => "case",
            Self::When => "when",
            Self::With => "with",
            Self::Quote => "'",
            Self::Colon => ":",
            Self::ColonColon => "::",
            Self::Comma => ",",
            Self::Dot => ".",
            Self::DotDot => "..",
            Self::LParen => "(",
            Self::RParen => ")",
            Self::LBracket => "[",
            Self::RBracket => "]",
            Self::LBrace => "{",
            Self::RBrace => "}",
            Self::Equal => "=",
            Self::Plus => "+",
            Self::Minus => "-",
            Self::Star => "*",
            Self::Caret => "^",
            Self::Ampersand => "&",
            Self::Bar => "|",
            Self::Bang => "!",
            Self::Arrow => "->",
            Self::SemiColon => ";",
        };
        f.write_str(text)
    }
}
/// Consume the current character from the lexer, optionally evaluating to the
/// given expression (typically the token to produce for a one-character
/// lexeme).
macro_rules! pop {
    ($lex:ident) => {{
        $lex.skip();
    }};
    ($lex:ident, $code:expr) => {{
        $lex.skip();
        $code
    }};
}
/// Consume the next two characters from the lexer, optionally evaluating to
/// the given expression (typically the token for a two-character lexeme such
/// as `::` or `->`).
macro_rules! pop2 {
    ($lex:ident) => {{
        $lex.skip();
        $lex.skip();
    }};
    ($lex:ident, $code:expr) => {{
        $lex.skip();
        $lex.skip();
        $code
    }};
}
/// The lexer: wraps a character [`Scanner`] and produces [`Token`]s, keeping
/// one token of lookahead buffered at all times.
pub struct Lexer<S> {
    // The underlying character scanner over the source.
    scanner: Scanner<S>,
    // The buffered lookahead token, produced by `tokenize`.
    token: Token,
    // Source index where the buffered token starts.
    token_start: SourceIndex,
    // Source index just past the end of the buffered token.
    token_end: SourceIndex,
    // Set once the scanner reports end of input ('\0').
    eof: bool,
}
impl<S> Lexer<S>
where
    S: Source,
{
    /// Construct a new lexer over `scanner`, priming it so the first token is
    /// already buffered when `lex` is first called.
    pub fn new(scanner: Scanner<S>) -> Self {
        use miden_diagnostics::ByteOffset;
        let start = scanner.start();
        let mut lexer = Lexer {
            scanner,
            token: Token::Eof,
            token_start: start + ByteOffset(0),
            token_end: start + ByteOffset(0),
            eof: false,
        };
        // Produce the first token into the lookahead buffer.
        lexer.advance();
        lexer
    }
    /// Hand out the buffered token with its span and advance to the next one.
    ///
    /// Returns `None` once end of input has been reached and the trailing
    /// `Eof` token has already been emitted. `Error` tokens are converted to
    /// `Err` so the parser can report them as `ParseError`s.
    pub fn lex(&mut self) -> Option<<Self as Iterator>::Item> {
        if self.eof && self.token == Token::Eof {
            return None;
        }
        // Take the buffered token, leaving `Eof` in its place, and capture
        // its span before `advance` overwrites the bookkeeping fields.
        let token = std::mem::replace(&mut self.token, Token::Eof);
        let start = self.token_start;
        let end = self.token_end;
        self.advance();
        match token {
            Token::Error(err) => Some(Err(err.into())),
            token => Some(Ok((start, token, end))),
        }
    }
    /// Skip leading whitespace, then tokenize the next lexeme into the
    /// lookahead buffer.
    fn advance(&mut self) {
        self.advance_start();
        self.token = self.tokenize();
    }
    /// Skip whitespace and set `token_start` to the first non-whitespace
    /// character. Sets `eof` (without updating `token_start`) when end of
    /// input is reached.
    #[inline]
    fn advance_start(&mut self) {
        let mut position: SourceIndex;
        loop {
            let (pos, c) = self.scanner.read();
            position = pos;
            if c == '\0' {
                self.eof = true;
                return;
            }
            if c.is_whitespace() {
                self.scanner.advance();
                continue;
            }
            break;
        }
        self.token_start = position;
    }
    /// Consume and return the current character, extending `token_end` past it.
    #[inline]
    fn pop(&mut self) -> char {
        use miden_diagnostics::ByteOffset;
        let (pos, c) = self.scanner.pop();
        self.token_end = pos + ByteOffset::from_char_len(c);
        c
    }
    /// Peek at the character *after* the current one without consuming.
    #[inline]
    fn peek(&mut self) -> char {
        let (_, c) = self.scanner.peek();
        c
    }
    /// Peek at the current character without consuming it.
    #[inline]
    fn read(&mut self) -> char {
        let (_, c) = self.scanner.read();
        c
    }
    /// Consume the current character, discarding it (still advances
    /// `token_end` via `pop`).
    #[inline]
    fn skip(&mut self) {
        self.pop();
    }
    /// The span of the token currently being lexed.
    #[inline]
    fn span(&self) -> SourceSpan {
        SourceSpan::new(self.token_start, self.token_end)
    }
    /// The source text covered by `span()`.
    #[inline]
    fn slice(&self) -> &str {
        self.scanner.slice(self.span())
    }
    /// Consume characters until the current character is not whitespace.
    #[inline]
    fn skip_whitespace(&mut self) {
        let mut c: char;
        loop {
            c = self.read();
            if !c.is_whitespace() {
                break;
            }
            self.skip();
        }
    }
    /// Produce the next token starting at the current character.
    fn tokenize(&mut self) -> Token {
        let c = self.read();
        if c == '#' {
            self.skip();
            return self.lex_comment();
        }
        if c == '\0' {
            self.eof = true;
            return Token::Eof;
        }
        // NOTE(review): `advance_start` has already skipped whitespace before
        // this runs, so this branch looks unreachable; if it did run on input
        // ending in whitespace, the match below would see '\0' and misreport
        // it as an unexpected character rather than Eof — confirm.
        if c.is_whitespace() {
            self.skip_whitespace();
        }
        match self.read() {
            ',' => pop!(self, Token::Comma),
            '.' => match self.peek() {
                '.' => pop2!(self, Token::DotDot),
                _ => pop!(self, Token::Dot),
            },
            ':' => match self.peek() {
                ':' => pop2!(self, Token::ColonColon),
                _ => pop!(self, Token::Colon),
            },
            '\'' => pop!(self, Token::Quote),
            '(' => pop!(self, Token::LParen),
            ')' => pop!(self, Token::RParen),
            '[' => pop!(self, Token::LBracket),
            ']' => pop!(self, Token::RBracket),
            '{' => pop!(self, Token::LBrace),
            '}' => pop!(self, Token::RBrace),
            '=' => pop!(self, Token::Equal),
            '+' => pop!(self, Token::Plus),
            '-' => match self.peek() {
                '>' => pop2!(self, Token::Arrow),
                _ => pop!(self, Token::Minus),
            },
            '*' => pop!(self, Token::Star),
            '^' => pop!(self, Token::Caret),
            '&' => pop!(self, Token::Ampersand),
            '|' => pop!(self, Token::Bar),
            '!' => pop!(self, Token::Bang),
            ';' => pop!(self, Token::SemiColon),
            '$' => self.lex_special_identifier(),
            '0'..='9' => self.lex_number(),
            'a'..='z' => self.lex_keyword_or_ident(),
            'A'..='Z' => self.lex_identifier(),
            c => Token::Error(LexicalError::UnexpectedCharacter {
                start: self.span().start(),
                found: c,
            }),
        }
    }
    /// Consume a `#`-comment up to (but not including) the trailing newline
    /// or end of input. Comment tokens are later filtered by the `Iterator`
    /// impl.
    fn lex_comment(&mut self) -> Token {
        let mut c;
        loop {
            c = self.read();
            if c == '\n' {
                break;
            }
            if c == '\0' {
                self.eof = true;
                break;
            }
            self.skip();
        }
        Token::Comment
    }
    /// Lex a `$`-prefixed identifier reference. The character after `$` must
    /// be ASCII-alphabetic; the produced symbol includes the leading `$`.
    #[inline]
    fn lex_special_identifier(&mut self) -> Token {
        let c = self.pop();
        debug_assert!(c == '$');
        match self.read() {
            c if c.is_ascii_alphabetic() => (),
            c => {
                return Token::Error(LexicalError::UnexpectedCharacter {
                    start: self.span().start(),
                    found: c,
                });
            }
        }
        self.skip_ident();
        Token::DeclIdentRef(Symbol::intern(self.slice()))
    }
    /// Lex a lowercase-initial word: either a keyword, or an identifier. A
    /// non-keyword identifier immediately followed by `(` becomes a
    /// `FunctionIdent`.
    #[inline]
    fn lex_keyword_or_ident(&mut self) -> Token {
        let c = self.pop();
        debug_assert!(c.is_ascii_alphabetic() && c.is_lowercase());
        self.skip_ident();
        let next = self.read();
        match Token::from_keyword_or_ident(self.slice()) {
            Token::Ident(id) if next == '(' => Token::FunctionIdent(id),
            token => token,
        }
    }
    /// Lex an uppercase-initial identifier; never a keyword. As above, a
    /// trailing `(` turns it into a `FunctionIdent`.
    #[inline]
    fn lex_identifier(&mut self) -> Token {
        let c = self.pop();
        debug_assert!(c.is_ascii_alphabetic());
        self.skip_ident();
        if self.read() == '(' {
            Token::FunctionIdent(Symbol::intern(self.slice()))
        } else {
            Token::Ident(Symbol::intern(self.slice()))
        }
    }
    /// Consume the tail of an identifier: underscores, ASCII digits, and
    /// ASCII letters.
    fn skip_ident(&mut self) {
        loop {
            match self.read() {
                '_' => self.skip(),
                '0'..='9' => self.skip(),
                c if c.is_ascii_alphabetic() => self.skip(),
                _ => break,
            }
        }
    }
    /// Lex a run of ASCII digits as a `u64` literal; parse failures (e.g.
    /// overflow) become `LexicalError::InvalidInt` carrying the literal's
    /// span.
    #[inline]
    fn lex_number(&mut self) -> Token {
        let mut num = String::new();
        debug_assert!(self.read().is_ascii_digit());
        while let '0'..='9' = self.read() {
            num.push(self.pop());
        }
        match num.parse::<u64>() {
            Ok(i) => Token::Num(i),
            Err(err) => Token::Error(LexicalError::InvalidInt {
                span: self.span(),
                reason: err.kind().clone(),
            }),
        }
    }
}
impl<S> Iterator for Lexer<S>
where
    S: Source,
{
    type Item = Lexed;
    /// Drive the lexer, transparently dropping `Comment` tokens so the
    /// parser never sees them.
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            match self.lex() {
                Some(Ok((_, Token::Comment, _))) => continue,
                other => return other,
            }
        }
    }
}