use alloc::{
borrow::Cow,
format,
string::{String, ToString},
vec,
};
use core::{num::IntErrorKind, ops::Range};
use midenc_session::diagnostics::Span;
use super::{ParseResult, ParserError, Scanner, Token};
use crate::{
Felt,
diagnostics::{
ByteIndex, ByteOffset, LabeledSpan, Report, Severity, SourceId, SourceSpan,
miette::diagnostic,
},
};
/// The result of lexing a single token.
///
/// On success this is a `(start, token, end)` triple, where `start` and `end` are the `u32`
/// offsets reported by the lexer for the token; on failure it carries the [`ParserError`]
/// raised while tokenizing.
pub type Lexed<'input> = Result<(u32, Token<'input>, u32), ParserError>;
/// Consumes one character from `$lexer` by calling its `skip` method.
///
/// With a second argument, the invocation evaluates to `Ok($tok)` after the character has
/// been consumed; without it, the invocation evaluates to `()`.
macro_rules! pop {
    ($lexer:ident $(, $tok:expr)?) => {{
        $lexer.skip();
        $(Ok($tok))?
    }};
}
/// Consumes two characters from `$lexer` by calling its `skip` method twice.
///
/// With a second argument, the invocation evaluates to `Ok($tok)` after both characters have
/// been consumed; without it, the invocation evaluates to `()`.
macro_rules! pop2 {
    ($lexer:ident $(, $tok:expr)?) => {{
        for _ in 0..2 {
            $lexer.skip();
        }
        $(Ok($tok))?
    }};
}
/// Consumes three characters from `$lexer` by calling its `skip` method three times.
///
/// With a second argument, the invocation evaluates to `Ok($tok)` after all three characters
/// have been consumed; without it, the invocation evaluates to `()`.
macro_rules! pop3 {
    ($lexer:ident $(, $tok:expr)?) => {{
        for _ in 0..3 {
            $lexer.skip();
        }
        $(Ok($tok))?
    }};
}
/// A lexer that turns the characters produced by a [`Scanner`] into [`Token`]s.
///
/// The lexer always holds one already-tokenized token (`token`); each call to `lex` hands
/// that token out and tokenizes the next one.
pub struct Lexer<'input> {
/// Identifier of the source being lexed; attached to every span the lexer builds.
source_id: SourceId,
/// The character source the lexer reads from.
scanner: Scanner<'input>,
/// The most recently tokenized token, returned by the next call to `lex`.
token: Token<'input>,
/// Offset at which the current token starts.
token_start: usize,
/// Offset just past the last character consumed via `pop`.
token_end: usize,
/// Number of newline characters skipped so far (whitespace and comments).
line_num: usize,
/// Set once a `'\0'` character has been read from the scanner.
eof: bool,
/// Set once the final `Token::Eof` has been handed out by `lex`.
empty: bool,
/// An error raised while tokenizing ahead; reported by the next call to `lex`.
error: Option<ParserError>,
}
impl<'input> Lexer<'input> {
pub fn new(source_id: SourceId, scanner: Scanner<'input>) -> Self {
let start = scanner.start();
let mut lexer = Self {
source_id,
scanner,
token: Token::Eof,
token_start: start,
token_end: start,
line_num: 0,
eof: false,
empty: false,
error: None,
};
lexer.advance();
lexer
}
/// Returns the next lexed item, including comments.
///
/// A pending tokenization error is reported first. Once the end of input has been reached,
/// a single zero-width `Token::Eof` is produced, after which `None` is returned.
pub fn lex(&mut self) -> Option<<Self as Iterator>::Item> {
    if let Some(pending) = self.error.take() {
        return Some(Err(pending));
    }

    let at_end = self.eof && matches!(self.token, Token::Eof);
    if at_end {
        if self.empty {
            return None;
        }
        // Emit the end-of-file token exactly once.
        self.empty = true;
        let offset = self.token_end as u32;
        return Some(Ok((offset, Token::Eof, offset)));
    }

    // Capture the bounds of the current token before tokenizing the next one.
    let (start, end) = (self.token_start as u32, self.token_end as u32);
    let current = core::mem::replace(&mut self.token, Token::Eof);
    self.advance();
    Some(Ok((start, current, end)))
}
/// Skips leading whitespace, then tokenizes the next token, storing either the token or
/// the error it produced.
fn advance(&mut self) {
    self.advance_start();
    let lexed = self.tokenize();
    match lexed {
        Err(failure) => self.error = Some(failure),
        Ok(next) => self.token = next,
    }
}
fn advance_to(&mut self, pos: usize) {
self.scanner.advance_to(pos);
self.token_start = pos;
match self.tokenize() {
Ok(tok) => {
self.token = tok;
}
Err(err) => {
self.error = Some(err);
}
}
}
/// Skips whitespace and records where the next token starts.
///
/// Newlines are counted in `line_num`. If the end of input (`'\0'`) is reached, `eof` is set
/// and `token_start` is left untouched.
#[inline]
fn advance_start(&mut self) {
    loop {
        let (offset, ch) = self.scanner.read();
        match ch {
            '\0' => {
                self.eof = true;
                return;
            }
            '\n' => self.line_num += 1,
            other if other.is_whitespace() => {}
            _ => {
                self.token_start = offset;
                return;
            }
        }
        self.scanner.advance();
    }
}
/// Consumes the current character and returns it, moving `token_end` just past it.
#[inline]
fn pop(&mut self) -> char {
    let (offset, ch) = self.scanner.pop();
    let width = ch.len_utf8();
    self.token_end = offset + width;
    ch
}
/// Returns the character reported by the scanner's `peek`, without consuming anything.
#[inline]
fn peek(&mut self) -> char {
    self.scanner.peek().1
}
/// Returns the character reported by the scanner's `peek_next`, without consuming anything.
#[inline]
fn peek_next(&mut self) -> char {
    self.scanner.peek_next().1
}
/// Returns the character at the scanner's current position, without consuming it.
#[inline]
fn read(&mut self) -> char {
    self.scanner.read().1
}
/// Consumes the current character, discarding it.
#[inline]
fn skip(&mut self) {
    let _consumed = self.pop();
}
/// Returns the offset just past the last consumed character, as a [`ByteIndex`].
#[inline]
pub fn current_position(&self) -> ByteIndex {
    let end = self.token_end as u32;
    ByteIndex::new(end)
}
/// Builds the source span covering the current token.
///
/// # Panics
///
/// Panics if the token start lies past the token end, or if the token end does not fit in
/// a `u32`.
#[inline]
fn span(&self) -> SourceSpan {
    let (start, end) = (self.token_start, self.token_end);
    assert!(start <= end, "invalid range");
    assert!(end <= u32::MAX as usize, "file too large");
    let range = (start as u32)..(end as u32);
    SourceSpan::new(self.source_id, range)
}
#[inline]
fn slice_span(&self, span: impl Into<Range<u32>>) -> &'input str {
let range = span.into();
self.scanner.slice((range.start as usize)..(range.end as usize))
}
/// Returns the source text of the current token.
#[inline]
fn slice(&self) -> &'input str {
    let current = self.span();
    self.slice_span(current)
}
/// Consumes whitespace up to the next non-whitespace character, counting newlines in
/// `line_num`.
#[inline]
fn skip_whitespace(&mut self) {
    loop {
        match self.read() {
            '\n' => self.line_num += 1,
            ch if ch.is_whitespace() => {}
            _ => return,
        }
        self.skip();
    }
}
/// Tokenizes the token starting at the scanner's current position.
///
/// `//` starts a comment running to the end of the line and yields `Token::Comment`; `'\0'`
/// marks the end of input. Any character that cannot start a token produces
/// `ParserError::InvalidCharacter`.
fn tokenize(&mut self) -> Result<Token<'input>, ParserError> {
    // Comments, end of input and leading whitespace are dealt with up front.
    match self.read() {
        '/' => {
            if self.peek() != '/' {
                return pop!(self, Token::Slash);
            }
            self.skip();
            self.skip();
            self.skip_comment();
            return Ok(Token::Comment);
        }
        '\0' => {
            self.eof = true;
            return Ok(Token::Eof);
        }
        ch if ch.is_whitespace() => self.skip_whitespace(),
        _ => {}
    }

    let first = self.read();

    // Tokens that always consist of exactly one character.
    let single = match first {
        '$' => Some(Token::Dollar),
        ',' => Some(Token::Comma),
        ';' => Some(Token::Semicolon),
        '=' => Some(Token::Equal),
        '|' => Some(Token::Pipe),
        '<' => Some(Token::Langle),
        '>' => Some(Token::Rangle),
        '[' => Some(Token::Lbracket),
        ']' => Some(Token::Rbracket),
        '(' => Some(Token::Lparen),
        ')' => Some(Token::Rparen),
        '}' => Some(Token::Rbrace),
        '+' => Some(Token::Plus),
        '*' => Some(Token::Star),
        '/' => Some(Token::Slash),
        '?' => Some(Token::Question),
        _ => None,
    };
    if let Some(token) = single {
        return pop!(self, token);
    }

    match first {
        '@' => self.lex_at_identifier(),
        '!' | '^' | '%' => self.lex_prefixed_identifier(),
        '"' => self.lex_string(),
        ':' => {
            if self.peek() == ':' {
                pop2!(self, Token::ColonColon)
            } else {
                pop!(self, Token::Colon)
            }
        }
        '.' => {
            // Only a full `...` forms an ellipsis; `..` lexes as two separate dots.
            if self.peek() == '.' && self.peek_next() == '.' {
                pop3!(self, Token::Ellipsis)
            } else {
                pop!(self, Token::Dot)
            }
        }
        '#' => {
            // `#-}` closes file metadata; anything else is a `#`-prefixed identifier.
            if self.peek() == '-' && self.peek_next() == '}' {
                pop3!(self, Token::FileMetadataEnd)
            } else {
                self.lex_prefixed_identifier()
            }
        }
        '{' => {
            // `{-#` opens file metadata; anything else is a plain left brace.
            if self.peek() == '-' && self.peek_next() == '#' {
                pop3!(self, Token::FileMetadataStart)
            } else {
                pop!(self, Token::Lbrace)
            }
        }
        '-' => {
            if self.peek() == '>' {
                pop2!(self, Token::Rstab)
            } else {
                pop!(self, Token::Minus)
            }
        }
        '0' => match self.peek() {
            radix @ ('x' | 'b') => {
                // Consume the two-character radix prefix before lexing the digits.
                self.skip();
                self.skip();
                if radix == 'x' {
                    self.lex_hex()
                } else {
                    self.lex_bin()
                }
            }
            '0'..='9' => self.lex_decimal(),
            _ => pop!(self, Token::Int("0")),
        },
        '1'..='9' => self.lex_decimal(),
        'a'..='z' => self.lex_keyword_or_ident(),
        'A'..='Z' | '_' => self.lex_identifier(),
        invalid => Err(ParserError::InvalidCharacter {
            span: SourceSpan::at(self.source_id, self.token_start as u32),
            character: invalid,
        }),
    }
}
/// Consumes the remainder of a line comment, including the terminating newline.
///
/// Stops without consuming anything further if the end of input is reached first, in which
/// case `eof` is set.
fn skip_comment(&mut self) {
    loop {
        match self.read() {
            '\n' => {
                self.skip();
                self.line_num += 1;
                return;
            }
            '\0' => {
                self.eof = true;
                return;
            }
            _ => self.skip(),
        }
    }
}
fn lex_at_identifier(&mut self) -> Result<Token<'input>, ParserError> {
let c = self.pop();
debug_assert_eq!(c, '@');
self.token_start += c.len_utf8();
if self.read() == '"' {
let Token::String(s) = self.lex_string()? else {
unreachable!()
};
Ok(Token::AtIdent(s))
} else {
let Token::BareIdent(s) = self.lex_identifier()? else {
unreachable!()
};
Ok(Token::AtIdent(s))
}
}
/// Lexes an identifier introduced by one of the prefixes `#`, `%`, `^` or `!`.
///
/// If the name starts with a digit, it consists of ASCII digits only. Otherwise it consists
/// of ASCII alphanumerics and the characters `_`, `-`, `$` and `.`. The prefix is excluded
/// from the token text.
///
/// # Errors
///
/// Returns `ParserError::UnexpectedToken` if the prefix is not followed by a valid
/// identifier character.
fn lex_prefixed_identifier(&mut self) -> Result<Token<'input>, ParserError> {
    let is_name_char =
        |ch: char| ch.is_ascii_alphanumeric() || matches!(ch, '_' | '-' | '$' | '.');

    let prefix = self.pop();
    debug_assert!(matches!(prefix, '#' | '%' | '^' | '!'), "got '{prefix}'");

    let first = self.read();
    let numeric = first.is_ascii_digit();
    if !numeric && !is_name_char(first) {
        return Err(ParserError::UnexpectedToken {
            span: self.span(),
            token: format!("'{first}'"),
            expected: Some("expected identifier".to_string()),
        });
    }

    // Exclude the prefix character from the token text.
    self.token_start += prefix.len_utf8();
    if numeric {
        while self.read().is_ascii_digit() {
            self.skip();
        }
    } else {
        while is_name_char(self.read()) {
            self.skip();
        }
    }

    let id = self.slice();
    let token = match prefix {
        '#' => Token::HashIdent(id),
        '%' => Token::PercentIdent(id),
        '^' => Token::CaretIdent(id),
        '!' => Token::BangIdent(id),
        _ => unreachable!(),
    };
    Ok(token)
}
/// Lexes a word starting with a lowercase ASCII letter and classifies it via
/// `Token::from_keyword_or_ident`.
///
/// After the first character, the word may contain ASCII alphanumerics and the characters
/// `_`, `-`, `$` and `.`.
fn lex_keyword_or_ident(&mut self) -> Result<Token<'input>, ParserError> {
    let continues =
        |ch: char| ch.is_ascii_alphanumeric() || matches!(ch, '_' | '-' | '$' | '.');

    let c = self.pop();
    debug_assert!(c.is_ascii_alphabetic() && c.is_lowercase());
    while continues(self.read()) {
        self.skip();
    }
    Ok(Token::from_keyword_or_ident(self.slice()))
}
/// Lexes a double-quoted string, returning its raw contents (escape sequences are not
/// decoded) without the surrounding quotes.
///
/// A backslash escapes a following `"`, newline or backslash, so none of those terminates
/// the string or starts a new escape. In particular, a string ending in an escaped
/// backslash (`"...\\"`) is closed by its final quote.
///
/// # Errors
///
/// Returns `ParserError::UnclosedQuote` if an unescaped newline or the end of input is
/// reached before the closing quote.
fn lex_string(&mut self) -> Result<Token<'input>, ParserError> {
    // Consume the opening quote and exclude it from the token text.
    self.skip();
    self.token_start += '"'.len_utf8();
    loop {
        match self.read() {
            '\0' | '\n' => {
                break Err(ParserError::UnclosedQuote {
                    span: SourceSpan::at(self.source_id, self.span().start()),
                });
            }
            '\\' => {
                self.skip();
                // Consume the escaped character together with the backslash; any other
                // character is handled by the next iteration.
                if matches!(self.read(), '"' | '\n' | '\\') {
                    self.skip();
                }
            }
            '"' => {
                // Capture the span before consuming the closing quote so that it is not
                // part of the string contents.
                let span = self.span();
                self.skip();
                break Ok(Token::String(self.slice_span(span)));
            }
            _ => self.skip(),
        }
    }
}
/// Lexes a bare identifier.
///
/// The first character is consumed unconditionally (it is expected to be an ASCII letter or
/// `_`); the rest may contain ASCII alphanumerics and the characters `_`, `$` and `.`.
fn lex_identifier(&mut self) -> Result<Token<'input>, ParserError> {
    let continues = |ch: char| ch.is_ascii_alphanumeric() || matches!(ch, '_' | '$' | '.');

    let c = self.pop();
    debug_assert!(c.is_ascii_alphabetic() || c == '_');
    while continues(self.read()) {
        self.skip();
    }
    let name = self.slice();
    Ok(Token::BareIdent(name))
}
/// Lexes a run of ASCII decimal digits into `Token::Int`, carrying the digits as text.
fn lex_decimal(&mut self) -> Result<Token<'input>, ParserError> {
    let first = self.read();
    debug_assert!(first.is_ascii_digit());
    while self.read().is_ascii_digit() {
        self.skip();
    }
    Ok(Token::Int(self.slice()))
}
fn lex_hex(&mut self) -> Result<Token<'input>, ParserError> {
debug_assert!(self.read().is_ascii_hexdigit());
loop {
let c1 = self.read();
if !c1.is_ascii_hexdigit() {
break;
}
self.skip();
let c2 = self.read();
if !c2.is_ascii_hexdigit() {
break;
}
self.skip();
}
let span = self.span();
let start = span.start();
let end = span.end();
let digit_start = start.to_u32() + 2;
let span = SourceSpan::new(span.source_id(), start..end);
Ok(Token::Hex(self.slice_span(digit_start..end.to_u32())))
}
fn lex_bin(&mut self) -> Result<Token<'input>, ParserError> {
debug_assert!(is_ascii_binary(self.read()));
loop {
let c1 = self.read();
if !is_ascii_binary(c1) {
break;
}
self.skip();
}
let span = self.span();
let start = span.start();
let digit_start = start.to_u32() + 2;
let end = span.end();
let span = SourceSpan::new(span.source_id(), start..end);
Ok(Token::Binary(self.slice_span(digit_start..end.to_u32())))
}
}
/// A token stream with one token of lookahead on top of a [`Lexer`].
///
/// Tokens are pulled through the lexer's `Iterator` implementation, which filters out
/// comment tokens.
pub struct TokenStream<'input> {
/// The underlying lexer.
lexer: Lexer<'input>,
/// The next item to be handed out, or `None` once the lexer is exhausted.
upcoming: Option<Lexed<'input>>,
/// The lexer's position as captured immediately before `upcoming` was fetched.
current_position: ByteIndex,
}
impl<'input> TokenStream<'input> {
/// Creates a token stream over `scanner` and fetches the first lookahead item.
pub fn new(source_id: SourceId, scanner: Scanner<'input>) -> Self {
    let mut lexer = Lexer::new(source_id, scanner);
    // Fields are initialized in written order: the position is captured before the first
    // item is pulled from the lexer.
    Self {
        current_position: lexer.current_position(),
        upcoming: lexer.next(),
        lexer,
    }
}
/// Repositions the underlying lexer at `pos` and refreshes the lookahead from there.
pub fn reset_to(&mut self, pos: usize) {
    self.lexer.advance_to(pos);
    // Capture the lexer position, then fetch the new lookahead item.
    self.advance();
}
/// Returns the identifier of the source being lexed by the underlying lexer.
#[inline]
pub const fn source_id(&self) -> SourceId {
self.lexer.source_id
}
/// Returns the source text from the stream's current position onwards.
#[inline]
pub fn remaining_source(&self) -> &str {
    let offset = self.current_position.to_usize();
    self.lexer.scanner.slice_from(offset)
}
/// Returns the stream's current position, i.e. the lexer position captured the last time
/// the lookahead item was fetched.
#[inline]
pub const fn current_position(&self) -> ByteIndex {
self.current_position
}
/// Returns a span located at the stream's current position.
#[inline]
pub fn current_span(&self) -> SourceSpan {
    let position = self.current_position;
    SourceSpan::at(self.lexer.source_id, position)
}
/// Returns the lookahead token without consuming it, or `Ok(None)` if the stream is
/// exhausted.
///
/// # Errors
///
/// If the lookahead item is an error, it is removed from the stream and returned.
pub fn peek(&mut self) -> ParseResult<Option<(u32, Token<'input>, u32)>> {
    // Taking the item lets a pending error be returned by value; a successful item is put
    // straight back.
    let lexed = self.upcoming.take().transpose()?;
    self.upcoming = lexed.map(Ok);
    Ok(lexed)
}
/// Returns `true` if there is a successfully lexed lookahead token and `predicate` accepts
/// it. Nothing is consumed.
pub fn is_next<F>(&mut self, predicate: F) -> bool
where
    F: Fn(Token<'input>) -> bool,
{
    matches!(&self.upcoming, Some(Ok((_, tok, _))) if predicate(*tok))
}
/// Consumes and returns the lookahead token with its span, or `Ok(None)` if the stream is
/// exhausted.
///
/// # Errors
///
/// Returns the lexer error if the lookahead item is an error; the stream still moves on to
/// the following item.
#[allow(clippy::should_implement_trait)]
pub fn next(&mut self) -> ParseResult<Option<Span<Token<'input>>>> {
    let Some(lexed) = self.upcoming.take() else {
        return Ok(None);
    };
    self.advance();
    let source_id = self.source_id();
    let (start, tok, end) = lexed?;
    Ok(Some(spanned!(source_id, start, end, tok)))
}
/// Consumes and returns the lookahead token if `predicate` accepts it; otherwise leaves
/// the stream untouched and returns `Ok(None)`.
pub fn next_if<F>(&mut self, predicate: F) -> ParseResult<Option<Span<Token<'input>>>>
where
    F: Fn(Token<'input>) -> bool,
{
    if !self.is_next(predicate) {
        return Ok(None);
    }
    let source_id = self.source_id();
    let accepted = match self.upcoming.take() {
        Some(Ok((start, tok, end))) => Ok(Some(spanned!(source_id, start, end, tok))),
        Some(Err(err)) => Err(err),
        None => Ok(None),
    };
    self.advance();
    accepted
}
/// Applies `mapper` to the lookahead token; if it yields a value, the token is consumed and
/// the value is returned with the token's span. Otherwise the token stays in the stream
/// and `Ok(None)` is returned.
///
/// # Errors
///
/// If the lookahead item is an error, it is removed from the stream and returned.
pub fn next_if_map<F, T>(&mut self, mapper: F) -> ParseResult<Option<Span<T>>>
where
    F: Fn(Token<'input>) -> Option<T>,
{
    let (start, tok, end) = match self.upcoming.take() {
        Some(Ok(lexed)) => lexed,
        Some(Err(err)) => return Err(err),
        None => return Ok(None),
    };
    match mapper(tok) {
        Some(out) => {
            let source_id = self.source_id();
            let mapped = spanned!(source_id, start, end, out);
            self.advance();
            Ok(Some(mapped))
        }
        None => {
            // Not accepted: put the token back untouched.
            self.upcoming = Some(Ok((start, tok, end)));
            Ok(None)
        }
    }
}
/// Consumes the lookahead token if it equals `expected`, reporting whether it did.
pub fn next_if_eq(&mut self, expected: Token<'_>) -> ParseResult<bool> {
    let consumed = self.next_if(|tok| tok == expected)?;
    Ok(consumed.is_some())
}
/// Consumes the next token and checks that it equals `expected`.
///
/// # Errors
///
/// Returns `ParserError::UnexpectedToken` (located at the start of the offending token) if
/// a different token is found, `ParserError::UnexpectedEof` if the stream is exhausted, or
/// the lexer error if the next item is an error.
pub fn expect(&mut self, expected: Token<'_>) -> ParseResult {
    let Some(spanned) = self.next()? else {
        return Err(ParserError::UnexpectedEof {
            expected: vec![expected.to_string()],
        });
    };
    if spanned == expected {
        return Ok(());
    }
    Err(ParserError::UnexpectedToken {
        token: spanned.to_string(),
        span: span!(spanned.span().source_id(), spanned.span().start()),
        expected: Some(expected.to_string()),
    })
}
/// Consumes the next token and returns it if `predicate` accepts it.
///
/// `expected` describes what was expected and is used in error messages only.
///
/// # Errors
///
/// Returns `ParserError::UnexpectedToken` if the token is rejected,
/// `ParserError::UnexpectedEof` if the stream is exhausted, or the lexer error if the next
/// item is an error.
pub fn expect_if<F>(&mut self, expected: &str, predicate: F) -> ParseResult<Span<Token<'input>>>
where
    F: Fn(Token<'input>) -> bool,
{
    let Some(spanned) = self.next()? else {
        return Err(ParserError::UnexpectedEof {
            expected: vec![expected.to_string()],
        });
    };
    if predicate(spanned.into_inner()) {
        return Ok(spanned);
    }
    Err(ParserError::UnexpectedToken {
        span: spanned.span(),
        token: spanned.to_string(),
        expected: Some(expected.to_string()),
    })
}
/// Consumes the next token and converts it with `mapper`, returning the converted value
/// with the token's span.
///
/// `expected` describes what was expected and is used in error messages only.
///
/// # Errors
///
/// Returns `ParserError::UnexpectedToken` if `mapper` yields `None`,
/// `ParserError::UnexpectedEof` if the stream is exhausted, or the lexer error if the next
/// item is an error.
pub fn expect_map<F, T>(&mut self, expected: &str, mapper: F) -> ParseResult<Span<T>>
where
    F: Fn(Token<'input>) -> Option<T>,
{
    let Some(spanned) = self.next()? else {
        return Err(ParserError::UnexpectedEof {
            expected: vec![expected.to_string()],
        });
    };
    let (span, tok) = spanned.into_parts();
    match mapper(tok) {
        Some(out) => Ok(Span::new(span, out)),
        None => Err(ParserError::UnexpectedToken {
            span,
            token: tok.to_string(),
            expected: Some(expected.to_string()),
        }),
    }
}
/// Records the lexer's position, then pulls the next item from the lexer into the
/// lookahead slot.
#[inline]
fn advance(&mut self) {
    // The position must be read before the lexer moves on.
    let position = self.lexer.current_position();
    self.upcoming = self.lexer.next();
    self.current_position = position;
}
}
/// Iterates over the raw lexed items (tokens with their offsets, or errors).
impl<'input> Iterator for TokenStream<'input> {
    type Item = Lexed<'input>;

    /// Hands out the lookahead item and refills the lookahead; yields `None` once the
    /// lookahead is empty.
    fn next(&mut self) -> Option<Self::Item> {
        let lexed = self.upcoming.take()?;
        self.advance();
        Some(lexed)
    }
}
/// Iterates over the lexed items, silently dropping comment tokens.
impl<'input> Iterator for Lexer<'input> {
    type Item = Lexed<'input>;

    /// Returns the next item produced by `lex` that is not a `Token::Comment`.
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            match self.lex() {
                Some(Ok((_, Token::Comment, _))) => continue,
                other => return other,
            }
        }
    }
}
/// Returns `true` if `c` is one of the binary digits `0` or `1`.
#[inline(always)]
fn is_ascii_binary(c: char) -> bool {
    c == '0' || c == '1'
}