use crate::front::span::{Error, Pos, Span};
use crate::front::token::{Keyword, Token, TokenKind};
use crate::ir::BinaryOp;
use crate::{log_raw_fatal_error, return_error};
use std::io::Read;
pub struct Lexer<T: Read> {
reader: T,
pos: Pos,
last_char: Option<char>,
}
/// Outcome of lexing a single token: the [`Token`] on success, otherwise the
/// front end's [`Error`].
pub type Result = std::result::Result<Token, Error>;
impl<T: Read> Lexer<T> {
pub fn new(reader: T) -> Self {
Self {
reader,
pos: Pos::new(),
last_char: Some(' '),
}
}
pub fn next_token(&mut self) -> Result {
while self.last_char.is_some_and(|c| c.is_whitespace()) {
self.next_char()?;
}
if let Some(c) = self.last_char {
if c == '/' {
self.handle_comment()
} else if c == '@' || c == '%' {
self.handle_symbol()
} else if c.is_alphabetic() {
self.handle_keyword()
} else if c.is_numeric() || c == '-' {
self.handle_integer()
} else {
let pos = self.pos;
self.next_char()?;
Ok(Token::new(Span::new(pos), TokenKind::Other(c)))
}
} else {
Ok(Token::new(Span::new(self.pos), TokenKind::End))
}
}
fn next_char(&mut self) -> std::result::Result<(), Error> {
let mut single_char = [0];
self.last_char = (self
.reader
.read(&mut single_char)
.map_err(|err| log_raw_fatal_error!("{}", err))?
!= 0)
.then(|| {
let c = single_char[0] as char;
self.pos.update(c);
c
});
Ok(())
}
fn handle_integer(&mut self) -> Result {
let mut span = Span::new(self.pos);
let mut num = String::from(self.last_char.unwrap());
self.next_char()?;
while self.last_char.is_some_and(|c| c.is_numeric()) {
num.push(self.last_char.unwrap());
span.update(self.pos);
self.next_char()?;
}
if let Ok(i) = num.parse() {
Ok(Token::new(span, TokenKind::Int(i)))
} else {
self.log_err_and_skip(span, &format!("invalid integer literal '{}'", num))
}
}
fn handle_symbol(&mut self) -> Result {
let mut span = Span::new(self.pos);
let tag = self.last_char.unwrap();
let mut symbol = String::from(tag);
self.next_char()?;
if self.last_char.is_some_and(|c| c.is_numeric()) {
if tag == '@' {
return self.log_err_and_skip(span, "invalid named symbol");
}
let digit = self.last_char.unwrap();
symbol.push(digit);
span.update(self.pos);
self.next_char()?;
if digit != '0' {
while self.last_char.is_some_and(|c| c.is_numeric()) {
symbol.push(self.last_char.unwrap());
span.update(self.pos);
self.next_char()?;
}
}
} else {
while self
.last_char
.is_some_and(|c| c.is_alphanumeric() || c == '_')
{
symbol.push(self.last_char.unwrap());
span.update(self.pos);
self.next_char()?;
}
}
if symbol.len() == 1 {
self.log_err_and_skip(span, &format!("invalid symbol '{}'", symbol))
} else {
Ok(Token::new(span, TokenKind::Symbol(symbol)))
}
}
fn handle_keyword(&mut self) -> Result {
let mut span = Span::new(self.pos);
let mut keyword = String::new();
while self.last_char.is_some_and(|c| c.is_alphanumeric()) {
keyword.push(self.last_char.unwrap());
span.update(self.pos);
self.next_char()?;
}
if let Some(keyword) = KEYWORDS.get(keyword.as_str()).copied() {
Ok(Token::new(span, TokenKind::Keyword(keyword)))
} else if let Some(op) = BINARY_OPS.get(keyword.as_str()).copied() {
Ok(Token::new(span, TokenKind::BinaryOp(op)))
} else {
self.log_err_and_skip(span, &format!("invalid keyword/operator '{}'", keyword))
}
}
fn handle_comment(&mut self) -> Result {
let span = Span::new(self.pos);
self.next_char()?;
if self.last_char == Some('*') {
self.handle_block_comment(span)
} else if self.last_char == Some('/') {
while self.last_char.is_some_and(|c| c != '\r' && c != '\n') {
self.next_char()?;
}
self.next_token()
} else {
self.log_err_and_skip(span.into_updated(self.pos), "invalid comment")
}
}
fn handle_block_comment(&mut self, span: Span) -> Result {
self.next_char()?;
let mut star = false;
while self.last_char.is_some() && !(star && self.last_char == Some('/')) {
star = self.last_char == Some('*');
self.next_char()?;
}
if self.last_char.is_none() {
self.log_err_and_skip(span.into_updated(self.pos), "comment unclosed at EOF")
} else {
self.next_char()?;
self.next_token()
}
}
fn log_err_and_skip(&mut self, span: Span, message: &str) -> Result {
while self.last_char.is_some_and(|c| !c.is_whitespace()) {
self.next_char()?;
}
return_error!(span, "{}", message)
}
}
/// Lookup table from the spelling of a reserved word to its [`Keyword`].
///
/// `Lexer::handle_keyword` consults this table first for every alphanumeric
/// word it reads.
static KEYWORDS: phf::Map<&'static str, Keyword> = phf::phf_map! {
"i32" => Keyword::I32,
"undef" => Keyword::Undef,
"zeroinit" => Keyword::ZeroInit,
"global" => Keyword::Global,
"alloc" => Keyword::Alloc,
"load" => Keyword::Load,
"store" => Keyword::Store,
"getptr" => Keyword::GetPtr,
"getelemptr" => Keyword::GetElemPtr,
"br" => Keyword::Br,
"jump" => Keyword::Jump,
"call" => Keyword::Call,
"ret" => Keyword::Ret,
"fun" => Keyword::Fun,
"decl" => Keyword::Decl,
};
/// Lookup table from the spelling of a binary operator to its [`BinaryOp`].
///
/// `Lexer::handle_keyword` consults this table for words that are not found in
/// `KEYWORDS`.
static BINARY_OPS: phf::Map<&'static str, BinaryOp> = phf::phf_map! {
"ne" => BinaryOp::NotEq,
"eq" => BinaryOp::Eq,
"gt" => BinaryOp::Gt,
"lt" => BinaryOp::Lt,
"ge" => BinaryOp::Ge,
"le" => BinaryOp::Le,
"add" => BinaryOp::Add,
"sub" => BinaryOp::Sub,
"mul" => BinaryOp::Mul,
"div" => BinaryOp::Div,
"mod" => BinaryOp::Mod,
"and" => BinaryOp::And,
"or" => BinaryOp::Or,
"xor" => BinaryOp::Xor,
"shl" => BinaryOp::Shl,
"shr" => BinaryOp::Shr,
"sar" => BinaryOp::Sar,
};
#[cfg(test)]
mod test {
    use super::*;
    use std::io::Cursor;

    /// Lexes a small function containing line and block comments and checks
    /// the complete sequence of token kinds, including that `End` keeps being
    /// returned once the input is exhausted.
    #[test]
    fn read_tokens() {
        let source = r#"
// comment
fun @main(): i32 {
%entry:
%ret /**/ = alloc i32
store 0, %ret
%0 = load %ret
%1 = add %0, 1
ret %1
/*
block
comment
*/
}
// comment2
"#;
        let mut lexer = Lexer::new(Cursor::new(source));

        let symbol = |name: &str| TokenKind::Symbol(name.into());
        let expected = vec![
            // fun @main(): i32 {
            TokenKind::Keyword(Keyword::Fun),
            symbol("@main"),
            TokenKind::Other('('),
            TokenKind::Other(')'),
            TokenKind::Other(':'),
            TokenKind::Keyword(Keyword::I32),
            TokenKind::Other('{'),
            // %entry:
            symbol("%entry"),
            TokenKind::Other(':'),
            // %ret /**/ = alloc i32
            symbol("%ret"),
            TokenKind::Other('='),
            TokenKind::Keyword(Keyword::Alloc),
            TokenKind::Keyword(Keyword::I32),
            // store 0, %ret
            TokenKind::Keyword(Keyword::Store),
            TokenKind::Int(0),
            TokenKind::Other(','),
            symbol("%ret"),
            // %0 = load %ret
            symbol("%0"),
            TokenKind::Other('='),
            TokenKind::Keyword(Keyword::Load),
            symbol("%ret"),
            // %1 = add %0, 1
            symbol("%1"),
            TokenKind::Other('='),
            TokenKind::BinaryOp(BinaryOp::Add),
            symbol("%0"),
            TokenKind::Other(','),
            TokenKind::Int(1),
            // ret %1
            TokenKind::Keyword(Keyword::Ret),
            symbol("%1"),
            // }
            TokenKind::Other('}'),
        ];

        for kind in expected {
            assert_eq!(lexer.next_token().unwrap().kind, kind);
        }

        // Asking again after the end of input keeps yielding `End`.
        for _ in 0..3 {
            assert_eq!(lexer.next_token().unwrap().kind, TokenKind::End);
        }
    }
}