use crate::token::{Token, keyword};
use crate::{Error, Span};
/// A lexed [`Token`] paired with the source position at which it begins.
pub struct SpannedToken {
/// The token that was recognized.
pub token: Token,
/// Line/column captured by the lexer immediately before the token was consumed.
pub span: Span,
}
/// Character-level scanner that turns source text into [`SpannedToken`]s.
pub struct Lexer {
/// The source text, decoded into individual `char`s so it can be indexed by `position`.
source: Vec<char>,
/// Index into `source` of the next unread character.
position: usize,
/// Line of the next unread character; starts at 1 and grows by one per `'\n'` consumed.
line: u32,
/// Column of the next unread character; starts at 1 and resets to 1 after each `'\n'`.
column: u32,
}
impl Lexer {
/// Creates a lexer positioned at the first character of `source`.
///
/// The text is decoded into `char`s up front; the line and column counters both start at 1.
pub fn new(source: &str) -> Self {
    let source: Vec<char> = source.chars().collect();
    Self {
        source,
        position: 0,
        line: 1,
        column: 1,
    }
}
/// Lexes the remaining input into a list of positioned tokens.
///
/// Whitespace and `//` comments between tokens are skipped. On success the returned
/// vector always ends with a [`Token::End`] entry whose span is the end-of-input position.
///
/// # Errors
///
/// Propagates the first error produced while lexing a token; tokens collected before
/// that point are dropped.
pub fn tokenize(&mut self) -> crate::Result<Vec<SpannedToken>> {
    let mut output = Vec::new();
    loop {
        self.skip_whitespace_and_comments();
        // Capture the position before consuming anything so the span marks the token's start.
        let start = self.span();
        let done = self.at_end();
        let token = if done {
            Token::End
        } else {
            self.next_token()?
        };
        output.push(SpannedToken { token, span: start });
        if done {
            return Ok(output);
        }
    }
}
/// Returns the lexer's current line and column as a [`Span`].
fn span(&self) -> Span {
    let (line, column) = (self.line, self.column);
    Span { line, column }
}
/// Reports whether every character of the source has been consumed.
fn at_end(&self) -> bool {
    self.source.len() <= self.position
}
/// Returns the next unread character without consuming it, or `'\0'` at end of input.
fn peek(&self) -> char {
    match self.source.get(self.position) {
        Some(&ch) => ch,
        None => '\0',
    }
}
/// Returns the character one past the next unread one without consuming anything,
/// or `'\0'` when that position lies beyond the end of the input.
fn peek_next(&self) -> char {
    let index = self.position + 1;
    if let Some(&ch) = self.source.get(index) {
        ch
    } else {
        '\0'
    }
}
/// Consumes the next character and returns it, keeping `line`/`column` in sync.
///
/// A `'\n'` moves the position to column 1 of the following line; every other
/// character moves one column to the right. At end of input this returns `'\0'`
/// (the value `peek` yields there) and still bumps `position` and `column`, so
/// callers are expected to check `at_end` first.
fn advance(&mut self) -> char {
    let current = self.peek();
    self.position += 1;
    match current {
        '\n' => {
            self.line += 1;
            self.column = 1;
        }
        _ => self.column += 1,
    }
    current
}
/// Advances past any mix of whitespace and `//` line comments.
///
/// A comment is consumed up to, but not including, its terminating `'\n'`; that
/// newline is then skipped as ordinary whitespace on the next pass of the loop.
fn skip_whitespace_and_comments(&mut self) {
    loop {
        while !self.at_end() && self.peek().is_whitespace() {
            self.advance();
        }

        let at_line_comment = self.peek() == '/' && self.peek_next() == '/';
        if !at_line_comment {
            return;
        }

        while !(self.at_end() || self.peek() == '\n') {
            self.advance();
        }
    }
}
fn next_token(&mut self) -> crate::Result<Token> {
let ch = self.peek();
if ch.is_ascii_digit() {
return self.lex_number();
}
if ch.is_ascii_alphabetic() || ch == '_' {
return Ok(self.lex_identifier());
}
if ch == '"' {
return self.lex_string();
}
let span = self.span();
self.advance();
match ch {
'(' => Ok(Token::LeftParen),
')' => Ok(Token::RightParen),
'{' => Ok(Token::LeftBrace),
'}' => Ok(Token::RightBrace),
'[' => Ok(Token::LeftBracket),
']' => Ok(Token::RightBracket),
';' => Ok(Token::Semicolon),
',' => Ok(Token::Comma),
':' => {
if self.peek() == '=' {
self.advance();
Ok(Token::ColonEqual)
} else {
Ok(Token::Colon)
}
}
'@' => Ok(Token::At),
'~' => Ok(Token::Tilde),
'^' => Ok(Token::Caret),
'%' => Ok(Token::Percent),
'+' => {
if self.peek() == '=' {
self.advance();
Ok(Token::PlusEqual)
} else {
Ok(Token::Plus)
}
}
'/' => {
if self.peek() == '=' {
self.advance();
Ok(Token::SlashEqual)
} else {
Ok(Token::Slash)
}
}
'.' => {
if self.peek() == '.' && self.peek_next() == '.' {
self.advance();
self.advance();
Ok(Token::Ellipsis)
} else {
Ok(Token::Dot)
}
}
'*' => {
if self.peek() == '=' {
self.advance();
Ok(Token::StarEqual)
} else {
Ok(Token::Star)
}
}
'-' => {
if self.peek() == '=' {
self.advance();
Ok(Token::MinusEqual)
} else if self.peek() == '>' {
self.advance();
Ok(Token::Arrow)
} else {
Ok(Token::Minus)
}
}
'=' => {
if self.peek() == '=' {
self.advance();
Ok(Token::EqualEqual)
} else if self.peek() == '>' {
self.advance();
Ok(Token::FatArrow)
} else {
Ok(Token::Equal)
}
}
'!' => {
if self.peek() == '=' {
self.advance();
Ok(Token::BangEqual)
} else {
Ok(Token::Bang)
}
}
'<' => {
if self.peek() == '=' {
self.advance();
Ok(Token::LessEqual)
} else if self.peek() == '<' {
self.advance();
Ok(Token::LessLess)
} else {
Ok(Token::Less)
}
}
'>' => {
if self.peek() == '=' {
self.advance();
Ok(Token::GreaterEqual)
} else if self.peek() == '>' {
self.advance();
Ok(Token::GreaterGreater)
} else {
Ok(Token::Greater)
}
}
'&' => {
if self.peek() == '&' {
self.advance();
Ok(Token::AmpAmp)
} else {
Ok(Token::Amp)
}
}
'|' => {
if self.peek() == '|' {
self.advance();
Ok(Token::PipePipe)
} else {
Ok(Token::Pipe)
}
}
_ => Err(Error {
message: format!("unexpected character '{ch}'"),
span: Some(span),
}),
}
}
fn lex_number(&mut self) -> crate::Result<Token> {
let span = self.span();
let first = self.advance();
if first == '0' && matches!(self.peek(), 'x' | 'b' | 'o') {
let prefix = self.advance();
let (radix, valid_digit): (u32, fn(char) -> bool) = match prefix {
'x' => (16, |c: char| c.is_ascii_hexdigit()),
'b' => (2, |c: char| c == '0' || c == '1'),
'o' => (8, |c: char| ('0'..='7').contains(&c)),
_ => unreachable!(),
};
let mut digits = std::string::String::new();
while !self.at_end() && (valid_digit(self.peek()) || self.peek() == '_') {
let ch = self.advance();
if ch != '_' {
digits.push(ch);
}
}
let value = u128::from_str_radix(&digits, radix).map_err(|_| Error {
message: format!("invalid integer literal '0{prefix}{digits}'"),
span: Some(span),
})?;
return Ok(Token::Integer(value));
}
let mut number = std::string::String::from(first);
while !self.at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
let ch = self.advance();
if ch != '_' {
number.push(ch);
}
}
if self.peek() == '.' && self.peek_next().is_ascii_digit() {
number.push(self.advance());
while !self.at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
let ch = self.advance();
if ch != '_' {
number.push(ch);
}
}
let value: f64 = number.parse().map_err(|_| Error {
message: format!("invalid float literal '{number}'"),
span: Some(span),
})?;
Ok(Token::Float(value))
} else {
let value: u128 = number.parse().map_err(|_| Error {
message: format!("invalid integer literal '{number}'"),
span: Some(span),
})?;
Ok(Token::Integer(value))
}
}
/// Lexes an identifier or keyword starting at the current character.
///
/// Consumes the longest run of ASCII letters, ASCII digits and `_`. The text is
/// returned as the token `keyword` maps it to, or as [`Token::Identifier`] when
/// `keyword` does not recognize it.
fn lex_identifier(&mut self) -> Token {
    let mut text = std::string::String::new();
    // At end of input `peek` yields '\0', which is not an identifier character,
    // so the loop also terminates there.
    while matches!(self.peek(), 'a'..='z' | 'A'..='Z' | '0'..='9' | '_') {
        let ch = self.advance();
        text.push(ch);
    }
    let reserved = keyword(&text);
    reserved.unwrap_or(Token::Identifier(text))
}
/// Lexes a double-quoted string literal starting at the opening `"`.
///
/// The contents are returned as bytes in [`Token::String`]: ordinary characters are
/// appended as their UTF-8 encoding, and the escapes `\n`, `\t`, `\r`, `\\`, `\"`
/// and `\0` are replaced by the single byte they denote. Raw newlines inside the
/// literal are accepted and kept.
///
/// # Errors
///
/// - "unterminated string literal", located at the opening quote, when the input
///   ends before the closing `"`;
/// - "unterminated escape sequence", located at the backslash, when the input ends
///   right after a `\`;
/// - "unknown escape sequence", located at the backslash, for any other escaped
///   character.
fn lex_string(&mut self) -> crate::Result<Token> {
    let span = self.span();
    // Skip the opening quote.
    self.advance();
    let mut bytes = Vec::new();
    loop {
        if self.at_end() {
            return Err(Error {
                message: "unterminated string literal".into(),
                span: Some(span),
            });
        }

        // Remember where this character starts: escape errors must point at the
        // backslash itself, not at whatever follows the escape (which may even be
        // on the next line when the escaped character is a newline).
        let char_span = self.span();
        let ch = self.advance();
        match ch {
            '"' => break,
            '\\' => {
                if self.at_end() {
                    return Err(Error {
                        message: "unterminated escape sequence".into(),
                        span: Some(char_span),
                    });
                }
                let escaped = self.advance();
                let byte = match escaped {
                    'n' => b'\n',
                    't' => b'\t',
                    'r' => b'\r',
                    '\\' => b'\\',
                    '"' => b'"',
                    '0' => 0,
                    _ => {
                        return Err(Error {
                            message: format!("unknown escape sequence '\\{escaped}'"),
                            span: Some(char_span),
                        });
                    }
                };
                bytes.push(byte);
            }
            _ => {
                // Non-escaped characters are stored as their UTF-8 byte sequence.
                let mut buffer = [0u8; 4];
                let encoded = ch.encode_utf8(&mut buffer);
                bytes.extend_from_slice(encoded.as_bytes());
            }
        }
    }
    Ok(Token::String(bytes))
}
}