use crate::error::{ParseError, ParseResult, Position, Span};
use super::token_types::{SpannedToken, Token};
/// Streaming tokenizer over a borrowed input string.
///
/// Tracks both a byte `offset` (used to slice `input`) and 1-based
/// `line`/`column` counters (used to build human-readable `Position`s
/// for error spans).
pub(super) struct Tokenizer<'a> {
// The full source text being tokenized; never mutated.
input: &'a str,
// Byte offset of the next unread character (always a UTF-8 boundary).
offset: usize,
// 1-based line of the next unread character; incremented on '\n'.
line: usize,
// 1-based column of the next unread character; reset to 1 after '\n'.
column: usize,
}
impl<'a> Tokenizer<'a> {
pub(super) fn new(input: &'a str) -> Self {
Self {
input,
offset: 0,
line: 1,
column: 1,
}
}
/// Snapshots the current cursor location as a `Position`.
fn position(&self) -> Position {
    let (line, column, offset) = (self.line, self.column, self.offset);
    Position::new(line, column, offset)
}
/// Returns the next unread character without consuming it.
/// (`nth(0)` on a fresh `chars()` iterator is exactly `next()`.)
fn peek(&self) -> Option<char> {
    self.peek_ahead(0)
}
/// Returns the character `n` characters past the cursor, if any,
/// without consuming anything.
fn peek_ahead(&self, n: usize) -> Option<char> {
    let rest = &self.input[self.offset..];
    rest.chars().nth(n)
}
/// Consumes and returns the next character, advancing the byte offset by
/// the character's UTF-8 width and updating the line/column bookkeeping
/// (a newline bumps `line` and resets `column` to 1).
fn consume(&mut self) -> Option<char> {
    let ch = self.peek()?;
    self.offset += ch.len_utf8();
    match ch {
        '\n' => {
            self.line += 1;
            self.column = 1;
        }
        _ => self.column += 1,
    }
    Some(ch)
}
/// Advances the cursor past any run of whitespace characters.
fn skip_whitespace(&mut self) {
    while matches!(self.peek(), Some(ch) if ch.is_whitespace()) {
        self.consume();
    }
}
/// If the cursor sits on `.` followed by a digit, appends the `.` and the
/// subsequent digit run to `buf` and returns `true`. Otherwise consumes
/// nothing and returns `false` (so a lone `.` is left for other rules).
fn scan_decimal_part(&mut self, buf: &mut String) -> bool {
    let dot_here = self.peek() == Some('.');
    let digit_next = matches!(self.peek_ahead(1), Some(d) if d.is_ascii_digit());
    if !(dot_here && digit_next) {
        return false;
    }
    buf.push('.');
    self.consume();
    while matches!(self.peek(), Some(d) if d.is_ascii_digit()) {
        // The matches! guard guarantees a digit is available here.
        buf.push(self.consume().expect("digit was just peeked"));
    }
    true
}
/// If the cursor sits on a well-formed exponent — `e`/`E`, an optional
/// sign, and at least one digit — appends it to `buf` and returns `true`.
/// Otherwise consumes nothing and returns `false`.
///
/// The one-or-two character lookahead mirrors `scan_decimal_part`:
/// without it, input such as `1e` or `2E+x` consumed the marker (and
/// sign) unconditionally, producing text like `"1e"` that then failed the
/// float parse with a spurious invalid-number error, instead of lexing
/// `1` and the identifier `e` as separate tokens.
fn scan_exponent_part(&mut self, buf: &mut String) -> bool {
    let Some(marker @ ('e' | 'E')) = self.peek() else {
        return false;
    };
    // Commit only when digits actually follow the marker / optional sign.
    let digits_follow = match self.peek_ahead(1) {
        Some(d) if d.is_ascii_digit() => true,
        Some('+' | '-') => matches!(self.peek_ahead(2), Some(d) if d.is_ascii_digit()),
        _ => false,
    };
    if !digits_follow {
        return false;
    }
    buf.push(marker);
    self.consume();
    if let Some(sign @ ('+' | '-')) = self.peek() {
        buf.push(sign);
        self.consume();
    }
    while let Some(ch) = self.peek() {
        if ch.is_ascii_digit() {
            buf.push(ch);
            self.consume();
        } else {
            break;
        }
    }
    true
}
/// Lexes a numeric literal at the cursor: a leading digit run, then an
/// optional fractional part and an optional exponent. Yields
/// `Token::Float` when either optional part is present, `Token::Integer`
/// otherwise; text that fails to parse becomes an invalid-number error
/// carrying the literal's span.
fn scan_number(&mut self) -> ParseResult<(Token, Span)> {
    let start = self.position();
    let mut text = String::new();
    while matches!(self.peek(), Some(d) if d.is_ascii_digit()) {
        text.push(self.consume().expect("digit was just peeked"));
    }
    // Both scans must run unconditionally; either one makes it a float.
    let has_fraction = self.scan_decimal_part(&mut text);
    let has_exponent = self.scan_exponent_part(&mut text);
    let span = Span::new(start, self.position());
    let token = if has_fraction || has_exponent {
        match text.parse::<f64>() {
            Ok(value) => Token::Float(value),
            Err(_) => {
                return Err(ParseError::invalid_number(&text, "invalid float", Some(span)));
            }
        }
    } else {
        match text.parse::<i64>() {
            Ok(value) => Token::Integer(value),
            Err(_) => {
                return Err(ParseError::invalid_number(&text, "invalid integer", Some(span)));
            }
        }
    };
    Ok((token, span))
}
/// Lexes an identifier or keyword: one ASCII letter followed by any run
/// of ASCII alphanumerics. Reserved words map to dedicated tokens; any
/// other name becomes `Token::Identifier`.
fn scan_identifier(&mut self) -> (Token, Span) {
    let start = self.position();
    let mut name = String::new();
    if matches!(self.peek(), Some(c) if c.is_ascii_alphabetic()) {
        name.push(self.consume().expect("letter was just peeked"));
    }
    while matches!(self.peek(), Some(c) if c.is_ascii_alphanumeric()) {
        name.push(self.consume().expect("alphanumeric was just peeked"));
    }
    let span = Span::new(start, self.position());
    let token = match name.as_str() {
        "dot" => Token::Dot,
        "cross" => Token::Cross,
        "grad" | "nabla" => Token::Grad,
        "div" => Token::Div,
        "curl" => Token::Curl,
        "laplacian" => Token::Laplacian,
        "forall" => Token::ForAll,
        "exists" => Token::Exists,
        "union" => Token::Union,
        "intersect" => Token::Intersect,
        "in" => Token::In,
        "notin" => Token::NotIn,
        "and" => Token::And,
        "or" => Token::Or,
        "not" => Token::Not,
        "implies" => Token::Implies,
        "iff" => Token::Iff,
        _ => Token::Identifier(name),
    };
    (token, span)
}
/// Consumes one operator character, then greedily consumes `follow` when
/// it comes next. Yields `long_tok` for the two-character form and
/// `short_tok` otherwise, spanning everything consumed from `start`.
fn scan_two_char_op(
    &mut self,
    start: Position,
    follow: char,
    short_tok: Token,
    long_tok: Token,
) -> SpannedToken {
    self.consume();
    let matched_follow = self.peek() == Some(follow);
    if matched_follow {
        self.consume();
    }
    let token = if matched_follow { long_tok } else { short_tok };
    SpannedToken::new(token, Span::new(start, self.position()))
}
/// Consumes a single (possibly multi-byte) character and attaches the
/// span it covered to the pre-chosen token `tok`.
fn scan_unicode_token(&mut self, start: Position, tok: Token) -> SpannedToken {
    self.consume();
    let span = Span::new(start, self.position());
    SpannedToken::new(tok, span)
}
// Tries to lex `ch` as a compound ASCII operator or a single Unicode
// symbol token. On a match the relevant characters are consumed and the
// spanned token returned; otherwise Ok(None) is returned with the cursor
// untouched so `scan_token` can fall through to its single-char table.
// (The ParseResult wrapper never carries an error here; it keeps the
// signature uniform with the other scanners.)
fn scan_multi_char_op(
&mut self,
ch: char,
start: Position,
) -> ParseResult<Option<SpannedToken>> {
let token = match ch {
// Two-character ASCII operators with one-character fallbacks.
'!' => self.scan_two_char_op(start, '=', Token::Bang, Token::NotEquals),
'<' => self.scan_two_char_op(start, '=', Token::Less, Token::LessEq),
'>' => self.scan_two_char_op(start, '=', Token::Greater, Token::GreaterEq),
'*' => self.scan_two_char_op(start, '*', Token::Star, Token::DoubleStar),
// Single Unicode symbols, each an alias of an ASCII or keyword form.
'≤' => self.scan_unicode_token(start, Token::LessEq),
'≥' => self.scan_unicode_token(start, Token::GreaterEq),
'≠' => self.scan_unicode_token(start, Token::NotEquals),
'π' => self.scan_unicode_token(start, Token::Pi),
'∞' => self.scan_unicode_token(start, Token::Infinity),
'√' => self.scan_unicode_token(start, Token::Sqrt),
'∇' => self.scan_unicode_token(start, Token::Grad),
'∧' => self.scan_unicode_token(start, Token::And),
'∨' => self.scan_unicode_token(start, Token::Or),
'¬' => self.scan_unicode_token(start, Token::Not),
'∪' => self.scan_unicode_token(start, Token::Union),
'∩' => self.scan_unicode_token(start, Token::Intersect),
'∈' => self.scan_unicode_token(start, Token::In),
'∉' => self.scan_unicode_token(start, Token::NotIn),
'→' => self.scan_unicode_token(start, Token::Implies),
'↔' => self.scan_unicode_token(start, Token::Iff),
_ => return Ok(None),
};
Ok(Some(token))
}
// Lexes the next token after skipping whitespace. Returns Ok(None) at end
// of input, Ok(Some(..)) for a recognized token, and Err for a character
// no rule accepts.
fn scan_token(&mut self) -> ParseResult<Option<SpannedToken>> {
self.skip_whitespace();
let Some(ch) = self.peek() else {
return Ok(None);
};
let start = self.position();
// Numbers and identifiers/keywords dispatch on the first character.
if ch.is_ascii_digit() {
let (token, span) = self.scan_number()?;
return Ok(Some(SpannedToken::new(token, span)));
}
if ch.is_ascii_alphabetic() {
let (token, span) = self.scan_identifier();
return Ok(Some(SpannedToken::new(token, span)));
}
// Compound ASCII operators and Unicode symbols (consumes on success,
// leaves the cursor untouched on Ok(None)).
if let Some(tok) = self.scan_multi_char_op(ch, start)? {
return Ok(Some(tok));
}
// Single-character tokens: consume up front so the span is correct for
// both the token and the unexpected-character error below.
self.consume();
let end = self.position();
let span = Span::new(start, end);
let token = match ch {
'+' => Token::Plus,
'-' => Token::Minus,
'/' => Token::Slash,
'^' => Token::Caret,
'%' => Token::Percent,
'(' => Token::LParen,
')' => Token::RParen,
'[' => Token::LBracket,
']' => Token::RBracket,
'{' => Token::LBrace,
'}' => Token::RBrace,
',' => Token::Comma,
';' => Token::Semicolon,
'=' => Token::Equals,
'_' => Token::Underscore,
'\'' => Token::Apostrophe,
_ => {
return Err(ParseError::unexpected_token(
vec!["valid token".to_string()],
ch.to_string(),
Some(span),
));
}
};
Ok(Some(SpannedToken::new(token, span)))
}
/// Lexes the whole input into a vector of spanned tokens, stopping at end
/// of input or propagating the first lexical error.
pub(super) fn tokenize_all(&mut self) -> ParseResult<Vec<SpannedToken>> {
    let mut tokens = Vec::new();
    loop {
        match self.scan_token()? {
            Some(tok) => tokens.push(tok),
            None => return Ok(tokens),
        }
    }
}
}
/// Convenience entry point: tokenizes `input` in one shot.
pub fn tokenize(input: &str) -> ParseResult<Vec<SpannedToken>> {
    Tokenizer::new(input).tokenize_all()
}