use crate::error::{ParseError, ParseResult, Position, Span};
use super::token_types::LatexToken;
/// Cursor over a borrowed source string, tracking both the byte offset and
/// the human-readable line/column of the next unread character.
pub(super) struct Tokenizer<'a> {
/// The complete source text being tokenized.
pub(super) input: &'a str,
/// Byte offset into `input` of the next unread character; advanced by the
/// UTF-8 length of each consumed character.
pub(super) offset: usize,
/// Line number of the next unread character; starts at 1 and is incremented
/// each time a `'\n'` is consumed.
pub(super) line: usize,
/// Column of the next unread character; starts at 1, advances by one per
/// consumed character (not per byte) and resets to 1 after a `'\n'`.
pub(super) column: usize,
}
impl<'a> Tokenizer<'a> {
    /// Creates a tokenizer positioned at the very start of `input`
    /// (byte offset 0, line 1, column 1).
    pub(super) fn new(input: &'a str) -> Self {
        Self { input, offset: 0, line: 1, column: 1 }
    }

    /// Returns the current cursor location as a `Position` built from
    /// `(line, column, offset)`.
    pub(super) fn position(&self) -> Position {
        Position::new(self.line, self.column, self.offset)
    }

    /// Returns the next unread character without consuming it, or `None`
    /// at end of input.
    pub(super) fn peek(&self) -> Option<char> {
        self.peek_ahead(0)
    }

    /// Returns the character `n` characters past the cursor without
    /// consuming anything; `n == 0` is the next unread character.
    pub(super) fn peek_ahead(&self, n: usize) -> Option<char> {
        let remaining = &self.input[self.offset..];
        remaining.chars().nth(n)
    }

    /// Consumes and returns the next character, updating the byte offset
    /// and the line/column bookkeeping. Returns `None` at end of input.
    pub(super) fn consume(&mut self) -> Option<char> {
        let current = self.peek()?;
        self.offset += current.len_utf8();
        match current {
            // A newline moves to the first column of the following line.
            '\n' => {
                self.line += 1;
                self.column = 1;
            }
            _ => self.column += 1,
        }
        Some(current)
    }

    /// Advances the cursor past any run of Unicode whitespace.
    pub(super) fn skip_whitespace(&mut self) {
        while matches!(self.peek(), Some(c) if c.is_whitespace()) {
            self.consume();
        }
    }

    /// Scans a backslash command starting at the cursor.
    ///
    /// The first character is consumed unconditionally (the caller is
    /// expected to have seen the leading backslash). A second backslash
    /// yields the literal name `"\\\\"`; otherwise the name is the longest
    /// run of ASCII letters that follows.
    ///
    /// # Errors
    ///
    /// Returns an invalid-command error when no ASCII letter follows the
    /// backslash.
    pub(super) fn scan_command(&mut self) -> ParseResult<(String, Span)> {
        let start = self.position();
        // Drop the leading backslash.
        self.consume();

        if let Some('\\') = self.peek() {
            self.consume();
            return Ok((String::from("\\\\"), Span::new(start, self.position())));
        }

        let mut name = String::new();
        while let Some(letter) = self.peek().filter(char::is_ascii_alphabetic) {
            name.push(letter);
            self.consume();
        }

        let span = Span::new(start, self.position());
        if name.is_empty() {
            Err(ParseError::invalid_latex_command("\\", Some(span)))
        } else {
            Ok((name, span))
        }
    }

    /// Scans the `{name}` argument that follows `\begin` or `\end`.
    ///
    /// Leading whitespace is skipped. The name may contain ASCII
    /// alphanumerics and `*`. The returned span runs from the cursor
    /// position on entry to just past the closing brace. `is_begin` only
    /// selects which command is named in the "missing brace" message.
    ///
    /// # Errors
    ///
    /// Fails when the next non-whitespace character is not `{`, when the
    /// name contains any other character, or when input ends before `}`.
    pub(super) fn scan_environment(&mut self, is_begin: bool) -> ParseResult<(String, Span)> {
        let start = self.position();
        self.skip_whitespace();

        if !matches!(self.peek(), Some('{')) {
            let cmd = if is_begin { "\\begin" } else { "\\end" };
            let message = format!("{} must be followed by {{name}}", cmd);
            return Err(ParseError::custom(
                message,
                Some(Span::new(start, self.position())),
            ));
        }
        // Opening brace.
        self.consume();

        let mut name = String::new();
        loop {
            match self.peek() {
                None => {
                    return Err(ParseError::unexpected_eof(
                        vec!["closing brace"],
                        Some(Span::new(start, self.position())),
                    ));
                }
                Some('}') => {
                    self.consume();
                    return Ok((name, Span::new(start, self.position())));
                }
                Some(c) if c.is_ascii_alphanumeric() || c == '*' => {
                    name.push(c);
                    self.consume();
                }
                Some(bad) => {
                    let message = format!("invalid character '{}' in environment name", bad);
                    return Err(ParseError::custom(
                        message,
                        Some(Span::new(start, self.position())),
                    ));
                }
            }
        }
    }

    /// Scans an unsigned decimal literal: ASCII digits with at most one
    /// `.`, which is accepted only when a digit immediately follows it.
    /// Returns the literal text (possibly empty if the cursor is not on a
    /// digit) and its span.
    pub(super) fn scan_number(&mut self) -> (String, Span) {
        let start = self.position();
        let mut digits = String::new();
        let mut seen_point = false;

        loop {
            match self.peek() {
                Some(d) if d.is_ascii_digit() => digits.push(d),
                // Take the point only once, and only as part of `<digit>.<digit>`.
                Some('.')
                    if !seen_point
                        && matches!(self.peek_ahead(1), Some(next) if next.is_ascii_digit()) =>
                {
                    seen_point = true;
                    digits.push('.');
                }
                _ => break,
            }
            self.consume();
        }

        (digits, Span::new(start, self.position()))
    }

    /// Produces the next token and its span, skipping leading whitespace.
    ///
    /// End of input yields `LatexToken::Eof` with a zero-width span.
    /// Backslash commands are scanned with `scan_command` and, unless they
    /// are the double backslash, handed to `resolve_command` (defined
    /// outside this block). Digits start a number; single punctuation
    /// characters and ASCII letters map to their own tokens.
    ///
    /// # Errors
    ///
    /// Propagates command-scanning errors and reports any other character
    /// as unexpected.
    pub(super) fn next_token(&mut self) -> ParseResult<(LatexToken, Span)> {
        self.skip_whitespace();
        let start = self.position();

        let ch = match self.peek() {
            None => return Ok((LatexToken::Eof, Span::at(start))),
            Some(c) => c,
        };

        if ch == '\\' {
            let (cmd, span) = self.scan_command()?;
            return if cmd == "\\\\" {
                Ok((LatexToken::DoubleBackslash, span))
            } else {
                self.resolve_command(cmd, span, start)
            };
        }

        // Numbers are scanned from their first digit, so nothing is
        // consumed here beforehand.
        if ch.is_ascii_digit() {
            let (num, num_span) = self.scan_number();
            return Ok((LatexToken::Number(num), num_span));
        }

        self.consume();
        let span = Span::new(start, self.position());
        let token = match ch {
            '+' => LatexToken::Plus,
            '-' => LatexToken::Minus,
            '*' => LatexToken::Star,
            '/' => LatexToken::Slash,
            '^' => LatexToken::Caret,
            '_' => LatexToken::Underscore,
            '=' => LatexToken::Equals,
            '<' => LatexToken::Less,
            '>' => LatexToken::Greater,
            '{' => LatexToken::LBrace,
            '}' => LatexToken::RBrace,
            '(' => LatexToken::LParen,
            ')' => LatexToken::RParen,
            '[' => LatexToken::LBracket,
            ']' => LatexToken::RBracket,
            '|' => LatexToken::Pipe,
            '&' => LatexToken::Ampersand,
            ',' => LatexToken::Comma,
            ':' => LatexToken::Colon,
            letter if letter.is_ascii_alphabetic() => LatexToken::Letter(letter),
            other => {
                return Err(ParseError::custom(
                    format!("unexpected character '{}'", other),
                    Some(span),
                ));
            }
        };
        Ok((token, span))
    }
}
/// Tokenizes `input` completely, returning every token paired with its span.
///
/// The returned vector always ends with the `LatexToken::Eof` token.
///
/// # Errors
///
/// Returns the first error produced by the tokenizer; tokens collected up to
/// that point are discarded.
pub fn tokenize_latex(input: &str) -> ParseResult<Vec<(LatexToken, Span)>> {
    let mut lexer = Tokenizer::new(input);
    let mut collected = Vec::new();
    loop {
        let entry = lexer.next_token()?;
        // Decide before the push moves `entry` into the vector.
        let reached_end = matches!(entry.0, LatexToken::Eof);
        collected.push(entry);
        if reached_end {
            return Ok(collected);
        }
    }
}