use std::mem;
use std::ops::Range;
use std::str::CharIndices;
use mathml_renderer::{attribute::OpAttrs, symbol};
use crate::CommandConfig;
use crate::commands::get_command;
use crate::environments::Env;
use crate::error::{GetUnwrap, LatexErrKind, LatexError};
use crate::token::{EndToken, Mode, Span, TokSpan, Token};
/// Tokenizer for LaTeX input with one character of lookahead.
pub(crate) struct Lexer<'config, 'source>
where
'config: 'source,
{
// Iterator over the remaining characters of the input.
input: CharIndices<'source>,
// One-character lookahead: byte offset plus the character (`None` at end of input).
peek: (usize, Option<char>),
// The full input string, used for taking `&str` slices by byte range.
input_string: &'source str,
// Byte length of the input; used as the offset reported at end of input.
input_length: usize,
// While parsing a custom command definition: highest `#n` parameter seen so
// far (as a count); `None` when macro parameters are not allowed.
parse_cmd_args: Option<u8>,
// Optional configuration holding user-defined commands and lexer options.
cmd_cfg: Option<&'config CommandConfig>,
}
impl<'config, 'source> Lexer<'config, 'source> {
    /// Creates a new `Lexer` over `input`.
    ///
    /// If `parsing_custom_cmds` is true, macro parameters (`#1`–`#9`) are
    /// accepted and the number of parameters seen is tracked; otherwise a
    /// `#` in the input is an error.
    pub(crate) fn new(
        input: &'source str,
        parsing_custom_cmds: bool,
        cmd_cfg: Option<&'config CommandConfig>,
    ) -> Self {
        let mut lexer = Lexer {
            input: input.char_indices(),
            peek: (0, None),
            input_string: input,
            input_length: input.len(),
            parse_cmd_args: if parsing_custom_cmds { Some(0) } else { None },
            cmd_cfg,
        };
        // Prime `peek` with the first character so one-character lookahead
        // is available from the start.
        lexer.read_char();
        lexer
    }

    /// Byte length of the input string.
    #[inline]
    pub(super) fn input_length(&self) -> usize {
        self.input_length
    }

    /// Number of custom-command parameters seen so far, or `None` when not
    /// parsing a custom command definition.
    #[inline]
    pub(crate) fn parse_cmd_args(&self) -> Option<u8> {
        self.parse_cmd_args
    }

    /// Consumes the current lookahead and advances by one character.
    ///
    /// Returns the previous `(byte_offset, char)` pair; once the iterator is
    /// exhausted the new lookahead becomes `(input_length, None)`.
    fn read_char(&mut self) -> (usize, Option<char>) {
        mem::replace(
            &mut self.peek,
            self.input
                .next()
                .map_or((self.input_length, None), |(idx, ch)| (idx, Some(ch))),
        )
    }

    /// Skips ASCII whitespace, returning the span of the *first* skipped
    /// character (if any) so the caller can emit a `Whitespace` token.
    fn skip_whitespace(&mut self) -> Option<Span> {
        let mut span: Option<Span> = None;
        while let (loc, Some(ch)) = self.peek
            && ch.is_ascii_whitespace()
        {
            self.read_char();
            if span.is_none() {
                span = Some(Span::new(loc, loc + ch.len_utf8()));
            }
        }
        span
    }

    /// Reads a command name following a backslash.
    ///
    /// A name is a run of ASCII letters, optionally followed by `*` (starred
    /// variant). If no letter follows the backslash, a single character is
    /// consumed instead (e.g. `\{`). Returns the name slice and its end
    /// byte offset.
    #[inline]
    fn read_command(&mut self) -> (&'source str, usize) {
        let start = self.peek.0;
        while self.peek.1.is_some_and(|ch| ch.is_ascii_alphabetic()) {
            self.read_char();
        }
        // A trailing `*` is part of the command name (starred variant).
        if self.peek.1 == Some('*') {
            self.read_char();
        }
        // No letters were consumed: single-character command like `\{`.
        if start == self.peek.0 {
            self.read_char();
        }
        let end = self.peek.0;
        (self.input_string.get_unwrap(start..end), end)
    }

    /// Reads an environment name, either brace-delimited (`{matrix}`) or a
    /// bare letter/`*` run. On failure returns the offending character
    /// (`None` at end of input) together with its byte range.
    #[inline]
    fn read_env_name(&mut self) -> Result<(&'source str, usize), CharSpan> {
        let (loc, first) = self.read_char();
        if first != Some('{') {
            // Bare name without braces: only letters and `*` are accepted.
            return if first.is_some_and(|ch| ch.is_ascii_alphabetic() || matches!(ch, '*')) {
                Ok((self.input_string.get_unwrap(loc..self.peek.0), self.peek.0))
            } else {
                Err((first, loc..(loc + first.map_or(0, char::len_utf8))))
            };
        }
        let start = self.peek.0;
        // Inside braces, whitespace is additionally tolerated.
        while self.peek.1.is_some_and(|ch| {
            ch.is_ascii_alphabetic() || ch.is_ascii_whitespace() || matches!(ch, '*')
        }) {
            self.read_char();
        }
        let (loc, closing) = self.read_char();
        if closing == Some('}') {
            let end = loc;
            // `end + 1` points just past the closing brace.
            Ok((self.input_string.get_unwrap(start..end), end + 1))
        } else {
            Err((closing, loc..(loc + closing.map_or(0, char::len_utf8))))
        }
    }

    /// Returns the next token.
    ///
    /// Unknown commands become `Token::UnknownCommand` when the configuration
    /// asks to ignore them, and an `UnknownCommand` error otherwise.
    pub(crate) fn next_token(&mut self) -> Result<TokSpan<'source>, Box<LatexError>> {
        match self.next_token_internal() {
            LexerResult::Tok(tok) => Ok(tok),
            LexerResult::UnknownCommand(cmd, span) => {
                if self.cmd_cfg.is_some_and(|cfg| cfg.ignore_unknown_commands) {
                    Ok(TokSpan::new(Token::UnknownCommand(cmd), span))
                } else {
                    Err(Box::new(LatexError(
                        span.into(),
                        LatexErrKind::UnknownCommand(cmd.into()),
                    )))
                }
            }
            LexerResult::Err(err) => Err(err),
        }
    }

    /// Like [`Self::next_token`], but unknown commands are always an error,
    /// regardless of the configuration.
    pub(crate) fn next_token_no_unknown_command(
        &mut self,
    ) -> Result<TokSpan<'config>, Box<LatexError>> {
        match self.next_token_internal() {
            LexerResult::Tok(tok) => Ok(tok),
            LexerResult::UnknownCommand(cmd, span) => Err(Box::new(LatexError(
                span.into(),
                LatexErrKind::UnknownCommand(cmd.into()),
            ))),
            LexerResult::Err(err) => Err(err),
        }
    }

    /// Produces the next token, classifying unknown commands separately so
    /// the public entry points can decide how to handle them.
    fn next_token_internal(&mut self) -> LexerResult<'config, 'source> {
        if let Some(span) = self.skip_whitespace() {
            return LexerResult::Tok(TokSpan::new(Token::Whitespace, span));
        }
        let (loc, ch) = self.read_char();
        let Some(ch) = ch else {
            return LexerResult::Tok(TokSpan::new(Token::Eoi, Span::zero_width(loc)));
        };
        if ch == '%' {
            // Comment: skip to the end of the line (or input) and retry.
            while self.peek.1 != Some('\n') && self.peek.1.is_some() {
                self.read_char();
            }
            self.read_char();
            return self.next_token_internal();
        }
        let mut span = Span::new(loc, loc + ch.len_utf8());
        let tok = match ch {
            '\u{0}' => {
                return LexerResult::Err(Box::new(LatexError(
                    loc..(loc + 1),
                    LatexErrKind::DisallowedChar(ch),
                )));
            }
            ' ' => Token::Letter(symbol::NO_BREAK_SPACE, Mode::MathOrText),
            '"' => Token::Letter(symbol::RIGHT_DOUBLE_QUOTATION_MARK, Mode::MathOrText),
            '#' => {
                if let Some(num) = &mut self.parse_cmd_args {
                    if let Some(next) = self.peek.1
                        && next.is_ascii_digit()
                    {
                        // `#1` maps to parameter index 0, ..., `#9` to 8;
                        // `#0` wraps around and is rejected by the checks below.
                        let param_num = (next as u32).wrapping_sub('1' as u32);
                        let param_num = if let Ok(param_num) = u8::try_from(param_num)
                            && (0..=8).contains(&param_num)
                        {
                            param_num
                        } else {
                            return LexerResult::Err(Box::new(LatexError(
                                (loc + 1)..(loc + 2),
                                LatexErrKind::InvalidParameterNumber,
                            )));
                        };
                        // Track the highest parameter number seen so far.
                        if (param_num + 1) > *num {
                            *num = param_num + 1;
                        }
                        self.read_char();
                        span = span.with_length(2);
                        Token::CustomCmdArg(param_num)
                    } else {
                        // `#` not followed by a digit.
                        let (loc, ch) = self.read_char();
                        if let Some(ch) = ch {
                            return LexerResult::Err(Box::new(LatexError(
                                loc..(loc + ch.len_utf8()),
                                LatexErrKind::InvalidParameterNumber,
                            )));
                        }
                        return LexerResult::Err(Box::new(LatexError(
                            loc..loc,
                            LatexErrKind::ExpectedParamNumberGotEOI,
                        )));
                    }
                } else {
                    // `#` is only valid inside a custom command definition.
                    return LexerResult::Err(Box::new(LatexError(
                        loc..(loc + 1),
                        LatexErrKind::MacroParameterOutsideCustomCommand,
                    )));
                }
            }
            '&' => Token::NewColumn,
            '\'' => Token::Prime,
            '<' => Token::OpLessThan,
            '>' => Token::OpGreaterThan,
            '[' => Token::SquareBracketOpen,
            ']' => Token::SquareBracketClose,
            '^' => Token::Circumflex,
            '_' => Token::Underscore,
            '`' => Token::Letter(symbol::LEFT_SINGLE_QUOTATION_MARK, Mode::MathOrText),
            '{' => Token::GroupBegin,
            '}' => Token::GroupEnd,
            '~' => Token::NonBreakingSpace,
            '\\' => {
                let (cmd_string, end) = self.read_command();
                let span = Span::new(loc, end);
                // Whitespace after a command name is not significant.
                self.skip_whitespace();
                return self.parse_command(span, cmd_string);
            }
            c => {
                if let Some(tok) = nonalpha_nonspecial_ascii_to_token(c) {
                    tok
                } else if c.is_ascii_digit() {
                    Token::Digit(c)
                } else {
                    Token::Letter(c, Mode::MathOrText)
                }
            }
        };
        LexerResult::Tok(TokSpan::new(tok, span))
    }

    /// Resolves a command name (without the backslash) into a token.
    ///
    /// Lookup order: opt-in unreliably-rendered commands, user-defined
    /// commands, built-in commands, then `\begin`/`\end` environment
    /// handling; anything else is reported as an unknown command.
    fn parse_command(
        &mut self,
        span: Span,
        cmd_string: &'source str,
    ) -> LexerResult<'config, 'source> {
        // Commands whose rendering is known to be unreliable are only
        // available when the configuration explicitly opts in.
        'unreliable_rendering: {
            if self
                .cmd_cfg
                .is_some_and(|cfg| cfg.allow_unreliable_rendering)
            {
                let tok = match cmd_string {
                    "widecheck" => Token::Accent(symbol::CARON, true, OpAttrs::STRETCHY_TRUE),
                    "widetilde" => {
                        Token::Accent(symbol::TILDE.as_op(), true, OpAttrs::STRETCHY_TRUE)
                    }
                    _ => break 'unreliable_rendering,
                };
                return LexerResult::Tok(TokSpan::new(tok, span));
            }
        }
        // User-defined commands take precedence over built-in ones.
        let tok: Result<(Token<'config>, Span), LatexError> = if let Some(tok) = self
            .cmd_cfg
            .and_then(|custom_cmds| custom_cmds.get_command(cmd_string))
            .or_else(|| get_command(cmd_string))
        {
            Ok((tok, span))
        } else {
            let env_marker = match cmd_string {
                "begin" => Some(EnvMarker::Begin),
                "end" => Some(EnvMarker::End),
                _ => None,
            };
            if let Some(env_marker) = env_marker {
                'env_name: {
                    self.skip_whitespace();
                    let group_loc = self.peek.0;
                    let (name, end) = match self.read_env_name() {
                        Ok(lit) => lit,
                        Err((ch, span)) => match ch {
                            None => {
                                break 'env_name Err(LatexError(
                                    span,
                                    LatexErrKind::UnclosedGroup(EndToken::GroupClose),
                                ));
                            }
                            Some(ch) => {
                                break 'env_name Err(LatexError(
                                    span,
                                    LatexErrKind::DisallowedChar(ch),
                                ));
                            }
                        },
                    };
                    let Some(env) = Env::from_str(name) else {
                        break 'env_name Err(LatexError(
                            group_loc..end,
                            LatexErrKind::UnknownEnvironment(name.into()),
                        ));
                    };
                    // The token's span covers `\begin{...}` / `\end{...}`
                    // including the environment name.
                    let span = Span::new(span.start(), end);
                    Ok((
                        match env_marker {
                            EnvMarker::Begin => Token::Begin(env),
                            EnvMarker::End => Token::End(env),
                        },
                        span,
                    ))
                }
            } else {
                return LexerResult::UnknownCommand(cmd_string, span);
            }
        };
        match tok {
            Ok((tok, span)) => LexerResult::Tok(TokSpan::new(tok, span)),
            Err(err) => LexerResult::Err(Box::new(err)),
        }
    }
}
type CharSpan = (Option<char>, Range<usize>);
/// Maps a non-alphabetic, non-special printable ASCII character to its
/// token, wrapped in `MathOrTextMode` so it can be rendered in either mode.
/// Returns `None` for characters with no dedicated token.
fn nonalpha_nonspecial_ascii_to_token(ch: char) -> Option<Token<'static>> {
    let base = match ch {
        '(' => &Token::Open(symbol::LEFT_PARENTHESIS),
        ')' => &Token::Close(symbol::RIGHT_PARENTHESIS),
        '+' => &Token::BinaryOp(symbol::PLUS_SIGN),
        '-' => &Token::BinaryOp(symbol::MINUS_SIGN),
        '=' => &Token::Relation(symbol::EQUALS_SIGN),
        ',' => &Token::Punctuation(symbol::COMMA),
        ';' => &Token::Punctuation(symbol::SEMICOLON),
        '/' => &Token::Ord(symbol::SOLIDUS),
        '|' => &Token::Ord(symbol::VERTICAL_LINE),
        '!' => &Token::ForceClose(symbol::EXCLAMATION_MARK),
        // `as_op()` is not const-promotable, so these need explicit
        // `const` blocks to get a `'static` reference.
        '*' => &const { Token::ForceBinaryOp(symbol::ASTERISK_OPERATOR.as_op()) },
        ':' => &const { Token::ForceRelation(symbol::COLON.as_op()) },
        _ => return None,
    };
    Some(Token::MathOrTextMode(base, ch))
}
/// Distinguishes `\begin{...}` from `\end{...}` while resolving an
/// environment command.
enum EnvMarker {
Begin = 1,
End = 2,
}
/// Recovers the original ASCII character from a token, for tokens that
/// correspond to a single character of a limited set (letters, `.`,
/// digits, math-or-text punctuation, whitespace). Returns `None` for
/// everything else.
pub(crate) fn recover_limited_ascii(tok: Token) -> Option<char> {
    match tok {
        Token::Whitespace => Some(' '),
        Token::Digit(c) | Token::MathOrTextMode(_, c) => Some(c),
        Token::Letter(c, _) => (c.is_ascii_alphabetic() || c == '.').then_some(c),
        _ => None,
    }
}
/// Outcome of `next_token_internal`: a token, an unknown command (left to
/// the caller to decide whether it is an error), or a hard lexer error.
enum LexerResult<'config, 'source> {
Tok(TokSpan<'config>),
UnknownCommand(&'source str, Span),
Err(Box<LatexError>),
}
#[cfg(test)]
mod tests {
use std::fmt::Write;
use insta::assert_snapshot;
use super::super::token::Token;
use super::*;
// Snapshot test: lexes each problem and records `start:end: token` lines.
#[test]
fn lexer_test() {
let problems = [
("simple_number", r"3"),
("number_with_dot", r"3.14"),
("number_with_dot_at_end", r"3.14."),
("number_with_two_inner_dots", r"3..14"),
("lower_case_latin", r"x"),
("lower_case_greek", r"\pi"),
("assigment_with_space", r"x = 3.14"),
("two_lower_case_greek", r"\alpha\beta"),
("simple_expression", r"x+y"),
("space_and_number", r"\ 1"),
("space_in_text", r"\text{ x y z}"),
("comment", "ab%hello\ncd"),
("switch_to_text_mode", r"\prod\text\o\sum"),
("switch_to_text_mode_braces", r"\prod\text{\o}\sum"),
("custom_space", r"{x\hspace{2em}}"),
("hspace_whitespace_in_between", r"\hspace { 4 em } x"),
("color", r"{x\color{red} y}"),
("color_whitespace", r"{x\color {red} y}"),
("color_newline", "{x\\color\n{red} y}"),
("color_one_letter", "{x\\color r y}"),
("genfrac_with_parens", r"\genfrac(]{0pt}{2}{a+b}{c+d}"),
(
"genfrac_with_one_sided_parens",
r"\genfrac{}]{0pt}{2}{a+b}{c+d}",
),
("genfrac_without_parens", r"\genfrac{}{}{0pt}{2}{a+b}{c+d}"),
("begin_array", r"\begin{array}{c|c}"),
("end_array", r"\end{array}{c|c}"),
];
for (name, problem) in problems.into_iter() {
let mut lexer = Lexer::new(problem, false, None);
let mut tokens = String::new();
loop {
let tokloc = lexer.next_token().unwrap();
if matches!(tokloc.token(), Token::Eoi) {
break;
}
let (tok, span) = tokloc.into_parts();
write!(tokens, "{}:{}: {:?}\n", span.start(), span.end(), tok).unwrap();
}
assert_snapshot!(name, &tokens, problem);
}
}
// Snapshot test: each problem must produce a lexer error; the rendered
// ariadne report is snapshotted.
#[test]
fn test_lexer_errors() {
let problems = [
("unknown_command", r"\unknowncmd + x"),
("missing_brace", r"\begin x + y"),
("disallowed_chars", r"\begin{matrix x + y}"),
(
"unknown_environment",
r"\begin{unknownenv} x + y \end{unknownenv}",
),
("null_character_in_input", "x + \u{0} + y"),
("null_character_in_string_literal", "\\text{\u{0}}"),
];
for (name, problem) in problems.into_iter() {
let mut lexer = Lexer::new(problem, false, None);
let err = loop {
match lexer.next_token() {
Ok(tokloc) => {
if matches!(tokloc.token(), Token::Eoi) {
break None;
}
}
Err(err) => {
break Some(err);
}
}
};
let Some(error) = err else {
panic!("Expected an error in problem: {}", problem);
};
let report = error.to_report("<input>", false);
let mut buf = Vec::new();
report
.write(("<input>", ariadne::Source::from(problem)), &mut buf)
.expect("failed to write report");
let output = String::from_utf8(buf).expect("report should be valid UTF-8");
assert_snapshot!(name, &output, problem);
}
}
// Checks that `#1`..`#3` lex as CustomCmdArg tokens and that the lexer
// reports three parameters afterwards.
#[test]
fn test_parsing_custom_commands() {
let parsing_custom_cmds = true;
let problem = r"\frac{#1}{#2} + \sqrt{#3}";
let mut lexer = Lexer::new(problem, parsing_custom_cmds, None);
let mut tokens = String::new();
loop {
let tokloc = lexer.next_token().unwrap();
if matches!(tokloc.token(), Token::Eoi) {
break;
}
let (tok, span) = tokloc.into_parts();
write!(tokens, "{}..{}: {:?}\n", span.start(), span.end(), tok).unwrap();
}
assert!(matches!(lexer.parse_cmd_args(), Some(3)));
assert_snapshot!("parsing_custom_commands", tokens, problem);
}
// Round-trip: every character in `input` should survive lexing followed
// by `recover_limited_ascii`.
#[test]
fn test_recover_limited_ascii() {
let input = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,-*:|";
let mut lexer = Lexer::new(input, false, None);
let mut output = String::new();
while let Ok(tokloc) = lexer.next_token() {
let tok = tokloc.into_token();
if let Some(ch) = recover_limited_ascii(tok) {
output.push(ch);
}
if matches!(tok, Token::Eoi) {
break;
}
}
assert_eq!(input, output);
}
}