use std::iter::Peekable;
use std::str::Chars;
/// Options that control how a `LaTeXTokenizer` produces its token stream.
#[derive(Debug, Clone)]
pub struct TokenizerConfig {
    /// When `true`, `Whitespace` tokens are kept in the tokenizer output;
    /// otherwise they are dropped.
    pub preserve_whitespace: bool,
    /// When `true`, `Comment` tokens are kept in the tokenizer output;
    /// otherwise they are dropped.
    pub preserve_comments: bool,
    /// Macro-expansion switch.
    /// NOTE(review): nothing visible in this file reads this flag — confirm
    /// where (or whether) it is consumed.
    pub expand_macros: bool,
    /// Unicode-normalization switch.
    /// NOTE(review): nothing visible in this file reads this flag — confirm
    /// where (or whether) it is consumed.
    pub normalize_unicode: bool,
    /// Length, in UTF-8 bytes, at which the lexer stops extending a single
    /// text run; the remainder of a longer run is emitted as further tokens.
    pub max_token_length: usize,
}
impl Default for TokenizerConfig {
fn default() -> Self {
Self {
preserve_whitespace: false,
preserve_comments: false,
expand_macros: false,
normalize_unicode: true,
max_token_length: 256,
}
}
}
/// The delimiter style that opened (or closes) a stretch of math.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum MathMode {
    /// Inline math delimited by a single `$`.
    InlineDollar,
    /// Inline math delimited by `\(` and `\)`.
    InlineParen,
    /// Display math delimited by `$$`.
    DisplayDoubleDollar,
    /// Display math delimited by `\[` and `\]`.
    DisplayBracket,
    /// Math entered through a `\begin{...}` math environment.
    Environment,
}
/// The bracket family of an opening or closing grouping token.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum BraceKind {
    /// `{` and `}`.
    Curly,
    /// `[` and `]`.
    Square,
    /// `(` and `)`.
    Paren,
}
/// The classification of a single lexed token, with its payload where one exists.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum LaTeXTokenKind {
    /// A control sequence; the payload is the name without the leading backslash.
    Command(String),
    /// The text run lexed after a `\begin` or `\end` command.
    Environment(String),
    /// A run of ordinary characters lexed outside math mode.
    Text(String),
    /// A run of ASCII digits, optionally followed by `.` and more digits.
    Number(String),
    /// A single alphabetic character lexed inside math mode.
    Identifier(String),
    /// A single math operator character lexed inside math mode.
    Operator(String),
    /// An opening `{`, `[` or `(`.
    OpenBrace(BraceKind),
    /// A closing `}`, `]` or `)`.
    CloseBrace(BraceKind),
    /// A delimiter that opens math of the given mode.
    MathOpen(MathMode),
    /// A delimiter that closes math of the given mode.
    MathClose(MathMode),
    /// The `&` character.
    Ampersand,
    /// The `\\` line-break sequence.
    Newline,
    /// A `%` comment; the payload is the text after `%` up to the end of the line.
    Comment(String),
    /// A run of consecutive whitespace characters.
    Whitespace(String),
    /// A `#` immediately followed by a digit; the payload is that digit.
    Parameter(u8),
    /// The `_` character.
    Subscript,
    /// The `^` character.
    Superscript,
    /// The `~` character.
    Tilde,
    /// A `#` not followed by a digit, or any otherwise unclassified character
    /// inside math mode.
    Special(char),
    /// Input the lexer could not classify, such as a backslash that does not
    /// start a command.
    Unknown(String),
}
/// A lexed token together with its location in the input.
#[derive(Debug, Clone, PartialEq)]
pub struct LaTeXToken {
    /// What the token is, plus any payload.
    pub kind: LaTeXTokenKind,
    /// Byte offset of the token's first character in the input.
    pub start: usize,
    /// Byte offset just past the token's last character.
    pub end: usize,
    /// Math-mode flag recorded by the lexer for this token.
    pub in_math: bool,
}
impl LaTeXToken {
pub fn new(kind: LaTeXTokenKind, start: usize, end: usize, in_math: bool) -> Self {
Self {
kind,
start,
end,
in_math,
}
}
pub fn text(&self) -> String {
match &self.kind {
LaTeXTokenKind::Command(s) => format!("\\{}", s),
LaTeXTokenKind::Environment(s) => s.clone(),
LaTeXTokenKind::Text(s) => s.clone(),
LaTeXTokenKind::Number(s) => s.clone(),
LaTeXTokenKind::Identifier(s) => s.clone(),
LaTeXTokenKind::Operator(s) => s.clone(),
LaTeXTokenKind::OpenBrace(BraceKind::Curly) => "{".to_string(),
LaTeXTokenKind::OpenBrace(BraceKind::Square) => "[".to_string(),
LaTeXTokenKind::OpenBrace(BraceKind::Paren) => "(".to_string(),
LaTeXTokenKind::CloseBrace(BraceKind::Curly) => "}".to_string(),
LaTeXTokenKind::CloseBrace(BraceKind::Square) => "]".to_string(),
LaTeXTokenKind::CloseBrace(BraceKind::Paren) => ")".to_string(),
LaTeXTokenKind::MathOpen(MathMode::InlineDollar) => "$".to_string(),
LaTeXTokenKind::MathOpen(MathMode::InlineParen) => "\\(".to_string(),
LaTeXTokenKind::MathOpen(MathMode::DisplayDoubleDollar) => "$$".to_string(),
LaTeXTokenKind::MathOpen(MathMode::DisplayBracket) => "\\[".to_string(),
LaTeXTokenKind::MathOpen(MathMode::Environment) => String::new(),
LaTeXTokenKind::MathClose(MathMode::InlineDollar) => "$".to_string(),
LaTeXTokenKind::MathClose(MathMode::InlineParen) => "\\)".to_string(),
LaTeXTokenKind::MathClose(MathMode::DisplayDoubleDollar) => "$$".to_string(),
LaTeXTokenKind::MathClose(MathMode::DisplayBracket) => "\\]".to_string(),
LaTeXTokenKind::MathClose(MathMode::Environment) => String::new(),
LaTeXTokenKind::Ampersand => "&".to_string(),
LaTeXTokenKind::Newline => "\\\\".to_string(),
LaTeXTokenKind::Comment(s) => format!("%{}", s),
LaTeXTokenKind::Whitespace(s) => s.clone(),
LaTeXTokenKind::Parameter(n) => format!("#{}", n),
LaTeXTokenKind::Subscript => "_".to_string(),
LaTeXTokenKind::Superscript => "^".to_string(),
LaTeXTokenKind::Tilde => "~".to_string(),
LaTeXTokenKind::Special(c) => c.to_string(),
LaTeXTokenKind::Unknown(s) => s.clone(),
}
}
pub fn is_structural(&self) -> bool {
matches!(
self.kind,
LaTeXTokenKind::OpenBrace(_)
| LaTeXTokenKind::CloseBrace(_)
| LaTeXTokenKind::MathOpen(_)
| LaTeXTokenKind::MathClose(_)
| LaTeXTokenKind::Ampersand
| LaTeXTokenKind::Newline
)
}
pub fn is_command(&self) -> bool {
matches!(self.kind, LaTeXTokenKind::Command(_))
}
pub fn is_math(&self) -> bool {
self.in_math
|| matches!(
self.kind,
LaTeXTokenKind::MathOpen(_)
| LaTeXTokenKind::MathClose(_)
| LaTeXTokenKind::Subscript
| LaTeXTokenKind::Superscript
)
}
}
/// Public entry point that turns LaTeX source text into `LaTeXToken`s.
pub struct LaTeXTokenizer {
    /// Settings applied to every tokenization run.
    config: TokenizerConfig,
}
impl LaTeXTokenizer {
pub fn new() -> Self {
Self {
config: TokenizerConfig::default(),
}
}
pub fn with_config(config: TokenizerConfig) -> Self {
Self { config }
}
pub fn tokenize(&self, input: &str) -> Vec<LaTeXToken> {
let mut tokens = Vec::new();
let mut lexer = Lexer::new(input, &self.config);
while let Some(token) = lexer.next_token() {
let should_keep = match &token.kind {
LaTeXTokenKind::Whitespace(_) => self.config.preserve_whitespace,
LaTeXTokenKind::Comment(_) => self.config.preserve_comments,
_ => true,
};
if should_keep {
tokens.push(token);
}
}
tokens
}
pub fn tokenize_iter<'a>(&'a self, input: &'a str) -> impl Iterator<Item = LaTeXToken> + 'a {
let preserve_whitespace = self.config.preserve_whitespace;
let preserve_comments = self.config.preserve_comments;
std::iter::from_fn(move || {
None
})
.take(0)
.chain(
self.tokenize(input)
.into_iter()
.filter(move |token| match &token.kind {
LaTeXTokenKind::Whitespace(_) => preserve_whitespace,
LaTeXTokenKind::Comment(_) => preserve_comments,
_ => true,
}),
)
}
pub fn config(&self) -> &TokenizerConfig {
&self.config
}
}
impl Default for LaTeXTokenizer {
fn default() -> Self {
Self::new()
}
}
/// Character-level scanner that produces one `LaTeXToken` per call.
struct Lexer<'a> {
    /// Remaining input, with one character of lookahead.
    chars: Peekable<Chars<'a>>,
    /// Byte offset of the next unread character.
    pos: usize,
    /// Tokenizer settings borrowed from the owning tokenizer.
    config: &'a TokenizerConfig,
    /// Math modes currently open, innermost last; empty outside math.
    math_stack: Vec<MathMode>,
    /// Set after a `\begin` command is lexed; reset when the following text
    /// run is lexed as an environment name.
    after_begin: bool,
    /// Set after an `\end` command is lexed; reset when the following text
    /// run is lexed as an environment name.
    after_end: bool,
}
impl<'a> Lexer<'a> {
fn new(input: &'a str, config: &'a TokenizerConfig) -> Self {
Self {
chars: input.chars().peekable(),
pos: 0,
config,
math_stack: Vec::new(),
after_begin: false,
after_end: false,
}
}
fn in_math(&self) -> bool {
!self.math_stack.is_empty()
}
fn advance(&mut self) -> Option<char> {
let c = self.chars.next()?;
self.pos += c.len_utf8();
Some(c)
}
fn peek(&mut self) -> Option<&char> {
self.chars.peek()
}
fn next_token(&mut self) -> Option<LaTeXToken> {
let start = self.pos;
let c = self.advance()?;
let in_math = self.in_math();
match c {
'\\' => self.lex_backslash(start, in_math),
'$' => self.lex_dollar(start),
'{' => Some(LaTeXToken::new(
LaTeXTokenKind::OpenBrace(BraceKind::Curly),
start,
self.pos,
in_math,
)),
'}' => Some(LaTeXToken::new(
LaTeXTokenKind::CloseBrace(BraceKind::Curly),
start,
self.pos,
in_math,
)),
'[' => Some(LaTeXToken::new(
LaTeXTokenKind::OpenBrace(BraceKind::Square),
start,
self.pos,
in_math,
)),
']' => Some(LaTeXToken::new(
LaTeXTokenKind::CloseBrace(BraceKind::Square),
start,
self.pos,
in_math,
)),
'(' => Some(LaTeXToken::new(
LaTeXTokenKind::OpenBrace(BraceKind::Paren),
start,
self.pos,
in_math,
)),
')' => Some(LaTeXToken::new(
LaTeXTokenKind::CloseBrace(BraceKind::Paren),
start,
self.pos,
in_math,
)),
'%' => self.lex_comment(start),
'&' => Some(LaTeXToken::new(
LaTeXTokenKind::Ampersand,
start,
self.pos,
in_math,
)),
'_' => Some(LaTeXToken::new(
LaTeXTokenKind::Subscript,
start,
self.pos,
in_math,
)),
'^' => Some(LaTeXToken::new(
LaTeXTokenKind::Superscript,
start,
self.pos,
in_math,
)),
'#' => self.lex_parameter(start, in_math),
'~' => Some(LaTeXToken::new(
LaTeXTokenKind::Tilde,
start,
self.pos,
in_math,
)),
c if c.is_whitespace() => self.lex_whitespace(start, c),
c if c.is_ascii_digit() => self.lex_number(start, c, in_math),
c if in_math && is_math_operator(c) => Some(LaTeXToken::new(
LaTeXTokenKind::Operator(c.to_string()),
start,
self.pos,
true,
)),
c if c.is_alphabetic() => {
if in_math {
Some(LaTeXToken::new(
LaTeXTokenKind::Identifier(c.to_string()),
start,
self.pos,
true,
))
} else {
self.lex_text(start, c)
}
}
_ => {
if in_math {
Some(LaTeXToken::new(
LaTeXTokenKind::Special(c),
start,
self.pos,
true,
))
} else {
self.lex_text(start, c)
}
}
}
}
fn lex_backslash(&mut self, start: usize, in_math: bool) -> Option<LaTeXToken> {
match self.peek() {
Some('\\') => {
self.advance();
Some(LaTeXToken::new(
LaTeXTokenKind::Newline,
start,
self.pos,
in_math,
))
}
Some('[') => {
self.advance();
self.math_stack.push(MathMode::DisplayBracket);
Some(LaTeXToken::new(
LaTeXTokenKind::MathOpen(MathMode::DisplayBracket),
start,
self.pos,
false,
))
}
Some(']') => {
self.advance();
if self.math_stack.last() == Some(&MathMode::DisplayBracket) {
self.math_stack.pop();
}
Some(LaTeXToken::new(
LaTeXTokenKind::MathClose(MathMode::DisplayBracket),
start,
self.pos,
false,
))
}
Some('(') => {
self.advance();
self.math_stack.push(MathMode::InlineParen);
Some(LaTeXToken::new(
LaTeXTokenKind::MathOpen(MathMode::InlineParen),
start,
self.pos,
false,
))
}
Some(')') => {
self.advance();
if self.math_stack.last() == Some(&MathMode::InlineParen) {
self.math_stack.pop();
}
Some(LaTeXToken::new(
LaTeXTokenKind::MathClose(MathMode::InlineParen),
start,
self.pos,
false,
))
}
Some(&c) if c.is_alphabetic() => {
let mut name = String::new();
while let Some(&c) = self.peek() {
if c.is_alphabetic() {
name.push(c);
self.advance();
} else {
break;
}
}
let kind = match name.as_str() {
"begin" => {
self.after_begin = true;
LaTeXTokenKind::Command(name)
}
"end" => {
self.after_end = true;
LaTeXTokenKind::Command(name)
}
_ => LaTeXTokenKind::Command(name),
};
Some(LaTeXToken::new(kind, start, self.pos, in_math))
}
Some(&c) if !c.is_alphanumeric() && !c.is_whitespace() => {
self.advance();
Some(LaTeXToken::new(
LaTeXTokenKind::Command(c.to_string()),
start,
self.pos,
in_math,
))
}
_ => Some(LaTeXToken::new(
LaTeXTokenKind::Unknown("\\".to_string()),
start,
self.pos,
in_math,
)),
}
}
fn lex_dollar(&mut self, start: usize) -> Option<LaTeXToken> {
if self.peek() == Some(&'$') {
self.advance();
if self.math_stack.last() == Some(&MathMode::DisplayDoubleDollar) {
self.math_stack.pop();
Some(LaTeXToken::new(
LaTeXTokenKind::MathClose(MathMode::DisplayDoubleDollar),
start,
self.pos,
false,
))
} else {
self.math_stack.push(MathMode::DisplayDoubleDollar);
Some(LaTeXToken::new(
LaTeXTokenKind::MathOpen(MathMode::DisplayDoubleDollar),
start,
self.pos,
false,
))
}
} else {
if self.math_stack.last() == Some(&MathMode::InlineDollar) {
self.math_stack.pop();
Some(LaTeXToken::new(
LaTeXTokenKind::MathClose(MathMode::InlineDollar),
start,
self.pos,
false,
))
} else {
self.math_stack.push(MathMode::InlineDollar);
Some(LaTeXToken::new(
LaTeXTokenKind::MathOpen(MathMode::InlineDollar),
start,
self.pos,
false,
))
}
}
}
fn lex_comment(&mut self, start: usize) -> Option<LaTeXToken> {
let mut content = String::new();
while let Some(&c) = self.peek() {
if c == '\n' {
break;
}
content.push(c);
self.advance();
}
Some(LaTeXToken::new(
LaTeXTokenKind::Comment(content),
start,
self.pos,
self.in_math(),
))
}
fn lex_parameter(&mut self, start: usize, in_math: bool) -> Option<LaTeXToken> {
if let Some(&c) = self.peek() {
if c.is_ascii_digit() {
self.advance();
let n = c.to_digit(10).unwrap_or(1) as u8;
return Some(LaTeXToken::new(
LaTeXTokenKind::Parameter(n),
start,
self.pos,
in_math,
));
}
}
Some(LaTeXToken::new(
LaTeXTokenKind::Special('#'),
start,
self.pos,
in_math,
))
}
fn lex_whitespace(&mut self, start: usize, first: char) -> Option<LaTeXToken> {
let mut content = String::new();
content.push(first);
while let Some(&c) = self.peek() {
if c.is_whitespace() {
content.push(c);
self.advance();
} else {
break;
}
}
Some(LaTeXToken::new(
LaTeXTokenKind::Whitespace(content),
start,
self.pos,
self.in_math(),
))
}
fn lex_number(&mut self, start: usize, first: char, in_math: bool) -> Option<LaTeXToken> {
let mut content = String::new();
content.push(first);
while let Some(&c) = self.peek() {
if c.is_ascii_digit() {
content.push(c);
self.advance();
} else {
break;
}
}
if self.peek() == Some(&'.') {
let has_digit_after_dot = {
let mut lookahead = self.chars.clone();
lookahead.next(); lookahead.next().map_or(false, |c| c.is_ascii_digit())
};
if has_digit_after_dot {
content.push('.');
self.advance();
while let Some(&c) = self.peek() {
if c.is_ascii_digit() {
content.push(c);
self.advance();
} else {
break;
}
}
}
}
Some(LaTeXToken::new(
LaTeXTokenKind::Number(content),
start,
self.pos,
in_math,
))
}
fn lex_text(&mut self, start: usize, first: char) -> Option<LaTeXToken> {
let mut content = String::new();
content.push(first);
while let Some(&c) = self.peek() {
if is_special_char(c) || c.is_whitespace() {
break;
}
content.push(c);
self.advance();
if content.len() >= self.config.max_token_length {
break;
}
}
if self.after_begin || self.after_end {
self.after_begin = false;
self.after_end = false;
if is_math_environment(&content) {
if self.after_begin {
self.math_stack.push(MathMode::Environment);
} else if self.math_stack.last() == Some(&MathMode::Environment) {
self.math_stack.pop();
}
}
Some(LaTeXToken::new(
LaTeXTokenKind::Environment(content),
start,
self.pos,
self.in_math(),
))
} else {
Some(LaTeXToken::new(
LaTeXTokenKind::Text(content),
start,
self.pos,
self.in_math(),
))
}
}
}
/// Returns `true` when `c` is one of the characters that end a text run:
/// the backslash, the three bracket pairs, and `$ % & _ ^ # ~`.
fn is_special_char(c: char) -> bool {
    const SPECIAL_CHARS: &str = "\\{}[]()$%&_^#~";
    SPECIAL_CHARS.contains(c)
}
/// Returns `true` when `c` is one of the single-character math operators
/// and punctuation marks: `+ - * / = < > ! | : ; , .`
fn is_math_operator(c: char) -> bool {
    const MATH_OPERATORS: &str = "+-*/=<>!|:;,.";
    MATH_OPERATORS.contains(c)
}
/// Returns `true` when `name` is an environment whose body is typeset in
/// math mode.
///
/// The match is exact and case-sensitive; starred variants are listed
/// explicitly. `subequations` is deliberately absent: it only changes
/// equation numbering and its body is ordinary text that contains the actual
/// math environments.
fn is_math_environment(name: &str) -> bool {
    matches!(
        name,
        // Core LaTeX.
        "math"
            | "displaymath"
            | "equation"
            | "eqnarray"
            | "eqnarray*"
            | "array"
            // amsmath display environments.
            | "equation*"
            | "align"
            | "align*"
            | "alignat"
            | "alignat*"
            | "flalign"
            | "flalign*"
            | "gather"
            | "gather*"
            | "multline"
            | "multline*"
            // amsmath building blocks used inside other math.
            | "split"
            | "aligned"
            | "gathered"
            | "cases"
            // amsmath matrix family.
            | "matrix"
            | "smallmatrix"
            | "bmatrix"
            | "Bmatrix"
            | "pmatrix"
            | "vmatrix"
            | "Vmatrix"
    )
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenizes `input` with a default-configured tokenizer.
    fn lex(input: &str) -> Vec<LaTeXToken> {
        LaTeXTokenizer::new().tokenize(input)
    }

    /// A named control sequence becomes a single `Command` token.
    #[test]
    fn test_simple_command() {
        let tokens = lex(r"\alpha");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].kind, LaTeXTokenKind::Command("alpha".to_string()));
    }

    /// Inline dollar math yields open, identifier, superscript, number, close.
    #[test]
    fn test_math_mode() {
        let tokens = lex(r"$x^2$");
        assert_eq!(tokens.len(), 5);
        assert_eq!(
            tokens[0].kind,
            LaTeXTokenKind::MathOpen(MathMode::InlineDollar)
        );
        assert!(matches!(tokens[1].kind, LaTeXTokenKind::Identifier(_)));
        assert!(tokens[1].in_math);
        assert_eq!(tokens[2].kind, LaTeXTokenKind::Superscript);
        assert!(matches!(tokens[3].kind, LaTeXTokenKind::Number(_)));
        assert_eq!(
            tokens[4].kind,
            LaTeXTokenKind::MathClose(MathMode::InlineDollar)
        );
    }

    /// `\begin{equation}` yields a `begin` command and an `equation` environment name.
    #[test]
    fn test_environment() {
        let tokens = lex(r"\begin{equation}");
        let has_begin = tokens
            .iter()
            .any(|t| matches!(&t.kind, LaTeXTokenKind::Command(s) if s == "begin"));
        let has_equation = tokens
            .iter()
            .any(|t| matches!(&t.kind, LaTeXTokenKind::Environment(s) if s == "equation"));
        assert!(has_begin);
        assert!(has_equation);
    }

    /// A decimal literal is kept together as one `Number` token.
    #[test]
    fn test_numbers() {
        let tokens = lex(r"$3.14$");
        let has_pi = tokens
            .iter()
            .any(|t| matches!(&t.kind, LaTeXTokenKind::Number(s) if s == "3.14"));
        assert!(has_pi);
    }

    /// `+` and `=` are both reported as operators inside math.
    #[test]
    fn test_operators() {
        let tokens = lex(r"$a + b = c$");
        let operator_count = tokens
            .iter()
            .filter(|t| matches!(&t.kind, LaTeXTokenKind::Operator(_)))
            .count();
        assert_eq!(operator_count, 2);
    }

    /// Comments survive when `preserve_comments` is enabled.
    #[test]
    fn test_comment() {
        let tokenizer = LaTeXTokenizer::with_config(TokenizerConfig {
            preserve_comments: true,
            ..Default::default()
        });
        let tokens = tokenizer.tokenize(r"text % comment");
        let has_comment = tokens
            .iter()
            .any(|t| matches!(&t.kind, LaTeXTokenKind::Comment(_)));
        assert!(has_comment);
    }

    /// `\[` and `\]` bracket the stream as display-math open and close.
    #[test]
    fn test_display_math() {
        let tokens = lex(r"\[x\]");
        assert_eq!(
            tokens[0].kind,
            LaTeXTokenKind::MathOpen(MathMode::DisplayBracket)
        );
        assert_eq!(
            tokens.last().unwrap().kind,
            LaTeXTokenKind::MathClose(MathMode::DisplayBracket)
        );
    }

    /// `_` and `^` produce their dedicated token kinds.
    #[test]
    fn test_subscript_superscript() {
        let tokens = lex(r"$x_1^2$");
        assert!(tokens.iter().any(|t| t.kind == LaTeXTokenKind::Subscript));
        assert!(tokens.iter().any(|t| t.kind == LaTeXTokenKind::Superscript));
    }

    /// `#1` is a single macro-parameter token.
    #[test]
    fn test_parameter() {
        let tokens = lex(r"#1");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].kind, LaTeXTokenKind::Parameter(1));
    }

    /// `\\` is a single line-break token.
    #[test]
    fn test_newline() {
        let tokens = lex(r"\\");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].kind, LaTeXTokenKind::Newline);
    }
}