use crate::namespace::KeyMap;
use crate::types::{LexerInterface, ParseError, Settings, SourceLocation, Token};
use alloc::sync::Arc;
/// Locates the run of combining diacritical marks (U+0300..=U+036F) that ends `s`.
///
/// Returns the byte offset at which that trailing run begins, or `None` when
/// `s` is empty or its final character is not a combining mark.
#[must_use]
pub fn last_non_combining_mark_index(s: &str) -> Option<usize> {
    // Total UTF-8 length of the marks found while walking backwards from the end.
    let trailing_bytes: usize = s
        .chars()
        .rev()
        .take_while(|mark| ('\u{0300}'..='\u{036F}').contains(mark))
        .map(char::len_utf8)
        .sum();
    (trailing_bytes > 0).then(|| s.len() - trailing_bytes)
}
/// Reports whether `ch` lies in the combining diacritical marks range
/// U+0300..=U+036F.
const fn is_combining_mark(ch: char) -> bool {
    matches!(ch, '\u{0300}'..='\u{036F}')
}
/// Measures the whitespace run (space, `\r`, `\n`, `\t`) at the start of `s`.
///
/// Returns its length in bytes, or `None` when `s` does not begin with
/// whitespace.
fn match_space(s: &str) -> Option<usize> {
    let remainder = s.trim_start_matches(|c: char| matches!(c, ' ' | '\r' | '\n' | '\t'));
    let consumed = s.len() - remainder.len();
    if consumed == 0 {
        None
    } else {
        Some(consumed)
    }
}
/// Matches a "control space": a backslash followed by whitespace.
///
/// The accepted shape is a backslash, then either a single `\n` or one or more
/// of space / `\r` / `\t` optionally followed by one `\n`, and finally any
/// number of further space / `\r` / `\t` characters.
///
/// Returns the byte length of the match, or `None` when `s` does not start
/// with such a sequence.
fn match_control_space(s: &str) -> Option<usize> {
    let is_blank = |c: char| matches!(c, ' ' | '\r' | '\t');

    let after_backslash = s.strip_prefix('\\')?;
    let after_break = match after_backslash.strip_prefix('\n') {
        // Backslash immediately followed by a newline.
        Some(tail) => tail,
        None => {
            let tail = after_backslash.trim_start_matches(is_blank);
            if tail.len() == after_backslash.len() {
                // Nothing blank follows the backslash (this also covers end of input).
                return None;
            }
            // At most one newline may follow the blanks.
            tail.strip_prefix('\n').unwrap_or(tail)
        }
    };
    let unmatched = after_break.trim_start_matches(is_blank);
    Some(s.len() - unmatched.len())
}
fn match_normal_char_with_accents(s: &str) -> Option<usize> {
let mut chars = s.chars();
let first = chars.next()?;
let mut len_b = first.len_utf8();
let u = first as u32;
let in_range = |x: u32, a: u32, b: u32| x >= a && x <= b;
if in_range(u, 0x0021, 0x005B)
|| in_range(u, 0x005D, 0x2027)
|| in_range(u, 0x202A, 0xD7FF)
|| in_range(u, 0xF900, 0xFFFF)
{
while let Some(c) = chars.clone().next() {
if is_combining_mark(c) {
chars.next();
len_b += c.len_utf8();
} else {
break;
}
}
return Some(len_b);
}
if u > 0xFFFF {
while let Some(c) = chars.clone().next() {
if is_combining_mark(c) {
chars.next();
len_b += c.len_utf8();
} else {
break;
}
}
return Some(len_b);
}
None
}
/// Matches a `\verb` (or, when `star` is true, `\verb*`) construct at the
/// start of `s`.
///
/// The character right after the prefix is the delimiter; the match extends
/// up to and including the next occurrence of that delimiter. A `\n` or `\r`
/// before the closing delimiter aborts the match.
///
/// For the unstarred form the delimiter must be neither an ASCII letter (that
/// would be a longer control word such as `\verbatim`) nor `*` (that selects
/// the starred form, so `\verb*abc*` must not be read as an unstarred verb
/// delimited by `*`).
///
/// Returns the total byte length of the construct, or `None` if `s` does not
/// start with a complete one.
fn match_verb(s: &str, star: bool) -> Option<usize> {
    let prefix = if star { "\\verb*" } else { "\\verb" };
    let rest = s.strip_prefix(prefix)?;
    let mut chars = rest.char_indices();
    let (_, delim) = chars.next()?;
    if !star && (delim == '*' || delim.is_ascii_alphabetic()) {
        return None;
    }
    for (offset, c) in chars {
        if matches!(c, '\n' | '\r') {
            return None;
        }
        if c == delim {
            // `offset` is relative to `rest`, so add the prefix back in.
            return Some(prefix.len() + offset + c.len_utf8());
        }
    }
    None
}
/// Matches a control word: a backslash followed by one or more ASCII letters
/// or `@` characters.
///
/// Returns the byte length of the backslash plus the name, or `None` when `s`
/// does not start with a backslash or no name character follows it.
fn match_control_word(s: &str) -> Option<usize> {
    let name = s.strip_prefix('\\')?;
    // Every accepted character is ASCII, so counting bytes equals summing UTF-8 lengths.
    let name_len = name
        .bytes()
        .take_while(|b| b.is_ascii_alphabetic() || *b == b'@')
        .count();
    if name_len == 0 {
        None
    } else {
        Some(1 + name_len)
    }
}
/// Matches a control word plus any whitespace that directly follows it.
///
/// Returns `(word_len, space_len)` in bytes, where `space_len` is zero when no
/// whitespace follows, or `None` when `s` does not start with a control word.
fn match_control_word_with_space(s: &str) -> Option<(usize, usize)> {
    let word_len = match_control_word(s)?;
    let space_len = match_space(&s[word_len..]).unwrap_or(0);
    Some((word_len, space_len))
}
/// Matches a control symbol: a backslash followed by exactly one character.
///
/// Any character is accepted after the backslash, including ones above
/// U+FFFF. No surrogate check is needed: a Rust `char` is a Unicode scalar
/// value and can never fall in U+D800..=U+DFFF.
///
/// Returns the byte length of the backslash plus that character, or `None`
/// when `s` does not start with a backslash or nothing follows it.
fn match_control_symbol(s: &str) -> Option<usize> {
    let symbol = s.strip_prefix('\\')?.chars().next()?;
    Some(1 + symbol.len_utf8())
}
fn exec(last_index: &mut usize, slice: &str) -> TokenMatch {
let (branch, mlen, skip) = if let Some(l) = match_space(slice) {
(BranchKind::Space, l, 0)
} else if let Some(l) = match_control_space(slice) {
(BranchKind::ControlSpace, l, 0)
} else if let Some(l) = match_normal_char_with_accents(slice) {
(BranchKind::NormalWithAccents, l, 0)
} else if let Some(l) = match_verb(slice, true) {
(BranchKind::VerbStar, l, 0)
} else if let Some(l) = match_verb(slice, false) {
(BranchKind::Verb, l, 0)
} else if let Some((l, s)) = match_control_word_with_space(slice) {
(BranchKind::ControlWordWhitespace, l + s, s)
} else if let Some(l) = match_control_symbol(slice) {
(BranchKind::ControlSymbol, l, 0)
} else {
let Some(ch) = slice.chars().next() else {
return TokenMatch {
branch: BranchKind::Unknown,
mlen: 0,
skip: 0,
};
};
(BranchKind::Unknown, ch.len_utf8(), 0)
};
*last_index += mlen;
TokenMatch { branch, mlen, skip }
}
/// Identifies which token rule matched at the current input position.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum BranchKind {
    /// No rule matched.
    Unknown,
    /// A run of plain whitespace.
    Space,
    /// A backslash followed by whitespace.
    ControlSpace,
    /// An ordinary character, possibly followed by combining marks.
    NormalWithAccents,
    /// A `\verb*` construct.
    VerbStar,
    /// A `\verb` construct.
    Verb,
    /// A control word, possibly followed by whitespace.
    ControlWordWhitespace,
    /// A backslash followed by a single character.
    ControlSymbol,
}
/// Outcome of one matching attempt made by `exec`.
struct TokenMatch {
/// Which token rule matched (`Unknown` when none did).
branch: BranchKind,
/// Number of bytes consumed from the input, including the bytes counted in `skip`.
mlen: usize,
/// Bytes at the end of the match that are whitespace following a control
/// word; zero for every other rule.
skip: usize,
}
/// Tokenizer that walks a shared source string and produces one `Token` per
/// call to `lex`.
pub struct Lexer<'a> {
/// Full source text; a handle to it is cloned into every token location.
input: Arc<str>,
/// Byte offset into `input` at which the next call to `lex` starts.
last_index: usize,
/// Settings used to report non-strict input (an unterminated comment).
settings: &'a Settings,
/// Per-character category codes; `lex` treats code 14 as a comment starter.
catcodes: KeyMap<char, u8>,
}
impl<'a> Lexer<'a> {
    /// TeX category code 13 (active character), assigned to `~` by default.
    const CATCODE_ACTIVE: u8 = 13;
    /// TeX category code 14 (comment character), assigned to `%` by default.
    const CATCODE_COMMENT: u8 = 14;

    /// Creates a lexer positioned at the start of `input`.
    ///
    /// The category-code table starts with `%` mapped to 14 and `~` mapped to 13.
    #[must_use]
    pub fn new(input: Arc<str>, settings: &'a Settings) -> Self {
        let mut catcodes = KeyMap::default();
        catcodes.insert('%', Self::CATCODE_COMMENT);
        catcodes.insert('~', Self::CATCODE_ACTIVE);
        Self {
            input,
            last_index: 0,
            settings,
            catcodes,
        }
    }

    /// Sets (or overrides) the category code of `char`.
    pub fn set_catcode(&mut self, char: char, code: u8) {
        self.catcodes.insert(char, code);
    }

    /// Returns the category code registered for `ch`, if any.
    #[must_use]
    pub fn get_catcode(&self, ch: char) -> Option<u8> {
        self.catcodes.get(&ch).copied()
    }

    /// Lexes and returns the next token, advancing the current position.
    ///
    /// Token text by rule:
    /// * whitespace run: a single space `" "`;
    /// * control space: `"\ "`;
    /// * control word: the word without its trailing whitespace;
    /// * ordinary character, control symbol, `\verb` / `\verb*`: the full
    ///   matched text.
    ///
    /// When the token is a single character whose category code is 14, the
    /// rest of the line (including its newline) is skipped and lexing
    /// continues with the following line. At or past the end of input a token
    /// with text `"EOF"` is returned.
    ///
    /// # Errors
    ///
    /// Returns an error when no token rule matches the character at the
    /// current position, or when a comment reaches the end of input without a
    /// newline and the settings' non-strict report returns an error.
    pub fn lex(&mut self) -> Result<Token, ParseError> {
        // A loop rather than recursion: each skipped comment restarts lexing
        // on the next line without growing the stack.
        loop {
            let start = self.last_index;
            if start >= self.input.len() {
                return Ok(Token {
                    text: "EOF".to_owned(),
                    loc: Some(SourceLocation {
                        input: Arc::clone(&self.input),
                        start,
                        end: start,
                    }),
                    noexpand: None,
                    treat_as_relax: None,
                });
            }

            let slice = &self.input[start..];
            let matched = exec(&mut self.last_index, slice);
            let end = self.last_index;

            // Exhaustive on purpose: a new `BranchKind` must choose its token
            // text explicitly instead of silently becoming a space.
            let text = match matched.branch {
                BranchKind::Unknown => {
                    let ch = &slice[..matched.mlen];
                    let loc = Some(SourceLocation {
                        input: Arc::clone(&self.input),
                        start,
                        end,
                    });
                    let token = Token::new(ch.to_owned(), loc);
                    return Err(ParseError::with_token(
                        format!("Unexpected character: {ch}"),
                        &token,
                    ));
                }
                BranchKind::ControlWordWhitespace => &slice[..matched.mlen - matched.skip],
                BranchKind::ControlSymbol
                | BranchKind::NormalWithAccents
                | BranchKind::VerbStar
                | BranchKind::Verb => &slice[..matched.mlen],
                BranchKind::ControlSpace => r"\ ",
                BranchKind::Space => " ",
            };

            let starts_comment = text.len() == 1
                && text
                    .chars()
                    .next()
                    .is_some_and(|c| self.catcodes.get(&c) == Some(&Self::CATCODE_COMMENT));
            if !starts_comment {
                return Ok(Token::new(
                    text.to_owned(),
                    Some(SourceLocation {
                        input: Arc::clone(&self.input),
                        start,
                        end,
                    }),
                ));
            }

            // Search for the newline after the comment token itself, so the
            // computed offset is correct whatever the token's length.
            match slice[matched.mlen..].find('\n') {
                Some(rel_pos) => self.last_index = end + rel_pos + 1,
                None => {
                    self.last_index = self.input.len();
                    self.settings.report_nonstrict("commentAtEnd", "% comment has no terminating newline; LaTeX would fail because of commenting the end of math mode (e.g. $)", None)?;
                }
            }
        }
    }

    /// Returns the byte offset at which the next call to `lex` starts.
    #[must_use]
    pub const fn position(&self) -> usize {
        self.last_index
    }

    /// Moves the lexer to byte offset `last_index`.
    ///
    /// The offset is stored as-is; `lex` slices the input at this offset, so
    /// it must lie on a character boundary when it is inside the input.
    pub const fn set_position(&mut self, last_index: usize) {
        self.last_index = last_index;
    }
}
impl LexerInterface for Lexer<'_> {
    /// Returns the complete source text this lexer was constructed with.
    fn input(&self) -> &str {
        &*self.input
    }
}