1#[cfg(not(feature = "qp-trie"))]
2use crate::prefix_map::{HashPrefixMap, PrefixMap};
3#[cfg(feature = "qp-trie")]
4use crate::prefix_map::{PrefixMap, QpTriePrefixMap};
5use std::iter::FusedIterator;
6use std::sync::LazyLock;
7
/// Lexical category of a piece of AsciiMath input, as produced by [`Tokenizer`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Token {
    /// Fraction operator `/`.
    Frac,
    /// Superscript operator `^`.
    Super,
    /// Subscript operator `_`.
    Sub,
    /// Argument separator `,`.
    Sep,
    /// Numeric literal, e.g. `7` or `3.5` (recognized by `strip_number`).
    Number,
    /// Double-quoted literal text, quotes excluded (recognized by `strip_text`).
    Text,
    /// Identifier: input not matched by the token table, a number, or text.
    Ident,
    /// Named symbol or operator, e.g. `alpha`, `sum`, `->`.
    Symbol,
    /// Function name, e.g. `sin`, `log`, `f`.
    Function,
    /// Operator taking one argument, e.g. `sqrt`, `hat`, `text`.
    Unary,
    /// Operator taking two arguments, e.g. `frac`, `root`.
    Binary,
    /// Opening bracket, e.g. `(`, `[`, `langle`.
    OpenBracket,
    /// Closing bracket, e.g. `)`, `]`, `rangle`.
    CloseBracket,
    /// Bracket usable on either side, i.e. `|`.
    OpenCloseBracket,
}
40
/// Expands groups of `Category => "str", "str", ...;` rules into one flat
/// array literal of `(&str, Token)` pairs, pairing every string with its
/// [`Token`] variant. Used to build [`ASCIIMATH_TOKENS`] below.
macro_rules! tokens {
    ($($type:ident => $($str:expr),+;)+) => {
        [
            $(
                $(
                    ($str, Token::$type),
                )+
            )+
        ]
    };
}
52
/// The default AsciiMath token table: every recognized token string paired
/// with its [`Token`] category, grouped by category via the `tokens!` macro.
/// The declared array length (351) must equal the total number of strings
/// listed; the compiler enforces this.
pub const ASCIIMATH_TOKENS: [(&str, Token); 351] = tokens!(
    Frac => "/";
    Super => "^";
    Sub => "_";
    Sep => ",";
    Function => "sin", "cos", "tan", "sinh", "cosh", "tanh", "cot", "sec", "csc", "arcsin",
        "arccos", "arctan", "coth", "sech", "csch", "exp", "log", "ln", "det", "gcd", "lcm", "Sin",
        "Cos", "Tan", "Arcsin", "Arccos", "Arctan", "Sinh", "Cosh", "Tanh", "Cot", "Sec", "Csc",
        "Log", "Ln", "f", "g";
    Unary => "sqrt", "abs", "norm", "floor", "ceil", "Abs", "hat", "bar", "overline", "vec", "dot",
        "ddot", "overarc", "overparen", "ul", "underline", "ubrace", "underbrace", "obrace",
        "overbrace", "text", "mbox", "cancel", "tilde";
    Unary => "bb", "mathbf", "sf", "mathsf", "bbb", "mathbb", "cc", "mathcal", "tt", "mathtt",
        "fr", "mathfrak";
    Binary => "frac", "root", "stackrel", "overset", "underset", "color", "id", "class";
    Symbol => "alpha", "beta", "chi", "delta", "Delta", "epsi", "epsilon", "varepsilon", "eta",
        "gamma", "Gamma", "iota", "kappa", "lambda", "Lambda", "lamda", "Lamda", "mu", "nu",
        "omega", "Omega", "phi", "varphi", "Phi", "pi", "Pi", "psi", "Psi", "rho", "sigma",
        "Sigma", "tau", "theta", "vartheta", "Theta", "upsilon", "xi", "Xi", "zeta";
    Symbol => "*", "cdot", "**", "ast", "***", "star", "//", "\\\\", "backslash", "setminus", "xx",
        "times", "|><", "ltimes", "><|", "rtimes", "|><|", "bowtie", "-:", "div", "divide", "@",
        "circ", "o+", "oplus", "ox", "otimes", "o.", "odot", "sum", "prod", "^^", "wedge", "^^^",
        "bigwedge", "vv", "vee", "vvv", "bigvee", "nn", "cap", "nnn", "bigcap", "uu", "cup", "uuu",
        "bigcup";
    Symbol => "=", "!=", "ne", ":=", "<", "lt", "<=", "le", "lt=", "leq", ">", "gt", "mlt", "ll",
        ">=", "ge", "gt=", "geq", "mgt", "gg", "-<", "prec", "-lt", ">-", "succ", "-<=", "preceq",
        ">-=", "succeq", "in", "!in", "notin", "sub", "subset", "sup", "supset", "sube",
        "subseteq", "supe", "supseteq", "-=", "equiv", "~=", "cong", "~~", "aprox", "~", "sim",
        "prop", "propto";
    Symbol => "and", "or", "not", "neg", "=>", "implies", "if", "<=>", "iff", "AA", "forall", "EE",
        "exists", "_|_", "bot", "TT", "top", "|--", "vdash", "|==", "models";
    Symbol => ":|:", "int", "oint", "del", "partial", "grad", "nabla", "+-", "pm", "-+", "mp",
        "O/", "emptyset", "oo", "infty", "aleph", "...", "ldots", ":.", "therefore", ":'",
        "because", "/_", "angle", "/_\\", "triangle", "'", "prime", "\\ ", "frown", "quad",
        "qquad", "cdots", "vdots", "ddots", "diamond", "square", "|__", "lfloor", "__|", "rfloor",
        "|~", "lceiling", "~|", "rceiling", "CC", "NN", "QQ", "RR", "ZZ";
    Symbol => "lim", "Lim", "dim", "mod", "lub", "glb", "min", "max";
    Symbol => "uarr", "uparrow", "darr", "downarrow", "rarr", "rightarrow", "->", "to", ">->",
        "rightarrowtail", "->>", "twoheadrightarrow", ">->>", "twoheadrightarrowtail", "|->",
        "mapsto", "larr", "leftarrow", "harr", "leftrightarrow", "rArr", "Rightarrow", "lArr",
        "Leftarrow", "hArr", "Leftrightarrow";
    OpenBracket => "(", "[", "{", "|:", "(:", "<<", "langle", "left(", "left[", "{:";
    CloseBracket => ")", "]", "}", ":|", ":)", ">>", "rangle", "right)", "right]", ":}";
    OpenCloseBracket => "|";
    Ident => "dx", "dy", "dz", "dt";
);
113
/// Prefix-map type backing the built-in token table (qp-trie implementation).
#[cfg(feature = "qp-trie")]
pub type DefaultTokens = QpTriePrefixMap<&'static str, Token>;
/// Prefix-map type backing the built-in token table (hash-map implementation).
#[cfg(not(feature = "qp-trie"))]
pub type DefaultTokens = HashPrefixMap<&'static str, Token>;
118
// Shared token map over ASCIIMATH_TOKENS, built lazily on first use by
// `Tokenizer::new`. The two cfg'd definitions differ only in backing type.
#[cfg(feature = "qp-trie")]
static DEFAULT_TOKENS: LazyLock<DefaultTokens> =
    LazyLock::new(|| QpTriePrefixMap::from_iter(ASCIIMATH_TOKENS));
#[cfg(not(feature = "qp-trie"))]
static DEFAULT_TOKENS: LazyLock<DefaultTokens> =
    LazyLock::new(|| HashPrefixMap::from_iter(ASCIIMATH_TOKENS));
125
/// Splits a leading decimal number off the front of `inp`.
///
/// Accepts a run of ASCII digits containing at most one `.` (e.g. `"12"`,
/// `"3.5"`, `".5"`, `"3."`). A lone `.` with no digit is rejected. Returns
/// `(number, rest)` on success, `None` when `inp` does not start with a
/// number.
fn strip_number(inp: &str) -> Option<(&str, &str)> {
    let mut seen_decimal = false;
    // Byte length of the numeric prefix; defaults to the whole input when
    // every character is numeric.
    let mut len = inp.len();
    for (i, c) in inp.char_indices() {
        let numeric = match c {
            '0'..='9' => true,
            '.' if !seen_decimal => {
                seen_decimal = true;
                true
            }
            _ => false,
        };
        if !numeric {
            len = i;
            break;
        }
    }
    // A one-character match that is just the decimal point is not a number.
    if len > 1 || (len > 0 && !seen_decimal) {
        Some(inp.split_at(len))
    } else {
        None
    }
}
146
/// Strips a double-quoted text literal from the front of `inp`.
///
/// Returns the contents between the quotes (quotes excluded) and the
/// remainder after the closing quote. Yields `None` when `inp` does not
/// start with `"` or the closing `"` is missing.
fn strip_text(inp: &str) -> Option<(&str, &str)> {
    let body = inp.strip_prefix('"')?;
    // Byte index of the closing quote within `body`; `"` is one byte, so
    // the remainder starts right after it.
    let close = body.find('"')?;
    Some((&body[..close], &body[close + 1..]))
}
156
/// Iterator that lexes an AsciiMath string into `(text, Token)` pairs.
///
/// `'a` is the lifetime of the input string, `'b` that of the token map.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Tokenizer<'a, 'b, T> {
    // Unconsumed tail of the input.
    remaining: &'a str,
    // Prefix map used to recognize known token strings.
    token_map: &'b T,
    // When true, unrecognized characters are emitted one-by-one as `Ident`
    // tokens; when false, maximal runs of them form a single `Ident`.
    char_ident: bool,
}
174
impl<'a> Tokenizer<'a, 'static, DefaultTokens> {
    /// Creates a tokenizer over `inp` using the built-in AsciiMath token
    /// table, with single-character identifiers enabled.
    #[must_use]
    pub fn new(inp: &'a str) -> Self {
        Self::with_tokens(inp, &DEFAULT_TOKENS, true)
    }
}
190
191impl<'a, 'b, T> Tokenizer<'a, 'b, T> {
192 pub fn with_tokens(inp: &'a str, token_map: &'b T, char_ident: bool) -> Self {
200 Tokenizer {
201 remaining: inp,
202 token_map,
203 char_ident,
204 }
205 }
206}
207
impl<'a, T> Iterator for Tokenizer<'a, '_, T>
where
    T: PrefixMap<Token>,
{
    type Item = (&'a str, Token);

    /// Produces the next `(text, Token)` pair, trying in order: the token
    /// map (longest matching prefix), a number, a quoted text literal, and
    /// finally an `Ident`.
    fn next(&mut self) -> Option<Self::Item> {
        // Whitespace between tokens is skipped, never emitted.
        self.remaining = self.remaining.trim_start();
        // Longest known token wins; `len > 0` guards against a map that
        // contains the empty string as a key (see the `perverse_tokens`
        // test), which would otherwise yield empty tokens forever.
        if let Some((len, &token)) = self.token_map.get_longest_prefix(self.remaining)
            && len > 0
        {
            let (pref, rem) = self.remaining.split_at(len);
            self.remaining = rem;
            Some((pref, token))
        } else if let Some((num, res)) = strip_number(self.remaining) {
            self.remaining = res;
            Some((num, Token::Number))
        } else if let Some((text, res)) = strip_text(self.remaining) {
            self.remaining = res;
            Some((text, Token::Text))
        } else if self.char_ident {
            // Single-character mode: emit exactly one char as an Ident
            // (None only when the input is exhausted).
            self.remaining.chars().next().map(|chr| {
                let len = chr.len_utf8();
                let raw = &self.remaining[..len];
                self.remaining = &self.remaining[len..];
                (raw, Token::Ident)
            })
        } else {
            // Run mode: the Ident extends up to the next position where a
            // number, a text literal, whitespace, or a known (non-empty)
            // token could start.
            let len = self
                .remaining
                .char_indices()
                .find(|&(i, c)| {
                    matches!(c, '.' | '"' | '0'..='9')
                        || c.is_whitespace()
                        || self
                            .token_map
                            .get_longest_prefix(&self.remaining[i..])
                            .is_some_and(|(i, _)| i > 0)
                })
                .map_or(self.remaining.len(), |(i, _)| i);
            // len == 0: the current char looked like a number/text start but
            // failed to parse above (e.g. a lone `.` or an unterminated
            // `"`). Stop iterating instead of looping forever.
            if len == 0 {
                None
            } else {
                let raw = &self.remaining[..len];
                self.remaining = &self.remaining[len..];
                Some((raw, Token::Ident))
            }
        }
    }

    /// Lower bound 0 (remaining input may be all whitespace); the byte
    /// length is a valid upper bound since every token consumes at least
    /// one byte.
    fn size_hint(&self) -> (usize, Option<usize>) {
        (0, Some(self.remaining.len()))
    }
}
267
// Once `next` returns None (input exhausted, or stuck on an unparseable
// prefix in run mode), the same state yields None again on every later
// call, satisfying the FusedIterator contract.
impl<T> FusedIterator for Tokenizer<'_, '_, T> where T: PrefixMap<Token> {}
269
#[cfg(test)]
mod tests {
    use crate::prefix_map::HashPrefixMap;
    use crate::{ASCIIMATH_TOKENS, Token, Tokenizer};

    /// With `char_ident = true` (the `Tokenizer::new` default), unknown
    /// letters such as `xy` come out as two separate one-character `Ident`
    /// tokens.
    #[test]
    fn char_tokenizer() {
        let tokens: Vec<_> =
            Tokenizer::new(r#"frac (abs x) xy / 7^2 "text with spaces""#).collect();
        assert_eq!(
            *tokens,
            [
                ("frac", Token::Binary),
                ("(", Token::OpenBracket),
                ("abs", Token::Unary),
                ("x", Token::Ident),
                (")", Token::CloseBracket),
                ("x", Token::Ident),
                ("y", Token::Ident),
                ("/", Token::Frac),
                ("7", Token::Number),
                ("^", Token::Super),
                ("2", Token::Number),
                ("text with spaces", Token::Text),
            ]
        );
    }

    /// With `char_ident = false`, unknown letters are grouped into maximal
    /// runs, so `xy` becomes a single `Ident` token. Same input as above
    /// otherwise.
    #[test]
    fn str_tokenizer() {
        let token_map = HashPrefixMap::from_iter(ASCIIMATH_TOKENS);
        let tokens: Vec<_> = Tokenizer::with_tokens(
            r#"frac (abs x) xy / 7^2 "text with spaces""#,
            &token_map,
            false,
        )
        .collect();
        assert_eq!(
            *tokens,
            [
                ("frac", Token::Binary),
                ("(", Token::OpenBracket),
                ("abs", Token::Unary),
                ("x", Token::Ident),
                (")", Token::CloseBracket),
                ("xy", Token::Ident),
                ("/", Token::Frac),
                ("7", Token::Number),
                ("^", Token::Super),
                ("2", Token::Number),
                ("text with spaces", Token::Text),
            ]
        );
    }

    /// Pathological token maps — an empty-string key and a key containing
    /// leading whitespace — must neither stall the tokenizer nor produce
    /// empty tokens (whitespace is trimmed before lookup, and zero-length
    /// prefix matches are ignored).
    #[test]
    fn perverse_tokens() {
        let token_map = HashPrefixMap::from_iter([("", Token::Symbol), (" 4", Token::Symbol)]);
        let tokens: Vec<_> = Tokenizer::with_tokens(" 4 x 4 6", &token_map, false).collect();
        assert_eq!(
            *tokens,
            [
                ("4", Token::Number),
                ("x", Token::Ident),
                ("4", Token::Number),
                ("6", Token::Number),
            ]
        );
    }
}