//! Tokenizer for asciimath input (`asciimath_parser/tokenizer.rs`).

1#[cfg(not(feature = "qp-trie"))]
2use crate::prefix_map::{HashPrefixMap, PrefixMap};
3#[cfg(feature = "qp-trie")]
4use crate::prefix_map::{PrefixMap, QpTriePrefixMap};
5use std::iter::FusedIterator;
6use std::sync::LazyLock;
7
/// A parsed token label
///
/// Every `(&str, Token)` pair yielded by [`Tokenizer`] is tagged with one of
/// these variants.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Token {
    /// A token indicating a fraction that has the lowest precedence `/`
    Frac,
    /// A token indicating a superscript `^`
    Super,
    /// A token indicating a subscript `_`
    Sub,
    /// A token indicating the separation of rows and cols in matrices `,`
    Sep,
    /// A number
    Number,
    /// Quoted text
    Text,
    /// A raw identifier
    Ident,
    /// A defined symbol token
    Symbol,
    /// A function
    Function,
    /// A unary operation
    Unary,
    /// A binary operation
    Binary,
    /// An opening bracket
    OpenBracket,
    /// A closing bracket
    CloseBracket,
    /// A bracket that can either open or close
    OpenCloseBracket,
}
40
// Flattens groups of the form `Type => "a", "b", ...;` into one flat array of
// `(&str, Token)` pairs, so the token table below can be grouped by kind.
macro_rules! tokens {
    ($($type:ident => $($str:expr),+;)+) => {
        [
            $(
                $(
                    ($str, Token::$type),
                )+
            )+
        ]
    };
}
52
/// The tokens for standard asciimath
///
/// This is a constant exported to easily enable alternate parsing, or verification of string
/// slices.
pub const ASCIIMATH_TOKENS: [(&str, Token); 351] = tokens!(
    Frac => "/";
    Super => "^";
    Sub => "_";
    Sep => ",";
    Function => "sin", "cos", "tan", "sinh", "cosh", "tanh", "cot", "sec", "csc", "arcsin",
        "arccos", "arctan", "coth", "sech", "csch", "exp", "log", "ln", "det", "gcd", "lcm", "Sin",
        "Cos", "Tan", "Arcsin", "Arccos", "Arctan", "Sinh", "Cosh", "Tanh", "Cot", "Sec", "Csc",
        "Log", "Ln", "f", "g";
    Unary => "sqrt", "abs", "norm", "floor", "ceil", "Abs", "hat", "bar", "overline", "vec", "dot",
        "ddot", "overarc", "overparen", "ul", "underline", "ubrace", "underbrace", "obrace",
        "overbrace", "text", "mbox", "cancel", "tilde";
    // font commands
    Unary => "bb", "mathbf", "sf", "mathsf", "bbb", "mathbb", "cc", "mathcal", "tt", "mathtt",
        "fr", "mathfrak";
    Binary => "frac", "root", "stackrel", "overset", "underset", "color", "id", "class";
    // greek symbols
    Symbol => "alpha", "beta", "chi", "delta", "Delta", "epsi", "epsilon", "varepsilon", "eta",
        "gamma", "Gamma", "iota", "kappa", "lambda", "Lambda", "lamda", "Lamda", "mu", "nu",
        "omega", "Omega", "phi", "varphi", "Phi", "pi", "Pi", "psi", "Psi", "rho", "sigma",
        "Sigma", "tau", "theta", "vartheta", "Theta", "upsilon", "xi", "Xi", "zeta";
    // operations
    Symbol => "*", "cdot", "**", "ast", "***", "star", "//", "\\\\", "backslash", "setminus", "xx",
        "times", "|><", "ltimes", "><|", "rtimes", "|><|", "bowtie", "-:", "div", "divide", "@",
        "circ", "o+", "oplus", "ox", "otimes", "o.", "odot", "sum", "prod", "^^", "wedge", "^^^",
        "bigwedge", "vv", "vee", "vvv", "bigvee", "nn", "cap", "nnn", "bigcap", "uu", "cup", "uuu",
        "bigcup";
    // relations
    // NOTE(review): "aprox" below looks like a typo for "approx" — confirm
    // against the asciimath symbol table before changing, since it alters the
    // accepted input language.
    Symbol => "=", "!=", "ne", ":=", "<", "lt", "<=", "le", "lt=", "leq", ">", "gt", "mlt", "ll",
        ">=", "ge", "gt=", "geq", "mgt", "gg", "-<", "prec", "-lt", ">-", "succ", "-<=", "preceq",
        ">-=", "succeq", "in", "!in", "notin", "sub", "subset", "sup", "supset", "sube",
        "subseteq", "supe", "supseteq", "-=", "equiv", "~=", "cong", "~~", "aprox", "~", "sim",
        "prop", "propto";
    // logical
    Symbol => "and", "or", "not", "neg", "=>", "implies", "if", "<=>", "iff", "AA", "forall", "EE",
        "exists", "_|_", "bot", "TT", "top", "|--", "vdash", "|==", "models";
    // misc
    Symbol => ":|:", "int", "oint", "del", "partial", "grad", "nabla", "+-", "pm", "-+", "mp",
        "O/", "emptyset", "oo", "infty", "aleph", "...", "ldots", ":.", "therefore", ":'",
        "because", "/_", "angle", "/_\\", "triangle", "'", "prime", "\\ ", "frown", "quad",
        "qquad", "cdots", "vdots", "ddots", "diamond", "square", "|__", "lfloor", "__|", "rfloor",
        "|~", "lceiling", "~|", "rceiling", "CC", "NN", "QQ", "RR", "ZZ";
    // underover
    Symbol => "lim", "Lim", "dim", "mod", "lub", "glb", "min", "max";
    // arrows
    Symbol => "uarr", "uparrow", "darr", "downarrow", "rarr", "rightarrow", "->", "to", ">->",
        "rightarrowtail", "->>", "twoheadrightarrow", ">->>", "twoheadrightarrowtail", "|->",
        "mapsto", "larr", "leftarrow", "harr", "leftrightarrow", "rArr", "Rightarrow", "lArr",
        "Leftarrow", "hArr", "Leftrightarrow";
    // brackets
    OpenBracket => "(", "[", "{", "|:", "(:", "<<", "langle", "left(", "left[", "{:";
    CloseBracket => ")", "]", "}", ":|", ":)", ">>", "rangle", "right)", "right]", ":}";
    OpenCloseBracket => "|";
    // defined identifiers
    Ident => "dx", "dy", "dz", "dt";
);
113
/// The prefix-map type used by [`Tokenizer::new`]
///
/// Backed by a qp-trie when the `qp-trie` feature is enabled, and a hash map
/// otherwise.
#[cfg(feature = "qp-trie")]
pub type DefaultTokens = QpTriePrefixMap<&'static str, Token>;
/// The prefix-map type used by [`Tokenizer::new`]
///
/// Backed by a qp-trie when the `qp-trie` feature is enabled, and a hash map
/// otherwise.
#[cfg(not(feature = "qp-trie"))]
pub type DefaultTokens = HashPrefixMap<&'static str, Token>;

// Global map of the standard tokens, built lazily on first use and shared by
// every tokenizer created through `Tokenizer::new`.
#[cfg(feature = "qp-trie")]
static DEFAULT_TOKENS: LazyLock<DefaultTokens> =
    LazyLock::new(|| QpTriePrefixMap::from_iter(ASCIIMATH_TOKENS));
#[cfg(not(feature = "qp-trie"))]
static DEFAULT_TOKENS: LazyLock<DefaultTokens> =
    LazyLock::new(|| HashPrefixMap::from_iter(ASCIIMATH_TOKENS));
125
// TODO allow for negative sign preceeding numbers?
/// Split a leading numeric literal off the front of `inp`.
///
/// The literal is a run of ASCII digits containing at most one decimal point;
/// a bare "." with no digit is not a number. Returns `(number, rest)`, or
/// `None` when `inp` does not start with a number.
fn strip_number(inp: &str) -> Option<(&str, &str)> {
    let mut saw_dot = false;
    let mut end = inp.len();
    for (idx, chr) in inp.char_indices() {
        let in_number = match chr {
            '0'..='9' => true,
            // only the first '.' is part of the number
            '.' if !saw_dot => {
                saw_dot = true;
                true
            }
            _ => false,
        };
        if !in_number {
            end = idx;
            break;
        }
    }
    // a lone "." must not count as a number, hence the stricter bound when a
    // dot was consumed
    let valid = if saw_dot { end > 1 } else { end > 0 };
    valid.then(|| inp.split_at(end))
}
146
// TODO Add escape behind a tokenizer option
/// Split leading double-quoted text off the front of `inp`.
///
/// Returns the contents between the quotes (possibly empty) and the rest of
/// the string after the closing quote. `None` when `inp` does not start with
/// `"` or the quote is never closed. No escape sequences are recognized.
fn strip_text(inp: &str) -> Option<(&str, &str)> {
    let body = inp.strip_prefix('"')?;
    // byte offset of the closing quote within `body`; '"' is a single byte,
    // so the slice bounds below stay on char boundaries
    let close = body.find('"')?;
    Some((&body[..close], &body[close + 1..]))
}
156
/// A tokenizer where unknown characters are parsed as individual identifiers
///
/// This is the compliant mode of tokenization for asciimath and means that unknown characters
/// are identified individually
///
/// # Example
/// ```
/// use asciimath_parser::{Tokenizer, Token};
/// let res: Vec<_> = Tokenizer::new("ab").collect();
/// assert_eq!(res, [("a", Token::Ident), ("b", Token::Ident)]);
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Tokenizer<'a, 'b, T> {
    // unconsumed tail of the input; trimmed and advanced by `next`
    remaining: &'a str,
    // prefix map consulted for the longest known token at the cursor
    token_map: &'b T,
    // true: each unmatched char is its own `Ident`; false: greedy runs
    char_ident: bool,
}
174
impl<'a> Tokenizer<'a, 'static, DefaultTokens> {
    /// Create a new tokenizer with the default tokens.
    ///
    /// Ignoring performance differences, this achieves the same result as:
    /// ```
    /// use asciimath_parser::prefix_map::HashPrefixMap;
    /// use asciimath_parser::{ASCIIMATH_TOKENS, Tokenizer};
    ///
    /// Tokenizer::with_tokens("...", &HashPrefixMap::from_iter(ASCIIMATH_TOKENS), true);
    /// ```
    #[must_use]
    pub fn new(inp: &'a str) -> Self {
        // `&DEFAULT_TOKENS` derefs through the `LazyLock`, building the shared
        // token map on first use
        Self::with_tokens(inp, &DEFAULT_TOKENS, true)
    }
}
190
191impl<'a, 'b, T> Tokenizer<'a, 'b, T> {
192    /// Create a new tokenizer with custom tokens
193    ///
194    /// # Parameters
195    /// - `inp`: the string to tokenize
196    /// - `token_map`: a prefix map of available tokens
197    /// - `char_ident`: whether to parse individual characters as identifiers (standard) or to
198    ///   treat entire sequences of unmatched characters as a single identifier.
199    pub fn with_tokens(inp: &'a str, token_map: &'b T, char_ident: bool) -> Self {
200        Tokenizer {
201            remaining: inp,
202            token_map,
203            char_ident,
204        }
205    }
206}
207
impl<'a, T> Iterator for Tokenizer<'a, '_, T>
where
    T: PrefixMap<Token>,
{
    type Item = (&'a str, Token);

    /// Yield the next `(slice, label)` pair, or `None` when input is exhausted.
    ///
    /// Matching priority: longest known token from the map, then a number,
    /// then quoted text, then the identifier fallback (per-character or greedy
    /// depending on `char_ident`).
    fn next(&mut self) -> Option<Self::Item> {
        // remove whitespace
        self.remaining = self.remaining.trim_start();
        // the `len > 0` guard skips a zero-length match (e.g. an empty string
        // registered as a token), which would otherwise consume nothing and
        // loop forever; see the `perverse_tokens` test
        if let Some((len, &token)) = self.token_map.get_longest_prefix(self.remaining)
            && len > 0
        {
            let (pref, rem) = self.remaining.split_at(len);
            self.remaining = rem;
            Some((pref, token))
        } else if let Some((num, res)) = strip_number(self.remaining) {
            // number
            self.remaining = res;
            Some((num, Token::Number))
        } else if let Some((text, res)) = strip_text(self.remaining) {
            // text
            self.remaining = res;
            Some((text, Token::Text))
        } else if self.char_ident {
            // next char
            self.remaining.chars().next().map(|chr| {
                let len = chr.len_utf8();
                let raw = &self.remaining[..len];
                self.remaining = &self.remaining[len..];
                (raw, Token::Ident)
            })
        } else {
            // greedy identifier: extend until whitespace, a character that
            // could start a number or quoted text, or a position where a
            // known token begins
            let len = self
                .remaining
                .char_indices()
                .find(|&(i, c)| {
                    // NOTE using the strip would be guaranteed to be correct, but less efficient
                    matches!(c, '.' | '"' | '0'..='9')
                        || c.is_whitespace()
                        || self
                            .token_map
                            .get_longest_prefix(&self.remaining[i..])
                            .is_some_and(|(i, _)| i > 0)
                })
                .map_or(self.remaining.len(), |(i, _)| i);
            if len == 0 {
                None
            } else {
                let raw = &self.remaining[..len];
                self.remaining = &self.remaining[len..];
                Some((raw, Token::Ident))
            }
        }
    }

    /// Lower bound is 0 (the rest may be all whitespace); upper bound is the
    /// remaining byte count, since every token consumes at least one byte.
    fn size_hint(&self) -> (usize, Option<usize>) {
        (0, Some(self.remaining.len()))
    }
}

// When `next` returns `None` it leaves `remaining` in a state that yields
// `None` again (empty in per-char mode, unchanged in greedy mode), so the
// iterator is fused.
impl<T> FusedIterator for Tokenizer<'_, '_, T> where T: PrefixMap<Token> {}
269
#[cfg(test)]
mod tests {
    use crate::prefix_map::HashPrefixMap;
    use crate::{ASCIIMATH_TOKENS, Token, Tokenizer};

    /// Default (compliant) mode: unmatched characters like "x" and "y" are
    /// emitted as individual identifiers, even when adjacent.
    #[test]
    fn char_tokenizer() {
        let tokens: Vec<_> =
            Tokenizer::new(r#"frac (abs x) xy / 7^2 "text with spaces""#).collect();
        assert_eq!(
            *tokens,
            [
                ("frac", Token::Binary),
                ("(", Token::OpenBracket),
                ("abs", Token::Unary),
                ("x", Token::Ident),
                (")", Token::CloseBracket),
                ("x", Token::Ident),
                ("y", Token::Ident),
                ("/", Token::Frac),
                ("7", Token::Number),
                ("^", Token::Super),
                ("2", Token::Number),
                ("text with spaces", Token::Text),
            ]
        );
    }

    /// Greedy mode (`char_ident = false`): the run "xy" becomes one identifier
    /// instead of two.
    #[test]
    fn str_tokenizer() {
        let token_map = HashPrefixMap::from_iter(ASCIIMATH_TOKENS);
        let tokens: Vec<_> = Tokenizer::with_tokens(
            r#"frac (abs x) xy / 7^2 "text with spaces""#,
            &token_map,
            false,
        )
        .collect();
        assert_eq!(
            *tokens,
            [
                ("frac", Token::Binary),
                ("(", Token::OpenBracket),
                ("abs", Token::Unary),
                ("x", Token::Ident),
                (")", Token::CloseBracket),
                ("xy", Token::Ident),
                ("/", Token::Frac),
                ("7", Token::Number),
                ("^", Token::Super),
                ("2", Token::Number),
                ("text with spaces", Token::Text),
            ]
        );
    }

    /// Pathological maps: a zero-length token must be ignored (no infinite
    /// loop), and a token starting with whitespace can never match because
    /// `next` trims leading whitespace first.
    #[test]
    fn perverse_tokens() {
        let token_map = HashPrefixMap::from_iter([("", Token::Symbol), (" 4", Token::Symbol)]);
        let tokens: Vec<_> = Tokenizer::with_tokens(" 4 x 4 6", &token_map, false).collect();
        assert_eq!(
            *tokens,
            [
                ("4", Token::Number),
                ("x", Token::Ident),
                ("4", Token::Number),
                ("6", Token::Number),
            ]
        );
    }
}
339}