// rdx_math/tokenizer.rs
//! LaTeX math tokenizer.
//!
//! Converts a raw LaTeX math string (without `$` delimiters) into a flat
//! sequence of [`Token`] values that the parser can consume.

/// A single lexical unit from a LaTeX math string.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) enum Token {
    // Grouping
    LBrace,
    RBrace,

    // Scripts and alignment
    Caret,
    Underscore,
    Ampersand,
    Tilde, // ~ non-breaking space

    // Delimiter characters (when appearing bare, not after \left/\right)
    LParen,
    RParen,
    LBracket,
    RBracket,
    Pipe,

    // Single-character operators
    Plus,
    Minus,
    Equals,
    LessThan,
    GreaterThan,
    Comma,
    Semicolon,
    Colon,
    Bang,
    Prime, // '

    // Commands: \word  (alphabetic sequences)
    Command(String),

    // \\ row separator
    DoubleBackslash,

    // Spacing shorthand: \, \; \: \!  (these are also commands but special-cased)
    ThinSpace,    // \,
    MedSpace,     // \; or \:
    NegThinSpace, // \!

    // Literals
    Letter(char), // a-z A-Z
    Digit(char),  // 0-9
    Dot,          // .

    // Compound tokens produced by the tokenizer when it sees \begin{name} or \end{name}
    Begin(String),
    End(String),

    // Collapsed whitespace
    Whitespace,

    // End of input
    Eof,
}
63
/// Positional information attached to a token: byte offsets into the
/// original input string.
///
/// Two `usize`s, so the type is a cheap value: `Copy` and `Eq` are derived.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) struct Span {
    /// Byte offset of the first character.
    pub start: usize,
    /// Byte offset one past the last character.
    pub end: usize,
}
72
/// A token together with its source span.
#[derive(Debug, Clone, PartialEq)]
pub(crate) struct Spanned {
    // The lexical token itself.
    pub token: Token,
    // Byte range of the original input that produced `token`.
    pub span: Span,
}
79
80/// Tokenize a LaTeX math string into a `Vec<Spanned>`.
81///
82/// The returned vector always ends with a [`Token::Eof`] entry.
83pub(crate) fn tokenize(input: &str) -> Vec<Spanned> {
84    let bytes = input.as_bytes();
85    let len = input.len();
86    let mut pos = 0usize;
87    let mut out: Vec<Spanned> = Vec::new();
88
89    macro_rules! push {
90        ($start:expr, $tok:expr) => {
91            out.push(Spanned {
92                token: $tok,
93                span: Span {
94                    start: $start,
95                    end: pos,
96                },
97            });
98        };
99    }
100
101    while pos < len {
102        let start = pos;
103        // SAFETY: `pos < len` is guaranteed by the while condition, and `input` is
104        // valid UTF-8, so there is always at least one char remaining.
105        let Some(ch) = input[pos..].chars().next() else {
106            break;
107        };
108        let ch_len = ch.len_utf8();
109
110        match ch {
111            // Whitespace: collapse runs
112            c if c.is_ascii_whitespace() => {
113                while pos < len && input[pos..].starts_with(|c: char| c.is_ascii_whitespace()) {
114                    pos += 1;
115                }
116                push!(start, Token::Whitespace);
117            }
118
119            '{' => {
120                pos += 1;
121                push!(start, Token::LBrace);
122            }
123            '}' => {
124                pos += 1;
125                push!(start, Token::RBrace);
126            }
127            '^' => {
128                pos += 1;
129                push!(start, Token::Caret);
130            }
131            '_' => {
132                pos += 1;
133                push!(start, Token::Underscore);
134            }
135            '&' => {
136                pos += 1;
137                push!(start, Token::Ampersand);
138            }
139            '~' => {
140                pos += 1;
141                push!(start, Token::Tilde);
142            }
143            '(' => {
144                pos += 1;
145                push!(start, Token::LParen);
146            }
147            ')' => {
148                pos += 1;
149                push!(start, Token::RParen);
150            }
151            '[' => {
152                pos += 1;
153                push!(start, Token::LBracket);
154            }
155            ']' => {
156                pos += 1;
157                push!(start, Token::RBracket);
158            }
159            '|' => {
160                pos += 1;
161                push!(start, Token::Pipe);
162            }
163            '+' => {
164                pos += 1;
165                push!(start, Token::Plus);
166            }
167            '-' => {
168                pos += 1;
169                push!(start, Token::Minus);
170            }
171            '=' => {
172                pos += 1;
173                push!(start, Token::Equals);
174            }
175            '<' => {
176                pos += 1;
177                push!(start, Token::LessThan);
178            }
179            '>' => {
180                pos += 1;
181                push!(start, Token::GreaterThan);
182            }
183            ',' => {
184                pos += 1;
185                push!(start, Token::Comma);
186            }
187            ';' => {
188                pos += 1;
189                push!(start, Token::Semicolon);
190            }
191            ':' => {
192                pos += 1;
193                push!(start, Token::Colon);
194            }
195            '!' => {
196                pos += 1;
197                push!(start, Token::Bang);
198            }
199            '\'' => {
200                pos += 1;
201                push!(start, Token::Prime);
202            }
203            '.' => {
204                pos += 1;
205                push!(start, Token::Dot);
206            }
207
208            '\\' => {
209                pos += 1; // consume backslash
210                if pos >= len {
211                    // Trailing backslash — emit as error via an unknown command
212                    push!(start, Token::Command("".to_string()));
213                    continue;
214                }
215
216                let Some(next) = input[pos..].chars().next() else {
217                    push!(start, Token::Command("".to_string()));
218                    continue;
219                };
220
221                if next == '\\' {
222                    // \\ — double backslash (row separator)
223                    pos += 1;
224                    push!(start, Token::DoubleBackslash);
225                } else if next == ',' {
226                    pos += 1;
227                    push!(start, Token::ThinSpace);
228                } else if next == ';' || next == ':' {
229                    pos += 1;
230                    push!(start, Token::MedSpace);
231                } else if next == '!' {
232                    pos += 1;
233                    push!(start, Token::NegThinSpace);
234                } else if next == ' ' {
235                    // \ followed by a space — normal space
236                    pos += 1;
237                    push!(start, Token::Whitespace);
238                } else if next.is_ascii_alphabetic() {
239                    // Collect alphabetic command name
240                    let name_start = pos;
241                    while pos < len {
242                        let Some(c) = input[pos..].chars().next() else {
243                            break;
244                        };
245                        if c.is_ascii_alphabetic() {
246                            pos += c.len_utf8();
247                        } else {
248                            break;
249                        }
250                    }
251                    let name = &input[name_start..pos];
252
253                    if name == "begin" || name == "end" {
254                        // Consume optional whitespace then {env_name}
255                        skip_whitespace(input, &mut pos);
256                        if pos < len && bytes[pos] == b'{' {
257                            pos += 1; // {
258                            let env_start = pos;
259                            while pos < len && bytes[pos] != b'}' {
260                                pos += 1;
261                            }
262                            let env_name = input[env_start..pos].trim().to_string();
263                            if pos < len {
264                                pos += 1; // }
265                            }
266                            if name == "begin" {
267                                push!(start, Token::Begin(env_name));
268                            } else {
269                                push!(start, Token::End(env_name));
270                            }
271                        } else {
272                            // Malformed: \begin without {
273                            let tok = if name == "begin" {
274                                Token::Begin(String::new())
275                            } else {
276                                Token::End(String::new())
277                            };
278                            push!(start, tok);
279                        }
280                    } else {
281                        push!(start, Token::Command(name.to_string()));
282                    }
283                } else {
284                    // Non-alpha single character after backslash: \{ \} \| \[ \] etc.
285                    let sym = next.to_string();
286                    pos += next.len_utf8();
287                    push!(start, Token::Command(sym));
288                }
289            }
290
291            c if c.is_ascii_alphabetic() => {
292                pos += ch_len;
293                push!(start, Token::Letter(c));
294            }
295
296            c if c.is_ascii_digit() => {
297                pos += ch_len;
298                push!(start, Token::Digit(c));
299            }
300
301            // Skip non-ASCII, non-special characters by emitting as a Letter if they are Unicode
302            // math letters; otherwise advance past them to avoid infinite loops.
303            c => {
304                pos += ch_len;
305                // Emit as Letter so the parser can at least produce an Ident node.
306                push!(start, Token::Letter(c));
307            }
308        }
309    }
310
311    // Always terminate with Eof
312    out.push(Spanned {
313        token: Token::Eof,
314        span: Span {
315            start: len,
316            end: len,
317        },
318    });
319
320    out
321}
322
323/// Convert a token back to its raw LaTeX representation (used for raw-string extraction).
324pub(crate) fn token_to_raw_str(tok: &Token) -> String {
325    match tok {
326        Token::LBrace => "{".to_string(),
327        Token::RBrace => "}".to_string(),
328        Token::Caret => "^".to_string(),
329        Token::Underscore => "_".to_string(),
330        Token::Ampersand => "&".to_string(),
331        Token::Tilde => "~".to_string(),
332        Token::LParen => "(".to_string(),
333        Token::RParen => ")".to_string(),
334        Token::LBracket => "[".to_string(),
335        Token::RBracket => "]".to_string(),
336        Token::Pipe => "|".to_string(),
337        Token::Plus => "+".to_string(),
338        Token::Minus => "-".to_string(),
339        Token::Equals => "=".to_string(),
340        Token::LessThan => "<".to_string(),
341        Token::GreaterThan => ">".to_string(),
342        Token::Comma => ",".to_string(),
343        Token::Semicolon => ";".to_string(),
344        Token::Colon => ":".to_string(),
345        Token::Bang => "!".to_string(),
346        Token::Prime => "'".to_string(),
347        Token::Dot => ".".to_string(),
348        Token::Command(c) => format!("\\{c}"),
349        Token::DoubleBackslash => "\\\\".to_string(),
350        Token::ThinSpace => "\\,".to_string(),
351        Token::MedSpace => "\\;".to_string(),
352        Token::NegThinSpace => "\\!".to_string(),
353        Token::Letter(c) => c.to_string(),
354        Token::Digit(c) => c.to_string(),
355        Token::Begin(e) => format!("\\begin{{{e}}}"),
356        Token::End(e) => format!("\\end{{{e}}}"),
357        Token::Whitespace => " ".to_string(),
358        Token::Eof => String::new(),
359    }
360}
361
/// Advance `*pos` past any run of ASCII whitespace in `input`.
///
/// ASCII whitespace is a single byte and never appears as a UTF-8
/// continuation byte, so the byte-wise scan is safe and keeps `*pos` on a
/// char boundary.
fn skip_whitespace(input: &str, pos: &mut usize) {
    let bytes = input.as_bytes();
    while bytes.get(*pos).is_some_and(|b| b.is_ascii_whitespace()) {
        *pos += 1;
    }
}
374
375// ─── TokenStream ──────────────────────────────────────────────────────────────
376
377/// A cursor over a `Vec<Spanned>` that the parser uses.
378pub(crate) struct TokenStream {
379    tokens: Vec<Spanned>,
380    pos: usize,
381}
382
383impl TokenStream {
384    pub fn new(tokens: Vec<Spanned>) -> Self {
385        Self { tokens, pos: 0 }
386    }
387
388    /// Peek at the current token without consuming it.
389    pub fn peek(&self) -> &Token {
390        &self.tokens[self.pos].token
391    }
392
393    /// Peek at the token `offset` positions ahead (0 = current).
394    pub fn peek_ahead(&self, offset: usize) -> &Token {
395        let idx = (self.pos + offset).min(self.tokens.len() - 1);
396        &self.tokens[idx].token
397    }
398
399    /// Consume and return the current token.
400    pub fn next(&mut self) -> Token {
401        let tok = self.tokens[self.pos].token.clone();
402        if self.pos + 1 < self.tokens.len() {
403            self.pos += 1;
404        }
405        tok
406    }
407
408    /// Byte offset of the current token.
409    #[allow(dead_code)]
410    pub fn current_offset(&self) -> usize {
411        self.tokens[self.pos].span.start
412    }
413
414    /// Skip whitespace tokens.
415    pub fn skip_whitespace(&mut self) {
416        while matches!(self.peek(), Token::Whitespace) {
417            self.next();
418        }
419    }
420
421    /// Returns `true` if the stream is at end-of-input (Eof token).
422    #[allow(dead_code)]
423    pub fn is_eof(&self) -> bool {
424        matches!(self.peek(), Token::Eof)
425    }
426
427    /// Consume a `{...}` group and return its contents as a raw string,
428    /// preserving whitespace exactly.  Returns `None` if the current token
429    /// is not `{`.
430    pub fn read_raw_brace_string(&mut self) -> Option<String> {
431        if !matches!(self.peek(), Token::LBrace) {
432            return None;
433        }
434        self.next(); // consume {
435
436        let mut result = String::new();
437        let mut depth = 1usize;
438
439        loop {
440            match self.peek().clone() {
441                Token::Eof => break,
442                Token::LBrace => {
443                    depth += 1;
444                    result.push('{');
445                    self.next();
446                }
447                Token::RBrace => {
448                    depth -= 1;
449                    if depth == 0 {
450                        self.next(); // consume closing }
451                        break;
452                    }
453                    result.push('}');
454                    self.next();
455                }
456                Token::Whitespace => {
457                    result.push(' ');
458                    self.next();
459                }
460                tok => {
461                    result.push_str(&token_to_raw_str(&tok));
462                    self.next();
463                }
464            }
465        }
466
467        Some(result)
468    }
469}
470
471// ─── Tests ────────────────────────────────────────────────────────────────────
472
#[cfg(test)]
mod tests {
    use super::*;

    /// Strip spans, keeping only the token sequence for easy comparison.
    fn tokens(input: &str) -> Vec<Token> {
        tokenize(input).into_iter().map(|s| s.token).collect()
    }

    #[test]
    fn tokenize_simple_letters() {
        assert_eq!(
            tokens("abc"),
            [Token::Letter('a'), Token::Letter('b'), Token::Letter('c'), Token::Eof]
        );
    }

    #[test]
    fn tokenize_digits() {
        assert_eq!(
            tokens("123"),
            [Token::Digit('1'), Token::Digit('2'), Token::Digit('3'), Token::Eof]
        );
    }

    #[test]
    fn tokenize_command() {
        assert_eq!(tokens(r"\frac"), [Token::Command("frac".into()), Token::Eof]);
    }

    #[test]
    fn tokenize_double_backslash() {
        assert_eq!(tokens(r"\\"), [Token::DoubleBackslash, Token::Eof]);
    }

    #[test]
    fn tokenize_spacing() {
        assert_eq!(
            tokens(r"\,\;\!"),
            [Token::ThinSpace, Token::MedSpace, Token::NegThinSpace, Token::Eof]
        );
    }

    #[test]
    fn tokenize_begin_end() {
        assert_eq!(
            tokens(r"\begin{pmatrix}"),
            [Token::Begin("pmatrix".into()), Token::Eof]
        );
    }

    #[test]
    fn tokenize_end_env() {
        assert_eq!(
            tokens(r"\end{pmatrix}"),
            [Token::End("pmatrix".into()), Token::Eof]
        );
    }

    #[test]
    fn tokenize_scripts() {
        assert_eq!(
            tokens("x^2_i"),
            [
                Token::Letter('x'),
                Token::Caret,
                Token::Digit('2'),
                Token::Underscore,
                Token::Letter('i'),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn tokenize_braces() {
        assert_eq!(
            tokens("{a}"),
            [Token::LBrace, Token::Letter('a'), Token::RBrace, Token::Eof]
        );
    }

    #[test]
    fn tokenize_whitespace_collapsed() {
        assert_eq!(
            tokens("a   b"),
            [Token::Letter('a'), Token::Whitespace, Token::Letter('b'), Token::Eof]
        );
    }

    #[test]
    fn tokenize_backslash_brace() {
        // \{ and \} are used for literal brace delimiters in \left\{ ... \right\}
        assert_eq!(tokens(r"\{"), [Token::Command("{".into()), Token::Eof]);
    }

    #[test]
    fn tokenize_pipe() {
        assert_eq!(tokens("|"), [Token::Pipe, Token::Eof]);
    }

    #[test]
    fn tokenize_operators() {
        assert_eq!(
            tokens("+-=<>"),
            [
                Token::Plus,
                Token::Minus,
                Token::Equals,
                Token::LessThan,
                Token::GreaterThan,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn token_stream_peek_and_next() {
        let mut ts = TokenStream::new(tokenize("ab"));
        assert_eq!(ts.peek(), &Token::Letter('a'));
        ts.next();
        assert_eq!(ts.peek(), &Token::Letter('b'));
        ts.next();
        assert_eq!(ts.peek(), &Token::Eof);
    }

    #[test]
    fn token_stream_skip_whitespace() {
        let mut ts = TokenStream::new(tokenize("a   b"));
        ts.next(); // consume 'a'
        ts.skip_whitespace();
        assert_eq!(ts.peek(), &Token::Letter('b'));
    }
}