mitex_lexer/
token.rs

1use logos::{Logos, Source};
2use mitex_spec::CommandSpec;
3
/// Brace kinds in TeX, used as the payload of [`Token::Left`] and
/// [`Token::Right`] to record which delimiter was lexed.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash)]
pub enum BraceKind {
    /// Curly braces: `{` or `}`
    Curly,
    /// Brackets (square braces): `[` or `]`
    Bracket,
    /// Parentheses: `(` or `)`
    Paren,
}
14
/// The token types defined in logos
///
/// For naming of marks, see <https://en.wikipedia.org/wiki/List_of_typographical_symbols_and_punctuation_marks>
///
/// The `#[regex]` / `#[token]` attributes on each variant also specify how
/// logos lexes that token. The lexer extras are a pair: the [`CommandSpec`]
/// (consulted when lexing starred command names) and a [`logos::Span`]
/// (written with the environment name range when lexing `\begin` / `\end`).
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash, Logos)]
#[logos(extras = (CommandSpec, logos::Span))]
pub enum Token {
    /// A line break
    /// One or more consecutive `\r` / `\n` characters, typically a `\r\n` or
    /// `\n`
    #[regex(r"[\r\n]+", priority = 2)]
    LineBreak,

    /// A whitespace sequence that doesn't contain line breaks
    /// Typically a space or a tab
    #[regex(r"[^\S\r\n]+", priority = 1)]
    Whitespace,

    /// A comment that starts with `%` and runs up to (but does not include)
    /// the next line break
    /// E.g.
    ///
    /// ```tex
    /// % This is a comment
    /// ```
    #[regex(r"%[^\r\n]*")]
    LineComment,

    /// Left braces
    /// E.g. `{`, `[`, `(`, etc.
    /// See [`BraceKind`] for braces.
    #[token("{", bc)]
    #[token("[", bb)]
    #[token("(", bp)]
    Left(BraceKind),

    /// Right braces
    /// E.g. `}`, `]`, `)`, etc.
    /// See [`BraceKind`] for braces.
    #[token("}", bc)]
    #[token("]", bb)]
    #[token(")", bp)]
    Right(BraceKind),

    /// An ascii comma
    #[token(",")]
    Comma,

    /// An ascii tilde
    #[token("~")]
    Tilde,

    /// An ascii slash
    #[token("/")]
    Slash,

    /// An ascii ampersand
    #[token("&")]
    Ampersand,

    /// An ascii caret
    #[token("^")]
    Caret,

    /// An ascii apostrophe
    #[token("'")]
    Apostrophe,

    /// An ascii ditto (double quote, `"`)
    #[token("\"")]
    Ditto,

    /// An ascii semicolon
    #[token(";")]
    Semicolon,

    /// An ascii hash
    #[token("#")]
    Hash,

    /// An ascii asterisk
    #[token("*")]
    Asterisk,

    /// An ascii atsign
    #[token("@")]
    AtSign,

    /// An ascii underscore
    #[token("_", priority = 2)]
    Underscore,

    /// A character sequence that doesn't contain whitespace, a backslash, or
    /// any character that starts one of the other tokens
    #[regex(r#"[^\s\\%\{\},\$\[\]\(\)\~/_\*@'";&^#]+"#, priority = 1)]
    Word,

    /// Special dollar signs: a single `$` or a double `$$`
    #[regex(r"\$\$?")]
    Dollar,

    /// Though newline is also a valid command, whose name is `\`, we lex it
    /// independently so to help later AST consumers. This also means that user
    /// cannot redefine `\` as a command.
    #[regex(r"\\\\", priority = 4)]
    NewLine,

    /// A command that starts with a backslash; the rest of the name is
    /// consumed by the `lex_command_name` callback
    /// Note: backslash (`\`) is a command without name
    /// Note: An escape sequence is a command with any single unicode char
    #[regex(r"\\", lex_command_name, priority = 3)]
    CommandName(CommandName),

    /// Macro error
    /// Note: this variant has no lexer pattern attached in this enum
    Error,

    /// A macro argument
    /// Note: this variant has no lexer pattern attached in this enum; the
    /// `u8` is presumably the argument index (TODO confirm against the
    /// producer of this token)
    MacroArg(u8),
}
132
133impl Token {
134    /// Determine whether the token is trivia
135    pub fn is_trivia(&self) -> bool {
136        use Token::*;
137        matches!(self, LineBreak | Whitespace | LineComment)
138    }
139}
140
/// The kind of an `\if...` command, carried by [`CommandName::If`] for the
/// parser
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash)]
pub enum IfCommandName {
    /// \if
    If,
    /// \iftypst
    IfTypst,
    /// \iffalse
    IfFalse,
    /// \iftrue
    IfTrue,
    /// \ifcase
    IfCase,
    /// \ifnum
    IfNum,
    /// \ifcat
    IfCat,
    /// \ifx
    IfX,
    /// \ifvoid
    IfVoid,
    /// \ifhbox
    IfHBox,
    /// \ifvbox
    IfVBox,
    /// \ifhmode
    IfHMode,
    /// \ifmmode
    IfMMode,
    /// \ifvmode
    IfVMode,
    /// \ifinner
    IfInner,
    /// \ifdim
    IfDim,
    /// \ifeof
    IfEof,
    /// \@ifstar
    IfStar,
}
181
/// The command name used by parser
///
/// Commands the parser treats specially get their own variant; every other
/// command is [`CommandName::Generic`].
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash)]
pub enum CommandName {
    /// Rest of the command names
    Generic,
    /// clause of Environment: \begin
    BeginEnvironment,
    /// clause of Environment: \end
    EndEnvironment,
    /// clause of Math: \( or \[
    BeginMath,
    /// clause of Math: \) or \]
    EndMath,
    /// clause of Environment: \begin, but the following `{name}` could not be
    /// lexed
    ErrorBeginEnvironment,
    /// clause of Environment: \end, but the following `{name}` could not be
    /// lexed
    ErrorEndEnvironment,
    /// clause of IfStatements: \if...
    If(IfCommandName),
    /// clause of IfStatements: \else
    Else,
    /// clause of IfStatements: \fi
    EndIf,
    /// clause of LRItem: \left
    Left,
    /// clause of LRItem: \right
    Right,
}
210
211/// Mark the brace kind of a token as curly
212#[inline(always)]
213fn bc(_: &mut logos::Lexer<Token>) -> BraceKind {
214    BraceKind::Curly
215}
216
217/// Mark the brace kind of a token as bracket
218#[inline(always)]
219fn bb(_: &mut logos::Lexer<Token>) -> BraceKind {
220    BraceKind::Bracket
221}
222
223/// Mark the brace kind of a token as parenthesis
224#[inline(always)]
225fn bp(_: &mut logos::Lexer<Token>) -> BraceKind {
226    BraceKind::Paren
227}
228
/// The utf8 length of ascii chars, in bytes
///
/// Used as the step size when counting bytes of ascii command names.
const LEN_ASCII: usize = 1;
231
232/// Lex a valid command name
233// todo: handle commands with underscores, whcih would require command names
234// todo: from specification
235fn lex_command_name(lexer: &mut logos::Lexer<Token>) -> CommandName {
236    use IfCommandName::*;
237    let command_start = &lexer.source()[lexer.span().end..];
238
239    // Get the first char in utf8 case
240    let c = match command_start.chars().next() {
241        Some(c) => c,
242        None => return CommandName::Generic,
243    };
244
245    // Case1: `\ ` is not a command name hence the command is empty
246    // Note: a space is not a command name
247    if c.is_whitespace() {
248        return CommandName::Generic;
249    }
250
251    // Case2: `\.*` is a command name, e.g. `\;` is a space command in TeX
252    // Note: the first char is always legal, since a backslash with any single char
253    // is a valid escape sequence
254    lexer.bump(c.len_utf8());
255
256    // Lex the command name if it is not an escape sequence
257    match c {
258        '(' | '[' => return CommandName::BeginMath,
259        ')' | ']' => return CommandName::EndMath,
260        '@' => {}
261        _ if !c.is_ascii_alphabetic() => return CommandName::Generic,
262        _ => {}
263    }
264
265    // Case3 (Rest): lex a general ascii command name
266    // We treat the command name as ascii to improve performance slightly
267    let ascii_str = &command_start.as_bytes()[LEN_ASCII..];
268    let bump_size = advance_ascii_name(lexer, ascii_str, true);
269    lexer.bump(bump_size);
270
271    let name = &command_start[..LEN_ASCII + bump_size];
272    match name {
273        "if" => CommandName::If(If),
274        "iftypst" => CommandName::If(IfTypst),
275        "iffalse" => CommandName::If(IfFalse),
276        "iftrue" => CommandName::If(IfTrue),
277        "ifcase" => CommandName::If(IfCase),
278        "ifnum" => CommandName::If(IfNum),
279        "ifcat" => CommandName::If(IfCat),
280        "ifx" => CommandName::If(IfX),
281        "ifvoid" => CommandName::If(IfVoid),
282        "ifhbox" => CommandName::If(IfHBox),
283        "ifvbox" => CommandName::If(IfVBox),
284        "ifhmode" => CommandName::If(IfHMode),
285        "ifmmode" => CommandName::If(IfMMode),
286        "ifvmode" => CommandName::If(IfVMode),
287        "ifinner" => CommandName::If(IfInner),
288        "ifdim" => CommandName::If(IfDim),
289        "ifeof" => CommandName::If(IfEof),
290        "@ifstar" => CommandName::If(IfStar),
291        "else" => CommandName::Else,
292        "fi" => CommandName::EndIf,
293        "left" => CommandName::Left,
294        "right" => CommandName::Right,
295        "begin" => lex_begin_end(lexer, true),
296        "end" => lex_begin_end(lexer, false),
297        _ => CommandName::Generic,
298    }
299}
300
301fn advance_ascii_name(
302    lexer: &mut logos::Lexer<Token>,
303    ascii_str: &[u8],
304    lex_slash_command: bool,
305) -> usize {
306    let mut bump_size = 0;
307    for c in ascii_str {
308        match c {
309            // Find the command name in the spec
310            // If a starred command is not found, recover to a normal command
311            // This is the same behavior as TeX
312            //
313            // We can build a regex set to improve performance
314            // but overall this is not a bottleneck so we don't do it now
315            // And RegexSet heavily increases the binary size
316            b'*' => {
317                let verified = if lex_slash_command {
318                    let spec = &lexer.extras.0;
319                    // for char `\`, etc.
320                    let s = lexer.span().start + 1;
321                    // for char  `*`
322                    let s = s..s + bump_size + 2;
323                    let t = lexer.source().slice(s);
324                    t.and_then(|s| spec.get(s)).is_some()
325                } else {
326                    true
327                };
328
329                if verified {
330                    bump_size += LEN_ASCII;
331                }
332
333                break;
334            }
335            c if c.is_ascii_alphabetic() => bump_size += LEN_ASCII,
336            // todo: math mode don't want :
337            // b'@' | b':' => bump_size += LEN_ASCII,
338            b'@' => bump_size += LEN_ASCII,
339            _ => break,
340        };
341    }
342
343    bump_size
344}
345
346fn lex_begin_end(lexer: &mut logos::Lexer<Token>, is_begin: bool) -> CommandName {
347    struct LexTask<'a, 'b> {
348        lexer: &'a mut logos::Lexer<'b, Token>,
349        chars: std::str::Chars<'b>,
350        collected: usize,
351    }
352
353    impl<'a, 'b> LexTask<'a, 'b> {
354        fn new(lexer: &'a mut logos::Lexer<'b, Token>) -> Self {
355            Self {
356                chars: lexer.source()[lexer.span().end..].chars(),
357                lexer,
358                collected: 0,
359            }
360        }
361
362        fn next_non_trivia(&mut self) -> Option<char> {
363            loop {
364                let c = match self.chars.next() {
365                    Some(c) => c,
366                    None => break None,
367                };
368
369                if c.is_whitespace() {
370                    self.collected += c.len_utf8();
371                    continue;
372                }
373
374                if c == '%' {
375                    self.collected += c.len_utf8();
376                    for c in self.chars.by_ref() {
377                        if c == '\n' || c == '\r' {
378                            break;
379                        }
380                        self.collected += c.len_utf8();
381                    }
382                    continue;
383                }
384
385                self.collected += c.len_utf8();
386                return Some(c);
387            }
388        }
389
390        #[inline(always)]
391        fn work(&mut self) -> Option<()> {
392            let c = self.next_non_trivia()?;
393
394            if c != '{' {
395                return None;
396            }
397
398            let ns = self.lexer.span().end + self.collected;
399            let ascii_str = self.lexer.source()[ns..].as_bytes();
400
401            let bump_size = advance_ascii_name(self.lexer, ascii_str, false);
402            self.lexer.extras.1 = ns..ns + bump_size;
403            self.collected += bump_size;
404            self.chars = self.lexer.source()[ns + bump_size..].chars();
405
406            let c = self.next_non_trivia()?;
407            if c != '}' {
408                return None;
409            }
410
411            self.lexer.bump(self.collected);
412            Some(())
413        }
414    }
415
416    let mut task = LexTask::new(lexer);
417    match (task.work(), is_begin) {
418        (Some(..), true) => CommandName::BeginEnvironment,
419        (Some(..), false) => CommandName::EndEnvironment,
420        (None, true) => CommandName::ErrorBeginEnvironment,
421        (None, false) => CommandName::ErrorEndEnvironment,
422    }
423}