texform_core/
lexer.rs

1//! LaTeX lexical analysis powered by [Logos](https://docs.rs/logos).
2//!
3//! The lexer maps LaTeX source bytes into a flat stream of [`Token`]s.
4//! It follows the TeX catcode model in a simplified form:
5//!
6//! - Catcode 0 (Escape) triggers control-sequence scanning.
//! - Catcodes 1–4 and 6–8 map to dedicated structural tokens; catcode 5
//!   (End of line) has no token of its own and is folded into whitespace.
8//! - Catcode 10 (Spacer) and catcode 14 (Comment) are handled as
9//!   whitespace / skip rules.
10//! - Catcodes 11/12 (Letter/Other) fall through to [`Token::Char`].
11//! - Catcodes 9 (Ignore) and 15 (Invalid) are not matched by any rule
12//!   and produce lexer errors automatically.
13//!
14//! The lexer is intentionally lossy: comments are discarded and runs of
15//! whitespace are collapsed, matching TeXForm's normalization goals.
16
17use logos::Logos;
18
19/// Token types for LaTeX lexical analysis.
20///
21/// This lexer recognizes LaTeX tokens based on character categories (catcode).
22/// It provides a simplified view where special characters and control sequences
23/// are identified, while preserving enough information for parsing.
24#[derive(Logos, Debug, PartialEq, Clone)]
25pub enum Token {
26    // --- Control Sequences ---
27    /// Control sequence: \command
28    /// - catcode 0 (Escape): backslash triggers control sequence scanning
29    /// - Matches: \<letters> (control word) or \<single-char> (control symbol)
30    /// - Returns the command name without the backslash
31    #[regex(r"\\(?:[a-zA-Z]+|.)", |lex| {
32        let slice = lex.slice();
33        slice[1..].to_string()
34    })]
35    ControlSeq(String),
36
37    /// Active character: ~
38    /// - catcode 13: Active Character
39    /// - Treated as a command but without escape character
40    /// - In LaTeX, ~ produces a non-breaking space
41    #[token("~")]
42    ActiveChar,
43
44    // --- Structural Tokens ---
45    /// Left brace: {
46    /// - catcode 1: Begin Group
47    /// - Used for grouping and delimiting arguments
48    #[token("{")]
49    LBrace,
50
51    /// Right brace: }
52    /// - catcode 2: End Group
53    /// - Closes groups started by LBrace
54    #[token("}")]
55    RBrace,
56
57    /// Dollar sign: $
58    /// - catcode 3: Math Shift
59    /// - Toggles inline math mode; $$ indicates display math
60    #[token("$")]
61    MathShift,
62
63    /// Ampersand: &
64    /// - catcode 4: Alignment Tab
65    /// - Used in tables and alignment environments
66    #[token("&")]
67    Alignment,
68
69    /// Hash/pound sign: #
70    /// - catcode 6: Parameter
71    /// - Used in macro definitions and arguments
72    #[token("#")]
73    Parameter,
74
75    /// Caret: ^
76    /// - catcode 7: Superscript
77    /// - Indicates superscript in math mode
78    #[token("^")]
79    Superscript,
80
81    /// Underscore: _
82    /// - catcode 8: Subscript
83    /// - Indicates subscript in math mode
84    #[token("_")]
85    Subscript,
86
87    /// Star/Asterisk: *
88    /// - catcode 12: Other
89    /// - Used for starred command variants (e.g., \section*)
90    /// - Must be checked immediately after command names
91    #[token("*")]
92    Star,
93
94    /// Left bracket: [
95    /// - catcode 12: Other
96    /// - Often used for optional arguments
97    #[token("[")]
98    LBracket,
99
100    /// Right bracket: ]
101    /// - catcode 12: Other
102    /// - Closes optional arguments
103    #[token("]")]
104    RBracket,
105
106    /// Prime mark(s): one or more ' or U+2019
107    /// - In math mode, represents derivative notation (f' = f^\prime)
108    /// - Multiple primes are common: f'', f'''
109    /// - We store the count to simplify parser handling
110    #[regex(r"['\u2019]+", callback = |lex| lex.slice().chars().count())]
111    Prime(usize),
112
113    // --- Whitespace and Comments ---
114    /// Whitespace: spaces, tabs, newlines, form feeds, non-breaking space
115    /// - catcode 10: Spacer
116    /// - Multiple consecutive whitespace characters are merged
117    /// - Includes U+00A0 (non-breaking space) for copy-paste behavior
118    #[regex(r"[ \t\n\f\u{00A0}]+")]
119    Whitespaces,
120
121    /// Comment: % to end of line
122    /// - catcode 14: Comment
123    /// - Lexer consumes everything from % to line end (inclusive)
124    /// - Comments are discarded and do not produce tokens
125    #[regex(r"%[^\n]*\n?", logos::skip)]
126    Comment,
127
128    // --- Character Tokens ---
129    /// Regular character: letters, digits, punctuation, Unicode (excluding invalid chars)
130    /// - catcode 11: Letter (a-z, A-Z)
131    /// - catcode 12: Other (digits, punctuation, etc.)
132    /// - Matches any single printable character not covered by above patterns
133    /// - Has lowest priority (1) to act as fallback
134    ///
135    /// Note: Control characters (catcode 9, 15) are NOT matched by any pattern
136    /// and will cause lexing errors automatically:
137    /// - catcode 9 (Ignore): \x00-\x08, \x0B-\x1F (control chars except \t, \n, \f)
138    /// - catcode 15 (Invalid): \x7F (DEL character)
139    #[regex(r"[\x20-\x7E\u{80}-\u{10FFFF}]", priority = 1, callback = |lex| {
140        let slice = lex.slice();
141        slice.chars().next().unwrap()
142    })]
143    Char(char),
144}
145
146impl std::fmt::Display for Token {
147    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
148        match self {
149            Token::ControlSeq(name) => write!(f, "\\{name}"),
150            Token::ActiveChar => write!(f, "~"),
151            Token::LBrace => write!(f, "{{"),
152            Token::RBrace => write!(f, "}}"),
153            Token::MathShift => write!(f, "$"),
154            Token::Alignment => write!(f, "&"),
155            Token::Parameter => write!(f, "#"),
156            Token::Superscript => write!(f, "^"),
157            Token::Subscript => write!(f, "_"),
158            Token::Star => write!(f, "*"),
159            Token::LBracket => write!(f, "["),
160            Token::RBracket => write!(f, "]"),
161            Token::Prime(n) => {
162                for _ in 0..*n {
163                    write!(f, "'")?;
164                }
165                Ok(())
166            }
167            Token::Whitespaces => write!(f, " "),
168            Token::Comment => write!(f, "%"),
169            Token::Char(c) => write!(f, "{c}"),
170        }
171    }
172}
texform_core/lexer.rs

texform_core/
lexer.rs