texform_core/lexer.rs
1//! LaTeX lexical analysis powered by [Logos](https://docs.rs/logos).
2//!
3//! The lexer maps LaTeX source bytes into a flat stream of [`Token`]s.
4//! It follows the TeX catcode model in a simplified form:
5//!
6//! - Catcode 0 (Escape) triggers control-sequence scanning.
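//! - Catcodes 1–4 and 6–8 map to dedicated structural tokens.
//! - Catcode 5 (End of line, `\n`) is folded into the whitespace rule.
//! - Catcode 13 (Active, `~`) maps to [`Token::ActiveChar`].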
8//! - Catcode 10 (Spacer) and catcode 14 (Comment) are handled as
9//! whitespace / skip rules.
10//! - Catcodes 11/12 (Letter/Other) fall through to [`Token::Char`].
11//! - Catcodes 9 (Ignore) and 15 (Invalid) are not matched by any rule
12//! and produce lexer errors automatically.
13//!
14//! The lexer is intentionally lossy: comments are discarded and runs of
15//! whitespace are collapsed, matching TeXForm's normalization goals.
16
17use logos::Logos;
18
19/// Token types for LaTeX lexical analysis.
20///
21/// This lexer recognizes LaTeX tokens based on character categories (catcode).
22/// It provides a simplified view where special characters and control sequences
23/// are identified, while preserving enough information for parsing.
24#[derive(Logos, Debug, PartialEq, Clone)]
25pub enum Token {
26 // --- Control Sequences ---
27 /// Control sequence: \command
28 /// - catcode 0 (Escape): backslash triggers control sequence scanning
29 /// - Matches: \<letters> (control word) or \<single-char> (control symbol)
30 /// - Returns the command name without the backslash
31 #[regex(r"\\(?:[a-zA-Z]+|.)", |lex| {
32 let slice = lex.slice();
33 slice[1..].to_string()
34 })]
35 ControlSeq(String),
36
37 /// Active character: ~
38 /// - catcode 13: Active Character
39 /// - Treated as a command but without escape character
40 /// - In LaTeX, ~ produces a non-breaking space
41 #[token("~")]
42 ActiveChar,
43
44 // --- Structural Tokens ---
45 /// Left brace: {
46 /// - catcode 1: Begin Group
47 /// - Used for grouping and delimiting arguments
48 #[token("{")]
49 LBrace,
50
51 /// Right brace: }
52 /// - catcode 2: End Group
53 /// - Closes groups started by LBrace
54 #[token("}")]
55 RBrace,
56
57 /// Dollar sign: $
58 /// - catcode 3: Math Shift
59 /// - Toggles inline math mode; $$ indicates display math
60 #[token("$")]
61 MathShift,
62
63 /// Ampersand: &
64 /// - catcode 4: Alignment Tab
65 /// - Used in tables and alignment environments
66 #[token("&")]
67 Alignment,
68
69 /// Hash/pound sign: #
70 /// - catcode 6: Parameter
71 /// - Used in macro definitions and arguments
72 #[token("#")]
73 Parameter,
74
75 /// Caret: ^
76 /// - catcode 7: Superscript
77 /// - Indicates superscript in math mode
78 #[token("^")]
79 Superscript,
80
81 /// Underscore: _
82 /// - catcode 8: Subscript
83 /// - Indicates subscript in math mode
84 #[token("_")]
85 Subscript,
86
87 /// Star/Asterisk: *
88 /// - catcode 12: Other
89 /// - Used for starred command variants (e.g., \section*)
90 /// - Must be checked immediately after command names
91 #[token("*")]
92 Star,
93
94 /// Left bracket: [
95 /// - catcode 12: Other
96 /// - Often used for optional arguments
97 #[token("[")]
98 LBracket,
99
100 /// Right bracket: ]
101 /// - catcode 12: Other
102 /// - Closes optional arguments
103 #[token("]")]
104 RBracket,
105
106 /// Prime mark(s): one or more ' or U+2019
107 /// - In math mode, represents derivative notation (f' = f^\prime)
108 /// - Multiple primes are common: f'', f'''
109 /// - We store the count to simplify parser handling
110 #[regex(r"['\u2019]+", callback = |lex| lex.slice().chars().count())]
111 Prime(usize),
112
113 // --- Whitespace and Comments ---
114 /// Whitespace: spaces, tabs, newlines, form feeds, non-breaking space
115 /// - catcode 10: Spacer
116 /// - Multiple consecutive whitespace characters are merged
117 /// - Includes U+00A0 (non-breaking space) for copy-paste behavior
118 #[regex(r"[ \t\n\f\u{00A0}]+")]
119 Whitespaces,
120
121 /// Comment: % to end of line
122 /// - catcode 14: Comment
123 /// - Lexer consumes everything from % to line end (inclusive)
124 /// - Comments are discarded and do not produce tokens
125 #[regex(r"%[^\n]*\n?", logos::skip)]
126 Comment,
127
128 // --- Character Tokens ---
129 /// Regular character: letters, digits, punctuation, Unicode (excluding invalid chars)
130 /// - catcode 11: Letter (a-z, A-Z)
131 /// - catcode 12: Other (digits, punctuation, etc.)
132 /// - Matches any single printable character not covered by above patterns
133 /// - Has lowest priority (1) to act as fallback
134 ///
135 /// Note: Control characters (catcode 9, 15) are NOT matched by any pattern
136 /// and will cause lexing errors automatically:
137 /// - catcode 9 (Ignore): \x00-\x08, \x0B-\x1F (control chars except \t, \n, \f)
138 /// - catcode 15 (Invalid): \x7F (DEL character)
139 #[regex(r"[\x20-\x7E\u{80}-\u{10FFFF}]", priority = 1, callback = |lex| {
140 let slice = lex.slice();
141 slice.chars().next().unwrap()
142 })]
143 Char(char),
144}
145
146impl std::fmt::Display for Token {
147 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
148 match self {
149 Token::ControlSeq(name) => write!(f, "\\{name}"),
150 Token::ActiveChar => write!(f, "~"),
151 Token::LBrace => write!(f, "{{"),
152 Token::RBrace => write!(f, "}}"),
153 Token::MathShift => write!(f, "$"),
154 Token::Alignment => write!(f, "&"),
155 Token::Parameter => write!(f, "#"),
156 Token::Superscript => write!(f, "^"),
157 Token::Subscript => write!(f, "_"),
158 Token::Star => write!(f, "*"),
159 Token::LBracket => write!(f, "["),
160 Token::RBracket => write!(f, "]"),
161 Token::Prime(n) => {
162 for _ in 0..*n {
163 write!(f, "'")?;
164 }
165 Ok(())
166 }
167 Token::Whitespaces => write!(f, " "),
168 Token::Comment => write!(f, "%"),
169 Token::Char(c) => write!(f, "{c}"),
170 }
171 }
172}