1use logos::{Logos, Source};
2use mitex_spec::CommandSpec;
3
/// The kind of bracket carried by [`Token::Left`] and [`Token::Right`].
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash)]
pub enum BraceKind {
    /// Curly braces: `{` or `}`.
    Curly,
    /// Square brackets: `[` or `]`.
    Bracket,
    /// Parentheses: `(` or `)`.
    Paren,
}
14
/// The tokens produced by the lexer, together with the logos rules that match them.
///
/// The lexer extras are a `(CommandSpec, logos::Span)` pair: the spec is consulted
/// when deciding whether a starred command name exists, and the span is overwritten
/// with the byte range of the environment name while lexing `\begin` / `\end`.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash, Logos)]
#[logos(extras = (CommandSpec, logos::Span))]
pub enum Token {
    /// One or more consecutive `\r` / `\n` characters.
    #[regex(r"[\r\n]+", priority = 2)]
    LineBreak,

    /// One or more whitespace characters that are not `\r` or `\n`.
    #[regex(r"[^\S\r\n]+", priority = 1)]
    Whitespace,

    /// A `%` and everything after it up to, but not including, the next `\r` or `\n`.
    #[regex(r"%[^\r\n]*")]
    LineComment,

    /// An opening bracket: `{`, `[` or `(`. The payload tells which one.
    #[token("{", bc)]
    #[token("[", bb)]
    #[token("(", bp)]
    Left(BraceKind),

    /// A closing bracket: `}`, `]` or `)`. The payload tells which one.
    #[token("}", bc)]
    #[token("]", bb)]
    #[token(")", bp)]
    Right(BraceKind),

    /// The character `,`.
    #[token(",")]
    Comma,

    /// The character `~`.
    #[token("~")]
    Tilde,

    /// The character `/`.
    #[token("/")]
    Slash,

    /// The character `&`.
    #[token("&")]
    Ampersand,

    /// The character `^`.
    #[token("^")]
    Caret,

    /// The character `'`.
    #[token("'")]
    Apostrophe,

    /// The character `"`.
    #[token("\"")]
    Ditto,

    /// The character `;`.
    #[token(";")]
    Semicolon,

    /// The character `#`.
    #[token("#")]
    Hash,

    /// The character `*`.
    #[token("*")]
    Asterisk,

    /// The character `@`.
    #[token("@")]
    AtSign,

    /// The character `_`.
    #[token("_", priority = 2)]
    Underscore,

    /// A run of one or more characters, none of which is whitespace or one of
    /// `\`, `%`, `{`, `}`, `,`, `$`, `[`, `]`, `(`, `)`, `~`, `/`, `_`, `*`, `@`,
    /// `'`, `"`, `;`, `&`, `^`, `#`.
    #[regex(r#"[^\s\\%\{\},\$\[\]\(\)\~/_\*@'";&^#]+"#, priority = 1)]
    Word,

    /// A single `$` or a double `$$`.
    #[regex(r"\$\$?")]
    Dollar,

    /// Two consecutive backslashes (`\\`).
    #[regex(r"\\\\", priority = 4)]
    NewLine,

    /// A backslash followed by whatever `lex_command_name` consumes after it.
    /// The payload classifies the command; a backslash on its own, or one followed
    /// by whitespace, yields [`CommandName::Generic`] with nothing else consumed.
    #[regex(r"\\", lex_command_name, priority = 3)]
    CommandName(CommandName),

    /// An error token. No lexing rule is attached to it in this enum.
    Error,

    /// A macro argument carrying a `u8` payload. No lexing rule is attached to it
    /// in this enum.
    // NOTE(review): presumably the payload is the argument index and the token is
    // produced by a later stage rather than by logos - confirm against the users.
    MacroArg(u8),
}
132
133impl Token {
134 pub fn is_trivia(&self) -> bool {
136 use Token::*;
137 matches!(self, LineBreak | Whitespace | LineComment)
138 }
139}
140
/// The conditional commands that the lexer recognizes by name and reports as
/// [`CommandName::If`].
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash)]
pub enum IfCommandName {
    /// Lexed from `\if`.
    If,
    /// Lexed from `\iftypst`.
    IfTypst,
    /// Lexed from `\iffalse`.
    IfFalse,
    /// Lexed from `\iftrue`.
    IfTrue,
    /// Lexed from `\ifcase`.
    IfCase,
    /// Lexed from `\ifnum`.
    IfNum,
    /// Lexed from `\ifcat`.
    IfCat,
    /// Lexed from `\ifx`.
    IfX,
    /// Lexed from `\ifvoid`.
    IfVoid,
    /// Lexed from `\ifhbox`.
    IfHBox,
    /// Lexed from `\ifvbox`.
    IfVBox,
    /// Lexed from `\ifhmode`.
    IfHMode,
    /// Lexed from `\ifmmode`.
    IfMMode,
    /// Lexed from `\ifvmode`.
    IfVMode,
    /// Lexed from `\ifinner`.
    IfInner,
    /// Lexed from `\ifdim`.
    IfDim,
    /// Lexed from `\ifeof`.
    IfEof,
    /// Lexed from `\@ifstar`.
    IfStar,
}
181
/// The classification attached to a [`Token::CommandName`].
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash)]
pub enum CommandName {
    /// Any command that is not one of the specially recognized ones below. This
    /// includes a backslash on its own, a backslash followed by whitespace, and
    /// a backslash followed by a single non-alphabetic character.
    Generic,
    /// `\begin` followed by a `{name}` group that was lexed successfully.
    BeginEnvironment,
    /// `\end` followed by a `{name}` group that was lexed successfully.
    EndEnvironment,
    /// `\(` or `\[`.
    BeginMath,
    /// `\)` or `\]`.
    EndMath,
    /// `\begin` for which no well-formed `{name}` group could be lexed.
    ErrorBeginEnvironment,
    /// `\end` for which no well-formed `{name}` group could be lexed.
    ErrorEndEnvironment,
    /// One of the conditional commands; see [`IfCommandName`].
    If(IfCommandName),
    /// `\else`.
    Else,
    /// `\fi`.
    EndIf,
    /// `\left`.
    Left,
    /// `\right`.
    Right,
}
210
211#[inline(always)]
213fn bc(_: &mut logos::Lexer<Token>) -> BraceKind {
214 BraceKind::Curly
215}
216
217#[inline(always)]
219fn bb(_: &mut logos::Lexer<Token>) -> BraceKind {
220 BraceKind::Bracket
221}
222
223#[inline(always)]
225fn bp(_: &mut logos::Lexer<Token>) -> BraceKind {
226 BraceKind::Paren
227}
228
/// Number of bytes one ASCII character occupies in the source; used as the step
/// when counting or skipping ASCII bytes of a command name.
const LEN_ASCII: usize = 1;
231
232fn lex_command_name(lexer: &mut logos::Lexer<Token>) -> CommandName {
236 use IfCommandName::*;
237 let command_start = &lexer.source()[lexer.span().end..];
238
239 let c = match command_start.chars().next() {
241 Some(c) => c,
242 None => return CommandName::Generic,
243 };
244
245 if c.is_whitespace() {
248 return CommandName::Generic;
249 }
250
251 lexer.bump(c.len_utf8());
255
256 match c {
258 '(' | '[' => return CommandName::BeginMath,
259 ')' | ']' => return CommandName::EndMath,
260 '@' => {}
261 _ if !c.is_ascii_alphabetic() => return CommandName::Generic,
262 _ => {}
263 }
264
265 let ascii_str = &command_start.as_bytes()[LEN_ASCII..];
268 let bump_size = advance_ascii_name(lexer, ascii_str, true);
269 lexer.bump(bump_size);
270
271 let name = &command_start[..LEN_ASCII + bump_size];
272 match name {
273 "if" => CommandName::If(If),
274 "iftypst" => CommandName::If(IfTypst),
275 "iffalse" => CommandName::If(IfFalse),
276 "iftrue" => CommandName::If(IfTrue),
277 "ifcase" => CommandName::If(IfCase),
278 "ifnum" => CommandName::If(IfNum),
279 "ifcat" => CommandName::If(IfCat),
280 "ifx" => CommandName::If(IfX),
281 "ifvoid" => CommandName::If(IfVoid),
282 "ifhbox" => CommandName::If(IfHBox),
283 "ifvbox" => CommandName::If(IfVBox),
284 "ifhmode" => CommandName::If(IfHMode),
285 "ifmmode" => CommandName::If(IfMMode),
286 "ifvmode" => CommandName::If(IfVMode),
287 "ifinner" => CommandName::If(IfInner),
288 "ifdim" => CommandName::If(IfDim),
289 "ifeof" => CommandName::If(IfEof),
290 "@ifstar" => CommandName::If(IfStar),
291 "else" => CommandName::Else,
292 "fi" => CommandName::EndIf,
293 "left" => CommandName::Left,
294 "right" => CommandName::Right,
295 "begin" => lex_begin_end(lexer, true),
296 "end" => lex_begin_end(lexer, false),
297 _ => CommandName::Generic,
298 }
299}
300
301fn advance_ascii_name(
302 lexer: &mut logos::Lexer<Token>,
303 ascii_str: &[u8],
304 lex_slash_command: bool,
305) -> usize {
306 let mut bump_size = 0;
307 for c in ascii_str {
308 match c {
309 b'*' => {
317 let verified = if lex_slash_command {
318 let spec = &lexer.extras.0;
319 let s = lexer.span().start + 1;
321 let s = s..s + bump_size + 2;
323 let t = lexer.source().slice(s);
324 t.and_then(|s| spec.get(s)).is_some()
325 } else {
326 true
327 };
328
329 if verified {
330 bump_size += LEN_ASCII;
331 }
332
333 break;
334 }
335 c if c.is_ascii_alphabetic() => bump_size += LEN_ASCII,
336 b'@' => bump_size += LEN_ASCII,
339 _ => break,
340 };
341 }
342
343 bump_size
344}
345
346fn lex_begin_end(lexer: &mut logos::Lexer<Token>, is_begin: bool) -> CommandName {
347 struct LexTask<'a, 'b> {
348 lexer: &'a mut logos::Lexer<'b, Token>,
349 chars: std::str::Chars<'b>,
350 collected: usize,
351 }
352
353 impl<'a, 'b> LexTask<'a, 'b> {
354 fn new(lexer: &'a mut logos::Lexer<'b, Token>) -> Self {
355 Self {
356 chars: lexer.source()[lexer.span().end..].chars(),
357 lexer,
358 collected: 0,
359 }
360 }
361
362 fn next_non_trivia(&mut self) -> Option<char> {
363 loop {
364 let c = match self.chars.next() {
365 Some(c) => c,
366 None => break None,
367 };
368
369 if c.is_whitespace() {
370 self.collected += c.len_utf8();
371 continue;
372 }
373
374 if c == '%' {
375 self.collected += c.len_utf8();
376 for c in self.chars.by_ref() {
377 if c == '\n' || c == '\r' {
378 break;
379 }
380 self.collected += c.len_utf8();
381 }
382 continue;
383 }
384
385 self.collected += c.len_utf8();
386 return Some(c);
387 }
388 }
389
390 #[inline(always)]
391 fn work(&mut self) -> Option<()> {
392 let c = self.next_non_trivia()?;
393
394 if c != '{' {
395 return None;
396 }
397
398 let ns = self.lexer.span().end + self.collected;
399 let ascii_str = self.lexer.source()[ns..].as_bytes();
400
401 let bump_size = advance_ascii_name(self.lexer, ascii_str, false);
402 self.lexer.extras.1 = ns..ns + bump_size;
403 self.collected += bump_size;
404 self.chars = self.lexer.source()[ns + bump_size..].chars();
405
406 let c = self.next_non_trivia()?;
407 if c != '}' {
408 return None;
409 }
410
411 self.lexer.bump(self.collected);
412 Some(())
413 }
414 }
415
416 let mut task = LexTask::new(lexer);
417 match (task.work(), is_begin) {
418 (Some(..), true) => CommandName::BeginEnvironment,
419 (Some(..), false) => CommandName::EndEnvironment,
420 (None, true) => CommandName::ErrorBeginEnvironment,
421 (None, false) => CommandName::ErrorEndEnvironment,
422 }
423}