/// A single lexical token produced by `tokenize`.
#[derive(Debug, Clone, PartialEq)]
pub(crate) enum Token {
    LBrace,
    RBrace,

    Caret,
    Underscore,
    Ampersand,
    Tilde,
    LParen,
    RParen,
    LBracket,
    RBracket,
    Pipe,

    Plus,
    Minus,
    Equals,
    LessThan,
    GreaterThan,
    Comma,
    Semicolon,
    Colon,
    Bang,
    Prime,
    Command(String),

    DoubleBackslash,

    ThinSpace,
    MedSpace,
    NegThinSpace,
    Letter(char),
    Digit(char),
    Dot,
    Begin(String),
    End(String),

    Whitespace,

    Eof,
}

/// Byte range (`start..end`) of a token in the original input string.
#[derive(Debug, Clone, PartialEq)]
pub(crate) struct Span {
    pub start: usize,
    pub end: usize,
}

/// A token paired with its source span.
#[derive(Debug, Clone, PartialEq)]
pub(crate) struct Spanned {
    pub token: Token,
    pub span: Span,
}

/// Lex the input string into spanned tokens. The output always ends with a
/// single `Token::Eof` whose span is `len..len`.
pub(crate) fn tokenize(input: &str) -> Vec<Spanned> {
    let bytes = input.as_bytes();
    let len = input.len();
    let mut pos = 0usize;
    let mut out: Vec<Spanned> = Vec::new();

    // Push a token whose span runs from `$start` to the current position.
    macro_rules! push {
        ($start:expr, $tok:expr) => {
            out.push(Spanned {
                token: $tok,
                span: Span {
                    start: $start,
                    end: pos,
                },
            });
        };
    }

    while pos < len {
        let start = pos;
        let Some(ch) = input[pos..].chars().next() else {
            break;
        };
        let ch_len = ch.len_utf8();

        match ch {
            // Collapse a run of ASCII whitespace into a single token.
            c if c.is_ascii_whitespace() => {
                while pos < len && input[pos..].starts_with(|c: char| c.is_ascii_whitespace()) {
                    pos += 1;
                }
                push!(start, Token::Whitespace);
            }

            '{' => {
                pos += 1;
                push!(start, Token::LBrace);
            }
            '}' => {
                pos += 1;
                push!(start, Token::RBrace);
            }
            '^' => {
                pos += 1;
                push!(start, Token::Caret);
            }
            '_' => {
                pos += 1;
                push!(start, Token::Underscore);
            }
            '&' => {
                pos += 1;
                push!(start, Token::Ampersand);
            }
            '~' => {
                pos += 1;
                push!(start, Token::Tilde);
            }
            '(' => {
                pos += 1;
                push!(start, Token::LParen);
            }
            ')' => {
                pos += 1;
                push!(start, Token::RParen);
            }
            '[' => {
                pos += 1;
                push!(start, Token::LBracket);
            }
            ']' => {
                pos += 1;
                push!(start, Token::RBracket);
            }
            '|' => {
                pos += 1;
                push!(start, Token::Pipe);
            }
            '+' => {
                pos += 1;
                push!(start, Token::Plus);
            }
            '-' => {
                pos += 1;
                push!(start, Token::Minus);
            }
            '=' => {
                pos += 1;
                push!(start, Token::Equals);
            }
            '<' => {
                pos += 1;
                push!(start, Token::LessThan);
            }
            '>' => {
                pos += 1;
                push!(start, Token::GreaterThan);
            }
            ',' => {
                pos += 1;
                push!(start, Token::Comma);
            }
            ';' => {
                pos += 1;
                push!(start, Token::Semicolon);
            }
            ':' => {
                pos += 1;
                push!(start, Token::Colon);
            }
            '!' => {
                pos += 1;
                push!(start, Token::Bang);
            }
            '\'' => {
                pos += 1;
                push!(start, Token::Prime);
            }
            '.' => {
                pos += 1;
                push!(start, Token::Dot);
            }

            // Backslash starts either a spacing shortcut, an environment
            // delimiter, a named command, or an escaped symbol.
            '\\' => {
                pos += 1;
                if pos >= len {
                    push!(start, Token::Command("".to_string()));
                    continue;
                }

                let Some(next) = input[pos..].chars().next() else {
                    push!(start, Token::Command("".to_string()));
                    continue;
                };

                if next == '\\' {
                    pos += 1;
                    push!(start, Token::DoubleBackslash);
                } else if next == ',' {
                    pos += 1;
                    push!(start, Token::ThinSpace);
                } else if next == ';' || next == ':' {
                    pos += 1;
                    push!(start, Token::MedSpace);
                } else if next == '!' {
                    pos += 1;
                    push!(start, Token::NegThinSpace);
                } else if next == ' ' {
                    pos += 1;
                    push!(start, Token::Whitespace);
                } else if next.is_ascii_alphabetic() {
                    // Named command: read the alphabetic command name.
                    let name_start = pos;
                    while pos < len {
                        let Some(c) = input[pos..].chars().next() else {
                            break;
                        };
                        if c.is_ascii_alphabetic() {
                            pos += c.len_utf8();
                        } else {
                            break;
                        }
                    }
                    let name = &input[name_start..pos];

                    if name == "begin" || name == "end" {
                        // `\begin{env}` / `\end{env}`: read the environment name.
                        skip_whitespace(input, &mut pos);
                        if pos < len && bytes[pos] == b'{' {
                            pos += 1;
                            let env_start = pos;
                            while pos < len && bytes[pos] != b'}' {
                                pos += 1;
                            }
                            let env_name = input[env_start..pos].trim().to_string();
                            if pos < len {
                                pos += 1;
                            }
                            if name == "begin" {
                                push!(start, Token::Begin(env_name));
                            } else {
                                push!(start, Token::End(env_name));
                            }
                        } else {
                            // No brace follows: emit an empty environment name.
                            let tok = if name == "begin" {
                                Token::Begin(String::new())
                            } else {
                                Token::End(String::new())
                            };
                            push!(start, tok);
                        }
                    } else {
                        push!(start, Token::Command(name.to_string()));
                    }
                } else {
                    // Escaped single symbol, e.g. `\{`.
                    let sym = next.to_string();
                    pos += next.len_utf8();
                    push!(start, Token::Command(sym));
                }
            }

            c if c.is_ascii_alphabetic() => {
                pos += ch_len;
                push!(start, Token::Letter(c));
            }

            c if c.is_ascii_digit() => {
                pos += ch_len;
                push!(start, Token::Digit(c));
            }

            // Any other character (including non-ASCII) falls back to Letter.
            c => {
                pos += ch_len;
                push!(start, Token::Letter(c));
            }
        }
    }

    // Always terminate the stream with an explicit Eof token at the end of input.
    out.push(Spanned {
        token: Token::Eof,
        span: Span {
            start: len,
            end: len,
        },
    });

    out
}

/// Render a token back to (approximate) LaTeX source. This is lossy: collapsed
/// whitespace becomes a single space and `\:` is rendered as `\;`.
pub(crate) fn token_to_raw_str(tok: &Token) -> String {
    match tok {
        Token::LBrace => "{".to_string(),
        Token::RBrace => "}".to_string(),
        Token::Caret => "^".to_string(),
        Token::Underscore => "_".to_string(),
        Token::Ampersand => "&".to_string(),
        Token::Tilde => "~".to_string(),
        Token::LParen => "(".to_string(),
        Token::RParen => ")".to_string(),
        Token::LBracket => "[".to_string(),
        Token::RBracket => "]".to_string(),
        Token::Pipe => "|".to_string(),
        Token::Plus => "+".to_string(),
        Token::Minus => "-".to_string(),
        Token::Equals => "=".to_string(),
        Token::LessThan => "<".to_string(),
        Token::GreaterThan => ">".to_string(),
        Token::Comma => ",".to_string(),
        Token::Semicolon => ";".to_string(),
        Token::Colon => ":".to_string(),
        Token::Bang => "!".to_string(),
        Token::Prime => "'".to_string(),
        Token::Dot => ".".to_string(),
        Token::Command(c) => format!("\\{c}"),
        Token::DoubleBackslash => "\\\\".to_string(),
        Token::ThinSpace => "\\,".to_string(),
        Token::MedSpace => "\\;".to_string(),
        Token::NegThinSpace => "\\!".to_string(),
        Token::Letter(c) => c.to_string(),
        Token::Digit(c) => c.to_string(),
        Token::Begin(e) => format!("\\begin{{{e}}}"),
        Token::End(e) => format!("\\end{{{e}}}"),
        Token::Whitespace => " ".to_string(),
        Token::Eof => String::new(),
    }
}

/// Advance `pos` past any ASCII whitespace.
fn skip_whitespace(input: &str, pos: &mut usize) {
    while *pos < input.len() {
        let Some(c) = input[*pos..].chars().next() else {
            break;
        };
        if c.is_ascii_whitespace() {
            *pos += c.len_utf8();
        } else {
            break;
        }
    }
}

/// A cursor over the output of `tokenize`.
pub(crate) struct TokenStream {
    tokens: Vec<Spanned>,
    pos: usize,
}

impl TokenStream {
    pub fn new(tokens: Vec<Spanned>) -> Self {
        Self { tokens, pos: 0 }
    }

    /// Look at the current token without consuming it.
    pub fn peek(&self) -> &Token {
        &self.tokens[self.pos].token
    }

    /// Look `offset` tokens ahead, clamping to the trailing `Eof`.
    pub fn peek_ahead(&self, offset: usize) -> &Token {
        let idx = (self.pos + offset).min(self.tokens.len() - 1);
        &self.tokens[idx].token
    }

    /// Consume and return the current token. The cursor never advances past
    /// the final `Eof`, so repeated calls at the end keep returning `Eof`.
    pub fn next(&mut self) -> Token {
        let tok = self.tokens[self.pos].token.clone();
        if self.pos + 1 < self.tokens.len() {
            self.pos += 1;
        }
        tok
    }

    #[allow(dead_code)]
    pub fn current_offset(&self) -> usize {
        self.tokens[self.pos].span.start
    }

    pub fn skip_whitespace(&mut self) {
        while matches!(self.peek(), Token::Whitespace) {
            self.next();
        }
    }

    #[allow(dead_code)]
    pub fn is_eof(&self) -> bool {
        matches!(self.peek(), Token::Eof)
    }

    /// If the stream is positioned at `{`, consume a balanced brace group and
    /// return its contents as raw text (nested braces are kept). Returns
    /// `None` if the current token is not `{`.
    pub fn read_raw_brace_string(&mut self) -> Option<String> {
        if !matches!(self.peek(), Token::LBrace) {
            return None;
        }
        self.next();
        let mut result = String::new();
        let mut depth = 1usize;

        loop {
            match self.peek().clone() {
                Token::Eof => break,
                Token::LBrace => {
                    depth += 1;
                    result.push('{');
                    self.next();
                }
                Token::RBrace => {
                    depth -= 1;
                    if depth == 0 {
                        self.next();
                        break;
                    }
                    result.push('}');
                    self.next();
                }
                Token::Whitespace => {
                    result.push(' ');
                    self.next();
                }
                tok => {
                    result.push_str(&token_to_raw_str(&tok));
                    self.next();
                }
            }
        }

        Some(result)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    fn tokens(input: &str) -> Vec<Token> {
        tokenize(input).into_iter().map(|s| s.token).collect()
    }

    #[test]
    fn tokenize_simple_letters() {
        let toks = tokens("abc");
        assert_eq!(
            toks,
            vec![
                Token::Letter('a'),
                Token::Letter('b'),
                Token::Letter('c'),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn tokenize_digits() {
        let toks = tokens("123");
        assert_eq!(
            toks,
            vec![
                Token::Digit('1'),
                Token::Digit('2'),
                Token::Digit('3'),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn tokenize_command() {
        let toks = tokens(r"\frac");
        assert_eq!(toks, vec![Token::Command("frac".to_string()), Token::Eof]);
    }
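
    // Usage example for the span bookkeeping: each token records the byte
    // range it covers, and the trailing Eof spans the end of the input.
    #[test]
    fn tokenize_records_spans() {
        let spanned = tokenize(r"\frac{a}");
        assert_eq!(spanned[0].token, Token::Command("frac".to_string()));
        assert_eq!(spanned[0].span, Span { start: 0, end: 5 });
        assert_eq!(spanned.last().unwrap().span, Span { start: 8, end: 8 });
    }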

    #[test]
    fn tokenize_double_backslash() {
        let toks = tokens(r"\\");
        assert_eq!(toks, vec![Token::DoubleBackslash, Token::Eof]);
    }

    #[test]
    fn tokenize_spacing() {
        let toks = tokens(r"\,\;\!");
        assert_eq!(
            toks,
            vec![
                Token::ThinSpace,
                Token::MedSpace,
                Token::NegThinSpace,
                Token::Eof,
            ]
        );
    }
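
    // Companion example: `\:` lexes to MedSpace just like `\;`, and an escaped
    // space `\ ` is emitted as ordinary Whitespace.
    #[test]
    fn tokenize_spacing_variants() {
        assert_eq!(tokens(r"\:"), vec![Token::MedSpace, Token::Eof]);
        assert_eq!(tokens(r"\ "), vec![Token::Whitespace, Token::Eof]);
    }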

    #[test]
    fn tokenize_begin_end() {
        let toks = tokens(r"\begin{pmatrix}");
        assert_eq!(toks, vec![Token::Begin("pmatrix".to_string()), Token::Eof]);
    }

    #[test]
    fn tokenize_end_env() {
        let toks = tokens(r"\end{pmatrix}");
        assert_eq!(toks, vec![Token::End("pmatrix".to_string()), Token::Eof]);
    }
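
    // Example of the whitespace tolerance around environment names: spaces
    // before the brace are skipped and the name inside the braces is trimmed.
    #[test]
    fn tokenize_begin_with_whitespace() {
        let toks = tokens(r"\begin { pmatrix }");
        assert_eq!(toks, vec![Token::Begin("pmatrix".to_string()), Token::Eof]);
    }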

    #[test]
    fn tokenize_scripts() {
        let toks = tokens("x^2_i");
        assert_eq!(
            toks,
            vec![
                Token::Letter('x'),
                Token::Caret,
                Token::Digit('2'),
                Token::Underscore,
                Token::Letter('i'),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn tokenize_braces() {
        let toks = tokens("{a}");
        assert_eq!(
            toks,
            vec![Token::LBrace, Token::Letter('a'), Token::RBrace, Token::Eof]
        );
    }

    #[test]
    fn tokenize_whitespace_collapsed() {
        let toks = tokens("a b");
        assert_eq!(
            toks,
            vec![
                Token::Letter('a'),
                Token::Whitespace,
                Token::Letter('b'),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn tokenize_backslash_brace() {
        let toks = tokens(r"\{");
        assert_eq!(toks, vec![Token::Command("{".to_string()), Token::Eof]);
    }
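
    // Fallback behavior: characters without a dedicated match arm (here a
    // non-ASCII letter) are still emitted, as Letter tokens.
    #[test]
    fn tokenize_unknown_char_falls_back_to_letter() {
        let toks = tokens("α");
        assert_eq!(toks, vec![Token::Letter('α'), Token::Eof]);
    }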

    #[test]
    fn tokenize_pipe() {
        let toks = tokens("|");
        assert_eq!(toks, vec![Token::Pipe, Token::Eof]);
    }

    #[test]
    fn tokenize_operators() {
        let toks = tokens("+-=<>");
        assert_eq!(
            toks,
            vec![
                Token::Plus,
                Token::Minus,
                Token::Equals,
                Token::LessThan,
                Token::GreaterThan,
                Token::Eof,
            ]
        );
    }
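
    // Round-trip example for token_to_raw_str: for input without collapsed
    // whitespace or `\:`, rendering every token reproduces the source string.
    #[test]
    fn token_to_raw_str_round_trips_simple_input() {
        let input = r"a + \alpha_2";
        let rendered: String = tokens(input).iter().map(token_to_raw_str).collect();
        assert_eq!(rendered, input);
    }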

    #[test]
    fn token_stream_peek_and_next() {
        let ts_tokens = tokenize("ab");
        let mut ts = TokenStream::new(ts_tokens);
        assert_eq!(ts.peek(), &Token::Letter('a'));
        ts.next();
        assert_eq!(ts.peek(), &Token::Letter('b'));
        ts.next();
        assert_eq!(ts.peek(), &Token::Eof);
    }
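
    // Lookahead and offset example: peek_ahead clamps past-the-end requests to
    // the trailing Eof, and current_offset reports the current token's start.
    #[test]
    fn token_stream_peek_ahead_and_offsets() {
        let mut ts = TokenStream::new(tokenize("x+1"));
        assert_eq!(ts.peek_ahead(1), &Token::Plus);
        assert_eq!(ts.peek_ahead(100), &Token::Eof);
        assert_eq!(ts.current_offset(), 0);
        ts.next();
        assert_eq!(ts.current_offset(), 1);
        assert!(!ts.is_eof());
    }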

    #[test]
    fn token_stream_skip_whitespace() {
        let ts_tokens = tokenize("a b");
        let mut ts = TokenStream::new(ts_tokens);
        ts.next();
        ts.skip_whitespace();
        assert_eq!(ts.peek(), &Token::Letter('b'));
    }
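
    // Usage example for read_raw_brace_string: it consumes a balanced brace
    // group, preserves nested braces, and returns None when not at `{`.
    #[test]
    fn token_stream_read_raw_brace_string() {
        let mut ts = TokenStream::new(tokenize(r"{a{b}c} d"));
        assert_eq!(ts.read_raw_brace_string(), Some("a{b}c".to_string()));
        ts.skip_whitespace();
        assert_eq!(ts.read_raw_brace_string(), None);
    }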
}