tex2typst_rs/
tex_tokenizer.rs1use crate::definitions::{TexToken, TexTokenType};
2use crate::tex_parser_utils::{SUB_SYMBOL, SUP_SYMBOL};
3
/// Collects the run of alphabetic characters that begins at index `start`.
///
/// Scanning stops at the first character for which `char::is_alphabetic`
/// is false, or at the end of `latex`. The result is empty when no
/// alphabetic character sits at `start` (including `start == latex.len()`).
///
/// # Panics
///
/// Panics if `start` is greater than `latex.len()`.
fn eat_command_name(latex: &Vec<char>, start: usize) -> String {
    latex[start..]
        .iter()
        .take_while(|symbol| symbol.is_alphabetic())
        .collect()
}
11
/// Finds the index of the `}` that closes the `{` located at `start`.
///
/// Nested `{ ... }` pairs are tracked with a depth counter. The escaped
/// sequences `\{` and `\}` are skipped as a unit and never change the depth.
///
/// # Errors
///
/// Returns `"Unmatched curly brackets"` when the end of `latex` is reached
/// before the depth drops back to zero.
///
/// # Panics
///
/// Panics if `start` is out of bounds or `latex[start]` is not `'{'`.
fn find_closing_curly_bracket_char(latex: &Vec<char>, start: usize) -> Result<usize, &'static str> {
    assert_eq!(latex[start], '{');
    let mut depth = 1;
    let mut cursor = start + 1;

    loop {
        let current = match latex.get(cursor).copied() {
            Some(ch) => ch,
            None => return Err("Unmatched curly brackets"),
        };

        // An escaped brace is literal text: step over both characters.
        let next = latex.get(cursor + 1).copied();
        if current == '\\' && matches!(next, Some('{') | Some('}')) {
            cursor += 2;
            continue;
        }

        match current {
            '{' => depth += 1,
            '}' => {
                depth -= 1;
                if depth == 0 {
                    return Ok(cursor);
                }
            }
            _ => {}
        }
        cursor += 1;
    }
}
35
36pub fn tokenize(latex: &str) -> Result<Vec<TexToken>, String> {
37 let latex: Vec<char> = latex.chars().collect();
38 let mut tokens: Vec<TexToken> = Vec::new();
39 let mut pos = 0;
40
41 while pos < latex.len() {
42 let first_char = latex[pos];
43 let token: TexToken;
44 match first_char {
45 '%' => {
46 let mut new_pos = pos + 1;
47 while new_pos < latex.len() && latex[new_pos] != '\n' {
48 new_pos += 1;
49 }
50 token = TexToken::new(TexTokenType::Comment, latex[pos + 1..new_pos].iter().collect());
51 pos = new_pos;
52 }
53 '{' | '}' | '_' | '^' | '&' => {
54 token = TexToken::new(TexTokenType::Control, first_char.to_string());
55 pos += 1;
56 }
57 '\n' => {
58 token = TexToken::new(TexTokenType::Newline, first_char.to_string());
59 pos += 1;
60 }
61 '\r' => {
62 if pos + 1 < latex.len() && latex[pos + 1] == '\n' {
63 token = TexToken::new(TexTokenType::Newline, "\n".to_string());
64 pos += 2;
65 } else {
66 token = TexToken::new(TexTokenType::Newline, "\n".to_string());
67 pos += 1;
68 }
69 }
70 ' ' => {
71 let mut new_pos = pos;
72 while new_pos < latex.len() && latex[new_pos] == ' ' {
73 new_pos += 1;
74 }
75 token = TexToken::new(TexTokenType::Space, latex[pos..new_pos].iter().collect());
76 pos = new_pos;
77 }
78 '\\' => {
79 if pos + 1 >= latex.len() {
80 return Err("Expecting command name after '\\'".to_string());
81 }
82 let first_two_chars = latex[pos..pos + 2].iter().collect::<String>();
83 if ["\\\\", "\\,"].contains(&&*first_two_chars) {
84 token = TexToken::new(TexTokenType::Control, first_two_chars.to_string());
85 } else if ["\\{", "\\}", "\\%", "\\$", "\\&", "\\#", "\\_", "\\|"].contains(&&*first_two_chars) {
86 token = TexToken::new(TexTokenType::Element, first_two_chars.to_string());
87 } else {
88 let command = eat_command_name(&latex, pos + 1);
89 token = TexToken::new(TexTokenType::Command, format!("\\{}", command));
90 }
91 pos += token.value.len();
92 }
93 _ => {
94 if first_char.is_digit(10) {
95 let mut new_pos = pos;
96 while new_pos < latex.len() && latex[new_pos].is_digit(10) {
97 new_pos += 1;
98 }
99 token = TexToken::new(TexTokenType::Element, latex[pos..new_pos].iter().collect());
100 } else if first_char.is_alphabetic() {
101 token = TexToken::new(TexTokenType::Element, first_char.to_string());
102 } else if "+-*/='<>!.,;:?()[]|".contains(first_char) {
103 token = TexToken::new(TexTokenType::Element, first_char.to_string());
104 } else if "~".contains(first_char) {
105 token = TexToken::new(TexTokenType::NoBreakSpace, "space.nobreak".to_string());
106 } else {
107 token = TexToken::new(TexTokenType::Unknown, first_char.to_string());
108 }
109 pos += token.value.len();
110 }
111 }
112
113 tokens.push(token.clone());
114
115 if token.token_type == TexTokenType::Command
116 && matches!(token.value.as_str(), r"\text" | r"\operatorname" | r"\begin" | r"\end")
117 {
118 if pos >= latex.len() || latex[pos] != '{' {
119 if let Some(nn) = latex[pos..].iter().position(|&c| c == '{') {
120 pos += nn;
121 } else {
122 return Err(format!("No content for {} command", token.value));
123 }
124 }
125 tokens.push(TexToken::new(TexTokenType::Control, "{".to_string()));
126 let pos_closing_bracket = find_closing_curly_bracket_char(&latex, pos)?;
127 pos += 1;
128 let mut text_inside: String = latex[pos..pos_closing_bracket].iter().collect();
129 let chars = ['{', '}', '\\', '$', '&', '#', '_', '%'];
130 for &char in &chars {
131 text_inside = text_inside.replace(&format!("\\{}", char), &char.to_string());
132 }
133 tokens.push(TexToken::new(TexTokenType::Text, text_inside));
134 tokens.push(TexToken::new(TexTokenType::Control, "}".to_string()));
135 pos = pos_closing_bracket + 1;
136 }
137 }
138
139 Ok(pass_ignore_whitespace_before_script_mark(tokens))
140}
141
142fn pass_ignore_whitespace_before_script_mark(tokens: Vec<TexToken>) -> Vec<TexToken> {
144 let is_script_mark = |token: &TexToken| token.eq(&SUB_SYMBOL) || token.eq(&SUP_SYMBOL);
145 let mut out_tokens: Vec<TexToken> = Vec::new();
146
147 for i in 0..tokens.len() {
148 if tokens[i].token_type == TexTokenType::Space && i + 1 < tokens.len() && is_script_mark(&tokens[i + 1]) {
149 continue;
150 }
151 if tokens[i].token_type == TexTokenType::Space && i > 0 && is_script_mark(&tokens[i - 1]) {
152 continue;
153 }
154 out_tokens.push(tokens[i].clone());
155 }
156
157 out_tokens
158}