1use crate::command_registry::{CommandRegistry, CommandType};
2use crate::definitions::TexNodeData::Array;
3use crate::definitions::{TexNode, TexNodeData, TexNodeType, TexSupsubData, TexToken, TexTokenType};
4use crate::map::SYMBOL_MAP;
5use crate::tex_parser_utils::*;
6use crate::tex_tokenizer;
7use std::cmp::PartialEq;
8
/// Result of parsing one expression: the parsed node together with the index of the first
/// token that has not been consumed yet, or an error message.
type ParseResult = Result<(TexNode, usize), String>;
10
/// Parser that turns a sequence of `TexToken`s into a `TexNode` tree.
pub struct LatexParser {
    /// When `false`, whitespace nodes consisting only of space characters are dropped
    /// by the parsing loops.
    space_sensitive: bool,
    /// When `false`, whitespace nodes whose content is exactly "\n" are dropped
    /// by the parsing loops.
    newline_sensitive: bool,
    /// Queried with a command name (leading backslash removed) to decide how many
    /// arguments the command takes.
    command_registry: CommandRegistry,
}
16
17impl LatexParser {
18 pub fn new(space_sensitive: bool, newline_sensitive: bool) -> Self {
19 LatexParser {
20 space_sensitive,
21 newline_sensitive,
22 command_registry: CommandRegistry::new(),
23 }
24 }
25
26 pub fn parse(&self, tokens: Vec<TexToken>) -> Result<TexNode, String> {
27 let mut results: Vec<TexNode> = Vec::new();
28 let mut pos = 0;
29
30 while pos < tokens.len() {
31 let (res, new_pos) = self.parse_next_expr(&tokens, pos)?;
32 pos = new_pos;
33 if res.node_type == TexNodeType::Whitespace
34 && (!self.space_sensitive && res.content.replace(" ", "").is_empty()
35 || !self.newline_sensitive && res.content == "\n")
36 {
37 continue;
38 }
39 if res.node_type == TexNodeType::Control && res.content == "&" {
40 return Err("Unexpected & outside of an alignment".to_string());
41 } else {
42 results.push(res);
43 }
44 }
45
46 if results.is_empty() {
47 Ok(EMPTY_NODE.clone())
48 } else if results.len() == 1 {
49 Ok(results.remove(0))
50 } else {
51 Ok(TexNode::new(TexNodeType::Ordgroup, String::new(), Some(results), None))
52 }
53 }
54
55 fn parse_next_expr(&self, tokens: &[TexToken], start: usize) -> ParseResult {
56 let (base, mut pos) = self.parse_next_expr_without_supsub(tokens, start)?;
57 let mut sub: Option<TexNode> = None;
58 let mut sup: Option<TexNode> = None;
59 let mut num_prime = 0;
60
61 num_prime += eat_primes(tokens, pos);
62 pos += num_prime;
63 if pos < tokens.len() && tokens[pos] == *SUB_SYMBOL {
64 let (sub_node, new_pos) = self.parse_next_expr_without_supsub(tokens, pos + 1)?;
65 sub = Some(sub_node);
66 pos = new_pos;
67 num_prime += eat_primes(tokens, pos);
68 pos += num_prime;
69 if pos < tokens.len() && tokens[pos] == *SUP_SYMBOL {
70 let (sup_node, new_pos) = self.parse_next_expr_without_supsub(tokens, pos + 1)?;
71 sup = Some(sup_node);
72 pos = new_pos;
73 if eat_primes(tokens, pos) > 0 {
74 panic!("Double superscript");
75 }
76 }
77 } else if pos < tokens.len() && tokens[pos] == *SUP_SYMBOL {
78 let (sup_node, new_pos) = self.parse_next_expr_without_supsub(tokens, pos + 1)?;
79 sup = Some(sup_node);
80 pos = new_pos;
81 if eat_primes(tokens, pos) > 0 {
82 panic!("Double superscript");
83 }
84 if pos < tokens.len() && tokens[pos] == *SUB_SYMBOL {
85 let (sub_node, new_pos) = self.parse_next_expr_without_supsub(tokens, pos + 1)?;
86 sub = Some(sub_node);
87 pos = new_pos;
88 if eat_primes(tokens, pos) > 0 {
89 panic!("Double superscript");
90 }
91 }
92 }
93
94 if sub.is_some() || sup.is_some() || num_prime > 0 {
95 let mut res = TexSupsubData {
96 base,
97 sub: None,
98 sup: None,
99 };
100 if let Some(sub_node) = sub {
101 res.sub = Some(sub_node);
102 }
103 if num_prime > 0 {
104 let mut sup_node = TexNode::new(TexNodeType::Ordgroup, String::new(), Some(Vec::new()), None);
105 for _ in 0..num_prime {
106 sup_node.args.as_mut().unwrap().push(TexNode::new(
107 TexNodeType::Element,
108 "'".to_string(),
109 None,
110 None,
111 ));
112 }
113 if let Some(sup_node_inner) = sup {
114 sup_node.args.as_mut().unwrap().push(sup_node_inner);
115 }
116 if sup_node.args.as_ref().unwrap().len() == 1 {
117 res.sup = Some(sup_node.args.unwrap().remove(0));
118 } else {
119 res.sup = Some(sup_node);
120 }
121 } else if let Some(sup_node) = sup {
122 res.sup = Some(sup_node);
123 }
124 Ok((
125 TexNode::new(
126 TexNodeType::SupSub,
127 String::new(),
128 None,
129 Some(Box::from(TexNodeData::Supsub(res))),
130 ),
131 pos,
132 ))
133 } else {
134 Ok((base, pos))
135 }
136 }
137
138 fn parse_next_expr_without_supsub(&self, tokens: &[TexToken], start: usize) -> ParseResult {
139 match tokens.get(start) {
140 None => Err("Unexpected end of input".to_string()),
141 Some(_first_token) => {
142 let first_token = _first_token;
143 let token_type = &first_token.token_type;
144 match token_type {
145 TexTokenType::Element => Ok((
146 TexNode::new(TexNodeType::Element, first_token.value.clone(), None, None),
147 start + 1,
148 )),
149 TexTokenType::Text => Ok((
150 TexNode::new(TexNodeType::Text, first_token.value.clone(), None, None),
151 start + 1,
152 )),
153 TexTokenType::Comment => Ok((
154 TexNode::new(TexNodeType::Comment, first_token.value.clone(), None, None),
155 start + 1,
156 )),
157 TexTokenType::Space | TexTokenType::Newline => Ok((
158 TexNode::new(TexNodeType::Whitespace, first_token.value.clone(), None, None),
159 start + 1,
160 )),
161 TexTokenType::NoBreakSpace => Ok((
162 TexNode::new(TexNodeType::NoBreakSpace, first_token.value.clone(), None, None),
163 start + 1,
164 )),
165 TexTokenType::Command => {
166 if first_token.eq(&BEGIN_COMMAND) {
167 self.parse_begin_end_expr(tokens, start)
168 } else if first_token.eq(&LEFT_COMMAND) {
169 self.parse_left_right_expr(tokens, start)
170 } else {
171 self.parse_command_expr(tokens, start)
172 }
173 }
174 TexTokenType::Control => {
175 let control_char = &first_token.value;
176 match control_char.as_str() {
177 "{" => {
178 let pos_closing_bracket =
179 find_closing_match(tokens, start, &LEFT_CURLY_BRACKET, &RIGHT_CURLY_BRACKET);
180 if pos_closing_bracket == -1 {
181 Err("Unmatched '{'".to_string())
182 } else {
183 let expr_inside = &tokens[start + 1..pos_closing_bracket as usize];
184 Ok((self.parse(expr_inside.to_vec())?, pos_closing_bracket as usize + 1))
185 }
186 }
187 "}" => Err("Unexpected '}'".to_string()),
188 "\\\\" => Ok((
189 TexNode::new(TexNodeType::Control, "\\\\".to_string(), None, None),
190 start + 1,
191 )),
192 "\\," => Ok((
193 TexNode::new(TexNodeType::Control, "\\,".to_string(), None, None),
194 start + 1,
195 )),
196 "_" | "^" => Ok((EMPTY_NODE.clone(), start)),
197 "&" => Ok((
198 TexNode::new(TexNodeType::Control, "&".to_string(), None, None),
199 start + 1,
200 )),
201 _ => Err("Unknown control sequence".to_string()),
202 }
203 }
204 TexTokenType::Unknown => Ok((
205 TexNode::new(TexNodeType::Unknown, first_token.value.clone(), None, None),
206 start + 1,
207 )),
208 }
209 }
210 }
211 }
212
213 fn parse_command_expr(&self, tokens: &[TexToken], start: usize) -> ParseResult {
214 let command = &tokens[start].value; let pos = start + 1;
216
217 if matches!(command[1..].as_ref(), "left" | "right" | "begin" | "end") {
218 return Err(format!("Unexpected command: {}", command));
219 }
220
221 match self.command_registry.get_command_type(&command[1..]) {
222 Some(CommandType::Symbol) => {
223 if !SYMBOL_MAP.contains_key(&command[1..]) {
224 return Ok((
225 TexNode::new(TexNodeType::UnknownMacro, command.clone(), None, None),
226 pos,
227 ));
228 }
229 Ok((TexNode::new(TexNodeType::Symbol, command.clone(), None, None), pos))
230 }
231 Some(CommandType::Unary) => {
232 if pos >= tokens.len() {
233 return Err(format!("Expecting argument for {}", command));
234 }
235 if command == "\\text" {
236 if pos + 2 >= tokens.len() {
237 return Err("Expecting content for \\text command".to_string());
238 }
239 assert_eq!(tokens[pos], *LEFT_CURLY_BRACKET);
240 assert_eq!(tokens[pos + 1].token_type, TexTokenType::Text);
241 assert_eq!(tokens[pos + 2], *RIGHT_CURLY_BRACKET);
242 let text = tokens[pos + 1].value.clone();
243 return Ok((TexNode::new(TexNodeType::Text, text, None, None), pos + 3));
244 }
245 let (arg1, new_pos) = self.parse_next_expr_without_supsub(tokens, pos)?;
246 Ok((
247 TexNode::new(TexNodeType::UnaryFunc, command.clone(), Some(vec![arg1]), None),
248 new_pos,
249 ))
250 }
251 Some(CommandType::Binary) => {
252 let (arg1, pos1) = self.parse_next_expr_without_supsub(tokens, pos)?;
253 let (arg2, pos2) = self.parse_next_expr_without_supsub(tokens, pos1)?;
254 Ok((
255 TexNode::new(TexNodeType::BinaryFunc, command.clone(), Some(vec![arg1, arg2]), None),
256 pos2,
257 ))
258 }
259 Some(CommandType::OptionalBinary) => {
260 let mut args = vec![];
261 let mut new_pos = pos;
262 if tokens[pos].token_type == TexTokenType::Element && tokens[pos].value == "[" {
263 let pos_left_square_bracket = pos;
264 let pos_right_square_bracket =
265 find_closing_match(tokens, pos, &LEFT_SQUARE_BRACKET, &RIGHT_SQUARE_BRACKET);
266 if pos_right_square_bracket == -1 {
267 return Err("No matching right square bracket for [".to_string());
268 }
269 let optional_arg_inside = &tokens[pos_left_square_bracket + 1..pos_right_square_bracket as usize];
270 let optional_arg_node = self.parse(optional_arg_inside.to_vec())?;
271 let (mandatory_arg_node, _new_pos) =
272 self.parse_next_expr_without_supsub(tokens, pos_right_square_bracket as usize + 1)?;
273 args.push(optional_arg_node);
274 args.push(mandatory_arg_node);
275 new_pos = _new_pos;
276 } else {
277 let (arg1, _new_pos) = self.parse_next_expr_without_supsub(tokens, pos)?;
278 args.push(arg1);
279 new_pos = _new_pos;
280 }
281 Ok((
282 TexNode::new(TexNodeType::OptionBinaryFunc, command.clone(), Some(args), None),
283 new_pos,
284 ))
285 }
286 _ => Err("Invalid number of parameters".to_string()),
287 }
288 }
289
290 fn parse_left_right_expr(&self, tokens: &[TexToken], start: usize) -> ParseResult {
291 assert!(tokens[start].eq(&LEFT_COMMAND));
292
293 let mut pos = start + 1;
294 pos += eat_whitespaces(tokens, pos);
295
296 if pos >= tokens.len() {
297 return Err("Expecting delimiter after \\left".to_string());
298 }
299
300 let left_delimiter = eat_parenthesis(tokens, pos);
301 if left_delimiter.is_none() {
302 return Err("Invalid delimiter after \\left".to_string());
303 }
304 pos += 1;
305 let expr_inside_start = pos;
306 let idx = find_closing_right_command(tokens, start);
307 if idx == -1 {
308 return Err("No matching \\right".to_string());
309 }
310 let expr_inside_end = idx as usize;
311 pos = expr_inside_end + 1;
312
313 pos += eat_whitespaces(tokens, pos);
314 if pos >= tokens.len() {
315 return Err("Expecting \\right after \\left".to_string());
316 }
317
318 let right_delimiter = eat_parenthesis(tokens, pos);
319 if right_delimiter.is_none() {
320 return Err("Invalid delimiter after \\right".to_string());
321 }
322 pos += 1;
323
324 let expr_inside = &tokens[expr_inside_start..expr_inside_end];
325 let body = self.parse(expr_inside.to_vec())?;
326 let args: Vec<TexNode> = vec![
327 TexNode::new(TexNodeType::Element, left_delimiter.unwrap().value.clone(), None, None),
328 body,
329 TexNode::new(TexNodeType::Element, right_delimiter.unwrap().value.clone(), None, None),
330 ];
331 let res = TexNode::new(TexNodeType::Leftright, String::new(), Some(args), None);
332 Ok((res, pos))
333 }
334
335 fn parse_begin_end_expr(&self, tokens: &[TexToken], start: usize) -> ParseResult {
336 assert!(tokens[start].eq(&BEGIN_COMMAND));
337
338 let mut pos = start + 1;
339 assert!(tokens[pos].eq(&LEFT_CURLY_BRACKET));
340 assert_eq!(tokens[pos + 1].token_type, TexTokenType::Text);
341 assert!(tokens[pos + 2].eq(&RIGHT_CURLY_BRACKET));
342 let env_name = tokens[pos + 1].value.clone();
343 pos += 3;
344
345 pos += eat_whitespaces(tokens, pos); let expr_inside_start = pos;
348
349 let end_idx = find_closing_end_command(tokens, start);
350 if end_idx == -1 {
351 panic!("No matching \\end");
352 }
353 let expr_inside_end = end_idx as usize;
354 pos = expr_inside_end + 1;
355
356 assert!(tokens[pos].eq(&LEFT_CURLY_BRACKET));
357 assert_eq!(tokens[pos + 1].token_type, TexTokenType::Text);
358 assert!(tokens[pos + 2].eq(&RIGHT_CURLY_BRACKET));
359 if tokens[pos + 1].value != env_name {
360 return Err("Mismatched \\begin and \\end environments".to_string());
361 }
362 pos += 3;
363
364 let mut expr_inside = tokens[expr_inside_start..expr_inside_end].to_vec();
365 while !expr_inside.is_empty()
367 && matches!(
368 expr_inside.last().unwrap().token_type,
369 TexTokenType::Space | TexTokenType::Newline
370 )
371 {
372 expr_inside.pop();
373 }
374 let body = self.parse_aligned(&*expr_inside)?;
375 let res = TexNode::new(TexNodeType::BeginEnd, env_name, None, Some(Box::from(Array(body))));
376 Ok((res, pos))
377 }
378
379 fn parse_aligned(&self, tokens: &[TexToken]) -> Result<Vec<Vec<TexNode>>, String> {
380 let mut pos = 0;
381 let mut all_rows: Vec<Vec<TexNode>> = vec![vec![TexNode::new(
382 TexNodeType::Ordgroup,
383 String::new(),
384 Some(Vec::<TexNode>::new()),
385 None,
386 )]];
387 let mut row: &mut Vec<TexNode> = &mut all_rows[0];
388 let mut group: &mut TexNode = &mut row[0];
389
390 while pos < tokens.len() {
391 let (res, new_pos) = self.parse_next_expr(tokens, pos)?;
392 pos = new_pos;
393
394 if res.node_type == TexNodeType::Whitespace {
395 if !self.space_sensitive && res.content.replace(" ", "").is_empty() {
396 continue;
397 }
398 if !self.newline_sensitive && res.content == "\n" {
399 continue;
400 }
401 }
402
403 if res.node_type == TexNodeType::Control && res.content == "\\\\" {
404 all_rows.push(vec![TexNode::new(
405 TexNodeType::Ordgroup,
406 String::new(),
407 Some(Vec::<TexNode>::new()),
408 None,
409 )]);
410 row = all_rows.last_mut().unwrap();
411 group = &mut row[0];
412 } else if res.node_type == TexNodeType::Control && res.content == "&" {
413 row.push(TexNode::new(
414 TexNodeType::Ordgroup,
415 String::new(),
416 Some(Vec::new()),
417 None,
418 ));
419 group = row.last_mut().unwrap();
420 } else {
421 group.args.as_mut().unwrap().push(res);
422 }
423 }
424
425 Ok(all_rows)
426 }
427}
428
429pub fn parse_tex(tex: &str) -> Result<TexNode, String> {
430 let parser = LatexParser::new(false, false);
431 let tokens = tex_tokenizer::tokenize(tex)?;
432 parser.parse(tokens)
433}