tex2typst_rs/
tex_parser.rs

1use crate::command_registry::{CommandRegistry, CommandType};
2use crate::definitions::TexNodeData::Array;
3use crate::definitions::{TexNode, TexNodeData, TexNodeType, TexSupsubData, TexToken, TexTokenType};
4use crate::map::SYMBOL_MAP;
5use crate::tex_parser_utils::*;
6use crate::tex_tokenizer;
7use std::cmp::PartialEq;
8
9type ParseResult = Result<(TexNode, usize), String>;
10
11pub struct LatexParser {
12    space_sensitive: bool,
13    newline_sensitive: bool,
14    command_registry: CommandRegistry,
15}
16
17impl LatexParser {
18    pub fn new(space_sensitive: bool, newline_sensitive: bool) -> Self {
19        LatexParser {
20            space_sensitive,
21            newline_sensitive,
22            command_registry: CommandRegistry::new(),
23        }
24    }
25
26    pub fn parse(&self, tokens: Vec<TexToken>) -> Result<TexNode, String> {
27        let mut results: Vec<TexNode> = Vec::new();
28        let mut pos = 0;
29
30        while pos < tokens.len() {
31            let (res, new_pos) = self.parse_next_expr(&tokens, pos)?;
32            pos = new_pos;
33            if res.node_type == TexNodeType::Whitespace
34                && (!self.space_sensitive && res.content.replace(" ", "").is_empty()
35                    || !self.newline_sensitive && res.content == "\n")
36            {
37                continue;
38            }
39            if res.node_type == TexNodeType::Control && res.content == "&" {
40                return Err("Unexpected & outside of an alignment".to_string());
41            } else {
42                results.push(res);
43            }
44        }
45
46        if results.is_empty() {
47            Ok(EMPTY_NODE.clone())
48        } else if results.len() == 1 {
49            Ok(results.remove(0))
50        } else {
51            Ok(TexNode::new(TexNodeType::Ordgroup, String::new(), Some(results), None))
52        }
53    }
54
55    fn parse_next_expr(&self, tokens: &[TexToken], start: usize) -> ParseResult {
56        let (base, mut pos) = self.parse_next_expr_without_supsub(tokens, start)?;
57        let mut sub: Option<TexNode> = None;
58        let mut sup: Option<TexNode> = None;
59        let mut num_prime = 0;
60
61        num_prime += eat_primes(tokens, pos);
62        pos += num_prime;
63        if pos < tokens.len() && tokens[pos] == *SUB_SYMBOL {
64            let (sub_node, new_pos) = self.parse_next_expr_without_supsub(tokens, pos + 1)?;
65            sub = Some(sub_node);
66            pos = new_pos;
67            num_prime += eat_primes(tokens, pos);
68            pos += num_prime;
69            if pos < tokens.len() && tokens[pos] == *SUP_SYMBOL {
70                let (sup_node, new_pos) = self.parse_next_expr_without_supsub(tokens, pos + 1)?;
71                sup = Some(sup_node);
72                pos = new_pos;
73                if eat_primes(tokens, pos) > 0 {
74                    panic!("Double superscript");
75                }
76            }
77        } else if pos < tokens.len() && tokens[pos] == *SUP_SYMBOL {
78            let (sup_node, new_pos) = self.parse_next_expr_without_supsub(tokens, pos + 1)?;
79            sup = Some(sup_node);
80            pos = new_pos;
81            if eat_primes(tokens, pos) > 0 {
82                panic!("Double superscript");
83            }
84            if pos < tokens.len() && tokens[pos] == *SUB_SYMBOL {
85                let (sub_node, new_pos) = self.parse_next_expr_without_supsub(tokens, pos + 1)?;
86                sub = Some(sub_node);
87                pos = new_pos;
88                if eat_primes(tokens, pos) > 0 {
89                    panic!("Double superscript");
90                }
91            }
92        }
93
94        if sub.is_some() || sup.is_some() || num_prime > 0 {
95            let mut res = TexSupsubData {
96                base,
97                sub: None,
98                sup: None,
99            };
100            if let Some(sub_node) = sub {
101                res.sub = Some(sub_node);
102            }
103            if num_prime > 0 {
104                let mut sup_node = TexNode::new(TexNodeType::Ordgroup, String::new(), Some(Vec::new()), None);
105                for _ in 0..num_prime {
106                    sup_node.args.as_mut().unwrap().push(TexNode::new(
107                        TexNodeType::Element,
108                        "'".to_string(),
109                        None,
110                        None,
111                    ));
112                }
113                if let Some(sup_node_inner) = sup {
114                    sup_node.args.as_mut().unwrap().push(sup_node_inner);
115                }
116                if sup_node.args.as_ref().unwrap().len() == 1 {
117                    res.sup = Some(sup_node.args.unwrap().remove(0));
118                } else {
119                    res.sup = Some(sup_node);
120                }
121            } else if let Some(sup_node) = sup {
122                res.sup = Some(sup_node);
123            }
124            Ok((
125                TexNode::new(
126                    TexNodeType::SupSub,
127                    String::new(),
128                    None,
129                    Some(Box::from(TexNodeData::Supsub(res))),
130                ),
131                pos,
132            ))
133        } else {
134            Ok((base, pos))
135        }
136    }
137
138    fn parse_next_expr_without_supsub(&self, tokens: &[TexToken], start: usize) -> ParseResult {
139        match tokens.get(start) {
140            None => Err("Unexpected end of input".to_string()),
141            Some(_first_token) => {
142                let first_token = _first_token;
143                let token_type = &first_token.token_type;
144                match token_type {
145                    TexTokenType::Element => Ok((
146                        TexNode::new(TexNodeType::Element, first_token.value.clone(), None, None),
147                        start + 1,
148                    )),
149                    TexTokenType::Text => Ok((
150                        TexNode::new(TexNodeType::Text, first_token.value.clone(), None, None),
151                        start + 1,
152                    )),
153                    TexTokenType::Comment => Ok((
154                        TexNode::new(TexNodeType::Comment, first_token.value.clone(), None, None),
155                        start + 1,
156                    )),
157                    TexTokenType::Space | TexTokenType::Newline => Ok((
158                        TexNode::new(TexNodeType::Whitespace, first_token.value.clone(), None, None),
159                        start + 1,
160                    )),
161                    TexTokenType::NoBreakSpace => Ok((
162                        TexNode::new(TexNodeType::NoBreakSpace, first_token.value.clone(), None, None),
163                        start + 1,
164                    )),
165                    TexTokenType::Command => {
166                        if first_token.eq(&BEGIN_COMMAND) {
167                            self.parse_begin_end_expr(tokens, start)
168                        } else if first_token.eq(&LEFT_COMMAND) {
169                            self.parse_left_right_expr(tokens, start)
170                        } else {
171                            self.parse_command_expr(tokens, start)
172                        }
173                    }
174                    TexTokenType::Control => {
175                        let control_char = &first_token.value;
176                        match control_char.as_str() {
177                            "{" => {
178                                let pos_closing_bracket =
179                                    find_closing_match(tokens, start, &LEFT_CURLY_BRACKET, &RIGHT_CURLY_BRACKET);
180                                if pos_closing_bracket == -1 {
181                                    Err("Unmatched '{'".to_string())
182                                } else {
183                                    let expr_inside = &tokens[start + 1..pos_closing_bracket as usize];
184                                    Ok((self.parse(expr_inside.to_vec())?, pos_closing_bracket as usize + 1))
185                                }
186                            }
187                            "}" => Err("Unexpected '}'".to_string()),
188                            "\\\\" => Ok((
189                                TexNode::new(TexNodeType::Control, "\\\\".to_string(), None, None),
190                                start + 1,
191                            )),
192                            "\\," => Ok((
193                                TexNode::new(TexNodeType::Control, "\\,".to_string(), None, None),
194                                start + 1,
195                            )),
196                            "_" | "^" => Ok((EMPTY_NODE.clone(), start)),
197                            "&" => Ok((
198                                TexNode::new(TexNodeType::Control, "&".to_string(), None, None),
199                                start + 1,
200                            )),
201                            _ => Err("Unknown control sequence".to_string()),
202                        }
203                    }
204                    TexTokenType::Unknown => Ok((
205                        TexNode::new(TexNodeType::Unknown, first_token.value.clone(), None, None),
206                        start + 1,
207                    )),
208                }
209            }
210        }
211    }
212
213    fn parse_command_expr(&self, tokens: &[TexToken], start: usize) -> ParseResult {
214        let command = &tokens[start].value; // command name starts with a \\
215        let pos = start + 1;
216
217        if matches!(command[1..].as_ref(), "left" | "right" | "begin" | "end") {
218            return Err(format!("Unexpected command: {}", command));
219        }
220
221        match self.command_registry.get_command_type(&command[1..]) {
222            Some(CommandType::Symbol) => {
223                if !SYMBOL_MAP.contains_key(&command[1..]) {
224                    return Ok((
225                        TexNode::new(TexNodeType::UnknownMacro, command.clone(), None, None),
226                        pos,
227                    ));
228                }
229                Ok((TexNode::new(TexNodeType::Symbol, command.clone(), None, None), pos))
230            }
231            Some(CommandType::Unary) => {
232                if pos >= tokens.len() {
233                    return Err(format!("Expecting argument for {}", command));
234                }
235                if command == "\\text" {
236                    if pos + 2 >= tokens.len() {
237                        return Err("Expecting content for \\text command".to_string());
238                    }
239                    assert_eq!(tokens[pos], *LEFT_CURLY_BRACKET);
240                    assert_eq!(tokens[pos + 1].token_type, TexTokenType::Text);
241                    assert_eq!(tokens[pos + 2], *RIGHT_CURLY_BRACKET);
242                    let text = tokens[pos + 1].value.clone();
243                    return Ok((TexNode::new(TexNodeType::Text, text, None, None), pos + 3));
244                }
245                let (arg1, new_pos) = self.parse_next_expr_without_supsub(tokens, pos)?;
246                Ok((
247                    TexNode::new(TexNodeType::UnaryFunc, command.clone(), Some(vec![arg1]), None),
248                    new_pos,
249                ))
250            }
251            Some(CommandType::Binary) => {
252                let (arg1, pos1) = self.parse_next_expr_without_supsub(tokens, pos)?;
253                let (arg2, pos2) = self.parse_next_expr_without_supsub(tokens, pos1)?;
254                Ok((
255                    TexNode::new(TexNodeType::BinaryFunc, command.clone(), Some(vec![arg1, arg2]), None),
256                    pos2,
257                ))
258            }
259            Some(CommandType::OptionalBinary) => {
260                let mut args = vec![];
261                let mut new_pos = pos;
262                if tokens[pos].token_type == TexTokenType::Element && tokens[pos].value == "[" {
263                    let pos_left_square_bracket = pos;
264                    let pos_right_square_bracket =
265                        find_closing_match(tokens, pos, &LEFT_SQUARE_BRACKET, &RIGHT_SQUARE_BRACKET);
266                    if pos_right_square_bracket == -1 {
267                        return Err("No matching right square bracket for [".to_string());
268                    }
269                    let optional_arg_inside = &tokens[pos_left_square_bracket + 1..pos_right_square_bracket as usize];
270                    let optional_arg_node = self.parse(optional_arg_inside.to_vec())?;
271                    let (mandatory_arg_node, _new_pos) =
272                        self.parse_next_expr_without_supsub(tokens, pos_right_square_bracket as usize + 1)?;
273                    args.push(optional_arg_node);
274                    args.push(mandatory_arg_node);
275                    new_pos = _new_pos;
276                } else {
277                    let (arg1, _new_pos) = self.parse_next_expr_without_supsub(tokens, pos)?;
278                    args.push(arg1);
279                    new_pos = _new_pos;
280                }
281                Ok((
282                    TexNode::new(TexNodeType::OptionBinaryFunc, command.clone(), Some(args), None),
283                    new_pos,
284                ))
285            }
286            _ => Err("Invalid number of parameters".to_string()),
287        }
288    }
289
290    fn parse_left_right_expr(&self, tokens: &[TexToken], start: usize) -> ParseResult {
291        assert!(tokens[start].eq(&LEFT_COMMAND));
292
293        let mut pos = start + 1;
294        pos += eat_whitespaces(tokens, pos);
295
296        if pos >= tokens.len() {
297            return Err("Expecting delimiter after \\left".to_string());
298        }
299
300        let left_delimiter = eat_parenthesis(tokens, pos);
301        if left_delimiter.is_none() {
302            return Err("Invalid delimiter after \\left".to_string());
303        }
304        pos += 1;
305        let expr_inside_start = pos;
306        let idx = find_closing_right_command(tokens, start);
307        if idx == -1 {
308            return Err("No matching \\right".to_string());
309        }
310        let expr_inside_end = idx as usize;
311        pos = expr_inside_end + 1;
312
313        pos += eat_whitespaces(tokens, pos);
314        if pos >= tokens.len() {
315            return Err("Expecting \\right after \\left".to_string());
316        }
317
318        let right_delimiter = eat_parenthesis(tokens, pos);
319        if right_delimiter.is_none() {
320            return Err("Invalid delimiter after \\right".to_string());
321        }
322        pos += 1;
323
324        let expr_inside = &tokens[expr_inside_start..expr_inside_end];
325        let body = self.parse(expr_inside.to_vec())?;
326        let args: Vec<TexNode> = vec![
327            TexNode::new(TexNodeType::Element, left_delimiter.unwrap().value.clone(), None, None),
328            body,
329            TexNode::new(TexNodeType::Element, right_delimiter.unwrap().value.clone(), None, None),
330        ];
331        let res = TexNode::new(TexNodeType::Leftright, String::new(), Some(args), None);
332        Ok((res, pos))
333    }
334
335    fn parse_begin_end_expr(&self, tokens: &[TexToken], start: usize) -> ParseResult {
336        assert!(tokens[start].eq(&BEGIN_COMMAND));
337
338        let mut pos = start + 1;
339        assert!(tokens[pos].eq(&LEFT_CURLY_BRACKET));
340        assert_eq!(tokens[pos + 1].token_type, TexTokenType::Text);
341        assert!(tokens[pos + 2].eq(&RIGHT_CURLY_BRACKET));
342        let env_name = tokens[pos + 1].value.clone();
343        pos += 3;
344
345        pos += eat_whitespaces(tokens, pos); // ignore whitespaces and '\n' after \begin{envName}
346
347        let expr_inside_start = pos;
348
349        let end_idx = find_closing_end_command(tokens, start);
350        if end_idx == -1 {
351            panic!("No matching \\end");
352        }
353        let expr_inside_end = end_idx as usize;
354        pos = expr_inside_end + 1;
355
356        assert!(tokens[pos].eq(&LEFT_CURLY_BRACKET));
357        assert_eq!(tokens[pos + 1].token_type, TexTokenType::Text);
358        assert!(tokens[pos + 2].eq(&RIGHT_CURLY_BRACKET));
359        if tokens[pos + 1].value != env_name {
360            return Err("Mismatched \\begin and \\end environments".to_string());
361        }
362        pos += 3;
363
364        let mut expr_inside = tokens[expr_inside_start..expr_inside_end].to_vec();
365        // ignore spaces and '\n' before \end{envName}
366        while !expr_inside.is_empty()
367            && matches!(
368                expr_inside.last().unwrap().token_type,
369                TexTokenType::Space | TexTokenType::Newline
370            )
371        {
372            expr_inside.pop();
373        }
374        let body = self.parse_aligned(&*expr_inside)?;
375        let res = TexNode::new(TexNodeType::BeginEnd, env_name, None, Some(Box::from(Array(body))));
376        Ok((res, pos))
377    }
378
379    fn parse_aligned(&self, tokens: &[TexToken]) -> Result<Vec<Vec<TexNode>>, String> {
380        let mut pos = 0;
381        let mut all_rows: Vec<Vec<TexNode>> = vec![vec![TexNode::new(
382            TexNodeType::Ordgroup,
383            String::new(),
384            Some(Vec::<TexNode>::new()),
385            None,
386        )]];
387        let mut row: &mut Vec<TexNode> = &mut all_rows[0];
388        let mut group: &mut TexNode = &mut row[0];
389
390        while pos < tokens.len() {
391            let (res, new_pos) = self.parse_next_expr(tokens, pos)?;
392            pos = new_pos;
393
394            if res.node_type == TexNodeType::Whitespace {
395                if !self.space_sensitive && res.content.replace(" ", "").is_empty() {
396                    continue;
397                }
398                if !self.newline_sensitive && res.content == "\n" {
399                    continue;
400                }
401            }
402
403            if res.node_type == TexNodeType::Control && res.content == "\\\\" {
404                all_rows.push(vec![TexNode::new(
405                    TexNodeType::Ordgroup,
406                    String::new(),
407                    Some(Vec::<TexNode>::new()),
408                    None,
409                )]);
410                row = all_rows.last_mut().unwrap();
411                group = &mut row[0];
412            } else if res.node_type == TexNodeType::Control && res.content == "&" {
413                row.push(TexNode::new(
414                    TexNodeType::Ordgroup,
415                    String::new(),
416                    Some(Vec::new()),
417                    None,
418                ));
419                group = row.last_mut().unwrap();
420            } else {
421                group.args.as_mut().unwrap().push(res);
422            }
423        }
424
425        Ok(all_rows)
426    }
427}
428
429pub fn parse_tex(tex: &str) -> Result<TexNode, String> {
430    let parser = LatexParser::new(false, false);
431    let tokens = tex_tokenizer::tokenize(tex)?;
432    parser.parse(tokens)
433}