tex2typst_rs/
tex_tokenizer.rs

1use crate::definitions::{TexToken, TexTokenType};
2use crate::tex_parser_utils::{SUB_SYMBOL, SUP_SYMBOL};
3
/// Collects the run of consecutive alphabetic characters that begins at
/// index `start` of `latex` and returns it as a `String`.
///
/// "Alphabetic" is decided by [`char::is_alphabetic`], so non-ASCII letters
/// are included. The result is empty when the character at `start` is not
/// alphabetic or when `start` lies at or beyond the end of `latex`.
fn eat_command_name(latex: &[char], start: usize) -> String {
    latex
        .iter()
        .skip(start)
        .take_while(|c| c.is_alphabetic())
        .collect()
}
11
/// Finds the index of the `}` that closes the `{` located at `latex[start]`.
///
/// Nested `{ ... }` pairs are tracked with a depth counter. The two-character
/// sequences `\{` and `\}` are skipped as a unit and never change the depth.
///
/// # Errors
///
/// Returns `Err("Unmatched curly brackets")` when the end of `latex` is
/// reached before the opening bracket is closed.
///
/// # Panics
///
/// Panics if `start` is out of bounds or `latex[start]` is not `{`.
fn find_closing_curly_bracket_char(latex: &[char], start: usize) -> Result<usize, &'static str> {
    assert_eq!(latex[start], '{');
    let mut depth = 1usize;
    let mut pos = start + 1;

    while depth > 0 {
        match latex.get(pos) {
            None => return Err("Unmatched curly brackets"),
            // An escaped bracket is plain content: step over both characters.
            Some('\\') if matches!(latex.get(pos + 1), Some('{') | Some('}')) => {
                pos += 2;
                continue;
            }
            Some('{') => depth += 1,
            Some('}') => depth -= 1,
            Some(_) => {}
        }
        pos += 1;
    }

    // `pos` was advanced once past the bracket that brought the depth to zero.
    Ok(pos - 1)
}
35
36pub fn tokenize(latex: &str) -> Result<Vec<TexToken>, String> {
37    let latex: Vec<char> = latex.chars().collect();
38    let mut tokens: Vec<TexToken> = Vec::new();
39    let mut pos = 0;
40
41    while pos < latex.len() {
42        let first_char = latex[pos];
43        let token: TexToken;
44        match first_char {
45            '%' => {
46                let mut new_pos = pos + 1;
47                while new_pos < latex.len() && latex[new_pos] != '\n' {
48                    new_pos += 1;
49                }
50                token = TexToken::new(TexTokenType::Comment, latex[pos + 1..new_pos].iter().collect());
51                pos = new_pos;
52            }
53            '{' | '}' | '_' | '^' | '&' => {
54                token = TexToken::new(TexTokenType::Control, first_char.to_string());
55                pos += 1;
56            }
57            '\n' => {
58                token = TexToken::new(TexTokenType::Newline, first_char.to_string());
59                pos += 1;
60            }
61            '\r' => {
62                if pos + 1 < latex.len() && latex[pos + 1] == '\n' {
63                    token = TexToken::new(TexTokenType::Newline, "\n".to_string());
64                    pos += 2;
65                } else {
66                    token = TexToken::new(TexTokenType::Newline, "\n".to_string());
67                    pos += 1;
68                }
69            }
70            ' ' => {
71                let mut new_pos = pos;
72                while new_pos < latex.len() && latex[new_pos] == ' ' {
73                    new_pos += 1;
74                }
75                token = TexToken::new(TexTokenType::Space, latex[pos..new_pos].iter().collect());
76                pos = new_pos;
77            }
78            '\\' => {
79                if pos + 1 >= latex.len() {
80                    return Err("Expecting command name after '\\'".to_string());
81                }
82                let first_two_chars = latex[pos..pos + 2].iter().collect::<String>();
83                if ["\\\\", "\\,"].contains(&&*first_two_chars) {
84                    token = TexToken::new(TexTokenType::Control, first_two_chars.to_string());
85                } else if ["\\{", "\\}", "\\%", "\\$", "\\&", "\\#", "\\_", "\\|"].contains(&&*first_two_chars) {
86                    token = TexToken::new(TexTokenType::Element, first_two_chars.to_string());
87                } else {
88                    let command = eat_command_name(&latex, pos + 1);
89                    token = TexToken::new(TexTokenType::Command, format!("\\{}", command));
90                }
91                pos += token.value.len();
92            }
93            _ => {
94                if first_char.is_digit(10) {
95                    let mut new_pos = pos;
96                    while new_pos < latex.len() && latex[new_pos].is_digit(10) {
97                        new_pos += 1;
98                    }
99                    token = TexToken::new(TexTokenType::Element, latex[pos..new_pos].iter().collect());
100                } else if first_char.is_alphabetic() {
101                    token = TexToken::new(TexTokenType::Element, first_char.to_string());
102                } else if "+-*/='<>!.,;:?()[]|".contains(first_char) {
103                    token = TexToken::new(TexTokenType::Element, first_char.to_string());
104                } else if "~".contains(first_char) {
105                    token = TexToken::new(TexTokenType::NoBreakSpace, "space.nobreak".to_string());
106                } else {
107                    token = TexToken::new(TexTokenType::Unknown, first_char.to_string());
108                }
109                pos += token.value.len();
110            }
111        }
112
113        tokens.push(token.clone());
114
115        if token.token_type == TexTokenType::Command
116            && matches!(token.value.as_str(), r"\text" | r"\operatorname" | r"\begin" | r"\end")
117        {
118            if pos >= latex.len() || latex[pos] != '{' {
119                if let Some(nn) = latex[pos..].iter().position(|&c| c == '{') {
120                    pos += nn;
121                } else {
122                    return Err(format!("No content for {} command", token.value));
123                }
124            }
125            tokens.push(TexToken::new(TexTokenType::Control, "{".to_string()));
126            let pos_closing_bracket = find_closing_curly_bracket_char(&latex, pos)?;
127            pos += 1;
128            let mut text_inside: String = latex[pos..pos_closing_bracket].iter().collect();
129            let chars = ['{', '}', '\\', '$', '&', '#', '_', '%'];
130            for &char in &chars {
131                text_inside = text_inside.replace(&format!("\\{}", char), &char.to_string());
132            }
133            tokens.push(TexToken::new(TexTokenType::Text, text_inside));
134            tokens.push(TexToken::new(TexTokenType::Control, "}".to_string()));
135            pos = pos_closing_bracket + 1;
136        }
137    }
138
139    Ok(pass_ignore_whitespace_before_script_mark(tokens))
140}
141
142// Remove all whitespace before or after _ or ^
143fn pass_ignore_whitespace_before_script_mark(tokens: Vec<TexToken>) -> Vec<TexToken> {
144    let is_script_mark = |token: &TexToken| token.eq(&SUB_SYMBOL) || token.eq(&SUP_SYMBOL);
145    let mut out_tokens: Vec<TexToken> = Vec::new();
146
147    for i in 0..tokens.len() {
148        if tokens[i].token_type == TexTokenType::Space && i + 1 < tokens.len() && is_script_mark(&tokens[i + 1]) {
149            continue;
150        }
151        if tokens[i].token_type == TexTokenType::Space && i > 0 && is_script_mark(&tokens[i - 1]) {
152            continue;
153        }
154        out_tokens.push(tokens[i].clone());
155    }
156
157    out_tokens
158}