gtk_ui_builder/parser/tokenizer.rs
1use std::collections::VecDeque;
2
3use super::token::Token;
4use super::tokenize_error::TokenizeError;
5
6pub struct Tokenizer;
7
8impl Tokenizer {
9 /// Parse syntax tokens from input string
10 ///
11 /// ```
12 /// use gtk_ui_builder::parser::prelude::*;
13 ///
14 /// let tokens = Tokenizer::parse("[a b c]").expect("Failed to tokenize");
15 ///
16 /// if let Token::SquareBrackets { tokens, .. } = &tokens[0] {
17 /// let mut list = Vec::new();
18 ///
19 /// for token in tokens {
20 /// if let Token::Other { value, .. } = token {
21 /// list.push(value);
22 /// }
23 /// }
24 ///
25 /// println!("Tokenized list: {:?}", list);
26 /// }
27 /// ```
28 pub fn parse<T: ToString>(text: T) -> Result<Vec<Token>, TokenizeError> {
29 let mut tokens = Vec::new();
30 let mut word = String::new();
31
32 let text = text.to_string().chars().collect::<Vec<char>>();
33
34 let mut token_begin = 0;
35 let mut i = 0;
36
37 while i < text.len() {
38 // Other token end
39 if text[i].is_whitespace() {
40 // Two ifs to avoid else execution
41 if !word.is_empty() {
42 tokens.push(Token::Other {
43 begin: token_begin,
44 end: i - 1,
45 value: word
46 });
47
48 word = String::new();
49 }
50
51 token_begin = i + 1;
52 }
53
54 // Parse string
55 else if text[i] == '"' {
56 // Save not empty word as Other token
57 if !word.is_empty() {
58 tokens.push(Token::Other {
59 begin: token_begin,
60 end: i - 1,
61 value: word
62 });
63
64 token_begin = i;
65 word = String::new();
66 }
67
68 let mut correct_str = false;
69 i += 1;
70
71 while i < text.len() {
72 // TODO: slashes encoding
73 if text[i] == '"' {
74 correct_str = true;
75
76 break;
77 }
78
79 else {
80 word.push(text[i]);
81 }
82
83 i += 1;
84 }
85
86 // Return Err if string end wasn't found ("string)
87 if !correct_str {
88 return Err(TokenizeError::IncorrectString {
89 message: format!("Incorrect string format found from offset {} to {}", token_begin, i),
90 begin: token_begin,
91 end: i,
92 wrong_string: word
93 });
94 }
95
96 tokens.push(Token::Text {
97 begin: token_begin,
98 end: i,
99 value: word
100 });
101
102 word = String::new();
103 token_begin = i + 1;
104 }
105
106 // Parse brackets
107 else if text[i] == '(' || text[i] == '[' || text[i] == '{' {
108 // Save not empty word as Other token
109 if !word.is_empty() {
110 tokens.push(Token::Other {
111 begin: token_begin,
112 end: i - 1,
113 value: word
114 });
115
116 token_begin = i;
117 word = String::new();
118 }
119
120 let mut brackets_stack = VecDeque::from([text[i]]);
121 let mut correct_order = false;
122
123 i += 1;
124
125 while i < text.len() {
126 if let Some(rev_bracket) = Self::get_rev_bracket(text[i]) {
127 if brackets_stack.back() == Some(&rev_bracket) {
128 brackets_stack.pop_back();
129
130 if brackets_stack.is_empty() {
131 correct_order = true;
132
133 break;
134 }
135 }
136
137 else {
138 brackets_stack.push_back(text[i]);
139 }
140 }
141
142 word.push(text[i]);
143 i += 1;
144 }
145
146 // Return Err if brackets end wasn't found ([string)
147 if !correct_order {
148 return Err(TokenizeError::IncorrectBrackets {
149 message: format!("Incorrect brackets format found from offset {} to {}", token_begin, i),
150 begin: token_begin,
151 end: i,
152 wrong_string: word
153 });
154 }
155
156 let sub_tokens = Self::inc_tokens_offsets(Self::parse(word)?, token_begin + 1);
157
158 tokens.push(match &text[token_begin] {
159 '(' => Token::Parentheses { begin: token_begin, end: i, tokens: sub_tokens },
160 '[' => Token::SquareBrackets { begin: token_begin, end: i, tokens: sub_tokens },
161 '{' => Token::CurlyBrackets { begin: token_begin, end: i, tokens: sub_tokens },
162 _ => unreachable!()
163 });
164
165 word = String::new();
166 token_begin = i + 1;
167 }
168
169 // Push Other token character
170 // There may be a situation like {}; where ; will be parsed as Other
171 // while {} as CurlyBrackets, so technically ; will be the first character. This is wrong
172 // so we need check previous character
173 else { /* if Self::is_normal_char(text[i], word.is_empty() && (if i > 0 { text[i - 1].is_whitespace() } else { true })) {*/
174 word.push(text[i]);
175 }
176
177 // Wrong Other token character
178 /*else {
179 return Err(TokenizeError::IncorrectChar {
180 message: format!("Incorrect character ({}) found at offset {}", text[i], i),
181 wrong_string: word,
182 offset: i
183 });
184 }*/
185
186 i += 1;
187 }
188
189 if !word.is_empty() {
190 tokens.push(Token::Other {
191 begin: token_begin,
192 end: text.len() - 1,
193 value: word
194 });
195 }
196
197 Ok(tokens)
198 }
199
200 fn get_rev_bracket(bracket: char) -> Option<char> {
201 match bracket {
202 '(' => Some(')'),
203 '[' => Some(']'),
204 '{' => Some('}'),
205
206 ')' => Some('('),
207 ']' => Some('['),
208 '}' => Some('{'),
209
210 _ => None
211 }
212 }
213
214 /*fn is_normal_char(char: char, first: bool) -> bool {
215 !first ||
216 (char >= 'a' && char <= 'z') ||
217 (char >= 'A' && char <= 'Z') ||
218 (char >= '0' && char <= '9') ||
219 char == '.' || char == '_' || char == '='
220 }*/
221
222 fn inc_tokens_offsets(mut tokens: Vec<Token>, offset: usize) -> Vec<Token> {
223 for token in &mut tokens {
224 match token {
225 Token::Text { begin, end, .. } => { *begin += offset; *end += offset; },
226 Token::Other { begin, end, .. } => { *begin += offset; *end += offset; },
227
228 Token::Parentheses { begin, end, tokens } => {
229 *begin += offset;
230 *end += offset;
231 *tokens = Self::inc_tokens_offsets(tokens.clone(), offset);
232 },
233
234 Token::SquareBrackets { begin, end, tokens } => {
235 *begin += offset;
236 *end += offset;
237 *tokens = Self::inc_tokens_offsets(tokens.clone(), offset);
238 },
239
240 Token::CurlyBrackets { begin, end, tokens } => {
241 *begin += offset;
242 *end += offset;
243 *tokens = Self::inc_tokens_offsets(tokens.clone(), offset);
244 }
245 }
246 }
247
248 tokens
249 }
250}