pub mod token;
pub mod tokenizer_error;

use std::collections::HashMap;

use hmap::hmap;
use lazy_static::lazy_static;
use regex::Regex;

use self::{
    token::{Token, TokenType},
    tokenizer_error::TokenizerError,
};

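/// Delimiter configuration for the tokenizer: the directive `open`/`close`
/// markers and the `pound`/`at` sigils (defaults: `{{`, `}}`, `#`, `@`).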
#[derive(Debug, Clone)]
pub struct TokenizerOptions {
    open: String,
    close: String,
    pound: String,
    at: String,
}

impl TokenizerOptions {
    pub fn new(
        open: impl ToString,
        close: impl ToString,
        pound: impl ToString,
        at: impl ToString,
    ) -> Result<Self, ()> {
        let open = open.to_string();
        let close = close.to_string();
        let pound = pound.to_string();
        let at = at.to_string();

        // Every delimiter must be non-empty: an empty symbol would match at
        // any position and stall the tokenizer.
        if open.is_empty() || close.is_empty() || pound.is_empty() || at.is_empty() {
            Err(())
        } else {
            Ok(Self {
                open,
                close,
                pound,
                at,
            })
        }
    }
}

impl std::default::Default for TokenizerOptions {
    fn default() -> Self {
        Self::new("{{", "}}", "#", "@").unwrap()
    }
}

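/// A single-pass tokenizer that walks `source` left to right, emitting raw
/// [`TokenType::Source`] spans and the tokens found inside directives.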
pub struct Tokenizer<'a> {
    /// The complete input, kept around for reference.
    _source: &'a str,
    /// The not-yet-consumed tail of the input.
    left: &'a str,
    /// Byte offset of `left` within the original input.
    location: usize,
    /// The delimiter configuration.
    options: TokenizerOptions,
    /// Tokens produced so far.
    tokens: Vec<Token<'a>>,
    /// Fixed symbols, sorted longest-first for maximal-munch matching.
    static_symbol_tokens: Vec<(String, TokenType)>,
    /// Keywords that take precedence over plain identifiers.
    reserved_words: HashMap<&'static str, TokenType>,
    /// Saved `(location, left)` pairs used by `record`/`restore`.
    recorded: Vec<(usize, &'a str)>,
}

impl<'a> Tokenizer<'a> {
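    /// Builds a tokenizer over `source` using the delimiters in `options`.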
    pub fn new(source: &'a str, options: TokenizerOptions) -> Self {
        let mut static_symbol_tokens = [
            ("?", TokenType::QuestionMark),
            (":", TokenType::Colon),
            (",", TokenType::Comma),
            (".", TokenType::Dot),
            ("(", TokenType::LBracket),
            (")", TokenType::RBracket),
            ("[", TokenType::SquareLBracket),
            ("]", TokenType::SquareRBracket),
            ("+", TokenType::Plus),
            ("-", TokenType::Minus),
            ("*", TokenType::Star),
            ("/", TokenType::Slash),
            ("||", TokenType::Or),
            ("&&", TokenType::And),
            ("!", TokenType::Not),
            ("=", TokenType::Assign),
            (&options.open, TokenType::Open),
            (&options.close, TokenType::Close),
            (&options.pound, TokenType::Pound),
            (&options.at, TokenType::At),
            ("==", TokenType::Equals),
            ("!=", TokenType::NotEquals),
            (">", TokenType::GreaterThan),
            ("<", TokenType::SmallerThan),
            (">=", TokenType::GreaterEquals),
            ("<=", TokenType::SmallerEquals),
        ]
        .map(|(key, value)| (key.to_string(), value))
        .into_iter()
        .collect::<Vec<_>>();
        // Match longest symbols first, so that e.g. `>=` wins over `>` and
        // `==` wins over `=`.
        static_symbol_tokens.sort_by_key(|item| item.0.len());
        static_symbol_tokens.reverse();

        let reserved_words = hmap!(
            "if" => TokenType::If,
            "else" => TokenType::Else,
            "elif" => TokenType::Elif,
            "for" => TokenType::For,
            "None" => TokenType::NoneLiteral
        );

        Self {
            _source: source,
            left: source,
            location: 0,
            options,
            tokens: Vec::new(),
            static_symbol_tokens,
            reserved_words,
            recorded: Vec::new(),
        }
    }

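    /// Consumes `len` bytes of the remaining input, keeping `location` in sync.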
    fn advance(&mut self, len: usize) {
        self.left = &self.left[len..];
        self.location += len;
    }

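    /// Saves the current position so it can later be rewound with `restore`.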
    fn record(&mut self) {
        self.recorded.push((self.location, self.left));
    }

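    /// Rewinds to the most recently recorded position.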
    fn restore(&mut self) {
        (self.location, self.left) = self.recorded.pop().unwrap();
    }

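    /// Emits a token for the next `len` bytes and consumes them; zero-length
    /// tokens are silently dropped.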
    fn push_and_advance(&mut self, tt: TokenType, len: usize) {
        if len > 0 {
            self.tokens
                .push(Token::new(tt, &self.left[..len], self.location));
            self.advance(len);
        }
    }

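    /// Consumes the tokenizer and returns the full token stream, or the first
    /// error encountered.
    ///
    /// A minimal usage sketch (mirrors `test_01` below):
    ///
    /// ```ignore
    /// let tokens = Tokenizer::new("abc{{abc}}", TokenizerOptions::default())
    ///     .tokenize()
    ///     .unwrap();
    /// // tokens: Source("abc"), Open, Identifier("abc"), Close
    /// ```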
    pub fn tokenize(mut self) -> Result<Vec<Token<'a>>, TokenizerError> {
        loop {
            if let Some(l) = self.left.find(&self.options.open) {
                // Everything up to the opening delimiter is raw source.
                self.push_and_advance(TokenType::Source, l);

                // Tokenize the directive until it reports completion.
                while !self.next_directive_token()? {}
            } else {
                // No further directive: the rest of the input is raw source.
                self.push_and_advance(TokenType::Source, self.left.len());
                break;
            }
        }

        Ok(self.tokens)
    }

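    /// Scans a single token inside a directive. Returns `Ok(true)` once the
    /// directive is finished (close delimiter reached or input exhausted), and
    /// `Ok(false)` if more tokens may follow.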
    fn next_directive_token(&mut self) -> Result<bool, TokenizerError> {
        lazy_static! {
            static ref IDENTIFIER_RE: Regex = Regex::new(r"^([_a-zA-Z][_a-zA-Z0-9]*)").unwrap();
            static ref NUMERICAL_LITERAL_RE: Regex = Regex::new(r"^([0-9]+)").unwrap();
            static ref FILE_PATH_RE: Regex = Regex::new(r"^\s*([0-9a-zA-Z_\./]+)").unwrap();
        }

        // End of input also ends the directive.
        if self.left.is_empty() {
            return Ok(true);
        }

        // Skip whitespace, treating a CRLF pair as a single step.
        if self.left.starts_with("\r\n") {
            self.advance(2);
            return Ok(false);
        }
        if self.left.starts_with(' ') || self.left.starts_with('\t') || self.left.starts_with('\n')
        {
            self.advance(1);
            return Ok(false);
        }

        // Try the static symbol table (already sorted longest-first).
        let found = self.static_symbol_tokens.iter().find_map(|(s, tt)| {
            if self.left.starts_with(s) {
                Some((*tt, s.len()))
            } else {
                None
            }
        });

        if let Some((tt, len)) = found {
            self.push_and_advance(tt, len);

            // The close delimiter ends the directive.
            if tt == TokenType::Close {
                return Ok(true);
            }

            // An `@` must be followed by a file path.
            if tt == TokenType::At {
                if let Some(captures) = FILE_PATH_RE.captures(self.left) {
                    if let Some(m0) = captures.get(0) {
                        let whole_length = m0.as_str().len();

                        if let Some(m) = captures.get(1) {
                            let len = m.as_str().len();

                            // Skip the leading whitespace matched by the regex.
                            self.advance(whole_length - len);

                            self.push_and_advance(TokenType::FilePath, len);
                            return Ok(false);
                        }
                    }
                }

                return Err(TokenizerError::MustHaveFilePathAfterAt(
                    self.location,
                    self.options.at.clone(),
                ));
            }

            return Ok(false);
        }

        if self.left.starts_with('\'') {
            // A run of N quotes opens the string, and the same run must close
            // it; this lets e.g. ''_'a'_'' contain single quotes.
            let delimiter_len = self.left.len() - self.left.trim_start_matches('\'').len();
            let delimiter = "'".repeat(delimiter_len);
            let string_start = self.location;
            self.advance(delimiter_len);

            self.record();
            let mut string_len = 0;
            while !self.left.starts_with(&delimiter) {
                if self.left.is_empty() {
                    return Err(TokenizerError::UnclosedString(string_start));
                }
                self.advance(1);
                string_len += 1;
            }
            self.restore();

            self.push_and_advance(TokenType::StringLiteral, string_len);
            self.advance(delimiter_len);
            return Ok(false);
        }


        if let Some(captures) = NUMERICAL_LITERAL_RE.captures(self.left) {
            let l = captures.get(0).unwrap().as_str().len();
            self.push_and_advance(TokenType::NumericalLiteral, l);
            return Ok(false);
        }

        if let Some(captures) = IDENTIFIER_RE.captures(self.left) {
            let s = captures.get(0).unwrap().as_str();
            let l = s.len();

            // Reserved words win over plain identifiers.
            let tt = self
                .reserved_words
                .get(s)
                .copied()
                .unwrap_or(TokenType::Identifier);

            self.push_and_advance(tt, l);

            return Ok(false);
        }


        Err(TokenizerError::UnexpectedCharacter(self.location))
    }
}

#[cfg(test)]
mod tests {
    use crate::tokenizer::{TokenType, Tokenizer, TokenizerOptions};

    use super::Token;

    fn compare_tokens(tokens: &[Token], target_tokens: &[(TokenType, Option<&str>)]) {
        // Check lengths first so that missing or extra tokens fail the test
        // instead of being silently dropped by `zip`.
        assert_eq!(tokens.len(), target_tokens.len());
        for (a, b) in tokens.iter().zip(target_tokens.iter()) {
            assert_eq!(a.tt, b.0, "testing: {} {:?}", a.content, b.1);
            if let Some(content) = b.1 {
                assert_eq!(a.content, content);
            }
        }
    }

    #[test]
    fn test_01() {
        let tokens = Tokenizer::new("abc{{abc}}", TokenizerOptions::default())
            .tokenize()
            .unwrap();

        let target_tokens = vec![
            (TokenType::Source, None),
            (TokenType::Open, None),
            (TokenType::Identifier, Some("abc")),
            (TokenType::Close, None),
        ];

        compare_tokens(&tokens, &target_tokens);
    }

    #[test]
    fn test_02() {
        let tokens = Tokenizer::new("abc{{abc}}abc", TokenizerOptions::default())
            .tokenize()
            .unwrap();

        let target_tokens = vec![
            (TokenType::Source, None),
            (TokenType::Open, None),
            (TokenType::Identifier, Some("abc")),
            (TokenType::Close, None),
            (TokenType::Source, Some("abc")),
        ];

        compare_tokens(&tokens, &target_tokens);
    }

    #[test]
    fn test_03() {
        // Exercises maximal munch: `>=` must tokenize as GreaterEquals, not
        // as GreaterThan followed by Assign.
        let tokens = Tokenizer::new("{{>>>= ==<<<=}}", TokenizerOptions::default())
            .tokenize()
            .unwrap();

        let target_tokens = vec![
            (TokenType::Open, None),
            (TokenType::GreaterThan, None),
            (TokenType::GreaterThan, None),
            (TokenType::GreaterEquals, None),
            (TokenType::Equals, None),
            (TokenType::SmallerThan, None),
            (TokenType::SmallerThan, None),
            (TokenType::SmallerEquals, None),
            (TokenType::Close, None),
        ];

        compare_tokens(&tokens, &target_tokens);
    }

    #[test]
    fn test_04() {
        let tokens = Tokenizer::new(
            "{{ a > b ? 'item 1' : 'item 2' }}",
            TokenizerOptions::default(),
        )
        .tokenize()
        .unwrap();

        let target_tokens = vec![
            (TokenType::Open, None),
            (TokenType::Identifier, Some("a")),
            (TokenType::GreaterThan, None),
            (TokenType::Identifier, Some("b")),
            (TokenType::QuestionMark, None),
            (TokenType::StringLiteral, Some("item 1")),
            (TokenType::Colon, None),
            (TokenType::StringLiteral, Some("item 2")),
            (TokenType::Close, None),
        ];

        compare_tokens(&tokens, &target_tokens);
    }

    #[test]
    fn test_05() {
        // `''_'a'_''` uses a doubled-quote delimiter so the literal itself
        // can contain single quotes.
        let tokens = Tokenizer::new(
            "{{@ file(arg0 = arg, arg1 = 'txt', arg2 = 123 > num, arg3 = ''_'a'_'')}}",
            TokenizerOptions::default(),
        )
        .tokenize()
        .unwrap();

        let target_tokens = vec![
            (TokenType::Open, None),
            (TokenType::At, None),
            (TokenType::FilePath, Some("file")),
            (TokenType::LBracket, None),
            (TokenType::Identifier, Some("arg0")),
            (TokenType::Assign, None),
            (TokenType::Identifier, Some("arg")),
            (TokenType::Comma, None),
            (TokenType::Identifier, Some("arg1")),
            (TokenType::Assign, None),
            (TokenType::StringLiteral, Some("txt")),
            (TokenType::Comma, None),
            (TokenType::Identifier, Some("arg2")),
            (TokenType::Assign, None),
            (TokenType::NumericalLiteral, Some("123")),
            (TokenType::GreaterThan, None),
            (TokenType::Identifier, Some("num")),
            (TokenType::Comma, None),
            (TokenType::Identifier, Some("arg3")),
            (TokenType::Assign, None),
            (TokenType::StringLiteral, Some("_'a'_")),
            (TokenType::RBracket, None),
            (TokenType::Close, None),
        ];

        compare_tokens(&tokens, &target_tokens);
    }

    #[test]
    fn test_06() {
        let tokens = Tokenizer::new("{{a != None ? a : 'default'}}", TokenizerOptions::default())
            .tokenize()
            .unwrap();

        let target_tokens = vec![
            (TokenType::Open, None),
            (TokenType::Identifier, Some("a")),
            (TokenType::NotEquals, None),
            (TokenType::NoneLiteral, None),
            (TokenType::QuestionMark, None),
            (TokenType::Identifier, Some("a")),
            (TokenType::Colon, None),
            (TokenType::StringLiteral, Some("default")),
            (TokenType::Close, None),
        ];

        compare_tokens(&tokens, &target_tokens);
    }
}