mini_builder_rs/tokenizer/mod.rs

//! Tokenization of a string.
//!
//! - [Tokenizer] - Performs the tokenization.
//! - [TokenizerOptions] - Various configurations for the [Tokenizer], for
//!                        example alternative symbols for some tokens.
//! - [Token] - A struct that describes a token - it contains the text of the
//!             token along with its type.
//! - [TokenType] - The type of the token.
//! - [TokenizerError] - The error that the [Tokenizer] returns when
//!                      tokenization fails.
//!
//! # Example
//! ```rust
//! use mini_builder_rs::tokenizer::{Tokenizer, TokenizerOptions};
//!
//! let source = "is 1 bigger than 2? {{ 1 > 2 ? 'yes' : 'no' }}";
//! let tokens = Tokenizer::new(source, TokenizerOptions::default())
//!     .tokenize()
//!     .unwrap();
//! // tokens will be a vector of `Token<'a>`s, where `'a` is the lifetime of
//! // `source`
//! ```

pub mod token;
pub mod tokenizer_error;

use std::collections::HashMap;

use hmap::hmap;
use lazy_static::lazy_static;
use regex::Regex;

use self::{
	token::{Token, TokenType},
	tokenizer_error::TokenizerError,
};

/// Used to configure the [Tokenizer].
#[derive(Debug, Clone)]
pub struct TokenizerOptions {
	open: String,
	close: String,
	pound: String,
	at: String,
}

impl TokenizerOptions {
	// TODO: proper error
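	/// Creates options with custom symbols. A minimal doc sketch; the
	/// `<%` / `%>` delimiters below are illustrative, not library defaults.
	///
	/// ```rust
	/// use mini_builder_rs::tokenizer::TokenizerOptions;
	///
	/// let options = TokenizerOptions::new("<%", "%>", "#", "@").unwrap();
	/// ```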
	pub fn new(
		open: impl ToString,
		close: impl ToString,
		pound: impl ToString,
		at: impl ToString,
	) -> Result<Self, ()> {
		let open = open.to_string();
		let close = close.to_string();
		let pound = pound.to_string();
		let at = at.to_string();

		// every symbol must be non-empty; an empty symbol would match at
		// every position
		if open.is_empty() || close.is_empty() || pound.is_empty() || at.is_empty() {
			Err(())
		} else {
			Ok(Self {
				open,
				close,
				pound,
				at,
			})
		}
	}
}

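/// The default options: `{{` / `}}` as the directive delimiters, `#` as the
/// pound symbol and `@` as the at symbol that introduces file paths.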
impl std::default::Default for TokenizerOptions {
	fn default() -> Self {
		Self::new("{{", "}}", "#", "@").unwrap()
	}
}

/// Given a string slice, tokenizes it into a vector of [Token]s.
pub struct Tokenizer<'a> {
	/// The whole text that is being tokenized.
	_source: &'a str,
	/// The part of the text that hasn't been tokenized yet.
	left: &'a str,
	/// A byte index into `_source` marking where `left` begins.
	location: usize,
	/// Options.
	options: TokenizerOptions,
	// The parsed tokens.
	tokens: Vec<Token<'a>>,
	// All the tokens that are written with a fixed symbol, sorted by symbol
	// length, longest first.
	static_symbol_tokens: Vec<(String, TokenType)>,
	// Identifiers with a specific name.
	reserved_words: HashMap<&'static str, TokenType>,
	// Used for lookahead.
	recorded: Vec<(usize, &'a str)>,
}

impl<'a> Tokenizer<'a> {
	/// Constructs a new tokenizer.
	pub fn new(source: &'a str, options: TokenizerOptions) -> Self {
		// create the vector of static tokens
		let mut static_symbol_tokens = [
			("?", TokenType::QuestionMark),
			(":", TokenType::Colon),
			(",", TokenType::Comma),
			(".", TokenType::Dot),
			("(", TokenType::LBracket),
			(")", TokenType::RBracket),
			("[", TokenType::SquareLBracket),
			("]", TokenType::SquareRBracket),
			("+", TokenType::Plus),
			("-", TokenType::Minus),
			("*", TokenType::Star),
			("/", TokenType::Slash),
			("||", TokenType::Or),
			("&&", TokenType::And),
			("!", TokenType::Not),
			("=", TokenType::Assign),
			(&options.open, TokenType::Open),
			(&options.close, TokenType::Close),
			(&options.pound, TokenType::Pound),
			(&options.at, TokenType::At),
			("==", TokenType::Equals),
			("!=", TokenType::NotEquals),
			(">", TokenType::GreaterThan),
			("<", TokenType::SmallerThan),
			(">=", TokenType::GreaterEquals),
			("<=", TokenType::SmallerEquals),
		]
		.map(|(key, value)| (key.to_string(), value))
		.into_iter()
		.collect::<Vec<_>>();
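		// sort the symbols longest-first (maximal munch) so that e.g. `>=` is
		// matched before its single-character prefix `>`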
		static_symbol_tokens.sort_by_key(|item| item.0.len());
		static_symbol_tokens.reverse();

		let reserved_words = hmap!(
			"if" => TokenType::If,
			"else" => TokenType::Else,
			"elif" => TokenType::Elif,
			"for" => TokenType::For,
			"None" => TokenType::NoneLiteral
		);

		Self {
			_source: source,
			left: source,
			location: 0,
			options,
			tokens: Vec::new(),
			static_symbol_tokens,
			reserved_words,
			recorded: Vec::new(),
		}
	}

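	/// Consumes `len` bytes from the front of `left`. `len` must lie on a
	/// UTF-8 character boundary, otherwise slicing `left` panics.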
	fn advance(&mut self, len: usize) {
		self.left = &self.left[len..];
		self.location += len;
	}

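	/// Saves the current position so that it can later be rewound with
	/// `restore`.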
	fn record(&mut self) {
		self.recorded.push((self.location, self.left));
	}

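	/// Rewinds to the most recently recorded position.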
	fn restore(&mut self) {
		(self.location, self.left) = self.recorded.pop().unwrap();
	}

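	/// Pushes a token of type `tt` covering the next `len` bytes of `left`,
	/// then advances past it. Zero-length tokens are silently dropped.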
	fn push_and_advance(&mut self, tt: TokenType, len: usize) {
		if len > 0 {
			self.tokens
				.push(Token::new(tt, &self.left[..len], self.location));
			self.advance(len);
		}
	}

	/// Tokenizes the string.
	pub fn tokenize(mut self) -> Result<Vec<Token<'a>>, TokenizerError> {
		loop {
			// find the next directive
			if let Some(l) = self.left.find(&self.options.open) {
				// push all the text before the directive as source
				self.push_and_advance(TokenType::Source, l);

				// tokenize the directive
				loop {
					if self.next_directive_token()? {
						break;
					}
				}
			} else {
				// there are no more directives left, push the rest as source and exit
				self.push_and_advance(TokenType::Source, self.left.len());
				break;
			}
		}

		Ok(self.tokens)
	}

	/// Tokenizes a token that is inside a directive. Returns true if the end
	/// of either the source or the directive was reached.
	fn next_directive_token(&mut self) -> Result<bool, TokenizerError> {
		// regexes for tokens that are defined by patterns, anchored with `^`
		// so they only match at the start of `left`
		lazy_static! {
			static ref IDENTIFIER_RE: Regex = Regex::new(r"^([_a-zA-Z][_a-zA-Z0-9]*)").unwrap();
			static ref NUMERICAL_LITERAL_RE: Regex = Regex::new(r"^([0-9]+)").unwrap();
			static ref FILE_PATH_RE: Regex = Regex::new(r"^\s*([0-9a-zA-Z_\./]+)").unwrap();
		}

		if self.left.is_empty() {
			return Ok(true);
		}

		// ignore whitespace
		if self.left.starts_with("\r\n") {
			self.advance(2);
			return Ok(false);
		}
		if self.left.starts_with(" ") || self.left.starts_with("\t") || self.left.starts_with("\n")
		{
			self.advance(1);
			return Ok(false);
		}

		// try finding a static token
		let found = self.static_symbol_tokens.iter().find_map(|(s, tt)| {
			if self.left.starts_with(s) {
				Some((*tt, s.len()))
			} else {
				None
			}
		});

		// if a static token was found, push it and check whether it ends the
		// directive
		if let Some((tt, len)) = found {
			self.push_and_advance(tt, len);

			// if a `Close` token was found, notify the caller that the
			// directive has ended
			if tt == TokenType::Close {
				return Ok(true);
			}

			// if an `At` token was found, the following token must be a file path
			if tt == TokenType::At {
				// if there is a match, push the file path token
				if let Some(captures) = FILE_PATH_RE.captures(self.left) {
					// get the entire matched text
					if let Some(m0) = captures.get(0) {
						// the length of the entire matched text
						let whole_length = m0.as_str().len();

						// get the captured text
						if let Some(m) = captures.get(1) {
							// the length of the captured text
							let len = m.as_str().len();

							// advance past the whitespace before the match
							self.advance(whole_length - len);

							// push the file path and return
							self.push_and_advance(TokenType::FilePath, len);
							return Ok(false);
						}
					}
				}

				// otherwise return an error
				return Err(TokenizerError::MustHaveFilePathAfterAt(
					self.location,
					self.options.at.clone(),
				));
			}

			return Ok(false);
		}

		// check string literals
		if self.left.starts_with("'") {
			// remember where the literal starts, for error reporting
			let string_start = self.location;

			// match `'` as many times as possible - extract the delimiter
			let delimiter_len = self.left.len() - self.left.trim_start_matches("'").len();
			let delimiter = "'".repeat(delimiter_len);

			// advance past the opening delimiter, keeping `location` in step
			// with `left`
			self.advance(delimiter_len);
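			// e.g. in `''_'a'_''` the delimiter is `''`, so single quotes may
			// appear inside the literal (see test_05 below)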

			// match characters until the closing delimiter
			self.record();
			let mut string_len = 0;
			while !self.left.starts_with(&delimiter) {
				// if no more characters are left, return an unclosed string
				// error
				let c = match self.left.chars().next() {
					Some(c) => c,
					None => return Err(TokenizerError::UnclosedString(string_start)),
				};
				// advance whole characters so that multi-byte UTF-8 is never
				// split
				self.advance(c.len_utf8());
				string_len += c.len_utf8();
			}
			self.restore();

			// extract the content of the string literal and push it
			self.push_and_advance(TokenType::StringLiteral, string_len);

			// advance past the closing delimiter
			self.advance(delimiter_len);
			return Ok(false);
		}

		// check numerical literals
		if let Some(captures) = NUMERICAL_LITERAL_RE.captures(self.left) {
			let l = captures.get(0).unwrap().as_str().len();
			self.push_and_advance(TokenType::NumericalLiteral, l);
			return Ok(false);
		}

		// check identifiers
		if let Some(captures) = IDENTIFIER_RE.captures(self.left) {
			let s = captures.get(0).unwrap().as_str();
			let l = s.len();

			// reserved words get their own token types; anything else is an
			// identifier
			let tt = if let Some(tt) = self.reserved_words.get(s) {
				*tt
			} else {
				TokenType::Identifier
			};

			self.push_and_advance(tt, l);

			return Ok(false);
		}

		// if no token was pushed then an error has occurred
		Err(TokenizerError::UnexpectedCharacter(self.location))
	}
}

#[cfg(test)]
mod tests {
	use crate::tokenizer::{TokenType, Tokenizer, TokenizerOptions};

	use super::Token;

	fn compare_tokens(tokens: &[Token], target_tokens: &[(TokenType, Option<&str>)]) {
		// lengths must match, otherwise `zip` would silently skip extra or
		// missing tokens
		assert_eq!(tokens.len(), target_tokens.len());
		for (a, b) in tokens.iter().zip(target_tokens.iter()) {
			assert_eq!(a.tt, b.0, "testing: {} {:?}", a.content, b.1);
			if let Some(content) = b.1 {
				assert_eq!(a.content, content);
			}
		}
	}

	#[test]
	fn test_01() {
		let tokens = Tokenizer::new("abc{{abc}}", TokenizerOptions::default())
			.tokenize()
			.unwrap();

		let target_tokens = vec![
			(TokenType::Source, None),
			(TokenType::Open, None),
			(TokenType::Identifier, Some("abc")),
			(TokenType::Close, None),
		];

		compare_tokens(&tokens, &target_tokens);
	}

	#[test]
	fn test_02() {
		let tokens = Tokenizer::new("abc{{abc}}abc", TokenizerOptions::default())
			.tokenize()
			.unwrap();

		let target_tokens = vec![
			(TokenType::Source, None),
			(TokenType::Open, None),
			(TokenType::Identifier, Some("abc")),
			(TokenType::Close, None),
			(TokenType::Source, Some("abc")),
		];

		compare_tokens(&tokens, &target_tokens);
	}

	#[test]
	fn test_03() {
		let tokens = Tokenizer::new("{{>>>= ==<<<=}}", TokenizerOptions::default())
			.tokenize()
			.unwrap();

		let target_tokens = vec![
			// {{
			(TokenType::Open, None),
			// >>
			(TokenType::GreaterThan, None),
			(TokenType::GreaterThan, None),
			// >=
			(TokenType::GreaterEquals, None),
			// ==
			(TokenType::Equals, None),
			// <<
			(TokenType::SmallerThan, None),
			(TokenType::SmallerThan, None),
			// <=
			(TokenType::SmallerEquals, None),
			// }}
			(TokenType::Close, None),
		];

		compare_tokens(&tokens, &target_tokens);
	}

	#[test]
	fn test_04() {
		let tokens = Tokenizer::new(
			"{{ a > b ? 'item 1' : 'item 2' }}",
			TokenizerOptions::default(),
		)
		.tokenize()
		.unwrap();

		let target_tokens = vec![
			// {{
			(TokenType::Open, None),
			// a > b
			(TokenType::Identifier, Some("a")),
			(TokenType::GreaterThan, None),
			(TokenType::Identifier, Some("b")),
			// ?
			(TokenType::QuestionMark, None),
			// 'item 1' : 'item 2'
			(TokenType::StringLiteral, Some("item 1")),
			(TokenType::Colon, None),
			(TokenType::StringLiteral, Some("item 2")),
			// }}
			(TokenType::Close, None),
		];

		compare_tokens(&tokens, &target_tokens);
	}

	#[test]
	fn test_05() {
		let tokens = Tokenizer::new(
			"{{@ file(arg0 = arg, arg1 = 'txt', arg2 = 123 > num, arg3 = ''_'a'_'')}}",
			TokenizerOptions::default(),
		)
		.tokenize()
		.unwrap();

		let target_tokens = vec![
			// {{
			(TokenType::Open, None),
			// @ file
			(TokenType::At, None),
			(TokenType::FilePath, Some("file")),
			// (
			(TokenType::LBracket, None),
			// arg0 = arg,
			(TokenType::Identifier, Some("arg0")),
			(TokenType::Assign, None),
			(TokenType::Identifier, Some("arg")),
			(TokenType::Comma, None),
			// arg1 = 'txt',
			(TokenType::Identifier, Some("arg1")),
			(TokenType::Assign, None),
			(TokenType::StringLiteral, Some("txt")),
			(TokenType::Comma, None),
			// arg2 = 123 > num,
			(TokenType::Identifier, Some("arg2")),
			(TokenType::Assign, None),
			(TokenType::NumericalLiteral, Some("123")),
			(TokenType::GreaterThan, None),
			(TokenType::Identifier, Some("num")),
			(TokenType::Comma, None),
			// arg3 = ''_'a'_''
			(TokenType::Identifier, Some("arg3")),
			(TokenType::Assign, None),
			(TokenType::StringLiteral, Some("_'a'_")),
			// )}}
			(TokenType::RBracket, None),
			(TokenType::Close, None),
		];

		compare_tokens(&tokens, &target_tokens);
	}

	#[test]
	fn test_06() {
		let tokens = Tokenizer::new("{{a != None ? a : 'default'}}", TokenizerOptions::default())
			.tokenize()
			.unwrap();

		let target_tokens = vec![
			// {{
			(TokenType::Open, None),
			// a != None
			(TokenType::Identifier, Some("a")),
			(TokenType::NotEquals, None),
			(TokenType::NoneLiteral, None),
			// ?
			(TokenType::QuestionMark, None),
			// a
			(TokenType::Identifier, Some("a")),
			// :
			(TokenType::Colon, None),
			// 'default'
			(TokenType::StringLiteral, Some("default")),
			// }}
			(TokenType::Close, None),
		];

		compare_tokens(&tokens, &target_tokens);
	}
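
	// A minimal sketch of an error-path check: an unterminated string literal
	// inside a directive should tokenize to an `UnclosedString` error rather
	// than panic.
	#[test]
	fn test_unclosed_string() {
		let result =
			Tokenizer::new("{{ 'never closed }}", TokenizerOptions::default()).tokenize();
		assert!(result.is_err());
	}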
}