// pupil/lexer.rs

1//! Lexing.
2
3use ::std::str;
4use super::env::Value;
5use super::op::Operator;
6
7//----------------------------------------------------------------
8
/// Supported token types.
#[derive(Clone, Debug, PartialEq)]
pub enum Token<'a> {
	/// Unknown token.
	///
	/// It’s the caller’s responsibility to handle this with an error of some kind.
	///
	/// This will swallow the entire remainder of the input, such that the tokenization finishes on next iteration.
	Unk(&'a str),
	/// Value literal token.
	///
	/// Negative literals are `Token::Op(Operator::Sub)` followed by a positive literal.
	Lit(Value),
	/// Operator token.
	Op(Operator),
	/// Variable token.
	///
	/// Alphanumeric characters only. Not followed by a `(`.
	Var(&'a str),
	/// Function token.
	///
	/// Alphanumeric characters only. Implicitly followed by a `(`: the parenthesis
	/// is consumed by the lexer and not emitted as a separate token.
	Open(&'a str),
	/// Comma token `,`.
	///
	/// Used to provide multiple arguments to a function.
	Comma,
	/// Function closing token `)`.
	Close,
}
39
40//----------------------------------------------------------------
41
/// Iterator over tokens in a string.
///
/// Construct with `tokenize`; yields `Token`s until the input is exhausted.
pub struct TokenIterator<'a> {
	// Remaining unlexed input; every successful lex advances this iterator
	// past the characters it consumed.
	iter: str::Chars<'a>,
}
46
47impl<'a> TokenIterator<'a> {
48	fn skip_whitespace(&mut self) -> bool {
49		// Use Clones instead of Peekable...
50		let mut iter = self.iter.clone();
51		while let Some(chr) = iter.next() {
52			if !chr.is_whitespace() {
53				return true;
54			}
55			// Overwrite with previous iterator
56			self.iter = iter.clone();
57		}
58		return false;
59	}
60	fn lex_lit(&mut self) -> Option<Token<'a>> {
61		strtod(self.iter.as_str()).map(|(num, s)| {
62			// Update the iterator
63			self.iter = s.chars();
64			Token::Lit(num)
65		})
66	}
67	fn lex_op(&mut self) -> Option<Token<'a>> {
68		let mut iter = self.iter.clone();
69		if let Some(chr) = iter.next() {
70			let tok = match chr {
71				'+' => Token::Op(Operator::Add),
72				'-' => Token::Op(Operator::Sub),
73				'*' => Token::Op(Operator::Mul),
74				'/' => Token::Op(Operator::Div),
75				'%' => Token::Op(Operator::Rem),
76				'^' => Token::Op(Operator::Pow),
77				',' => Token::Comma,
78				')' => Token::Close,
79				_ => return None,
80			};
81			self.iter = iter;
82			Some(tok)
83		}
84		else {
85			None
86		}
87	}
88	fn lex_id(&mut self) -> Option<Token<'a>> {
89		let s = self.iter.as_str();
90		// Scan for a non-alphanumeric character, take whole string otherwise
91		let end = s.char_indices()
92			.find(|&(_, chr)| !chr.is_alphanumeric())
93			.map(|(pos, _)| pos)
94			.unwrap_or(s.len());
95		// Slice the identifier
96		let (s_id, s_rem) = s.split_at(end);
97		// Check for opening parenthesis
98		let mut paren_it = s_rem.chars();
99		let paren = if let Some(chr) = paren_it.next() { chr == '(' } else { false };
100		// Parenthesis means a function begin
101		if paren {
102			self.iter = paren_it;
103			Some(Token::Open(s_id))
104		}
105		// Otherwise is a variable
106		else {
107			// Variables can’t have length zero
108			if s_id.len() == 0 {
109				None
110			}
111			else {
112				self.iter = s_rem.chars();
113				Some(Token::Var(s_id))
114			}
115		}
116	}
117	fn lex_unk(&mut self) -> Option<Token<'a>> {
118		// Unknown tokens handled upstream
119		// Set the iterator to finish on next() otherwise it would never end
120		let s_rem = self.iter.as_str();
121		self.iter = "".chars();
122		Some(Token::Unk(s_rem))
123	}
124}
125
/// Parses a floating point literal from the start of `s`.
///
/// Returns the parsed value and the unconsumed remainder of `s`, or `None`
/// when `s` does not begin with a number.
///
/// Accepts `digits`, `digits.`, `digits.digits` and `.digits`, each optionally
/// followed by an exponent (`e`/`E`, optional sign, at least one digit).
/// Unlike C’s `strtod` this does not skip leading whitespace and rejects
/// signs, hex floats, `inf` and `nan`: the lexer emits `-`/`+` as operator
/// tokens, and words such as `inf` must lex as variables (see the unit tests).
///
/// This replaces the previous `unsafe` libc-based implementation, which used
/// the deprecated `mem::uninitialized()` and silently truncated the input to
/// 31 bytes, mis-parsing longer literals and returning a wrong remainder.
fn strtod(s: &str) -> Option<(f64, &str)> {
	let bytes = s.as_bytes();
	let mut end = 0;
	// Integer part
	while end < bytes.len() && bytes[end].is_ascii_digit() {
		end += 1;
	}
	// Optional fraction; a lone `.` with no digits on either side is not a number
	if end < bytes.len() && bytes[end] == b'.' {
		let mut frac = end + 1;
		while frac < bytes.len() && bytes[frac].is_ascii_digit() {
			frac += 1;
		}
		if end > 0 || frac > end + 1 {
			end = frac;
		}
	}
	// Optional exponent; only consumed when digits actually follow,
	// so `1e` lexes as the literal `1` followed by the identifier `e`
	if end > 0 && end < bytes.len() && (bytes[end] == b'e' || bytes[end] == b'E') {
		let mut exp = end + 1;
		if exp < bytes.len() && (bytes[exp] == b'+' || bytes[exp] == b'-') {
			exp += 1;
		}
		let exp_digits = exp;
		let mut exp_end = exp;
		while exp_end < bytes.len() && bytes[exp_end].is_ascii_digit() {
			exp_end += 1;
		}
		if exp_end > exp_digits {
			end = exp_end;
		}
	}
	if end == 0 {
		None
	}
	else {
		// The consumed prefix always satisfies `f64::from_str`’s grammar
		s[..end].parse().ok().map(|num| (num, &s[end..]))
	}
}
151
152impl<'a> Iterator for TokenIterator<'a> {
153	type Item = Token<'a>;
154	fn next(&mut self) -> Option<Token<'a>> {
155		// Start by skipping over the whitespace
156		if self.skip_whitespace() {
157			// Try lexing as various tokens
158			if let tok @ Some(_) = self.lex_op() { tok }
159			else if let tok @ Some(_) = self.lex_lit() { tok }
160			else if let tok @ Some(_) = self.lex_id() { tok }
161			else { self.lex_unk() }
162		}
163		// End of string
164		else {
165			None
166		}
167	}
168}
169
170/// Create a new TokenIterator for a string.
171pub fn tokenize<'a>(input: &'a str) -> TokenIterator<'a> {
172	TokenIterator {
173		iter: input.chars(),
174	}
175}
176
#[cfg(test)]
mod tests {
	use super::{tokenize, strtod};
	use super::Token::*;
	use super::super::op::Operator::*;

	// End-to-end checks of the tokenizer over representative inputs.
	#[test]
	fn units() {
		// Literals; NaN can’t be asserted here because NaN != NaN makes `assert_eq!` fail.
		// Note `inf` is expected to lex as a variable, not a literal.
		assert_eq!(tokenize("12.4 45 -0.111 inf").collect::<Vec<_>>(),
			vec![Lit(12.4), Lit(45.0), Op(Sub), Lit(0.111), Var("inf")]);
		// Functions and Variables; a bare `(` lexes as the anonymous function `Open("")`
		assert_eq!(tokenize("fn(12, (2ans))-pi").collect::<Vec<_>>(),
			vec![Open("fn"), Lit(12.0), Comma, Open(""), Lit(2.0), Var("ans"), Close, Close, Op(Sub), Var("pi")]);
		// All Operators; `-4` lexes as `Op(Sub)` followed by a positive literal
		assert_eq!(tokenize("1%2+3-5*-4/2^1").collect::<Vec<_>>(),
			vec![Lit(1.0), Op(Rem), Lit(2.0), Op(Add), Lit(3.0), Op(Sub), Lit(5.0), Op(Mul), Op(Sub), Lit(4.0), Op(Div), Lit(2.0), Op(Pow), Lit(1.0)]);
		// Unknown input swallows the entire rest of the string
		assert_eq!(tokenize("2 + 3 * !èè&").collect::<Vec<_>>(),
			vec![Lit(2.0), Op(Add), Lit(3.0), Op(Mul), Unk("!èè&")]);
	}
	#[test]
	fn regressions() {
		// Regression test: fixed `strtod` from reading past the real input
		assert_eq!(strtod(&"1234"[..2]), Some((12.0, "")));
	}
}