1use unicode_segmentation::UnicodeSegmentation;
2use unicode_segmentation::Graphemes;
3use regex::Regex;
4
5use std::iter::Peekable;
6
7use token::Token;
8
9
// Pre-compiled character-class patterns shared by the lexer.
// NUMERIC and ALPHABETIC each test a single leading grapheme; WORD matches a
// run of word characters. Note ALPHABETIC is ASCII-only (`[a-zA-Z]`), while
// `\d`/`\w` follow the regex crate's defaults (Unicode-aware — see `id` test
// lexing "bjørn").
lazy_static! {
    static ref NUMERIC: Regex = Regex::new(r"^\d").unwrap();
    static ref ALPHABETIC: Regex = Regex::new(r"^[a-zA-Z]").unwrap();
    static ref WORD: Regex = Regex::new(r"^\w+").unwrap();
}
15
/// Streaming lexer over a source string.
///
/// Wraps a peekable iterator of extended grapheme clusters and tracks the
/// current indentation depth so INDENT/DEDENT tokens can be emitted.
pub struct Lexer<'a> {
    // Grapheme stream over the source; `peek` gives one symbol of lookahead.
    input: Peekable<Graphemes<'a>>,
    // Current indentation depth, in levels (4 spaces per level — see `indent`).
    indent_level: u8,
}
20
21impl<'a> Lexer<'a> {
22
23 pub fn new(input: &'a str) -> Lexer<'a> {
24 Lexer { input: UnicodeSegmentation::graphemes(input, true).peekable(),
25 indent_level: 0,
26 }
27 }
28
29 fn advance(&mut self) -> &str {
30 match self.input.next() {
31 Some(c) => c,
32 None => panic!("Lexical error.")
33 }
34 }
35
36 fn whitespace (&mut self) {
37 while let Some(&c) = self.input.peek() {
38 if c != " " {
39 break;
40 } else {
41 self.advance();
42 }
43 }
44 }
45
46 fn indent (&mut self) -> Option<Vec<Token>> {
47 let spaces_for_indent = 4;
49
50 let mut spaces_count = 0;
51 while let Some(&c) = self.input.peek() {
52 if c != " " && c != "\n" {
53 if spaces_count % spaces_for_indent != 0 {
54 panic!("Indentation error.")
55 }
56 let indent_count = spaces_count / spaces_for_indent;
57 let mut indent_array: Vec<Token> = vec![Token::NEWLINE];
58 if indent_count == self.indent_level {
59 return Some(indent_array)
61 } else if indent_count > self.indent_level {
62 for _ in 0..(indent_count - self.indent_level) {
64 self.indent_level += 1;
65 indent_array.push(Token::INDENT);
66 }
67 return Some(indent_array)
68 } else {
69 for _ in 0..(self.indent_level - indent_count) {
71 self.indent_level -= 1;
72 indent_array.push(Token::DEDENT);
73 }
74 return Some(indent_array)
75 }
76 } else if c == "\n" {
77 spaces_count = 0;
78 self.advance();
79 } else {
80 spaces_count += 1;
81 self.advance();
82 }
83 }
84 let mut dedent_ending_array: Vec<Token> = vec![Token::NEWLINE];
87 for _ in 0..self.indent_level {
88 dedent_ending_array.push(Token::DEDENT);
89 }
90 Some(dedent_ending_array)
91 }
92
93 fn number(&mut self, number: &str) -> Option<Vec<Token>> {
94 let mut number = number.to_string();
95 while let Some(&c) = self.input.peek() {
96 if c == "." {
97 number.push_str(self.advance());
98 while let Some(&d) = self.input.peek() {
99 if !NUMERIC.is_match(d) {
100 break;
101 }
102 number.push_str(self.advance());
103 }
104 return Some(vec![Token::FLOAT(number)]);
105 }
106 if !NUMERIC.is_match(c) {
107 break;
108 }
109 number.push_str(self.advance());
110 }
111 Some(vec![Token::INT(number)])
112 }
113
114 fn id(&mut self, id: &str) -> Option<Vec<Token>> {
115 let mut id = id.to_string();
116 while let Some(&c) = self.input.peek() {
117 if !WORD.is_match(c) {
118 break;
119 }
120 id.push_str(self.advance());
121 }
122 match id.as_ref() {
124 "true" => Some(vec![Token::BOOL(true)]),
125 "false" => Some(vec![Token::BOOL(false)]),
126
127 "or" => Some(vec![Token::OR]),
128 "and" => Some(vec![Token::AND]),
129 "not" => Some(vec![Token::NOT]),
130
131 "if" => Some(vec![Token::IF]),
132 "else" => Some(vec![Token::ELSE]),
133 "while" => Some(vec![Token::WHILE]),
134
135 "def" => Some(vec![Token::DEF]),
136 "return" => Some(vec![Token::RETURN]),
137
138 _ => Some(vec![Token::ID(id)])
139 }
140 }
141
142 fn comment (&mut self) -> Option<Vec<Token>> {
143 while let Some(&c) = self.input.peek() {
144 if c == "\n" {
145 break;
146 } else {
147 self.advance();
148 }
149 }
150 self.next()
151 }
152}
153
154impl<'a> Iterator for Lexer<'a> {
155 type Item = Vec<Token>;
156
157 fn next(&mut self) -> Option<Self::Item> {
158
159 self.whitespace();
160
161 match self.input.next() {
162 Some(c) if NUMERIC.is_match(c) => self.number(c),
163 Some(c) if ALPHABETIC.is_match(c) => self.id(c),
164 Some("\n") => self.indent(),
165 Some("=") => {
166 if self.input.peek() == Some(&"=") {
167 self.advance();
168 Some(vec![Token::EQ])
169 } else {
170 Some(vec![Token::ASSIGN])
171 }
172
173 },
174 Some("!") => {
175 if self.input.peek() == Some(&"=") {
176 self.advance();
177 Some(vec![Token::NE])
178 } else {
179 panic!("Lexical error.") }
181 }
182 Some("<") => {
183 if self.input.peek() == Some(&"=") {
184 self.advance();
185 Some(vec![Token::LE])
186 } else {
187 Some(vec![Token::LT])
188 }
189 },
190 Some(">") => {
191 if self.input.peek() == Some(&"=") {
192 self.advance();
193 Some(vec![Token::GE])
194 } else {
195 Some(vec![Token::GT])
196 }
197 },
198 Some("+") => Some(vec![Token::PLUS]),
199 Some("-") => Some(vec![Token::MINUS]),
200 Some("*") => Some(vec![Token::MUL]),
201 Some("/") => Some(vec![Token::DIV]),
202 Some("(") => Some(vec![Token::LPAREN]),
203 Some(")") => Some(vec![Token::RPAREN]),
204 Some(":") => Some(vec![Token::COLON]),
205 Some(",") => Some(vec![Token::COMMA]),
206 Some("#") => self.comment(),
207
208 None => None,
210
211 _ => panic!("Lexical error.")
213 }
214 }
215}
216
217
#[cfg(test)]
mod tests {
    use super::*;

    use token::Token;

    /// Lexes `input` and flattens the per-step token batches into one stream.
    fn scan_generator(input: &str) -> Vec<Token> {
        Lexer::new(input).flatten().collect::<Vec<Token>>()
    }

    #[test]
    #[should_panic]
    fn invalid_input() {
        scan_generator("§");
    }

    #[test]
    fn whitespace() {
        let scan = scan_generator(" ");
        assert_eq!(scan, vec!());
    }

    #[test]
    fn lf() {
        let scan = scan_generator("\n");
        assert_eq!(scan, vec!(Token::NEWLINE));
    }

    // One level of indentation: `b` and `c` both sit at 4 spaces, so the
    // level opens before `b` and closes before `d`. The indent unit is
    // 4 spaces — shallower indents would panic with "Indentation error."
    #[test]
    fn indentation() {
        let scan = scan_generator("a\n    b\n    c\nd");
        assert_eq!(scan, vec![
            Token::ID(String::from("a")),
            Token::NEWLINE,
            Token::INDENT,
            Token::ID(String::from("b")),
            Token::NEWLINE,
            Token::ID(String::from("c")),
            Token::NEWLINE,
            Token::DEDENT,
            Token::ID(String::from("d")),
        ])
    }

    // Two nested levels: `c` sits at 8 spaces, one deeper than `b`, so both
    // levels close before `d` (two consecutive DEDENTs).
    #[test]
    fn indentation_multiple() {
        let scan = scan_generator("a\n    b\n        c\nd");
        assert_eq!(scan, vec![
            Token::ID(String::from("a")),
            Token::NEWLINE,
            Token::INDENT,
            Token::ID(String::from("b")),
            Token::NEWLINE,
            Token::INDENT,
            Token::ID(String::from("c")),
            Token::NEWLINE,
            Token::DEDENT,
            Token::DEDENT,
            Token::ID(String::from("d")),
        ])
    }

    #[test]
    fn comment() {
        let scan = scan_generator("# 2+2");
        assert_eq!(scan, vec!());
    }

    #[test]
    fn integer_number() {
        let scan = scan_generator("1");
        assert_eq!(scan, vec!(Token::INT(String::from("1"))));
    }

    #[test]
    fn float_number() {
        let scan = scan_generator("1.0");
        assert_eq!(scan, vec!(Token::FLOAT(String::from("1.0"))));
    }

    #[test]
    fn plus_operand() {
        let scan = scan_generator("+");
        assert_eq!(scan, vec!(Token::PLUS));
    }

    #[test]
    fn minus_operand() {
        let scan = scan_generator("-");
        assert_eq!(scan, vec!(Token::MINUS));
    }

    #[test]
    fn mul_operand() {
        let scan = scan_generator("*");
        assert_eq!(scan, vec!(Token::MUL));
    }

    #[test]
    fn div_operand() {
        let scan = scan_generator("/");
        assert_eq!(scan, vec!(Token::DIV));
    }

    #[test]
    fn parenthesis() {
        let scan = scan_generator("(1)");
        assert_eq!(scan, vec!(
            Token::LPAREN,
            Token::INT(String::from("1")),
            Token::RPAREN,
        ));
    }

    #[test]
    fn colon() {
        let scan = scan_generator(":");
        assert_eq!(scan, vec!(Token::COLON));
    }

    #[test]
    fn comma() {
        let scan = scan_generator(",");
        assert_eq!(scan, vec!(Token::COMMA));
    }

    #[test]
    fn assign() {
        let scan = scan_generator("=");
        assert_eq!(scan, vec!(Token::ASSIGN));
    }

    #[test]
    fn boolean_true() {
        let scan = scan_generator("true");
        assert_eq!(scan, vec!(Token::BOOL(true)));
    }

    #[test]
    fn boolean_false() {
        let scan = scan_generator("false");
        assert_eq!(scan, vec!(Token::BOOL(false)));
    }

    // Non-ASCII identifier: the first grapheme must be ASCII alphabetic,
    // but the `\w` continuation is Unicode-aware.
    #[test]
    fn id() {
        let scan = scan_generator("bjørn");
        assert_eq!(scan, vec!(Token::ID(String::from("bjørn"))));
    }

    #[test]
    fn comparison_eq() {
        let scan = scan_generator("==");
        assert_eq!(scan, vec!(Token::EQ));
    }

    #[test]
    fn comparison_ne() {
        let scan = scan_generator("!=");
        assert_eq!(scan, vec!(Token::NE));
    }

    #[test]
    fn comparison_le() {
        let scan = scan_generator("<=");
        assert_eq!(scan, vec!(Token::LE));
    }

    #[test]
    fn comparison_ge() {
        let scan = scan_generator(">=");
        assert_eq!(scan, vec!(Token::GE));
    }

    #[test]
    fn comparison_lt() {
        let scan = scan_generator("<");
        assert_eq!(scan, vec!(Token::LT));
    }

    #[test]
    fn comparison_gt() {
        let scan = scan_generator(">");
        assert_eq!(scan, vec!(Token::GT));
    }

    #[test]
    fn logical_or_operation() {
        let scan = scan_generator("or");
        assert_eq!(scan, vec!(Token::OR));
    }

    #[test]
    fn logical_and_operation() {
        let scan = scan_generator("and");
        assert_eq!(scan, vec!(Token::AND));
    }

    #[test]
    fn logical_not_operation() {
        let scan = scan_generator("not");
        assert_eq!(scan, vec!(Token::NOT));
    }

    #[test]
    fn if_keyword() {
        let scan = scan_generator("if");
        assert_eq!(scan, vec!(Token::IF));
    }

    #[test]
    fn else_keyword() {
        let scan = scan_generator("else");
        assert_eq!(scan, vec!(Token::ELSE));
    }

    #[test]
    fn while_keyword() {
        let scan = scan_generator("while");
        assert_eq!(scan, vec!(Token::WHILE));
    }

    #[test]
    fn def_keyword() {
        let scan = scan_generator("def");
        assert_eq!(scan, vec!(Token::DEF));
    }

    #[test]
    fn return_keyword() {
        let scan = scan_generator("return");
        assert_eq!(scan, vec!(Token::RETURN));
    }
}