bjorn/
lexer.rs

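//! Grapheme-based lexer for the bjorn language. The lexer walks the source
//! text one grapheme cluster at a time and yields small vectors of `Token`s,
//! translating line breaks and leading spaces into NEWLINE, INDENT and DEDENT
//! tokens (Python-style significant indentation).
//!
//! A minimal usage sketch (the `Token` variants come from the crate's `token`
//! module; flattening collapses the per-lexeme vectors into one stream):
//!
//! ```ignore
//! let tokens: Vec<Token> = Lexer::new("x = 1\n").flatten().collect();
//! ```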
use unicode_segmentation::UnicodeSegmentation;
use unicode_segmentation::Graphemes;
use regex::Regex;

use std::iter::Peekable;

use token::Token;


lazy_static! {
    static ref NUMERIC: Regex = Regex::new(r"^\d").unwrap();
    static ref ALPHABETIC: Regex = Regex::new(r"^[a-zA-Z]").unwrap();
    static ref WORD: Regex = Regex::new(r"^\w+").unwrap();
}

pub struct Lexer<'a> {
    input: Peekable<Graphemes<'a>>,
    indent_level: u8,
}

impl<'a> Lexer<'a> {

    pub fn new(input: &'a str) -> Lexer<'a> {
        Lexer {
            input: UnicodeSegmentation::graphemes(input, true).peekable(),
            indent_level: 0,
        }
    }

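    /// Consumes the next grapheme and returns it, panicking if the input is exhausted.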
    fn advance(&mut self) -> &str {
        match self.input.next() {
            Some(c) => c,
            None => panic!("Lexical error.")
        }
    }

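    /// Skips over consecutive spaces without producing any token.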
    fn whitespace(&mut self) {
        while let Some(&c) = self.input.peek() {
            if c != " " {
                break;
            } else {
                self.advance();
            }
        }
    }

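    /// Handles a line break: counts the leading spaces of the next non-blank line
    /// and emits a NEWLINE token followed by as many INDENT or DEDENT tokens as are
    /// needed to reach the new indentation level. At end of input, it dedents back
    /// to level zero.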
    fn indent(&mut self) -> Option<Vec<Token>> {
        // For now at least, indentation is forced to 4 spaces
        let spaces_for_indent = 4;

        let mut spaces_count = 0;
        while let Some(&c) = self.input.peek() {
            if c != " " && c != "\n" {
                if spaces_count % spaces_for_indent != 0 {
                    panic!("Indentation error.")
                }
                let indent_count = spaces_count / spaces_for_indent;
                let mut indent_array: Vec<Token> = vec![Token::NEWLINE];
                if indent_count == self.indent_level {
                    // Same level of indentation
                    return Some(indent_array)
                } else if indent_count > self.indent_level {
                    // At least one additional indentation level
                    for _ in 0..(indent_count - self.indent_level) {
                        self.indent_level += 1;
                        indent_array.push(Token::INDENT);
                    }
                    return Some(indent_array)
                } else {
                    // At least one fewer indentation level
                    for _ in 0..(self.indent_level - indent_count) {
                        self.indent_level -= 1;
                        indent_array.push(Token::DEDENT);
                    }
                    return Some(indent_array)
                }
            } else if c == "\n" {
                spaces_count = 0;
                self.advance();
            } else {
                spaces_count += 1;
                self.advance();
            }
        }
        // At the end of the program,
        // dedent everything if the indent level is not zero.
        let mut dedent_ending_array: Vec<Token> = vec![Token::NEWLINE];
        for _ in 0..self.indent_level {
            dedent_ending_array.push(Token::DEDENT);
        }
        Some(dedent_ending_array)
    }

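    /// Scans an integer or float literal, continuing from the digit already consumed.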
    fn number(&mut self, number: &str) -> Option<Vec<Token>> {
        let mut number = number.to_string();
        while let Some(&c) = self.input.peek() {
            if c == "." {
                number.push_str(self.advance());
                while let Some(&d) = self.input.peek() {
                    if !NUMERIC.is_match(d) {
                        break;
                    }
                    number.push_str(self.advance());
                }
                return Some(vec![Token::FLOAT(number)]);
            }
            if !NUMERIC.is_match(c) {
                break;
            }
            number.push_str(self.advance());
        }
        Some(vec![Token::INT(number)])
    }

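    /// Scans an identifier, mapping reserved keywords to their dedicated tokens.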
    fn id(&mut self, id: &str) -> Option<Vec<Token>> {
        let mut id = id.to_string();
        while let Some(&c) = self.input.peek() {
            if !WORD.is_match(c) {
                break;
            }
            id.push_str(self.advance());
        }
        // Reserved keywords
        match id.as_ref() {
            "true" => Some(vec![Token::BOOL(true)]),
            "false" => Some(vec![Token::BOOL(false)]),

            "or" => Some(vec![Token::OR]),
            "and" => Some(vec![Token::AND]),
            "not" => Some(vec![Token::NOT]),

            "if" => Some(vec![Token::IF]),
            "else" => Some(vec![Token::ELSE]),
            "while" => Some(vec![Token::WHILE]),

            "def" => Some(vec![Token::DEF]),
            "return" => Some(vec![Token::RETURN]),

            _ => Some(vec![Token::ID(id)])
        }
    }

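    /// Skips the rest of the line after a `#` and resumes lexing with the next token.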
    fn comment(&mut self) -> Option<Vec<Token>> {
        while let Some(&c) = self.input.peek() {
            if c == "\n" {
                break;
            } else {
                self.advance();
            }
        }
        self.next()
    }
}

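// The iterator yields `Vec<Token>` rather than a single `Token` because one lexeme
// can expand into several tokens: a line break becomes a NEWLINE followed by any
// number of INDENT or DEDENT tokens.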
impl<'a> Iterator for Lexer<'a> {
    type Item = Vec<Token>;

    fn next(&mut self) -> Option<Self::Item> {

        self.whitespace();

        match self.input.next() {
            Some(c) if NUMERIC.is_match(c) => self.number(c),
            Some(c) if ALPHABETIC.is_match(c) => self.id(c),
            Some("\n") => self.indent(),
            Some("=") => {
                if self.input.peek() == Some(&"=") {
                    self.advance();
                    Some(vec![Token::EQ])
                } else {
                    Some(vec![Token::ASSIGN])
                }
            },
            Some("!") => {
                if self.input.peek() == Some(&"=") {
                    self.advance();
                    Some(vec![Token::NE])
                } else {
                    panic!("Lexical error.") // A bare `!` is not a supported lexeme
                }
            },
            Some("<") => {
                if self.input.peek() == Some(&"=") {
                    self.advance();
                    Some(vec![Token::LE])
                } else {
                    Some(vec![Token::LT])
                }
            },
            Some(">") => {
                if self.input.peek() == Some(&"=") {
                    self.advance();
                    Some(vec![Token::GE])
                } else {
                    Some(vec![Token::GT])
                }
            },
            Some("+") => Some(vec![Token::PLUS]),
            Some("-") => Some(vec![Token::MINUS]),
            Some("*") => Some(vec![Token::MUL]),
            Some("/") => Some(vec![Token::DIV]),
            Some("(") => Some(vec![Token::LPAREN]),
            Some(")") => Some(vec![Token::RPAREN]),
            Some(":") => Some(vec![Token::COLON]),
            Some(",") => Some(vec![Token::COMMA]),
            Some("#") => self.comment(),

            // End of file
            None => None,

            // Unsupported lexeme
            _ => panic!("Lexical error.")
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    use token::Token;

    fn scan_generator(input: &str) -> Vec<Token> {
        let lexer = Lexer::new(input);
        let mut scan = Vec::new();
        for t in lexer {
            scan.push(t);
        }
        scan.into_iter().flatten().collect::<Vec<Token>>()
    }

    #[test]
    #[should_panic]
    fn invalid_input() {
        // Must change if `§` is valid one day
        scan_generator("§");
    }

    #[test]
    fn whitespace() {
        let scan = scan_generator(" ");
        assert_eq!(scan, vec!());
    }

    #[test]
    fn lf() {
        let scan = scan_generator("\n");
        assert_eq!(scan, vec!(Token::NEWLINE));
    }

    #[test]
    fn indentation() {
        let scan = scan_generator("a\n    b\n    c\nd");
        assert_eq!(scan, vec![
            Token::ID(String::from("a")),
            Token::NEWLINE,
            Token::INDENT,
            Token::ID(String::from("b")),
            Token::NEWLINE,
            Token::ID(String::from("c")),
            Token::NEWLINE,
            Token::DEDENT,
            Token::ID(String::from("d")),
        ])
    }

    #[test]
    fn indentation_multiple() {
        let scan = scan_generator("a\n    b\n        c\nd");
        assert_eq!(scan, vec![
            Token::ID(String::from("a")),
            Token::NEWLINE,
            Token::INDENT,
            Token::ID(String::from("b")),
            Token::NEWLINE,
            Token::INDENT,
            Token::ID(String::from("c")),
            Token::NEWLINE,
            Token::DEDENT,
            Token::DEDENT,
            Token::ID(String::from("d")),
        ])
    }

    #[test]
    fn comment() {
        let scan = scan_generator("# 2+2");
        assert_eq!(scan, vec!());
    }

    #[test]
    fn integer_number() {
        let scan = scan_generator("1");
        assert_eq!(scan, vec!(Token::INT(String::from("1"))));
    }

    #[test]
    fn float_number() {
        let scan = scan_generator("1.0");
        assert_eq!(scan, vec!(Token::FLOAT(String::from("1.0"))));
    }

    #[test]
    fn plus_operand() {
        let scan = scan_generator("+");
        assert_eq!(scan, vec!(Token::PLUS));
    }

    #[test]
    fn minus_operand() {
        let scan = scan_generator("-");
        assert_eq!(scan, vec!(Token::MINUS));
    }

    #[test]
    fn mul_operand() {
        let scan = scan_generator("*");
        assert_eq!(scan, vec!(Token::MUL));
    }

    #[test]
    fn div_operand() {
        let scan = scan_generator("/");
        assert_eq!(scan, vec!(Token::DIV));
    }

    #[test]
    fn parenthesis() {
        let scan = scan_generator("(1)");
        assert_eq!(scan, vec!(
            Token::LPAREN,
            Token::INT(String::from("1")),
            Token::RPAREN,
        ));
    }

    #[test]
    fn colon() {
        let scan = scan_generator(":");
        assert_eq!(scan, vec!(Token::COLON));
    }

    #[test]
    fn comma() {
        let scan = scan_generator(",");
        assert_eq!(scan, vec!(Token::COMMA));
    }

    #[test]
    fn assign() {
        let scan = scan_generator("=");
        assert_eq!(scan, vec!(Token::ASSIGN));
    }

    #[test]
    fn boolean_true() {
        let scan = scan_generator("true");
        assert_eq!(scan, vec!(Token::BOOL(true)));
    }

    #[test]
    fn boolean_false() {
        let scan = scan_generator("false");
        assert_eq!(scan, vec!(Token::BOOL(false)));
    }

    #[test]
    fn id() {
        let scan = scan_generator("bjørn");
        assert_eq!(scan, vec!(Token::ID(String::from("bjørn"))));
    }

    #[test]
    fn comparison_eq() {
        let scan = scan_generator("==");
        assert_eq!(scan, vec!(Token::EQ));
    }

    #[test]
    fn comparison_ne() {
        let scan = scan_generator("!=");
        assert_eq!(scan, vec!(Token::NE));
    }

    #[test]
    fn comparison_le() {
        let scan = scan_generator("<=");
        assert_eq!(scan, vec!(Token::LE));
    }

    #[test]
    fn comparison_ge() {
        let scan = scan_generator(">=");
        assert_eq!(scan, vec!(Token::GE));
    }

    #[test]
    fn comparison_lt() {
        let scan = scan_generator("<");
        assert_eq!(scan, vec!(Token::LT));
    }

    #[test]
    fn comparison_gt() {
        let scan = scan_generator(">");
        assert_eq!(scan, vec!(Token::GT));
    }

    #[test]
    fn logical_or_operation() {
        let scan = scan_generator("or");
        assert_eq!(scan, vec!(Token::OR));
    }

    #[test]
    fn logical_and_operation() {
        let scan = scan_generator("and");
        assert_eq!(scan, vec!(Token::AND));
    }

    #[test]
    fn logical_not_operation() {
        let scan = scan_generator("not");
        assert_eq!(scan, vec!(Token::NOT));
    }

    #[test]
    fn if_keyword() {
        let scan = scan_generator("if");
        assert_eq!(scan, vec!(Token::IF));
    }

    #[test]
    fn else_keyword() {
        let scan = scan_generator("else");
        assert_eq!(scan, vec!(Token::ELSE));
    }

    #[test]
    fn while_keyword() {
        let scan = scan_generator("while");
        assert_eq!(scan, vec!(Token::WHILE));
    }

    #[test]
    fn def_keyword() {
        let scan = scan_generator("def");
        assert_eq!(scan, vec!(Token::DEF));
    }

    #[test]
    fn return_keyword() {
        let scan = scan_generator("return");
        assert_eq!(scan, vec!(Token::RETURN));
    }
}