// rubic/parse/tokenize.rs

1use parse::Token;
2use std::iter::Peekable;
3
/// Every symbol the tokenizer recognizes.
///
/// Multi-character symbols are listed before their single-character
/// prefixes; every multi-character symbol's first character also appears
/// on its own, which the symbol-reading fallback relies on.
/// (`&'static` is redundant in const position and has been elided.)
const SYMBOLS: &[&str] = &[
    "::", "&&", "||", "=>", "->",
    "{", "}", "(", ")", "[", "]", "<", ">",
    ".", ",", ";", "&", "|", "@", "=",
    ":", "!", "?", "%", "/", "\\", "*", "+", "-",
];

/// The character that starts a line comment.
const COMMENT_CHAR: char = '#';
13
/// Splits a character stream into tokens.
///
/// The underlying iterator is wrapped in a `Peekable` so the tokenizer can
/// look one character ahead without consuming it.
pub struct Tokenizer<I>
    where I: Iterator<Item=char>
{
    chars: Peekable<I>,
    /// Whether the artificial trailing new line has been emitted; we
    /// generate a fake new line at the end of every program.
    sent_last_new_line: bool,
}
21
22impl<I> Tokenizer<I> where I: Iterator<Item=char>
23{
24    /// Creates a new tokenizer.
25    pub fn new(characters: I) -> Self {
26        Tokenizer { chars: characters.peekable(), sent_last_new_line: false }
27    }
28
29    fn read_token(&mut self) -> Option<Token> {
30        self.eat_whitespace();
31        self.eat_comment();
32        self.eat_whitespace();
33
34        let peeked_char = if let Some(&c) = self.chars.peek() { c } else { return None };
35
36        if peeked_char.is_alphabetic() {
37            Some(self.read_word())
38        } else if peeked_char.is_numeric() {
39            Some(self.read_number())
40        } else if peeked_char == '\n' {
41            self.chars.next(); // Eat new line
42            Some(Token::EndOfLine)
43        } else if peeked_char == '"' || peeked_char == '\'' {
44            Some(self.read_string())
45        } else if SYMBOLS.iter().any(|sym| sym.starts_with(peeked_char)) {
46            let first_char = self.chars.next().unwrap();
47
48            let matches: Vec<_> = SYMBOLS.iter().filter(|sym| sym.starts_with(first_char)).collect();
49
50            if matches.iter().any(|sym| sym.len() > 1) {
51                if let Some(&peeked_second_char) = self.chars.peek() {
52                    let symbol = format!("{}{}", first_char, peeked_second_char);
53
54                    if let Some(exact_match) = SYMBOLS.iter().find(|&&sym| sym == symbol) {
55                        self.chars.next(); // Eat the second symbol.
56                        Some(Token::Symbol(exact_match))
57                    } else {
58                        // Fall back to using only the first char.
59                        let exact_match = SYMBOLS.iter().find(|&&sym| sym == format!("{}", first_char)).unwrap();
60                        Some(Token::Symbol(exact_match))
61                    }
62                } else {
63                    // We should just use the first char
64                    Some(Token::Symbol(matches[0]))
65                }
66            } else { // We matched with a single-char symbol.
67                debug_assert_eq!(matches.len(), 1, "matched with multiple symbols");
68                Some(Token::Symbol(matches[0]))
69            }
70        } else {
71            println!("failed: {}", peeked_char);
72            panic!("unexpected character: '{:?}'", peeked_char);
73        }
74    }
75
76    fn eat_whitespace(&mut self) {
77        while let Some(&c) = self.chars.peek() {
78            if c != '\n' && c.is_whitespace() {
79                self.chars.next(); // Eat the character.
80            } else {
81                break;
82            }
83        }
84    }
85
86    fn eat_comment(&mut self) {
87        if self.chars.peek() == Some(&COMMENT_CHAR) {
88            while self.chars.peek() != Some(&'\n') {
89                self.chars.next(); // Eat the character.
90            }
91        }
92    }
93
94    fn read_word(&mut self) -> Token {
95        let mut chars = Vec::new();
96
97        while let Some(&c) = self.chars.peek() {
98            if c.is_alphanumeric() || c == '_' || c == '!' || c == '?' {
99                self.chars.next(); // Eat the char
100                chars.push(c)
101            } else {
102                break;
103            }
104        }
105
106        Token::Word(chars.into_iter().collect())
107    }
108
109    fn read_number(&mut self) -> Token {
110        let mut chars = Vec::new();
111
112        while let Some(&c) = self.chars.peek() {
113            if c.is_numeric() {
114                self.chars.next(); // Eat the char
115                chars.push(c)
116            } else {
117                break;
118            }
119        }
120
121        let number_text: String = chars.into_iter().collect();
122        let number = number_text.parse().unwrap();
123
124        Token::Integer(number)
125    }
126
127    fn read_string(&mut self) -> Token {
128        self.chars.next(); // Eat the quote.
129
130        let mut chars = Vec::new();
131
132        while let Some(&c) = self.chars.peek() {
133            if c != '"' && c != '\'' {
134                self.chars.next(); // Eat the char
135                chars.push(c)
136            } else {
137                self.chars.next(); // Eat the quote.
138                break;
139            }
140        }
141
142        Token::String(chars.into_iter().collect())
143    }
144}
145
146impl<I: Iterator<Item=char>> Iterator for Tokenizer<I>
147{
148    type Item = Token;
149
150    fn next(&mut self) -> Option<Token> {
151        if let Some(token) = self.read_token() {
152            println!("token: {:?}", token);
153            Some(token)
154        } else {
155            if self.sent_last_new_line {
156                None
157            } else {
158                self.sent_last_new_line = true;
159                Some(Token::EndOfLine)
160            }
161        }
162    }
163}
164
#[cfg(test)]
mod test
{
    use super::*;
    use parse::Token;

    /// Runs the tokenizer over `s` and collects every token.
    fn tokenize(s: &str) -> Vec<Token> {
        let t = Tokenizer::new(s.chars());
        t.collect()
    }

    #[test]
    fn can_read_simple_word() {
        assert_eq!(tokenize("abcdef"), vec![Token::Word("abcdef".to_owned()),
                                            Token::EndOfLine]);
    }

    #[test]
    fn can_handle_whitespace_at_start_of_word() {
        assert_eq!(tokenize("     abcdef"), vec![Token::Word("abcdef".to_owned()),
                                                 Token::EndOfLine]);
    }

    #[test]
    fn can_read_multiple_words() {
        assert_eq!(tokenize("\tabcdef hg"), vec![Token::Word("abcdef".to_owned()),
                                                 Token::Word("hg".to_owned()),
                                                 Token::EndOfLine]);
    }

    #[test]
    fn considers_underscores_a_part_of_words() {
        assert_eq!(tokenize("\tabcdef_hg"), vec![Token::Word("abcdef_hg".to_owned()), Token::EndOfLine]);
    }

    #[test]
    fn can_read_single_dot() {
        assert_eq!(tokenize("."), vec![Token::Symbol("."), Token::EndOfLine]);
    }

    #[test]
    fn can_read_multiple_dots() {
        assert_eq!(tokenize("..."), vec![Token::Symbol("."),
                                         Token::Symbol("."),
                                         Token::Symbol("."),
                                         Token::EndOfLine]);
    }

    #[test]
    fn can_read_new_line() {
        assert_eq!(tokenize(" \nb"), vec![Token::EndOfLine, Token::Word("b".to_owned()), Token::EndOfLine]);
    }

    #[test]
    fn can_read_string() {
        assert_eq!(tokenize("\"hello\""), vec![Token::String("hello".to_owned()),
                                               Token::EndOfLine]);
    }

    #[test]
    fn can_read_double_colon() {
        assert_eq!(tokenize("Abc::Def"), vec![Token::Word("Abc".to_owned()),
                                              Token::Symbol("::"),
                                              Token::Word("Def".to_owned()),
                                              Token::EndOfLine]);
    }

    #[test]
    fn can_read_positive_integer() {
        assert_eq!(tokenize("123 45"), vec![Token::Integer(123),
                                            Token::Integer(45),
                                            Token::EndOfLine]);
    }

    // New coverage: a newline-terminated comment is skipped entirely; the
    // terminating new line still becomes its own token.
    #[test]
    fn skips_comment_before_word() {
        assert_eq!(tokenize("# a comment\nfoo"), vec![Token::EndOfLine,
                                                      Token::Word("foo".to_owned()),
                                                      Token::EndOfLine]);
    }
}