1use parse::Token;
2use std::iter::Peekable;
3
/// Every symbol the tokenizer recognizes. Multi-character symbols are
/// listed before their one-character prefixes so greedy matching can
/// prefer the longest form ("::" over ":", "&&" over "&", ...).
///
/// `'static` is elided: references in `const` items are `'static` by default.
const SYMBOLS: &[&str] = &[
    "::", "&&", "||", "=>", "->",
    "{", "}", "(", ")", "[", "]", "<", ">",
    ".", ",", ";", "&", "|", "@", "=",
    ":", "!", "?", "%", "/", "\\", "*", "+", "-",
];

/// A comment runs from this character to the end of the line.
const COMMENT_CHAR: char = '#';
13
/// Streaming tokenizer over a character iterator.
///
/// The `Iterator` implementation (elsewhere in this file) drains this
/// struct's `chars` and yields one token at a time.
pub struct Tokenizer<I>
    where I: Iterator<Item = char>
{
    /// Character source, wrapped for one-character look-ahead.
    chars: Peekable<I>,
    /// Set once the final trailing `EndOfLine` has been emitted.
    sent_last_new_line: bool,
}
21
22impl<I> Tokenizer<I> where I: Iterator<Item=char>
23{
24 pub fn new(characters: I) -> Self {
26 Tokenizer { chars: characters.peekable(), sent_last_new_line: false }
27 }
28
29 fn read_token(&mut self) -> Option<Token> {
30 self.eat_whitespace();
31 self.eat_comment();
32 self.eat_whitespace();
33
34 let peeked_char = if let Some(&c) = self.chars.peek() { c } else { return None };
35
36 if peeked_char.is_alphabetic() {
37 Some(self.read_word())
38 } else if peeked_char.is_numeric() {
39 Some(self.read_number())
40 } else if peeked_char == '\n' {
41 self.chars.next(); Some(Token::EndOfLine)
43 } else if peeked_char == '"' || peeked_char == '\'' {
44 Some(self.read_string())
45 } else if SYMBOLS.iter().any(|sym| sym.starts_with(peeked_char)) {
46 let first_char = self.chars.next().unwrap();
47
48 let matches: Vec<_> = SYMBOLS.iter().filter(|sym| sym.starts_with(first_char)).collect();
49
50 if matches.iter().any(|sym| sym.len() > 1) {
51 if let Some(&peeked_second_char) = self.chars.peek() {
52 let symbol = format!("{}{}", first_char, peeked_second_char);
53
54 if let Some(exact_match) = SYMBOLS.iter().find(|&&sym| sym == symbol) {
55 self.chars.next(); Some(Token::Symbol(exact_match))
57 } else {
58 let exact_match = SYMBOLS.iter().find(|&&sym| sym == format!("{}", first_char)).unwrap();
60 Some(Token::Symbol(exact_match))
61 }
62 } else {
63 Some(Token::Symbol(matches[0]))
65 }
66 } else { debug_assert_eq!(matches.len(), 1, "matched with multiple symbols");
68 Some(Token::Symbol(matches[0]))
69 }
70 } else {
71 println!("failed: {}", peeked_char);
72 panic!("unexpected character: '{:?}'", peeked_char);
73 }
74 }
75
76 fn eat_whitespace(&mut self) {
77 while let Some(&c) = self.chars.peek() {
78 if c != '\n' && c.is_whitespace() {
79 self.chars.next(); } else {
81 break;
82 }
83 }
84 }
85
86 fn eat_comment(&mut self) {
87 if self.chars.peek() == Some(&COMMENT_CHAR) {
88 while self.chars.peek() != Some(&'\n') {
89 self.chars.next(); }
91 }
92 }
93
94 fn read_word(&mut self) -> Token {
95 let mut chars = Vec::new();
96
97 while let Some(&c) = self.chars.peek() {
98 if c.is_alphanumeric() || c == '_' || c == '!' || c == '?' {
99 self.chars.next(); chars.push(c)
101 } else {
102 break;
103 }
104 }
105
106 Token::Word(chars.into_iter().collect())
107 }
108
109 fn read_number(&mut self) -> Token {
110 let mut chars = Vec::new();
111
112 while let Some(&c) = self.chars.peek() {
113 if c.is_numeric() {
114 self.chars.next(); chars.push(c)
116 } else {
117 break;
118 }
119 }
120
121 let number_text: String = chars.into_iter().collect();
122 let number = number_text.parse().unwrap();
123
124 Token::Integer(number)
125 }
126
127 fn read_string(&mut self) -> Token {
128 self.chars.next(); let mut chars = Vec::new();
131
132 while let Some(&c) = self.chars.peek() {
133 if c != '"' && c != '\'' {
134 self.chars.next(); chars.push(c)
136 } else {
137 self.chars.next(); break;
139 }
140 }
141
142 Token::String(chars.into_iter().collect())
143 }
144}
145
146impl<I: Iterator<Item=char>> Iterator for Tokenizer<I>
147{
148 type Item = Token;
149
150 fn next(&mut self) -> Option<Token> {
151 if let Some(token) = self.read_token() {
152 println!("token: {:?}", token);
153 Some(token)
154 } else {
155 if self.sent_last_new_line {
156 None
157 } else {
158 self.sent_last_new_line = true;
159 Some(Token::EndOfLine)
160 }
161 }
162 }
163}
164
#[cfg(test)]
mod test
{
    // `use super::*` already brings the parent module's imports (including
    // `parse::Token`) into scope, so the old explicit `use parse::Token;`
    // was redundant and has been dropped.
    use super::*;

    /// Runs the tokenizer over `s` and collects every token it produces.
    fn tokenize(s: &str) -> Vec<Token> {
        let t = Tokenizer::new(s.chars());
        t.collect()
    }

    #[test]
    fn can_read_simple_word() {
        assert_eq!(tokenize("abcdef"), vec![Token::Word("abcdef".to_owned()),
                                            Token::EndOfLine]);
    }

    #[test]
    fn can_handle_whitespace_at_start_of_word() {
        assert_eq!(tokenize("  abcdef"), vec![Token::Word("abcdef".to_owned()),
                                              Token::EndOfLine]);
    }

    #[test]
    fn can_read_multiple_words() {
        assert_eq!(tokenize("\tabcdef hg"), vec![Token::Word("abcdef".to_owned()),
                                                 Token::Word("hg".to_owned()),
                                                 Token::EndOfLine]);
    }

    #[test]
    fn considers_underscores_a_part_of_words() {
        assert_eq!(tokenize("\tabcdef_hg"), vec![Token::Word("abcdef_hg".to_owned()), Token::EndOfLine]);
    }

    #[test]
    fn can_read_single_dot() {
        assert_eq!(tokenize("."), vec![Token::Symbol("."), Token::EndOfLine]);
    }

    #[test]
    fn can_read_multiple_dots() {
        assert_eq!(tokenize("..."), vec![Token::Symbol("."),
                                         Token::Symbol("."),
                                         Token::Symbol("."),
                                         Token::EndOfLine]);
    }

    #[test]
    fn can_read_new_line() {
        assert_eq!(tokenize(" \nb"), vec![Token::EndOfLine, Token::Word("b".to_owned()), Token::EndOfLine]);
    }

    #[test]
    fn can_read_string() {
        assert_eq!(tokenize("\"hello\""), vec![Token::String("hello".to_owned()),
                                               Token::EndOfLine]);
    }

    #[test]
    fn can_read_double_colon() {
        assert_eq!(tokenize("Abc::Def"), vec![Token::Word("Abc".to_owned()),
                                              Token::Symbol("::"),
                                              Token::Word("Def".to_owned()),
                                              Token::EndOfLine]);
    }

    #[test]
    fn can_read_positive_integer() {
        assert_eq!(tokenize("123 45"), vec![Token::Integer(123),
                                            Token::Integer(45),
                                            Token::EndOfLine]);
    }
}