1#![warn(rust_2018_idioms, missing_debug_implementations)]
2
3use crate::token::{keyword_token, Token};
4use Token::*;
5
6pub mod token;
7
#[derive(Debug, Clone, PartialEq)]
pub struct Tokenizer {
    // Source text as individual characters.
    input: Vec<char>,
    // Index of the character currently held in `ch` (once reading has started).
    position: usize,
    // Index of the next character `read_char` will consume.
    read_position: usize,
    // Character under examination; `' '` doubles as the start/EOF sentinel
    // (see `read_char`), so whitespace and end-of-input share a code path.
    ch: char,
}
15
16impl Iterator for Tokenizer {
17 type Item = Token;
18
19 fn next(&mut self) -> Option<Token> {
20 self.skip_whitespace();
21 let token = self.get_token();
22 self.read_char();
23
24 match token {
25 EndOfFile => None,
26 rest => Some(rest),
27 }
28 }
29}
30
31#[allow(dead_code)]
32impl Tokenizer {
33 pub fn new(input: Vec<char>) -> Self {
34 Self {
35 input,
36 position: 0,
37 read_position: 0,
38 ch: ' ',
39 }
40 }
41
42 pub fn read_char(&mut self) {
43 if self.read_position >= self.input.len() {
44 self.ch = ' ';
45 } else {
46 self.ch = self.input[self.read_position];
47 }
48
49 self.position = self.read_position;
50 self.read_position += 1;
51 }
52
53 fn get_token(&mut self) -> Token {
54 match self.ch {
55 '[' => LeftBracket,
56 ']' => RightBracket,
57 '(' => LeftParenthesis,
58 ')' => RightParenthesis,
59 '{' => LeftBrace,
60 '}' => RightBrace,
61 '.' => {
62 let is_spread =
63 self.look_ahead() == Some('.') && self.look_ahead_by(2) == Some('.');
64 if is_spread {
65 self.skip_chars_by(2);
66 Spread
67 } else {
68 Dot
69 }
70 }
71 ';' => Semicolon,
72 ':' => Colon,
73 ',' => Comma,
74 '<' => match self.look_ahead() {
75 Some('=') => {
76 self.skip_next_char();
77 LessEquals
78 }
79 Some('<') => match self.look_ahead_by(2usize) {
80 Some('=') => {
81 self.skip_chars_by(2usize);
82 LeftShiftAssign
83 }
84 _ => {
85 self.skip_next_char();
86 LeftShift
87 }
88 },
89 _ => LessThan,
90 },
91 '>' => match self.look_ahead() {
92 Some('=') => {
93 self.skip_next_char();
94 MoreEquals
95 }
96 Some('>') => match self.look_ahead_by(2usize) {
97 Some('=') => {
98 self.skip_chars_by(2usize);
99 RightShiftAssign
100 }
101 Some('>') => match self.look_ahead_by(3usize) {
102 Some('=') => {
103 self.skip_chars_by(3usize);
104 UnsignedRightShiftAssign
105 }
106 _ => {
107 self.skip_chars_by(2usize);
108 UnsignedRightShift
109 }
110 },
111 _ => {
112 self.skip_next_char();
113 RightShift
114 }
115 },
116 _ => MoreThan,
117 },
118 '+' => match self.look_ahead() {
119 Some('=') => {
120 self.skip_next_char();
121 AdditionAssign
122 }
123 _ => Addition,
124 },
125 '-' => match self.look_ahead() {
126 Some('=') => {
127 self.skip_next_char();
128 SubtractionAssign
129 }
130 _ => Subtraction,
131 },
132 '*' => match self.look_ahead() {
133 Some('=') => {
134 self.skip_next_char();
135 MultiplicationAssign
136 }
137 _ => Multiplication,
138 },
139 '/' => match self.look_ahead() {
140 Some('=') => {
141 self.skip_next_char();
142 DivisionAssign
143 }
144 _ => Division,
145 },
146 '%' => match self.look_ahead() {
147 Some('=') => {
148 self.skip_next_char();
149 ModulusAssign
150 }
151 _ => Modulus,
152 },
153 '&' => match self.look_ahead() {
154 Some('&') => {
155 self.skip_next_char();
156 LogicalAnd
157 }
158 _ => BitwiseAnd,
159 },
160 '|' => match self.look_ahead() {
161 Some('|') => {
162 self.skip_next_char();
163 LogicalOr
164 }
165 _ => BitwiseOr,
166 },
167 '^' => match self.look_ahead() {
168 Some('=') => {
169 self.skip_next_char();
170 BitwiseXORAssign
171 }
172 _ => BitwiseXOR,
173 },
174 '!' => match self.look_ahead() {
175 Some('=') => {
176 self.skip_next_char();
177 NotEquals
178 }
179 _ => LogicalNot,
180 },
181 '~' => match self.look_ahead() {
182 Some('=') => {
183 self.skip_next_char();
184 BitwiseNotAssign
185 }
186 _ => BitwiseNot,
187 },
188 '=' => match self.look_ahead() {
189 Some('=') => match self.look_ahead_by(2usize) {
190 Some('=') => {
191 self.skip_chars_by(2usize);
192 StrictEquals
193 }
194 _ => {
195 self.skip_next_char();
196 Equals
197 }
198 },
199 Some('>') => {
200 self.skip_next_char();
201 Arrow
202 }
203 _ => Assign,
204 },
205 _ => {
206 if self.is_letter() {
207 let id = self.read_identifier();
208
209 match keyword_token(&id) {
210 Ok(token) => token,
211 Err(_) => Identifier(id),
212 }
213 } else if self.is_number() {
214 let id = self.read_number();
215
216 Identifier(id)
217 } else {
218 EndOfFile
219 }
220 }
221 }
222 }
223
224 fn look_ahead(&mut self) -> Option<char> {
226 self.look_ahead_by(1)
227 }
228
229 fn look_ahead_by(&mut self, x: usize) -> Option<char> {
231 let next_position = self.position + x;
232
233 if next_position > self.input.len() || self.input.get(next_position).is_none() {
234 None
235 } else {
236 self.input.get(next_position).map(|c| *c)
237 }
238 }
239
240 fn skip_next_char(&mut self) {
241 self.skip_chars_by(1);
242 }
243
244 fn skip_chars_by(&mut self, x: usize) {
245 self.position += x;
246 self.read_position += x;
247 }
248
249 fn read_identifier(&mut self) -> Vec<char> {
250 let pos = self.position;
251
252 while !self.is_eof() && self.is_letter() {
253 self.read_char();
254 }
255 self.back();
256
257 self.input[pos..self.position].to_vec()
258 }
259
260 fn read_number(&mut self) -> Vec<char> {
261 let pos = self.position;
262
263 while !self.is_eof() && self.is_number() {
264 self.read_char();
265 }
266 self.back();
267
268 self.input[pos..self.position].to_vec()
269 }
270
271 fn is_eof(&self) -> bool {
272 self.read_position > self.input.len()
273 }
274
275 fn is_letter(&self) -> bool {
276 ('a' <= self.ch && 'z' >= self.ch) || ('A' <= self.ch && 'Z' >= self.ch) || ('_' == self.ch)
277 }
278
279 fn is_number(&self) -> bool {
280 '0' <= self.ch && '9' > self.ch
281 }
282
283 fn skip_whitespace(&mut self) {
284 match self.ch {
285 ' ' | '\t' | '\n' | '\r' => self.read_char(),
286 _ => {}
287 }
288 }
289
290 fn back(&mut self) {
291 self.read_position -= 1;
292 self.ch = self.input[self.read_position - 1];
293 }
294}
295
#[cfg(test)]
mod tests {
    use super::*;

    /// Build the `Vec<char>` form the tokenizer consumes.
    fn chars_of(source: &str) -> Vec<char> {
        source.chars().collect()
    }

    /// Shorthand for an `Identifier` token over a literal name.
    fn ident(name: &str) -> Token {
        Identifier(name.chars().collect())
    }

    /// Tokenize `source` and compare the full token stream with `expected`.
    fn assert_tokens(source: &str, expected: Vec<Token>) {
        let produced: Vec<Token> = Tokenizer::new(chars_of(source)).collect();
        assert_eq!(produced, expected);
    }

    #[test]
    fn sum() {
        assert_tokens("1 + 1", vec![ident("1"), Addition, ident("1")]);
    }

    #[test]
    fn einstein() {
        assert_tokens(
            "e = m * c * c",
            vec![
                ident("e"),
                Assign,
                ident("m"),
                Multiplication,
                ident("c"),
                Multiplication,
                ident("c"),
            ],
        );
    }

    #[test]
    fn function() {
        assert_tokens(
            "function sum(a, b) { return a + b }",
            vec![
                Function,
                ident("sum"),
                LeftParenthesis,
                ident("a"),
                Comma,
                ident("b"),
                RightParenthesis,
                LeftBrace,
                Return,
                ident("a"),
                Addition,
                ident("b"),
                RightBrace,
            ],
        );
    }

    #[test]
    fn symbol_after_keyword() {
        assert_tokens(
            "await (this.wait(200))",
            vec![
                Await,
                LeftParenthesis,
                This,
                Dot,
                ident("wait"),
                LeftParenthesis,
                ident("200"),
                RightParenthesis,
                RightParenthesis,
            ],
        );
    }

    #[test]
    fn different_symbols() {
        assert_tokens(
            "a = (b != c) == d",
            vec![
                ident("a"),
                Assign,
                LeftParenthesis,
                ident("b"),
                NotEquals,
                ident("c"),
                RightParenthesis,
                Equals,
                ident("d"),
            ],
        );
    }

    #[test]
    fn longer_symbols() {
        assert_tokens(
            "b >>>= c",
            vec![ident("b"), UnsignedRightShiftAssign, ident("c")],
        );
    }

    #[test]
    fn keywords() {
        assert_tokens(
            "if (a) { return b } else { return c }",
            vec![
                If,
                LeftParenthesis,
                ident("a"),
                RightParenthesis,
                LeftBrace,
                Return,
                ident("b"),
                RightBrace,
                Else,
                LeftBrace,
                Return,
                ident("c"),
                RightBrace,
            ],
        );
    }

    #[test]
    fn bitwise() {
        assert_tokens(
            "z ^= a & b | c ^ d",
            vec![
                ident("z"),
                BitwiseXORAssign,
                ident("a"),
                BitwiseAnd,
                ident("b"),
                BitwiseOr,
                ident("c"),
                BitwiseXOR,
                ident("d"),
            ],
        );
    }

    #[test]
    fn arrow_function() {
        assert_tokens(
            "a => a + 1",
            vec![ident("a"), Arrow, ident("a"), Addition, ident("1")],
        );
    }
}