powr_tokenizer/
lib.rs

#![warn(rust_2018_idioms, missing_debug_implementations)]

use crate::token::{keyword_token, Token};
use Token::*;

pub mod token;

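/// A character-by-character tokenizer that yields `Token`s through the
/// `Iterator` implementation below.
///
/// Illustrative usage (a sketch, assuming the crate is named `powr_tokenizer`
/// as the directory suggests; marked `ignore` so it is not run as a doctest):
///
/// ```ignore
/// use powr_tokenizer::Tokenizer;
///
/// let tokenizer = Tokenizer::new("1 + 1".chars().collect());
/// let tokens: Vec<_> = tokenizer.collect();
/// assert_eq!(tokens.len(), 3); // Identifier("1"), Addition, Identifier("1")
/// ```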
#[derive(Debug, Clone, PartialEq)]
pub struct Tokenizer {
    input: Vec<char>,
    position: usize,
    read_position: usize,
    ch: char,
}

impl Iterator for Tokenizer {
    type Item = Token;

    fn next(&mut self) -> Option<Token> {
        self.skip_whitespace();
        let token = self.get_token();
        self.read_char();

        match token {
            EndOfFile => None,
            rest => Some(rest),
        }
    }
}

#[allow(dead_code)]
impl Tokenizer {
    pub fn new(input: Vec<char>) -> Self {
        Self {
            input,
            position: 0,
            read_position: 0,
            // Start on a space so the first call to `next()` skips straight to
            // the first real character.
            ch: ' ',
        }
    }

    /// Advances to the next character. Past the end of the input, `ch` is set
    /// to a space, which doubles as the end-of-file sentinel.
    pub fn read_char(&mut self) {
        if self.read_position >= self.input.len() {
            self.ch = ' ';
        } else {
            self.ch = self.input[self.read_position];
        }

        self.position = self.read_position;
        self.read_position += 1;
    }

    /// Maps the current character (plus any lookahead needed for
    /// multi-character operators) to a single token.
    fn get_token(&mut self) -> Token {
        match self.ch {
            '[' => LeftBracket,
            ']' => RightBracket,
            '(' => LeftParenthesis,
            ')' => RightParenthesis,
            '{' => LeftBrace,
            '}' => RightBrace,
            '.' => {
                let is_spread =
                    self.look_ahead() == Some('.') && self.look_ahead_by(2) == Some('.');
                if is_spread {
                    self.skip_chars_by(2);
                    Spread
                } else {
                    Dot
                }
            }
            ';' => Semicolon,
            ':' => Colon,
            ',' => Comma,
            '<' => match self.look_ahead() {
                Some('=') => {
                    self.skip_next_char();
                    LessEquals
                }
                Some('<') => match self.look_ahead_by(2) {
                    Some('=') => {
                        self.skip_chars_by(2);
                        LeftShiftAssign
                    }
                    _ => {
                        self.skip_next_char();
                        LeftShift
                    }
                },
                _ => LessThan,
            },
            '>' => match self.look_ahead() {
                Some('=') => {
                    self.skip_next_char();
                    MoreEquals
                }
                Some('>') => match self.look_ahead_by(2) {
                    Some('=') => {
                        self.skip_chars_by(2);
                        RightShiftAssign
                    }
                    Some('>') => match self.look_ahead_by(3) {
                        Some('=') => {
                            self.skip_chars_by(3);
                            UnsignedRightShiftAssign
                        }
                        _ => {
                            self.skip_chars_by(2);
                            UnsignedRightShift
                        }
                    },
                    _ => {
                        self.skip_next_char();
                        RightShift
                    }
                },
                _ => MoreThan,
            },
            '+' => match self.look_ahead() {
                Some('=') => {
                    self.skip_next_char();
                    AdditionAssign
                }
                _ => Addition,
            },
            '-' => match self.look_ahead() {
                Some('=') => {
                    self.skip_next_char();
                    SubtractionAssign
                }
                _ => Subtraction,
            },
            '*' => match self.look_ahead() {
                Some('=') => {
                    self.skip_next_char();
                    MultiplicationAssign
                }
                _ => Multiplication,
            },
            '/' => match self.look_ahead() {
                Some('=') => {
                    self.skip_next_char();
                    DivisionAssign
                }
                _ => Division,
            },
            '%' => match self.look_ahead() {
                Some('=') => {
                    self.skip_next_char();
                    ModulusAssign
                }
                _ => Modulus,
            },
            '&' => match self.look_ahead() {
                Some('&') => {
                    self.skip_next_char();
                    LogicalAnd
                }
                _ => BitwiseAnd,
            },
            '|' => match self.look_ahead() {
                Some('|') => {
                    self.skip_next_char();
                    LogicalOr
                }
                _ => BitwiseOr,
            },
            '^' => match self.look_ahead() {
                Some('=') => {
                    self.skip_next_char();
                    BitwiseXORAssign
                }
                _ => BitwiseXOR,
            },
            '!' => match self.look_ahead() {
                Some('=') => {
                    self.skip_next_char();
                    NotEquals
                }
                _ => LogicalNot,
            },
            '~' => match self.look_ahead() {
                Some('=') => {
                    self.skip_next_char();
                    BitwiseNotAssign
                }
                _ => BitwiseNot,
            },
            '=' => match self.look_ahead() {
                Some('=') => match self.look_ahead_by(2) {
                    Some('=') => {
                        self.skip_chars_by(2);
                        StrictEquals
                    }
                    _ => {
                        self.skip_next_char();
                        Equals
                    }
                },
                Some('>') => {
                    self.skip_next_char();
                    Arrow
                }
                _ => Assign,
            },
            _ => {
                if self.is_letter() {
                    let id = self.read_identifier();

                    match keyword_token(&id) {
                        Ok(token) => token,
                        Err(_) => Identifier(id),
                    }
                } else if self.is_number() {
                    let id = self.read_number();

                    Identifier(id)
                } else {
                    EndOfFile
                }
            }
        }
    }

    /// Returns the character immediately after the current position, without
    /// consuming it.
    fn look_ahead(&self) -> Option<char> {
        self.look_ahead_by(1)
    }

    /// Returns the character `x` positions ahead of the current position,
    /// without consuming it, or `None` past the end of the input.
    fn look_ahead_by(&self, x: usize) -> Option<char> {
        self.input.get(self.position + x).copied()
    }

    fn skip_next_char(&mut self) {
        self.skip_chars_by(1);
    }

    fn skip_chars_by(&mut self, x: usize) {
        self.position += x;
        self.read_position += x;
    }

    fn read_identifier(&mut self) -> Vec<char> {
        let pos = self.position;

        while !self.is_eof() && self.is_letter() {
            self.read_char();
        }
        self.back();

        self.input[pos..self.position].to_vec()
    }

    fn read_number(&mut self) -> Vec<char> {
        let pos = self.position;

        while !self.is_eof() && self.is_number() {
            self.read_char();
        }
        self.back();

        self.input[pos..self.position].to_vec()
    }

    fn is_eof(&self) -> bool {
        self.read_position > self.input.len()
    }

    fn is_letter(&self) -> bool {
        self.ch.is_ascii_alphabetic() || self.ch == '_'
    }

    fn is_number(&self) -> bool {
        self.ch.is_ascii_digit()
    }

    fn skip_whitespace(&mut self) {
        // Consume every consecutive whitespace character so that runs of
        // spaces, tabs, and newlines between tokens are skipped in one call.
        while !self.is_eof() && matches!(self.ch, ' ' | '\t' | '\n' | '\r') {
            self.read_char();
        }
    }

    /// Steps back one position after `read_identifier`/`read_number` over-read
    /// by one character, so the character that terminated the lexeme is read
    /// again by the next `read_char`.
    fn back(&mut self) {
        self.read_position -= 1;
        self.ch = self.input[self.read_position - 1];
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    fn vec_char(input: &str) -> Vec<char> {
        input.chars().collect::<Vec<char>>()
    }

    fn check(tokenizer: Tokenizer, expected: Vec<Token>) {
        let actual: Vec<Token> = tokenizer.collect();
        assert_eq!(actual, expected);
    }

    fn identifier(name: &str) -> Token {
        Identifier(name.chars().collect::<Vec<char>>())
    }

    #[test]
    fn sum() {
        let sum = vec_char("1 + 1");
        let tokenizer = Tokenizer::new(sum);

        let expected = vec![identifier("1"), Addition, identifier("1")];

        check(tokenizer, expected);
    }

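    // Illustrative added test: multi-digit numbers, including the digit '9',
    // are read as a single Identifier token, mirroring how `sum` treats "1".
    #[test]
    fn multi_digit_number() {
        let input = vec_char("x = 99");
        let tokenizer = Tokenizer::new(input);

        let expected = vec![identifier("x"), Assign, identifier("99")];

        check(tokenizer, expected);
    }
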
    #[test]
    fn einstein() {
        let input = vec_char("e = m * c * c");
        let tokenizer = Tokenizer::new(input);

        let expected = vec![
            identifier("e"),
            Assign,
            identifier("m"),
            Multiplication,
            identifier("c"),
            Multiplication,
            identifier("c"),
        ];

        check(tokenizer, expected);
    }

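    // Illustrative added test: runs of spaces, tabs, and newlines between
    // tokens are skipped by `skip_whitespace`, so irregular formatting
    // produces the same token stream.
    #[test]
    fn consecutive_whitespace() {
        let input = vec_char("a   +\n\tb");
        let tokenizer = Tokenizer::new(input);

        let expected = vec![identifier("a"), Addition, identifier("b")];

        check(tokenizer, expected);
    }
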
    #[test]
    fn function() {
        let input = vec_char("function sum(a, b) { return a + b }");
        let tokenizer = Tokenizer::new(input);

        let sum = identifier("sum");
        let a = identifier("a");
        let b = identifier("b");
        let expected = vec![
            Function,
            sum,
            LeftParenthesis,
            a.clone(),
            Comma,
            b.clone(),
            RightParenthesis,
            LeftBrace,
            Return,
            a,
            Addition,
            b,
            RightBrace,
        ];

        check(tokenizer, expected);
    }

    #[test]
    fn symbol_after_keyword() {
        let input = vec_char("await (this.wait(200))");
        let tokenizer = Tokenizer::new(input);

        let expected = vec![
            Await,
            LeftParenthesis,
            This,
            Dot,
            identifier("wait"),
            LeftParenthesis,
            identifier("200"),
            RightParenthesis,
            RightParenthesis,
        ];

        check(tokenizer, expected);
    }

    #[test]
    fn different_symbols() {
        let input = vec_char("a = (b != c) == d");
        let tokenizer = Tokenizer::new(input);

        let expected = vec![
            identifier("a"),
            Assign,
            LeftParenthesis,
            identifier("b"),
            NotEquals,
            identifier("c"),
            RightParenthesis,
            Equals,
            identifier("d"),
        ];

        check(tokenizer, expected);
    }

    #[test]
    fn longer_symbols() {
        let input = vec_char("b >>>= c");
        let tokenizer = Tokenizer::new(input);

        let expected = vec![identifier("b"), UnsignedRightShiftAssign, identifier("c")];

        check(tokenizer, expected);
    }

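    // Illustrative added test: the three-dot lookahead in `get_token`
    // collapses "..." into a single Spread token.
    #[test]
    fn spread_operator() {
        let input = vec_char("sum(...a)");
        let tokenizer = Tokenizer::new(input);

        let expected = vec![
            identifier("sum"),
            LeftParenthesis,
            Spread,
            identifier("a"),
            RightParenthesis,
        ];

        check(tokenizer, expected);
    }
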
    #[test]
    fn keywords() {
        let input = vec_char("if (a) { return b } else { return c }");
        let tokenizer = Tokenizer::new(input);

        let expected = vec![
            If,
            LeftParenthesis,
            identifier("a"),
            RightParenthesis,
            LeftBrace,
            Return,
            identifier("b"),
            RightBrace,
            Else,
            LeftBrace,
            Return,
            identifier("c"),
            RightBrace,
        ];

        check(tokenizer, expected);
    }

    #[test]
    fn bitwise() {
        let input = vec_char("z ^= a & b | c ^ d");
        let tokenizer = Tokenizer::new(input);

        let expected = vec![
            identifier("z"),
            BitwiseXORAssign,
            identifier("a"),
            BitwiseAnd,
            identifier("b"),
            BitwiseOr,
            identifier("c"),
            BitwiseXOR,
            identifier("d"),
        ];

        check(tokenizer, expected);
    }

    #[test]
    fn arrow_function() {
        let input = vec_char("a => a + 1");
        let tokenizer = Tokenizer::new(input);

        let expected = vec![
            identifier("a"),
            Arrow,
            identifier("a"),
            Addition,
            identifier("1"),
        ];

        check(tokenizer, expected);
    }
}