rawk_core/lexer.rs

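//! A byte-oriented lexer for AWK source text. Tokens borrow their literals
//! from the input string, so lexing never allocates.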
use crate::token::{Token, TokenKind, lookup_keyword};

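/// Lexer state over the raw bytes of the source.
///
/// `position` is the byte offset of the current character, `read_position`
/// the offset of the byte to read next, and `ch` caches the current byte
/// (`None` once the input is exhausted).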
pub struct Lexer<'a> {
    input: &'a str,
    position: usize,
    read_position: usize,
    ch: Option<u8>,
}

impl<'a> Lexer<'a> {
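    /// Creates a lexer primed on the first byte of `src`.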
    pub fn new(src: &'a str) -> Self {
        let mut lexer = Lexer {
            input: src,
            position: 0,
            read_position: 0,
            ch: None,
        };

        lexer.read_char();
        lexer
    }

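    /// Scans and returns the next token.
    ///
    /// Whitespace is skipped, comments are consumed up to (but not
    /// including) their terminating newline, and end of input is reported
    /// as an `Eof` token.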
    pub fn next_token(&mut self) -> Token<'a> {
        self.skip_whitespace();

        if self.ch == Some(b'#') {
            // A comment runs to the end of the line; stop on the newline so
            // it is still emitted as a `NewLine` token.
            while self.ch != Some(b'\n') && self.ch.is_some() {
                self.read_char();
            }
        }

        let token = match self.ch {
            Some(b'{') => Token {
                kind: TokenKind::LeftCurlyBrace,
                literal: "{",
            },
            Some(b'}') => Token {
                kind: TokenKind::RightCurlyBrace,
                literal: "}",
            },
            Some(b'(') => Token {
                kind: TokenKind::LeftParen,
                literal: "(",
            },
            Some(b')') => Token {
                kind: TokenKind::RightParen,
                literal: ")",
            },
            Some(b'[') => Token {
                kind: TokenKind::LeftSquareBracket,
                literal: "[",
            },
            Some(b']') => Token {
                kind: TokenKind::RightSquareBracket,
                literal: "]",
            },
            Some(b',') => Token {
                kind: TokenKind::Comma,
                literal: ",",
            },
            Some(b';') => Token {
                kind: TokenKind::Semicolon,
                literal: ";",
            },
            Some(b'\n') => Token {
                kind: TokenKind::NewLine,
                literal: "<newline>",
            },
            Some(b'+') => {
                if self.peek_char() == Some(b'=') {
                    self.read_char();
                    Token {
                        kind: TokenKind::AddAssign,
                        literal: "+=",
                    }
                } else if self.peek_char() == Some(b'+') {
                    self.read_char();
                    Token {
                        kind: TokenKind::Increment,
                        literal: "++",
                    }
                } else {
                    Token {
                        kind: TokenKind::Plus,
                        literal: "+",
                    }
                }
            }
            Some(b'-') => {
                if self.peek_char() == Some(b'=') {
                    self.read_char();
                    Token {
                        kind: TokenKind::SubtractAssign,
                        literal: "-=",
                    }
                } else if self.peek_char() == Some(b'-') {
                    self.read_char();
                    Token {
                        kind: TokenKind::Decrement,
                        literal: "--",
                    }
                } else {
                    Token {
                        kind: TokenKind::Minus,
                        literal: "-",
                    }
                }
            }
            Some(b'*') => {
                if self.peek_char() == Some(b'=') {
                    self.read_char();
                    Token {
                        kind: TokenKind::MultiplyAssign,
                        literal: "*=",
                    }
                } else {
                    Token {
                        kind: TokenKind::Asterisk,
                        literal: "*",
                    }
                }
            }
            Some(b'%') => {
                if self.peek_char() == Some(b'=') {
                    self.read_char();
                    Token {
                        kind: TokenKind::ModuloAssign,
                        literal: "%=",
                    }
                } else {
                    Token {
                        kind: TokenKind::Percent,
                        literal: "%",
                    }
                }
            }
            Some(b'^') => {
                if self.peek_char() == Some(b'=') {
                    self.read_char();
                    Token {
                        kind: TokenKind::PowerAssign,
                        literal: "^=",
                    }
                } else {
                    Token {
                        kind: TokenKind::Caret,
                        literal: "^",
                    }
                }
            }
            Some(b'!') => {
                if self.peek_char() == Some(b'=') {
                    self.read_char();
                    Token {
                        kind: TokenKind::NotEqual,
                        literal: "!=",
                    }
                } else if self.peek_char() == Some(b'~') {
                    self.read_char();
                    Token {
                        kind: TokenKind::NoMatch,
                        literal: "!~",
                    }
                } else {
                    Token {
                        kind: TokenKind::ExclamationMark,
                        literal: "!",
                    }
                }
            }
            Some(b'>') => {
                if self.peek_char() == Some(b'=') {
                    self.read_char();
                    Token {
                        kind: TokenKind::GreaterThanOrEqual,
                        literal: ">=",
                    }
                } else if self.peek_char() == Some(b'>') {
                    self.read_char();
                    Token {
                        kind: TokenKind::Append,
                        literal: ">>",
                    }
                } else {
                    Token {
                        kind: TokenKind::GreaterThan,
                        literal: ">",
                    }
                }
            }
            Some(b'<') => {
                if self.peek_char() == Some(b'=') {
                    self.read_char();
                    Token {
                        kind: TokenKind::LessThanOrEqual,
                        literal: "<=",
                    }
                } else {
                    Token {
                        kind: TokenKind::LessThan,
                        literal: "<",
                    }
                }
            }
            Some(b'|') => {
                if self.peek_char() == Some(b'|') {
                    self.read_char();
                    Token {
                        kind: TokenKind::Or,
                        literal: "||",
                    }
                } else {
                    Token {
                        kind: TokenKind::Pipe,
                        literal: "|",
                    }
                }
            }
            Some(b'?') => Token {
                kind: TokenKind::QuestionMark,
                literal: "?",
            },
            Some(b':') => Token {
                kind: TokenKind::Colon,
                literal: ":",
            },
            Some(b'~') => Token {
                kind: TokenKind::Tilde,
                literal: "~",
            },
            Some(b'$') => Token {
                kind: TokenKind::DollarSign,
                literal: "$",
            },
            Some(b'=') => {
                if self.peek_char() == Some(b'=') {
                    self.read_char();
                    Token {
                        kind: TokenKind::Equal,
                        literal: "==",
                    }
                } else {
                    Token {
                        kind: TokenKind::Assign,
                        literal: "=",
                    }
                }
            }
            Some(b'/') => {
                if self.peek_char() == Some(b'=') {
                    self.read_char();
                    Token {
                        kind: TokenKind::DivideAssign,
                        literal: "/=",
                    }
                } else {
                    Token {
                        kind: TokenKind::Division,
                        literal: "/",
                    }
                }
            }
            Some(b'&') => {
                if self.peek_char() == Some(b'&') {
                    self.read_char();
                    Token {
                        kind: TokenKind::And,
                        literal: "&&",
                    }
                } else {
                    // A lone `&` is not an operator in this dialect.
                    Token {
                        kind: TokenKind::Illegal,
                        literal: "<illegal>",
                    }
                }
            }
            Some(b'\\') => {
                // A backslash is only legal immediately before a newline
                // (line continuation).
                if self.peek_char() == Some(b'\n') {
                    self.read_char();
                    Token {
                        kind: TokenKind::NewLine,
                        literal: "<newline>",
                    }
                } else {
                    Token {
                        kind: TokenKind::Illegal,
                        literal: "<illegal>",
                    }
                }
            }
            // `read_identifier` and `read_number` already leave `ch` on the
            // first byte after the literal, so return early: falling through
            // to the `read_char` below would drop the byte that follows the
            // literal (e.g. the `+` in `1+2`).
            ch if is_ascii_alphabetic(ch) => return self.read_identifier(),
            ch if is_digit(ch) => return self.read_number(),
            Some(b'.') if is_digit(self.peek_char()) => return self.read_number(),
            None => Token {
                kind: TokenKind::Eof,
                literal: "",
            },
            _ => Token {
                kind: TokenKind::Illegal,
                literal: "<illegal>",
            },
        };

        // Advance past the one- or two-byte token just matched.
        self.read_char();
        token
    }

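    /// Advances the cursor one byte, setting `ch` to `None` at end of input.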
    fn read_char(&mut self) {
        if self.read_position >= self.input.len() {
            self.ch = None;
        } else {
            self.ch = Some(self.input.as_bytes()[self.read_position]);
        }
        self.position = self.read_position;
        self.read_position += 1;
    }

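    /// Reads an alphabetic run and classifies it via `lookup_keyword`,
    /// leaving `ch` on the first byte after the run.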
    fn read_identifier(&mut self) -> Token<'a> {
        let position = self.position;
        while is_ascii_alphabetic(self.ch) {
            self.read_char();
        }
        let literal = &self.input[position..self.position];

        lookup_keyword(literal)
    }

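    /// Reads an integer or decimal literal (`123`, `42.0`, `.75`), leaving
    /// `ch` on the first byte after the literal.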
    fn read_number(&mut self) -> Token<'a> {
        let position = self.position;
        let mut got_digit = false;

        // Consume the integer part, then an optional decimal point.
        if self.ch != Some(b'.') {
            got_digit = true;
            while is_digit(self.ch) {
                self.read_char();
            }
            if self.ch == Some(b'.') {
                self.read_char();
            }
        } else {
            // Leading-dot form (`.75`): consume the dot.
            self.read_char();
        }

        // Consume the fractional digits.
        while is_digit(self.ch) {
            got_digit = true;
            self.read_char();
        }

        // A bare `.` is not a number. (The caller only dispatches here when
        // a digit follows the dot, so this is a defensive check.)
        if !got_digit {
            return Token {
                kind: TokenKind::Illegal,
                literal: "<illegal>",
            };
        }

        let literal = &self.input[position..self.position];

        Token {
            kind: TokenKind::Number,
            literal,
        }
    }

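    /// Skips spaces, tabs, and carriage returns. Newlines are significant
    /// in AWK and are left for `next_token` to emit as tokens.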
    fn skip_whitespace(&mut self) {
        while is_whitespace(self.ch) {
            self.read_char();
        }
    }

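    /// Returns the byte after the current one without advancing the cursor.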
    fn peek_char(&self) -> Option<u8> {
        if self.read_position >= self.input.len() {
            None
        } else {
            Some(self.input.as_bytes()[self.read_position])
        }
    }
}

fn is_ascii_alphabetic(ch: Option<u8>) -> bool {
    ch.map_or(false, |byte| byte.is_ascii_alphabetic())
}

fn is_whitespace(ch: Option<u8>) -> bool {
    // Deliberately excludes `\n`, which the lexer emits as a `NewLine` token.
    matches!(ch, Some(b' ' | b'\t' | b'\r'))
}

fn is_digit(ch: Option<u8>) -> bool {
    ch.map_or(false, |byte| byte.is_ascii_digit())
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn next_left_curly_brace_token() {
        let expected = Token {
            kind: TokenKind::LeftCurlyBrace,
            literal: "{",
        };
        let input = "{";
        let mut lexer = Lexer::new(input);

        let token = lexer.next_token();

        assert_eq!(expected, token);
    }

    #[test]
    fn next_right_curly_brace_token() {
        let expected = Token {
            kind: TokenKind::RightCurlyBrace,
            literal: "}",
        };
        let input = "}";
        let mut lexer = Lexer::new(input);

        let token = lexer.next_token();

        assert_eq!(expected, token);
    }

    #[test]
    fn next_pipe_token() {
        let expected = Token {
            kind: TokenKind::Pipe,
            literal: "|",
        };
        let input = "|";
        let mut lexer = Lexer::new(input);

        let token = lexer.next_token();

        assert_eq!(expected, token);
    }

    #[test]
    fn next_one_character_token() {
        let input = "{}()[],;\n+-*/%^!><|?:~$=";
        let mut lexer = Lexer::new(input);
        let expected_tokens = vec![
            Token {
                kind: TokenKind::LeftCurlyBrace,
                literal: "{",
            },
            Token {
                kind: TokenKind::RightCurlyBrace,
                literal: "}",
            },
            Token {
                kind: TokenKind::LeftParen,
                literal: "(",
            },
            Token {
                kind: TokenKind::RightParen,
                literal: ")",
            },
            Token {
                kind: TokenKind::LeftSquareBracket,
                literal: "[",
            },
            Token {
                kind: TokenKind::RightSquareBracket,
                literal: "]",
            },
            Token {
                kind: TokenKind::Comma,
                literal: ",",
            },
            Token {
                kind: TokenKind::Semicolon,
                literal: ";",
            },
            Token {
                kind: TokenKind::NewLine,
                literal: "<newline>",
            },
            Token {
                kind: TokenKind::Plus,
                literal: "+",
            },
            Token {
                kind: TokenKind::Minus,
                literal: "-",
            },
            Token {
                kind: TokenKind::Asterisk,
                literal: "*",
            },
            Token {
                kind: TokenKind::Division,
                literal: "/",
            },
            Token {
                kind: TokenKind::Percent,
                literal: "%",
            },
            Token {
                kind: TokenKind::Caret,
                literal: "^",
            },
            Token {
                kind: TokenKind::ExclamationMark,
                literal: "!",
            },
            Token {
                kind: TokenKind::GreaterThan,
                literal: ">",
            },
            Token {
                kind: TokenKind::LessThan,
                literal: "<",
            },
            Token {
                kind: TokenKind::Pipe,
                literal: "|",
            },
            Token {
                kind: TokenKind::QuestionMark,
                literal: "?",
            },
            Token {
                kind: TokenKind::Colon,
                literal: ":",
            },
            Token {
                kind: TokenKind::Tilde,
                literal: "~",
            },
            Token {
                kind: TokenKind::DollarSign,
                literal: "$",
            },
            Token {
                kind: TokenKind::Assign,
                literal: "=",
            },
            Token {
                kind: TokenKind::Eof,
                literal: "",
            },
        ];

        for expected in expected_tokens {
            let token = lexer.next_token();
            assert_eq!(expected, token);
        }
    }

    #[test]
    fn next_while_token() {
        let expected = Token {
            kind: TokenKind::While,
            literal: "while",
        };
        let input = "while";
        let mut lexer = Lexer::new(input);

        let token = lexer.next_token();

        assert_eq!(expected, token);
    }

    #[test]
    fn next_identifier_token() {
        let input = "BEGIN END break continue delete do else exit for function if in next print printf return while";
        let mut lexer = Lexer::new(input);

        let expected_tokens = vec![
            Token {
                kind: TokenKind::Begin,
                literal: "BEGIN",
            },
            Token {
                kind: TokenKind::End,
                literal: "END",
            },
            Token {
                kind: TokenKind::Break,
                literal: "break",
            },
            Token {
                kind: TokenKind::Continue,
                literal: "continue",
            },
            Token {
                kind: TokenKind::Delete,
                literal: "delete",
            },
            Token {
                kind: TokenKind::Do,
                literal: "do",
            },
            Token {
                kind: TokenKind::Else,
                literal: "else",
            },
            Token {
                kind: TokenKind::Exit,
                literal: "exit",
            },
            Token {
                kind: TokenKind::For,
                literal: "for",
            },
            Token {
                kind: TokenKind::Function,
                literal: "function",
            },
            Token {
                kind: TokenKind::If,
                literal: "if",
            },
            Token {
                kind: TokenKind::In,
                literal: "in",
            },
            Token {
                kind: TokenKind::Next,
                literal: "next",
            },
            Token {
                kind: TokenKind::Print,
                literal: "print",
            },
            Token {
                kind: TokenKind::Printf,
                literal: "printf",
            },
            Token {
                kind: TokenKind::Return,
                literal: "return",
            },
            Token {
                kind: TokenKind::While,
                literal: "while",
            },
            Token {
                kind: TokenKind::Eof,
                literal: "",
            },
        ];

        for expected in expected_tokens {
            let token = lexer.next_token();
            assert_eq!(expected, token);
        }
    }

    #[test]
    fn next_number_token() {
        let input = "123 4567 890 42.0 .75 0.001";
        let mut lexer = Lexer::new(input);

        let expected_tokens = vec![
            Token {
                kind: TokenKind::Number,
                literal: "123",
            },
            Token {
                kind: TokenKind::Number,
                literal: "4567",
            },
            Token {
                kind: TokenKind::Number,
                literal: "890",
            },
            Token {
                kind: TokenKind::Number,
                literal: "42.0",
            },
            Token {
                kind: TokenKind::Number,
                literal: ".75",
            },
            Token {
                kind: TokenKind::Number,
                literal: "0.001",
            },
            Token {
                kind: TokenKind::Eof,
                literal: "",
            },
        ];

        for expected in expected_tokens {
            let token = lexer.next_token();
            assert_eq!(expected, token);
        }
    }

    #[test]
    fn next_or_token() {
        let expected = Token {
            kind: TokenKind::Or,
            literal: "||",
        };
        let input = "||";
        let mut lexer = Lexer::new(input);

        let token = lexer.next_token();

        assert_eq!(expected, token);
    }

    #[test]
    fn next_two_character_token() {
        let input = "+= -= *= /= %= ^= || && !~ == <= >= != ++ -- >>";
        let mut lexer = Lexer::new(input);

        let expected_tokens = vec![
            Token {
                kind: TokenKind::AddAssign,
                literal: "+=",
            },
            Token {
                kind: TokenKind::SubtractAssign,
                literal: "-=",
            },
            Token {
                kind: TokenKind::MultiplyAssign,
                literal: "*=",
            },
            Token {
                kind: TokenKind::DivideAssign,
                literal: "/=",
            },
            Token {
                kind: TokenKind::ModuloAssign,
                literal: "%=",
            },
            Token {
                kind: TokenKind::PowerAssign,
                literal: "^=",
            },
            Token {
                kind: TokenKind::Or,
                literal: "||",
            },
            Token {
                kind: TokenKind::And,
                literal: "&&",
            },
            Token {
                kind: TokenKind::NoMatch,
                literal: "!~",
            },
            Token {
                kind: TokenKind::Equal,
                literal: "==",
            },
            Token {
                kind: TokenKind::LessThanOrEqual,
                literal: "<=",
            },
            Token {
                kind: TokenKind::GreaterThanOrEqual,
                literal: ">=",
            },
            Token {
                kind: TokenKind::NotEqual,
                literal: "!=",
            },
            Token {
                kind: TokenKind::Increment,
                literal: "++",
            },
            Token {
                kind: TokenKind::Decrement,
                literal: "--",
            },
            Token {
                kind: TokenKind::Append,
                literal: ">>",
            },
            Token {
                kind: TokenKind::Eof,
                literal: "",
            },
        ];

        for expected in expected_tokens {
            let token = lexer.next_token();
            assert_eq!(expected, token);
        }
    }

    #[test]
    fn consume_comment() {
        let input = "# This is a comment\n123";
        let mut lexer = Lexer::new(input);

        let expected_tokens = vec![
            Token {
                kind: TokenKind::NewLine,
                literal: "<newline>",
            },
            Token {
                kind: TokenKind::Number,
                literal: "123",
            },
            Token {
                kind: TokenKind::Eof,
                literal: "",
            },
        ];

        for expected in expected_tokens {
            let token = lexer.next_token();
            assert_eq!(expected, token);
        }
    }

    #[test]
    fn expect_newline_after_backslash() {
        let input = "123 \\\n456";
        let mut lexer = Lexer::new(input);

        let expected_tokens = vec![
            Token {
                kind: TokenKind::Number,
                literal: "123",
            },
            Token {
                kind: TokenKind::NewLine,
                literal: "<newline>",
            },
            Token {
                kind: TokenKind::Number,
                literal: "456",
            },
            Token {
                kind: TokenKind::Eof,
                literal: "",
            },
        ];
        for expected in expected_tokens {
            let token = lexer.next_token();
            assert_eq!(expected, token);
        }
    }

    #[test]
    fn backslash_without_newline_is_illegal() {
        let input = "123 \\ 456";
        let mut lexer = Lexer::new(input);

        let expected_tokens = vec![
            Token {
                kind: TokenKind::Number,
                literal: "123",
            },
            Token {
                kind: TokenKind::Illegal,
                literal: "<illegal>",
            },
            Token {
                kind: TokenKind::Number,
                literal: "456",
            },
            Token {
                kind: TokenKind::Eof,
                literal: "",
            },
        ];
        for expected in expected_tokens {
            let token = lexer.next_token();
            assert_eq!(expected, token);
        }
    }

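    // Adjacent literals and operators must tokenize cleanly: `read_number`
    // leaves `ch` on the byte after the literal, and `next_token` returns
    // early from that arm rather than skipping it. The tests above only
    // exercise whitespace-separated literals, so this covers the gap.
    #[test]
    fn next_number_token_adjacent_to_operator() {
        let input = "1+2";
        let mut lexer = Lexer::new(input);

        let expected_tokens = vec![
            Token {
                kind: TokenKind::Number,
                literal: "1",
            },
            Token {
                kind: TokenKind::Plus,
                literal: "+",
            },
            Token {
                kind: TokenKind::Number,
                literal: "2",
            },
            Token {
                kind: TokenKind::Eof,
                literal: "",
            },
        ];

        for expected in expected_tokens {
            let token = lexer.next_token();
            assert_eq!(expected, token);
        }
    }
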
    #[test]
    fn is_ascii_alphabetic_lowercase() {
        assert!(is_ascii_alphabetic(Some(b'a')));
        assert!(is_ascii_alphabetic(Some(b'z')));
        assert!(is_ascii_alphabetic(Some(b'm')));
    }

    #[test]
    fn is_ascii_alphabetic_uppercase() {
        assert!(is_ascii_alphabetic(Some(b'A')));
        assert!(is_ascii_alphabetic(Some(b'Z')));
        assert!(is_ascii_alphabetic(Some(b'M')));
    }

    #[test]
    fn is_ascii_alphabetic_digits() {
        assert!(!is_ascii_alphabetic(Some(b'0')));
        assert!(!is_ascii_alphabetic(Some(b'5')));
        assert!(!is_ascii_alphabetic(Some(b'9')));
    }

    #[test]
    fn is_ascii_alphabetic_special_chars() {
        assert!(!is_ascii_alphabetic(Some(b'!')));
        assert!(!is_ascii_alphabetic(Some(b' ')));
        assert!(!is_ascii_alphabetic(Some(b'{')));
        assert!(!is_ascii_alphabetic(Some(b'=')));
    }

    #[test]
    fn is_ascii_alphabetic_none() {
        assert!(!is_ascii_alphabetic(None));
    }

    #[test]
    fn is_whitespace_space() {
        assert!(is_whitespace(Some(b' ')), "space is considered whitespace");
        assert!(is_whitespace(Some(b'\t')), "tab is considered whitespace");
        assert!(
            is_whitespace(Some(b'\r')),
            "carriage return is considered whitespace"
        );
    }

    #[test]
    fn is_whitespace_special_chars() {
        assert!(!is_whitespace(Some(b'!')));
        assert!(!is_whitespace(Some(b'{')));
        assert!(!is_whitespace(Some(b'=')));
    }

    #[test]
    fn is_whitespace_none() {
        assert!(!is_whitespace(None));
    }

    #[test]
    fn is_digit_valid() {
        assert!(is_digit(Some(b'0')));
        assert!(is_digit(Some(b'5')));
        assert!(is_digit(Some(b'9')));
    }

    #[test]
    fn is_digit_invalid() {
        assert!(!is_digit(Some(b'a')));
        assert!(!is_digit(Some(b'z')));
        assert!(!is_digit(Some(b'A')));
        assert!(!is_digit(Some(b'Z')));
        assert!(!is_digit(Some(b'!')));
        assert!(!is_digit(Some(b' ')));
        assert!(!is_digit(Some(b'{')));
        assert!(!is_digit(Some(b'=')));
    }

    #[test]
    fn is_digit_none() {
        assert!(!is_digit(None));
    }
}