1use crate::token::{Token, TokenKind, lookup_keyword};
2
/// A byte-oriented lexer over a source string; produced tokens borrow
/// their literal text from `input`.
pub struct Lexer<'a> {
    // Source text being tokenized.
    input: &'a str,
    // Byte index of the character currently held in `ch`.
    position: usize,
    // Byte index of the next character to read (one past `position`).
    read_position: usize,
    // Current character, or `None` once the end of input is reached.
    ch: Option<u8>,
}
9
10impl<'a> Lexer<'a> {
11 pub fn new(src: &'a str) -> Self {
12 let mut lexer = Lexer {
13 input: src,
14 position: 0,
15 read_position: 0,
16 ch: None,
17 };
18
19 lexer.read_char();
20 lexer
21 }
22
23 pub fn next_token(&mut self) -> Token<'a> {
24 self.skip_whitespace();
25
26 if Some(b'#') == self.ch {
27 while self.ch != Some(b'\n') && self.ch.is_some() {
28 self.read_char();
29 }
30 }
31
32 let token = match self.ch {
33 Some(b'{') => Token {
34 kind: TokenKind::LeftCurlyBrace,
35 literal: "{",
36 },
37 Some(b'}') => Token {
38 kind: TokenKind::RightCurlyBrace,
39 literal: "}",
40 },
41 Some(b'(') => Token {
42 kind: TokenKind::LeftParen,
43 literal: "(",
44 },
45 Some(b')') => Token {
46 kind: TokenKind::RightParen,
47 literal: ")",
48 },
49 Some(b'[') => Token {
50 kind: TokenKind::LeftSquareBracket,
51 literal: "[",
52 },
53 Some(b']') => Token {
54 kind: TokenKind::RightSquareBracket,
55 literal: "]",
56 },
57 Some(b',') => Token {
58 kind: TokenKind::Comma,
59 literal: ",",
60 },
61 Some(b';') => Token {
62 kind: TokenKind::Semicolon,
63 literal: ";",
64 },
65 Some(b'\n') => Token {
66 kind: TokenKind::NewLine,
67 literal: "<newline>",
68 },
69 Some(b'+') => {
70 if self.peek_char() == Some(b'=') {
71 self.read_char();
72 Token {
73 kind: TokenKind::AddAssign,
74 literal: "+=",
75 }
76 } else if self.peek_char() == Some(b'+') {
77 self.read_char();
78 Token {
79 kind: TokenKind::Increment,
80 literal: "++",
81 }
82 } else {
83 Token {
84 kind: TokenKind::Plus,
85 literal: "+",
86 }
87 }
88 }
89 Some(b'-') => {
90 if self.peek_char() == Some(b'=') {
91 self.read_char();
92 Token {
93 kind: TokenKind::SubtractAssign,
94 literal: "-=",
95 }
96 } else if self.peek_char() == Some(b'-') {
97 self.read_char();
98 Token {
99 kind: TokenKind::Decrement,
100 literal: "--",
101 }
102 } else {
103 Token {
104 kind: TokenKind::Minus,
105 literal: "-",
106 }
107 }
108 }
109 Some(b'*') => {
110 if self.peek_char() == Some(b'=') {
111 self.read_char();
112 Token {
113 kind: TokenKind::MultiplyAssign,
114 literal: "*=",
115 }
116 } else {
117 Token {
118 kind: TokenKind::Asterisk,
119 literal: "*",
120 }
121 }
122 }
123 Some(b'%') => {
124 if self.peek_char() == Some(b'=') {
125 self.read_char();
126 Token {
127 kind: TokenKind::ModuloAssign,
128 literal: "%=",
129 }
130 } else {
131 Token {
132 kind: TokenKind::Percent,
133 literal: "%",
134 }
135 }
136 }
137 Some(b'^') => {
138 if self.peek_char() == Some(b'=') {
139 self.read_char();
140 Token {
141 kind: TokenKind::PowerAssign,
142 literal: "^=",
143 }
144 } else {
145 Token {
146 kind: TokenKind::Caret,
147 literal: "^",
148 }
149 }
150 }
151 Some(b'!') => {
152 if self.peek_char() == Some(b'=') {
153 self.read_char();
154 Token {
155 kind: TokenKind::NotEqual,
156 literal: "!=",
157 }
158 } else if self.peek_char() == Some(b'~') {
159 self.read_char();
160 Token {
161 kind: TokenKind::NoMatch,
162 literal: "!~",
163 }
164 } else {
165 Token {
166 kind: TokenKind::ExclamationMark,
167 literal: "!",
168 }
169 }
170 }
171 Some(b'>') => {
172 if self.peek_char() == Some(b'=') {
173 self.read_char();
174 Token {
175 kind: TokenKind::GreaterThanOrEqual,
176 literal: ">=",
177 }
178 } else if self.peek_char() == Some(b'>') {
179 self.read_char();
180 Token {
181 kind: TokenKind::Append,
182 literal: ">>",
183 }
184 } else {
185 Token {
186 kind: TokenKind::GreaterThan,
187 literal: ">",
188 }
189 }
190 }
191 Some(b'<') => {
192 if self.peek_char() == Some(b'=') {
193 self.read_char();
194 Token {
195 kind: TokenKind::LessThanOrEqual,
196 literal: "<=",
197 }
198 } else {
199 Token {
200 kind: TokenKind::LessThan,
201 literal: "<",
202 }
203 }
204 }
205 Some(b'|') => {
206 if self.peek_char() == Some(b'|') {
207 self.read_char();
208 Token {
209 kind: TokenKind::Or,
210 literal: "||",
211 }
212 } else {
213 Token {
214 kind: TokenKind::Pipe,
215 literal: "|",
216 }
217 }
218 }
219 Some(b'?') => Token {
220 kind: TokenKind::QuestionMark,
221 literal: "?",
222 },
223 Some(b':') => Token {
224 kind: TokenKind::Colon,
225 literal: ":",
226 },
227 Some(b'~') => Token {
228 kind: TokenKind::Tilde,
229 literal: "~",
230 },
231 Some(b'$') => Token {
232 kind: TokenKind::DollarSign,
233 literal: "$",
234 },
235 Some(b'=') => {
236 if self.peek_char() == Some(b'=') {
237 self.read_char();
238 Token {
239 kind: TokenKind::Equal,
240 literal: "==",
241 }
242 } else {
243 Token {
244 kind: TokenKind::Assign,
245 literal: "=",
246 }
247 }
248 }
249 Some(b'/') => {
250 if self.peek_char() == Some(b'=') {
251 self.read_char();
252 Token {
253 kind: TokenKind::DivideAssign,
254 literal: "/=",
255 }
256 } else {
257 Token {
258 kind: TokenKind::Division,
259 literal: "/",
260 }
261 }
262 }
263 Some(b'&') => {
264 if self.peek_char() == Some(b'&') {
265 self.read_char();
266 Token {
267 kind: TokenKind::And,
268 literal: "&&",
269 }
270 } else {
271 Token {
272 kind: TokenKind::Illegal,
273 literal: "<illegal>",
274 }
275 }
276 }
277 Some(b'\\') => {
278 if self.peek_char() == Some(b'\n') {
279 self.read_char();
280 Token {
281 kind: TokenKind::NewLine,
282 literal: "<newline>",
283 }
284 } else {
285 Token {
286 kind: TokenKind::Illegal,
287 literal: "<illegal>",
288 }
289 }
290 }
291 ch if is_ascii_alphabetic(ch) => self.read_identifier(),
292 ch if is_digit(ch) => self.read_number(),
293 Some(b'.')
294 if self
295 .peek_char()
296 .map_or(false, |arg0: u8| is_digit(Some(arg0))) =>
297 {
298 self.read_number()
299 }
300 None => Token {
301 kind: TokenKind::Eof,
302 literal: "",
303 },
304 _ => Token {
305 kind: TokenKind::Illegal,
306 literal: "<illegal>",
307 },
308 };
309
310 self.read_char();
311 token
312 }
313
314 fn read_char(&mut self) {
315 if self.read_position >= self.input.len() {
316 self.ch = None;
317 } else {
318 self.ch = Some(self.input.as_bytes()[self.read_position]);
319 }
320 self.position = self.read_position;
321 self.read_position += 1;
322 }
323
324 fn read_identifier(&mut self) -> Token<'a> {
325 let position = self.position;
326 while is_ascii_alphabetic(self.ch) {
327 self.read_char();
328 }
329 let literal = &self.input[position..self.position];
330
331 return lookup_keyword(literal);
332 }
333
334 fn read_number(&mut self) -> Token<'a> {
335 let position = self.position;
336 let mut got_digit = false;
337
338 if self.ch != Some(b'.') {
340 got_digit = true;
341 while is_digit(self.ch) {
342 self.read_char();
343 }
344 if self.ch == Some(b'.') {
345 self.read_char();
346 }
347 } else {
348 self.read_char();
350 }
351
352 while is_digit(self.ch) {
354 got_digit = true;
355 self.read_char();
356 }
357
358 if !got_digit {
359 return Token {
360 kind: TokenKind::Illegal,
361 literal: "<illegal>",
362 };
363 }
364
365 let literal = &self.input[position..self.position];
366
367 Token {
368 kind: TokenKind::Number,
369 literal: literal,
370 }
371 }
372
373 fn skip_whitespace(&mut self) {
374 while is_whitespace(self.ch) {
375 self.read_char();
376 }
377 }
378
379 fn peek_char(&self) -> Option<u8> {
380 if self.read_position >= self.input.len() {
381 None
382 } else {
383 Some(self.input.as_bytes()[self.read_position])
384 }
385 }
386}
387
/// Returns true if `ch` holds an ASCII letter (`a-z` or `A-Z`).
fn is_ascii_alphabetic(ch: Option<u8>) -> bool {
    // Delegate to the standard library instead of a hand-rolled range check.
    ch.map_or(false, |byte| byte.is_ascii_alphabetic())
}
394
/// Returns true for the whitespace bytes the lexer skips: space, tab and
/// carriage return. `\n` is deliberately excluded — newlines are emitted
/// as tokens. (`u8::is_ascii_whitespace` would wrongly include `\n`.)
fn is_whitespace(ch: Option<u8>) -> bool {
    matches!(ch, Some(b' ') | Some(b'\t') | Some(b'\r'))
}
401
/// Returns true if `ch` holds an ASCII decimal digit (`0-9`).
fn is_digit(ch: Option<u8>) -> bool {
    // Delegate to the standard library instead of a hand-rolled range check.
    ch.map_or(false, |byte| byte.is_ascii_digit())
}
408
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for building an expected token.
    fn tok(kind: TokenKind, literal: &str) -> Token<'_> {
        Token { kind, literal }
    }

    /// Lexes `input` and asserts it yields exactly the `expected` token
    /// sequence (ending with an `Eof` token when the whole input is meant
    /// to be consumed).
    fn assert_tokens(input: &str, expected: &[Token]) {
        let mut lexer = Lexer::new(input);
        for want in expected {
            assert_eq!(*want, lexer.next_token());
        }
    }

    #[test]
    fn next_left_curly_brace_token() {
        assert_tokens("{", &[tok(TokenKind::LeftCurlyBrace, "{")]);
    }

    #[test]
    fn next_right_curly_brace_token() {
        assert_tokens("}", &[tok(TokenKind::RightCurlyBrace, "}")]);
    }

    #[test]
    fn next_pipe_token() {
        assert_tokens("|", &[tok(TokenKind::Pipe, "|")]);
    }

    #[test]
    fn next_one_character_token() {
        assert_tokens(
            "{}()[],;\n+-*/%^!><|?:~$=",
            &[
                tok(TokenKind::LeftCurlyBrace, "{"),
                tok(TokenKind::RightCurlyBrace, "}"),
                tok(TokenKind::LeftParen, "("),
                tok(TokenKind::RightParen, ")"),
                tok(TokenKind::LeftSquareBracket, "["),
                tok(TokenKind::RightSquareBracket, "]"),
                tok(TokenKind::Comma, ","),
                tok(TokenKind::Semicolon, ";"),
                tok(TokenKind::NewLine, "<newline>"),
                tok(TokenKind::Plus, "+"),
                tok(TokenKind::Minus, "-"),
                tok(TokenKind::Asterisk, "*"),
                tok(TokenKind::Division, "/"),
                tok(TokenKind::Percent, "%"),
                tok(TokenKind::Caret, "^"),
                tok(TokenKind::ExclamationMark, "!"),
                tok(TokenKind::GreaterThan, ">"),
                tok(TokenKind::LessThan, "<"),
                tok(TokenKind::Pipe, "|"),
                tok(TokenKind::QuestionMark, "?"),
                tok(TokenKind::Colon, ":"),
                tok(TokenKind::Tilde, "~"),
                tok(TokenKind::DollarSign, "$"),
                tok(TokenKind::Assign, "="),
                tok(TokenKind::Eof, ""),
            ],
        );
    }

    #[test]
    fn next_while_token() {
        assert_tokens("while", &[tok(TokenKind::While, "while")]);
    }

    #[test]
    fn next_identifier_token() {
        assert_tokens(
            "BEGIN END break continue delete do else exit for function if in next print printf return while",
            &[
                tok(TokenKind::Begin, "BEGIN"),
                tok(TokenKind::End, "END"),
                tok(TokenKind::Break, "break"),
                tok(TokenKind::Continue, "continue"),
                tok(TokenKind::Delete, "delete"),
                tok(TokenKind::Do, "do"),
                tok(TokenKind::Else, "else"),
                tok(TokenKind::Exit, "exit"),
                tok(TokenKind::For, "for"),
                tok(TokenKind::Function, "function"),
                tok(TokenKind::If, "if"),
                tok(TokenKind::In, "in"),
                tok(TokenKind::Next, "next"),
                tok(TokenKind::Print, "print"),
                tok(TokenKind::Printf, "printf"),
                tok(TokenKind::Return, "return"),
                tok(TokenKind::While, "while"),
                tok(TokenKind::Eof, ""),
            ],
        );
    }

    #[test]
    fn next_number_token() {
        assert_tokens(
            "123 4567 890 42.0 .75 0.001",
            &[
                tok(TokenKind::Number, "123"),
                tok(TokenKind::Number, "4567"),
                tok(TokenKind::Number, "890"),
                tok(TokenKind::Number, "42.0"),
                tok(TokenKind::Number, ".75"),
                tok(TokenKind::Number, "0.001"),
                tok(TokenKind::Eof, ""),
            ],
        );
    }

    #[test]
    fn next_or_token() {
        assert_tokens("||", &[tok(TokenKind::Or, "||")]);
    }

    #[test]
    fn next_two_character_token() {
        assert_tokens(
            "+= -= *= /= %= ^= || && !~ == <= >= != ++ -- >>",
            &[
                tok(TokenKind::AddAssign, "+="),
                tok(TokenKind::SubtractAssign, "-="),
                tok(TokenKind::MultiplyAssign, "*="),
                tok(TokenKind::DivideAssign, "/="),
                tok(TokenKind::ModuloAssign, "%="),
                tok(TokenKind::PowerAssign, "^="),
                tok(TokenKind::Or, "||"),
                tok(TokenKind::And, "&&"),
                tok(TokenKind::NoMatch, "!~"),
                tok(TokenKind::Equal, "=="),
                tok(TokenKind::LessThanOrEqual, "<="),
                tok(TokenKind::GreaterThanOrEqual, ">="),
                tok(TokenKind::NotEqual, "!="),
                tok(TokenKind::Increment, "++"),
                tok(TokenKind::Decrement, "--"),
                tok(TokenKind::Append, ">>"),
                tok(TokenKind::Eof, ""),
            ],
        );
    }

    #[test]
    fn consume_comment() {
        assert_tokens(
            "# This is a comment\n123",
            &[
                tok(TokenKind::NewLine, "<newline>"),
                tok(TokenKind::Number, "123"),
                tok(TokenKind::Eof, ""),
            ],
        );
    }

    #[test]
    fn expect_newline_after_backslash() {
        assert_tokens(
            "123 \\\n456",
            &[
                tok(TokenKind::Number, "123"),
                tok(TokenKind::NewLine, "<newline>"),
                tok(TokenKind::Number, "456"),
                tok(TokenKind::Eof, ""),
            ],
        );
    }

    #[test]
    fn backslash_without_newline_is_illegal() {
        assert_tokens(
            "123 \\ 456",
            &[
                tok(TokenKind::Number, "123"),
                tok(TokenKind::Illegal, "<illegal>"),
                tok(TokenKind::Number, "456"),
                tok(TokenKind::Eof, ""),
            ],
        );
    }

    #[test]
    fn is_ascii_alphabetic_lowercase() {
        assert!(is_ascii_alphabetic(Some(b'a')));
        assert!(is_ascii_alphabetic(Some(b'z')));
        assert!(is_ascii_alphabetic(Some(b'm')));
    }

    #[test]
    fn is_ascii_alphabetic_uppercase() {
        assert!(is_ascii_alphabetic(Some(b'A')));
        assert!(is_ascii_alphabetic(Some(b'Z')));
        assert!(is_ascii_alphabetic(Some(b'M')));
    }

    #[test]
    fn is_ascii_alphabetic_digits() {
        assert!(!is_ascii_alphabetic(Some(b'0')));
        assert!(!is_ascii_alphabetic(Some(b'5')));
        assert!(!is_ascii_alphabetic(Some(b'9')));
    }

    #[test]
    fn is_ascii_alphabetic_special_chars() {
        assert!(!is_ascii_alphabetic(Some(b'!')));
        assert!(!is_ascii_alphabetic(Some(b' ')));
        assert!(!is_ascii_alphabetic(Some(b'{')));
        assert!(!is_ascii_alphabetic(Some(b'=')));
    }

    #[test]
    fn is_ascii_alphabetic_none() {
        assert!(!is_ascii_alphabetic(None));
    }

    #[test]
    fn is_whitespace_space() {
        assert!(is_whitespace(Some(b' ')), "space is considered whitespace");
        assert!(is_whitespace(Some(b'\t')), "tab is considered whitespace");
        assert!(
            is_whitespace(Some(b'\r')),
            "carriage return is considered whitespace"
        );
    }

    #[test]
    fn is_whitespace_special_chars() {
        assert!(!is_whitespace(Some(b'!')));
        assert!(!is_whitespace(Some(b'{')));
        assert!(!is_whitespace(Some(b'=')));
    }

    #[test]
    fn is_whitespace_none() {
        assert!(!is_whitespace(None));
    }

    #[test]
    fn is_digit_valid() {
        assert!(is_digit(Some(b'0')));
        assert!(is_digit(Some(b'5')));
        assert!(is_digit(Some(b'9')));
    }

    #[test]
    fn is_digit_invalid() {
        assert!(!is_digit(Some(b'a')));
        assert!(!is_digit(Some(b'z')));
        assert!(!is_digit(Some(b'A')));
        assert!(!is_digit(Some(b'Z')));
        assert!(!is_digit(Some(b'!')));
        assert!(!is_digit(Some(b' ')));
        assert!(!is_digit(Some(b'{')));
        assert!(!is_digit(Some(b'=')));
    }

    #[test]
    fn is_digit_none() {
        assert!(!is_digit(None));
    }
}