1use std::{error::Error, fmt::Display};
4
5pub trait Tokenizable {
7 fn tokenize(&self) -> Result<Vec<Token<'_>>, TokenError>;
9}
10
11#[derive(Debug, PartialEq, Clone, Copy)]
13pub struct Token<'a> {
14 pub line: usize,
16 pub col: usize,
18 pub len: usize,
20 pub tag: TokenTag<'a>,
22}
23
24#[derive(Debug, PartialEq, Clone, Copy)]
26pub enum TokenTag<'a> {
27 Identifier(&'a str),
29 Number(f64),
31 String(&'a str),
33 Keyword(Keyword),
35 OpenBrace,
37 CloseBrace,
39 OpenParen,
41 CloseParen,
43 OpenBracket,
45 CloseBracket,
47 Semicolon,
49 Plus,
51 PlusPlus,
53 PlusEq,
55 Minus,
57 Star,
59 Comma,
61 Dot,
63 Slash,
65 Equal,
67 Greater,
69 GreaterEqual,
71 EqualEqual,
73 BangEqual,
75 Bang,
77 Less,
79 LessEqual,
81
82 EOF,
84}
85
86impl Display for TokenTag<'_> {
87 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
88 match *self {
89 Self::PlusPlus => write!(f, "++"),
90 Self::PlusEq => write!(f, "+="),
91 Self::Dot => write!(f, "."),
92 Self::Semicolon => write!(f, ";"),
93 Self::Number(n) => write!(f, "{n}"),
94 Self::String(s) => write!(f, "{s}"),
95 Self::Identifier(s) => write!(f, "{s}"),
96 Self::Keyword(k) => write!(f, "{k}"),
97 Self::OpenBrace => write!(f, "{{"),
98 Self::CloseBrace => write!(f, "}}"),
99
100 Self::OpenParen => write!(f, "("),
101 Self::CloseParen => write!(f, ")"),
102 Self::OpenBracket => write!(f, "["),
103 Self::CloseBracket => write!(f, "]"),
104
105 Self::Plus => write!(f, "+"),
106 Self::Minus => write!(f, "-"),
107 Self::Slash => write!(f, "/"),
108 Self::Star => write!(f, "*"),
109
110 Self::Greater => write!(f, ">"),
111 Self::GreaterEqual => write!(f, ">="),
112 Self::Less => write!(f, "<"),
113 Self::LessEqual => write!(f, "<="),
114
115 Self::Bang => write!(f, "!"),
116 Self::BangEqual => write!(f, "!="),
117 Self::Equal => write!(f, "="),
118 Self::EqualEqual => write!(f, "=="),
119
120 Self::Comma => write!(f, ","),
121 Self::EOF => write!(f, " "),
122 }
123 }
124}
125
126impl Display for Keyword {
127 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
128 match *self {
129 Self::And => write!(f, "and"),
130 Self::Class => write!(f, "class"),
131 Self::Else => write!(f, "else"),
132 Self::False => write!(f, "false"),
133 Self::Fun => write!(f, "fun"),
134 Self::For => write!(f, "for"),
135 Self::If => write!(f, "if"),
136 Self::Nil => write!(f, "nil"),
137 Self::Or => write!(f, "or"),
138 Self::Print => write!(f, "print"),
139 Self::Roar => write!(f, "roar"),
140 Self::Return => write!(f, "return"),
141 Self::Super => write!(f, "super"),
142 Self::This => write!(f, "this"),
143 Self::True => write!(f, "true"),
144 Self::Var => write!(f, "var"),
145 Self::While => write!(f, "while"),
146 }
147 }
148}
149
/// The reserved words recognised by the tokenizer.
///
/// The enum has no payloads, so besides being `Copy` it is also `Eq` and
/// `Hash`, which lets keywords be used as set members or map keys.
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
pub enum Keyword {
    /// `and`
    And,
    /// `class`
    Class,
    /// `else`
    Else,
    /// `false`
    False,
    /// `fun`
    Fun,
    /// `for`
    For,
    /// `if`
    If,
    /// `nil`
    Nil,
    /// `or`
    Or,
    /// `print`
    Print,
    /// `roar`
    Roar,
    /// `return`
    Return,
    /// `super`
    Super,
    /// `this`
    This,
    /// `true`
    True,
    /// `var`
    Var,
    /// `while`
    While,
}
188
189impl TryFrom<&str> for Keyword {
190 type Error = ();
191 fn try_from(value: &str) -> Result<Self, Self::Error> {
192 match value {
193 "and" => Ok(Self::And),
194 "class" => Ok(Self::Class),
195 "else" => Ok(Self::Else),
196 "false" => Ok(Self::False),
197 "fun" => Ok(Self::Fun),
198 "for" => Ok(Self::For),
199 "if" => Ok(Self::If),
200 "nil" => Ok(Self::Nil),
201 "or" => Ok(Self::Or),
202 "print" => Ok(Self::Print),
203 "roar" => Ok(Self::Roar),
204 "return" => Ok(Self::Return),
205 "super" => Ok(Self::Super),
206 "this" => Ok(Self::This),
207 "true" => Ok(Self::True),
208 "var" => Ok(Self::Var),
209 "while" => Ok(Self::While),
210 _ => Err(()),
211 }
212 }
213}
214
/// Error describing a character that could not be tokenized and the position
/// reported for it.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct TokenError {
    /// The character the error is reported for.
    pub token: char,
    /// Line reported for the error.
    pub line: usize,
    /// Column reported for the error.
    pub col: usize,
}
225
226impl TokenError {
227 pub fn new(token: char, line: usize, col: usize) -> Self {
229 Self { token, line, col }
230 }
231}
232
233impl Display for TokenError {
234 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
235 write!(
236 f,
237 "Invalid token {} at line {}, col {}",
238 self.token, self.line, self.col
239 )
240 }
241}
242
243impl Error for TokenError {}
244
245impl<STR> Tokenizable for STR
246where
247 STR: AsRef<str>,
248{
249 fn tokenize(&self) -> Result<Vec<Token<'_>>, TokenError> {
250 let mut peek = self.as_ref().chars().enumerate().peekable();
251 let mut tokens = vec![];
252 let mut line = 1;
253 let mut col = 0;
254
255 while let Some((idx, tok)) = peek.next() {
256 let mut len = 1;
257 col += 1;
258 let next_tag = match tok {
259 '[' => TokenTag::OpenBracket,
260 ']' => TokenTag::CloseBracket,
261
262 '{' => TokenTag::OpenBrace,
263 '}' => TokenTag::CloseBrace,
264
265 '(' => TokenTag::OpenParen,
266 ')' => TokenTag::CloseParen,
267
268 ';' => TokenTag::Semicolon,
269 '.' => TokenTag::Dot,
270
271 '=' => match peek.peek() {
272 Some((_, '=')) => {
273 peek.next();
274 col += 1;
275 len += 1;
276
277 TokenTag::EqualEqual
278 }
279 _ => TokenTag::Equal,
280 },
281
282 '!' => match peek.peek() {
283 Some((_, '=')) => {
284 peek.next();
285 col += 1;
286 len += 1;
287
288 TokenTag::BangEqual
289 }
290 _ => TokenTag::Bang,
291 },
292
293 '<' => match peek.peek() {
294 Some((_, '=')) => {
295 peek.next();
296 col += 1;
297 len += 1;
298
299 TokenTag::LessEqual
300 }
301 _ => TokenTag::Less,
302 },
303
304 '>' => match peek.peek() {
305 Some((_, '=')) => {
306 peek.next();
307 col += 1;
308 len += 1;
309
310 TokenTag::GreaterEqual
311 }
312 _ => TokenTag::Greater,
313 },
314
315 '+' => match peek.peek() {
316 Some((_, '+')) => {
317 peek.next();
318 TokenTag::PlusPlus
319 }
320 Some((_, '=')) => {
321 peek.next();
322 TokenTag::PlusEq
323 }
324 _ => TokenTag::Plus,
325 },
326 '-' => TokenTag::Minus,
327 '*' => TokenTag::Star,
328 '/' => match peek.peek() {
329 Some((_, '/')) => {
330 for (_, ch) in peek.by_ref() {
331 if ch == '\n' {
332 break;
333 }
334 }
335 continue;
336 }
337 Some((_, '*')) => {
338 peek.next();
339 while let Some((_, ch)) = peek.next() {
340 if ch == '*' {
341 if let Some((_, '/')) = peek.peek() {
342 peek.next();
343 break;
344 }
345 }
346 }
347
348 continue;
349 }
350 _ => TokenTag::Slash,
351 },
352
353 '\n' => {
354 col = 0;
355 line += 1;
356 continue;
357 }
358
359 ',' => TokenTag::Comma,
360
361 ws if ws.is_whitespace() => continue,
362
363 num if num.is_numeric() => {
364 let mut curr = String::new();
365 curr.push(num);
366
367 let mut dot = false;
368 while let Some((_, next)) = peek.peek() {
369 if next.is_numeric() {
370 col += 1;
371 len += 1;
372 curr.push(peek.next().unwrap().1);
373 } else if *next == '.' && !dot {
374 col += 1;
375 len += 1;
376 curr.push(peek.next().unwrap().1);
377 dot = true;
378 } else {
379 break;
380 }
381 }
382
383 TokenTag::Number(curr.parse().unwrap())
386 }
387
388 '"' => {
389 let mut idx2 = idx;
390 let mut ended = false;
391
392 for (_, c) in peek.by_ref() {
393 if c != '"' {
394 idx2 += 1;
395 col += 1;
396 len += 1;
397 } else {
398 ended = true;
399 break;
400 }
401 }
402
403 if ended {
404 TokenTag::String(&self.as_ref()[idx + 1..=idx2])
405 } else {
406 return Err(TokenError::new('"', line, col));
407 }
408 }
409
410 ch if ch.is_alphanumeric() || ch == '_' => {
411 let mut end = idx;
412
413 while let Some((idx2, next)) = peek.peek() {
414 if !(next.is_alphanumeric() || *next == '_') {
415 break;
416 }
417
418 end = *idx2;
419 col += 1;
420 len += 1;
421 peek.next();
422 }
423
424 let word = &self.as_ref()[idx..=end];
425 if let Ok(keyword) = Keyword::try_from(word) {
426 TokenTag::Keyword(keyword)
427 } else {
428 TokenTag::Identifier(word)
429 }
430 }
431
432 bad => return Err(TokenError::new(bad, line, col)),
433 };
434
435 let next = Token {
436 line,
437 col,
438 len,
439 tag: next_tag,
440 };
441 tokens.push(next);
442 }
443
444 tokens.push(Token {
445 line,
446 col,
447 len: 0,
448 tag: TokenTag::EOF,
449 });
450
451 Ok(tokens)
452 }
453}
454
#[cfg(test)]
mod tests {
    // Everything under test lives in the parent module; importing through
    // `super` keeps the tests working if the file is moved or renamed.
    use super::{Keyword, Token, TokenError, TokenTag, Tokenizable};

    /// Drops position information, keeping only each token's tag.
    fn tags<'a>(toks: Vec<Token<'a>>) -> Vec<TokenTag<'a>> {
        toks.into_iter().map(|token| token.tag).collect()
    }

    #[test]
    // `3.14` is test input for the number scanner, not an approximation of PI.
    #[allow(clippy::approx_constant)]
    fn operators_parsing() {
        let tokens = "var pi = 3.14".tokenize().expect("Tokenize");
        assert_eq!(
            tags(tokens),
            [
                TokenTag::Keyword(Keyword::Var),
                TokenTag::Identifier("pi"),
                TokenTag::Equal,
                TokenTag::Number(3.14),
                TokenTag::EOF
            ]
        );
    }

    #[test]
    fn plus_eq_operator() {
        let tokens = "i+=".tokenize().expect("Tokenize");
        assert_eq!(
            tags(tokens),
            [TokenTag::Identifier("i"), TokenTag::PlusEq, TokenTag::EOF]
        );
    }

    #[test]
    fn inc_operator() {
        let tokens = "i++".tokenize().expect("Tokenize");
        assert_eq!(
            tags(tokens),
            [TokenTag::Identifier("i"), TokenTag::PlusPlus, TokenTag::EOF]
        );
    }

    #[test]
    fn comparison_operators() {
        let tokens = "a < b <= c > d >= e != !f".tokenize().expect("Tokenize");
        assert_eq!(
            tags(tokens),
            [
                TokenTag::Identifier("a"),
                TokenTag::Less,
                TokenTag::Identifier("b"),
                TokenTag::LessEqual,
                TokenTag::Identifier("c"),
                TokenTag::Greater,
                TokenTag::Identifier("d"),
                TokenTag::GreaterEqual,
                TokenTag::Identifier("e"),
                TokenTag::BangEqual,
                TokenTag::Bang,
                TokenTag::Identifier("f"),
                TokenTag::EOF
            ]
        );
    }

    #[test]
    fn comma_and_dot() {
        let tokens = "f(a, b).c".tokenize().expect("Tokenize");
        assert_eq!(
            tags(tokens),
            [
                TokenTag::Identifier("f"),
                TokenTag::OpenParen,
                TokenTag::Identifier("a"),
                TokenTag::Comma,
                TokenTag::Identifier("b"),
                TokenTag::CloseParen,
                TokenTag::Dot,
                TokenTag::Identifier("c"),
                TokenTag::EOF
            ]
        );
    }

    #[test]
    fn keywords_round_trip_through_display() {
        let all = [
            Keyword::And,
            Keyword::Class,
            Keyword::Else,
            Keyword::False,
            Keyword::Fun,
            Keyword::For,
            Keyword::If,
            Keyword::Nil,
            Keyword::Or,
            Keyword::Print,
            Keyword::Roar,
            Keyword::Return,
            Keyword::Super,
            Keyword::This,
            Keyword::True,
            Keyword::Var,
            Keyword::While,
        ];

        for &keyword in &all {
            let spelling = keyword.to_string();
            assert_eq!(Keyword::try_from(spelling.as_str()), Ok(keyword));
        }
    }

    #[test]
    fn invalid_characters() {
        let tokens = "foo bar baz ?".tokenize();

        assert_eq!(tokens, Err(TokenError::new('?', 1, 13)));
    }

    #[test]
    fn parentheses_and_braces() {
        let tokens = "(x + y) { z; }".tokenize().expect("TokenizeTag");
        assert_eq!(
            tags(tokens),
            [
                TokenTag::OpenParen,
                TokenTag::Identifier("x"),
                TokenTag::Plus,
                TokenTag::Identifier("y"),
                TokenTag::CloseParen,
                TokenTag::OpenBrace,
                TokenTag::Identifier("z"),
                TokenTag::Semicolon,
                TokenTag::CloseBrace,
                TokenTag::EOF
            ]
        );
    }

    #[test]
    fn multi_line_comment() {
        let tokens = r#"
 /*
 This is a multi line comment
 * all of this should be ignored *
 */
 x = 10
 "#
        .tokenize()
        .expect("Tokenize");

        let expected = [
            TokenTag::Identifier("x"),
            TokenTag::Equal,
            TokenTag::Number(10.0),
            TokenTag::EOF,
        ];

        assert_eq!(tags(tokens), expected);
    }

    #[test]
    fn brackets() {
        let tokens = "foo[1]".tokenize().expect("Tokenize");

        let expected = [
            TokenTag::Identifier("foo"),
            TokenTag::OpenBracket,
            TokenTag::Number(1.0),
            TokenTag::CloseBracket,
            TokenTag::EOF,
        ];

        assert_eq!(tags(tokens), expected);
    }

    #[test]
    fn single_line_comment() {
        let tokens = r#"
// This is a comment
 x = 10
 "#
        .tokenize()
        .expect("Tokenize");

        let expected = [
            TokenTag::Identifier("x"),
            TokenTag::Equal,
            TokenTag::Number(10.0),
            TokenTag::EOF,
        ];

        assert_eq!(tags(tokens), expected);
    }

    #[test]
    fn invalid_tokens() {
        let tokens = "x = @".tokenize();
        assert_eq!(tokens, Err(TokenError::new('@', 1, 5)));

        let tokens = "x = #y".tokenize();
        assert_eq!(tokens, Err(TokenError::new('#', 1, 5)));
    }

    #[test]
    fn empty_input() {
        let tokens = "".tokenize().expect("Tokenize");
        assert_eq!(tags(tokens), [TokenTag::EOF]);
    }

    #[test]
    fn keywords_as_identifiers() {
        let tokens = "var varx = forx".tokenize().expect("Tokenize");
        assert_eq!(
            tags(tokens),
            [
                TokenTag::Keyword(Keyword::Var),
                TokenTag::Identifier("varx"),
                TokenTag::Equal,
                TokenTag::Identifier("forx"),
                TokenTag::EOF
            ]
        );
    }

    #[test]
    fn long_identifiers() {
        let tokens = "very_long_identifier_name = 123"
            .tokenize()
            .expect("Tokenize");
        assert_eq!(
            tags(tokens),
            [
                TokenTag::Identifier("very_long_identifier_name"),
                TokenTag::Equal,
                TokenTag::Number(123.0),
                TokenTag::EOF
            ]
        );
    }

    #[test]
    fn bad_string() {
        let tokens = r#""hello!"#.tokenize();
        assert!(tokens.is_err());
    }

    #[test]
    fn string() {
        let tokens = r#""hello!""#.tokenize().expect("TokenizeTag");
        assert_eq!(tags(tokens), [TokenTag::String("hello!"), TokenTag::EOF]);
    }

    #[test]
    fn whitespace_handling() {
        let tokens = " var x = 123 ".tokenize().expect("Tokenize");
        assert_eq!(
            tags(tokens),
            [
                TokenTag::Keyword(Keyword::Var),
                TokenTag::Identifier("x"),
                TokenTag::Equal,
                TokenTag::Number(123.0),
                TokenTag::EOF
            ]
        );
    }

    #[test]
    fn numeric_literals() {
        let tokens = "123 45.67 0.123 123.".tokenize().expect("TokenizeTag");
        assert_eq!(
            tags(tokens),
            [
                TokenTag::Number(123.0),
                TokenTag::Number(45.67),
                TokenTag::Number(0.123),
                TokenTag::Number(123.0),
                TokenTag::EOF
            ]
        );

        let tokens = "123.45.67".tokenize().expect("Tokenize");
        assert_eq!(
            tags(tokens),
            [
                TokenTag::Number(123.45),
                TokenTag::Dot,
                TokenTag::Number(67.0),
                TokenTag::EOF
            ]
        );
    }

    #[test]
    fn identifiers_and_keywords() {
        let tokens = "var x = if y".tokenize().expect("Tokenize");
        assert_eq!(
            tags(tokens),
            [
                TokenTag::Keyword(Keyword::Var),
                TokenTag::Identifier("x"),
                TokenTag::Equal,
                TokenTag::Keyword(Keyword::If),
                TokenTag::Identifier("y"),
                TokenTag::EOF
            ]
        );

        let tokens = "for var ifelse".tokenize().expect("Tokenize");
        assert_eq!(
            tags(tokens),
            [
                TokenTag::Keyword(Keyword::For),
                TokenTag::Keyword(Keyword::Var),
                TokenTag::Identifier("ifelse"),
                TokenTag::EOF
            ]
        );
    }

    #[test]
    fn arithmetic_expressions() {
        let tokens = "x = (a + b) * c - d / e".tokenize().expect("Tokenize");
        assert_eq!(
            tags(tokens),
            [
                TokenTag::Identifier("x"),
                TokenTag::Equal,
                TokenTag::OpenParen,
                TokenTag::Identifier("a"),
                TokenTag::Plus,
                TokenTag::Identifier("b"),
                TokenTag::CloseParen,
                TokenTag::Star,
                TokenTag::Identifier("c"),
                TokenTag::Minus,
                TokenTag::Identifier("d"),
                TokenTag::Slash,
                TokenTag::Identifier("e"),
                TokenTag::EOF
            ]
        );
    }

    #[test]
    fn multiple_lines_error() {
        // The source lines start at column 1 so the reported column is exact.
        let input = r#"var x = 1;
var y = 2;
x = x + $;
 "#;
        let tokens = input.tokenize();
        assert_eq!(tokens, Err(TokenError::new('$', 3, 9)));
    }

    #[test]
    fn multiple_lines() {
        let input = r#"
 var x = 1;
 var y = 2;
 x = x + y;
 "#;
        let tokens = input.tokenize().expect("Tokenize");
        assert_eq!(
            tags(tokens),
            [
                TokenTag::Keyword(Keyword::Var),
                TokenTag::Identifier("x"),
                TokenTag::Equal,
                TokenTag::Number(1.0),
                TokenTag::Semicolon,
                TokenTag::Keyword(Keyword::Var),
                TokenTag::Identifier("y"),
                TokenTag::Equal,
                TokenTag::Number(2.0),
                TokenTag::Semicolon,
                TokenTag::Identifier("x"),
                TokenTag::Equal,
                TokenTag::Identifier("x"),
                TokenTag::Plus,
                TokenTag::Identifier("y"),
                TokenTag::Semicolon,
                TokenTag::EOF
            ]
        );
    }

    #[test]
    fn edge_cases_for_identifiers() {
        let tokens = "_underscore".tokenize().expect("Tokenize");
        assert_eq!(
            tags(tokens),
            [TokenTag::Identifier("_underscore"), TokenTag::EOF]
        );

        let tokens = "var1".tokenize().expect("Tokenize");
        assert_eq!(tags(tokens), [TokenTag::Identifier("var1"), TokenTag::EOF]);

        let tokens = "var_1".tokenize().expect("Tokenize");
        assert_eq!(tags(tokens), [TokenTag::Identifier("var_1"), TokenTag::EOF]);
    }

    #[test]
    fn edge_cases_for_numbers() {
        let tokens = "0123".tokenize().expect("Tokenize");
        assert_eq!(tags(tokens), [TokenTag::Number(123.0), TokenTag::EOF]);

        let tokens = "123.".tokenize().expect("Tokenize");
        assert_eq!(tags(tokens), [TokenTag::Number(123.0), TokenTag::EOF]);

        let tokens = "123..456".tokenize().expect("Tokenize");
        assert_eq!(
            tags(tokens),
            [
                TokenTag::Number(123.0),
                TokenTag::Dot,
                TokenTag::Number(456.0),
                TokenTag::EOF
            ]
        );
    }

    #[test]
    fn edge_cases_for_operators() {
        let tokens = "x = y + -z".tokenize().expect("Tokenize");
        assert_eq!(
            tags(tokens),
            [
                TokenTag::Identifier("x"),
                TokenTag::Equal,
                TokenTag::Identifier("y"),
                TokenTag::Plus,
                TokenTag::Minus,
                TokenTag::Identifier("z"),
                TokenTag::EOF
            ]
        );

        let tokens = "x = y * / z".tokenize().expect("Tokenize");
        assert_eq!(
            tags(tokens),
            [
                TokenTag::Identifier("x"),
                TokenTag::Equal,
                TokenTag::Identifier("y"),
                TokenTag::Star,
                TokenTag::Slash,
                TokenTag::Identifier("z"),
                TokenTag::EOF
            ]
        );

        let tokens = "x = y == z".tokenize().expect("TokenizeTag");
        assert_eq!(
            tags(tokens),
            [
                TokenTag::Identifier("x"),
                TokenTag::Equal,
                TokenTag::Identifier("y"),
                TokenTag::EqualEqual,
                TokenTag::Identifier("z"),
                TokenTag::EOF
            ]
        );
    }

    #[test]
    fn edge_cases_for_whitespace() {
        let tokens = " var x = 123 ".tokenize().expect("TokenizeTag");
        assert_eq!(
            tags(tokens),
            [
                TokenTag::Keyword(Keyword::Var),
                TokenTag::Identifier("x"),
                TokenTag::Equal,
                TokenTag::Number(123.0),
                TokenTag::EOF
            ]
        );

        let tokens = "\tvar\tx\t=\t123\t".tokenize().expect("Tokenize");
        assert_eq!(
            tags(tokens),
            [
                TokenTag::Keyword(Keyword::Var),
                TokenTag::Identifier("x"),
                TokenTag::Equal,
                TokenTag::Number(123.0),
                TokenTag::EOF
            ]
        );

        let tokens = "\nvar\nx\n=\n123\n".tokenize().expect("Tokenize");
        assert_eq!(
            tags(tokens),
            [
                TokenTag::Keyword(Keyword::Var),
                TokenTag::Identifier("x"),
                TokenTag::Equal,
                TokenTag::Number(123.0),
                TokenTag::EOF
            ]
        );
    }
}