1#[cfg(feature = "no_std")]
2use alloc::{format, string::String, vec::Vec};
3
4use crate::error::BopError;
5
/// One segment of a string literal that contains interpolation.
///
/// The lexer splits a literal such as `"hi {name}!"` into an ordered list of
/// these segments.
#[derive(Debug, Clone, PartialEq)]
pub enum StringPart {
    /// Verbatim text, with escape sequences already decoded.
    Literal(String),
    /// The identifier written between `{` and `}`.
    Variable(String),
}
11
12#[derive(Debug, Clone, PartialEq)]
13pub enum Token {
14 Int(i64),
20 Number(f64),
21 Str(String),
22 StringInterp(Vec<StringPart>),
23 True,
24 False,
25 None,
26
27 Ident(String),
29 Let,
30 Const,
31 Fn,
32 Return,
33 If,
34 Else,
35 While,
36 For,
37 In,
38 Repeat,
39 Break,
40 Continue,
41 Use,
42 As,
43 Struct,
44 Enum,
45 Match,
46 Try,
47
48 Plus,
50 Minus,
51 Star,
52 Slash,
53 Percent,
54 EqEq,
55 BangEq,
56 Lt,
57 Gt,
58 LtEq,
59 GtEq,
60 AmpAmp,
61 PipePipe,
62 Bang,
63 Eq,
64 PlusEq,
65 MinusEq,
66 StarEq,
67 SlashEq,
68 PercentEq,
69
70 LParen,
72 RParen,
73 LBracket,
74 RBracket,
75 LBrace,
76 RBrace,
77 Comma,
78 Colon,
79 ColonColon,
80 Dot,
81 DotDot,
82 Semicolon,
83 FatArrow,
84 Pipe,
85
86 Newline,
88
89 Eof,
90}
91
92#[derive(Debug, Clone)]
93pub struct SpannedToken {
94 pub token: Token,
95 pub line: u32,
96 pub column: u32,
100}
101
102pub fn lex(source: &str) -> Result<Vec<SpannedToken>, BopError> {
103 let mut lexer = Lexer::new(source);
104 let raw = lexer.lex_all()?;
105 Ok(insert_semicolons(raw))
106}
107
108fn triggers_semicolon(token: &Token) -> bool {
109 matches!(
110 token,
111 Token::Ident(_)
112 | Token::Int(_)
113 | Token::Number(_)
114 | Token::Str(_)
115 | Token::StringInterp(_)
116 | Token::True
117 | Token::False
118 | Token::None
119 | Token::Break
120 | Token::Continue
121 | Token::Return
122 | Token::RParen
123 | Token::RBracket
124 | Token::RBrace
125 )
126}
127
128fn insert_semicolons(raw: Vec<SpannedToken>) -> Vec<SpannedToken> {
129 let mut result: Vec<SpannedToken> = Vec::new();
130 for token in raw {
131 if token.token == Token::Newline {
132 if let Some(last) = result.last() {
133 if triggers_semicolon(&last.token) {
134 result.push(SpannedToken {
135 token: Token::Semicolon,
136 line: token.line,
137 column: token.column,
138 });
139 }
140 }
141 } else {
142 result.push(token);
143 }
144 }
145 result
146}
147
/// Cursor over the source text, plus the position bookkeeping used for token
/// spans and error reports.
struct Lexer {
    /// The whole source, one entry per `char`.
    chars: Vec<char>,
    /// Index into `chars` of the next unread character.
    pos: usize,
    /// Line of the next unread character, counted from 1.
    line: u32,
    /// Column of the next unread character, counted from 1.
    column: u32,
}
156
157impl Lexer {
158 fn new(source: &str) -> Self {
159 Self {
160 chars: source.chars().collect(),
161 pos: 0,
162 line: 1,
163 column: 1,
164 }
165 }
166
167 fn peek(&self) -> Option<char> {
168 self.chars.get(self.pos).copied()
169 }
170
171 fn peek_next(&self) -> Option<char> {
172 self.chars.get(self.pos + 1).copied()
173 }
174
175 fn advance(&mut self) -> Option<char> {
176 let ch = self.chars.get(self.pos).copied()?;
177 self.pos += 1;
178 if ch == '\n' {
179 self.column = 1;
185 } else {
186 self.column += 1;
187 }
188 Some(ch)
189 }
190
191 fn error(&self, message: impl Into<String>) -> BopError {
192 BopError {
193 line: Some(self.line),
194 column: Some(self.column),
195 message: message.into(),
196 friendly_hint: None,
197 is_fatal: false,
198 is_try_return: false,
199 }
200 }
201
202 fn error_with_hint(
203 &self,
204 message: impl Into<String>,
205 hint: impl Into<String>,
206 ) -> BopError {
207 BopError {
208 line: Some(self.line),
209 column: Some(self.column),
210 message: message.into(),
211 friendly_hint: Some(hint.into()),
212 is_fatal: false,
213 is_try_return: false,
214 }
215 }
216
217 fn lex_all(&mut self) -> Result<Vec<SpannedToken>, BopError> {
218 let mut tokens = Vec::new();
219
220 loop {
221 while let Some(ch) = self.peek() {
223 if ch == ' ' || ch == '\t' || ch == '\r' {
224 self.advance();
225 } else {
226 break;
227 }
228 }
229
230 let Some(ch) = self.peek() else {
231 tokens.push(SpannedToken {
232 token: Token::Eof,
233 line: self.line,
234 column: self.column,
235 });
236 break;
237 };
238
239 let line = self.line;
243 let column = self.column;
244
245 match ch {
246 '\n' => {
247 self.advance();
248 self.line += 1;
249 tokens.push(SpannedToken {
250 token: Token::Newline,
251 line,
252 column,
253 });
254 }
255
256 '"' => {
257 tokens.push(SpannedToken {
258 token: self.lex_string()?,
259 line,
260 column,
261 });
262 }
263
264 '0'..='9' => {
265 tokens.push(SpannedToken {
266 token: self.lex_number()?,
267 line,
268 column,
269 });
270 }
271
272 'a'..='z' | 'A'..='Z' | '_' => {
273 tokens.push(SpannedToken {
274 token: self.lex_ident_or_keyword(),
275 line,
276 column,
277 });
278 }
279
280 '+' => {
281 self.advance();
282 if self.peek() == Some('=') {
283 self.advance();
284 tokens.push(SpannedToken {
285 token: Token::PlusEq,
286 line,
287 column,
288 });
289 } else {
290 tokens.push(SpannedToken {
291 token: Token::Plus,
292 line,
293 column,
294 });
295 }
296 }
297 '-' => {
298 self.advance();
299 if self.peek() == Some('=') {
300 self.advance();
301 tokens.push(SpannedToken {
302 token: Token::MinusEq,
303 line,
304 column,
305 });
306 } else {
307 tokens.push(SpannedToken {
308 token: Token::Minus,
309 line,
310 column,
311 });
312 }
313 }
314 '*' => {
315 self.advance();
316 if self.peek() == Some('=') {
317 self.advance();
318 tokens.push(SpannedToken {
319 token: Token::StarEq,
320 line,
321 column,
322 });
323 } else {
324 tokens.push(SpannedToken {
325 token: Token::Star,
326 line,
327 column,
328 });
329 }
330 }
331 '/' => {
332 self.advance();
333 if self.peek() == Some('=') {
334 self.advance();
335 tokens.push(SpannedToken {
336 token: Token::SlashEq,
337 line,
338 column,
339 });
340 } else if self.peek() == Some('/') {
341 self.advance();
348 while let Some(c) = self.peek() {
349 if c == '\n' {
350 break;
351 }
352 self.advance();
353 }
354 } else {
355 tokens.push(SpannedToken {
356 token: Token::Slash,
357 line,
358 column,
359 });
360 }
361 }
362 '%' => {
363 self.advance();
364 if self.peek() == Some('=') {
365 self.advance();
366 tokens.push(SpannedToken {
367 token: Token::PercentEq,
368 line,
369 column,
370 });
371 } else {
372 tokens.push(SpannedToken {
373 token: Token::Percent,
374 line,
375 column,
376 });
377 }
378 }
379
380 '=' => {
381 self.advance();
382 if self.peek() == Some('=') {
383 self.advance();
384 tokens.push(SpannedToken {
385 token: Token::EqEq,
386 line,
387 column,
388 });
389 } else if self.peek() == Some('>') {
390 self.advance();
391 tokens.push(SpannedToken {
392 token: Token::FatArrow,
393 line,
394 column,
395 });
396 } else {
397 tokens.push(SpannedToken {
398 token: Token::Eq,
399 line,
400 column,
401 });
402 }
403 }
404 '!' => {
405 self.advance();
406 if self.peek() == Some('=') {
407 self.advance();
408 tokens.push(SpannedToken {
409 token: Token::BangEq,
410 line,
411 column,
412 });
413 } else {
414 tokens.push(SpannedToken {
415 token: Token::Bang,
416 line,
417 column,
418 });
419 }
420 }
421 '<' => {
422 self.advance();
423 if self.peek() == Some('=') {
424 self.advance();
425 tokens.push(SpannedToken {
426 token: Token::LtEq,
427 line,
428 column,
429 });
430 } else {
431 tokens.push(SpannedToken {
432 token: Token::Lt,
433 line,
434 column,
435 });
436 }
437 }
438 '>' => {
439 self.advance();
440 if self.peek() == Some('=') {
441 self.advance();
442 tokens.push(SpannedToken {
443 token: Token::GtEq,
444 line,
445 column,
446 });
447 } else {
448 tokens.push(SpannedToken {
449 token: Token::Gt,
450 line,
451 column,
452 });
453 }
454 }
455
456 '&' => {
457 self.advance();
458 if self.peek() == Some('&') {
459 self.advance();
460 tokens.push(SpannedToken {
461 token: Token::AmpAmp,
462 line,
463 column,
464 });
465 } else {
466 return Err(
467 self.error_with_hint("Unexpected `&`", "Did you mean `&&` (and)?")
468 );
469 }
470 }
471 '|' => {
472 self.advance();
473 if self.peek() == Some('|') {
474 self.advance();
475 tokens.push(SpannedToken {
476 token: Token::PipePipe,
477 line,
478 column,
479 });
480 } else {
481 tokens.push(SpannedToken {
486 token: Token::Pipe,
487 line,
488 column,
489 });
490 }
491 }
492
493 '(' => {
494 self.advance();
495 tokens.push(SpannedToken {
496 token: Token::LParen,
497 line,
498 column,
499 });
500 }
501 ')' => {
502 self.advance();
503 tokens.push(SpannedToken {
504 token: Token::RParen,
505 line,
506 column,
507 });
508 }
509 '[' => {
510 self.advance();
511 tokens.push(SpannedToken {
512 token: Token::LBracket,
513 line,
514 column,
515 });
516 }
517 ']' => {
518 self.advance();
519 tokens.push(SpannedToken {
520 token: Token::RBracket,
521 line,
522 column,
523 });
524 }
525 '{' => {
526 self.advance();
527 tokens.push(SpannedToken {
528 token: Token::LBrace,
529 line,
530 column,
531 });
532 }
533 '}' => {
534 self.advance();
535 tokens.push(SpannedToken {
536 token: Token::RBrace,
537 line,
538 column,
539 });
540 }
541 ',' => {
542 self.advance();
543 tokens.push(SpannedToken {
544 token: Token::Comma,
545 line,
546 column,
547 });
548 }
549 ':' => {
550 self.advance();
551 if self.peek() == Some(':') {
552 self.advance();
553 tokens.push(SpannedToken {
554 token: Token::ColonColon,
555 line,
556 column,
557 });
558 } else {
559 tokens.push(SpannedToken {
560 token: Token::Colon,
561 line,
562 column,
563 });
564 }
565 }
566 '.' => {
567 self.advance();
568 if self.peek() == Some('.') {
569 self.advance();
570 tokens.push(SpannedToken {
571 token: Token::DotDot,
572 line,
573 column,
574 });
575 } else {
576 tokens.push(SpannedToken {
577 token: Token::Dot,
578 line,
579 column,
580 });
581 }
582 }
583 ';' => {
584 self.advance();
585 tokens.push(SpannedToken {
586 token: Token::Semicolon,
587 line,
588 column,
589 });
590 }
591
592 _ => {
593 return Err(self.error(format!("I don't understand the character `{}`", ch)));
594 }
595 }
596 }
597
598 Ok(tokens)
599 }
600
601 fn lex_number(&mut self) -> Result<Token, BopError> {
602 let mut s = String::new();
603 while let Some(ch) = self.peek() {
604 if ch.is_ascii_digit() {
605 s.push(ch);
606 self.advance();
607 } else {
608 break;
609 }
610 }
611 let is_float = if self.peek() == Some('.')
616 && self.peek_next().is_some_and(|c| c.is_ascii_digit())
617 {
618 s.push('.');
619 self.advance();
620 while let Some(ch) = self.peek() {
621 if ch.is_ascii_digit() {
622 s.push(ch);
623 self.advance();
624 } else {
625 break;
626 }
627 }
628 true
629 } else {
630 false
631 };
632 if is_float {
633 let n: f64 = s
634 .parse()
635 .map_err(|_| self.error(format!("Invalid number: {}", s)))?;
636 Ok(Token::Number(n))
637 } else {
638 match s.parse::<i64>() {
643 Ok(n) => Ok(Token::Int(n)),
644 Err(_) => Err(self.error(format!(
645 "Integer literal out of range for i64: {}",
646 s
647 ))),
648 }
649 }
650 }
651
652 fn lex_ident_or_keyword(&mut self) -> Token {
653 let mut s = String::new();
654 while let Some(ch) = self.peek() {
655 if ch.is_ascii_alphanumeric() || ch == '_' {
656 s.push(ch);
657 self.advance();
658 } else {
659 break;
660 }
661 }
662 match s.as_str() {
663 "let" => Token::Let,
664 "const" => Token::Const,
665 "fn" => Token::Fn,
666 "return" => Token::Return,
667 "if" => Token::If,
668 "else" => Token::Else,
669 "while" => Token::While,
670 "for" => Token::For,
671 "in" => Token::In,
672 "repeat" => Token::Repeat,
673 "break" => Token::Break,
674 "continue" => Token::Continue,
675 "use" => Token::Use,
676 "as" => Token::As,
677 "struct" => Token::Struct,
678 "enum" => Token::Enum,
679 "match" => Token::Match,
680 "try" => Token::Try,
681 "true" => Token::True,
682 "false" => Token::False,
683 "none" => Token::None,
684 _ => Token::Ident(s),
685 }
686 }
687
688 fn lex_string(&mut self) -> Result<Token, BopError> {
689 self.advance(); let mut parts: Vec<StringPart> = Vec::new();
691 let mut current = String::new();
692
693 loop {
694 match self.peek() {
695 None | Some('\n') => {
696 return Err(self.error_with_hint(
697 "This string is missing its closing `\"`",
698 "Every string needs to start and end with quotes.",
699 ));
700 }
701 Some('"') => {
702 self.advance();
703 break;
704 }
705 Some('\\') => {
706 self.advance();
707 match self.peek() {
708 Some('"') => {
709 current.push('"');
710 self.advance();
711 }
712 Some('\\') => {
713 current.push('\\');
714 self.advance();
715 }
716 Some('n') => {
717 current.push('\n');
718 self.advance();
719 }
720 Some('t') => {
721 current.push('\t');
722 self.advance();
723 }
724 Some('r') => {
725 current.push('\r');
730 self.advance();
731 }
732 Some('{') => {
733 current.push('{');
734 self.advance();
735 }
736 Some('}') => {
737 current.push('}');
738 self.advance();
739 }
740 Some(c) => {
741 return Err(self.error(format!("Unknown escape sequence `\\{}`", c)));
742 }
743 None => {
744 return Err(self.error("Unexpected end of string after `\\`"));
745 }
746 }
747 }
748 Some('{')
749 if self
750 .peek_next()
751 .is_some_and(|c| c.is_ascii_alphabetic() || c == '_') =>
752 {
753 self.advance(); let mut var = String::new();
756 while let Some(ch) = self.peek() {
757 if ch.is_ascii_alphanumeric() || ch == '_' {
758 var.push(ch);
759 self.advance();
760 } else {
761 break;
762 }
763 }
764 if self.peek() != Some('}') {
765 return Err(self.error_with_hint(
766 format!("Missing `}}` after `{{{}`", var),
767 "String interpolation needs a closing `}`, like: \"{name}\"",
768 ));
769 }
770 self.advance(); if !current.is_empty() {
772 parts.push(StringPart::Literal(core::mem::take(&mut current)));
773 }
774 parts.push(StringPart::Variable(var));
775 }
776 Some(ch) => {
777 current.push(ch);
778 self.advance();
779 }
780 }
781 }
782
783 if parts.is_empty() {
784 Ok(Token::Str(current))
786 } else {
787 if !current.is_empty() {
788 parts.push(StringPart::Literal(current));
789 }
790 Ok(Token::StringInterp(parts))
791 }
792 }
793}
794
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `code`, which must be valid, and returns the token kinds with
    /// the trailing `Eof` removed.
    fn toks(code: &str) -> Vec<Token> {
        let mut kinds: Vec<Token> = lex(code)
            .unwrap()
            .into_iter()
            .map(|spanned| spanned.token)
            .collect();
        kinds.retain(|kind| !matches!(kind, Token::Eof));
        kinds
    }

    /// Lexes `code`, which must be invalid, and returns the error message.
    fn lex_err(code: &str) -> String {
        let failure = lex(code).unwrap_err();
        failure.message
    }

    /// Shorthand for an identifier token.
    fn ident(name: &str) -> Token {
        Token::Ident(name.into())
    }

    // ---- Numbers ----

    #[test]
    fn integer() {
        assert_eq!(toks("42"), [Token::Int(42)]);
    }

    #[test]
    fn float() {
        assert_eq!(toks("3.14"), [Token::Number(3.14)]);
    }

    #[test]
    fn leading_zero_float() {
        assert_eq!(toks("0.5"), [Token::Number(0.5)]);
    }

    // ---- Strings ----

    #[test]
    fn plain_string() {
        assert_eq!(toks(r#""hello""#), [Token::Str("hello".into())]);
    }

    #[test]
    fn escape_sequences() {
        let expected = Token::Str("a\nb\t\\\"c".into());
        assert_eq!(toks(r#""a\nb\t\\\"c""#), [expected]);
    }

    #[test]
    fn escape_sequence_cr() {
        let expected = Token::Str("a\rb".into());
        assert_eq!(toks(r#""a\rb""#), [expected]);
    }

    #[test]
    fn string_interpolation() {
        let expected = Token::StringInterp(vec![
            StringPart::Literal("hi ".into()),
            StringPart::Variable("name".into()),
            StringPart::Literal("!".into()),
        ]);
        assert_eq!(toks(r#""hi {name}!""#), [expected]);
    }

    #[test]
    fn string_interpolation_multiple_vars() {
        let expected = Token::StringInterp(vec![
            StringPart::Variable("x".into()),
            StringPart::Literal(",".into()),
            StringPart::Variable("y".into()),
        ]);
        assert_eq!(toks(r#""{x},{y}""#), [expected]);
    }

    #[test]
    fn unterminated_string() {
        let message = lex_err(r#""hello"#);
        assert!(message.contains("missing its closing"));
    }

    #[test]
    fn unknown_escape() {
        let message = lex_err(r#""hello\q""#);
        assert!(message.contains("Unknown escape"));
    }

    // ---- Keywords and identifiers ----

    #[test]
    fn keywords() {
        let source = "let fn return if else while for in repeat break continue true false none";
        assert_eq!(
            toks(source),
            [
                Token::Let,
                Token::Fn,
                Token::Return,
                Token::If,
                Token::Else,
                Token::While,
                Token::For,
                Token::In,
                Token::Repeat,
                Token::Break,
                Token::Continue,
                Token::True,
                Token::False,
                Token::None,
            ]
        );
    }

    #[test]
    fn identifiers() {
        assert_eq!(
            toks("foo bar_baz _x abc123"),
            [ident("foo"), ident("bar_baz"), ident("_x"), ident("abc123")]
        );
    }

    // ---- Operators ----

    #[test]
    fn single_char_ops() {
        assert_eq!(
            toks("+ - * / % = ! < > ( ) [ ] { } , : . ;"),
            [
                Token::Plus,
                Token::Minus,
                Token::Star,
                Token::Slash,
                Token::Percent,
                Token::Eq,
                Token::Bang,
                Token::Lt,
                Token::Gt,
                Token::LParen,
                Token::RParen,
                Token::LBracket,
                Token::RBracket,
                Token::LBrace,
                Token::RBrace,
                Token::Comma,
                Token::Colon,
                Token::Dot,
                Token::Semicolon,
            ]
        );
    }

    #[test]
    fn double_char_ops() {
        assert_eq!(
            toks("== != <= >= && || += -= *= /= %="),
            [
                Token::EqEq,
                Token::BangEq,
                Token::LtEq,
                Token::GtEq,
                Token::AmpAmp,
                Token::PipePipe,
                Token::PlusEq,
                Token::MinusEq,
                Token::StarEq,
                Token::SlashEq,
                Token::PercentEq,
            ]
        );
    }

    #[test]
    fn lone_ampersand_error() {
        let message = lex_err("&x");
        assert!(message.contains("Unexpected `&`"));
    }

    #[test]
    fn lone_pipe_lexes_as_or_pattern_separator() {
        assert_eq!(toks("|"), [Token::Pipe]);
    }

    // ---- Comments ----

    #[test]
    fn line_comment_skipped() {
        assert_eq!(
            toks("1 // comment\n2"),
            [Token::Int(1), Token::Semicolon, Token::Int(2)]
        );
    }

    #[test]
    fn comment_at_end() {
        assert_eq!(toks("x // done"), [ident("x")]);
    }

    #[test]
    fn hash_is_not_a_comment() {
        let message = lex_err("x # nope");
        assert!(message.contains("don't understand"));
    }

    // ---- Automatic semicolons ----

    #[test]
    fn auto_semi_after_ident() {
        assert_eq!(toks("x\ny"), [ident("x"), Token::Semicolon, ident("y")]);
    }

    #[test]
    fn auto_semi_after_number() {
        assert_eq!(
            toks("42\n10"),
            [Token::Int(42), Token::Semicolon, Token::Int(10)]
        );
    }

    #[test]
    fn auto_semi_after_rparen() {
        assert_eq!(
            toks("f()\ng()"),
            [
                ident("f"),
                Token::LParen,
                Token::RParen,
                Token::Semicolon,
                ident("g"),
                Token::LParen,
                Token::RParen,
            ]
        );
    }

    #[test]
    fn auto_semi_after_rbrace() {
        assert_eq!(
            toks("{\n}\nx"),
            [Token::LBrace, Token::RBrace, Token::Semicolon, ident("x")]
        );
    }

    #[test]
    fn no_semi_after_open_delim() {
        assert_eq!(toks("{\nx"), [Token::LBrace, ident("x")]);
    }

    #[test]
    fn no_semi_after_operator() {
        assert_eq!(toks("x +\ny"), [ident("x"), Token::Plus, ident("y")]);
    }

    #[test]
    fn auto_semi_after_break_continue_return() {
        assert_eq!(
            toks("break\ncontinue\nreturn"),
            [
                Token::Break,
                Token::Semicolon,
                Token::Continue,
                Token::Semicolon,
                Token::Return,
            ]
        );
    }

    #[test]
    fn auto_semi_after_true_false_none() {
        assert_eq!(
            toks("true\nfalse\nnone"),
            [
                Token::True,
                Token::Semicolon,
                Token::False,
                Token::Semicolon,
                Token::None,
            ]
        );
    }

    // ---- Positions ----

    #[test]
    fn line_numbers() {
        // x ; y ; z Eof — each inserted semicolon sits on the line it ends.
        let lines: Vec<u32> = lex("x\ny\nz")
            .unwrap()
            .iter()
            .map(|spanned| spanned.line)
            .collect();
        assert_eq!(lines, [1, 1, 2, 2, 3, 3]);
    }

    // ---- Errors ----

    #[test]
    fn unknown_char() {
        let message = lex_err("@");
        assert!(message.contains("don't understand"));
    }
}