1#[cfg(not(feature = "std"))]
2use alloc::{format, string::String, vec::Vec};
3
4use crate::error::BopError;
5
/// One segment of an interpolated string literal.
///
/// The lexer splits a string such as `"hi {name}!"` into alternating pieces
/// of literal text and variable references, in source order.
///
/// Both payloads are plain `String`s, so equality is total and the type
/// derives `Eq` in addition to `PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum StringPart {
    /// Text copied into the result as-is (escape sequences already resolved).
    Literal(String),
    /// Name of the variable written as `{name}` inside the string.
    Variable(String),
}
11
12#[derive(Debug, Clone, PartialEq)]
13pub enum Token {
14 Number(f64),
16 Str(String),
17 StringInterp(Vec<StringPart>),
18 True,
19 False,
20 None,
21
22 Ident(String),
24 Let,
25 Fn,
26 Return,
27 If,
28 Else,
29 While,
30 For,
31 In,
32 Repeat,
33 Break,
34 Continue,
35
36 Plus,
38 Minus,
39 Star,
40 Slash,
41 Percent,
42 EqEq,
43 BangEq,
44 Lt,
45 Gt,
46 LtEq,
47 GtEq,
48 AmpAmp,
49 PipePipe,
50 Bang,
51 Eq,
52 PlusEq,
53 MinusEq,
54 StarEq,
55 SlashEq,
56 PercentEq,
57
58 LParen,
60 RParen,
61 LBracket,
62 RBracket,
63 LBrace,
64 RBrace,
65 Comma,
66 Colon,
67 Dot,
68 Semicolon,
69
70 Newline,
72
73 Eof,
74}
75
76#[derive(Debug, Clone)]
77pub struct SpannedToken {
78 pub token: Token,
79 pub line: u32,
80}
81
82pub fn lex(source: &str) -> Result<Vec<SpannedToken>, BopError> {
83 let mut lexer = Lexer::new(source);
84 let raw = lexer.lex_all()?;
85 Ok(insert_semicolons(raw))
86}
87
88fn triggers_semicolon(token: &Token) -> bool {
89 matches!(
90 token,
91 Token::Ident(_)
92 | Token::Number(_)
93 | Token::Str(_)
94 | Token::StringInterp(_)
95 | Token::True
96 | Token::False
97 | Token::None
98 | Token::Break
99 | Token::Continue
100 | Token::Return
101 | Token::RParen
102 | Token::RBracket
103 | Token::RBrace
104 )
105}
106
107fn insert_semicolons(raw: Vec<SpannedToken>) -> Vec<SpannedToken> {
108 let mut result: Vec<SpannedToken> = Vec::new();
109 for token in raw {
110 if token.token == Token::Newline {
111 if let Some(last) = result.last() {
112 if triggers_semicolon(&last.token) {
113 result.push(SpannedToken {
114 token: Token::Semicolon,
115 line: token.line,
116 });
117 }
118 }
119 } else {
120 result.push(token);
121 }
122 }
123 result
124}
125
/// Cursor over the source text, decoded up front into `char`s.
struct Lexer {
    /// Every character of the source, in order.
    chars: Vec<char>,
    /// Index into `chars` of the next character to read.
    pos: usize,
    /// 1-based line number of the character at `pos`.
    line: u32,
}
131
132impl Lexer {
133 fn new(source: &str) -> Self {
134 Self {
135 chars: source.chars().collect(),
136 pos: 0,
137 line: 1,
138 }
139 }
140
141 fn peek(&self) -> Option<char> {
142 self.chars.get(self.pos).copied()
143 }
144
145 fn peek_next(&self) -> Option<char> {
146 self.chars.get(self.pos + 1).copied()
147 }
148
149 fn advance(&mut self) -> Option<char> {
150 let ch = self.chars.get(self.pos).copied()?;
151 self.pos += 1;
152 Some(ch)
153 }
154
155 fn error(&self, message: impl Into<String>) -> BopError {
156 BopError {
157 line: Some(self.line),
158 column: None,
159 message: message.into(),
160 friendly_hint: None,
161 }
162 }
163
164 fn error_with_hint(
165 &self,
166 message: impl Into<String>,
167 hint: impl Into<String>,
168 ) -> BopError {
169 BopError {
170 line: Some(self.line),
171 column: None,
172 message: message.into(),
173 friendly_hint: Some(hint.into()),
174 }
175 }
176
177 fn lex_all(&mut self) -> Result<Vec<SpannedToken>, BopError> {
178 let mut tokens = Vec::new();
179
180 loop {
181 while let Some(ch) = self.peek() {
183 if ch == ' ' || ch == '\t' || ch == '\r' {
184 self.advance();
185 } else {
186 break;
187 }
188 }
189
190 let Some(ch) = self.peek() else {
191 tokens.push(SpannedToken {
192 token: Token::Eof,
193 line: self.line,
194 });
195 break;
196 };
197
198 let line = self.line;
199
200 match ch {
201 '\n' => {
202 self.advance();
203 self.line += 1;
204 tokens.push(SpannedToken {
205 token: Token::Newline,
206 line,
207 });
208 }
209
210 '/' if self.peek_next() == Some('/') => {
211 while let Some(c) = self.peek() {
213 if c == '\n' {
214 break;
215 }
216 self.advance();
217 }
218 }
219
220 '"' => {
221 tokens.push(SpannedToken {
222 token: self.lex_string()?,
223 line,
224 });
225 }
226
227 '0'..='9' => {
228 tokens.push(SpannedToken {
229 token: self.lex_number()?,
230 line,
231 });
232 }
233
234 'a'..='z' | 'A'..='Z' | '_' => {
235 tokens.push(SpannedToken {
236 token: self.lex_ident_or_keyword(),
237 line,
238 });
239 }
240
241 '+' => {
242 self.advance();
243 if self.peek() == Some('=') {
244 self.advance();
245 tokens.push(SpannedToken {
246 token: Token::PlusEq,
247 line,
248 });
249 } else {
250 tokens.push(SpannedToken {
251 token: Token::Plus,
252 line,
253 });
254 }
255 }
256 '-' => {
257 self.advance();
258 if self.peek() == Some('=') {
259 self.advance();
260 tokens.push(SpannedToken {
261 token: Token::MinusEq,
262 line,
263 });
264 } else {
265 tokens.push(SpannedToken {
266 token: Token::Minus,
267 line,
268 });
269 }
270 }
271 '*' => {
272 self.advance();
273 if self.peek() == Some('=') {
274 self.advance();
275 tokens.push(SpannedToken {
276 token: Token::StarEq,
277 line,
278 });
279 } else {
280 tokens.push(SpannedToken {
281 token: Token::Star,
282 line,
283 });
284 }
285 }
286 '/' => {
287 self.advance();
288 if self.peek() == Some('=') {
289 self.advance();
290 tokens.push(SpannedToken {
291 token: Token::SlashEq,
292 line,
293 });
294 } else {
295 tokens.push(SpannedToken {
296 token: Token::Slash,
297 line,
298 });
299 }
300 }
301 '%' => {
302 self.advance();
303 if self.peek() == Some('=') {
304 self.advance();
305 tokens.push(SpannedToken {
306 token: Token::PercentEq,
307 line,
308 });
309 } else {
310 tokens.push(SpannedToken {
311 token: Token::Percent,
312 line,
313 });
314 }
315 }
316
317 '=' => {
318 self.advance();
319 if self.peek() == Some('=') {
320 self.advance();
321 tokens.push(SpannedToken {
322 token: Token::EqEq,
323 line,
324 });
325 } else {
326 tokens.push(SpannedToken {
327 token: Token::Eq,
328 line,
329 });
330 }
331 }
332 '!' => {
333 self.advance();
334 if self.peek() == Some('=') {
335 self.advance();
336 tokens.push(SpannedToken {
337 token: Token::BangEq,
338 line,
339 });
340 } else {
341 tokens.push(SpannedToken {
342 token: Token::Bang,
343 line,
344 });
345 }
346 }
347 '<' => {
348 self.advance();
349 if self.peek() == Some('=') {
350 self.advance();
351 tokens.push(SpannedToken {
352 token: Token::LtEq,
353 line,
354 });
355 } else {
356 tokens.push(SpannedToken {
357 token: Token::Lt,
358 line,
359 });
360 }
361 }
362 '>' => {
363 self.advance();
364 if self.peek() == Some('=') {
365 self.advance();
366 tokens.push(SpannedToken {
367 token: Token::GtEq,
368 line,
369 });
370 } else {
371 tokens.push(SpannedToken {
372 token: Token::Gt,
373 line,
374 });
375 }
376 }
377
378 '&' => {
379 self.advance();
380 if self.peek() == Some('&') {
381 self.advance();
382 tokens.push(SpannedToken {
383 token: Token::AmpAmp,
384 line,
385 });
386 } else {
387 return Err(
388 self.error_with_hint("Unexpected `&`", "Did you mean `&&` (and)?")
389 );
390 }
391 }
392 '|' => {
393 self.advance();
394 if self.peek() == Some('|') {
395 self.advance();
396 tokens.push(SpannedToken {
397 token: Token::PipePipe,
398 line,
399 });
400 } else {
401 return Err(
402 self.error_with_hint("Unexpected `|`", "Did you mean `||` (or)?")
403 );
404 }
405 }
406
407 '(' => {
408 self.advance();
409 tokens.push(SpannedToken {
410 token: Token::LParen,
411 line,
412 });
413 }
414 ')' => {
415 self.advance();
416 tokens.push(SpannedToken {
417 token: Token::RParen,
418 line,
419 });
420 }
421 '[' => {
422 self.advance();
423 tokens.push(SpannedToken {
424 token: Token::LBracket,
425 line,
426 });
427 }
428 ']' => {
429 self.advance();
430 tokens.push(SpannedToken {
431 token: Token::RBracket,
432 line,
433 });
434 }
435 '{' => {
436 self.advance();
437 tokens.push(SpannedToken {
438 token: Token::LBrace,
439 line,
440 });
441 }
442 '}' => {
443 self.advance();
444 tokens.push(SpannedToken {
445 token: Token::RBrace,
446 line,
447 });
448 }
449 ',' => {
450 self.advance();
451 tokens.push(SpannedToken {
452 token: Token::Comma,
453 line,
454 });
455 }
456 ':' => {
457 self.advance();
458 tokens.push(SpannedToken {
459 token: Token::Colon,
460 line,
461 });
462 }
463 '.' => {
464 self.advance();
465 tokens.push(SpannedToken {
466 token: Token::Dot,
467 line,
468 });
469 }
470 ';' => {
471 self.advance();
472 tokens.push(SpannedToken {
473 token: Token::Semicolon,
474 line,
475 });
476 }
477
478 _ => {
479 return Err(self.error(format!("I don't understand the character `{}`", ch)));
480 }
481 }
482 }
483
484 Ok(tokens)
485 }
486
487 fn lex_number(&mut self) -> Result<Token, BopError> {
488 let mut s = String::new();
489 while let Some(ch) = self.peek() {
490 if ch.is_ascii_digit() {
491 s.push(ch);
492 self.advance();
493 } else {
494 break;
495 }
496 }
497 if self.peek() == Some('.') && self.peek_next().is_some_and(|c| c.is_ascii_digit()) {
498 s.push('.');
499 self.advance();
500 while let Some(ch) = self.peek() {
501 if ch.is_ascii_digit() {
502 s.push(ch);
503 self.advance();
504 } else {
505 break;
506 }
507 }
508 }
509 let n: f64 = s
510 .parse()
511 .map_err(|_| self.error(format!("Invalid number: {}", s)))?;
512 Ok(Token::Number(n))
513 }
514
515 fn lex_ident_or_keyword(&mut self) -> Token {
516 let mut s = String::new();
517 while let Some(ch) = self.peek() {
518 if ch.is_ascii_alphanumeric() || ch == '_' {
519 s.push(ch);
520 self.advance();
521 } else {
522 break;
523 }
524 }
525 match s.as_str() {
526 "let" => Token::Let,
527 "fn" => Token::Fn,
528 "return" => Token::Return,
529 "if" => Token::If,
530 "else" => Token::Else,
531 "while" => Token::While,
532 "for" => Token::For,
533 "in" => Token::In,
534 "repeat" => Token::Repeat,
535 "break" => Token::Break,
536 "continue" => Token::Continue,
537 "true" => Token::True,
538 "false" => Token::False,
539 "none" => Token::None,
540 _ => Token::Ident(s),
541 }
542 }
543
544 fn lex_string(&mut self) -> Result<Token, BopError> {
545 self.advance(); let mut parts: Vec<StringPart> = Vec::new();
547 let mut current = String::new();
548
549 loop {
550 match self.peek() {
551 None | Some('\n') => {
552 return Err(self.error_with_hint(
553 "This string is missing its closing `\"`",
554 "Every string needs to start and end with quotes.",
555 ));
556 }
557 Some('"') => {
558 self.advance();
559 break;
560 }
561 Some('\\') => {
562 self.advance();
563 match self.peek() {
564 Some('"') => {
565 current.push('"');
566 self.advance();
567 }
568 Some('\\') => {
569 current.push('\\');
570 self.advance();
571 }
572 Some('n') => {
573 current.push('\n');
574 self.advance();
575 }
576 Some('t') => {
577 current.push('\t');
578 self.advance();
579 }
580 Some('{') => {
581 current.push('{');
582 self.advance();
583 }
584 Some('}') => {
585 current.push('}');
586 self.advance();
587 }
588 Some(c) => {
589 return Err(self.error(format!("Unknown escape sequence `\\{}`", c)));
590 }
591 None => {
592 return Err(self.error("Unexpected end of string after `\\`"));
593 }
594 }
595 }
596 Some('{')
597 if self
598 .peek_next()
599 .is_some_and(|c| c.is_ascii_alphabetic() || c == '_') =>
600 {
601 self.advance(); let mut var = String::new();
604 while let Some(ch) = self.peek() {
605 if ch.is_ascii_alphanumeric() || ch == '_' {
606 var.push(ch);
607 self.advance();
608 } else {
609 break;
610 }
611 }
612 if self.peek() != Some('}') {
613 return Err(self.error_with_hint(
614 format!("Missing `}}` after `{{{}`", var),
615 "String interpolation needs a closing `}`, like: \"{name}\"",
616 ));
617 }
618 self.advance(); if !current.is_empty() {
620 parts.push(StringPart::Literal(core::mem::take(&mut current)));
621 }
622 parts.push(StringPart::Variable(var));
623 }
624 Some(ch) => {
625 current.push(ch);
626 self.advance();
627 }
628 }
629 }
630
631 if parts.is_empty() {
632 Ok(Token::Str(current))
634 } else {
635 if !current.is_empty() {
636 parts.push(StringPart::Literal(current));
637 }
638 Ok(Token::StringInterp(parts))
639 }
640 }
641}
642
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `code` (panicking on failure) and returns the tokens without
    /// their line numbers and without the trailing `Eof`.
    fn toks(code: &str) -> Vec<Token> {
        let spanned = lex(code).unwrap();
        spanned
            .into_iter()
            .map(|entry| entry.token)
            .filter(|token| *token != Token::Eof)
            .collect()
    }

    /// Lexes `code`, expecting failure, and returns the error message.
    fn lex_err(code: &str) -> String {
        let failure = lex(code).unwrap_err();
        failure.message
    }

    /// Shorthand for an identifier token.
    fn ident(name: &str) -> Token {
        Token::Ident(name.into())
    }

    /// Shorthand for a number token.
    fn num(value: f64) -> Token {
        Token::Number(value)
    }

    /// Shorthand for a literal string part.
    fn lit(text: &str) -> StringPart {
        StringPart::Literal(text.into())
    }

    /// Shorthand for a variable string part.
    fn var(name: &str) -> StringPart {
        StringPart::Variable(name.into())
    }

    // ---- Numbers ----

    #[test]
    fn integer() {
        assert_eq!(toks("42"), vec![num(42.0)]);
    }

    #[test]
    fn float() {
        assert_eq!(toks("3.14"), vec![num(3.14)]);
    }

    #[test]
    fn leading_zero_float() {
        assert_eq!(toks("0.5"), vec![num(0.5)]);
    }

    // ---- Strings ----

    #[test]
    fn plain_string() {
        let expected = vec![Token::Str("hello".into())];
        assert_eq!(toks(r#""hello""#), expected);
    }

    #[test]
    fn escape_sequences() {
        let expected = vec![Token::Str("a\nb\t\\\"c".into())];
        assert_eq!(toks(r#""a\nb\t\\\"c""#), expected);
    }

    #[test]
    fn string_interpolation() {
        let expected = vec![Token::StringInterp(vec![
            lit("hi "),
            var("name"),
            lit("!"),
        ])];
        assert_eq!(toks(r#""hi {name}!""#), expected);
    }

    #[test]
    fn string_interpolation_multiple_vars() {
        let expected = vec![Token::StringInterp(vec![var("x"), lit(","), var("y")])];
        assert_eq!(toks(r#""{x},{y}""#), expected);
    }

    #[test]
    fn unterminated_string() {
        let message = lex_err(r#""hello"#);
        assert!(message.contains("missing its closing"));
    }

    #[test]
    fn unknown_escape() {
        let message = lex_err(r#""hello\q""#);
        assert!(message.contains("Unknown escape"));
    }

    // ---- Keywords and identifiers ----

    #[test]
    fn keywords() {
        let source = "let fn return if else while for in repeat break continue true false none";
        let expected = vec![
            Token::Let,
            Token::Fn,
            Token::Return,
            Token::If,
            Token::Else,
            Token::While,
            Token::For,
            Token::In,
            Token::Repeat,
            Token::Break,
            Token::Continue,
            Token::True,
            Token::False,
            Token::None,
        ];
        assert_eq!(toks(source), expected);
    }

    #[test]
    fn identifiers() {
        let expected = vec![ident("foo"), ident("bar_baz"), ident("_x"), ident("abc123")];
        assert_eq!(toks("foo bar_baz _x abc123"), expected);
    }

    // ---- Operators ----

    #[test]
    fn single_char_ops() {
        let expected = vec![
            Token::Plus,
            Token::Minus,
            Token::Star,
            Token::Slash,
            Token::Percent,
            Token::Eq,
            Token::Bang,
            Token::Lt,
            Token::Gt,
            Token::LParen,
            Token::RParen,
            Token::LBracket,
            Token::RBracket,
            Token::LBrace,
            Token::RBrace,
            Token::Comma,
            Token::Colon,
            Token::Dot,
            Token::Semicolon,
        ];
        assert_eq!(toks("+ - * / % = ! < > ( ) [ ] { } , : . ;"), expected);
    }

    #[test]
    fn double_char_ops() {
        let expected = vec![
            Token::EqEq,
            Token::BangEq,
            Token::LtEq,
            Token::GtEq,
            Token::AmpAmp,
            Token::PipePipe,
            Token::PlusEq,
            Token::MinusEq,
            Token::StarEq,
            Token::SlashEq,
            Token::PercentEq,
        ];
        assert_eq!(toks("== != <= >= && || += -= *= /= %="), expected);
    }

    #[test]
    fn lone_ampersand_error() {
        let message = lex_err("&x");
        assert!(message.contains("Unexpected `&`"));
    }

    #[test]
    fn lone_pipe_error() {
        let message = lex_err("|x");
        assert!(message.contains("Unexpected `|`"));
    }

    // ---- Comments ----

    #[test]
    fn line_comment_skipped() {
        let expected = vec![num(1.0), Token::Semicolon, num(2.0)];
        assert_eq!(toks("1 // comment\n2"), expected);
    }

    #[test]
    fn comment_at_end() {
        assert_eq!(toks("x // done"), vec![ident("x")]);
    }

    // ---- Automatic semicolon insertion ----

    #[test]
    fn auto_semi_after_ident() {
        let expected = vec![ident("x"), Token::Semicolon, ident("y")];
        assert_eq!(toks("x\ny"), expected);
    }

    #[test]
    fn auto_semi_after_number() {
        let expected = vec![num(42.0), Token::Semicolon, num(10.0)];
        assert_eq!(toks("42\n10"), expected);
    }

    #[test]
    fn auto_semi_after_rparen() {
        let expected = vec![
            ident("f"),
            Token::LParen,
            Token::RParen,
            Token::Semicolon,
            ident("g"),
            Token::LParen,
            Token::RParen,
        ];
        assert_eq!(toks("f()\ng()"), expected);
    }

    #[test]
    fn auto_semi_after_rbrace() {
        let expected = vec![Token::LBrace, Token::RBrace, Token::Semicolon, ident("x")];
        assert_eq!(toks("{\n}\nx"), expected);
    }

    #[test]
    fn no_semi_after_open_delim() {
        let expected = vec![Token::LBrace, ident("x")];
        assert_eq!(toks("{\nx"), expected);
    }

    #[test]
    fn no_semi_after_operator() {
        let expected = vec![ident("x"), Token::Plus, ident("y")];
        assert_eq!(toks("x +\ny"), expected);
    }

    #[test]
    fn auto_semi_after_break_continue_return() {
        let expected = vec![
            Token::Break,
            Token::Semicolon,
            Token::Continue,
            Token::Semicolon,
            Token::Return,
        ];
        assert_eq!(toks("break\ncontinue\nreturn"), expected);
    }

    #[test]
    fn auto_semi_after_true_false_none() {
        let expected = vec![
            Token::True,
            Token::Semicolon,
            Token::False,
            Token::Semicolon,
            Token::None,
        ];
        assert_eq!(toks("true\nfalse\nnone"), expected);
    }

    // ---- Line tracking ----

    #[test]
    fn line_numbers() {
        // Expected stream: x ; y ; z Eof. Each inserted semicolon carries the
        // line of the newline it replaced.
        let spanned = lex("x\ny\nz").unwrap();
        let lines: Vec<u32> = spanned.iter().map(|entry| entry.line).collect();
        assert_eq!(lines, vec![1, 1, 2, 2, 3, 3]);
    }

    // ---- Error cases ----

    #[test]
    fn unknown_char() {
        let message = lex_err("@");
        assert!(message.contains("don't understand"));
    }
}