// bop/lexer.rs

1#[cfg(not(feature = "std"))]
2use alloc::{format, string::String, vec::Vec};
3
4use crate::error::BopError;
5
/// One segment of an interpolated string literal.
///
/// An interpolated string is stored as an ordered list of these parts; see
/// [`Token::StringInterp`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum StringPart {
    /// Literal text between interpolations.
    Literal(String),
    /// Name of a variable written as `{name}` inside the string.
    Variable(String),
}
11
12#[derive(Debug, Clone, PartialEq)]
13pub enum Token {
14    // Literals
15    Number(f64),
16    Str(String),
17    StringInterp(Vec<StringPart>),
18    True,
19    False,
20    None,
21
22    // Identifiers & Keywords
23    Ident(String),
24    Let,
25    Fn,
26    Return,
27    If,
28    Else,
29    While,
30    For,
31    In,
32    Repeat,
33    Break,
34    Continue,
35
36    // Operators
37    Plus,
38    Minus,
39    Star,
40    Slash,
41    Percent,
42    EqEq,
43    BangEq,
44    Lt,
45    Gt,
46    LtEq,
47    GtEq,
48    AmpAmp,
49    PipePipe,
50    Bang,
51    Eq,
52    PlusEq,
53    MinusEq,
54    StarEq,
55    SlashEq,
56    PercentEq,
57
58    // Delimiters
59    LParen,
60    RParen,
61    LBracket,
62    RBracket,
63    LBrace,
64    RBrace,
65    Comma,
66    Colon,
67    Dot,
68    Semicolon,
69
70    // Internal (removed after auto-semicolons)
71    Newline,
72
73    Eof,
74}
75
76#[derive(Debug, Clone)]
77pub struct SpannedToken {
78    pub token: Token,
79    pub line: u32,
80}
81
82pub fn lex(source: &str) -> Result<Vec<SpannedToken>, BopError> {
83    let mut lexer = Lexer::new(source);
84    let raw = lexer.lex_all()?;
85    Ok(insert_semicolons(raw))
86}
87
88fn triggers_semicolon(token: &Token) -> bool {
89    matches!(
90        token,
91        Token::Ident(_)
92            | Token::Number(_)
93            | Token::Str(_)
94            | Token::StringInterp(_)
95            | Token::True
96            | Token::False
97            | Token::None
98            | Token::Break
99            | Token::Continue
100            | Token::Return
101            | Token::RParen
102            | Token::RBracket
103            | Token::RBrace
104    )
105}
106
107fn insert_semicolons(raw: Vec<SpannedToken>) -> Vec<SpannedToken> {
108    let mut result: Vec<SpannedToken> = Vec::new();
109    for token in raw {
110        if token.token == Token::Newline {
111            if let Some(last) = result.last() {
112                if triggers_semicolon(&last.token) {
113                    result.push(SpannedToken {
114                        token: Token::Semicolon,
115                        line: token.line,
116                    });
117                }
118            }
119        } else {
120            result.push(token);
121        }
122    }
123    result
124}
125
/// Cursor over the characters of a source string.
struct Lexer {
    /// The source, pre-split into characters for indexed lookahead.
    chars: Vec<char>,
    /// Index into `chars` of the next unread character.
    pos: usize,
    /// Current line number; used for token spans and error reports.
    line: u32,
}
131
132impl Lexer {
133    fn new(source: &str) -> Self {
134        Self {
135            chars: source.chars().collect(),
136            pos: 0,
137            line: 1,
138        }
139    }
140
141    fn peek(&self) -> Option<char> {
142        self.chars.get(self.pos).copied()
143    }
144
145    fn peek_next(&self) -> Option<char> {
146        self.chars.get(self.pos + 1).copied()
147    }
148
149    fn advance(&mut self) -> Option<char> {
150        let ch = self.chars.get(self.pos).copied()?;
151        self.pos += 1;
152        Some(ch)
153    }
154
155    fn error(&self, message: impl Into<String>) -> BopError {
156        BopError {
157            line: Some(self.line),
158            column: None,
159            message: message.into(),
160            friendly_hint: None,
161        }
162    }
163
164    fn error_with_hint(
165        &self,
166        message: impl Into<String>,
167        hint: impl Into<String>,
168    ) -> BopError {
169        BopError {
170            line: Some(self.line),
171            column: None,
172            message: message.into(),
173            friendly_hint: Some(hint.into()),
174        }
175    }
176
177    fn lex_all(&mut self) -> Result<Vec<SpannedToken>, BopError> {
178        let mut tokens = Vec::new();
179
180        loop {
181            // Skip whitespace (not newlines)
182            while let Some(ch) = self.peek() {
183                if ch == ' ' || ch == '\t' || ch == '\r' {
184                    self.advance();
185                } else {
186                    break;
187                }
188            }
189
190            let Some(ch) = self.peek() else {
191                tokens.push(SpannedToken {
192                    token: Token::Eof,
193                    line: self.line,
194                });
195                break;
196            };
197
198            let line = self.line;
199
200            match ch {
201                '\n' => {
202                    self.advance();
203                    self.line += 1;
204                    tokens.push(SpannedToken {
205                        token: Token::Newline,
206                        line,
207                    });
208                }
209
210                '/' if self.peek_next() == Some('/') => {
211                    // Line comment — skip to end of line
212                    while let Some(c) = self.peek() {
213                        if c == '\n' {
214                            break;
215                        }
216                        self.advance();
217                    }
218                }
219
220                '"' => {
221                    tokens.push(SpannedToken {
222                        token: self.lex_string()?,
223                        line,
224                    });
225                }
226
227                '0'..='9' => {
228                    tokens.push(SpannedToken {
229                        token: self.lex_number()?,
230                        line,
231                    });
232                }
233
234                'a'..='z' | 'A'..='Z' | '_' => {
235                    tokens.push(SpannedToken {
236                        token: self.lex_ident_or_keyword(),
237                        line,
238                    });
239                }
240
241                '+' => {
242                    self.advance();
243                    if self.peek() == Some('=') {
244                        self.advance();
245                        tokens.push(SpannedToken {
246                            token: Token::PlusEq,
247                            line,
248                        });
249                    } else {
250                        tokens.push(SpannedToken {
251                            token: Token::Plus,
252                            line,
253                        });
254                    }
255                }
256                '-' => {
257                    self.advance();
258                    if self.peek() == Some('=') {
259                        self.advance();
260                        tokens.push(SpannedToken {
261                            token: Token::MinusEq,
262                            line,
263                        });
264                    } else {
265                        tokens.push(SpannedToken {
266                            token: Token::Minus,
267                            line,
268                        });
269                    }
270                }
271                '*' => {
272                    self.advance();
273                    if self.peek() == Some('=') {
274                        self.advance();
275                        tokens.push(SpannedToken {
276                            token: Token::StarEq,
277                            line,
278                        });
279                    } else {
280                        tokens.push(SpannedToken {
281                            token: Token::Star,
282                            line,
283                        });
284                    }
285                }
286                '/' => {
287                    self.advance();
288                    if self.peek() == Some('=') {
289                        self.advance();
290                        tokens.push(SpannedToken {
291                            token: Token::SlashEq,
292                            line,
293                        });
294                    } else {
295                        tokens.push(SpannedToken {
296                            token: Token::Slash,
297                            line,
298                        });
299                    }
300                }
301                '%' => {
302                    self.advance();
303                    if self.peek() == Some('=') {
304                        self.advance();
305                        tokens.push(SpannedToken {
306                            token: Token::PercentEq,
307                            line,
308                        });
309                    } else {
310                        tokens.push(SpannedToken {
311                            token: Token::Percent,
312                            line,
313                        });
314                    }
315                }
316
317                '=' => {
318                    self.advance();
319                    if self.peek() == Some('=') {
320                        self.advance();
321                        tokens.push(SpannedToken {
322                            token: Token::EqEq,
323                            line,
324                        });
325                    } else {
326                        tokens.push(SpannedToken {
327                            token: Token::Eq,
328                            line,
329                        });
330                    }
331                }
332                '!' => {
333                    self.advance();
334                    if self.peek() == Some('=') {
335                        self.advance();
336                        tokens.push(SpannedToken {
337                            token: Token::BangEq,
338                            line,
339                        });
340                    } else {
341                        tokens.push(SpannedToken {
342                            token: Token::Bang,
343                            line,
344                        });
345                    }
346                }
347                '<' => {
348                    self.advance();
349                    if self.peek() == Some('=') {
350                        self.advance();
351                        tokens.push(SpannedToken {
352                            token: Token::LtEq,
353                            line,
354                        });
355                    } else {
356                        tokens.push(SpannedToken {
357                            token: Token::Lt,
358                            line,
359                        });
360                    }
361                }
362                '>' => {
363                    self.advance();
364                    if self.peek() == Some('=') {
365                        self.advance();
366                        tokens.push(SpannedToken {
367                            token: Token::GtEq,
368                            line,
369                        });
370                    } else {
371                        tokens.push(SpannedToken {
372                            token: Token::Gt,
373                            line,
374                        });
375                    }
376                }
377
378                '&' => {
379                    self.advance();
380                    if self.peek() == Some('&') {
381                        self.advance();
382                        tokens.push(SpannedToken {
383                            token: Token::AmpAmp,
384                            line,
385                        });
386                    } else {
387                        return Err(
388                            self.error_with_hint("Unexpected `&`", "Did you mean `&&` (and)?")
389                        );
390                    }
391                }
392                '|' => {
393                    self.advance();
394                    if self.peek() == Some('|') {
395                        self.advance();
396                        tokens.push(SpannedToken {
397                            token: Token::PipePipe,
398                            line,
399                        });
400                    } else {
401                        return Err(
402                            self.error_with_hint("Unexpected `|`", "Did you mean `||` (or)?")
403                        );
404                    }
405                }
406
407                '(' => {
408                    self.advance();
409                    tokens.push(SpannedToken {
410                        token: Token::LParen,
411                        line,
412                    });
413                }
414                ')' => {
415                    self.advance();
416                    tokens.push(SpannedToken {
417                        token: Token::RParen,
418                        line,
419                    });
420                }
421                '[' => {
422                    self.advance();
423                    tokens.push(SpannedToken {
424                        token: Token::LBracket,
425                        line,
426                    });
427                }
428                ']' => {
429                    self.advance();
430                    tokens.push(SpannedToken {
431                        token: Token::RBracket,
432                        line,
433                    });
434                }
435                '{' => {
436                    self.advance();
437                    tokens.push(SpannedToken {
438                        token: Token::LBrace,
439                        line,
440                    });
441                }
442                '}' => {
443                    self.advance();
444                    tokens.push(SpannedToken {
445                        token: Token::RBrace,
446                        line,
447                    });
448                }
449                ',' => {
450                    self.advance();
451                    tokens.push(SpannedToken {
452                        token: Token::Comma,
453                        line,
454                    });
455                }
456                ':' => {
457                    self.advance();
458                    tokens.push(SpannedToken {
459                        token: Token::Colon,
460                        line,
461                    });
462                }
463                '.' => {
464                    self.advance();
465                    tokens.push(SpannedToken {
466                        token: Token::Dot,
467                        line,
468                    });
469                }
470                ';' => {
471                    self.advance();
472                    tokens.push(SpannedToken {
473                        token: Token::Semicolon,
474                        line,
475                    });
476                }
477
478                _ => {
479                    return Err(self.error(format!("I don't understand the character `{}`", ch)));
480                }
481            }
482        }
483
484        Ok(tokens)
485    }
486
487    fn lex_number(&mut self) -> Result<Token, BopError> {
488        let mut s = String::new();
489        while let Some(ch) = self.peek() {
490            if ch.is_ascii_digit() {
491                s.push(ch);
492                self.advance();
493            } else {
494                break;
495            }
496        }
497        if self.peek() == Some('.') && self.peek_next().is_some_and(|c| c.is_ascii_digit()) {
498            s.push('.');
499            self.advance();
500            while let Some(ch) = self.peek() {
501                if ch.is_ascii_digit() {
502                    s.push(ch);
503                    self.advance();
504                } else {
505                    break;
506                }
507            }
508        }
509        let n: f64 = s
510            .parse()
511            .map_err(|_| self.error(format!("Invalid number: {}", s)))?;
512        Ok(Token::Number(n))
513    }
514
515    fn lex_ident_or_keyword(&mut self) -> Token {
516        let mut s = String::new();
517        while let Some(ch) = self.peek() {
518            if ch.is_ascii_alphanumeric() || ch == '_' {
519                s.push(ch);
520                self.advance();
521            } else {
522                break;
523            }
524        }
525        match s.as_str() {
526            "let" => Token::Let,
527            "fn" => Token::Fn,
528            "return" => Token::Return,
529            "if" => Token::If,
530            "else" => Token::Else,
531            "while" => Token::While,
532            "for" => Token::For,
533            "in" => Token::In,
534            "repeat" => Token::Repeat,
535            "break" => Token::Break,
536            "continue" => Token::Continue,
537            "true" => Token::True,
538            "false" => Token::False,
539            "none" => Token::None,
540            _ => Token::Ident(s),
541        }
542    }
543
544    fn lex_string(&mut self) -> Result<Token, BopError> {
545        self.advance(); // consume opening "
546        let mut parts: Vec<StringPart> = Vec::new();
547        let mut current = String::new();
548
549        loop {
550            match self.peek() {
551                None | Some('\n') => {
552                    return Err(self.error_with_hint(
553                        "This string is missing its closing `\"`",
554                        "Every string needs to start and end with quotes.",
555                    ));
556                }
557                Some('"') => {
558                    self.advance();
559                    break;
560                }
561                Some('\\') => {
562                    self.advance();
563                    match self.peek() {
564                        Some('"') => {
565                            current.push('"');
566                            self.advance();
567                        }
568                        Some('\\') => {
569                            current.push('\\');
570                            self.advance();
571                        }
572                        Some('n') => {
573                            current.push('\n');
574                            self.advance();
575                        }
576                        Some('t') => {
577                            current.push('\t');
578                            self.advance();
579                        }
580                        Some('{') => {
581                            current.push('{');
582                            self.advance();
583                        }
584                        Some('}') => {
585                            current.push('}');
586                            self.advance();
587                        }
588                        Some(c) => {
589                            return Err(self.error(format!("Unknown escape sequence `\\{}`", c)));
590                        }
591                        None => {
592                            return Err(self.error("Unexpected end of string after `\\`"));
593                        }
594                    }
595                }
596                Some('{')
597                    if self
598                        .peek_next()
599                        .is_some_and(|c| c.is_ascii_alphabetic() || c == '_') =>
600                {
601                    self.advance(); // consume {
602                    // Read variable name
603                    let mut var = String::new();
604                    while let Some(ch) = self.peek() {
605                        if ch.is_ascii_alphanumeric() || ch == '_' {
606                            var.push(ch);
607                            self.advance();
608                        } else {
609                            break;
610                        }
611                    }
612                    if self.peek() != Some('}') {
613                        return Err(self.error_with_hint(
614                            format!("Missing `}}` after `{{{}`", var),
615                            "String interpolation needs a closing `}`, like: \"{name}\"",
616                        ));
617                    }
618                    self.advance(); // consume }
619                    if !current.is_empty() {
620                        parts.push(StringPart::Literal(core::mem::take(&mut current)));
621                    }
622                    parts.push(StringPart::Variable(var));
623                }
624                Some(ch) => {
625                    current.push(ch);
626                    self.advance();
627                }
628            }
629        }
630
631        if parts.is_empty() {
632            // Plain string, no interpolation
633            Ok(Token::Str(current))
634        } else {
635            if !current.is_empty() {
636                parts.push(StringPart::Literal(current));
637            }
638            Ok(Token::StringInterp(parts))
639        }
640    }
641}
642
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `code` and returns the token variants without the trailing
    /// `Eof`. Panics if lexing fails.
    fn toks(code: &str) -> Vec<Token> {
        lex(code)
            .unwrap()
            .into_iter()
            .map(|t| t.token)
            .filter(|t| !matches!(t, Token::Eof))
            .collect()
    }

    /// Lexes `code`, which must fail, and returns the error message.
    fn lex_err(code: &str) -> String {
        lex(code).unwrap_err().message
    }

    // ─── Numbers ───────────────────────────────────────────────────

    #[test]
    fn integer() {
        assert_eq!(toks("42"), vec![Token::Number(42.0)]);
    }

    #[test]
    fn float() {
        // Deliberately not 3.14: that literal trips clippy's
        // `approx_constant` lint.
        assert_eq!(toks("1.25"), vec![Token::Number(1.25)]);
    }

    #[test]
    fn leading_zero_float() {
        assert_eq!(toks("0.5"), vec![Token::Number(0.5)]);
    }

    #[test]
    fn dot_without_following_digit_is_not_a_fraction() {
        assert_eq!(
            toks("1.foo"),
            vec![
                Token::Number(1.0),
                Token::Dot,
                Token::Ident("foo".into()),
            ]
        );
    }

    // ─── Strings ───────────────────────────────────────────────────

    #[test]
    fn plain_string() {
        assert_eq!(toks(r#""hello""#), vec![Token::Str("hello".into())]);
    }

    #[test]
    fn escape_sequences() {
        assert_eq!(
            toks(r#""a\nb\t\\\"c""#),
            vec![Token::Str("a\nb\t\\\"c".into())]
        );
    }

    #[test]
    fn escaped_braces_are_literal() {
        assert_eq!(toks(r#""\{x\}""#), vec![Token::Str("{x}".into())]);
    }

    #[test]
    fn brace_not_followed_by_name_is_literal() {
        assert_eq!(toks(r#""{ }""#), vec![Token::Str("{ }".into())]);
    }

    #[test]
    fn string_interpolation() {
        assert_eq!(
            toks(r#""hi {name}!""#),
            vec![Token::StringInterp(vec![
                StringPart::Literal("hi ".into()),
                StringPart::Variable("name".into()),
                StringPart::Literal("!".into()),
            ])]
        );
    }

    #[test]
    fn string_interpolation_multiple_vars() {
        assert_eq!(
            toks(r#""{x},{y}""#),
            vec![Token::StringInterp(vec![
                StringPart::Variable("x".into()),
                StringPart::Literal(",".into()),
                StringPart::Variable("y".into()),
            ])]
        );
    }

    #[test]
    fn interpolation_missing_closing_brace() {
        assert!(lex_err(r#""{name""#).contains("Missing `}` after `{name`"));
    }

    #[test]
    fn unterminated_string() {
        assert!(lex_err(r#""hello"#).contains("missing its closing"));
    }

    #[test]
    fn string_cannot_span_lines() {
        assert!(lex_err("\"hello\nworld\"").contains("missing its closing"));
    }

    #[test]
    fn unknown_escape() {
        assert!(lex_err(r#""hello\q""#).contains("Unknown escape"));
    }

    // ─── Keywords vs Identifiers ───────────────────────────────────

    #[test]
    fn keywords() {
        assert_eq!(
            toks("let fn return if else while for in repeat break continue true false none"),
            vec![
                Token::Let,
                Token::Fn,
                Token::Return,
                Token::If,
                Token::Else,
                Token::While,
                Token::For,
                Token::In,
                Token::Repeat,
                Token::Break,
                Token::Continue,
                Token::True,
                Token::False,
                Token::None,
            ]
        );
    }

    #[test]
    fn identifiers() {
        assert_eq!(
            toks("foo bar_baz _x abc123"),
            vec![
                Token::Ident("foo".into()),
                Token::Ident("bar_baz".into()),
                Token::Ident("_x".into()),
                Token::Ident("abc123".into()),
            ]
        );
    }

    // ─── Operators ─────────────────────────────────────────────────

    #[test]
    fn single_char_ops() {
        assert_eq!(
            toks("+ - * / % = ! < > ( ) [ ] { } , : . ;"),
            vec![
                Token::Plus,
                Token::Minus,
                Token::Star,
                Token::Slash,
                Token::Percent,
                Token::Eq,
                Token::Bang,
                Token::Lt,
                Token::Gt,
                Token::LParen,
                Token::RParen,
                Token::LBracket,
                Token::RBracket,
                Token::LBrace,
                Token::RBrace,
                Token::Comma,
                Token::Colon,
                Token::Dot,
                Token::Semicolon,
            ]
        );
    }

    #[test]
    fn double_char_ops() {
        assert_eq!(
            toks("== != <= >= && || += -= *= /= %="),
            vec![
                Token::EqEq,
                Token::BangEq,
                Token::LtEq,
                Token::GtEq,
                Token::AmpAmp,
                Token::PipePipe,
                Token::PlusEq,
                Token::MinusEq,
                Token::StarEq,
                Token::SlashEq,
                Token::PercentEq,
            ]
        );
    }

    #[test]
    fn lone_ampersand_error() {
        assert!(lex_err("&x").contains("Unexpected `&`"));
    }

    #[test]
    fn lone_pipe_error() {
        assert!(lex_err("|x").contains("Unexpected `|`"));
    }

    #[test]
    fn lone_operator_errors_carry_a_hint() {
        assert!(lex("&x").unwrap_err().friendly_hint.is_some());
        assert!(lex("|x").unwrap_err().friendly_hint.is_some());
    }

    // ─── Comments ──────────────────────────────────────────────────

    #[test]
    fn line_comment_skipped() {
        assert_eq!(
            toks("1 // comment\n2"),
            vec![Token::Number(1.0), Token::Semicolon, Token::Number(2.0)]
        );
    }

    #[test]
    fn comment_at_end() {
        assert_eq!(toks("x // done"), vec![Token::Ident("x".into())]);
    }

    #[test]
    fn comment_only_input_yields_no_tokens() {
        assert!(toks("// nothing here").is_empty());
    }

    // ─── Auto-semicolons ──────────────────────────────────────────

    #[test]
    fn auto_semi_after_ident() {
        assert_eq!(
            toks("x\ny"),
            vec![
                Token::Ident("x".into()),
                Token::Semicolon,
                Token::Ident("y".into()),
            ]
        );
    }

    #[test]
    fn auto_semi_after_number() {
        assert_eq!(
            toks("42\n10"),
            vec![Token::Number(42.0), Token::Semicolon, Token::Number(10.0)]
        );
    }

    #[test]
    fn auto_semi_after_rparen() {
        assert_eq!(
            toks("f()\ng()"),
            vec![
                Token::Ident("f".into()),
                Token::LParen,
                Token::RParen,
                Token::Semicolon,
                Token::Ident("g".into()),
                Token::LParen,
                Token::RParen,
            ]
        );
    }

    #[test]
    fn auto_semi_after_rbrace() {
        assert_eq!(
            toks("{\n}\nx"),
            vec![
                Token::LBrace,
                Token::RBrace,
                Token::Semicolon,
                Token::Ident("x".into()),
            ]
        );
    }

    #[test]
    fn no_semi_after_open_delim() {
        assert_eq!(toks("{\nx"), vec![Token::LBrace, Token::Ident("x".into())]);
    }

    #[test]
    fn no_semi_after_operator() {
        assert_eq!(
            toks("x +\ny"),
            vec![
                Token::Ident("x".into()),
                Token::Plus,
                Token::Ident("y".into()),
            ]
        );
    }

    #[test]
    fn auto_semi_after_break_continue_return() {
        assert_eq!(
            toks("break\ncontinue\nreturn"),
            vec![
                Token::Break,
                Token::Semicolon,
                Token::Continue,
                Token::Semicolon,
                Token::Return,
            ]
        );
    }

    #[test]
    fn auto_semi_after_true_false_none() {
        assert_eq!(
            toks("true\nfalse\nnone"),
            vec![
                Token::True,
                Token::Semicolon,
                Token::False,
                Token::Semicolon,
                Token::None,
            ]
        );
    }

    #[test]
    fn blank_lines_do_not_stack_semicolons() {
        assert_eq!(
            toks("x\n\n\ny"),
            vec![
                Token::Ident("x".into()),
                Token::Semicolon,
                Token::Ident("y".into()),
            ]
        );
    }

    #[test]
    fn leading_newline_adds_no_semicolon() {
        assert_eq!(toks("\nx"), vec![Token::Ident("x".into())]);
    }

    #[test]
    fn trailing_newline_adds_semicolon() {
        assert_eq!(
            toks("x\n"),
            vec![Token::Ident("x".into()), Token::Semicolon]
        );
    }

    #[test]
    fn crlf_line_endings() {
        assert_eq!(
            toks("x\r\ny"),
            vec![
                Token::Ident("x".into()),
                Token::Semicolon,
                Token::Ident("y".into()),
            ]
        );
    }

    // ─── Line tracking ─────────────────────────────────────────────

    #[test]
    fn line_numbers() {
        let tokens = lex("x\ny\nz").unwrap();
        let lines: Vec<u32> = tokens.iter().map(|t| t.line).collect();
        // x(L1), ;(L1), y(L2), ;(L2), z(L3), Eof(L3)
        assert_eq!(lines, vec![1, 1, 2, 2, 3, 3]);
    }

    #[test]
    fn error_reports_line_number() {
        assert_eq!(lex("x\n@").unwrap_err().line, Some(2));
    }

    // ─── Unknown character ─────────────────────────────────────────

    #[test]
    fn unknown_char() {
        assert!(lex_err("@").contains("don't understand"));
    }

    #[test]
    fn unknown_char_has_no_hint() {
        assert!(lex("@").unwrap_err().friendly_hint.is_none());
    }
}