rusty_basic/
lexer.rs

1use crate::token::Token;
2
/// Reasons why lexing a piece of source code can fail.
///
/// Every variant is a plain unit value, so the type is cheap to copy and
/// compare. It implements [`std::fmt::Display`] and [`std::error::Error`] so
/// that it can be shown to a user or propagated as a boxed error.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Error {
    /// Found an invalid character like b'@', b'$'
    InvalidCharacter,
    /// Lexed identifier is not keyword or variable
    UnknownIdentifier,
    /// Non terminated string literal like "Hello, World!
    NonTerminatedStringLiteral,
}

impl std::fmt::Display for Error {
    /// Writes a short, lowercase, human-readable description of the error.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let message = match self {
            Self::InvalidCharacter => "invalid character",
            Self::UnknownIdentifier => "unknown identifier",
            Self::NonTerminatedStringLiteral => "non-terminated string literal",
        };
        f.write_str(message)
    }
}

impl std::error::Error for Error {}
12
13pub fn lex(code: &[u8]) -> Result<Vec<Token>, Error> {
14    let mut tokens = Vec::new();
15    let mut chars = code.iter().peekable();
16
17    while let Some(ch) = chars.next() {
18        let token = match ch {
19            b',' => Token::Comma,
20            b'(' => Token::OpeningParenthesis,
21            b')' => Token::ClosingParenthesis,
22            b'=' => Token::Equal,
23            b'<' => match chars.peek() {
24                Some(b'=') => {
25                    chars.next();
26                    Token::LessThanOrEqual
27                }
28                Some(b'>') => {
29                    chars.next();
30                    Token::NotEqual
31                }
32                _ => Token::LessThan,
33            },
34            b'>' => match chars.peek() {
35                Some(b'=') => {
36                    chars.next();
37                    Token::GreaterThanOrEqual
38                }
39                Some(b'<') => {
40                    chars.next();
41                    Token::NotEqual
42                }
43                _ => Token::GreaterThan,
44            },
45            b'+' => Token::Plus,
46            b'-' => Token::Minus,
47            b'*' => Token::Multiply,
48            b'/' => Token::Divide,
49            b'0'..=b'9' => {
50                let mut value: i16 = (ch - b'0') as i16;
51                while let Some(&ch @ b'0'..=b'9') = chars.peek() {
52                    value *= 10;
53                    value += (ch - b'0') as i16;
54                    chars.next();
55                }
56
57                Token::NumberLiteral(value)
58            }
59            b'"' => {
60                let mut value = Vec::new();
61                let mut is_string_terminated = false;
62                for &ch in &mut chars {
63                    if ch == b'"' {
64                        is_string_terminated = true;
65                        break;
66                    }
67
68                    value.push(ch);
69                }
70
71                if !is_string_terminated {
72                    return Err(Error::NonTerminatedStringLiteral);
73                }
74
75                Token::StringLiteral { value }
76            }
77            ch if ch.is_ascii_alphabetic() => {
78                /// The longest length of valid identifiers.
79                const MAX_IDENTIFIER_LENGTH: usize = 6;
80
81                let mut identifier = Vec::with_capacity(MAX_IDENTIFIER_LENGTH);
82                identifier.push(ch.to_ascii_uppercase());
83
84                while let Some(&ch) = chars.peek() {
85                    if !ch.is_ascii_alphanumeric() {
86                        break;
87                    }
88
89                    identifier.push(ch.to_ascii_uppercase());
90                    chars.next();
91                }
92
93                debug_assert_eq!(identifier, identifier.to_ascii_uppercase());
94
95                // handle variable identifier
96                if identifier.len() == 1 {
97                    Token::Variable {
98                        identifier: identifier[0],
99                    }
100                } else {
101                    match identifier.as_slice() {
102                        b"PRINT" => Token::Print,
103                        b"IF" => Token::If,
104                        b"THEN" => Token::Then,
105                        b"GOTO" => Token::Goto,
106                        b"INPUT" => Token::Input,
107                        b"LET" => Token::Let,
108                        b"GOSUB" => Token::GoSub,
109                        b"RETURN" => Token::Return,
110                        b"CLEAR" => Token::Clear,
111                        b"LIST" => Token::List,
112                        b"RUN" => Token::Run,
113                        b"END" => Token::End,
114                        _ => return Err(Error::UnknownIdentifier),
115                    }
116                }
117            }
118            ch if ch.is_ascii_whitespace() => continue,
119            _ => return Err(Error::InvalidCharacter),
120        };
121
122        tokens.push(token);
123    }
124
125    Ok(tokens)
126}
127
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that lexing `source` succeeds and yields exactly `expected`.
    fn assert_lexes_to(source: &[u8], expected: Vec<Token>) {
        assert_eq!(Ok(expected), lex(source));
    }

    /// Asserts that lexing `source` fails with exactly `expected`.
    fn assert_fails_with(source: &[u8], expected: Error) {
        assert_eq!(Err(expected), lex(source));
    }

    /// The token stream shared by the upper- and lowercase variable tests.
    fn if_a_less_than_b_then_print_z() -> Vec<Token> {
        vec![
            Token::If,
            Token::Variable { identifier: b'A' },
            Token::LessThan,
            Token::Variable { identifier: b'B' },
            Token::Then,
            Token::Print,
            Token::Variable { identifier: b'Z' },
        ]
    }

    #[test]
    fn lex_hello_world_returns_tokens() {
        assert_lexes_to(
            b"PRINT \"Hello, World!\"",
            vec![
                Token::Print,
                Token::StringLiteral {
                    value: b"Hello, World!".to_vec(),
                },
            ],
        );
    }

    #[test]
    fn lex_expression_returns_tokens() {
        assert_lexes_to(
            b"1 + 2 * 3 / 4 - 5",
            vec![
                Token::NumberLiteral(1),
                Token::Plus,
                Token::NumberLiteral(2),
                Token::Multiply,
                Token::NumberLiteral(3),
                Token::Divide,
                Token::NumberLiteral(4),
                Token::Minus,
                Token::NumberLiteral(5),
            ],
        );
    }

    #[test]
    fn lex_keywords_returns_tokens() {
        assert_lexes_to(
            b"PRINT IF THEN GOTO INPUT LET GOSUB RETURN CLEAR LIST RUN END",
            vec![
                Token::Print,
                Token::If,
                Token::Then,
                Token::Goto,
                Token::Input,
                Token::Let,
                Token::GoSub,
                Token::Return,
                Token::Clear,
                Token::List,
                Token::Run,
                Token::End,
            ],
        );
    }

    #[test]
    fn lex_variable_returns_token() {
        assert_lexes_to(b"IF A < B THEN PRINT Z", if_a_less_than_b_then_print_z());
    }

    #[test]
    fn lex_lowercase_variable_returns_uppercase_token() {
        // Lowercase variable names must come back as their uppercase letters.
        assert_lexes_to(b"IF a < b THEN PRINT z", if_a_less_than_b_then_print_z());
    }

    #[test]
    fn lex_unknown_identifier_returns_error() {
        assert_fails_with(b"PRINT HELLO", Error::UnknownIdentifier);
    }

    #[test]
    fn lex_non_terminated_string_returns_error() {
        assert_fails_with(b"PRINT \"Hello, World!", Error::NonTerminatedStringLiteral);
    }

    #[test]
    fn lex_empty_string_literal_returns_tokens() {
        assert_lexes_to(
            br#"PRINT "", """#,
            vec![
                Token::Print,
                Token::StringLiteral { value: Vec::new() },
                Token::Comma,
                Token::StringLiteral { value: Vec::new() },
            ],
        );
    }

    #[test]
    fn lex_number_with_boundary_digit_returns_tokens() {
        assert_lexes_to(b"9999", vec![Token::NumberLiteral(9999)]);
    }
}