// slac/scanner.rs

use std::vec;

use crate::error::{Error, Result};
use crate::token::Token;
use crate::value::Value;

/// A lexer to split a string into a list of [`Tokens`](Token).
pub struct Scanner<'a> {
    source: &'a str,
    start: usize,
    current: usize,
    end: usize,
}

impl<'a> Scanner<'a> {
    /// Converts an input string into a list of [`Tokens`](Token).
    ///
    /// # Examples
    /// ```
    /// use slac::{Scanner, Token, Value};
    ///
    /// let tokens = Scanner::tokenize("40 + 2").unwrap();
    /// let expected: Vec<Token> = vec![
    ///     Token::Literal(Value::Number(40.0)),
    ///     Token::Plus,
    ///     Token::Literal(Value::Number(2.0)),
    /// ];
    ///
    /// assert_eq!(tokens, expected);
    /// ```
    ///
    /// # Errors
    /// Returns an [`Error`] when encountering invalid input.
    pub fn tokenize(source: &'a str) -> Result<Vec<Token>> {
        let mut scanner = Scanner {
            source,
            start: 0,
            current: 0,
            end: source.chars().count(),
        };

        let mut tokens: Vec<Token> = vec![];

        scanner.skip_whitespace();

        while !scanner.is_at_end() {
            tokens.push(scanner.next_token()?);
            scanner.skip_whitespace();
        }

        if tokens.is_empty() {
            Err(Error::Eof)
        } else {
            Ok(tokens)
        }
    }

    fn next_token(&mut self) -> Result<Token> {
        self.start = self.current;
        let next = self.next_char().ok_or(Error::Eof)?;

        if Scanner::is_identifier_start(next) {
            return Ok(self.identifier());
        }

        if char::is_numeric(next) {
            return self.number();
        }

        match next {
            '\'' => self.string(),
            '.' => self.number(), // interpret .1 as 0.1
            '(' => Ok(Token::LeftParen),
            ')' => Ok(Token::RightParen),
            '[' => Ok(Token::LeftBracket),
            ']' => Ok(Token::RightBracket),
            ',' => Ok(Token::Comma),
            '+' => Ok(Token::Plus),
            '-' => Ok(Token::Minus),
            '*' => Ok(Token::Star),
            '/' => Ok(Token::Slash),
            '=' => Ok(Token::Equal),
            '>' => Ok(self.greater()),
            '<' => Ok(self.lesser()),
            _ => Err(Error::InvalidCharacter(next)),
        }
    }

    fn is_at_end(&self) -> bool {
        self.current >= self.end
    }

    fn advance(&mut self) {
        self.current += 1;
    }

    fn advance_numeric(&mut self) {
        while let Some(c) = self.peek() {
            if c.is_numeric() {
                self.advance();
            } else {
                break;
            }
        }
    }

    fn next_char(&mut self) -> Option<char> {
        self.advance();
        self.source.chars().nth(self.current - 1)
    }

    fn peek(&self) -> Option<char> {
        self.peek_ahead(0)
    }

    fn peek_ahead(&self, offset: usize) -> Option<char> {
        self.source.chars().nth(self.current + offset)
    }

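    /// Skips over whitespace and comments until the next significant character.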
    fn skip_whitespace(&mut self) {
        loop {
            while let Some(' ' | '\r' | '\t' | '\n') = self.peek() {
                self.advance();
            }

            if !self.skip_comments() {
                // no comment found, so there is no further whitespace to skip
                break;
            }
            // a comment was skipped; loop again to consume any whitespace after it
        }
    }

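    /// Consumes a single comment starting at the current position, if any,
    /// and returns whether one was found.
    ///
    /// Two styles are recognized: `//` line comments running to the end of
    /// the line, and `{ ... }` block comments, which may be nested.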
    fn skip_comments(&mut self) -> bool {
        match (self.peek_ahead(0), self.peek_ahead(1)) {
            (Some('/'), Some('/')) => {
                while self.next_char().is_some_and(|c| c != '\n') {
                    // skip via next_char() until eof or linebreak
                }
                true // found line comment
            }
            (Some('{'), _) => {
                self.advance(); // skip the '{'

                let mut comment_depth: i32 = 1;
                while comment_depth > 0 {
                    match self.next_char() {
                        Some('{') => comment_depth += 1,
                        Some('}') => comment_depth -= 1,
                        None => break, // Eof
                        _ => (),
                    }
                }
                true // found block comment
            }
            _ => false, // no comment
        }
    }

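    /// Returns the source text of the current token, dropping `trim_by`
    /// characters from each end (used to strip the quotes of a string literal).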
    fn get_content(&self, trim_by: usize) -> String {
        let from = self.start + trim_by;
        let to = self.current - trim_by;

        self.source.chars().take(to).skip(from).collect()
    }

    fn is_identifier_start(character: char) -> bool {
        character.is_alphabetic() || character == '_'
    }

    fn is_identifier(character: char) -> bool {
        character.is_alphanumeric() || character == '_'
    }

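    /// Scans an identifier or keyword; keywords are matched case-insensitively.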
    fn identifier(&mut self) -> Token {
        while self.peek().is_some_and(Scanner::is_identifier) {
            self.advance();
        }

        let ident = self.get_content(0);

        match ident.to_lowercase().as_str() {
            "true" => Token::Literal(Value::Boolean(true)),
            "false" => Token::Literal(Value::Boolean(false)),
            "and" => Token::And,
            "or" => Token::Or,
            "xor" => Token::Xor,
            "not" => Token::Not,
            "div" => Token::Div,
            "mod" => Token::Mod,
            _ => Token::Identifier(ident),
        }
    }

    fn extract_number(content: &str) -> Result<f64> {
        content
            .parse::<f64>()
            .map_err(|o| Error::InvalidNumber(o.to_string()))
    }

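    /// Scans a number literal. A leading or trailing dot is accepted,
    /// so `.4` is read as `0.4` and `30.` as `30.0`.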
    fn number(&mut self) -> Result<Token> {
        self.advance_numeric(); // advance integral

        if self.peek() == Some('.') {
            self.advance(); // advance dot

            if let Some(fractional) = self.peek() {
                if fractional.is_numeric() {
                    self.advance_numeric(); // advance fraction
                }
            }
        }

        let content = self.get_content(0);
        let number = Scanner::extract_number(content.as_str())?;

        Ok(Token::Literal(Value::Number(number)))
    }

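    /// Scans a string literal delimited by single quotes. A doubled single
    /// quote (`''`) inside the literal stands for one literal `'` character.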
    fn string(&mut self) -> Result<Token> {
        let mut contains_single_quote = false;

        loop {
            while self.peek().is_some_and(|c| c != '\'') {
                self.advance(); // advance to the next single quote or the end of input
            }

            if self.is_at_end() {
                return Err(Error::UnterminatedStringLiteral);
            }

            self.advance(); // consume closing single quote

            if self.peek() == Some('\'') {
                contains_single_quote = true; // the quote is doubled, i.e. an escaped quote inside the string
                self.advance();
            } else {
                break; // end of string
            }
        }

        let mut content = self.get_content(1);

        if contains_single_quote {
            content = content.replace("''", "'"); // collapse each doubled single quote into one
        }

        Ok(Token::Literal(Value::String(content)))
    }

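    /// Consumes the second character of a two-character operator such as
    /// `>=`, `<=` or `<>` and returns the given token.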
    fn encounter_double(&mut self, token: Token) -> Token {
        self.advance();
        token
    }

    fn greater(&mut self) -> Token {
        match self.peek() {
            Some('=') => self.encounter_double(Token::GreaterEqual),
            _ => Token::Greater,
        }
    }

    fn lesser(&mut self) -> Token {
        match self.peek() {
            Some('=') => self.encounter_double(Token::LessEqual),
            Some('>') => self.encounter_double(Token::NotEqual),
            _ => Token::Less,
        }
    }
}

#[cfg(test)]
mod tests {
    use std::f64::consts::PI;

    use super::{Scanner, Token};
    use crate::{
        error::{Error, Result},
        value::Value,
    };

    #[test]
    fn simple_bool() -> Result<()> {
        let tokens = Scanner::tokenize("True")?;
        let expected = Token::Literal(Value::Boolean(true));

        assert_eq!(tokens[0], expected);
        Ok(())
    }

    #[test]
    fn simple_integer() -> Result<()> {
        let tokens = Scanner::tokenize("9001")?;
        let expected = Token::Literal(Value::Number(9001.0));

        assert_eq!(tokens[0], expected);
        Ok(())
    }

    #[test]
    fn simple_float() -> Result<()> {
        let tokens = Scanner::tokenize("3.141592653589793")?;
        let expected = Token::Literal(Value::Number(PI));

        assert_eq!(tokens[0], expected);
        Ok(())
    }

    #[test]
    fn simple_string() -> Result<()> {
        let tokens = Scanner::tokenize("'Hello World'")?;
        let expected = Token::Literal(Value::String(String::from("Hello World")));

        assert!(tokens.first().is_some());
        assert_eq!(tokens[0], expected);
        Ok(())
    }

    #[test]
    fn multiple_tokens() -> Result<()> {
        let tokens = Scanner::tokenize("1 + 1")?;
        let expected: Vec<Token> = vec![
            Token::Literal(Value::Number(1.0)),
            Token::Plus,
            Token::Literal(Value::Number(1.0)),
        ];

        assert_eq!(tokens, expected);
        Ok(())
    }

    #[test]
    fn var_name_underscore() -> Result<()> {
        let tokens = Scanner::tokenize("(_SOME_VAR1 * ANOTHER-ONE)")?;
        let expected = vec![
            Token::LeftParen,
            Token::Identifier(String::from("_SOME_VAR1")),
            Token::Star,
            Token::Identifier(String::from("ANOTHER")),
            Token::Minus,
            Token::Identifier(String::from("ONE")),
            Token::RightParen,
        ];

        assert_eq!(expected, tokens);
        Ok(())
    }

    #[test]
    fn unterminated_less() -> Result<()> {
        let tokens = Scanner::tokenize("<")?;
        let expected = vec![Token::Less];

        assert_eq!(expected, tokens);
        Ok(())
    }
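
    // An extra example exercising the operator handling above: two-character
    // comparison operators and the keyword operators each scan as a single token.
    #[test]
    fn comparison_and_word_operators() -> Result<()> {
        let tokens = Scanner::tokenize("1 <= 2 <> 3 div 4")?;
        let expected = vec![
            Token::Literal(Value::Number(1.0)),
            Token::LessEqual,
            Token::Literal(Value::Number(2.0)),
            Token::NotEqual,
            Token::Literal(Value::Number(3.0)),
            Token::Div,
            Token::Literal(Value::Number(4.0)),
        ];

        assert_eq!(expected, tokens);
        Ok(())
    }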

    fn test_number(input: &str, expected: f64) -> Result<()> {
        let tokens = Scanner::tokenize(input)?;
        let expected = vec![Token::Literal(Value::Number(expected))];

        assert_eq!(expected, tokens);
        Ok(())
    }

    #[test]
    fn number_parts() -> Result<()> {
        test_number("10", 10.0)?;
        test_number("10.0", 10.0)?;
        test_number("20.4", 20.4)?;
        test_number("30.", 30.0)?;
        test_number(".4", 0.4)?;

        Ok(())
    }

    #[test]
    fn err_empty_input() {
        let tokens = Scanner::tokenize("");
        let expected = Err(Error::Eof);

        assert_eq!(expected, tokens);
    }

    #[test]
    fn err_unknown_token_1() {
        let tokens = Scanner::tokenize("$");
        let expected = Err(Error::InvalidCharacter('$'));

        assert_eq!(expected, tokens);
    }

    #[test]
    fn err_unknown_token_2() {
        let tokens = Scanner::tokenize("$hello");
        let expected = Err(Error::InvalidCharacter('$'));

        assert_eq!(expected, tokens);
    }

    #[test]
    fn err_unterminated_string() {
        let tokens = Scanner::tokenize("'hello' + 'world");
        let expected = Err(Error::UnterminatedStringLiteral);

        assert_eq!(expected, tokens);
    }

    #[test]
    fn has_slash_comment() {
        let tokens = Scanner::tokenize("true // some comment");
        let expected = Ok(vec![Token::Literal(Value::Boolean(true))]);

        assert_eq!(expected, tokens);

        let tokens = Scanner::tokenize("true //");
        let expected = Ok(vec![Token::Literal(Value::Boolean(true))]);

        assert_eq!(expected, tokens);
    }

    #[test]
    fn has_slash_comment_multiline() {
        let tokens = Scanner::tokenize("true // some comment \n and false");
        let expected = Ok(vec![
            Token::Literal(Value::Boolean(true)),
            Token::And,
            Token::Literal(Value::Boolean(false)),
        ]);

        assert_eq!(expected, tokens);

        let tokens = Scanner::tokenize("true //\n//\n and false");
        let expected = Ok(vec![
            Token::Literal(Value::Boolean(true)),
            Token::And,
            Token::Literal(Value::Boolean(false)),
        ]);

        assert_eq!(expected, tokens);
    }

    #[test]
    fn has_brace_comment() {
        let expected = Ok(vec![
            Token::Literal(Value::Number(1.0)),
            Token::Plus,
            Token::Literal(Value::Number(3.0)),
        ]);

        assert_eq!(expected, Scanner::tokenize("1 + {2} 3"));
        assert_eq!(expected, Scanner::tokenize("1 + 3 {123}"));
        assert_eq!(expected, Scanner::tokenize("1 + {123 {+4}} 3"));
        assert_eq!(expected, Scanner::tokenize("1 + 3 {  "));
        assert_eq!(expected, Scanner::tokenize("{Test}1+3"));
    }

    #[test]
    fn quote_char_in_string() {
        let expected = Ok(vec![Token::Literal(Value::String(String::from(
            "It's Working!",
        )))]);
        assert_eq!(expected, Scanner::tokenize("'It''s Working!'"));

        let expected = Ok(vec![Token::Literal(Value::String(String::from("'")))]);
        assert_eq!(expected, Scanner::tokenize("''''"));

        let expected = Err(Error::UnterminatedStringLiteral);
        assert_eq!(expected, Scanner::tokenize("'''"));

        let expected = Ok(vec![
            Token::Literal(Value::String(String::from(""))),
            Token::Literal(Value::String(String::from(""))),
        ]);
        assert_eq!(expected, Scanner::tokenize("'' ''"));

        let expected = Ok(vec![Token::Literal(Value::String(String::from(
            "He's She's It's",
        )))]);
        assert_eq!(expected, Scanner::tokenize("'He''s She''s It''s'"));
    }
}