tenda_scanner/
scanner.rs

1use tenda_common::source::IdentifiedSource;
2
3use crate::scanner_error::LexicalError;
4use crate::source_iter::SourceIter;
5use crate::token::{Literal, Token, TokenKind};
6use std::char;
7
8pub struct Scanner<'a> {
9    source: SourceIter<'a>,
10}
11
12impl<'a> Scanner<'a> {
13    pub fn new(source: &'a str, source_id: IdentifiedSource) -> Scanner<'a> {
14        Scanner {
15            source: SourceIter::new(source, source_id),
16        }
17    }
18
19    pub fn scan(&mut self) -> Result<Vec<Token>, Vec<LexicalError>> {
20        let mut tokens: Vec<Token> = Vec::new();
21        let mut errors = Vec::new();
22        let mut had_error = false;
23
24        while let Some(c) = self.source.next() {
25            let token = self.consume_token(c, tokens.last());
26
27            match token {
28                Ok(Some(value)) => {
29                    had_error = false;
30                    tokens.push(value)
31                }
32                Err(err) if !had_error => {
33                    had_error = true;
34                    errors.push(err);
35                }
36                _ => (),
37            };
38        }
39
40        tokens.push(self.source.consume_eof());
41
42        if errors.is_empty() {
43            Ok(tokens)
44        } else {
45            Err(errors)
46        }
47    }
48
49    fn consume_token(
50        &mut self,
51        char: char,
52        previous_token: Option<&Token>,
53    ) -> Result<Option<Token>, LexicalError> {
54        match char {
55            '\n' => match previous_token {
56                Some(token) if token.kind != TokenKind::Newline => {
57                    self.source.consume_token(TokenKind::Newline, "\n").into()
58                }
59                _ => {
60                    self.source.ignore_char();
61                    Ok(None)
62                }
63            },
64            c if c.is_whitespace() => {
65                self.source.ignore_char();
66                Ok(None)
67            }
68            '(' => self.source.consume_token(TokenKind::LeftParen, "(").into(),
69            ')' => self.source.consume_token(TokenKind::RightParen, ")").into(),
70            '[' => self
71                .source
72                .consume_token(TokenKind::LeftBracket, "[")
73                .into(),
74            ']' => self
75                .source
76                .consume_token(TokenKind::RightBracket, "]")
77                .into(),
78            '{' => self.source.consume_token(TokenKind::LeftBrace, "{").into(),
79            '}' => self.source.consume_token(TokenKind::RightBrace, "}").into(),
80            ':' => self.source.consume_token(TokenKind::Colon, ":").into(),
81            '+' => self.source.consume_token(TokenKind::Plus, "+").into(),
82            '-' => {
83                if let Some('>') = self.source.peek() {
84                    self.source.next();
85                    self.source.consume_token(TokenKind::Arrow, "->").into()
86                } else {
87                    self.source.consume_token(TokenKind::Minus, "-").into()
88                }
89            }
90            '*' => self.source.consume_token(TokenKind::Star, "*").into(),
91            '^' => self.source.consume_token(TokenKind::Caret, "^").into(),
92            '%' => self.source.consume_token(TokenKind::Percent, "%").into(),
93            '=' => self.source.consume_token(TokenKind::EqualSign, "=").into(),
94            '"' => self.consume_string(char).map(Some),
95            ',' => self.source.consume_token(TokenKind::Comma, ",").into(),
96            '.' => self.source.consume_token(TokenKind::Dot, ".").into(),
97            '>' => match self.source.peek() {
98                Some('=') => {
99                    self.source.next();
100                    self.source
101                        .consume_token(TokenKind::GreaterOrEqual, ">")
102                        .into()
103                }
104                _ => self.source.consume_token(TokenKind::Greater, ">").into(),
105            },
106            '<' => match self.source.peek() {
107                Some('=') => {
108                    self.source.next();
109                    self.source
110                        .consume_token(TokenKind::LessOrEqual, "<")
111                        .into()
112                }
113                _ => self.source.consume_token(TokenKind::Less, "<").into(),
114            },
115            c if c.is_ascii_digit() => self.consume_number(c).map(Some),
116            c if c.is_alphabetic() || c == '_' => self.consume_identifier(c).map(Some),
117            '/' => match self.source.peek() {
118                Some('/') => {
119                    self.consume_comment();
120                    Ok(None)
121                }
122                Some('*') => {
123                    self.consume_multiline_comment();
124                    Ok(None)
125                }
126                _ => self.source.consume_token(TokenKind::Slash, "/").into(),
127            },
128            _ => Err(LexicalError::UnexpectedChar {
129                character: char,
130                span: self.source.consume_span(),
131            }),
132        }
133    }
134
135    fn consume_string(&mut self, first_quote: char) -> Result<Token, LexicalError> {
136        let mut buf = String::new();
137        let mut closed = false;
138
139        buf.push(first_quote);
140
141        while let Some(&ch) = self.source.peek() {
142            match ch {
143                '"' => {
144                    self.source.next();
145                    closed = true;
146                    break;
147                }
148                '\n' => {
149                    return Err(LexicalError::UnexpectedStringEol {
150                        span: self.source.consume_span(),
151                    });
152                }
153                '\\' => {
154                    self.source.next();
155
156                    let esc = self
157                        .source
158                        .next()
159                        .ok_or(LexicalError::UnexpectedStringEol {
160                            span: self.source.consume_span(),
161                        })?;
162
163                    let resolved = match esc {
164                        '0' => Some('\0'),
165                        'a' => Some('\x07'),
166                        'b' => Some('\x08'),
167                        'e' => Some('\x1B'),
168                        'f' => Some('\x0C'),
169                        'n' => Some('\n'),
170                        'r' => Some('\r'),
171                        't' => Some('\t'),
172                        'v' => Some('\x0B'),
173                        '\\' => Some('\\'),
174                        '\'' => Some('\''),
175                        '"' => Some('"'),
176                        'x' => {
177                            let hi = self.read_hex_digit()?;
178                            let lo = self.read_hex_digit()?;
179                            Some(char::from(
180                                u8::from_str_radix(&format!("{hi}{lo}"), 16).unwrap(),
181                            ))
182                        }
183                        'u' => {
184                            let code = self.read_n_hex(4)?;
185                            char::from_u32(code)
186                        }
187                        'U' => {
188                            let code = self.read_n_hex(8)?;
189                            char::from_u32(code)
190                        }
191                        d @ '1'..='7' => {
192                            let d2 = self.read_octal_digit()?;
193                            let d3 = self.read_octal_digit()?;
194                            let val = u8::from_str_radix(&format!("{d}{d2}{d3}"), 8).unwrap();
195                            Some(char::from(val))
196                        }
197                        _ => {
198                            return Err(LexicalError::UnknownEscape {
199                                span: self.source.consume_span(),
200                                found: esc,
201                            })
202                        }
203                    };
204
205                    if let Some(c) = resolved {
206                        buf.push(c);
207                    } else {
208                        return Err(LexicalError::InvalidUnicodeEscape {
209                            span: self.source.consume_span(),
210                        });
211                    }
212                }
213                _ => {
214                    buf.push(ch);
215                    self.source.next();
216                }
217            }
218        }
219
220        if !closed {
221            return Err(LexicalError::UnexpectedStringEol {
222                span: self.source.consume_span(),
223            });
224        }
225
226        let literal = buf[1..].to_owned();
227
228        Ok(self.source.consume_token_with_literal(
229            TokenKind::String,
230            literal.clone(),
231            Literal::String(literal),
232        ))
233    }
234
235    fn consume_number(&mut self, first: char) -> Result<Token, LexicalError> {
236        let mut raw = String::new();
237        raw.push(first);
238
239        if first == '0' {
240            if let Some(&next) = self.source.peek() {
241                match next {
242                    'b' | 'B' | 'o' | 'O' | 'x' | 'X' => {
243                        self.source.next();
244                        raw.push(next);
245
246                        let (radix, valid_digit): (u32, fn(char) -> bool) = match next {
247                            'b' | 'B' => (2, |c: char| c == '0' || c == '1'),
248                            'o' | 'O' => (8, |c: char| ('0'..='7').contains(&c)),
249                            'x' | 'X' => (16, |c: char| c.is_ascii_hexdigit()),
250                            _ => unreachable!(),
251                        };
252
253                        let mut digits = String::new();
254
255                        while let Some(&ch) = self.source.peek() {
256                            if ch == '_' {
257                                self.source.next();
258                                continue;
259                            }
260                            if valid_digit(ch) {
261                                digits.push(ch);
262                                raw.push(ch);
263                                self.source.next();
264                            } else {
265                                break;
266                            }
267                        }
268
269                        if digits.is_empty() {
270                            return Err(LexicalError::UnexpectedChar {
271                                character: next,
272                                span: self.source.consume_span(),
273                            });
274                        }
275
276                        let value = u64::from_str_radix(&digits, radix).unwrap() as f64;
277
278                        return Ok(self.source.consume_token_with_literal(
279                            TokenKind::Number,
280                            raw,
281                            Literal::Number(value),
282                        ));
283                    }
284                    _ => (),
285                }
286            }
287        }
288
289        let mut matched_dot = first == '.';
290        let mut matched_exp = false;
291
292        while let Some(&ch) = self.source.peek() {
293            match ch {
294                '_' => {
295                    raw.push(ch);
296                    self.source.next();
297                }
298                d if d.is_ascii_digit() => {
299                    raw.push(d);
300                    self.source.next();
301                }
302                '.' if !matched_dot && !matched_exp => {
303                    matched_dot = true;
304                    raw.push('.');
305                    self.source.next();
306                }
307                'e' | 'E' if !matched_exp => {
308                    matched_exp = true;
309                    raw.push(ch);
310                    self.source.next();
311
312                    if let Some(&sign @ ('+' | '-')) = self.source.peek() {
313                        raw.push(sign);
314                        self.source.next();
315                    }
316                }
317                c if c.is_alphabetic() => {
318                    return Err(LexicalError::UnexpectedChar {
319                        character: c,
320                        span: self.source.consume_span(),
321                    });
322                }
323
324                _ => break,
325            }
326        }
327
328        let cleaned: String = raw.chars().filter(|c| *c != '_').collect();
329        let value: f64 = cleaned.parse().unwrap();
330
331        Ok(self
332            .source
333            .consume_token_with_literal(TokenKind::Number, raw, Literal::Number(value)))
334    }
335
336    fn consume_identifier(&mut self, char: char) -> Result<Token, LexicalError> {
337        let mut identifier = String::new();
338
339        identifier.push(char);
340
341        while let Some(&peeked) = self.source.peek() {
342            if peeked.is_alphanumeric() || peeked == '_' {
343                identifier.push(peeked);
344                self.source.next();
345            } else {
346                break;
347            }
348        }
349
350        let token = match identifier.as_str() {
351            Literal::TRUE_LITERAL => self.source.consume_token_with_literal(
352                TokenKind::True,
353                Literal::TRUE_LITERAL.to_string(),
354                Literal::Boolean(true),
355            ),
356            Literal::FALSE_LITERAL => self.source.consume_token_with_literal(
357                TokenKind::False,
358                Literal::FALSE_LITERAL.to_string(),
359                Literal::Boolean(false),
360            ),
361            Literal::NIL_LITERAL => self.source.consume_token_with_literal(
362                TokenKind::Nil,
363                Literal::NIL_LITERAL.to_string(),
364                Literal::Nil,
365            ),
366            "função" => self.source.consume_token(TokenKind::Function, "função"),
367            "não" => self.source.consume_token(TokenKind::Not, "não"),
368            "é" => self.source.consume_token(TokenKind::Equals, "é"),
369            "seja" => self.source.consume_token(TokenKind::Let, "seja"),
370            "se" => self.source.consume_token(TokenKind::If, "se"),
371            "então" => self.source.consume_token(TokenKind::Then, "então"),
372            "retorna" => self.source.consume_token(TokenKind::Return, "retorna"),
373            "senão" => self.source.consume_token(TokenKind::Else, "senão"),
374            "fim" => self.source.consume_token(TokenKind::BlockEnd, "fim"),
375            "ou" => self.source.consume_token(TokenKind::Or, "ou"),
376            "e" => self.source.consume_token(TokenKind::And, "e"),
377            "até" => self.source.consume_token(TokenKind::Until, "até"),
378            "para" => self.source.consume_token(TokenKind::ForOrBreak, "para"),
379            "cada" => self.source.consume_token(TokenKind::Each, "cada"),
380            "em" => self.source.consume_token(TokenKind::In, "em"),
381            "tem" => self.source.consume_token(TokenKind::Has, "tem"),
382            "enquanto" => self.source.consume_token(TokenKind::While, "enquanto"),
383            "faça" => self.source.consume_token(TokenKind::Do, "faça"),
384            "continua" => self.source.consume_token(TokenKind::Continue, "continua"),
385            identifier => self.source.consume_token_with_literal(
386                TokenKind::Identifier,
387                identifier.to_string(),
388                Literal::String(identifier.to_string()),
389            ),
390        };
391
392        Ok(token)
393    }
394
395    fn consume_comment(&mut self) {
396        while let Some(&peeked) = self.source.peek() {
397            if peeked == '\n' {
398                break;
399            }
400
401            self.source.next();
402        }
403
404        self.source.ignore_char();
405    }
406
407    fn consume_multiline_comment(&mut self) {
408        while let Some(_) = self.source.next() {
409            if self.peek_match("*/") {
410                break;
411            }
412        }
413
414        self.source.ignore_char();
415    }
416}
417
418impl Scanner<'_> {
419    fn peek_match(&mut self, expected: &str) -> bool {
420        for c in expected.chars() {
421            if let Some(&peeked) = self.source.peek() {
422                if peeked != c {
423                    return false;
424                }
425
426                self.source.next();
427            }
428        }
429
430        true
431    }
432
433    fn read_hex_digit(&mut self) -> Result<char, LexicalError> {
434        self.source
435            .next()
436            .filter(|c| c.is_ascii_hexdigit())
437            .ok_or(LexicalError::InvalidHexEscape {
438                span: self.source.consume_span(),
439            })
440    }
441
442    fn read_n_hex(&mut self, n: usize) -> Result<u32, LexicalError> {
443        let mut s = String::new();
444        for _ in 0..n {
445            s.push(self.read_hex_digit()?);
446        }
447        u32::from_str_radix(&s, 16).map_err(|_| LexicalError::InvalidHexEscape {
448            span: self.source.consume_span(),
449        })
450    }
451
452    fn read_octal_digit(&mut self) -> Result<char, LexicalError> {
453        self.source
454            .next()
455            .filter(|c| ('0'..='7').contains(c))
456            .ok_or(LexicalError::InvalidOctalEscape {
457                span: self.source.consume_span(),
458            })
459    }
460}