// symboscript_lexer/lexer/mod.rs

1use std::str::Chars;
2use symboscript_types::lexer::{Token, TokenKind, TokenValue};
3use symboscript_utils::report_error;
4
/// Lexer state for one source text.
///
/// Holds the text being lexed together with a cursor over the characters
/// that have not been consumed yet.
pub struct Lexer<'a> {
    /// Path of the file the source came from; passed to error reports.
    path: &'a str,

    /// The complete source text. Token offsets are byte offsets into it.
    source: &'a str,

    /// Cursor over the characters that are still to be lexed.
    chars: Chars<'a>,

    /// When `true`, comments are produced as tokens instead of being skipped.
    comment: bool,
}
18
19impl<'a> Lexer<'a> {
20    pub fn new(path: &'a str, source: &'a str, comment: bool) -> Self {
21        Self {
22            path,
23            source,
24            chars: source.chars(),
25            comment,
26        }
27    }
28
29    pub fn tokenize(&mut self) -> Vec<Token> {
30        let mut tokens = Vec::new();
31
32        loop {
33            let token = self.next_token();
34            if token.kind == TokenKind::Eof {
35                break;
36            }
37            tokens.push(token);
38        }
39
40        tokens
41    }
42
43    pub fn next_token(&mut self) -> Token {
44        self.skip_trivia();
45        let start = self.offset();
46        let mut kind = self.next_kind();
47        let end = self.offset();
48
49        if kind == TokenKind::Skip {
50            return self.next_token();
51        }
52
53        let s = self.source[start..end].to_owned();
54
55        let mut value = TokenValue::None;
56
57        match kind {
58            TokenKind::Number => {
59                value = TokenValue::Number(s.parse::<f64>().unwrap_or_default());
60            }
61
62            TokenKind::Identifier => {
63                kind = self.match_keyword(&s);
64
65                if kind == TokenKind::Identifier {
66                    value = TokenValue::Identifier(s);
67                }
68            }
69
70            TokenKind::Str => {
71                value = TokenValue::Str(s[1..s.len() - 1].to_string().replace("\\n", "\n"));
72            }
73
74            TokenKind::DocComment => value = TokenValue::Str(s),
75
76            TokenKind::Unexpected => {
77                report_error(self.path, self.source, "Unexpected token", start, end)
78            }
79            _ => {}
80        };
81
82        Token {
83            kind,
84            start,
85            end,
86            value,
87        }
88    }
89
90    fn next_kind(&mut self) -> TokenKind {
91        while let Some(c) = self.next() {
92            match c {
93                '#' => return self.read_comment(),
94
95                ';' => return TokenKind::Semicolon,
96                ',' => return TokenKind::Comma,
97                ':' => return self.read_one_more('=', TokenKind::FormulaAssign, TokenKind::Colon),
98                '.' => return self.read_dot(),
99
100                '+' => {
101                    return self.read_one_more_variants(
102                        TokenKind::Plus,
103                        &['=', '+'],
104                        &[TokenKind::PlusAssign, TokenKind::PlusPlus],
105                    )
106                }
107                '-' => {
108                    return self.read_one_more_variants(
109                        TokenKind::Minus,
110                        &['=', '-'],
111                        &[TokenKind::MinusAssign, TokenKind::MinusMinus],
112                    )
113                }
114                '*' => return self.read_one_more('=', TokenKind::MultiplyAssign, TokenKind::Star),
115                '/' => return self.read_one_more('=', TokenKind::DivideAssign, TokenKind::Slash),
116                '^' => return self.read_one_more('=', TokenKind::PowerAssign, TokenKind::Caret),
117                '%' => return self.read_one_more('=', TokenKind::ModuloAssign, TokenKind::Modulo),
118
119                '&' => {
120                    return self.read_one_more(
121                        '&',
122                        TokenKind::AmpersandAmpersand,
123                        TokenKind::Ampersand,
124                    )
125                }
126                '|' => return self.read_one_more('|', TokenKind::PipePipe, TokenKind::Pipe),
127                '~' => return TokenKind::Tilde,
128                '?' => return TokenKind::Question,
129
130                '=' => return self.read_one_more('=', TokenKind::Equal, TokenKind::Assign),
131                '!' => {
132                    return self.read_one_more('=', TokenKind::NotEqual, TokenKind::ExclamationMark)
133                }
134                '<' => {
135                    return self.read_one_more_variants(
136                        TokenKind::Less,
137                        &['=', '<'],
138                        &[TokenKind::LessEqual, TokenKind::BitLeftShift],
139                    )
140                }
141                '>' => {
142                    return self.read_one_more_variants(
143                        TokenKind::Greater,
144                        &['=', '>'],
145                        &[TokenKind::GreaterEqual, TokenKind::BitRightShift],
146                    )
147                }
148
149                '(' => return TokenKind::LParen,
150                ')' => return TokenKind::RParen,
151                '{' => return TokenKind::LAngle,
152                '}' => return TokenKind::RAngle,
153                '[' => return TokenKind::LSquare,
154                ']' => return TokenKind::RSquare,
155
156                'a'..='z' | 'A'..='Z' | '_' => return self.read_identifier(),
157
158                '0'..='9' => return self.read_number(),
159                '"' | '\'' | '`' => return self.read_string(c),
160
161                _ => return TokenKind::Unexpected,
162            };
163        }
164        TokenKind::Eof
165    }
166
167    fn match_keyword(&self, ident: &str) -> TokenKind {
168        // all keywords are 1 <= length <= 10
169        if ident.len() == 1 || ident.len() > 10 {
170            return TokenKind::Identifier;
171        }
172
173        match ident {
174            "true" => TokenKind::True,
175            "false" => TokenKind::False,
176            "None" => TokenKind::None,
177
178            "if" => TokenKind::If,
179            "else" => TokenKind::Else,
180            "while" => TokenKind::While,
181            "loop" => TokenKind::Loop,
182            "for" => TokenKind::For,
183            "let" => TokenKind::Let,
184            "fn" => TokenKind::Function,
185            "scope" => TokenKind::Scope,
186            "return" => TokenKind::Return,
187            "yield" => TokenKind::Yield,
188            "break" => TokenKind::Break,
189            "continue" => TokenKind::Continue,
190            "in" => TokenKind::In,
191            "of" => TokenKind::Of,
192            "delete" => TokenKind::Delete,
193
194            "throw" => TokenKind::Throw,
195
196            "import" => TokenKind::Import,
197            "as" => TokenKind::As,
198            "context" => TokenKind::Context,
199
200            "async" => TokenKind::Async,
201            "await" => TokenKind::Await,
202
203            "block" => TokenKind::Block,
204
205            "mut" => TokenKind::Mut,
206
207            // ---Keyword2Operator---
208            "band" => TokenKind::Ampersand,
209            "bxor" => TokenKind::BitXor,
210            "bor" => TokenKind::Pipe,
211            "bnot" => TokenKind::Tilde,
212            "bshl" => TokenKind::BitLeftShift,
213            "bshr" => TokenKind::BitRightShift,
214
215            "xor" => TokenKind::Xor,
216            "and" => TokenKind::AmpersandAmpersand,
217            "or" => TokenKind::PipePipe,
218            "not" => TokenKind::ExclamationMark,
219            //---Keyword2Operator---
220
221            //
222            _ => TokenKind::Identifier,
223        }
224    }
225
226    fn skip_trivia(&mut self) {
227        while let Some(c) = self.peek() {
228            match c {
229                ' ' | '\t' | '\n' | '\r' => {
230                    self.next();
231                }
232                _ => break,
233            }
234        }
235    }
236
237    fn read_dot(&mut self) -> TokenKind {
238        if self.peek() == Some('.') {
239            self.next();
240            return TokenKind::Range;
241        } else if ("0"..="9").contains(&self.peek().unwrap_or_default().to_string().as_str()) {
242            return self.read_number();
243        }
244        return TokenKind::Dot;
245    }
246
247    fn read_number(&mut self) -> TokenKind {
248        while let Some(c) = self.peek() {
249            match c {
250                '0'..='9' => {
251                    self.next();
252                }
253                '.' | 'e' | 'E' => {
254                    if let Some(c) = self.peek_two() {
255                        match c {
256                            '0'..='9' => {
257                                self.next();
258                                self.next();
259                            }
260                            _ => {
261                                break;
262                            }
263                        }
264                    } else {
265                        break;
266                    }
267                }
268                _ => break,
269            };
270        }
271
272        TokenKind::Number
273    }
274
275    fn read_comment(&mut self) -> TokenKind {
276        if self.eat('/') {
277            while let Some(c) = self.peek() {
278                self.next();
279                if c == '/' {
280                    if self.eat('#') {
281                        if self.comment {
282                            return TokenKind::DocComment;
283                        } else {
284                            return TokenKind::Skip;
285                        }
286                    }
287                }
288            }
289        }
290
291        while let Some(c) = self.peek() {
292            match c {
293                '\n' => {
294                    self.next();
295                    break;
296                }
297                _ => {
298                    self.next();
299                }
300            };
301        }
302
303        if self.comment {
304            TokenKind::Comment
305        } else {
306            TokenKind::Skip
307        }
308    }
309
310    fn read_string(&mut self, init_char: char) -> TokenKind {
311        while let Some(c) = self.peek() {
312            match c {
313                c if c == init_char => {
314                    self.next();
315                    return TokenKind::Str;
316                }
317                '\\' => {
318                    self.next();
319                    self.next();
320                }
321                _ => {
322                    self.next();
323                }
324            };
325        }
326        TokenKind::Unexpected
327    }
328
329    fn read_identifier(&mut self) -> TokenKind {
330        while let Some(c) = self.peek() {
331            match c {
332                'a'..='z' | 'A'..='Z' | '_' | '0'..='9' => {
333                    self.next();
334                }
335                _ => break,
336            };
337        }
338
339        TokenKind::Identifier
340    }
341
342    fn read_one_more(
343        &mut self,
344        ch: char,
345        kind_expected: TokenKind,
346        kind_unexpected: TokenKind,
347    ) -> TokenKind {
348        match self.peek() {
349            Some(c) if c == ch => {
350                self.next();
351                return kind_expected;
352            }
353            _ => return kind_unexpected,
354        }
355    }
356
357    fn read_one_more_variants(
358        &mut self,
359        kind_unexpected: TokenKind,
360        char_expected: &[char],
361        kind_expected: &[TokenKind],
362    ) -> TokenKind {
363        match self.peek() {
364            Some(c) if char_expected.contains(&c) => {
365                self.next();
366                return kind_expected[char_expected.iter().position(|&x| x == c).unwrap()];
367            }
368
369            _ => return kind_unexpected,
370        }
371    }
372
373    /// Get the length offset from the source text, in UTF-8 bytes
374    fn offset(&self) -> usize {
375        self.source.len() - self.chars.as_str().len()
376    }
377
378    fn peek(&self) -> Option<char> {
379        self.chars.as_str().chars().next()
380    }
381
382    fn peek_two(&self) -> Option<char> {
383        let mut new_chars = self.chars.as_str().chars();
384        new_chars.next();
385        new_chars.next()
386    }
387
388    fn eat(&mut self, ch: char) -> bool {
389        if self.peek() == Some(ch) {
390            self.next();
391            true
392        } else {
393            false
394        }
395    }
396
397    fn next(&mut self) -> Option<char> {
398        self.chars.next()
399    }
400}