zen_expression/lexer/
lexer.rs

1use crate::lexer::codes::{is_token_type, token_type};
2use crate::lexer::cursor::{Cursor, CursorItem};
3use crate::lexer::error::LexerResult;
4use crate::lexer::token::{
5    Bracket, ComparisonOperator, Identifier, LogicalOperator, Operator, Token, TokenKind,
6};
7use crate::lexer::{LexerError, QuotationMark, TemplateString};
8use std::str::FromStr;
9
10#[derive(Debug, Default)]
11pub struct Lexer<'arena> {
12    tokens: Vec<Token<'arena>>,
13}
14
15impl<'arena> Lexer<'arena> {
16    pub fn new() -> Self {
17        Self::default()
18    }
19
20    pub fn tokenize(&mut self, source: &'arena str) -> LexerResult<&[Token<'arena>]> {
21        self.tokens.clear();
22
23        Scanner::new(source, &mut self.tokens).scan()?;
24        Ok(&self.tokens)
25    }
26}
27
28struct Scanner<'arena, 'self_ref> {
29    cursor: Cursor<'arena>,
30    tokens: &'self_ref mut Vec<Token<'arena>>,
31    source: &'arena str,
32}
33
34impl<'arena, 'self_ref> Scanner<'arena, 'self_ref> {
35    pub fn new(source: &'arena str, tokens: &'self_ref mut Vec<Token<'arena>>) -> Self {
36        Self {
37            cursor: Cursor::from(source),
38            source,
39            tokens,
40        }
41    }
42
43    pub fn scan(&mut self) -> LexerResult<()> {
44        while let Some(cursor_item) = self.cursor.peek() {
45            self.scan_cursor_item(cursor_item)?;
46        }
47
48        Ok(())
49    }
50
51    pub(crate) fn scan_cursor_item(&mut self, cursor_item: CursorItem) -> LexerResult<()> {
52        let (i, s) = cursor_item;
53
54        match s {
55            token_type!("space") => {
56                self.cursor.next();
57                Ok(())
58            }
59            '\'' => self.string(QuotationMark::SingleQuote),
60            '"' => self.string(QuotationMark::DoubleQuote),
61            token_type!("digit") => self.number(),
62            token_type!("bracket") => self.bracket(),
63            token_type!("cmp_operator") => self.cmp_operator(),
64            token_type!("operator") => self.operator(),
65            token_type!("question_mark") => self.question_mark(),
66            '`' => self.template_string(),
67            '.' => self.dot(),
68            token_type!("alpha") => self.identifier(),
69            _ => Err(LexerError::UnmatchedSymbol {
70                symbol: s,
71                position: i as u32,
72            }),
73        }
74    }
75
76    fn next(&self) -> LexerResult<CursorItem> {
77        self.cursor.next().ok_or_else(|| {
78            let (a, b) = self.cursor.peek_back().unwrap_or((0, ' '));
79
80            LexerError::UnexpectedEof {
81                symbol: b,
82                position: a as u32,
83            }
84        })
85    }
86
87    fn push(&mut self, token: Token<'arena>) {
88        self.tokens.push(token);
89    }
90
91    fn template_string(&mut self) -> LexerResult<()> {
92        let (start, _) = self.next()?;
93
94        self.tokens.push(Token {
95            kind: TokenKind::QuotationMark(QuotationMark::Backtick),
96            span: (start as u32, (start + 1) as u32),
97            value: QuotationMark::Backtick.into(),
98        });
99
100        let mut in_expression = false;
101        let mut str_start = start + 1;
102        loop {
103            let (e, c) = self.next()?;
104
105            match (c, in_expression) {
106                ('`', _) => {
107                    if str_start < e {
108                        self.tokens.push(Token {
109                            kind: TokenKind::Literal,
110                            span: (str_start as u32, e as u32),
111                            value: &self.source[str_start..e],
112                        });
113                    }
114
115                    self.tokens.push(Token {
116                        kind: TokenKind::QuotationMark(QuotationMark::Backtick),
117                        span: (e as u32, (e + 1) as u32),
118                        value: QuotationMark::Backtick.into(),
119                    });
120
121                    break;
122                }
123                ('$', false) => {
124                    in_expression = self.cursor.next_if_is("{");
125                    if in_expression {
126                        self.tokens.push(Token {
127                            kind: TokenKind::Literal,
128                            span: (str_start as u32, e as u32),
129                            value: &self.source[str_start..e],
130                        });
131
132                        self.tokens.push(Token {
133                            kind: TokenKind::TemplateString(TemplateString::ExpressionStart),
134                            span: (e as u32, (e + 2) as u32),
135                            value: TemplateString::ExpressionStart.into(),
136                        });
137                    }
138                }
139                ('}', true) => {
140                    in_expression = false;
141                    self.tokens.push(Token {
142                        kind: TokenKind::TemplateString(TemplateString::ExpressionEnd),
143                        span: (str_start as u32, e as u32),
144                        value: TemplateString::ExpressionEnd.into(),
145                    });
146
147                    str_start = e + 1;
148                }
149                (_, false) => {
150                    // Continue reading string
151                }
152                (_, true) => {
153                    self.cursor.back();
154                    self.scan_cursor_item((e, c))?;
155                }
156            }
157        }
158
159        Ok(())
160    }
161
162    fn string(&mut self, quote_kind: QuotationMark) -> LexerResult<()> {
163        let (start, opener) = self.next()?;
164        let end: usize;
165
166        loop {
167            let (e, c) = self.next()?;
168            if c == opener {
169                end = e;
170                break;
171            }
172        }
173
174        self.push(Token {
175            kind: TokenKind::QuotationMark(quote_kind),
176            span: (start as u32, (start + 1) as u32),
177            value: quote_kind.into(),
178        });
179
180        self.push(Token {
181            kind: TokenKind::Literal,
182            span: ((start + 1) as u32, end as u32),
183            value: &self.source[start + 1..end],
184        });
185
186        self.push(Token {
187            kind: TokenKind::QuotationMark(quote_kind),
188            span: (end as u32, (end + 1) as u32),
189            value: quote_kind.into(),
190        });
191
192        Ok(())
193    }
194
195    fn number(&mut self) -> LexerResult<()> {
196        let (start, _) = self.next()?;
197        let mut end = start;
198        let mut fractal = false;
199
200        while let Some((e, c)) = self
201            .cursor
202            .next_if(|c| is_token_type!(c, "digit") || c == '_' || c == '.')
203        {
204            if fractal && c == '.' {
205                self.cursor.back();
206                break;
207            }
208
209            if c == '.' {
210                if let Some((_, p)) = self.cursor.peek() {
211                    if p == '.' {
212                        self.cursor.back();
213                        break;
214                    }
215
216                    fractal = true
217                }
218            }
219
220            end = e;
221        }
222
223        if let Some((e_pos, _)) = self.cursor.next_if(|c| c == 'e') {
224            end = e_pos;
225
226            if let Some((sign_pos, _)) = self.cursor.next_if(|c| c == '+' || c == '-') {
227                end = sign_pos;
228            }
229
230            let mut has_exponent_digits = false;
231            while let Some((exp_pos, _)) = self.cursor.next_if(|c| is_token_type!(c, "digit")) {
232                end = exp_pos;
233                has_exponent_digits = true;
234            }
235
236            if !has_exponent_digits {
237                while self.cursor.position() > e_pos {
238                    self.cursor.back();
239                }
240
241                end = e_pos - 1;
242            }
243        }
244
245        self.push(Token {
246            kind: TokenKind::Number,
247            span: (start as u32, (end + 1) as u32),
248            value: &self.source[start..=end],
249        });
250
251        Ok(())
252    }
253
254    fn bracket(&mut self) -> LexerResult<()> {
255        let (start, _) = self.next()?;
256
257        let value = &self.source[start..=start];
258        let span = (start as u32, (start + 1) as u32);
259        self.push(Token {
260            kind: TokenKind::Bracket(Bracket::from_str(value).map_err(|_| {
261                LexerError::UnexpectedSymbol {
262                    symbol: value.to_string(),
263                    span,
264                }
265            })?),
266            span,
267            value,
268        });
269
270        Ok(())
271    }
272
273    fn dot(&mut self) -> LexerResult<()> {
274        let (start, _) = self.next()?;
275        let mut end = start;
276
277        if self.cursor.next_if(|c| c == '.').is_some() {
278            end += 1;
279        }
280
281        let value = &self.source[start..=end];
282        let span = (start as u32, (end + 1) as u32);
283        self.push(Token {
284            kind: TokenKind::Operator(Operator::from_str(value).map_err(|_| {
285                LexerError::UnexpectedSymbol {
286                    symbol: value.to_string(),
287                    span,
288                }
289            })?),
290            span,
291            value,
292        });
293
294        Ok(())
295    }
296
297    fn cmp_operator(&mut self) -> LexerResult<()> {
298        let (start, _) = self.next()?;
299        let mut end = start;
300
301        if self.cursor.next_if(|c| c == '=').is_some() {
302            end += 1;
303        }
304
305        let value = &self.source[start..=end];
306        self.push(Token {
307            kind: TokenKind::Operator(Operator::from_str(value).map_err(|_| {
308                LexerError::UnexpectedSymbol {
309                    symbol: value.to_string(),
310                    span: (start as u32, (end + 1) as u32),
311                }
312            })?),
313            span: (start as u32, (end + 1) as u32),
314            value,
315        });
316
317        Ok(())
318    }
319
320    fn question_mark(&mut self) -> LexerResult<()> {
321        let (start, _) = self.next()?;
322        let mut kind = TokenKind::Operator(Operator::QuestionMark);
323        let mut end = start;
324
325        if self.cursor.next_if(|c| c == '?').is_some() {
326            kind = TokenKind::Operator(Operator::Logical(LogicalOperator::NullishCoalescing));
327            end += 1;
328        }
329
330        let value = &self.source[start..=end];
331        self.push(Token {
332            kind,
333            value,
334            span: (start as u32, (end + 1) as u32),
335        });
336
337        Ok(())
338    }
339
340    fn operator(&mut self) -> LexerResult<()> {
341        let (start, _) = self.next()?;
342
343        let value = &self.source[start..=start];
344        let span = (start as u32, (start + 1) as u32);
345        self.push(Token {
346            kind: TokenKind::Operator(Operator::from_str(value).map_err(|_| {
347                LexerError::UnexpectedSymbol {
348                    symbol: value.to_string(),
349                    span,
350                }
351            })?),
352            span,
353            value,
354        });
355
356        Ok(())
357    }
358
359    fn not(&mut self, start: usize) -> LexerResult<()> {
360        if self.cursor.next_if_is(" in ") {
361            let end = self.cursor.position();
362
363            self.push(Token {
364                kind: TokenKind::Operator(Operator::Comparison(ComparisonOperator::NotIn)),
365                span: (start as u32, (end - 1) as u32),
366                value: "not in",
367            })
368        } else {
369            let end = self.cursor.position();
370
371            self.push(Token {
372                kind: TokenKind::Operator(Operator::Logical(LogicalOperator::Not)),
373                span: (start as u32, end as u32),
374                value: "not",
375            })
376        }
377
378        Ok(())
379    }
380
381    fn identifier(&mut self) -> LexerResult<()> {
382        let (start, _) = self.next()?;
383        let mut end = start;
384
385        while let Some((e, _)) = self.cursor.next_if(|c| is_token_type!(c, "alphanumeric")) {
386            end = e;
387        }
388
389        let value = &self.source[start..=end];
390        match value {
391            "and" => self.push(Token {
392                kind: TokenKind::Operator(Operator::Logical(LogicalOperator::And)),
393                span: (start as u32, (end + 1) as u32),
394                value,
395            }),
396            "or" => self.push(Token {
397                kind: TokenKind::Operator(Operator::Logical(LogicalOperator::Or)),
398                span: (start as u32, (end + 1) as u32),
399                value,
400            }),
401            "in" => self.push(Token {
402                kind: TokenKind::Operator(Operator::Comparison(ComparisonOperator::In)),
403                span: (start as u32, (end + 1) as u32),
404                value,
405            }),
406            "true" => self.push(Token {
407                kind: TokenKind::Boolean(true),
408                span: (start as u32, (end + 1) as u32),
409                value,
410            }),
411            "false" => self.push(Token {
412                kind: TokenKind::Boolean(false),
413                span: (start as u32, (end + 1) as u32),
414                value,
415            }),
416            "not" => self.not(start)?,
417            _ => self.push(Token {
418                kind: Identifier::try_from(value)
419                    .map(|identifier| TokenKind::Identifier(identifier))
420                    .unwrap_or(TokenKind::Literal),
421                span: (start as u32, (end + 1) as u32),
422                value,
423            }),
424        }
425
426        Ok(())
427    }
428}