mf_expression/lexer/
lexer.rs

1use crate::lexer::codes::{is_token_type, token_type};
2use crate::lexer::cursor::{Cursor, CursorItem};
3use crate::lexer::error::LexerResult;
4use crate::lexer::token::{
5    Bracket, ComparisonOperator, Identifier, LogicalOperator, Operator, Token,
6    TokenKind,
7};
8use crate::lexer::{LexerError, QuotationMark, TemplateString};
9use std::str::FromStr;
10
11#[derive(Debug, Default)]
12pub struct Lexer<'arena> {
13    tokens: Vec<Token<'arena>>,
14}
15
16impl<'arena> Lexer<'arena> {
17    pub fn new() -> Self {
18        Self::default()
19    }
20
21    pub fn tokenize(
22        &mut self,
23        source: &'arena str,
24    ) -> LexerResult<&[Token<'arena>]> {
25        self.tokens.clear();
26
27        Scanner::new(source, &mut self.tokens).scan()?;
28        Ok(&self.tokens)
29    }
30}
31
32struct Scanner<'arena, 'self_ref> {
33    cursor: Cursor<'arena>,
34    tokens: &'self_ref mut Vec<Token<'arena>>,
35    source: &'arena str,
36}
37
38impl<'arena, 'self_ref> Scanner<'arena, 'self_ref> {
39    pub fn new(
40        source: &'arena str,
41        tokens: &'self_ref mut Vec<Token<'arena>>,
42    ) -> Self {
43        Self { cursor: Cursor::from(source), source, tokens }
44    }
45
46    pub fn scan(&mut self) -> LexerResult<()> {
47        while let Some(cursor_item) = self.cursor.peek() {
48            self.scan_cursor_item(cursor_item)?;
49        }
50
51        Ok(())
52    }
53
54    pub(crate) fn scan_cursor_item(
55        &mut self,
56        cursor_item: CursorItem,
57    ) -> LexerResult<()> {
58        let (i, s) = cursor_item;
59
60        match s {
61            token_type!("space") => {
62                self.cursor.next();
63                Ok(())
64            },
65            '\'' => self.string(QuotationMark::SingleQuote),
66            '"' => self.string(QuotationMark::DoubleQuote),
67            token_type!("digit") => self.number(),
68            token_type!("bracket") => self.bracket(),
69            token_type!("cmp_operator") => self.cmp_operator(),
70            token_type!("operator") => self.operator(),
71            token_type!("question_mark") => self.question_mark(),
72            '=' => self.equals(),
73            '`' => self.template_string(),
74            '.' => self.dot(),
75            ';' => self.semi(),
76            token_type!("alpha") => self.identifier(),
77            _ => Err(LexerError::UnmatchedSymbol {
78                symbol: s,
79                position: i as u32,
80            }),
81        }
82    }
83
84    fn next(&self) -> LexerResult<CursorItem> {
85        self.cursor.next().ok_or_else(|| {
86            let (a, b) = self.cursor.peek_back().unwrap_or((0, ' '));
87
88            LexerError::UnexpectedEof { symbol: b, position: a as u32 }
89        })
90    }
91
92    fn push(
93        &mut self,
94        token: Token<'arena>,
95    ) {
96        self.tokens.push(token);
97    }
98
99    fn template_string(&mut self) -> LexerResult<()> {
100        let (start, _) = self.next()?;
101
102        self.tokens.push(Token {
103            kind: TokenKind::QuotationMark(QuotationMark::Backtick),
104            span: (start as u32, (start + 1) as u32),
105            value: QuotationMark::Backtick.into(),
106        });
107
108        let mut in_expression = false;
109        let mut str_start = start + 1;
110        loop {
111            let (e, c) = self.next()?;
112
113            match (c, in_expression) {
114                ('`', _) => {
115                    if str_start < e {
116                        self.tokens.push(Token {
117                            kind: TokenKind::Literal,
118                            span: (str_start as u32, e as u32),
119                            value: &self.source[str_start..e],
120                        });
121                    }
122
123                    self.tokens.push(Token {
124                        kind: TokenKind::QuotationMark(QuotationMark::Backtick),
125                        span: (e as u32, (e + 1) as u32),
126                        value: QuotationMark::Backtick.into(),
127                    });
128
129                    break;
130                },
131                ('$', false) => {
132                    in_expression = self.cursor.next_if_is("{");
133                    if in_expression {
134                        self.tokens.push(Token {
135                            kind: TokenKind::Literal,
136                            span: (str_start as u32, e as u32),
137                            value: &self.source[str_start..e],
138                        });
139
140                        self.tokens.push(Token {
141                            kind: TokenKind::TemplateString(
142                                TemplateString::ExpressionStart,
143                            ),
144                            span: (e as u32, (e + 2) as u32),
145                            value: TemplateString::ExpressionStart.into(),
146                        });
147                    }
148                },
149                ('}', true) => {
150                    in_expression = false;
151                    self.tokens.push(Token {
152                        kind: TokenKind::TemplateString(
153                            TemplateString::ExpressionEnd,
154                        ),
155                        span: (str_start as u32, e as u32),
156                        value: TemplateString::ExpressionEnd.into(),
157                    });
158
159                    str_start = e + 1;
160                },
161                (_, false) => {
162                    // Continue reading string
163                },
164                (_, true) => {
165                    self.cursor.back();
166                    self.scan_cursor_item((e, c))?;
167                },
168            }
169        }
170
171        Ok(())
172    }
173
174    fn string(
175        &mut self,
176        quote_kind: QuotationMark,
177    ) -> LexerResult<()> {
178        let (start, opener) = self.next()?;
179        let end: usize;
180
181        loop {
182            let (e, c) = self.next()?;
183            if c == opener {
184                end = e;
185                break;
186            }
187        }
188
189        self.push(Token {
190            kind: TokenKind::QuotationMark(quote_kind),
191            span: (start as u32, (start + 1) as u32),
192            value: quote_kind.into(),
193        });
194
195        self.push(Token {
196            kind: TokenKind::Literal,
197            span: ((start + 1) as u32, end as u32),
198            value: &self.source[start + 1..end],
199        });
200
201        self.push(Token {
202            kind: TokenKind::QuotationMark(quote_kind),
203            span: (end as u32, (end + 1) as u32),
204            value: quote_kind.into(),
205        });
206
207        Ok(())
208    }
209
210    fn number(&mut self) -> LexerResult<()> {
211        let (start, _) = self.next()?;
212        let mut end = start;
213        let mut fractal = false;
214
215        while let Some((e, c)) = self
216            .cursor
217            .next_if(|c| is_token_type!(c, "digit") || c == '_' || c == '.')
218        {
219            if fractal && c == '.' {
220                self.cursor.back();
221                break;
222            }
223
224            if c == '.' {
225                if let Some((_, p)) = self.cursor.peek() {
226                    if p == '.' {
227                        self.cursor.back();
228                        break;
229                    }
230
231                    fractal = true
232                }
233            }
234
235            end = e;
236        }
237
238        if let Some((e_pos, _)) = self.cursor.next_if(|c| c == 'e') {
239            end = e_pos;
240
241            if let Some((sign_pos, _)) =
242                self.cursor.next_if(|c| c == '+' || c == '-')
243            {
244                end = sign_pos;
245            }
246
247            let mut has_exponent_digits = false;
248            while let Some((exp_pos, _)) =
249                self.cursor.next_if(|c| is_token_type!(c, "digit"))
250            {
251                end = exp_pos;
252                has_exponent_digits = true;
253            }
254
255            if !has_exponent_digits {
256                while self.cursor.position() > e_pos {
257                    self.cursor.back();
258                }
259
260                end = e_pos - 1;
261            }
262        }
263
264        self.push(Token {
265            kind: TokenKind::Number,
266            span: (start as u32, (end + 1) as u32),
267            value: &self.source[start..=end],
268        });
269
270        Ok(())
271    }
272
273    fn bracket(&mut self) -> LexerResult<()> {
274        let (start, _) = self.next()?;
275
276        let value = &self.source[start..=start];
277        let span = (start as u32, (start + 1) as u32);
278        self.push(Token {
279            kind: TokenKind::Bracket(Bracket::from_str(value).map_err(
280                |_| LexerError::UnexpectedSymbol {
281                    symbol: value.to_string(),
282                    span,
283                },
284            )?),
285            span,
286            value,
287        });
288
289        Ok(())
290    }
291
292    fn dot(&mut self) -> LexerResult<()> {
293        let (start, _) = self.next()?;
294        let mut end = start;
295
296        if self.cursor.next_if(|c| c == '.').is_some() {
297            end += 1;
298        }
299
300        let value = &self.source[start..=end];
301        let span = (start as u32, (end + 1) as u32);
302        self.push(Token {
303            kind: TokenKind::Operator(Operator::from_str(value).map_err(
304                |_| LexerError::UnexpectedSymbol {
305                    symbol: value.to_string(),
306                    span,
307                },
308            )?),
309            span,
310            value,
311        });
312
313        Ok(())
314    }
315
316    fn cmp_operator(&mut self) -> LexerResult<()> {
317        let (start, _) = self.next()?;
318        let mut end = start;
319
320        if self.cursor.next_if(|c| c == '=').is_some() {
321            end += 1;
322        }
323
324        let value = &self.source[start..=end];
325        self.push(Token {
326            kind: TokenKind::Operator(Operator::from_str(value).map_err(
327                |_| LexerError::UnexpectedSymbol {
328                    symbol: value.to_string(),
329                    span: (start as u32, (end + 1) as u32),
330                },
331            )?),
332            span: (start as u32, (end + 1) as u32),
333            value,
334        });
335
336        Ok(())
337    }
338
339    fn semi(&mut self) -> LexerResult<()> {
340        let (start, _) = self.next()?;
341        self.push(Token {
342            kind: TokenKind::Operator(Operator::Semi),
343            span: (start as u32, (start + 1) as u32),
344            value: &self.source[start..=start],
345        });
346
347        Ok(())
348    }
349
350    fn equals(&mut self) -> LexerResult<()> {
351        let (start, _) = self.next()?;
352        let Some((end, _)) = self.cursor.next_if(|c| c == '=') else {
353            self.push(Token {
354                kind: TokenKind::Operator(Operator::Assign),
355                span: (start as u32, (start + 1) as u32),
356                value: &self.source[start..=start],
357            });
358
359            return Ok(());
360        };
361
362        self.push(Token {
363            kind: TokenKind::Operator(Operator::Comparison(
364                ComparisonOperator::Equal,
365            )),
366            span: (start as u32, (end + 1) as u32),
367            value: &self.source[start..=end],
368        });
369
370        Ok(())
371    }
372
373    fn question_mark(&mut self) -> LexerResult<()> {
374        let (start, _) = self.next()?;
375        let mut kind = TokenKind::Operator(Operator::QuestionMark);
376        let mut end = start;
377
378        if self.cursor.next_if(|c| c == '?').is_some() {
379            kind = TokenKind::Operator(Operator::Logical(
380                LogicalOperator::NullishCoalescing,
381            ));
382            end += 1;
383        }
384
385        let value = &self.source[start..=end];
386        self.push(Token {
387            kind,
388            value,
389            span: (start as u32, (end + 1) as u32),
390        });
391
392        Ok(())
393    }
394
395    fn operator(&mut self) -> LexerResult<()> {
396        let (start, _) = self.next()?;
397
398        let value = &self.source[start..=start];
399        let span = (start as u32, (start + 1) as u32);
400        self.push(Token {
401            kind: TokenKind::Operator(Operator::from_str(value).map_err(
402                |_| LexerError::UnexpectedSymbol {
403                    symbol: value.to_string(),
404                    span,
405                },
406            )?),
407            span,
408            value,
409        });
410
411        Ok(())
412    }
413
414    fn not(
415        &mut self,
416        start: usize,
417    ) -> LexerResult<()> {
418        if self.cursor.next_if_is(" in ") {
419            let end = self.cursor.position();
420
421            self.push(Token {
422                kind: TokenKind::Operator(Operator::Comparison(
423                    ComparisonOperator::NotIn,
424                )),
425                span: (start as u32, (end - 1) as u32),
426                value: "not in",
427            })
428        } else {
429            let end = self.cursor.position();
430
431            self.push(Token {
432                kind: TokenKind::Operator(Operator::Logical(
433                    LogicalOperator::Not,
434                )),
435                span: (start as u32, end as u32),
436                value: "not",
437            })
438        }
439
440        Ok(())
441    }
442
443    fn identifier(&mut self) -> LexerResult<()> {
444        let (start, _) = self.next()?;
445        let mut end = start;
446
447        while let Some((e, _)) =
448            self.cursor.next_if(|c| is_token_type!(c, "alphanumeric"))
449        {
450            end = e;
451        }
452
453        let value = &self.source[start..=end];
454        match value {
455            "and" => self.push(Token {
456                kind: TokenKind::Operator(Operator::Logical(
457                    LogicalOperator::And,
458                )),
459                span: (start as u32, (end + 1) as u32),
460                value,
461            }),
462            "or" => self.push(Token {
463                kind: TokenKind::Operator(Operator::Logical(
464                    LogicalOperator::Or,
465                )),
466                span: (start as u32, (end + 1) as u32),
467                value,
468            }),
469            "in" => self.push(Token {
470                kind: TokenKind::Operator(Operator::Comparison(
471                    ComparisonOperator::In,
472                )),
473                span: (start as u32, (end + 1) as u32),
474                value,
475            }),
476            "true" => self.push(Token {
477                kind: TokenKind::Boolean(true),
478                span: (start as u32, (end + 1) as u32),
479                value,
480            }),
481            "false" => self.push(Token {
482                kind: TokenKind::Boolean(false),
483                span: (start as u32, (end + 1) as u32),
484                value,
485            }),
486            "not" => self.not(start)?,
487            _ => self.push(Token {
488                kind: Identifier::try_from(value)
489                    .map(|identifier| TokenKind::Identifier(identifier))
490                    .unwrap_or(TokenKind::Literal),
491                span: (start as u32, (end + 1) as u32),
492                value,
493            }),
494        }
495
496        Ok(())
497    }
498}