moduforge_rules_expression/lexer/
lexer.rs

1use crate::lexer::codes::{is_token_type, token_type};
2use crate::lexer::cursor::{Cursor, CursorItem};
3use crate::lexer::error::LexerResult;
4use crate::lexer::token::{
5    Bracket, ComparisonOperator, Identifier, LogicalOperator, Operator, Token,
6    TokenKind,
7};
8use crate::lexer::{LexerError, QuotationMark, TemplateString};
9use std::str::FromStr;
10
11/// 词法分析器结构体
12/// 负责将字符串转换为令牌序列
13#[derive(Debug, Default)]
14pub struct Lexer<'arena> {
15    tokens: Vec<Token<'arena>>, // 存储解析出的令牌
16}
17
18impl<'arena> Lexer<'arena> {
19    /// 创建新的词法分析器实例
20    pub fn new() -> Self {
21        Self::default()
22    }
23
24    /// 对输入字符串进行词法分析
25    /// 返回解析出的令牌数组的引用
26    pub fn tokenize(
27        &mut self,
28        source: &'arena str,
29    ) -> LexerResult<&[Token<'arena>]> {
30        self.tokens.clear();
31
32        Scanner::new(source, &mut self.tokens).scan()?;
33        Ok(&self.tokens)
34    }
35}
36
37/// 扫描器结构体
38/// 执行实际的词法分析工作
39struct Scanner<'arena, 'self_ref> {
40    cursor: Cursor<'arena>,                    // 字符串游标
41    tokens: &'self_ref mut Vec<Token<'arena>>, // 令牌向量的可变引用
42    source: &'arena str,                       // 源字符串
43}
44
45impl<'arena, 'self_ref> Scanner<'arena, 'self_ref> {
46    /// 创建新的扫描器
47    pub fn new(
48        source: &'arena str,
49        tokens: &'self_ref mut Vec<Token<'arena>>,
50    ) -> Self {
51        Self { cursor: Cursor::from(source), source, tokens }
52    }
53
54    /// 执行扫描过程
55    /// 遍历整个源字符串并识别所有令牌
56    pub fn scan(&mut self) -> LexerResult<()> {
57        while let Some(cursor_item) = self.cursor.peek() {
58            self.scan_cursor_item(cursor_item)?;
59        }
60
61        Ok(())
62    }
63
64    /// 扫描单个字符项
65    /// 根据字符类型调用相应的处理方法
66    pub(crate) fn scan_cursor_item(
67        &mut self,
68        cursor_item: CursorItem,
69    ) -> LexerResult<()> {
70        let (i, s) = cursor_item;
71
72        match s {
73            // 空白字符:跳过
74            token_type!("space") => {
75                self.cursor.next();
76                Ok(())
77            },
78            '\'' => self.string(QuotationMark::SingleQuote), // 单引号字符串
79            '"' => self.string(QuotationMark::DoubleQuote),  // 双引号字符串
80            token_type!("digit") => self.number(),           // 数字
81            token_type!("bracket") => self.bracket(),        // 括号
82            token_type!("cmp_operator") => self.cmp_operator(), // 比较操作符
83            token_type!("operator") => self.operator(),      // 其他操作符
84            token_type!("question_mark") => self.question_mark(), // 问号
85            '`' => self.template_string(),                   // 模板字符串
86            '.' => self.dot(),                               // 点操作符
87            token_type!("alpha") => self.identifier(),       // 标识符
88            _ => Err(LexerError::UnmatchedSymbol {
89                // 未知字符
90                symbol: s,
91                position: i as u32,
92            }),
93        }
94    }
95
96    /// 获取下一个字符
97    /// 如果到达文件末尾则返回错误
98    fn next(&self) -> LexerResult<CursorItem> {
99        self.cursor.next().ok_or_else(|| {
100            let (a, b) = self.cursor.peek_back().unwrap_or((0, ' '));
101
102            LexerError::UnexpectedEof { symbol: b, position: a as u32 }
103        })
104    }
105
106    /// 添加令牌到令牌向量
107    fn push(
108        &mut self,
109        token: Token<'arena>,
110    ) {
111        self.tokens.push(token);
112    }
113
114    /// 处理模板字符串
115    /// 解析反引号包围的字符串,处理其中的表达式插值 ${...}
116    fn template_string(&mut self) -> LexerResult<()> {
117        let (start, _) = self.next()?;
118
119        // 添加开始的反引号令牌
120        self.tokens.push(Token {
121            kind: TokenKind::QuotationMark(QuotationMark::Backtick),
122            span: (start as u32, (start + 1) as u32),
123            value: QuotationMark::Backtick.into(),
124        });
125
126        let mut in_expression = false; // 是否在表达式内部
127        let mut str_start = start + 1; // 字符串内容开始位置
128        loop {
129            let (e, c) = self.next()?;
130
131            match (c, in_expression) {
132                // 遇到结束的反引号
133                ('`', _) => {
134                    if str_start < e {
135                        // 添加剩余的字符串字面量
136                        self.tokens.push(Token {
137                            kind: TokenKind::Literal,
138                            span: (str_start as u32, e as u32),
139                            value: &self.source[str_start..e],
140                        });
141                    }
142
143                    // 添加结束的反引号令牌
144                    self.tokens.push(Token {
145                        kind: TokenKind::QuotationMark(QuotationMark::Backtick),
146                        span: (e as u32, (e + 1) as u32),
147                        value: QuotationMark::Backtick.into(),
148                    });
149
150                    break;
151                },
152                // 在字符串中遇到 $,检查是否是表达式开始
153                ('$', false) => {
154                    in_expression = self.cursor.next_if_is("{");
155                    if in_expression {
156                        // 添加表达式前的字符串字面量
157                        self.tokens.push(Token {
158                            kind: TokenKind::Literal,
159                            span: (str_start as u32, e as u32),
160                            value: &self.source[str_start..e],
161                        });
162
163                        // 添加表达式开始标记
164                        self.tokens.push(Token {
165                            kind: TokenKind::TemplateString(
166                                TemplateString::ExpressionStart,
167                            ),
168                            span: (e as u32, (e + 2) as u32),
169                            value: TemplateString::ExpressionStart.into(),
170                        });
171                    }
172                },
173                // 在表达式中遇到 },表达式结束
174                ('}', true) => {
175                    in_expression = false;
176                    self.tokens.push(Token {
177                        kind: TokenKind::TemplateString(
178                            TemplateString::ExpressionEnd,
179                        ),
180                        span: (str_start as u32, e as u32),
181                        value: TemplateString::ExpressionEnd.into(),
182                    });
183
184                    str_start = e + 1;
185                },
186                // 在字符串中继续读取
187                (_, false) => {
188                    // Continue reading string
189                },
190                // 在表达式中,递归解析字符
191                (_, true) => {
192                    self.cursor.back();
193                    self.scan_cursor_item((e, c))?;
194                },
195            }
196        }
197
198        Ok(())
199    }
200
201    /// 处理普通字符串(单引号或双引号包围)
202    fn string(
203        &mut self,
204        quote_kind: QuotationMark,
205    ) -> LexerResult<()> {
206        let (start, opener) = self.next()?;
207        let end: usize;
208
209        // 寻找匹配的结束引号
210        loop {
211            let (e, c) = self.next()?;
212            if c == opener {
213                end = e;
214                break;
215            }
216        }
217
218        // 添加开始引号令牌
219        self.push(Token {
220            kind: TokenKind::QuotationMark(quote_kind),
221            span: (start as u32, (start + 1) as u32),
222            value: quote_kind.into(),
223        });
224
225        // 添加字符串内容令牌
226        self.push(Token {
227            kind: TokenKind::Literal,
228            span: ((start + 1) as u32, end as u32),
229            value: &self.source[start + 1..end],
230        });
231
232        // 添加结束引号令牌
233        self.push(Token {
234            kind: TokenKind::QuotationMark(quote_kind),
235            span: (end as u32, (end + 1) as u32),
236            value: quote_kind.into(),
237        });
238
239        Ok(())
240    }
241
242    /// 处理数字
243    /// 支持整数、小数和科学计数法
244    fn number(&mut self) -> LexerResult<()> {
245        let (start, _) = self.next()?;
246        let mut end = start;
247        let mut fractal = false; // 是否已有小数点
248
249        // 读取数字字符、下划线和小数点
250        while let Some((e, c)) = self
251            .cursor
252            .next_if(|c| is_token_type!(c, "digit") || c == '_' || c == '.')
253        {
254            // 避免多个小数点
255            if fractal && c == '.' {
256                self.cursor.back();
257                break;
258            }
259
260            if c == '.' {
261                // 检查是否是范围操作符 ..
262                if let Some((_, p)) = self.cursor.peek() {
263                    if p == '.' {
264                        self.cursor.back();
265                        break;
266                    }
267
268                    fractal = true
269                }
270            }
271
272            end = e;
273        }
274
275        // 处理科学计数法 (e/E)
276        if let Some((e_pos, _)) = self.cursor.next_if(|c| c == 'e') {
277            end = e_pos;
278
279            // 处理可选的正负号
280            if let Some((sign_pos, _)) =
281                self.cursor.next_if(|c| c == '+' || c == '-')
282            {
283                end = sign_pos;
284            }
285
286            // 读取指数部分的数字
287            let mut has_exponent_digits = false;
288            while let Some((exp_pos, _)) =
289                self.cursor.next_if(|c| is_token_type!(c, "digit"))
290            {
291                end = exp_pos;
292                has_exponent_digits = true;
293            }
294
295            // 如果没有指数数字,回退到 e 之前
296            if !has_exponent_digits {
297                while self.cursor.position() > e_pos {
298                    self.cursor.back();
299                }
300
301                end = e_pos - 1;
302            }
303        }
304
305        // 添加数字令牌
306        self.push(Token {
307            kind: TokenKind::Number,
308            span: (start as u32, (end + 1) as u32),
309            value: &self.source[start..=end],
310        });
311
312        Ok(())
313    }
314
315    /// 处理括号
316    fn bracket(&mut self) -> LexerResult<()> {
317        let (start, _) = self.next()?;
318
319        let value = &self.source[start..=start];
320        let span = (start as u32, (start + 1) as u32);
321        self.push(Token {
322            kind: TokenKind::Bracket(Bracket::from_str(value).map_err(
323                |_| LexerError::UnexpectedSymbol {
324                    symbol: value.to_string(),
325                    span,
326                },
327            )?),
328            span,
329            value,
330        });
331
332        Ok(())
333    }
334
335    /// 处理点操作符
336    /// 支持单个点 . 和范围操作符 ..
337    fn dot(&mut self) -> LexerResult<()> {
338        let (start, _) = self.next()?;
339        let mut end = start;
340
341        // 检查是否是范围操作符 ..
342        if self.cursor.next_if(|c| c == '.').is_some() {
343            end += 1;
344        }
345
346        let value = &self.source[start..=end];
347        let span = (start as u32, (end + 1) as u32);
348        self.push(Token {
349            kind: TokenKind::Operator(Operator::from_str(value).map_err(
350                |_| LexerError::UnexpectedSymbol {
351                    symbol: value.to_string(),
352                    span,
353                },
354            )?),
355            span,
356            value,
357        });
358
359        Ok(())
360    }
361
362    /// 处理比较操作符
363    /// 支持 <, >, !, = 及其组合
364    fn cmp_operator(&mut self) -> LexerResult<()> {
365        let (start, _) = self.next()?;
366        let mut end = start;
367
368        // 检查是否有后续的 = 组成复合操作符
369        if self.cursor.next_if(|c| c == '=').is_some() {
370            end += 1;
371        }
372
373        let value = &self.source[start..=end];
374        self.push(Token {
375            kind: TokenKind::Operator(Operator::from_str(value).map_err(
376                |_| LexerError::UnexpectedSymbol {
377                    symbol: value.to_string(),
378                    span: (start as u32, (end + 1) as u32),
379                },
380            )?),
381            span: (start as u32, (end + 1) as u32),
382            value,
383        });
384
385        Ok(())
386    }
387
388    /// 处理问号操作符
389    /// 支持单个问号 ? 和空值合并操作符 ??
390    fn question_mark(&mut self) -> LexerResult<()> {
391        let (start, _) = self.next()?;
392        let mut kind = TokenKind::Operator(Operator::QuestionMark);
393        let mut end = start;
394
395        // 检查是否是空值合并操作符 ??
396        if self.cursor.next_if(|c| c == '?').is_some() {
397            kind = TokenKind::Operator(Operator::Logical(
398                LogicalOperator::NullishCoalescing,
399            ));
400            end += 1;
401        }
402
403        let value = &self.source[start..=end];
404        self.push(Token {
405            kind,
406            value,
407            span: (start as u32, (end + 1) as u32),
408        });
409
410        Ok(())
411    }
412
413    /// 处理其他操作符
414    /// 包括算术操作符:+ - * / % ^ 和其他符号:, :
415    fn operator(&mut self) -> LexerResult<()> {
416        let (start, _) = self.next()?;
417
418        let value = &self.source[start..=start];
419        let span = (start as u32, (start + 1) as u32);
420        self.push(Token {
421            kind: TokenKind::Operator(Operator::from_str(value).map_err(
422                |_| LexerError::UnexpectedSymbol {
423                    symbol: value.to_string(),
424                    span,
425                },
426            )?),
427            span,
428            value,
429        });
430
431        Ok(())
432    }
433
434    /// 处理 not 关键字
435    /// 支持 not 和 not in 两种形式
436    fn not(
437        &mut self,
438        start: usize,
439    ) -> LexerResult<()> {
440        if self.cursor.next_if_is(" in ") {
441            // not in 操作符
442            let end = self.cursor.position();
443
444            self.push(Token {
445                kind: TokenKind::Operator(Operator::Comparison(
446                    ComparisonOperator::NotIn,
447                )),
448                span: (start as u32, (end - 1) as u32),
449                value: "not in",
450            })
451        } else {
452            // not 操作符
453            let end = self.cursor.position();
454
455            self.push(Token {
456                kind: TokenKind::Operator(Operator::Logical(
457                    LogicalOperator::Not,
458                )),
459                span: (start as u32, end as u32),
460                value: "not",
461            })
462        }
463
464        Ok(())
465    }
466
467    /// 处理标识符和关键字
468    /// 包括变量名、布尔值、逻辑操作符等
469    fn identifier(&mut self) -> LexerResult<()> {
470        let (start, _) = self.next()?;
471        let mut end = start;
472
473        // 读取完整的标识符(字母、数字、下划线等)
474        while let Some((e, _)) =
475            self.cursor.next_if(|c| is_token_type!(c, "alphanumeric"))
476        {
477            end = e;
478        }
479
480        let value = &self.source[start..=end];
481        match value {
482            // 逻辑操作符
483            "and" => self.push(Token {
484                kind: TokenKind::Operator(Operator::Logical(
485                    LogicalOperator::And,
486                )),
487                span: (start as u32, (end + 1) as u32),
488                value,
489            }),
490            "or" => self.push(Token {
491                kind: TokenKind::Operator(Operator::Logical(
492                    LogicalOperator::Or,
493                )),
494                span: (start as u32, (end + 1) as u32),
495                value,
496            }),
497            // 比较操作符
498            "in" => self.push(Token {
499                kind: TokenKind::Operator(Operator::Comparison(
500                    ComparisonOperator::In,
501                )),
502                span: (start as u32, (end + 1) as u32),
503                value,
504            }),
505            // 布尔值
506            "true" => self.push(Token {
507                kind: TokenKind::Boolean(true),
508                span: (start as u32, (end + 1) as u32),
509                value,
510            }),
511            "false" => self.push(Token {
512                kind: TokenKind::Boolean(false),
513                span: (start as u32, (end + 1) as u32),
514                value,
515            }),
516            // not 关键字(可能是 not 或 not in)
517            "not" => self.not(start)?,
518            // 其他标识符或字面量
519            _ => self.push(Token {
520                kind: Identifier::try_from(value)
521                    .map(|identifier| TokenKind::Identifier(identifier))
522                    .unwrap_or(TokenKind::Literal),
523                span: (start as u32, (end + 1) as u32),
524                value,
525            }),
526        }
527
528        Ok(())
529    }
530}