pipeline_script/lexer/
mod.rs

pub mod iter;
pub mod position;
pub mod token;

use crate::lexer::position::Position;
use crate::lexer::token::Token;
use ariadne::{Label, Report, ReportKind, Source};
use logos::{Lexer as LogosLexer, Logos, Span};

/// A high-performance lexer built on the logos crate.
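///
/// # Examples
///
/// A minimal usage sketch (marked `ignore` because the crate path of this
/// module is an assumption, not confirmed by this file):
///
/// ```ignore
/// let mut lexer = Lexer::from_script("example.script", "let x = 1;");
/// while let Some((token, _pos)) = lexer.next_token() {
///     println!("{:?}", token);
/// }
/// ```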
#[derive(Debug, Clone)]
pub struct Lexer {
    /// Name of the source file
    file_name: String,
    /// Underlying logos lexer instance
    logos_lexer: LogosLexer<'static, Token>,
    /// Source text
    source: &'static str,
    /// Byte offsets at which each line starts, for fast line/column lookup
    line_starts: Vec<usize>,
    /// Whether the end of input has been reached
    is_eof: bool,
}

impl Lexer {
    /// Creates a new lexer instance.
    ///
    /// # Arguments
    /// - `file_name`: name of the source file, used in error reports
    pub fn new(file_name: impl Into<String>) -> Self {
        // Start with an empty 'static string; it is replaced later by
        // `set_chars` or superseded by `from_script`. `Box::leak` deliberately
        // leaks the source to obtain the 'static lifetime.
        let empty_source: &'static str = Box::leak("".to_string().into_boxed_str());
        let logos_lexer = Token::lexer(empty_source);

        Self {
            file_name: file_name.into(),
            logos_lexer,
            source: empty_source,
            line_starts: vec![0],
            is_eof: false,
        }
    }

    /// Returns whether the end of input has been reached.
    pub fn is_eof(&self) -> bool {
        self.is_eof || self.logos_lexer.span().start >= self.source.len()
    }

    /// Returns the name of the source file.
    pub fn get_file_name(&self) -> String {
        self.file_name.clone()
    }

    /// Returns the content of the given line.
    ///
    /// # Arguments
    /// - `line`: line number (1-based)
    ///
    /// # Returns
    /// The text of the requested line, or an empty string if `line` is out
    /// of range.
    pub fn line(&self, line: usize) -> String {
        if line == 0 || line > self.line_starts.len() {
            return String::new();
        }

        let start = self.line_starts[line - 1];
        let end = if line < self.line_starts.len() {
            self.line_starts[line]
        } else {
            self.source.len()
        };

        // Strip the trailing newline (and a carriage return, for CRLF input).
        self.source[start..end]
            .trim_end_matches('\n')
            .trim_end_matches('\r')
            .to_string()
    }

    /// Sets the character sequence to analyze.
    ///
    /// # Arguments
    /// - `chars`: the characters of the source
    pub fn set_chars(&mut self, chars: Vec<char>) {
        let source_string: String = chars.into_iter().collect();
        // Leak the string to obtain the 'static lifetime required by the field.
        let source: &'static str = Box::leak(source_string.into_boxed_str());
        self.source = source;
        self.logos_lexer = Token::lexer(source);
        self.compute_line_starts();
        self.is_eof = false;
    }

    /// Creates a lexer from a script string.
    ///
    /// # Arguments
    /// - `file_name`: name of the source file
    /// - `script`: the script source text
    ///
    /// # Returns
    /// A new lexer instance positioned at the start of `script`.
    pub fn from_script(file_name: impl Into<String>, script: impl Into<String>) -> Self {
        let script_string = script.into();
        let source: &'static str = Box::leak(script_string.into_boxed_str());
        let logos_lexer = Token::lexer(source);

        let mut lexer = Self {
            file_name: file_name.into(),
            logos_lexer,
            source,
            line_starts: Vec::new(),
            is_eof: false,
        };

        lexer.compute_line_starts();
        lexer
    }

    /// Returns the next token.
    ///
    /// # Returns
    /// A tuple of token and position. At the end of input a single
    /// `Token::Eof` is produced; every call after that returns `None`.
    pub fn next_token(&mut self) -> Option<(Token, Position)> {
        if self.is_eof {
            return None;
        }

        loop {
            match self.logos_lexer.next() {
                Some(Ok(token)) => {
                    let span = self.logos_lexer.span();
                    let position = self.create_position_from_span(span);
                    return Some((token, position));
                }
                Some(Err(_)) => {
                    // Report the lexing error, then skip the offending input
                    // and continue with the next token.
                    let span = self.logos_lexer.span();
                    Report::build(ReportKind::Warning, (self.file_name.as_str(), span.clone()))
                        .with_message(format!("illegal character `{}`", self.logos_lexer.slice()))
                        .with_label(
                            Label::new((self.file_name.as_str(), span))
                                .with_message("this character cannot be tokenized"),
                        )
                        .finish()
                        .print((self.file_name.as_str(), Source::from(self.source)))
                        .unwrap();
                }
                None => {
                    // End of input: emit a single Eof token.
                    self.is_eof = true;
                    let position =
                        Position::new(self.file_name.clone(), self.source.len(), self.source.len());
                    return Some((Token::Eof, position));
                }
            }
        }
    }

    /// Returns the source text.
    pub fn get_source(&self) -> &str {
        self.source
    }

    /// Precomputes line start offsets for fast line/column lookup.
    fn compute_line_starts(&mut self) {
        self.line_starts.clear();
        self.line_starts.push(0);

        for (i, ch) in self.source.char_indices() {
            if ch == '\n' {
                self.line_starts.push(i + 1);
            }
        }
    }

    /// Creates position information from a byte span.
    ///
    /// # Arguments
    /// - `span`: the byte span of a token
    ///
    /// # Returns
    /// A `Position` covering the span.
    fn create_position_from_span(&self, span: Span) -> Position {
        Position::new(self.file_name.clone(), span.start, span.end)
    }
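
    /// Maps a byte offset to a 1-based (line, column) pair via binary search
    /// over the precomputed `line_starts` table. A sketch of the lookup that
    /// table enables; `next_token` itself only stores raw byte offsets in
    /// `Position`, and the column here is a byte offset, not a char count.
    pub fn line_col(&self, offset: usize) -> (usize, usize) {
        // `line_starts` always contains 0, so the search never underflows:
        // `Ok(i)` means `offset` is exactly at the start of line i,
        // `Err(i)` means it falls inside the line starting at index i - 1.
        let line = match self.line_starts.binary_search(&offset) {
            Ok(idx) => idx,
            Err(idx) => idx - 1,
        };
        (line + 1, offset - self.line_starts[line] + 1)
    }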
}

/// Compatibility shim: mirrors the coarse categories of the original
/// `TokenType` enum.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum TokenType {
    /// Identifier
    Identifier,
    /// Integer literal
    Integer,
    /// Floating-point literal
    Float,
    /// Operator or delimiter
    Symbol,
}

impl From<&Token> for TokenType {
    fn from(token: &Token) -> Self {
        match token {
            Token::Identifier(_) => TokenType::Identifier,
            Token::Int(_) => TokenType::Integer,
            Token::Float(_) => TokenType::Float,
            _ => TokenType::Symbol,
        }
    }
}

/// Lexer states (kept for compatibility).
#[derive(Debug, Clone, PartialEq)]
pub enum State {
    /// Initial state
    Initial,
    /// Scanning an identifier
    Identifier,
    /// Scanning a number
    Number,
    /// Scanning a decimal fraction
    Decimal,
    /// Scanning a string
    String,
    /// Scanning a format string
    FormatString,
    /// Scanning a comment
    Comment,
    /// Scanning a multiline comment
    MultilineComment,
    /// Done, carrying the recognized token type
    Done(TokenType),
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_basic_tokenization() {
        let mut lexer = Lexer::from_script("test.script", "let x = 42;");

        let tokens: Vec<_> = std::iter::from_fn(|| lexer.next_token())
            .map(|(token, _)| token)
            .collect();

        assert_eq!(tokens[1], Token::Identifier("x".to_string()));
        assert_eq!(tokens[2], Token::Assign);
        assert_eq!(tokens[3], Token::Int(42));
        assert_eq!(tokens[4], Token::Eof);
    }
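
    // Checks the EOF contract documented on `next_token`: a single
    // `Token::Eof` is produced at end of input, after which `next_token`
    // returns `None`. A minimal sketch added alongside the existing tests.
    #[test]
    fn test_eof_emitted_once() {
        let mut lexer = Lexer::from_script("test.script", "x");

        assert!(matches!(lexer.next_token(), Some((Token::Identifier(_), _))));
        assert!(matches!(lexer.next_token(), Some((Token::Eof, _))));
        assert!(lexer.next_token().is_none());
        assert!(lexer.is_eof());
    }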

    #[test]
    fn test_string_parsing() {
        let mut lexer = Lexer::from_script("test.script", r#""hello world""#);

        if let Some((Token::String(s), _)) = lexer.next_token() {
            assert_eq!(s, "hello world");
        } else {
            panic!("Expected string token");
        }
    }
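
    // A small check of the compatibility shim above: tokens map onto the
    // coarse `TokenType` categories, with everything that is neither an
    // identifier nor a numeric literal collapsing to `Symbol`.
    #[test]
    fn test_token_type_compat() {
        assert_eq!(TokenType::from(&Token::Int(1)), TokenType::Integer);
        assert_eq!(TokenType::from(&Token::Assign), TokenType::Symbol);
    }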

    #[test]
    fn test_format_string_parsing() {
        let mut lexer = Lexer::from_script("test.script", r#"f"hello {name}""#);

        if let Some((Token::FormatString(s), _)) = lexer.next_token() {
            assert_eq!(s, "hello {name}");
        } else {
            panic!("Expected format string token");
        }
    }
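
    // Exercises the 1-based `line` accessor and its documented out-of-range
    // behavior (empty string rather than a panic).
    #[test]
    fn test_line_lookup() {
        let lexer = Lexer::from_script("test.script", "first\nsecond\nthird");

        assert_eq!(lexer.line(1), "first");
        assert_eq!(lexer.line(2), "second");
        assert_eq!(lexer.line(3), "third");
        assert_eq!(lexer.line(0), "");
        assert_eq!(lexer.line(99), "");
    }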

    #[test]
    fn test_comments_are_skipped() {
        let mut lexer =
            Lexer::from_script("test.script", "// comment\nlet x = 1; /* block comment */");

        let tokens: Vec<_> = std::iter::from_fn(|| lexer.next_token())
            .map(|(token, _)| token)
            .collect();

        assert_eq!(tokens[1], Token::Identifier("x".to_string()));
        assert_eq!(tokens[2], Token::Assign);
        assert_eq!(tokens[3], Token::Int(1));
    }
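
    // A sketch of reusing one lexer for fresh input via `set_chars`, which
    // also resets the EOF flag and the line-start table.
    #[test]
    fn test_set_chars_resets_lexer() {
        let mut lexer = Lexer::new("test.script");
        lexer.set_chars("let y = 2;".chars().collect());

        let tokens: Vec<_> = std::iter::from_fn(|| lexer.next_token())
            .map(|(token, _)| token)
            .collect();

        assert_eq!(tokens[1], Token::Identifier("y".to_string()));
        assert_eq!(tokens[3], Token::Int(2));
    }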
}