oak_lua/lexer/
mod.rs

//! Lua lexer.
//!
//! Implements lexical analysis for the Lua language, turning source text into a sequence of tokens.
4
5use crate::{kind::LuaSyntaxKind, language::LuaLanguage};
6use oak_core::{IncrementalCache, Lexer, LexerState, OakError, lexer::LexOutput, source::Source};
7
8type State<S> = LexerState<S, LuaLanguage>;
9
10/// Lua 词法分析
11#[derive(Clone)]
12pub struct LuaLexer<'config> {
13    config: &'config LuaLanguage,
14}
15
16impl<'config> LuaLexer<'config> {
17    /// 创建新的 Lua 词法分析
18    pub fn new(config: &'config LuaLanguage) -> Self {
19        Self { config }
20    }
21
22    /// 跳过空白字符
23    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
24        let start_pos = state.get_position();
25
26        while let Some(ch) = state.peek() {
27            if ch == ' ' || ch == '\t' {
28                state.advance(ch.len_utf8());
29            }
30            else {
31                break;
32            }
33        }
34
35        if state.get_position() > start_pos {
36            state.add_token(LuaSyntaxKind::Whitespace, start_pos, state.get_position());
37            true
38        }
39        else {
40            false
41        }
42    }
43
44    /// 处理换行
45    fn lex_newline<S: Source>(&self, state: &mut State<S>) -> bool {
46        let start_pos = state.get_position();
47
48        if let Some('\n') = state.peek() {
49            state.advance(1);
50            state.add_token(LuaSyntaxKind::Whitespace, start_pos, state.get_position());
51            true
52        }
53        else if let Some('\r') = state.peek() {
54            state.advance(1);
55            if let Some('\n') = state.peek() {
56                state.advance(1);
57            }
58            state.add_token(LuaSyntaxKind::Whitespace, start_pos, state.get_position());
59            true
60        }
61        else {
62            false
63        }
64    }
65
66    /// 处理注释
67    fn lex_comment<S: Source>(&self, state: &mut State<S>) -> bool {
68        let start_pos = state.get_position();
69
70        if let Some('-') = state.current() {
71            if let Some('-') = state.peek() {
72                state.advance(1); // 第一'-'
73                state.advance(1); // 第二'-'
74
75                // 检查是否是长注--[[
76                if let Some('[') = state.current() {
77                    if let Some('[') = state.peek() {
78                        state.advance(1); // '['
79                        state.advance(1); // '['
80
81                        // 寻找 ]]
82                        while let Some(ch) = state.current() {
83                            if ch == ']' {
84                                if let Some(']') = state.peek() {
85                                    state.advance(1); // ']'
86                                    state.advance(1); // ']'
87                                    break;
88                                }
89                            }
90                            state.advance(ch.len_utf8());
91                        }
92                    }
93                    else {
94                        // 单行注释,读到行
95                        while let Some(ch) = state.current() {
96                            if ch == '\n' || ch == '\r' {
97                                break;
98                            }
99                            state.advance(ch.len_utf8());
100                        }
101                    }
102                }
103                else {
104                    // 单行注释,读到行
105                    while let Some(ch) = state.current() {
106                        if ch == '\n' || ch == '\r' {
107                            break;
108                        }
109                        state.advance(ch.len_utf8());
110                    }
111                }
112
113                state.add_token(LuaSyntaxKind::Comment, start_pos, state.get_position());
114                true
115            }
116            else {
117                false
118            }
119        }
120        else {
121            false
122        }
123    }
124
125    /// 处理字符串字面量
126    fn lex_string<S: Source>(&self, state: &mut State<S>) -> bool {
127        let start_pos = state.get_position();
128
129        if let Some(quote_char) = state.current() {
130            if quote_char == '"' || quote_char == '\'' {
131                state.advance(1); // 跳过开始引
132
133                let mut escaped = false;
134                while let Some(ch) = state.current() {
135                    if escaped {
136                        escaped = false;
137                        state.advance(ch.len_utf8());
138                    }
139                    else if ch == '\\' {
140                        escaped = true;
141                        state.advance(1);
142                    }
143                    else if ch == quote_char {
144                        state.advance(1); // 跳过结束引号
145                        break;
146                    }
147                    else if ch == '\n' || ch == '\r' {
148                        // 字符串不能跨行(除非转义
149                        break;
150                    }
151                    else {
152                        state.advance(ch.len_utf8());
153                    }
154                }
155
156                state.add_token(LuaSyntaxKind::String, start_pos, state.get_position());
157                true
158            }
159            else if quote_char == '[' {
160                // 长字符串 [[...]]
161                if let Some('[') = state.peek() {
162                    state.advance(1); // '['
163                    state.advance(1); // '['
164
165                    // 寻找 ]]
166                    while let Some(ch) = state.current() {
167                        if ch == ']' {
168                            if let Some(']') = state.peek() {
169                                state.advance(1); // ']'
170                                state.advance(1); // ']'
171                                break;
172                            }
173                        }
174                        state.advance(ch.len_utf8());
175                    }
176
177                    state.add_token(LuaSyntaxKind::String, start_pos, state.get_position());
178                    true
179                }
180                else {
181                    false
182                }
183            }
184            else {
185                false
186            }
187        }
188        else {
189            false
190        }
191    }
192
193    /// 处理数字
194    fn lex_number<S: Source>(&self, state: &mut State<S>) -> bool {
195        let start_pos = state.get_position();
196
197        if let Some(ch) = state.current() {
198            if ch.is_ascii_digit() {
199                // 检查是否是十六进制
200                if ch == '0' {
201                    if let Some(next_ch) = state.peek() {
202                        if next_ch == 'x' || next_ch == 'X' {
203                            state.advance(1); // '0'
204                            state.advance(1); // 'x' 'X'
205
206                            // 读取十六进制数字
207                            while let Some(hex_ch) = state.current() {
208                                if hex_ch.is_ascii_hexdigit() {
209                                    state.advance(1);
210                                }
211                                else {
212                                    break;
213                                }
214                            }
215
216                            state.add_token(LuaSyntaxKind::Number, start_pos, state.get_position());
217                            return true;
218                        }
219                    }
220                }
221
222                // 普通数
223                let mut has_dot = false;
224                let mut has_exp = false;
225
226                while let Some(num_ch) = state.current() {
227                    if num_ch.is_ascii_digit() {
228                        state.advance(1);
229                    }
230                    else if num_ch == '.' && !has_dot && !has_exp {
231                        has_dot = true;
232                        state.advance(1);
233                    }
234                    else if (num_ch == 'e' || num_ch == 'E') && !has_exp {
235                        has_exp = true;
236                        state.advance(1);
237
238                        // 可选的符号
239                        if let Some(sign_ch) = state.current() {
240                            if sign_ch == '+' || sign_ch == '-' {
241                                state.advance(1);
242                            }
243                        }
244                    }
245                    else {
246                        break;
247                    }
248                }
249
250                state.add_token(LuaSyntaxKind::Number, start_pos, state.get_position());
251                true
252            }
253            else {
254                false
255            }
256        }
257        else {
258            false
259        }
260    }
261
262    /// 处理标识符或关键
263    fn lex_identifier_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
264        if let Some(ch) = state.current() {
265            if ch.is_ascii_alphabetic() || ch == '_' {
266                let range = state.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
267                // 使用 Source trait 的 get_text_in 方法
268                let text = state.get_text_in(range);
269                let token_kind = self.keyword_or_identifier(text);
270                state.add_token(token_kind, range.start, range.end);
271                true
272            }
273            else {
274                false
275            }
276        }
277        else {
278            false
279        }
280    }
281
282    /// 识别关键
283    fn keyword_or_identifier(&self, text: &str) -> LuaSyntaxKind {
284        match text {
285            "and" => LuaSyntaxKind::And,
286            "break" => LuaSyntaxKind::Break,
287            "do" => LuaSyntaxKind::Do,
288            "else" => LuaSyntaxKind::Else,
289            "elseif" => LuaSyntaxKind::Elseif,
290            "end" => LuaSyntaxKind::End,
291            "false" => LuaSyntaxKind::False,
292            "for" => LuaSyntaxKind::For,
293            "function" => LuaSyntaxKind::Function,
294            "goto" => LuaSyntaxKind::Goto,
295            "if" => LuaSyntaxKind::If,
296            "in" => LuaSyntaxKind::In,
297            "local" => LuaSyntaxKind::Local,
298            "nil" => LuaSyntaxKind::Nil,
299            "not" => LuaSyntaxKind::Not,
300            "or" => LuaSyntaxKind::Or,
301            "repeat" => LuaSyntaxKind::Repeat,
302            "return" => LuaSyntaxKind::Return,
303            "then" => LuaSyntaxKind::Then,
304            "true" => LuaSyntaxKind::True,
305            "until" => LuaSyntaxKind::Until,
306            "while" => LuaSyntaxKind::While,
307            _ => LuaSyntaxKind::Identifier,
308        }
309    }
310
311    /// 处理操作符和分隔
312    fn lex_operator_or_delimiter<S: Source>(&self, state: &mut State<S>) -> bool {
313        let start_pos = state.get_position();
314
315        if let Some(ch) = state.peek() {
316            let token_kind = match ch {
317                '=' => {
318                    state.advance(1);
319                    if let Some('=') = state.peek() {
320                        state.advance(1);
321                        LuaSyntaxKind::EqEq
322                    }
323                    else {
324                        LuaSyntaxKind::Eq
325                    }
326                }
327                '~' => {
328                    state.advance(1);
329                    if let Some('=') = state.peek() {
330                        state.advance(1);
331                        LuaSyntaxKind::TildeEq
332                    }
333                    else {
334                        LuaSyntaxKind::Tilde
335                    }
336                }
337                '<' => {
338                    state.advance(1);
339                    if let Some('=') = state.peek() {
340                        state.advance(1);
341                        LuaSyntaxKind::LtEq
342                    }
343                    else if let Some('<') = state.peek() {
344                        state.advance(1);
345                        LuaSyntaxKind::LtLt
346                    }
347                    else {
348                        LuaSyntaxKind::Lt
349                    }
350                }
351                '>' => {
352                    state.advance(1);
353                    if let Some('=') = state.peek() {
354                        state.advance(1);
355                        LuaSyntaxKind::GtEq
356                    }
357                    else if let Some('>') = state.peek() {
358                        state.advance(1);
359                        LuaSyntaxKind::GtGt
360                    }
361                    else {
362                        LuaSyntaxKind::Gt
363                    }
364                }
365                '.' => {
366                    state.advance(1);
367                    if let Some('.') = state.peek() {
368                        state.advance(1);
369                        if let Some('.') = state.peek() {
370                            state.advance(1);
371                            LuaSyntaxKind::DotDotDot
372                        }
373                        else {
374                            LuaSyntaxKind::DotDot
375                        }
376                    }
377                    else {
378                        LuaSyntaxKind::Dot
379                    }
380                }
381                ':' => {
382                    state.advance(1);
383                    if let Some(':') = state.peek() {
384                        state.advance(1);
385                        LuaSyntaxKind::ColonColon
386                    }
387                    else {
388                        LuaSyntaxKind::Colon
389                    }
390                }
391                '/' => {
392                    state.advance(1);
393                    if let Some('/') = state.peek() {
394                        state.advance(1);
395                        LuaSyntaxKind::SlashSlash
396                    }
397                    else {
398                        LuaSyntaxKind::Slash
399                    }
400                }
401                '+' => {
402                    state.advance(1);
403                    LuaSyntaxKind::Plus
404                }
405                '-' => {
406                    state.advance(1);
407                    LuaSyntaxKind::Minus
408                }
409                '*' => {
410                    state.advance(1);
411                    LuaSyntaxKind::Star
412                }
413                '%' => {
414                    state.advance(1);
415                    LuaSyntaxKind::Percent
416                }
417                '^' => {
418                    state.advance(1);
419                    LuaSyntaxKind::Caret
420                }
421                '#' => {
422                    state.advance(1);
423                    LuaSyntaxKind::Hash
424                }
425                '&' => {
426                    state.advance(1);
427                    LuaSyntaxKind::Ampersand
428                }
429                '|' => {
430                    state.advance(1);
431                    LuaSyntaxKind::Pipe
432                }
433                '(' => {
434                    state.advance(1);
435                    LuaSyntaxKind::LeftParen
436                }
437                ')' => {
438                    state.advance(1);
439                    LuaSyntaxKind::RightParen
440                }
441                '{' => {
442                    state.advance(1);
443                    LuaSyntaxKind::LeftBrace
444                }
445                '}' => {
446                    state.advance(1);
447                    LuaSyntaxKind::RightBrace
448                }
449                '[' => {
450                    state.advance(1);
451                    LuaSyntaxKind::LeftBracket
452                }
453                ']' => {
454                    state.advance(1);
455                    LuaSyntaxKind::RightBracket
456                }
457                ';' => {
458                    state.advance(1);
459                    LuaSyntaxKind::Semicolon
460                }
461                ',' => {
462                    state.advance(1);
463                    LuaSyntaxKind::Comma
464                }
465                _ => return false,
466            };
467
468            state.add_token(token_kind, start_pos, state.get_position());
469            true
470        }
471        else {
472            false
473        }
474    }
475}
476
477impl<'config> Lexer<LuaLanguage> for LuaLexer<'config> {
478    fn lex_incremental(
479        &self,
480        source: impl Source,
481        _offset: usize,
482        _cache: IncrementalCache<LuaLanguage>,
483    ) -> LexOutput<LuaLanguage> {
484        let mut state = LexerState::new_with_cache(source, _offset, _cache);
485        let result = self.run(&mut state);
486        state.finish(result)
487    }
488}
489
490impl<'config> LuaLexer<'config> {
491    fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
492        loop {
493            // 尝试各种词法规则
494            if self.skip_whitespace(state) {
495                continue;
496            }
497
498            if self.lex_newline(state) {
499                continue;
500            }
501
502            if self.lex_comment(state) {
503                continue;
504            }
505
506            if self.lex_string(state) {
507                continue;
508            }
509
510            if self.lex_number(state) {
511                continue;
512            }
513
514            if self.lex_identifier_or_keyword(state) {
515                continue;
516            }
517
518            if self.lex_operator_or_delimiter(state) {
519                continue;
520            }
521
522            // 如果所有规则都不匹配,检查是否到达文件末尾
523            if let Some(ch) = state.current() {
524                // 跳过当前字符并标记为错误
525                let start_pos = state.get_position();
526                state.advance(ch.len_utf8());
527                state.add_token(LuaSyntaxKind::Error, start_pos, state.get_position());
528            }
529            else {
530                // 到达文件末尾,退出循环
531                break;
532            }
533        }
534
535        // 添加 EOF token
536        let eof_pos = state.get_position();
537        state.add_token(LuaSyntaxKind::Eof, eof_pos, eof_pos);
538
539        Ok(())
540    }
541}