Skip to main content

oak_lua/lexer/
mod.rs

//! Lua lexer.
//!
//! Implements lexical analysis for the Lua language, turning source text into a sequence of tokens.
4
5use crate::{kind::LuaSyntaxKind, language::LuaLanguage};
6use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
7
/// Shorthand for the shared lexer state specialised to [`LuaLanguage`].
type State<'a, S> = LexerState<'a, S, LuaLanguage>;
9
/// Lexer for the Lua language.
///
/// Holds a borrowed language configuration. The field is stored by [`LuaLexer::new`] but is not
/// read by any of the lexing rules in this module (hence the leading underscore).
#[derive(Clone)]
pub struct LuaLexer<'config> {
    _config: &'config LuaLanguage,
}
15
16impl<'config> LuaLexer<'config> {
    /// Creates a new Lua lexer that borrows the given language configuration.
    pub fn new(config: &'config LuaLanguage) -> Self {
        Self { _config: config }
    }
21
22    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
23        while state.not_at_end() {
24            let safe_point = state.get_position();
25
26            // 尝试各种词法规则
27            if self.skip_whitespace(state) {
28                continue;
29            }
30
31            if self.lex_newline(state) {
32                continue;
33            }
34
35            if self.lex_comment(state) {
36                continue;
37            }
38
39            if self.lex_string(state) {
40                continue;
41            }
42
43            if self.lex_number(state) {
44                continue;
45            }
46
47            if self.lex_identifier_or_keyword(state) {
48                continue;
49            }
50
51            if self.lex_operator_or_delimiter(state) {
52                continue;
53            }
54
55            // 如果所有规则都不匹配,跳过当前字符并标记为错误
56            let start_pos = state.get_position();
57            if let Some(ch) = state.peek() {
58                state.advance(ch.len_utf8());
59                state.add_token(LuaSyntaxKind::Error, start_pos, state.get_position());
60            }
61
62            state.advance_if_dead_lock(safe_point);
63        }
64
65        Ok(())
66    }
67
68    /// 跳过空白字符
69    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
70        let start_pos = state.get_position();
71
72        while let Some(ch) = state.peek() {
73            if ch == ' ' || ch == '\t' {
74                state.advance(ch.len_utf8());
75            }
76            else {
77                break;
78            }
79        }
80
81        if state.get_position() > start_pos {
82            state.add_token(LuaSyntaxKind::Whitespace, start_pos, state.get_position());
83            true
84        }
85        else {
86            false
87        }
88    }
89
90    /// 处理换行
91    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
92        let start_pos = state.get_position();
93
94        if let Some('\n') = state.peek() {
95            state.advance(1);
96            state.add_token(LuaSyntaxKind::Newline, start_pos, state.get_position());
97            true
98        }
99        else if let Some('\r') = state.peek() {
100            state.advance(1);
101            if let Some('\n') = state.peek() {
102                state.advance(1);
103            }
104            state.add_token(LuaSyntaxKind::Newline, start_pos, state.get_position());
105            true
106        }
107        else {
108            false
109        }
110    }
111
112    /// 处理注释
113    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
114        let start_pos = state.get_position();
115
116        if let Some('-') = state.current() {
117            if let Some('-') = state.peek() {
118                state.advance(1); // 第一'-'
119                state.advance(1); // 第二'-'
120
121                // 检查是否是长注--[[
122                if let Some('[') = state.current() {
123                    if let Some('[') = state.peek() {
124                        state.advance(1); // '['
125                        state.advance(1); // '['
126
127                        // 寻找 ]]
128                        while let Some(ch) = state.current() {
129                            if ch == ']' {
130                                if let Some(']') = state.peek() {
131                                    state.advance(1); // ']'
132                                    state.advance(1); // ']'
133                                    break;
134                                }
135                            }
136                            state.advance(ch.len_utf8());
137                        }
138                    }
139                    else {
140                        // 单行注释,读到行
141                        while let Some(ch) = state.current() {
142                            if ch == '\n' || ch == '\r' {
143                                break;
144                            }
145                            state.advance(ch.len_utf8());
146                        }
147                    }
148                }
149                else {
150                    // 单行注释,读到行
151                    while let Some(ch) = state.current() {
152                        if ch == '\n' || ch == '\r' {
153                            break;
154                        }
155                        state.advance(ch.len_utf8());
156                    }
157                }
158
159                state.add_token(LuaSyntaxKind::Comment, start_pos, state.get_position());
160                true
161            }
162            else {
163                false
164            }
165        }
166        else {
167            false
168        }
169    }
170
171    /// 处理字符串字面量
172    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
173        let start_pos = state.get_position();
174
175        if let Some(quote_char) = state.current() {
176            if quote_char == '"' || quote_char == '\'' {
177                state.advance(1); // 跳过开始引
178
179                let mut escaped = false;
180                while let Some(ch) = state.current() {
181                    if escaped {
182                        escaped = false;
183                        state.advance(ch.len_utf8());
184                    }
185                    else if ch == '\\' {
186                        escaped = true;
187                        state.advance(1);
188                    }
189                    else if ch == quote_char {
190                        state.advance(1); // 跳过结束引号
191                        break;
192                    }
193                    else if ch == '\n' || ch == '\r' {
194                        // 字符串不能跨行(除非转义
195                        break;
196                    }
197                    else {
198                        state.advance(ch.len_utf8());
199                    }
200                }
201
202                state.add_token(LuaSyntaxKind::String, start_pos, state.get_position());
203                true
204            }
205            else if quote_char == '[' {
206                // 长字符串 [[...]]
207                if let Some('[') = state.peek() {
208                    state.advance(1); // '['
209                    state.advance(1); // '['
210
211                    // 寻找 ]]
212                    while let Some(ch) = state.current() {
213                        if ch == ']' {
214                            if let Some(']') = state.peek() {
215                                state.advance(1); // ']'
216                                state.advance(1); // ']'
217                                break;
218                            }
219                        }
220                        state.advance(ch.len_utf8());
221                    }
222
223                    state.add_token(LuaSyntaxKind::String, start_pos, state.get_position());
224                    true
225                }
226                else {
227                    false
228                }
229            }
230            else {
231                false
232            }
233        }
234        else {
235            false
236        }
237    }
238
239    /// 处理数字
240    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
241        let start_pos = state.get_position();
242
243        if let Some(ch) = state.current() {
244            if ch.is_ascii_digit() {
245                // 检查是否是十六进制
246                if ch == '0' {
247                    if let Some(next_ch) = state.peek() {
248                        if next_ch == 'x' || next_ch == 'X' {
249                            state.advance(1); // '0'
250                            state.advance(1); // 'x' 'X'
251
252                            // 读取十六进制数字
253                            while let Some(hex_ch) = state.current() {
254                                if hex_ch.is_ascii_hexdigit() {
255                                    state.advance(1);
256                                }
257                                else {
258                                    break;
259                                }
260                            }
261
262                            state.add_token(LuaSyntaxKind::Number, start_pos, state.get_position());
263                            return true;
264                        }
265                    }
266                }
267
268                // 普通数
269                let mut has_dot = false;
270                let mut has_exp = false;
271
272                while let Some(num_ch) = state.current() {
273                    if num_ch.is_ascii_digit() {
274                        state.advance(1);
275                    }
276                    else if num_ch == '.' && !has_dot && !has_exp {
277                        has_dot = true;
278                        state.advance(1);
279                    }
280                    else if (num_ch == 'e' || num_ch == 'E') && !has_exp {
281                        has_exp = true;
282                        state.advance(1);
283
284                        // 可选的符号
285                        if let Some(sign_ch) = state.current() {
286                            if sign_ch == '+' || sign_ch == '-' {
287                                state.advance(1);
288                            }
289                        }
290                    }
291                    else {
292                        break;
293                    }
294                }
295
296                state.add_token(LuaSyntaxKind::Number, start_pos, state.get_position());
297                true
298            }
299            else {
300                false
301            }
302        }
303        else {
304            false
305        }
306    }
307
308    /// 处理标识符或关键
309    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
310        if let Some(ch) = state.current() {
311            if ch.is_ascii_alphabetic() || ch == '_' {
312                let range = state.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
313                // 使用 Source trait 的 get_text_in 方法
314                let text = state.get_text_in(range.clone().into());
315                let token_kind = self.keyword_or_identifier(&text);
316                state.add_token(token_kind, range.start, range.end);
317                true
318            }
319            else {
320                false
321            }
322        }
323        else {
324            false
325        }
326    }
327
    /// Classifies a name as one of Lua's reserved words or as a plain identifier.
    ///
    /// The comparison is exact and case-sensitive; any text that is not listed below maps to
    /// `LuaSyntaxKind::Identifier`.
    fn keyword_or_identifier(&self, text: &str) -> LuaSyntaxKind {
        match text {
            "and" => LuaSyntaxKind::And,
            "break" => LuaSyntaxKind::Break,
            "do" => LuaSyntaxKind::Do,
            "else" => LuaSyntaxKind::Else,
            "elseif" => LuaSyntaxKind::Elseif,
            "end" => LuaSyntaxKind::End,
            "false" => LuaSyntaxKind::False,
            "for" => LuaSyntaxKind::For,
            "function" => LuaSyntaxKind::Function,
            "goto" => LuaSyntaxKind::Goto,
            "if" => LuaSyntaxKind::If,
            "in" => LuaSyntaxKind::In,
            "local" => LuaSyntaxKind::Local,
            "nil" => LuaSyntaxKind::Nil,
            "not" => LuaSyntaxKind::Not,
            "or" => LuaSyntaxKind::Or,
            "repeat" => LuaSyntaxKind::Repeat,
            "return" => LuaSyntaxKind::Return,
            "then" => LuaSyntaxKind::Then,
            "true" => LuaSyntaxKind::True,
            "until" => LuaSyntaxKind::Until,
            "while" => LuaSyntaxKind::While,
            // Anything else is an ordinary name.
            _ => LuaSyntaxKind::Identifier,
        }
    }
356
357    /// 处理操作符和分隔
358    fn lex_operator_or_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
359        let start_pos = state.get_position();
360
361        if let Some(ch) = state.peek() {
362            let token_kind = match ch {
363                '=' => {
364                    state.advance(1);
365                    if let Some('=') = state.peek() {
366                        state.advance(1);
367                        LuaSyntaxKind::EqEq
368                    }
369                    else {
370                        LuaSyntaxKind::Eq
371                    }
372                }
373                '~' => {
374                    state.advance(1);
375                    if let Some('=') = state.peek() {
376                        state.advance(1);
377                        LuaSyntaxKind::TildeEq
378                    }
379                    else {
380                        LuaSyntaxKind::Tilde
381                    }
382                }
383                '<' => {
384                    state.advance(1);
385                    if let Some('=') = state.peek() {
386                        state.advance(1);
387                        LuaSyntaxKind::LtEq
388                    }
389                    else if let Some('<') = state.peek() {
390                        state.advance(1);
391                        LuaSyntaxKind::LtLt
392                    }
393                    else {
394                        LuaSyntaxKind::Lt
395                    }
396                }
397                '>' => {
398                    state.advance(1);
399                    if let Some('=') = state.peek() {
400                        state.advance(1);
401                        LuaSyntaxKind::GtEq
402                    }
403                    else if let Some('>') = state.peek() {
404                        state.advance(1);
405                        LuaSyntaxKind::GtGt
406                    }
407                    else {
408                        LuaSyntaxKind::Gt
409                    }
410                }
411                '.' => {
412                    state.advance(1);
413                    if let Some('.') = state.peek() {
414                        state.advance(1);
415                        if let Some('.') = state.peek() {
416                            state.advance(1);
417                            LuaSyntaxKind::DotDotDot
418                        }
419                        else {
420                            LuaSyntaxKind::DotDot
421                        }
422                    }
423                    else {
424                        LuaSyntaxKind::Dot
425                    }
426                }
427                ':' => {
428                    state.advance(1);
429                    if let Some(':') = state.peek() {
430                        state.advance(1);
431                        LuaSyntaxKind::ColonColon
432                    }
433                    else {
434                        LuaSyntaxKind::Colon
435                    }
436                }
437                '/' => {
438                    state.advance(1);
439                    if let Some('/') = state.peek() {
440                        state.advance(1);
441                        LuaSyntaxKind::SlashSlash
442                    }
443                    else {
444                        LuaSyntaxKind::Slash
445                    }
446                }
447                '+' => {
448                    state.advance(1);
449                    LuaSyntaxKind::Plus
450                }
451                '-' => {
452                    state.advance(1);
453                    LuaSyntaxKind::Minus
454                }
455                '*' => {
456                    state.advance(1);
457                    LuaSyntaxKind::Star
458                }
459                '%' => {
460                    state.advance(1);
461                    LuaSyntaxKind::Percent
462                }
463                '^' => {
464                    state.advance(1);
465                    LuaSyntaxKind::Caret
466                }
467                '#' => {
468                    state.advance(1);
469                    LuaSyntaxKind::Hash
470                }
471                '&' => {
472                    state.advance(1);
473                    LuaSyntaxKind::Ampersand
474                }
475                '|' => {
476                    state.advance(1);
477                    LuaSyntaxKind::Pipe
478                }
479                '(' => {
480                    state.advance(1);
481                    LuaSyntaxKind::LeftParen
482                }
483                ')' => {
484                    state.advance(1);
485                    LuaSyntaxKind::RightParen
486                }
487                '{' => {
488                    state.advance(1);
489                    LuaSyntaxKind::LeftBrace
490                }
491                '}' => {
492                    state.advance(1);
493                    LuaSyntaxKind::RightBrace
494                }
495                '[' => {
496                    state.advance(1);
497                    LuaSyntaxKind::LeftBracket
498                }
499                ']' => {
500                    state.advance(1);
501                    LuaSyntaxKind::RightBracket
502                }
503                ';' => {
504                    state.advance(1);
505                    LuaSyntaxKind::Semicolon
506                }
507                ',' => {
508                    state.advance(1);
509                    LuaSyntaxKind::Comma
510                }
511                _ => return false,
512            };
513
514            state.add_token(token_kind, start_pos, state.get_position());
515            true
516        }
517        else {
518            false
519        }
520    }
521}
522
523impl<'config> Lexer<LuaLanguage> for LuaLexer<'config> {
524    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<LuaLanguage>) -> LexOutput<LuaLanguage> {
525        let mut state = State::new_with_cache(source, 0, cache);
526        let result = self.run(&mut state);
527        if result.is_ok() {
528            state.add_eof();
529        }
530        state.finish_with_cache(result, cache)
531    }
532}