Skip to main content

oak_lua/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
/// Lua lexer.
///
/// Implements lexical analysis for the Lua language, converting source code into a token sequence.
7use crate::language::LuaLanguage;
8pub use crate::lexer::token_type::LuaTokenType;
9use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
10
11type State<'a, S> = LexerState<'a, S, LuaLanguage>;
12
/// Lexer for the Lua language.
///
/// Borrows a [`LuaLanguage`] configuration for the lifetime `'config`.
#[derive(Clone)]
pub struct LuaLexer<'config> {
    // Language configuration passed to `new`. It is stored but not read by any
    // of the lexing rules visible in this file (hence the leading underscore).
    _config: &'config LuaLanguage,
}
18
19impl<'config> LuaLexer<'config> {
20    /// 创建新的 Lua 词法分析
21    pub fn new(config: &'config LuaLanguage) -> Self {
22        Self { _config: config }
23    }
24
25    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
26        while state.not_at_end() {
27            let safe_point = state.get_position();
28
29            // 尝试各种词法规则
30            if self.skip_whitespace(state) {
31                continue;
32            }
33
34            if self.lex_newline(state) {
35                continue;
36            }
37
38            if self.lex_comment(state) {
39                continue;
40            }
41
42            if self.lex_string(state) {
43                continue;
44            }
45
46            if self.lex_number(state) {
47                continue;
48            }
49
50            if self.lex_identifier_or_keyword(state) {
51                continue;
52            }
53
54            if self.lex_operator_or_delimiter(state) {
55                continue;
56            }
57
58            // 如果所有规则都不匹配,跳过当前字符并标记为错误
59            let start_pos = state.get_position();
60            if let Some(ch) = state.peek() {
61                state.advance(ch.len_utf8());
62                state.add_token(LuaTokenType::Error, start_pos, state.get_position())
63            }
64
65            state.advance_if_dead_lock(safe_point)
66        }
67
68        Ok(())
69    }
70
71    /// 跳过空白字符
72    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
73        let start_pos = state.get_position();
74
75        while let Some(ch) = state.peek() {
76            if ch == ' ' || ch == '\t' { state.advance(ch.len_utf8()) } else { break }
77        }
78
79        if state.get_position() > start_pos {
80            state.add_token(LuaTokenType::Whitespace, start_pos, state.get_position());
81            true
82        }
83        else {
84            false
85        }
86    }
87
88    /// 处理换行
89    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
90        let start_pos = state.get_position();
91
92        if let Some('\n') = state.peek() {
93            state.advance(1);
94            state.add_token(LuaTokenType::Newline, start_pos, state.get_position());
95            true
96        }
97        else if let Some('\r') = state.peek() {
98            state.advance(1);
99            if let Some('\n') = state.peek() {
100                state.advance(1)
101            }
102            state.add_token(LuaTokenType::Newline, start_pos, state.get_position());
103            true
104        }
105        else {
106            false
107        }
108    }
109
110    /// 处理注释
111    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
112        let start_pos = state.get_position();
113
114        if let Some('-') = state.current() {
115            if let Some('-') = state.peek() {
116                state.advance(1); // 第一'-'
117                state.advance(1); // 第二'-'
118
119                // 检查是否是长注--[[
120                if let Some('[') = state.current() {
121                    if let Some('[') = state.peek() {
122                        state.advance(1); // '['
123                        state.advance(1); // '['
124
125                        // 寻找 ]]
126                        while let Some(ch) = state.current() {
127                            if ch == ']' {
128                                if let Some(']') = state.peek() {
129                                    state.advance(1); // ']'
130                                    state.advance(1); // ']'
131                                    break;
132                                }
133                            }
134                            state.advance(ch.len_utf8())
135                        }
136                    }
137                    else {
138                        // 单行注释,读到行
139                        while let Some(ch) = state.current() {
140                            if ch == '\n' || ch == '\r' {
141                                break;
142                            }
143                            state.advance(ch.len_utf8())
144                        }
145                    }
146                }
147                else {
148                    // 单行注释,读到行
149                    while let Some(ch) = state.current() {
150                        if ch == '\n' || ch == '\r' {
151                            break;
152                        }
153                        state.advance(ch.len_utf8())
154                    }
155                }
156
157                state.add_token(LuaTokenType::Comment, start_pos, state.get_position());
158                true
159            }
160            else {
161                false
162            }
163        }
164        else {
165            false
166        }
167    }
168
169    /// 处理字符串字面量
170    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
171        let start_pos = state.get_position();
172
173        if let Some(quote_char) = state.current() {
174            if quote_char == '"' || quote_char == '\'' {
175                state.advance(1); // 跳过开始引
176
177                let mut escaped = false;
178                while let Some(ch) = state.current() {
179                    if escaped {
180                        escaped = false;
181                        state.advance(ch.len_utf8())
182                    }
183                    else if ch == '\\' {
184                        escaped = true;
185                        state.advance(1)
186                    }
187                    else if ch == quote_char {
188                        state.advance(1); // 跳过结束引号
189                        break;
190                    }
191                    else if ch == '\n' || ch == '\r' {
192                        // 字符串不能跨行(除非转义
193                        break;
194                    }
195                    else {
196                        state.advance(ch.len_utf8())
197                    }
198                }
199
200                state.add_token(LuaTokenType::String, start_pos, state.get_position());
201                true
202            }
203            else if quote_char == '[' {
204                // 长字符串 [[...]]
205                if let Some('[') = state.peek() {
206                    state.advance(1); // '['
207                    state.advance(1); // '['
208
209                    // 寻找 ]]
210                    while let Some(ch) = state.current() {
211                        if ch == ']' {
212                            if let Some(']') = state.peek() {
213                                state.advance(1); // ']'
214                                state.advance(1); // ']'
215                                break;
216                            }
217                        }
218                        state.advance(ch.len_utf8())
219                    }
220
221                    state.add_token(LuaTokenType::String, start_pos, state.get_position());
222                    true
223                }
224                else {
225                    false
226                }
227            }
228            else {
229                false
230            }
231        }
232        else {
233            false
234        }
235    }
236
237    /// 处理数字
238    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
239        let start_pos = state.get_position();
240
241        if let Some(ch) = state.current() {
242            if ch.is_ascii_digit() {
243                // 检查是否是十六进制
244                if ch == '0' {
245                    if let Some(next_ch) = state.peek() {
246                        if next_ch == 'x' || next_ch == 'X' {
247                            state.advance(1); // '0'
248                            state.advance(1); // 'x' 'X'
249
250                            // 读取十六进制数字
251                            while let Some(hex_ch) = state.current() {
252                                if hex_ch.is_ascii_hexdigit() { state.advance(1) } else { break }
253                            }
254
255                            state.add_token(LuaTokenType::Number, start_pos, state.get_position());
256                            return true;
257                        }
258                    }
259                }
260
261                // 普通数
262                let mut has_dot = false;
263                let mut has_exp = false;
264
265                while let Some(num_ch) = state.current() {
266                    if num_ch.is_ascii_digit() {
267                        state.advance(1)
268                    }
269                    else if num_ch == '.' && !has_dot && !has_exp {
270                        has_dot = true;
271                        state.advance(1)
272                    }
273                    else if (num_ch == 'e' || num_ch == 'E') && !has_exp {
274                        has_exp = true;
275                        state.advance(1);
276
277                        // 可选的符号
278                        if let Some(sign_ch) = state.current() {
279                            if sign_ch == '+' || sign_ch == '-' {
280                                state.advance(1)
281                            }
282                        }
283                    }
284                    else {
285                        break;
286                    }
287                }
288
289                state.add_token(LuaTokenType::Number, start_pos, state.get_position());
290                true
291            }
292            else {
293                false
294            }
295        }
296        else {
297            false
298        }
299    }
300
301    /// 处理标识符或关键
302    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
303        if let Some(ch) = state.current() {
304            if ch.is_ascii_alphabetic() || ch == '_' {
305                let range = state.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
306                // 使用 Source trait 的 get_text_in 方法
307                let text = state.get_text_in(range.clone().into());
308                let token_kind = self.keyword_or_identifier(&text);
309                state.add_token(token_kind, range.start, range.end);
310                true
311            }
312            else {
313                false
314            }
315        }
316        else {
317            false
318        }
319    }
320
321    /// 识别关键
322    fn keyword_or_identifier(&self, text: &str) -> LuaTokenType {
323        match text {
324            "and" => LuaTokenType::And,
325            "break" => LuaTokenType::Break,
326            "do" => LuaTokenType::Do,
327            "else" => LuaTokenType::Else,
328            "elseif" => LuaTokenType::Elseif,
329            "end" => LuaTokenType::End,
330            "false" => LuaTokenType::False,
331            "for" => LuaTokenType::For,
332            "function" => LuaTokenType::Function,
333            "goto" => LuaTokenType::Goto,
334            "if" => LuaTokenType::If,
335            "in" => LuaTokenType::In,
336            "local" => LuaTokenType::Local,
337            "nil" => LuaTokenType::Nil,
338            "not" => LuaTokenType::Not,
339            "or" => LuaTokenType::Or,
340            "repeat" => LuaTokenType::Repeat,
341            "return" => LuaTokenType::Return,
342            "then" => LuaTokenType::Then,
343            "true" => LuaTokenType::True,
344            "until" => LuaTokenType::Until,
345            "while" => LuaTokenType::While,
346            _ => LuaTokenType::Identifier,
347        }
348    }
349
350    /// 处理操作符和分隔
351    fn lex_operator_or_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
352        let start_pos = state.get_position();
353
354        if let Some(ch) = state.peek() {
355            let token_kind = match ch {
356                '=' => {
357                    state.advance(1);
358                    if let Some('=') = state.peek() {
359                        state.advance(1);
360                        LuaTokenType::EqEq
361                    }
362                    else {
363                        LuaTokenType::Eq
364                    }
365                }
366                '~' => {
367                    state.advance(1);
368                    if let Some('=') = state.peek() {
369                        state.advance(1);
370                        LuaTokenType::TildeEq
371                    }
372                    else {
373                        LuaTokenType::Tilde
374                    }
375                }
376                '<' => {
377                    state.advance(1);
378                    if let Some('=') = state.peek() {
379                        state.advance(1);
380                        LuaTokenType::LtEq
381                    }
382                    else if let Some('<') = state.peek() {
383                        state.advance(1);
384                        LuaTokenType::LtLt
385                    }
386                    else {
387                        LuaTokenType::Lt
388                    }
389                }
390                '>' => {
391                    state.advance(1);
392                    if let Some('=') = state.peek() {
393                        state.advance(1);
394                        LuaTokenType::GtEq
395                    }
396                    else if let Some('>') = state.peek() {
397                        state.advance(1);
398                        LuaTokenType::GtGt
399                    }
400                    else {
401                        LuaTokenType::Gt
402                    }
403                }
404                '.' => {
405                    state.advance(1);
406                    if let Some('.') = state.peek() {
407                        state.advance(1);
408                        if let Some('.') = state.peek() {
409                            state.advance(1);
410                            LuaTokenType::DotDotDot
411                        }
412                        else {
413                            LuaTokenType::DotDot
414                        }
415                    }
416                    else {
417                        LuaTokenType::Dot
418                    }
419                }
420                ':' => {
421                    state.advance(1);
422                    if let Some(':') = state.peek() {
423                        state.advance(1);
424                        LuaTokenType::ColonColon
425                    }
426                    else {
427                        LuaTokenType::Colon
428                    }
429                }
430                '/' => {
431                    state.advance(1);
432                    if let Some('/') = state.peek() {
433                        state.advance(1);
434                        LuaTokenType::SlashSlash
435                    }
436                    else {
437                        LuaTokenType::Slash
438                    }
439                }
440                '+' => {
441                    state.advance(1);
442                    LuaTokenType::Plus
443                }
444                '-' => {
445                    state.advance(1);
446                    LuaTokenType::Minus
447                }
448                '*' => {
449                    state.advance(1);
450                    LuaTokenType::Star
451                }
452                '%' => {
453                    state.advance(1);
454                    LuaTokenType::Percent
455                }
456                '^' => {
457                    state.advance(1);
458                    LuaTokenType::Caret
459                }
460                '#' => {
461                    state.advance(1);
462                    LuaTokenType::Hash
463                }
464                '&' => {
465                    state.advance(1);
466                    LuaTokenType::Ampersand
467                }
468                '|' => {
469                    state.advance(1);
470                    LuaTokenType::Pipe
471                }
472                '(' => {
473                    state.advance(1);
474                    LuaTokenType::LeftParen
475                }
476                ')' => {
477                    state.advance(1);
478                    LuaTokenType::RightParen
479                }
480                '{' => {
481                    state.advance(1);
482                    LuaTokenType::LeftBrace
483                }
484                '}' => {
485                    state.advance(1);
486                    LuaTokenType::RightBrace
487                }
488                '[' => {
489                    state.advance(1);
490                    LuaTokenType::LeftBracket
491                }
492                ']' => {
493                    state.advance(1);
494                    LuaTokenType::RightBracket
495                }
496                ';' => {
497                    state.advance(1);
498                    LuaTokenType::Semicolon
499                }
500                ',' => {
501                    state.advance(1);
502                    LuaTokenType::Comma
503                }
504                _ => return false,
505            };
506
507            state.add_token(token_kind, start_pos, state.get_position());
508            true
509        }
510        else {
511            false
512        }
513    }
514}
515
516impl<'config> Lexer<LuaLanguage> for LuaLexer<'config> {
517    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<LuaLanguage>) -> LexOutput<LuaLanguage> {
518        let mut state = State::new_with_cache(source, 0, cache);
519        let result = self.run(&mut state);
520        if result.is_ok() {
521            state.add_eof()
522        }
523        state.finish_with_cache(result, cache)
524    }
525}