Skip to main content

oak_erlang/lexer/
mod.rs

1use crate::{kind::ErlangSyntaxKind, language::ErlangLanguage};
2use oak_core::{
3    errors::OakError,
4    lexer::{LexOutput, Lexer, LexerCache, LexerState},
5    source::{Source, TextEdit},
6};
7use std::{collections::HashSet, sync::LazyLock};
8
/// Lexer for Erlang source text.
///
/// Borrows an [`ErlangLanguage`] configuration for the lifetime `'config`. The field is
/// underscore-prefixed and is not read by any method visible in this file.
#[derive(Clone)]
pub struct ErlangLexer<'config> {
    _config: &'config ErlangLanguage,
}
14
15impl<'config> Lexer<ErlangLanguage> for ErlangLexer<'config> {
16    fn lex<'a, S: Source + ?Sized>(&self, text: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<ErlangLanguage>) -> LexOutput<ErlangLanguage> {
17        let mut state = LexerState::new(text);
18        let result = self.run(&mut state);
19        if result.is_ok() {
20            state.add_eof();
21        }
22        state.finish_with_cache(result, cache)
23    }
24}
25
26impl<'config> ErlangLexer<'config> {
    /// Creates a lexer that borrows the given language configuration.
    pub fn new(config: &'config ErlangLanguage) -> Self {
        Self { _config: config }
    }
30
31    /// 主要的词法分析运行方法
32    pub fn run<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, ErlangLanguage>) -> Result<(), OakError> {
33        while state.not_at_end() {
34            // 安全检查,防止无限循环
35            let start_pos = state.get_position();
36
37            // 跳过空白字符和注释
38            if self.skip_whitespace_and_comments(state) {
39                continue;
40            }
41
42            // 词法分析各种 token
43            if self.lex_string_literal(state) {
44                continue;
45            }
46
47            if self.lex_character_literal(state) {
48                continue;
49            }
50
51            if self.lex_number(state) {
52                continue;
53            }
54
55            if self.lex_identifier_atom_or_keyword(state) {
56                continue;
57            }
58
59            if self.lex_operator(state) {
60                continue;
61            }
62
63            if self.lex_single_char_token(state) {
64                continue;
65            }
66
67            // 安全检查
68            if state.get_position() == start_pos {
69                // 如果位置没有前进,跳过一个字符以避免无限循环
70                if let Some(ch) = state.current() {
71                    state.advance(ch.len_utf8());
72                    let end = state.get_position();
73                    state.add_token(ErlangSyntaxKind::Error, start_pos, end);
74                }
75            }
76        }
77        Ok(())
78    }
79
80    /// 跳过空白字符和注释
81    fn skip_whitespace_and_comments<S: Source + ?Sized>(&self, state: &mut LexerState<S, ErlangLanguage>) -> bool {
82        let mut skipped = false;
83
84        // 跳过空白字符
85        while let Some(ch) = state.current() {
86            if WHITESPACE.contains(&ch) {
87                let start = state.get_position();
88                if ch == '\n' {
89                    state.advance(1);
90                    state.add_token(ErlangSyntaxKind::Newline, start, state.get_position());
91                }
92                else {
93                    // 跳过连续的空白字符
94                    while let Some(ch) = state.current() {
95                        if WHITESPACE.contains(&ch) && ch != '\n' {
96                            state.advance(ch.len_utf8());
97                        }
98                        else {
99                            break;
100                        }
101                    }
102                    state.add_token(ErlangSyntaxKind::Whitespace, start, state.get_position());
103                }
104                skipped = true;
105            }
106            else if ch == '%' {
107                // 行注释
108                let start = state.get_position();
109                state.advance(1); // 跳过 '%'
110
111                // 读取到行尾
112                while let Some(ch) = state.current() {
113                    if ch == '\n' {
114                        break;
115                    }
116                    state.advance(ch.len_utf8());
117                }
118
119                state.add_token(ErlangSyntaxKind::Comment, start, state.get_position());
120                skipped = true;
121            }
122            else {
123                break;
124            }
125        }
126
127        skipped
128    }
129
130    /// 词法分析字符串字面量
131    fn lex_string_literal<S: Source + ?Sized>(&self, state: &mut LexerState<S, ErlangLanguage>) -> bool {
132        if let Some('"') = state.current() {
133            let start = state.get_position();
134            state.advance(1); // 跳过开始的 '"'
135
136            while let Some(ch) = state.current() {
137                if ch == '"' {
138                    state.advance(1); // 跳过结束的 '"'
139                    let end = state.get_position();
140                    state.add_token(ErlangSyntaxKind::String, start, end);
141                    return true;
142                }
143                else if ch == '\\' {
144                    state.advance(1); // 跳过转义字符
145                    if let Some(ch) = state.current() {
146                        state.advance(ch.len_utf8());
147                    }
148                }
149                else {
150                    state.advance(ch.len_utf8());
151                }
152            }
153
154            // 未闭合的字符串
155            let end = state.get_position();
156            state.add_token(ErlangSyntaxKind::String, start, end);
157            true
158        }
159        else {
160            false
161        }
162    }
163
164    /// 词法分析字符字面量
165    fn lex_character_literal<S: Source + ?Sized>(&self, state: &mut LexerState<S, ErlangLanguage>) -> bool {
166        if let Some('$') = state.current() {
167            let start = state.get_position();
168            state.advance(1); // 跳过 '$'
169
170            if let Some(ch) = state.current() {
171                if ch == '\\' {
172                    state.advance(1);
173                    // 简单的转义或八进制转义
174                    if let Some(next) = state.current() {
175                        if next.is_ascii_digit() {
176                            // 八进制
177                            let mut count = 0;
178                            while let Some(ch) = state.current() {
179                                if ch.is_ascii_digit() && count < 3 {
180                                    state.advance(1);
181                                    count += 1;
182                                }
183                                else {
184                                    break;
185                                }
186                            }
187                        }
188                        else {
189                            state.advance(next.len_utf8());
190                        }
191                    }
192                }
193                else {
194                    state.advance(ch.len_utf8());
195                }
196                state.add_token(ErlangSyntaxKind::Character, start, state.get_position());
197                true
198            }
199            else {
200                // 只有 $ 没有字符
201                state.add_token(ErlangSyntaxKind::Error, start, state.get_position());
202                true
203            }
204        }
205        else {
206            false
207        }
208    }
209
210    /// 词法分析数字
211    fn lex_number<S: Source + ?Sized>(&self, state: &mut LexerState<S, ErlangLanguage>) -> bool {
212        if let Some(ch) = state.current() {
213            if ch.is_ascii_digit() {
214                let start = state.get_position();
215
216                // 读取整数部分
217                while let Some(ch) = state.current() {
218                    if ch.is_ascii_digit() {
219                        state.advance(1);
220                    }
221                    else {
222                        break;
223                    }
224                }
225
226                // 检查小数点
227                if let Some('.') = state.current() {
228                    if let Some(next_ch) = state.peek() {
229                        if next_ch.is_ascii_digit() {
230                            state.advance(1); // 跳过 '.'
231
232                            // 读取小数部分
233                            while let Some(ch) = state.current() {
234                                if ch.is_ascii_digit() {
235                                    state.advance(1);
236                                }
237                                else {
238                                    break;
239                                }
240                            }
241                        }
242                    }
243                }
244
245                // 检查科学计数法
246                if let Some(ch) = state.current() {
247                    if ch == 'e' || ch == 'E' {
248                        state.advance(1);
249
250                        // 可选的符号
251                        if let Some(ch) = state.current() {
252                            if ch == '+' || ch == '-' {
253                                state.advance(1);
254                            }
255                        }
256
257                        // 指数部分
258                        while let Some(ch) = state.current() {
259                            if ch.is_ascii_digit() {
260                                state.advance(1);
261                            }
262                            else {
263                                break;
264                            }
265                        }
266                    }
267                }
268
269                state.add_token(ErlangSyntaxKind::Number, start, state.get_position());
270                true
271            }
272            else {
273                false
274            }
275        }
276        else {
277            false
278        }
279    }
280
281    /// 词法分析标识符、原子或关键字
282    fn lex_identifier_atom_or_keyword<S: Source + ?Sized>(&self, state: &mut LexerState<S, ErlangLanguage>) -> bool {
283        if let Some(ch) = state.current() {
284            let start = state.get_position();
285
286            // 变量 (大写字母或下划线开头)
287            if ch.is_ascii_uppercase() || ch == '_' {
288                state.advance(1);
289                while let Some(ch) = state.current() {
290                    if ch.is_ascii_alphanumeric() || ch == '_' || ch == '@' {
291                        state.advance(1);
292                    }
293                    else {
294                        break;
295                    }
296                }
297                state.add_token(ErlangSyntaxKind::Variable, start, state.get_position());
298                return true;
299            }
300
301            // 原子 (小写字母开头)
302            if ch.is_ascii_lowercase() {
303                state.advance(1);
304                while let Some(ch) = state.current() {
305                    if ch.is_ascii_alphanumeric() || ch == '_' || ch == '@' {
306                        state.advance(1);
307                    }
308                    else {
309                        break;
310                    }
311                }
312                let end = state.get_position();
313                let text = state.source().get_text_in(oak_core::Range { start, end });
314
315                // 检查是否是关键字
316                if KEYWORDS.contains(text.as_ref()) {
317                    let kind = match text.as_ref() {
318                        "after" => ErlangSyntaxKind::After,
319                        "and" => ErlangSyntaxKind::And,
320                        "andalso" => ErlangSyntaxKind::Andalso,
321                        "band" => ErlangSyntaxKind::Band,
322                        "begin" => ErlangSyntaxKind::Begin,
323                        "bnot" => ErlangSyntaxKind::Bnot,
324                        "bor" => ErlangSyntaxKind::Bor,
325                        "bsl" => ErlangSyntaxKind::Bsl,
326                        "bsr" => ErlangSyntaxKind::Bsr,
327                        "bxor" => ErlangSyntaxKind::Bxor,
328                        "case" => ErlangSyntaxKind::Case,
329                        "catch" => ErlangSyntaxKind::Catch,
330                        "cond" => ErlangSyntaxKind::Cond,
331                        "div" => ErlangSyntaxKind::Div,
332                        "end" => ErlangSyntaxKind::End,
333                        "fun" => ErlangSyntaxKind::Fun,
334                        "if" => ErlangSyntaxKind::If,
335                        "let" => ErlangSyntaxKind::Let,
336                        "not" => ErlangSyntaxKind::Not,
337                        "of" => ErlangSyntaxKind::Of,
338                        "or" => ErlangSyntaxKind::Or,
339                        "orelse" => ErlangSyntaxKind::Orelse,
340                        "query" => ErlangSyntaxKind::Query,
341                        "receive" => ErlangSyntaxKind::Receive,
342                        "rem" => ErlangSyntaxKind::Rem,
343                        "try" => ErlangSyntaxKind::Try,
344                        "when" => ErlangSyntaxKind::When,
345                        "xor" => ErlangSyntaxKind::Xor,
346                        _ => ErlangSyntaxKind::Atom,
347                    };
348                    state.add_token(kind, start, end);
349                }
350                else {
351                    state.add_token(ErlangSyntaxKind::Atom, start, end);
352                }
353                return true;
354            }
355
356            // 引用原子 ('atom')
357            if ch == '\'' {
358                state.advance(1);
359                while let Some(ch) = state.current() {
360                    if ch == '\'' {
361                        state.advance(1);
362                        state.add_token(ErlangSyntaxKind::Atom, start, state.get_position());
363                        return true;
364                    }
365                    else if ch == '\\' {
366                        state.advance(1);
367                        if let Some(next) = state.current() {
368                            state.advance(next.len_utf8());
369                        }
370                    }
371                    else {
372                        state.advance(ch.len_utf8());
373                    }
374                }
375                state.add_token(ErlangSyntaxKind::Atom, start, state.get_position());
376                return true;
377            }
378        }
379        false
380    }
381
382    /// 词法分析操作符
383    fn lex_operator<S: Source + ?Sized>(&self, state: &mut LexerState<S, ErlangLanguage>) -> bool {
384        if let Some(ch) = state.current() {
385            let start = state.get_position();
386
387            match ch {
388                '+' => {
389                    state.advance(1);
390                    if let Some('+') = state.current() {
391                        state.advance(1);
392                        state.add_token(ErlangSyntaxKind::PlusPlus, start, state.get_position());
393                    }
394                    else {
395                        state.add_token(ErlangSyntaxKind::Plus, start, state.get_position());
396                    }
397                    true
398                }
399                '-' => {
400                    state.advance(1);
401                    if let Some('-') = state.current() {
402                        state.advance(1);
403                        state.add_token(ErlangSyntaxKind::MinusMinus, start, state.get_position());
404                    }
405                    else if let Some('>') = state.current() {
406                        state.advance(1);
407                        state.add_token(ErlangSyntaxKind::Arrow, start, state.get_position());
408                    }
409                    else {
410                        state.add_token(ErlangSyntaxKind::Minus, start, state.get_position());
411                    }
412                    true
413                }
414                '*' => {
415                    state.advance(1);
416                    state.add_token(ErlangSyntaxKind::Star, start, state.get_position());
417                    true
418                }
419                '/' => {
420                    state.advance(1);
421                    if let Some('=') = state.current() {
422                        state.advance(1);
423                        state.add_token(ErlangSyntaxKind::SlashEqual, start, state.get_position());
424                    }
425                    else {
426                        state.add_token(ErlangSyntaxKind::Slash, start, state.get_position());
427                    }
428                    true
429                }
430                '=' => {
431                    state.advance(1);
432                    match state.current() {
433                        Some('=') => {
434                            state.advance(1);
435                            state.add_token(ErlangSyntaxKind::EqualEqual, start, state.get_position());
436                        }
437                        Some(':') => {
438                            state.advance(1);
439                            if let Some('=') = state.current() {
440                                state.advance(1);
441                                state.add_token(ErlangSyntaxKind::EqualColonEqual, start, state.get_position());
442                            }
443                            else {
444                                // 回退
445                                state.set_position(start + 1);
446                                state.add_token(ErlangSyntaxKind::Equal, start, state.get_position());
447                            }
448                        }
449                        Some('/') => {
450                            state.advance(1);
451                            if let Some('=') = state.current() {
452                                state.advance(1);
453                                state.add_token(ErlangSyntaxKind::EqualSlashEqual, start, state.get_position());
454                            }
455                            else {
456                                // 回退
457                                state.set_position(start + 1);
458                                state.add_token(ErlangSyntaxKind::Equal, start, state.get_position());
459                            }
460                        }
461                        Some('<') => {
462                            state.advance(1);
463                            state.add_token(ErlangSyntaxKind::LessEqual, start, state.get_position());
464                        }
465                        _ => {
466                            state.add_token(ErlangSyntaxKind::Equal, start, state.get_position());
467                        }
468                    }
469                    true
470                }
471                '<' => {
472                    state.advance(1);
473                    state.add_token(ErlangSyntaxKind::Less, start, state.get_position());
474                    true
475                }
476                '>' => {
477                    state.advance(1);
478                    if let Some('=') = state.current() {
479                        state.advance(1);
480                        state.add_token(ErlangSyntaxKind::GreaterEqual, start, state.get_position());
481                    }
482                    else {
483                        state.add_token(ErlangSyntaxKind::Greater, start, state.get_position());
484                    }
485                    true
486                }
487                '!' => {
488                    state.advance(1);
489                    state.add_token(ErlangSyntaxKind::Exclamation, start, state.get_position());
490                    true
491                }
492                '?' => {
493                    state.advance(1);
494                    state.add_token(ErlangSyntaxKind::Question, start, state.get_position());
495                    true
496                }
497                '|' => {
498                    state.advance(1);
499                    if let Some('|') = state.current() {
500                        state.advance(1);
501                        state.add_token(ErlangSyntaxKind::PipePipe, start, state.get_position());
502                    }
503                    else {
504                        state.add_token(ErlangSyntaxKind::Pipe, start, state.get_position());
505                    }
506                    true
507                }
508                '#' => {
509                    state.advance(1);
510                    state.add_token(ErlangSyntaxKind::Hash, start, state.get_position());
511                    true
512                }
513                _ => false,
514            }
515        }
516        else {
517            false
518        }
519    }
520
521    /// 词法分析单字符 token
522    fn lex_single_char_token<S: Source + ?Sized>(&self, state: &mut LexerState<S, ErlangLanguage>) -> bool {
523        if let Some(ch) = state.current() {
524            let start = state.get_position();
525            let kind = match ch {
526                '(' => Some(ErlangSyntaxKind::LeftParen),
527                ')' => Some(ErlangSyntaxKind::RightParen),
528                '{' => Some(ErlangSyntaxKind::LeftBrace),
529                '}' => Some(ErlangSyntaxKind::RightBrace),
530                '[' => Some(ErlangSyntaxKind::LeftBracket),
531                ']' => Some(ErlangSyntaxKind::RightBracket),
532                ',' => Some(ErlangSyntaxKind::Comma),
533                ';' => Some(ErlangSyntaxKind::Semicolon),
534                '.' => Some(ErlangSyntaxKind::Dot),
535                ':' => Some(ErlangSyntaxKind::Colon),
536                _ => None,
537            };
538
539            if let Some(kind) = kind {
540                state.advance(ch.len_utf8());
541                state.add_token(kind, start, state.get_position());
542                true
543            }
544            else {
545                false
546            }
547        }
548        else {
549            false
550        }
551    }
552}
553
// Static lookup tables.

/// Characters the lexer treats as whitespace: space, tab, carriage return and line feed.
/// Built lazily on first use.
static WHITESPACE: LazyLock<HashSet<char>> = LazyLock::new(|| HashSet::from([' ', '\t', '\r', '\n']));
556
/// Words that are lexed as keywords rather than plain atoms. Built lazily on first use.
static KEYWORDS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    HashSet::from([
        "after", "and", "andalso", "band", "begin", "bnot", "bor",
        "bsl", "bsr", "bxor", "case", "catch", "cond", "div",
        "end", "fun", "if", "let", "not", "of", "or",
        "orelse", "query", "receive", "rem", "try", "when", "xor",
    ])
});