Skip to main content

oak_perl/lexer/
mod.rs

1use crate::{kind::PerlSyntaxKind, language::PerlLanguage};
2use oak_core::{
3    Lexer, LexerCache, LexerState, OakError,
4    lexer::{CommentConfig, LexOutput, WhitespaceConfig},
5    source::Source,
6};
7use std::sync::LazyLock;
8
9type State<'s, S> = LexerState<'s, S, PerlLanguage>;
10
11static PERL_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
12static PERL_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "#", block_start: "", block_end: "", nested_blocks: false });
13
14#[derive(Clone, Debug)]
15pub struct PerlLexer<'config> {
16    _config: &'config PerlLanguage,
17}
18
19impl<'config> PerlLexer<'config> {
20    pub fn new(config: &'config PerlLanguage) -> Self {
21        Self { _config: config }
22    }
23
24    fn skip_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
25        PERL_WHITESPACE.scan(state, PerlSyntaxKind::Whitespace)
26    }
27
28    fn skip_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
29        PERL_COMMENT.scan(state, PerlSyntaxKind::Comment, PerlSyntaxKind::Comment)
30    }
31
32    fn lex_string<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
33        let start_pos = state.get_position();
34
35        if let Some(quote_char) = state.peek() {
36            if quote_char == '"' || quote_char == '\'' {
37                state.advance(1); // 跳过开始引号
38
39                let mut escaped = false;
40                while let Some(ch) = state.peek() {
41                    if escaped {
42                        escaped = false;
43                        state.advance(ch.len_utf8());
44                    }
45                    else if ch == '\\' {
46                        escaped = true;
47                        state.advance(1);
48                    }
49                    else if ch == quote_char {
50                        state.advance(1); // 跳过结束引号
51                        break;
52                    }
53                    else if ch == '\n' || ch == '\r' {
54                        // 字符串不能跨行(除非转义)
55                        break;
56                    }
57                    else {
58                        state.advance(ch.len_utf8());
59                    }
60                }
61
62                state.add_token(PerlSyntaxKind::StringLiteral, start_pos, state.get_position());
63                true
64            }
65            else {
66                false
67            }
68        }
69        else {
70            false
71        }
72    }
73
74    fn lex_variable<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
75        if let Some(ch) = state.peek() {
76            let start_pos = state.get_position();
77
78            match ch {
79                '$' => {
80                    state.advance(1);
81                    // 读取变量名
82                    while let Some(ch) = state.peek() {
83                        if ch.is_alphanumeric() || ch == '_' {
84                            state.advance(ch.len_utf8());
85                        }
86                        else {
87                            break;
88                        }
89                    }
90                    state.add_token(PerlSyntaxKind::Dollar, start_pos, state.get_position());
91                    true
92                }
93                '@' => {
94                    state.advance(1);
95                    // 读取数组变量名
96                    while let Some(ch) = state.peek() {
97                        if ch.is_alphanumeric() || ch == '_' {
98                            state.advance(ch.len_utf8());
99                        }
100                        else {
101                            break;
102                        }
103                    }
104                    state.add_token(PerlSyntaxKind::At, start_pos, state.get_position());
105                    true
106                }
107                '%' => {
108                    state.advance(1);
109                    // 读取哈希变量名
110                    while let Some(ch) = state.peek() {
111                        if ch.is_alphanumeric() || ch == '_' {
112                            state.advance(ch.len_utf8());
113                        }
114                        else {
115                            break;
116                        }
117                    }
118                    state.add_token(PerlSyntaxKind::Percent_, start_pos, state.get_position());
119                    true
120                }
121                _ => false,
122            }
123        }
124        else {
125            false
126        }
127    }
128
129    fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
130        if let Some(ch) = state.peek() {
131            if ch.is_alphabetic() || ch == '_' {
132                let start_pos = state.get_position();
133                let mut text = String::new();
134
135                // 读取标识符
136                while let Some(ch) = state.peek() {
137                    if ch.is_alphanumeric() || ch == '_' {
138                        text.push(ch);
139                        state.advance(ch.len_utf8());
140                    }
141                    else {
142                        break;
143                    }
144                }
145
146                // 检查是否是关键字
147                let kind = match text.as_str() {
148                    "if" => PerlSyntaxKind::If,
149                    "else" => PerlSyntaxKind::Else,
150                    "elsif" => PerlSyntaxKind::Elsif,
151                    "unless" => PerlSyntaxKind::Unless,
152                    "while" => PerlSyntaxKind::While,
153                    "until" => PerlSyntaxKind::Until,
154                    "for" => PerlSyntaxKind::For,
155                    "foreach" => PerlSyntaxKind::Foreach,
156                    "do" => PerlSyntaxKind::Do,
157                    "sub" => PerlSyntaxKind::Sub,
158                    "package" => PerlSyntaxKind::Package,
159                    "use" => PerlSyntaxKind::Use,
160                    "require" => PerlSyntaxKind::Require,
161                    "my" => PerlSyntaxKind::My,
162                    "our" => PerlSyntaxKind::Our,
163                    "local" => PerlSyntaxKind::Local,
164                    "return" => PerlSyntaxKind::Return,
165                    "last" => PerlSyntaxKind::Last,
166                    "next" => PerlSyntaxKind::Next,
167                    "redo" => PerlSyntaxKind::Redo,
168                    "die" => PerlSyntaxKind::Die,
169                    "warn" => PerlSyntaxKind::Warn,
170                    "eval" => PerlSyntaxKind::Eval,
171                    "print" => PerlSyntaxKind::Print,
172                    "printf" => PerlSyntaxKind::Printf,
173                    "chomp" => PerlSyntaxKind::Chomp,
174                    "chop" => PerlSyntaxKind::Chop,
175                    "split" => PerlSyntaxKind::Split,
176                    "join" => PerlSyntaxKind::Join,
177                    "push" => PerlSyntaxKind::Push,
178                    "pop" => PerlSyntaxKind::Pop,
179                    "shift" => PerlSyntaxKind::Shift,
180                    "unshift" => PerlSyntaxKind::Unshift,
181                    "keys" => PerlSyntaxKind::Keys,
182                    "values" => PerlSyntaxKind::Values,
183                    "each" => PerlSyntaxKind::Each,
184                    "exists" => PerlSyntaxKind::Exists,
185                    "delete" => PerlSyntaxKind::Delete,
186                    "defined" => PerlSyntaxKind::Defined,
187                    "undef" => PerlSyntaxKind::Undef,
188                    "ref" => PerlSyntaxKind::Ref,
189                    "bless" => PerlSyntaxKind::Bless,
190                    "new" => PerlSyntaxKind::New,
191                    "and" => PerlSyntaxKind::And,
192                    "or" => PerlSyntaxKind::Or,
193                    "not" => PerlSyntaxKind::Not,
194                    _ => PerlSyntaxKind::Identifier,
195                };
196
197                state.add_token(kind, start_pos, state.get_position());
198                true
199            }
200            else {
201                false
202            }
203        }
204        else {
205            false
206        }
207    }
208
209    fn lex_number<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
210        if let Some(ch) = state.peek() {
211            if ch.is_ascii_digit() {
212                let start_pos = state.get_position();
213                let mut has_dot = false;
214
215                // 读取数字
216                while let Some(ch) = state.peek() {
217                    if ch.is_ascii_digit() {
218                        state.advance(1);
219                    }
220                    else if ch == '.' && !has_dot {
221                        has_dot = true;
222                        state.advance(1);
223                    }
224                    else {
225                        break;
226                    }
227                }
228
229                let kind = PerlSyntaxKind::NumberLiteral;
230
231                state.add_token(kind, start_pos, state.get_position());
232                true
233            }
234            else {
235                false
236            }
237        }
238        else {
239            false
240        }
241    }
242
243    fn lex_operators_and_punctuation<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
244        if let Some(ch) = state.peek() {
245            let start_pos = state.get_position();
246
247            let kind = match ch {
248                '+' => {
249                    state.advance(1);
250                    if let Some('+') = state.peek() {
251                        state.advance(1);
252                        PerlSyntaxKind::Increment
253                    }
254                    else if let Some('=') = state.peek() {
255                        state.advance(1);
256                        PerlSyntaxKind::PlusAssign
257                    }
258                    else {
259                        PerlSyntaxKind::Plus
260                    }
261                }
262                '-' => {
263                    state.advance(1);
264                    if let Some('-') = state.peek() {
265                        state.advance(1);
266                        PerlSyntaxKind::Decrement
267                    }
268                    else if let Some('=') = state.peek() {
269                        state.advance(1);
270                        PerlSyntaxKind::MinusAssign
271                    }
272                    else if let Some('>') = state.peek() {
273                        state.advance(1);
274                        PerlSyntaxKind::Arrow
275                    }
276                    else {
277                        PerlSyntaxKind::Minus
278                    }
279                }
280                '*' => {
281                    state.advance(1);
282                    if let Some('*') = state.peek() {
283                        state.advance(1);
284                        PerlSyntaxKind::Power
285                    }
286                    else if let Some('=') = state.peek() {
287                        state.advance(1);
288                        PerlSyntaxKind::MultiplyAssign
289                    }
290                    else {
291                        PerlSyntaxKind::Star
292                    }
293                }
294                '/' => {
295                    state.advance(1);
296                    if let Some('=') = state.peek() {
297                        state.advance(1);
298                        PerlSyntaxKind::DivideAssign
299                    }
300                    else {
301                        PerlSyntaxKind::Slash
302                    }
303                }
304                '=' => {
305                    state.advance(1);
306                    if let Some('=') = state.peek() {
307                        state.advance(1);
308                        PerlSyntaxKind::Equal
309                    }
310                    else if let Some('~') = state.peek() {
311                        state.advance(1);
312                        PerlSyntaxKind::Match
313                    }
314                    else {
315                        PerlSyntaxKind::Assign
316                    }
317                }
318                '!' => {
319                    state.advance(1);
320                    if let Some('=') = state.peek() {
321                        state.advance(1);
322                        PerlSyntaxKind::NotEqual
323                    }
324                    else if let Some('~') = state.peek() {
325                        state.advance(1);
326                        PerlSyntaxKind::NotMatch
327                    }
328                    else {
329                        PerlSyntaxKind::Not
330                    }
331                }
332                '<' => {
333                    state.advance(1);
334                    if let Some('=') = state.peek() {
335                        state.advance(1);
336                        if let Some('>') = state.peek() {
337                            state.advance(1);
338                            PerlSyntaxKind::Spaceship
339                        }
340                        else {
341                            PerlSyntaxKind::LessEqual
342                        }
343                    }
344                    else if let Some('<') = state.peek() {
345                        state.advance(1);
346                        PerlSyntaxKind::LeftShift
347                    }
348                    else {
349                        PerlSyntaxKind::LessThan
350                    }
351                }
352                '>' => {
353                    state.advance(1);
354                    if let Some('=') = state.peek() {
355                        state.advance(1);
356                        PerlSyntaxKind::GreaterEqual
357                    }
358                    else if let Some('>') = state.peek() {
359                        state.advance(1);
360                        PerlSyntaxKind::RightShift
361                    }
362                    else {
363                        PerlSyntaxKind::GreaterThan
364                    }
365                }
366                '&' => {
367                    state.advance(1);
368                    if let Some('&') = state.peek() {
369                        state.advance(1);
370                        PerlSyntaxKind::LogicalAnd
371                    }
372                    else {
373                        PerlSyntaxKind::BitwiseAnd
374                    }
375                }
376                '|' => {
377                    state.advance(1);
378                    if let Some('|') = state.peek() {
379                        state.advance(1);
380                        PerlSyntaxKind::LogicalOr
381                    }
382                    else {
383                        PerlSyntaxKind::BitwiseOr
384                    }
385                }
386                '^' => {
387                    state.advance(1);
388                    PerlSyntaxKind::BitwiseXor
389                }
390                '~' => {
391                    state.advance(1);
392                    PerlSyntaxKind::BitwiseNot
393                }
394                '.' => {
395                    state.advance(1);
396                    if let Some('.') = state.peek() {
397                        state.advance(1);
398                        PerlSyntaxKind::Range
399                    }
400                    else {
401                        PerlSyntaxKind::Concat
402                    }
403                }
404                '?' => {
405                    state.advance(1);
406                    PerlSyntaxKind::Question
407                }
408                ':' => {
409                    state.advance(1);
410                    PerlSyntaxKind::Colon
411                }
412                ';' => {
413                    state.advance(1);
414                    PerlSyntaxKind::Semicolon
415                }
416                ',' => {
417                    state.advance(1);
418                    PerlSyntaxKind::Comma
419                }
420                '(' => {
421                    state.advance(1);
422                    PerlSyntaxKind::LeftParen
423                }
424                ')' => {
425                    state.advance(1);
426                    PerlSyntaxKind::RightParen
427                }
428                '[' => {
429                    state.advance(1);
430                    PerlSyntaxKind::LeftBracket
431                }
432                ']' => {
433                    state.advance(1);
434                    PerlSyntaxKind::RightBracket
435                }
436                '{' => {
437                    state.advance(1);
438                    PerlSyntaxKind::LeftBrace
439                }
440                '}' => {
441                    state.advance(1);
442                    PerlSyntaxKind::RightBrace
443                }
444                '\n' => {
445                    state.advance(1);
446                    PerlSyntaxKind::Newline
447                }
448                _ => {
449                    state.advance(ch.len_utf8());
450                    PerlSyntaxKind::Error
451                }
452            };
453
454            state.add_token(kind, start_pos, state.get_position());
455            true
456        }
457        else {
458            false
459        }
460    }
461}
462
463impl<'config> Lexer<PerlLanguage> for PerlLexer<'config> {
464    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<PerlLanguage>) -> LexOutput<PerlLanguage> {
465        let mut state = LexerState::new(source);
466        let result = self.run(&mut state);
467        if result.is_ok() {
468            state.add_eof();
469        }
470        state.finish_with_cache(result, cache)
471    }
472}
473
474impl<'config> PerlLexer<'config> {
475    fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
476        while state.not_at_end() {
477            let safe_point = state.get_position();
478
479            // 跳过空白字符
480            if self.skip_whitespace(state) {
481                continue;
482            }
483
484            // 处理注释
485            if self.skip_comment(state) {
486                continue;
487            }
488
489            // 处理字符串
490            if self.lex_string(state) {
491                continue;
492            }
493
494            // 处理变量
495            if self.lex_variable(state) {
496                continue;
497            }
498
499            // 处理标识符和关键字
500            if self.lex_identifier_or_keyword(state) {
501                continue;
502            }
503
504            // 处理数字
505            if self.lex_number(state) {
506                continue;
507            }
508
509            // 处理操作符和标点符号
510            if self.lex_operators_and_punctuation(state) {
511                continue;
512            }
513
514            // 如果没有匹配任何模式,创建错误 token
515            let start_pos = state.get_position();
516            if let Some(ch) = state.peek() {
517                state.advance(ch.len_utf8());
518                state.add_token(PerlSyntaxKind::Error, start_pos, state.get_position());
519            }
520
521            state.advance_if_dead_lock(safe_point);
522        }
523
524        Ok(())
525    }
526}