// oak_powershell/lexer/mod.rs

1use crate::{kind::PowerShellSyntaxKind, language::PowerShellLanguage};
2use oak_core::{IncrementalCache, Lexer, LexerState, OakError, lexer::LexOutput, source::Source};
3
4type State<S> = LexerState<S, PowerShellLanguage>;
5
6#[derive(Clone)]
7pub struct PowerShellLexer<'config> {
8    config: &'config PowerShellLanguage,
9}
10
11impl<'config> PowerShellLexer<'config> {
12    pub fn new(config: &'config PowerShellLanguage) -> Self {
13        Self { config }
14    }
15
16    fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
17        while state.not_at_end() {
18            if self.skip_whitespace(state) {
19                continue;
20            }
21
22            if self.lex_newline(state) {
23                continue;
24            }
25
26            if self.lex_comment(state) {
27                continue;
28            }
29
30            if self.lex_string(state) {
31                continue;
32            }
33
34            if self.lex_number(state) {
35                continue;
36            }
37
38            if self.lex_variable(state) {
39                continue;
40            }
41
42            if self.lex_identifier_or_keyword(state) {
43                continue;
44            }
45
46            if self.lex_operators_and_punctuation(state) {
47                continue;
48            }
49
50            // 如果没有匹配任何规则,跳过当前字符
51            if let Some(ch) = state.peek() {
52                let start_pos = state.get_position();
53                state.advance(ch.len_utf8());
54                state.add_token(PowerShellSyntaxKind::Error, start_pos, state.get_position());
55            }
56            else {
57                // 如果已到达文件末尾,退出循环
58                break;
59            }
60        }
61
62        // Add EOF token
63        let pos = state.get_position();
64        state.add_token(PowerShellSyntaxKind::Eof, pos, pos);
65
66        Ok(())
67    }
68
69    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
70        let start_pos = state.get_position();
71
72        while let Some(ch) = state.peek() {
73            if ch == ' ' || ch == '\t' {
74                state.advance(ch.len_utf8());
75            }
76            else {
77                break;
78            }
79        }
80
81        if state.get_position() > start_pos {
82            state.add_token(PowerShellSyntaxKind::Whitespace, start_pos, state.get_position());
83            true
84        }
85        else {
86            false
87        }
88    }
89
90    fn lex_newline<S: Source>(&self, state: &mut State<S>) -> bool {
91        let start_pos = state.get_position();
92
93        if let Some('\n') = state.peek() {
94            state.advance(1);
95            state.add_token(PowerShellSyntaxKind::Newline, start_pos, state.get_position());
96            true
97        }
98        else if let Some('\r') = state.peek() {
99            state.advance(1);
100            if let Some('\n') = state.peek() {
101                state.advance(1);
102            }
103            state.add_token(PowerShellSyntaxKind::Newline, start_pos, state.get_position());
104            true
105        }
106        else {
107            false
108        }
109    }
110
111    fn lex_comment<S: Source>(&self, state: &mut State<S>) -> bool {
112        let start_pos = state.get_position();
113
114        if let Some('#') = state.peek() {
115            state.advance(1);
116            // 单行注释
117            while let Some(ch) = state.peek() {
118                if ch == '\n' || ch == '\r' {
119                    break;
120                }
121                state.advance(ch.len_utf8());
122            }
123            state.add_token(PowerShellSyntaxKind::Comment, start_pos, state.get_position());
124            true
125        }
126        else if let Some('<') = state.peek() {
127            state.advance(1);
128            if let Some('#') = state.peek() {
129                state.advance(1);
130                // 多行注释 <# ... #>
131                let mut depth = 1;
132                while let Some(ch) = state.peek()
133                    && depth > 0
134                {
135                    if ch == '<' {
136                        state.advance(1);
137                        if let Some('#') = state.peek() {
138                            state.advance(1);
139                            depth += 1;
140                        }
141                    }
142                    else if ch == '#' {
143                        state.advance(1);
144                        if let Some('>') = state.peek() {
145                            state.advance(1);
146                            depth -= 1;
147                        }
148                    }
149                    else {
150                        state.advance(ch.len_utf8());
151                    }
152                }
153                state.add_token(PowerShellSyntaxKind::Comment, start_pos, state.get_position());
154                true
155            }
156            else {
157                // 回退,这不是注释
158                state.set_position(start_pos);
159                false
160            }
161        }
162        else {
163            false
164        }
165    }
166
167    fn lex_string<S: Source>(&self, state: &mut State<S>) -> bool {
168        let start_pos = state.get_position();
169
170        if let Some(quote_char) = state.peek() {
171            if quote_char == '"' || quote_char == '\'' {
172                state.advance(1); // 跳过开始引号
173
174                let mut escaped = false;
175                while let Some(ch) = state.peek() {
176                    if escaped {
177                        escaped = false;
178                        state.advance(ch.len_utf8());
179                    }
180                    else if ch == '`' {
181                        // PowerShell 使用反引号作为转义字符
182                        escaped = true;
183                        state.advance(1);
184                    }
185                    else if ch == quote_char {
186                        state.advance(1); // 跳过结束引号
187                        break;
188                    }
189                    else if ch == '\n' || ch == '\r' {
190                        // 字符串可以跨行
191                        state.advance(ch.len_utf8());
192                    }
193                    else {
194                        state.advance(ch.len_utf8());
195                    }
196                }
197
198                state.add_token(PowerShellSyntaxKind::StringLiteral, start_pos, state.get_position());
199                true
200            }
201            else {
202                false
203            }
204        }
205        else {
206            false
207        }
208    }
209
210    fn lex_number<S: Source>(&self, state: &mut State<S>) -> bool {
211        if let Some(ch) = state.peek() {
212            if ch.is_ascii_digit() {
213                let start_pos = state.get_position();
214
215                // 读取整数部分
216                while let Some(ch) = state.peek() {
217                    if ch.is_ascii_digit() {
218                        state.advance(1);
219                    }
220                    else {
221                        break;
222                    }
223                }
224
225                // 检查小数点
226                if let Some('.') = state.peek() {
227                    state.advance(1);
228                    // 读取小数部分
229                    while let Some(ch) = state.peek() {
230                        if ch.is_ascii_digit() {
231                            state.advance(1);
232                        }
233                        else {
234                            break;
235                        }
236                    }
237                }
238
239                // 检查科学记数法
240                if let Some(ch) = state.peek() {
241                    if ch == 'e' || ch == 'E' {
242                        state.advance(1);
243                        if let Some(ch) = state.peek() {
244                            if ch == '+' || ch == '-' {
245                                state.advance(1);
246                            }
247                        }
248                        while let Some(ch) = state.peek() {
249                            if ch.is_ascii_digit() {
250                                state.advance(1);
251                            }
252                            else {
253                                break;
254                            }
255                        }
256                    }
257                }
258
259                state.add_token(PowerShellSyntaxKind::NumberLiteral, start_pos, state.get_position());
260                true
261            }
262            else {
263                false
264            }
265        }
266        else {
267            false
268        }
269    }
270
271    fn lex_variable<S: Source>(&self, state: &mut State<S>) -> bool {
272        let start_pos = state.get_position();
273
274        if let Some('$') = state.peek() {
275            state.advance(1);
276
277            // 变量名必须以字母或下划线开头
278            if let Some(ch) = state.peek() {
279                if ch.is_alphabetic() || ch == '_' {
280                    state.advance(ch.len_utf8());
281
282                    // 后续字符可以是字母、数字或下划线
283                    while let Some(ch) = state.peek() {
284                        if ch.is_alphanumeric() || ch == '_' {
285                            state.advance(ch.len_utf8());
286                        }
287                        else {
288                            break;
289                        }
290                    }
291
292                    state.add_token(PowerShellSyntaxKind::Variable, start_pos, state.get_position());
293                    true
294                }
295                else {
296                    // 只有 $ 符号,作为操作符处理
297                    state.add_token(PowerShellSyntaxKind::Dollar, start_pos, state.get_position());
298                    true
299                }
300            }
301            else {
302                state.add_token(PowerShellSyntaxKind::Dollar, start_pos, state.get_position());
303                true
304            }
305        }
306        else {
307            false
308        }
309    }
310
311    fn lex_identifier_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
312        if let Some(ch) = state.peek() {
313            if ch.is_alphabetic() || ch == '_' {
314                let start_pos = state.get_position();
315                let mut text = String::new();
316
317                // 读取标识符
318                while let Some(ch) = state.peek() {
319                    if ch.is_alphanumeric() || ch == '_' || ch == '-' {
320                        text.push(ch);
321                        state.advance(ch.len_utf8());
322                    }
323                    else {
324                        break;
325                    }
326                }
327
328                // 检查是否是关键字
329                let kind = match text.as_str() {
330                    "begin" => PowerShellSyntaxKind::Begin,
331                    "break" => PowerShellSyntaxKind::Break,
332                    "catch" => PowerShellSyntaxKind::Catch,
333                    "class" => PowerShellSyntaxKind::Class,
334                    "continue" => PowerShellSyntaxKind::Continue,
335                    "data" => PowerShellSyntaxKind::Data,
336                    "define" => PowerShellSyntaxKind::Define,
337                    "do" => PowerShellSyntaxKind::Do,
338                    "dynamicparam" => PowerShellSyntaxKind::DynamicParam,
339                    "else" => PowerShellSyntaxKind::Else,
340                    "elseif" => PowerShellSyntaxKind::ElseIf,
341                    "end" => PowerShellSyntaxKind::End,
342                    "exit" => PowerShellSyntaxKind::Exit,
343                    "filter" => PowerShellSyntaxKind::Filter,
344                    "finally" => PowerShellSyntaxKind::Finally,
345                    "for" => PowerShellSyntaxKind::For,
346                    "foreach" => PowerShellSyntaxKind::ForEach,
347                    "from" => PowerShellSyntaxKind::From,
348                    "function" => PowerShellSyntaxKind::Function,
349                    "if" => PowerShellSyntaxKind::If,
350                    "in" => PowerShellSyntaxKind::In,
351                    "param" => PowerShellSyntaxKind::Param,
352                    "process" => PowerShellSyntaxKind::Process,
353                    "return" => PowerShellSyntaxKind::Return,
354                    "switch" => PowerShellSyntaxKind::Switch,
355                    "throw" => PowerShellSyntaxKind::Throw,
356                    "trap" => PowerShellSyntaxKind::Trap,
357                    "try" => PowerShellSyntaxKind::Try,
358                    "until" => PowerShellSyntaxKind::Until,
359                    "using" => PowerShellSyntaxKind::Using,
360                    "var" => PowerShellSyntaxKind::Var,
361                    "while" => PowerShellSyntaxKind::While,
362                    "workflow" => PowerShellSyntaxKind::Workflow,
363                    "true" => PowerShellSyntaxKind::BooleanLiteral,
364                    "false" => PowerShellSyntaxKind::BooleanLiteral,
365                    "null" => PowerShellSyntaxKind::NullLiteral,
366                    _ => PowerShellSyntaxKind::Identifier,
367                };
368
369                state.add_token(kind, start_pos, state.get_position());
370                true
371            }
372            else {
373                false
374            }
375        }
376        else {
377            false
378        }
379    }
380
381    fn lex_operators_and_punctuation<S: Source>(&self, state: &mut State<S>) -> bool {
382        if let Some(ch) = state.peek() {
383            let start_pos = state.get_position();
384
385            let kind = match ch {
386                '+' => {
387                    state.advance(1);
388                    if let Some('+') = state.peek() {
389                        state.advance(1);
390                        PowerShellSyntaxKind::Plus
391                    }
392                    else if let Some('=') = state.peek() {
393                        state.advance(1);
394                        PowerShellSyntaxKind::Equal
395                    }
396                    else {
397                        PowerShellSyntaxKind::Plus
398                    }
399                }
400                '-' => {
401                    state.advance(1);
402                    if let Some('-') = state.peek() {
403                        state.advance(1);
404                        PowerShellSyntaxKind::Minus
405                    }
406                    else if let Some('=') = state.peek() {
407                        state.advance(1);
408                        PowerShellSyntaxKind::Equal
409                    }
410                    else {
411                        PowerShellSyntaxKind::Minus
412                    }
413                }
414                '*' => {
415                    state.advance(1);
416                    if let Some('=') = state.peek() {
417                        state.advance(1);
418                        PowerShellSyntaxKind::Equal
419                    }
420                    else {
421                        PowerShellSyntaxKind::Multiply
422                    }
423                }
424                '/' => {
425                    state.advance(1);
426                    if let Some('=') = state.peek() {
427                        state.advance(1);
428                        PowerShellSyntaxKind::Equal
429                    }
430                    else {
431                        PowerShellSyntaxKind::Divide
432                    }
433                }
434                '%' => {
435                    state.advance(1);
436                    if let Some('=') = state.peek() {
437                        state.advance(1);
438                        PowerShellSyntaxKind::Equal
439                    }
440                    else {
441                        PowerShellSyntaxKind::Modulo
442                    }
443                }
444                '=' => {
445                    state.advance(1);
446                    if let Some('=') = state.peek() {
447                        state.advance(1);
448                        PowerShellSyntaxKind::Equal
449                    }
450                    else {
451                        PowerShellSyntaxKind::Equal
452                    }
453                }
454                '!' => {
455                    state.advance(1);
456                    if let Some('=') = state.peek() {
457                        state.advance(1);
458                        PowerShellSyntaxKind::NotEqual
459                    }
460                    else {
461                        PowerShellSyntaxKind::Exclamation
462                    }
463                }
464                '<' => {
465                    state.advance(1);
466                    if let Some('=') = state.peek() {
467                        state.advance(1);
468                        PowerShellSyntaxKind::LessEqual
469                    }
470                    else {
471                        PowerShellSyntaxKind::LessThan
472                    }
473                }
474                '>' => {
475                    state.advance(1);
476                    if let Some('=') = state.peek() {
477                        state.advance(1);
478                        PowerShellSyntaxKind::GreaterEqual
479                    }
480                    else {
481                        PowerShellSyntaxKind::GreaterThan
482                    }
483                }
484                '&' => {
485                    state.advance(1);
486                    if let Some('&') = state.peek() {
487                        state.advance(1);
488                        PowerShellSyntaxKind::And
489                    }
490                    else {
491                        PowerShellSyntaxKind::Ampersand
492                    }
493                }
494                '|' => {
495                    state.advance(1);
496                    if let Some('|') = state.peek() {
497                        state.advance(1);
498                        PowerShellSyntaxKind::Or
499                    }
500                    else {
501                        PowerShellSyntaxKind::Pipe
502                    }
503                }
504                '^' => {
505                    state.advance(1);
506                    PowerShellSyntaxKind::Xor
507                }
508                '~' => {
509                    state.advance(1);
510                    PowerShellSyntaxKind::Not
511                }
512                '?' => {
513                    state.advance(1);
514                    PowerShellSyntaxKind::Question
515                }
516                ':' => {
517                    state.advance(1);
518                    if let Some(':') = state.peek() {
519                        state.advance(1);
520                        PowerShellSyntaxKind::DoubleColon
521                    }
522                    else {
523                        PowerShellSyntaxKind::Colon
524                    }
525                }
526                ';' => {
527                    state.advance(1);
528                    PowerShellSyntaxKind::Semicolon
529                }
530                ',' => {
531                    state.advance(1);
532                    PowerShellSyntaxKind::Comma
533                }
534                '.' => {
535                    state.advance(1);
536                    if let Some('.') = state.peek() {
537                        state.advance(1);
538                        PowerShellSyntaxKind::DotDot
539                    }
540                    else {
541                        PowerShellSyntaxKind::Dot
542                    }
543                }
544                '(' => {
545                    state.advance(1);
546                    PowerShellSyntaxKind::LeftParen
547                }
548                ')' => {
549                    state.advance(1);
550                    PowerShellSyntaxKind::RightParen
551                }
552                '[' => {
553                    state.advance(1);
554                    PowerShellSyntaxKind::LeftBracket
555                }
556                ']' => {
557                    state.advance(1);
558                    PowerShellSyntaxKind::RightBracket
559                }
560                '{' => {
561                    state.advance(1);
562                    PowerShellSyntaxKind::LeftBrace
563                }
564                '}' => {
565                    state.advance(1);
566                    PowerShellSyntaxKind::RightBrace
567                }
568                '@' => {
569                    state.advance(1);
570                    PowerShellSyntaxKind::At
571                }
572                '`' => {
573                    state.advance(1);
574                    PowerShellSyntaxKind::Backtick
575                }
576                _ => return false,
577            };
578
579            state.add_token(kind, start_pos, state.get_position());
580            true
581        }
582        else {
583            false
584        }
585    }
586}
587
588impl<'config> Lexer<PowerShellLanguage> for PowerShellLexer<'config> {
589    fn lex_incremental(
590        &self,
591        source: impl Source,
592        _changed: usize,
593        _cache: IncrementalCache<PowerShellLanguage>,
594    ) -> LexOutput<PowerShellLanguage> {
595        let mut state = LexerState::new_with_cache(source, _changed, _cache);
596        let result = self.run(&mut state);
597        state.finish(result)
598    }
599}