Skip to main content

oak_powershell/lexer/
mod.rs

1use crate::{kind::PowerShellSyntaxKind, language::PowerShellLanguage};
2use oak_core::{
3    Lexer, LexerCache, LexerState, OakError,
4    lexer::LexOutput,
5    source::{Source, TextEdit},
6};
7
/// Shorthand for the shared `LexerState` specialised to [`PowerShellLanguage`],
/// so the sub-lexer signatures below stay readable.
type State<'a, S> = LexerState<'a, S, PowerShellLanguage>;
9
/// Hand-written lexer that turns PowerShell source text into tokens.
///
/// The lexer only borrows its language configuration, so cloning it is cheap.
#[derive(Clone)]
pub struct PowerShellLexer<'config> {
    /// Language configuration supplied to [`PowerShellLexer::new`].
    /// The leading underscore marks it as currently unused by the lexing rules.
    _config: &'config PowerShellLanguage,
}
14
15impl<'config> PowerShellLexer<'config> {
16    pub fn new(config: &'config PowerShellLanguage) -> Self {
17        Self { _config: config }
18    }
19
20    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
21        while state.not_at_end() {
22            if self.skip_whitespace(state) {
23                continue;
24            }
25
26            if self.lex_newline(state) {
27                continue;
28            }
29
30            if self.lex_comment(state) {
31                continue;
32            }
33
34            if self.lex_string(state) {
35                continue;
36            }
37
38            if self.lex_number(state) {
39                continue;
40            }
41
42            if self.lex_variable(state) {
43                continue;
44            }
45
46            if self.lex_identifier_or_keyword(state) {
47                continue;
48            }
49
50            if self.lex_operators_and_punctuation(state) {
51                continue;
52            }
53
54            // 如果没有匹配任何规则,跳过当前字符
55            if let Some(ch) = state.peek() {
56                let start_pos = state.get_position();
57                state.advance(ch.len_utf8());
58                state.add_token(PowerShellSyntaxKind::Error, start_pos, state.get_position());
59            }
60            else {
61                // 如果已到达文件末尾,退出循环
62                break;
63            }
64        }
65
66        // Add EOF token
67        let pos = state.get_position();
68        state.add_token(PowerShellSyntaxKind::Eof, pos, pos);
69
70        Ok(())
71    }
72
73    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
74        let start_pos = state.get_position();
75
76        while let Some(ch) = state.peek() {
77            if ch == ' ' || ch == '\t' {
78                state.advance(ch.len_utf8());
79            }
80            else {
81                break;
82            }
83        }
84
85        if state.get_position() > start_pos {
86            state.add_token(PowerShellSyntaxKind::Whitespace, start_pos, state.get_position());
87            true
88        }
89        else {
90            false
91        }
92    }
93
94    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
95        let start_pos = state.get_position();
96
97        if let Some('\n') = state.peek() {
98            state.advance(1);
99            state.add_token(PowerShellSyntaxKind::Newline, start_pos, state.get_position());
100            true
101        }
102        else if let Some('\r') = state.peek() {
103            state.advance(1);
104            if let Some('\n') = state.peek() {
105                state.advance(1);
106            }
107            state.add_token(PowerShellSyntaxKind::Newline, start_pos, state.get_position());
108            true
109        }
110        else {
111            false
112        }
113    }
114
115    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
116        let start_pos = state.get_position();
117
118        if let Some('#') = state.peek() {
119            state.advance(1);
120            // 单行注释
121            while let Some(ch) = state.peek() {
122                if ch == '\n' || ch == '\r' {
123                    break;
124                }
125                state.advance(ch.len_utf8());
126            }
127            state.add_token(PowerShellSyntaxKind::Comment, start_pos, state.get_position());
128            true
129        }
130        else if let Some('<') = state.peek() {
131            state.advance(1);
132            if let Some('#') = state.peek() {
133                state.advance(1);
134                // 多行注释 <# ... #>
135                let mut depth = 1;
136                while let Some(ch) = state.peek() {
137                    if depth == 0 {
138                        break;
139                    }
140                    if ch == '<' {
141                        state.advance(1);
142                        if let Some('#') = state.peek() {
143                            state.advance(1);
144                            depth += 1;
145                        }
146                    }
147                    else if ch == '#' {
148                        state.advance(1);
149                        if let Some('>') = state.peek() {
150                            state.advance(1);
151                            depth -= 1;
152                        }
153                    }
154                    else {
155                        state.advance(ch.len_utf8());
156                    }
157                }
158                state.add_token(PowerShellSyntaxKind::Comment, start_pos, state.get_position());
159                true
160            }
161            else {
162                // 回退,这不是注释
163                state.set_position(start_pos);
164                false
165            }
166        }
167        else {
168            false
169        }
170    }
171
172    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
173        let start_pos = state.get_position();
174
175        if let Some(quote_char) = state.peek() {
176            if quote_char == '"' || quote_char == '\'' {
177                state.advance(1); // 跳过开始引号
178
179                let mut escaped = false;
180                while let Some(ch) = state.peek() {
181                    if escaped {
182                        escaped = false;
183                        state.advance(ch.len_utf8());
184                    }
185                    else if ch == '`' {
186                        // PowerShell 使用反引号作为转义字符
187                        escaped = true;
188                        state.advance(1);
189                    }
190                    else if ch == quote_char {
191                        state.advance(1); // 跳过结束引号
192                        break;
193                    }
194                    else if ch == '\n' || ch == '\r' {
195                        // 字符串可以跨行
196                        state.advance(ch.len_utf8());
197                    }
198                    else {
199                        state.advance(ch.len_utf8());
200                    }
201                }
202
203                state.add_token(PowerShellSyntaxKind::StringLiteral, start_pos, state.get_position());
204                true
205            }
206            else {
207                false
208            }
209        }
210        else {
211            false
212        }
213    }
214
215    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
216        if let Some(ch) = state.peek() {
217            if ch.is_ascii_digit() {
218                let start_pos = state.get_position();
219
220                // 读取整数部分
221                while let Some(ch) = state.peek() {
222                    if ch.is_ascii_digit() {
223                        state.advance(1);
224                    }
225                    else {
226                        break;
227                    }
228                }
229
230                // 检查小数点
231                if let Some('.') = state.peek() {
232                    state.advance(1);
233                    // 读取小数部分
234                    while let Some(ch) = state.peek() {
235                        if ch.is_ascii_digit() {
236                            state.advance(1);
237                        }
238                        else {
239                            break;
240                        }
241                    }
242                }
243
244                // 检查科学记数法
245                if let Some(ch) = state.peek() {
246                    if ch == 'e' || ch == 'E' {
247                        state.advance(1);
248                        if let Some(ch) = state.peek() {
249                            if ch == '+' || ch == '-' {
250                                state.advance(1);
251                            }
252                        }
253                        while let Some(ch) = state.peek() {
254                            if ch.is_ascii_digit() {
255                                state.advance(1);
256                            }
257                            else {
258                                break;
259                            }
260                        }
261                    }
262                }
263
264                state.add_token(PowerShellSyntaxKind::NumberLiteral, start_pos, state.get_position());
265                true
266            }
267            else {
268                false
269            }
270        }
271        else {
272            false
273        }
274    }
275
276    fn lex_variable<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
277        let start_pos = state.get_position();
278
279        if let Some('$') = state.peek() {
280            state.advance(1);
281
282            // 变量名必须以字母或下划线开头
283            if let Some(ch) = state.peek() {
284                if ch.is_alphabetic() || ch == '_' {
285                    state.advance(ch.len_utf8());
286
287                    // 后续字符可以是字母、数字或下划线
288                    while let Some(ch) = state.peek() {
289                        if ch.is_alphanumeric() || ch == '_' {
290                            state.advance(ch.len_utf8());
291                        }
292                        else {
293                            break;
294                        }
295                    }
296
297                    state.add_token(PowerShellSyntaxKind::Variable, start_pos, state.get_position());
298                    true
299                }
300                else {
301                    // 只有 $ 符号,作为操作符处理
302                    state.add_token(PowerShellSyntaxKind::Dollar, start_pos, state.get_position());
303                    true
304                }
305            }
306            else {
307                state.add_token(PowerShellSyntaxKind::Dollar, start_pos, state.get_position());
308                true
309            }
310        }
311        else {
312            false
313        }
314    }
315
316    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
317        if let Some(ch) = state.peek() {
318            if ch.is_alphabetic() || ch == '_' {
319                let start_pos = state.get_position();
320                let mut text = String::new();
321
322                // 读取标识符
323                while let Some(ch) = state.peek() {
324                    if ch.is_alphanumeric() || ch == '_' || ch == '-' {
325                        text.push(ch);
326                        state.advance(ch.len_utf8());
327                    }
328                    else {
329                        break;
330                    }
331                }
332
333                // 检查是否是关键字
334                let kind = match text.as_str() {
335                    "begin" => PowerShellSyntaxKind::Begin,
336                    "break" => PowerShellSyntaxKind::Break,
337                    "catch" => PowerShellSyntaxKind::Catch,
338                    "class" => PowerShellSyntaxKind::Class,
339                    "continue" => PowerShellSyntaxKind::Continue,
340                    "data" => PowerShellSyntaxKind::Data,
341                    "define" => PowerShellSyntaxKind::Define,
342                    "do" => PowerShellSyntaxKind::Do,
343                    "dynamicparam" => PowerShellSyntaxKind::DynamicParam,
344                    "else" => PowerShellSyntaxKind::Else,
345                    "elseif" => PowerShellSyntaxKind::ElseIf,
346                    "end" => PowerShellSyntaxKind::End,
347                    "exit" => PowerShellSyntaxKind::Exit,
348                    "filter" => PowerShellSyntaxKind::Filter,
349                    "finally" => PowerShellSyntaxKind::Finally,
350                    "for" => PowerShellSyntaxKind::For,
351                    "foreach" => PowerShellSyntaxKind::ForEach,
352                    "from" => PowerShellSyntaxKind::From,
353                    "function" => PowerShellSyntaxKind::Function,
354                    "if" => PowerShellSyntaxKind::If,
355                    "in" => PowerShellSyntaxKind::In,
356                    "param" => PowerShellSyntaxKind::Param,
357                    "process" => PowerShellSyntaxKind::Process,
358                    "return" => PowerShellSyntaxKind::Return,
359                    "switch" => PowerShellSyntaxKind::Switch,
360                    "throw" => PowerShellSyntaxKind::Throw,
361                    "trap" => PowerShellSyntaxKind::Trap,
362                    "try" => PowerShellSyntaxKind::Try,
363                    "until" => PowerShellSyntaxKind::Until,
364                    "using" => PowerShellSyntaxKind::Using,
365                    "var" => PowerShellSyntaxKind::Var,
366                    "while" => PowerShellSyntaxKind::While,
367                    "workflow" => PowerShellSyntaxKind::Workflow,
368                    "true" => PowerShellSyntaxKind::BooleanLiteral,
369                    "false" => PowerShellSyntaxKind::BooleanLiteral,
370                    "null" => PowerShellSyntaxKind::NullLiteral,
371                    _ => PowerShellSyntaxKind::Identifier,
372                };
373
374                state.add_token(kind, start_pos, state.get_position());
375                true
376            }
377            else {
378                false
379            }
380        }
381        else {
382            false
383        }
384    }
385
386    fn lex_operators_and_punctuation<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
387        if let Some(ch) = state.peek() {
388            let start_pos = state.get_position();
389
390            let kind = match ch {
391                '+' => {
392                    state.advance(1);
393                    if let Some('+') = state.peek() {
394                        state.advance(1);
395                        PowerShellSyntaxKind::Plus
396                    }
397                    else if let Some('=') = state.peek() {
398                        state.advance(1);
399                        PowerShellSyntaxKind::Equal
400                    }
401                    else {
402                        PowerShellSyntaxKind::Plus
403                    }
404                }
405                '-' => {
406                    state.advance(1);
407                    if let Some('-') = state.peek() {
408                        state.advance(1);
409                        PowerShellSyntaxKind::Minus
410                    }
411                    else if let Some('=') = state.peek() {
412                        state.advance(1);
413                        PowerShellSyntaxKind::Equal
414                    }
415                    else {
416                        PowerShellSyntaxKind::Minus
417                    }
418                }
419                '*' => {
420                    state.advance(1);
421                    if let Some('=') = state.peek() {
422                        state.advance(1);
423                        PowerShellSyntaxKind::Equal
424                    }
425                    else {
426                        PowerShellSyntaxKind::Multiply
427                    }
428                }
429                '/' => {
430                    state.advance(1);
431                    if let Some('=') = state.peek() {
432                        state.advance(1);
433                        PowerShellSyntaxKind::Equal
434                    }
435                    else {
436                        PowerShellSyntaxKind::Divide
437                    }
438                }
439                '%' => {
440                    state.advance(1);
441                    if let Some('=') = state.peek() {
442                        state.advance(1);
443                        PowerShellSyntaxKind::Equal
444                    }
445                    else {
446                        PowerShellSyntaxKind::Modulo
447                    }
448                }
449                '=' => {
450                    state.advance(1);
451                    if let Some('=') = state.peek() {
452                        state.advance(1);
453                        PowerShellSyntaxKind::Equal
454                    }
455                    else {
456                        PowerShellSyntaxKind::Equal
457                    }
458                }
459                '!' => {
460                    state.advance(1);
461                    if let Some('=') = state.peek() {
462                        state.advance(1);
463                        PowerShellSyntaxKind::NotEqual
464                    }
465                    else {
466                        PowerShellSyntaxKind::Exclamation
467                    }
468                }
469                '<' => {
470                    state.advance(1);
471                    if let Some('=') = state.peek() {
472                        state.advance(1);
473                        PowerShellSyntaxKind::LessEqual
474                    }
475                    else {
476                        PowerShellSyntaxKind::LessThan
477                    }
478                }
479                '>' => {
480                    state.advance(1);
481                    if let Some('=') = state.peek() {
482                        state.advance(1);
483                        PowerShellSyntaxKind::GreaterEqual
484                    }
485                    else {
486                        PowerShellSyntaxKind::GreaterThan
487                    }
488                }
489                '&' => {
490                    state.advance(1);
491                    if let Some('&') = state.peek() {
492                        state.advance(1);
493                        PowerShellSyntaxKind::And
494                    }
495                    else {
496                        PowerShellSyntaxKind::Ampersand
497                    }
498                }
499                '|' => {
500                    state.advance(1);
501                    if let Some('|') = state.peek() {
502                        state.advance(1);
503                        PowerShellSyntaxKind::Or
504                    }
505                    else {
506                        PowerShellSyntaxKind::Pipe
507                    }
508                }
509                '^' => {
510                    state.advance(1);
511                    PowerShellSyntaxKind::Xor
512                }
513                '~' => {
514                    state.advance(1);
515                    PowerShellSyntaxKind::Not
516                }
517                '?' => {
518                    state.advance(1);
519                    PowerShellSyntaxKind::Question
520                }
521                ':' => {
522                    state.advance(1);
523                    if let Some(':') = state.peek() {
524                        state.advance(1);
525                        PowerShellSyntaxKind::DoubleColon
526                    }
527                    else {
528                        PowerShellSyntaxKind::Colon
529                    }
530                }
531                ';' => {
532                    state.advance(1);
533                    PowerShellSyntaxKind::Semicolon
534                }
535                ',' => {
536                    state.advance(1);
537                    PowerShellSyntaxKind::Comma
538                }
539                '.' => {
540                    state.advance(1);
541                    if let Some('.') = state.peek() {
542                        state.advance(1);
543                        PowerShellSyntaxKind::DotDot
544                    }
545                    else {
546                        PowerShellSyntaxKind::Dot
547                    }
548                }
549                '(' => {
550                    state.advance(1);
551                    PowerShellSyntaxKind::LeftParen
552                }
553                ')' => {
554                    state.advance(1);
555                    PowerShellSyntaxKind::RightParen
556                }
557                '[' => {
558                    state.advance(1);
559                    PowerShellSyntaxKind::LeftBracket
560                }
561                ']' => {
562                    state.advance(1);
563                    PowerShellSyntaxKind::RightBracket
564                }
565                '{' => {
566                    state.advance(1);
567                    PowerShellSyntaxKind::LeftBrace
568                }
569                '}' => {
570                    state.advance(1);
571                    PowerShellSyntaxKind::RightBrace
572                }
573                '@' => {
574                    state.advance(1);
575                    PowerShellSyntaxKind::At
576                }
577                '`' => {
578                    state.advance(1);
579                    PowerShellSyntaxKind::Backtick
580                }
581                _ => return false,
582            };
583
584            state.add_token(kind, start_pos, state.get_position());
585            true
586        }
587        else {
588            false
589        }
590    }
591}
592
593impl<'config> Lexer<PowerShellLanguage> for PowerShellLexer<'config> {
594    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<PowerShellLanguage>) -> LexOutput<PowerShellLanguage> {
595        let mut state = LexerState::new(source);
596        let result = self.run(&mut state);
597        state.finish_with_cache(result, cache)
598    }
599}