Skip to main content

oak_powershell/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2/// Token type definitions.
3pub mod token_type;
4
5use crate::{language::PowerShellLanguage, lexer::token_type::PowerShellTokenType};
6use oak_core::{
7    Lexer, LexerCache, LexerState, OakError,
8    lexer::LexOutput,
9    source::{Source, TextEdit},
10};
11
12pub(crate) type State<'a, S> = LexerState<'a, S, PowerShellLanguage>;
13
14/// Lexer for the PowerShell language.
15#[derive(Clone)]
16pub struct PowerShellLexer<'config> {
17    /// The language configuration.
18    pub config: &'config PowerShellLanguage,
19}
20
21impl<'config> PowerShellLexer<'config> {
22    /// Creates a new `PowerShellLexer`.
23    pub fn new(config: &'config PowerShellLanguage) -> Self {
24        Self { config }
25    }
26
27    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
28        while state.not_at_end() {
29            if self.skip_whitespace(state) {
30                continue;
31            }
32
33            if self.lex_newline(state) {
34                continue;
35            }
36
37            if self.lex_comment(state) {
38                continue;
39            }
40
41            if self.lex_string(state) {
42                continue;
43            }
44
45            if self.lex_number(state) {
46                continue;
47            }
48
49            if self.lex_variable(state) {
50                continue;
51            }
52
53            if self.lex_identifier_or_keyword(state) {
54                continue;
55            }
56
57            if self.lex_operators_and_punctuation(state) {
58                continue;
59            }
60
61            // If no rules match, skip the current character
62            if let Some(ch) = state.peek() {
63                let start_pos = state.get_position();
64                state.advance(ch.len_utf8());
65                state.add_token(PowerShellTokenType::Error, start_pos, state.get_position());
66            }
67            else {
68                // Exit loop if at the end of the file
69                break;
70            }
71        }
72
73        // Add EOF token
74        let pos = state.get_position();
75        state.add_token(PowerShellTokenType::Eof, pos, pos);
76
77        Ok(())
78    }
79
80    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
81        let start_pos = state.get_position();
82
83        while let Some(ch) = state.peek() {
84            if ch == ' ' || ch == '\t' {
85                state.advance(ch.len_utf8());
86            }
87            else {
88                break;
89            }
90        }
91
92        if state.get_position() > start_pos {
93            state.add_token(PowerShellTokenType::Whitespace, start_pos, state.get_position());
94            true
95        }
96        else {
97            false
98        }
99    }
100
101    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
102        let start_pos = state.get_position();
103
104        if let Some('\n') = state.peek() {
105            state.advance(1);
106            state.add_token(PowerShellTokenType::Newline, start_pos, state.get_position());
107            true
108        }
109        else if let Some('\r') = state.peek() {
110            state.advance(1);
111            if let Some('\n') = state.peek() {
112                state.advance(1);
113            }
114            state.add_token(PowerShellTokenType::Newline, start_pos, state.get_position());
115            true
116        }
117        else {
118            false
119        }
120    }
121
122    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
123        let start_pos = state.get_position();
124
125        if let Some('#') = state.peek() {
126            state.advance(1);
127            // Single-line comment
128            while let Some(ch) = state.peek() {
129                if ch == '\n' || ch == '\r' {
130                    break;
131                }
132                state.advance(ch.len_utf8());
133            }
134            state.add_token(PowerShellTokenType::Comment, start_pos, state.get_position());
135            true
136        }
137        else if let Some('<') = state.peek() {
138            state.advance(1);
139            if let Some('#') = state.peek() {
140                state.advance(1);
141                // Multi-line comment <# ... #>
142                let mut depth = 1;
143                while let Some(ch) = state.peek() {
144                    if depth == 0 {
145                        break;
146                    }
147                    if ch == '<' {
148                        state.advance(1);
149                        if let Some('#') = state.peek() {
150                            state.advance(1);
151                            depth += 1;
152                        }
153                    }
154                    else if ch == '#' {
155                        state.advance(1);
156                        if let Some('>') = state.peek() {
157                            state.advance(1);
158                            depth -= 1;
159                        }
160                    }
161                    else {
162                        state.advance(ch.len_utf8());
163                    }
164                }
165                state.add_token(PowerShellTokenType::Comment, start_pos, state.get_position());
166                true
167            }
168            else {
169                // Backtrack, not a comment
170                state.set_position(start_pos);
171                false
172            }
173        }
174        else {
175            false
176        }
177    }
178
179    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
180        let start_pos = state.get_position();
181
182        if let Some(quote_char) = state.peek() {
183            if quote_char == '"' || quote_char == '\'' {
184                state.advance(1); // Skip opening quote
185
186                let mut escaped = false;
187                while let Some(ch) = state.peek() {
188                    if escaped {
189                        escaped = false;
190                        state.advance(ch.len_utf8());
191                    }
192                    else if ch == '`' {
193                        // PowerShell uses backtick as escape character
194                        escaped = true;
195                        state.advance(1);
196                    }
197                    else if ch == quote_char {
198                        state.advance(1); // Skip closing quote
199                        break;
200                    }
201                    else if ch == '\n' || ch == '\r' {
202                        // Strings can span multiple lines
203                        state.advance(ch.len_utf8());
204                    }
205                    else {
206                        state.advance(ch.len_utf8());
207                    }
208                }
209
210                state.add_token(PowerShellTokenType::StringLiteral, start_pos, state.get_position());
211                true
212            }
213            else {
214                false
215            }
216        }
217        else {
218            false
219        }
220    }
221
222    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
223        if let Some(ch) = state.peek() {
224            if ch.is_ascii_digit() {
225                let start_pos = state.get_position();
226
227                // Read integer part
228                while let Some(ch) = state.peek() {
229                    if ch.is_ascii_digit() {
230                        state.advance(1);
231                    }
232                    else {
233                        break;
234                    }
235                }
236
237                // Check for decimal point
238                if let Some('.') = state.peek() {
239                    state.advance(1);
240                    // Read fractional part
241                    while let Some(ch) = state.peek() {
242                        if ch.is_ascii_digit() {
243                            state.advance(1);
244                        }
245                        else {
246                            break;
247                        }
248                    }
249                }
250
251                // Check for scientific notation
252                if let Some(ch) = state.peek() {
253                    if ch == 'e' || ch == 'E' {
254                        state.advance(1);
255                        if let Some(ch) = state.peek() {
256                            if ch == '+' || ch == '-' {
257                                state.advance(1);
258                            }
259                        }
260                        while let Some(ch) = state.peek() {
261                            if ch.is_ascii_digit() {
262                                state.advance(1);
263                            }
264                            else {
265                                break;
266                            }
267                        }
268                    }
269                }
270
271                state.add_token(PowerShellTokenType::NumberLiteral, start_pos, state.get_position());
272                true
273            }
274            else {
275                false
276            }
277        }
278        else {
279            false
280        }
281    }
282
283    fn lex_variable<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
284        let start_pos = state.get_position();
285
286        if let Some('$') = state.peek() {
287            state.advance(1);
288
289            // Variable name must start with letter or underscore
290            if let Some(ch) = state.peek() {
291                if ch.is_alphabetic() || ch == '_' {
292                    state.advance(ch.len_utf8());
293
294                    // Subsequent characters can be alphanumeric or underscore
295                    while let Some(ch) = state.peek() {
296                        if ch.is_alphanumeric() || ch == '_' {
297                            state.advance(ch.len_utf8());
298                        }
299                        else {
300                            break;
301                        }
302                    }
303
304                    state.add_token(PowerShellTokenType::Variable, start_pos, state.get_position());
305                    true
306                }
307                else {
308                    // Only $ sign, treat as operator
309                    state.add_token(PowerShellTokenType::Dollar, start_pos, state.get_position());
310                    true
311                }
312            }
313            else {
314                state.add_token(PowerShellTokenType::Dollar, start_pos, state.get_position());
315                true
316            }
317        }
318        else {
319            false
320        }
321    }
322
323    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
324        if let Some(ch) = state.peek() {
325            if ch.is_alphabetic() || ch == '_' {
326                let start_pos = state.get_position();
327                let mut text = String::new();
328
329                // Read identifier
330                while let Some(ch) = state.peek() {
331                    if ch.is_alphanumeric() || ch == '_' || ch == '-' {
332                        text.push(ch);
333                        state.advance(ch.len_utf8());
334                    }
335                    else {
336                        break;
337                    }
338                }
339
340                // Check for keywords
341                let kind = match text.as_str() {
342                    "begin" => PowerShellTokenType::Begin,
343                    "break" => PowerShellTokenType::Break,
344                    "catch" => PowerShellTokenType::Catch,
345                    "class" => PowerShellTokenType::Class,
346                    "continue" => PowerShellTokenType::Continue,
347                    "data" => PowerShellTokenType::Data,
348                    "define" => PowerShellTokenType::Define,
349                    "do" => PowerShellTokenType::Do,
350                    "dynamicparam" => PowerShellTokenType::DynamicParam,
351                    "else" => PowerShellTokenType::Else,
352                    "elseif" => PowerShellTokenType::ElseIf,
353                    "end" => PowerShellTokenType::End,
354                    "exit" => PowerShellTokenType::Exit,
355                    "filter" => PowerShellTokenType::Filter,
356                    "finally" => PowerShellTokenType::Finally,
357                    "for" => PowerShellTokenType::For,
358                    "foreach" => PowerShellTokenType::ForEach,
359                    "from" => PowerShellTokenType::From,
360                    "function" => PowerShellTokenType::Function,
361                    "if" => PowerShellTokenType::If,
362                    "in" => PowerShellTokenType::In,
363                    "param" => PowerShellTokenType::Param,
364                    "process" => PowerShellTokenType::Process,
365                    "return" => PowerShellTokenType::Return,
366                    "switch" => PowerShellTokenType::Switch,
367                    "throw" => PowerShellTokenType::Throw,
368                    "trap" => PowerShellTokenType::Trap,
369                    "try" => PowerShellTokenType::Try,
370                    "until" => PowerShellTokenType::Until,
371                    "using" => PowerShellTokenType::Using,
372                    "var" => PowerShellTokenType::Var,
373                    "while" => PowerShellTokenType::While,
374                    "workflow" => PowerShellTokenType::Workflow,
375                    "true" => PowerShellTokenType::BooleanLiteral,
376                    "false" => PowerShellTokenType::BooleanLiteral,
377                    "null" => PowerShellTokenType::NullLiteral,
378                    _ => PowerShellTokenType::Identifier,
379                };
380
381                state.add_token(kind, start_pos, state.get_position());
382                true
383            }
384            else {
385                false
386            }
387        }
388        else {
389            false
390        }
391    }
392
393    fn lex_operators_and_punctuation<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
394        if let Some(ch) = state.peek() {
395            let start_pos = state.get_position();
396
397            let kind = match ch {
398                '+' => {
399                    state.advance(1);
400                    if let Some('+') = state.peek() {
401                        state.advance(1);
402                        PowerShellTokenType::Plus
403                    }
404                    else if let Some('=') = state.peek() {
405                        state.advance(1);
406                        PowerShellTokenType::Equal
407                    }
408                    else {
409                        PowerShellTokenType::Plus
410                    }
411                }
412                '-' => {
413                    state.advance(1);
414                    if let Some('-') = state.peek() {
415                        state.advance(1);
416                        PowerShellTokenType::Minus
417                    }
418                    else if let Some('=') = state.peek() {
419                        state.advance(1);
420                        PowerShellTokenType::Equal
421                    }
422                    else {
423                        PowerShellTokenType::Minus
424                    }
425                }
426                '*' => {
427                    state.advance(1);
428                    if let Some('=') = state.peek() {
429                        state.advance(1);
430                        PowerShellTokenType::Equal
431                    }
432                    else {
433                        PowerShellTokenType::Multiply
434                    }
435                }
436                '/' => {
437                    state.advance(1);
438                    if let Some('=') = state.peek() {
439                        state.advance(1);
440                        PowerShellTokenType::Equal
441                    }
442                    else {
443                        PowerShellTokenType::Divide
444                    }
445                }
446                '%' => {
447                    state.advance(1);
448                    if let Some('=') = state.peek() {
449                        state.advance(1);
450                        PowerShellTokenType::Equal
451                    }
452                    else {
453                        PowerShellTokenType::Modulo
454                    }
455                }
456                '=' => {
457                    state.advance(1);
458                    if let Some('=') = state.peek() {
459                        state.advance(1);
460                        PowerShellTokenType::Equal
461                    }
462                    else {
463                        PowerShellTokenType::Equal
464                    }
465                }
466                '!' => {
467                    state.advance(1);
468                    if let Some('=') = state.peek() {
469                        state.advance(1);
470                        PowerShellTokenType::NotEqual
471                    }
472                    else {
473                        PowerShellTokenType::Exclamation
474                    }
475                }
476                '<' => {
477                    state.advance(1);
478                    if let Some('=') = state.peek() {
479                        state.advance(1);
480                        PowerShellTokenType::LessEqual
481                    }
482                    else {
483                        PowerShellTokenType::LessThan
484                    }
485                }
486                '>' => {
487                    state.advance(1);
488                    if let Some('=') = state.peek() {
489                        state.advance(1);
490                        PowerShellTokenType::GreaterEqual
491                    }
492                    else {
493                        PowerShellTokenType::GreaterThan
494                    }
495                }
496                '&' => {
497                    state.advance(1);
498                    if let Some('&') = state.peek() {
499                        state.advance(1);
500                        PowerShellTokenType::And
501                    }
502                    else {
503                        PowerShellTokenType::Ampersand
504                    }
505                }
506                '|' => {
507                    state.advance(1);
508                    if let Some('|') = state.peek() {
509                        state.advance(1);
510                        PowerShellTokenType::Or
511                    }
512                    else {
513                        PowerShellTokenType::Pipe
514                    }
515                }
516                '^' => {
517                    state.advance(1);
518                    PowerShellTokenType::Xor
519                }
520                '~' => {
521                    state.advance(1);
522                    PowerShellTokenType::Not
523                }
524                '?' => {
525                    state.advance(1);
526                    PowerShellTokenType::Question
527                }
528                ':' => {
529                    state.advance(1);
530                    if let Some(':') = state.peek() {
531                        state.advance(1);
532                        PowerShellTokenType::DoubleColon
533                    }
534                    else {
535                        PowerShellTokenType::Colon
536                    }
537                }
538                ';' => {
539                    state.advance(1);
540                    PowerShellTokenType::Semicolon
541                }
542                ',' => {
543                    state.advance(1);
544                    PowerShellTokenType::Comma
545                }
546                '.' => {
547                    state.advance(1);
548                    if let Some('.') = state.peek() {
549                        state.advance(1);
550                        PowerShellTokenType::DotDot
551                    }
552                    else {
553                        PowerShellTokenType::Dot
554                    }
555                }
556                '(' => {
557                    state.advance(1);
558                    PowerShellTokenType::LeftParen
559                }
560                ')' => {
561                    state.advance(1);
562                    PowerShellTokenType::RightParen
563                }
564                '[' => {
565                    state.advance(1);
566                    PowerShellTokenType::LeftBracket
567                }
568                ']' => {
569                    state.advance(1);
570                    PowerShellTokenType::RightBracket
571                }
572                '{' => {
573                    state.advance(1);
574                    PowerShellTokenType::LeftBrace
575                }
576                '}' => {
577                    state.advance(1);
578                    PowerShellTokenType::RightBrace
579                }
580                '@' => {
581                    state.advance(1);
582                    PowerShellTokenType::At
583                }
584                '`' => {
585                    state.advance(1);
586                    PowerShellTokenType::Backtick
587                }
588                _ => return false,
589            };
590
591            state.add_token(kind, start_pos, state.get_position());
592            true
593        }
594        else {
595            false
596        }
597    }
598}
599
600impl<'config> Lexer<PowerShellLanguage> for PowerShellLexer<'config> {
601    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<PowerShellLanguage>) -> LexOutput<PowerShellLanguage> {
602        let mut state = LexerState::new(source);
603        let result = self.run(&mut state);
604        state.finish_with_cache(result, cache)
605    }
606}