Skip to main content

oak_vbnet/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2
3use crate::language::VbNetLanguage;
4
5/// Token types and definitions for the VB.NET lexer.
6pub mod token_type;
7
8use oak_core::{
9    Lexer, LexerCache, LexerState,
10    lexer::LexOutput,
11    source::{Source, TextEdit},
12};
13pub use token_type::VbNetTokenType;
14
/// Shorthand for the `oak_core` [`LexerState`] specialised to [`VbNetLanguage`];
/// every lexing routine in this module operates on a `&mut State`.
pub(crate) type State<'a, S> = LexerState<'a, S, VbNetLanguage>;
16
/// Lexer that splits VB.NET source text into [`VbNetTokenType`] tokens.
///
/// The lexer borrows its language configuration for the lifetime `'config`.
pub struct VbNetLexer<'config> {
    /// Language configuration supplied to [`VbNetLexer::new`].
    // NOTE(review): no routine in this file reads `config` yet.
    config: &'config VbNetLanguage,
}
21
22impl<'config> VbNetLexer<'config> {
23    /// Creates a new VB.NET lexer
24    pub fn new(config: &'config VbNetLanguage) -> Self {
25        Self { config }
26    }
27
28    /// Skips whitespace characters (spaces and tabs).
29    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
30        let start_pos = state.get_position();
31
32        while let Some(ch) = state.peek() {
33            if ch == ' ' || ch == '\t' {
34                state.advance(ch.len_utf8());
35            }
36            else {
37                break;
38            }
39        }
40
41        if state.get_position() > start_pos {
42            state.add_token(VbNetTokenType::Whitespace, start_pos, state.get_position());
43            true
44        }
45        else {
46            false
47        }
48    }
49
50    /// Lexes a newline character (LF or CRLF).
51    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
52        let start_pos = state.get_position();
53
54        if let Some('\n') = state.peek() {
55            state.advance(1);
56            state.add_token(VbNetTokenType::Newline, start_pos, state.get_position());
57            true
58        }
59        else if let Some('\r') = state.peek() {
60            state.advance(1);
61            if let Some('\n') = state.peek() {
62                state.advance(1);
63            }
64            state.add_token(VbNetTokenType::Newline, start_pos, state.get_position());
65            true
66        }
67        else {
68            false
69        }
70    }
71
72    /// Lexes a comment (single-line `'` or multi-line `''' ... '''`).
73    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
74        let start_pos = state.get_position();
75
76        if let Some('\'') = state.peek() {
77            state.advance(1);
78            // Check if it's a block comment (three single quotes)
79            if let Some('\'') = state.peek() {
80                state.advance(1);
81                if let Some('\'') = state.peek() {
82                    state.advance(1);
83                    // Parse until closing three single quotes
84                    while state.not_at_end() {
85                        if let Some('\'') = state.peek() {
86                            state.advance(1);
87                            if let Some('\'') = state.peek() {
88                                state.advance(1);
89                                if let Some('\'') = state.peek() {
90                                    state.advance(1);
91                                    break;
92                                }
93                            }
94                        }
95                        else {
96                            state.advance(1);
97                        }
98                    }
99                    state.add_token(VbNetTokenType::BlockComment, start_pos, state.get_position());
100                    return true;
101                }
102                else {
103                    // Backtrack, not a block comment
104                    state.set_position(start_pos);
105                    return false;
106                }
107            }
108            else {
109                // It's a line comment
110                while let Some(ch) = state.peek() {
111                    if ch == '\n' || ch == '\r' {
112                        break;
113                    }
114                    state.advance(ch.len_utf8());
115                }
116                state.add_token(VbNetTokenType::LineComment, start_pos, state.get_position());
117                return true;
118            }
119        }
120        false
121    }
122
123    /// Lexes a string literal (`"..."`).
124    /// Handles basic escape sequences.
125    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
126        let start_pos = state.get_position();
127
128        if let Some('"') = state.peek() {
129            state.advance(1);
130            while let Some(ch) = state.peek() {
131                if ch == '"' {
132                    // Check for double quote escape
133                    state.advance(1);
134                    if let Some('"') = state.peek() {
135                        state.advance(1);
136                        continue;
137                    }
138                    break;
139                }
140                else {
141                    state.advance(ch.len_utf8())
142                }
143            }
144            state.add_token(VbNetTokenType::StringLiteral, start_pos, state.get_position());
145            true
146        }
147        else {
148            false
149        }
150    }
151
152    /// Lexes a character literal (`'...'`).
153    fn lex_char<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
154        let start_pos = state.get_position();
155
156        if let Some('\'') = state.peek() {
157            state.advance(1);
158            if let Some(ch) = state.peek() {
159                if ch != '\'' {
160                    state.advance(ch.len_utf8());
161                    // Check for escape sequence
162                    if let Some('\\') = state.peek() {
163                        state.advance(1);
164                        if let Some(_) = state.peek() {
165                            state.advance(1)
166                        }
167                    }
168                    // Check for closing single quote
169                    if let Some('\'') = state.peek() {
170                        state.advance(1);
171                        state.add_token(VbNetTokenType::CharLiteral, start_pos, state.get_position());
172                        return true;
173                    }
174                }
175            }
176            // If we get here, it's not a valid char literal, backtrack
177            state.set_position(start_pos);
178            false
179        }
180        else {
181            false
182        }
183    }
184
185    /// Lexes a date literal (`#...#`).
186    fn lex_date<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
187        let start_pos = state.get_position();
188
189        if let Some('#') = state.peek() {
190            state.advance(1);
191            while let Some(ch) = state.peek() {
192                if ch == '#' {
193                    state.advance(1);
194                    break;
195                }
196                state.advance(ch.len_utf8());
197            }
198            state.add_token(VbNetTokenType::DateLiteral, start_pos, state.get_position());
199            true
200        }
201        else {
202            false
203        }
204    }
205
206    /// Lexes a number literal.
207    ///
208    /// Supports:
209    /// - Decimal integers (`123`)
210    /// - Floating-point numbers (`123.45`, `1.2e3`)
211    /// - Underscore separators (`1_000_000`)
212    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
213        let start_pos = state.get_position();
214
215        if let Some(ch) = state.peek() {
216            if ch.is_numeric() {
217                state.advance(ch.len_utf8());
218
219                let mut has_dot = false;
220                let mut has_e = false;
221
222                while let Some(ch) = state.peek() {
223                    if ch.is_numeric() {
224                        state.advance(ch.len_utf8());
225                    }
226                    else if ch == '.' && !has_dot {
227                        has_dot = true;
228                        state.advance(1);
229                    }
230                    else if (ch == 'e' || ch == 'E') && !has_e {
231                        has_e = true;
232                        state.advance(1);
233                        // Check for optional sign after e
234                        if let Some(ch) = state.peek() {
235                            if ch == '+' || ch == '-' {
236                                state.advance(1);
237                            }
238                        }
239                    }
240                    else if ch == '_' {
241                        // Underscore separator
242                        state.advance(1);
243                    }
244                    else {
245                        break;
246                    }
247                }
248
249                let token_type = if has_dot || has_e { VbNetTokenType::FloatLiteral } else { VbNetTokenType::IntegerLiteral };
250
251                state.add_token(token_type, start_pos, state.get_position());
252                true
253            }
254            else {
255                false
256            }
257        }
258        else {
259            false
260        }
261    }
262
263    /// Lexes a keyword or identifier.
264    ///
265    /// Identifiers can start with a letter or underscore.
266    /// Subsequent characters can be letters, digits, or underscores.
267    ///
268    /// Keywords are matched against the standard VB.NET keyword list. If a match is found,
269    /// the specific keyword token is returned; otherwise, it is treated as an identifier.
270    fn lex_keyword_or_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
271        let start_pos = state.get_position();
272
273        if let Some(ch) = state.peek() {
274            if ch.is_alphabetic() || ch == '_' {
275                state.advance(ch.len_utf8());
276
277                while let Some(ch) = state.peek() {
278                    if ch.is_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
279                }
280
281                let text = state.get_text_in((start_pos..state.get_position()).into());
282                let token_type = match text.as_ref() {
283                    "Namespace" => VbNetTokenType::Namespace,
284                    "Imports" => VbNetTokenType::Imports,
285                    "Class" => VbNetTokenType::Class,
286                    "Interface" => VbNetTokenType::Interface,
287                    "Structure" => VbNetTokenType::Structure,
288                    "Enum" => VbNetTokenType::Enum,
289                    "Module" => VbNetTokenType::Module,
290                    "Delegate" => VbNetTokenType::Delegate,
291                    "Event" => VbNetTokenType::Event,
292                    "Function" => VbNetTokenType::Function,
293                    "Sub" => VbNetTokenType::Sub,
294                    "Property" => VbNetTokenType::Property,
295                    "Dim" => VbNetTokenType::Dim,
296                    "Const" => VbNetTokenType::Const,
297                    "As" => VbNetTokenType::As,
298                    "In" => VbNetTokenType::In,
299                    "If" => VbNetTokenType::If,
300                    "Then" => VbNetTokenType::Then,
301                    "Else" => VbNetTokenType::Else,
302                    "ElseIf" => VbNetTokenType::ElseIf,
303                    "End" => VbNetTokenType::End,
304                    "For" => VbNetTokenType::For,
305                    "Each" => VbNetTokenType::Each,
306                    "To" => VbNetTokenType::To,
307                    "Step" => VbNetTokenType::Step,
308                    "While" => VbNetTokenType::While,
309                    "Do" => VbNetTokenType::Do,
310                    "Loop" => VbNetTokenType::Loop,
311                    "Until" => VbNetTokenType::Until,
312                    "Select" => VbNetTokenType::Select,
313                    "Case" => VbNetTokenType::Case,
314                    "Default" => VbNetTokenType::Default,
315                    "With" => VbNetTokenType::With,
316                    "Try" => VbNetTokenType::Try,
317                    "Catch" => VbNetTokenType::Catch,
318                    "Finally" => VbNetTokenType::Finally,
319                    "Throw" => VbNetTokenType::Throw,
320                    "Exit" => VbNetTokenType::Exit,
321                    "Continue" => VbNetTokenType::Continue,
322                    "Return" => VbNetTokenType::Return,
323                    "Me" => VbNetTokenType::Me,
324                    "MyBase" => VbNetTokenType::MyBase,
325                    "MyClass" => VbNetTokenType::MyClass,
326                    "New" => VbNetTokenType::New,
327                    "Of" => VbNetTokenType::Of,
328                    "ByVal" => VbNetTokenType::ByVal,
329                    "ByRef" => VbNetTokenType::ByRef,
330                    "Optional" => VbNetTokenType::Optional,
331                    "ParamArray" => VbNetTokenType::ParamArray,
332                    "Public" => VbNetTokenType::Public,
333                    "Private" => VbNetTokenType::Private,
334                    "Protected" => VbNetTokenType::Protected,
335                    "Friend" => VbNetTokenType::Friend,
336                    "ProtectedFriend" => VbNetTokenType::ProtectedFriend,
337                    "Shared" => VbNetTokenType::Shared,
338                    "MustInherit" => VbNetTokenType::MustInherit,
339                    "NotInheritable" => VbNetTokenType::NotInheritable,
340                    "MustOverride" => VbNetTokenType::MustOverride,
341                    "Overridable" => VbNetTokenType::Overridable,
342                    "Overrides" => VbNetTokenType::Overrides,
343                    "NotOverridable" => VbNetTokenType::NotOverridable,
344                    "MustOverrideReadOnly" => VbNetTokenType::MustOverrideReadOnly,
345                    "ReadOnly" => VbNetTokenType::ReadOnly,
346                    "WriteOnly" => VbNetTokenType::WriteOnly,
347                    "Static" => VbNetTokenType::Static,
348                    "Partial" => VbNetTokenType::Partial,
349                    "Async" => VbNetTokenType::Async,
350                    "Await" => VbNetTokenType::Await,
351                    "From" => VbNetTokenType::From,
352                    "Where" => VbNetTokenType::Where,
353                    "Order" => VbNetTokenType::Order,
354                    "By" => VbNetTokenType::By,
355                    "Group" => VbNetTokenType::Group,
356                    "Join" => VbNetTokenType::Join,
357                    "On" => VbNetTokenType::On,
358                    "Into" => VbNetTokenType::Into,
359                    "Let" => VbNetTokenType::Let,
360                    "And" => VbNetTokenType::And,
361                    "Or" => VbNetTokenType::Or,
362                    "Not" => VbNetTokenType::Not,
363                    "Xor" => VbNetTokenType::Xor,
364                    "AndAlso" => VbNetTokenType::AndAlso,
365                    "OrElse" => VbNetTokenType::OrElse,
366                    "Is" => VbNetTokenType::Is,
367                    "IsNot" => VbNetTokenType::IsNot,
368                    "Like" => VbNetTokenType::Like,
369                    "TypeOf" => VbNetTokenType::TypeOf,
370                    "True" => VbNetTokenType::BooleanLiteral,
371                    "False" => VbNetTokenType::BooleanLiteral,
372                    "Nothing" => VbNetTokenType::NothingLiteral,
373                    _ => VbNetTokenType::Identifier,
374                };
375
376                state.add_token(token_type, start_pos, state.get_position());
377                true
378            }
379            else {
380                false
381            }
382        }
383        else {
384            false
385        }
386    }
387
388    /// Lexes an operator.
389    ///
390    /// Handles single-character and multi-character operators.
391    fn lex_operator<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
392        let start_pos = state.get_position();
393
394        if let Some(ch) = state.peek() {
395            let token_type = match ch {
396                '+' => {
397                    state.advance(1);
398                    VbNetTokenType::Plus
399                }
400                '-' => {
401                    state.advance(1);
402                    VbNetTokenType::Minus
403                }
404                '*' => {
405                    state.advance(1);
406                    VbNetTokenType::Star
407                }
408                '/' => {
409                    state.advance(1);
410                    VbNetTokenType::Slash
411                }
412                '\\' => {
413                    state.advance(1);
414                    VbNetTokenType::Backslash
415                }
416                '%' => {
417                    state.advance(1);
418                    VbNetTokenType::Percent
419                }
420                '^' => {
421                    state.advance(1);
422                    VbNetTokenType::Caret
423                }
424                '=' => {
425                    state.advance(1);
426                    VbNetTokenType::Equal
427                }
428                '<' => {
429                    state.advance(1);
430                    if let Some('>') = state.peek() {
431                        state.advance(1);
432                        VbNetTokenType::NotEqual
433                    }
434                    else if let Some('=') = state.peek() {
435                        state.advance(1);
436                        VbNetTokenType::LessEqual
437                    }
438                    else {
439                        VbNetTokenType::LessThan
440                    }
441                }
442                '>' => {
443                    state.advance(1);
444                    if let Some('=') = state.peek() {
445                        state.advance(1);
446                        VbNetTokenType::GreaterEqual
447                    }
448                    else {
449                        VbNetTokenType::GreaterThan
450                    }
451                }
452                '&' => {
453                    state.advance(1);
454                    VbNetTokenType::Ampersand
455                }
456                '!' => {
457                    state.advance(1);
458                    VbNetTokenType::Exclamation
459                }
460                _ => return false,
461            };
462
463            state.add_token(token_type, start_pos, state.get_position());
464            true
465        }
466        else {
467            false
468        }
469    }
470
471    /// Lexes a delimiter.
472    ///
473    /// Handles structural characters such as:
474    /// - Parentheses: `(`, `)`
475    /// - Brackets: `[`, `]`
476    /// - Braces: `{`, `}`
477    /// - Punctuation: `;`, `,`, `.`, `:`
478    fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
479        let start_pos = state.get_position();
480
481        if let Some(ch) = state.peek() {
482            let token_type = match ch {
483                '(' => VbNetTokenType::LeftParen,
484                ')' => VbNetTokenType::RightParen,
485                '[' => VbNetTokenType::LeftBracket,
486                ']' => VbNetTokenType::RightBracket,
487                '{' => VbNetTokenType::LeftBrace,
488                '}' => VbNetTokenType::RightBrace,
489                ';' => VbNetTokenType::Semicolon,
490                ',' => VbNetTokenType::Comma,
491                '.' => VbNetTokenType::Dot,
492                ':' => VbNetTokenType::Colon,
493                '#' => VbNetTokenType::Hash,
494                _ => return false,
495            };
496
497            state.advance(ch.len_utf8());
498            state.add_token(token_type, start_pos, state.get_position());
499            true
500        }
501        else {
502            false
503        }
504    }
505
506    /// Runs the lexer on the input state.
507    ///
508    /// This method performs the main lexing loop, attempting to match various
509    /// token types (whitespace, comments, literals, keywords, etc.) until the
510    /// end of the input is reached.
511    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), oak_core::OakError> {
512        while state.not_at_end() {
513            let safe_point = state.get_position();
514
515            if self.skip_whitespace(state) {
516                continue;
517            }
518
519            if self.lex_newline(state) {
520                continue;
521            }
522
523            if self.lex_comment(state) {
524                continue;
525            }
526
527            if self.lex_string(state) {
528                continue;
529            }
530
531            if self.lex_char(state) {
532                continue;
533            }
534
535            if self.lex_date(state) {
536                continue;
537            }
538
539            if self.lex_number(state) {
540                continue;
541            }
542
543            if self.lex_keyword_or_identifier(state) {
544                continue;
545            }
546
547            if self.lex_operator(state) {
548                continue;
549            }
550
551            if self.lex_delimiter(state) {
552                continue;
553            }
554
555            // If no pattern matches, handle the error character and advance
556            let start_pos = state.get_position();
557            if let Some(ch) = state.peek() {
558                state.advance(ch.len_utf8());
559                state.add_token(VbNetTokenType::Error, start_pos, state.get_position());
560            }
561
562            state.advance_if_dead_lock(safe_point)
563        }
564        Ok(())
565    }
566}
567
568impl<'config> Lexer<VbNetLanguage> for VbNetLexer<'config> {
569    fn lex<'a, S: Source + ?Sized>(&self, text: &S, _edits: &[TextEdit], mut cache: &'a mut impl LexerCache<VbNetLanguage>) -> LexOutput<VbNetLanguage> {
570        let mut state = LexerState::new(text);
571        let result = self.run(&mut state);
572        if result.is_ok() {
573            state.add_eof();
574        }
575        state.finish_with_cache(result, &mut cache)
576    }
577}