Skip to main content

oak_vlang/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2/// Token types for the VLang language.
3pub mod token_type;
4
5use crate::{language::VLangLanguage, lexer::token_type::VLangTokenType};
6use oak_core::{
7    Lexer, LexerState,
8    lexer::{LexOutput, LexerCache},
9    source::Source,
10};
11
12pub(crate) type State<'a, S> = LexerState<'a, S, VLangLanguage>;
13
14/// VLang lexer implementation.
15#[derive(Clone, Debug)]
16pub struct VLangLexer<'config> {
17    config: &'config VLangLanguage,
18}
19
20impl<'config> VLangLexer<'config> {
21    /// Creates a new `VLangLexer` instance.
22    pub fn new(config: &'config VLangLanguage) -> Self {
23        Self { config }
24    }
25
26    /// Skips whitespace characters.
27    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
28        let start_pos = state.get_position();
29
30        while let Some(ch) = state.peek() {
31            if ch == ' ' || ch == '\t' {
32                state.advance(ch.len_utf8());
33            }
34            else {
35                break;
36            }
37        }
38
39        if state.get_position() > start_pos {
40            state.add_token(VLangTokenType::Whitespace, start_pos, state.get_position());
41            true
42        }
43        else {
44            false
45        }
46    }
47
48    /// Lexes a newline.
49    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
50        let start_pos = state.get_position();
51
52        if let Some('\n') = state.peek() {
53            state.advance(1);
54            state.add_token(VLangTokenType::Newline, start_pos, state.get_position());
55            true
56        }
57        else if let Some('\r') = state.peek() {
58            state.advance(1);
59            if let Some('\n') = state.peek() {
60                state.advance(1);
61            }
62            state.add_token(VLangTokenType::Newline, start_pos, state.get_position());
63            true
64        }
65        else {
66            false
67        }
68    }
69
70    /// Lexes a comment.
71    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
72        let start_pos = state.get_position();
73
74        // Single-line comment //
75        if let Some('/') = state.peek() {
76            if let Some('/') = state.peek_next_n(1) {
77                state.advance(2);
78
79                // Read until end of line
80                while let Some(ch) = state.peek() {
81                    if ch == '\n' || ch == '\r' {
82                        break;
83                    }
84                    state.advance(ch.len_utf8());
85                }
86
87                state.add_token(VLangTokenType::Comment, start_pos, state.get_position());
88                return true;
89            }
90            // Multi-line comment /* */
91            else if let Some('*') = state.peek_next_n(1) {
92                state.advance(2);
93
94                while let Some(ch) = state.peek() {
95                    if ch == '*' {
96                        if let Some('/') = state.peek_next_n(1) {
97                            state.advance(2);
98                            break;
99                        }
100                    }
101                    state.advance(ch.len_utf8());
102                }
103
104                state.add_token(VLangTokenType::Comment, start_pos, state.get_position());
105                return true;
106            }
107        }
108        false
109    }
110
111    /// Lexes a string literal.
112    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
113        let start_pos = state.get_position();
114
115        if let Some(quote) = state.peek() {
116            if quote == '"' || quote == '\'' {
117                state.advance(1);
118                let mut escaped = false;
119
120                while let Some(ch) = state.peek() {
121                    if escaped {
122                        escaped = false;
123                        state.advance(ch.len_utf8());
124                    }
125                    else if ch == '\\' {
126                        escaped = true;
127                        state.advance(1);
128                    }
129                    else if ch == quote {
130                        state.advance(1);
131                        break;
132                    }
133                    else if ch == '\n' || ch == '\r' {
134                        break; // Strings cannot span multiple lines
135                    }
136                    else {
137                        state.advance(ch.len_utf8());
138                    }
139                }
140
141                let token_kind = if quote == '"' { VLangTokenType::StringLiteral } else { VLangTokenType::CharLiteral };
142                state.add_token(token_kind, start_pos, state.get_position());
143                true
144            }
145            else {
146                false
147            }
148        }
149        else {
150            false
151        }
152    }
153
154    /// Lexes a number literal.
155    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
156        let start_pos = state.get_position();
157
158        if let Some(ch) = state.peek() {
159            if ch.is_ascii_digit() {
160                // Integer part
161                while let Some(digit) = state.peek() {
162                    if digit.is_ascii_digit() {
163                        state.advance(1);
164                    }
165                    else {
166                        break;
167                    }
168                }
169
170                // Check for decimal point
171                let mut is_float = false;
172                if let Some('.') = state.peek() {
173                    if let Some(next_ch) = state.peek_next_n(1) {
174                        if next_ch.is_ascii_digit() {
175                            is_float = true;
176                            state.advance(1); // Skip dot
177
178                            // Fractional part
179                            while let Some(digit) = state.peek() {
180                                if digit.is_ascii_digit() {
181                                    state.advance(1);
182                                }
183                                else {
184                                    break;
185                                }
186                            }
187                        }
188                    }
189                }
190
191                // Check for exponent
192                if let Some(e) = state.peek() {
193                    if e == 'e' || e == 'E' {
194                        let exp_start = state.get_position();
195                        state.advance(1);
196
197                        // Optional sign
198                        if let Some(sign) = state.peek() {
199                            if sign == '+' || sign == '-' {
200                                state.advance(1);
201                            }
202                        }
203
204                        // Exponent digits
205                        let mut has_exp_digits = false;
206                        while let Some(digit) = state.peek() {
207                            if digit.is_ascii_digit() {
208                                has_exp_digits = true;
209                                state.advance(1);
210                            }
211                            else {
212                                break;
213                            }
214                        }
215
216                        if has_exp_digits {
217                            is_float = true;
218                        }
219                        else {
220                            // Backtrack to exponent start position
221                            state.set_position(exp_start);
222                        }
223                    }
224                }
225
226                let token_kind = if is_float { VLangTokenType::FloatLiteral } else { VLangTokenType::IntegerLiteral };
227                state.add_token(token_kind, start_pos, state.get_position());
228                true
229            }
230            else {
231                false
232            }
233        }
234        else {
235            false
236        }
237    }
238
239    /// Lexes identifiers and keywords.
240    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
241        let start_pos = state.get_position();
242
243        if let Some(ch) = state.peek() {
244            if ch.is_ascii_alphabetic() || ch == '_' {
245                while let Some(ch) = state.peek() {
246                    if ch.is_ascii_alphanumeric() || ch == '_' {
247                        state.advance(ch.len_utf8());
248                    }
249                    else {
250                        break;
251                    }
252                }
253
254                let text = state.get_text_in((start_pos..state.get_position()).into());
255                let token_kind = match text.as_ref() {
256                    "module" => VLangTokenType::ModuleKw,
257                    "import" => VLangTokenType::ImportKw,
258                    "pub" => VLangTokenType::PubKw,
259                    "fn" => VLangTokenType::FnKw,
260                    "struct" => VLangTokenType::StructKw,
261                    "interface" => VLangTokenType::InterfaceKw,
262                    "enum" => VLangTokenType::EnumKw,
263                    "type" => VLangTokenType::TypeKw,
264                    "const" => VLangTokenType::ConstKw,
265                    "mut" => VLangTokenType::MutKw,
266                    "shared" => VLangTokenType::SharedKw,
267                    "volatile" => VLangTokenType::VolatileKw,
268                    "unsafe" => VLangTokenType::UnsafeKw,
269                    "if" => VLangTokenType::IfKw,
270                    "else" => VLangTokenType::ElseKw,
271                    "for" => VLangTokenType::ForKw,
272                    "in" => VLangTokenType::InKw,
273                    "match" => VLangTokenType::MatchKw,
274                    "or" => VLangTokenType::OrKw,
275                    "return" => VLangTokenType::ReturnKw,
276                    "break" => VLangTokenType::BreakKw,
277                    "continue" => VLangTokenType::ContinueKw,
278                    "goto" => VLangTokenType::GotoKw,
279                    "defer" => VLangTokenType::DeferKw,
280                    "go" => VLangTokenType::GoKw,
281                    "select" => VLangTokenType::SelectKw,
282                    "lock" => VLangTokenType::LockKw,
283                    "rlock" => VLangTokenType::RlockKw,
284                    "as" => VLangTokenType::AsKw,
285                    "is" => VLangTokenType::IsKw,
286                    "sizeof" => VLangTokenType::SizeofKw,
287                    "typeof" => VLangTokenType::TypeofKw,
288                    "offsetof" => VLangTokenType::OffsetofKw,
289                    "assert" => VLangTokenType::AssertKw,
290                    "panic" => VLangTokenType::PanicKw,
291                    "eprintln" => VLangTokenType::EprintlnKw,
292                    "println" => VLangTokenType::PrintlnKw,
293                    "print" => VLangTokenType::PrintKw,
294                    "eprint" => VLangTokenType::EprintKw,
295                    "bool" => VLangTokenType::BoolKw,
296                    "i8" => VLangTokenType::I8Kw,
297                    "i16" => VLangTokenType::I16Kw,
298                    "i32" => VLangTokenType::I32Kw,
299                    "i64" => VLangTokenType::I64Kw,
300                    "u8" => VLangTokenType::U8Kw,
301                    "u16" => VLangTokenType::U16Kw,
302                    "u32" => VLangTokenType::U32Kw,
303                    "u64" => VLangTokenType::U64Kw,
304                    "int" => VLangTokenType::IntKw,
305                    "uint" => VLangTokenType::UintKw,
306                    "f32" => VLangTokenType::F32Kw,
307                    "f64" => VLangTokenType::F64Kw,
308                    "string" => VLangTokenType::StringKw,
309                    "rune" => VLangTokenType::RuneKw,
310                    "byte" => VLangTokenType::ByteKw,
311                    "voidptr" => VLangTokenType::VoidptrKw,
312                    "char" => VLangTokenType::CharKw,
313                    "true" | "false" => VLangTokenType::BoolLiteral,
314                    _ => VLangTokenType::Identifier,
315                };
316
317                state.add_token(token_kind, start_pos, state.get_position());
318                true
319            }
320            else {
321                false
322            }
323        }
324        else {
325            false
326        }
327    }
328
329    /// Lexes operators and punctuation.
330    fn lex_operator<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
331        let start_pos = state.get_position();
332
333        if let Some(ch) = state.peek() {
334            let token_kind = match ch {
335                '+' => {
336                    if let Some('=') = state.peek_next_n(1) {
337                        state.advance(2);
338                        VLangTokenType::PlusEq
339                    }
340                    else if let Some('+') = state.peek_next_n(1) {
341                        state.advance(2);
342                        VLangTokenType::PlusPlus
343                    }
344                    else {
345                        state.advance(1);
346                        VLangTokenType::Plus
347                    }
348                }
349                '-' => {
350                    if let Some('=') = state.peek_next_n(1) {
351                        state.advance(2);
352                        VLangTokenType::MinusEq
353                    }
354                    else if let Some('-') = state.peek_next_n(1) {
355                        state.advance(2);
356                        VLangTokenType::MinusMinus
357                    }
358                    else if let Some('>') = state.peek_next_n(1) {
359                        state.advance(2);
360                        VLangTokenType::Arrow
361                    }
362                    else {
363                        state.advance(1);
364                        VLangTokenType::Minus
365                    }
366                }
367                '*' => {
368                    if let Some('=') = state.peek_next_n(1) {
369                        state.advance(2);
370                        VLangTokenType::StarEq
371                    }
372                    else {
373                        state.advance(1);
374                        VLangTokenType::Star
375                    }
376                }
377                '/' => {
378                    if let Some('=') = state.peek_next_n(1) {
379                        state.advance(2);
380                        VLangTokenType::SlashEq
381                    }
382                    else {
383                        state.advance(1);
384                        VLangTokenType::Slash
385                    }
386                }
387                '%' => {
388                    if let Some('=') = state.peek_next_n(1) {
389                        state.advance(2);
390                        VLangTokenType::PercentEq
391                    }
392                    else {
393                        state.advance(1);
394                        VLangTokenType::Percent
395                    }
396                }
397                '&' => {
398                    if let Some('=') = state.peek_next_n(1) {
399                        state.advance(2);
400                        VLangTokenType::AmpersandEq
401                    }
402                    else if let Some('&') = state.peek_next_n(1) {
403                        state.advance(2);
404                        VLangTokenType::AndAnd
405                    }
406                    else {
407                        state.advance(1);
408                        VLangTokenType::Ampersand
409                    }
410                }
411                '|' => {
412                    if let Some('=') = state.peek_next_n(1) {
413                        state.advance(2);
414                        VLangTokenType::PipeEq
415                    }
416                    else if let Some('|') = state.peek_next_n(1) {
417                        state.advance(2);
418                        VLangTokenType::OrOr
419                    }
420                    else {
421                        state.advance(1);
422                        VLangTokenType::Pipe
423                    }
424                }
425                '^' => {
426                    if let Some('=') = state.peek_next_n(1) {
427                        state.advance(2);
428                        VLangTokenType::CaretEq
429                    }
430                    else {
431                        state.advance(1);
432                        VLangTokenType::Caret
433                    }
434                }
435                '=' => {
436                    if let Some('=') = state.peek_next_n(1) {
437                        state.advance(2);
438                        VLangTokenType::EqEq
439                    }
440                    else if let Some('>') = state.peek_next_n(1) {
441                        state.advance(2);
442                        VLangTokenType::FatArrow
443                    }
444                    else {
445                        state.advance(1);
446                        VLangTokenType::Eq
447                    }
448                }
449                '!' => {
450                    if let Some('=') = state.peek_next_n(1) {
451                        state.advance(2);
452                        VLangTokenType::Ne
453                    }
454                    else {
455                        state.advance(1);
456                        VLangTokenType::Bang
457                    }
458                }
459                '<' => {
460                    if let Some('=') = state.peek_next_n(1) {
461                        state.advance(2);
462                        VLangTokenType::Le
463                    }
464                    else if let Some('<') = state.peek_next_n(1) {
465                        if let Some('=') = state.peek_next_n(2) {
466                            state.advance(3);
467                            VLangTokenType::LeftShiftEq
468                        }
469                        else {
470                            state.advance(2);
471                            VLangTokenType::LeftShift
472                        }
473                    }
474                    else {
475                        state.advance(1);
476                        VLangTokenType::LessThan
477                    }
478                }
479                '>' => {
480                    if let Some('=') = state.peek_next_n(1) {
481                        state.advance(2);
482                        VLangTokenType::Ge
483                    }
484                    else if let Some('>') = state.peek_next_n(1) {
485                        if let Some('=') = state.peek_next_n(2) {
486                            state.advance(3);
487                            VLangTokenType::RightShiftEq
488                        }
489                        else {
490                            state.advance(2);
491                            VLangTokenType::RightShift
492                        }
493                    }
494                    else {
495                        state.advance(1);
496                        VLangTokenType::GreaterThan
497                    }
498                }
499                '.' => {
500                    if let Some('.') = state.peek_next_n(1) {
501                        if let Some('.') = state.peek_next_n(2) {
502                            state.advance(3);
503                            VLangTokenType::DotDotDot
504                        }
505                        else {
506                            state.advance(2);
507                            VLangTokenType::DotDot
508                        }
509                    }
510                    else {
511                        state.advance(1);
512                        VLangTokenType::Dot
513                    }
514                }
515                ',' => {
516                    state.advance(1);
517                    VLangTokenType::Comma
518                }
519                ':' => {
520                    state.advance(1);
521                    VLangTokenType::Colon
522                }
523                ';' => {
524                    state.advance(1);
525                    VLangTokenType::Semicolon
526                }
527                '(' => {
528                    state.advance(1);
529                    VLangTokenType::LeftParen
530                }
531                ')' => {
532                    state.advance(1);
533                    VLangTokenType::RightParen
534                }
535                '[' => {
536                    state.advance(1);
537                    VLangTokenType::LeftBracket
538                }
539                ']' => {
540                    state.advance(1);
541                    VLangTokenType::RightBracket
542                }
543                '{' => {
544                    state.advance(1);
545                    VLangTokenType::LeftBrace
546                }
547                '}' => {
548                    state.advance(1);
549                    VLangTokenType::RightBrace
550                }
551                '?' => {
552                    state.advance(1);
553                    VLangTokenType::Question
554                }
555                '~' => {
556                    state.advance(1);
557                    VLangTokenType::Tilde
558                }
559                _ => {
560                    state.advance(ch.len_utf8());
561                    VLangTokenType::Error
562                }
563            };
564
565            state.add_token(token_kind, start_pos, state.get_position());
566            true
567        }
568        else {
569            false
570        }
571    }
572}
573
574impl<'config> Lexer<VLangLanguage> for VLangLexer<'config> {
575    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<VLangLanguage>) -> LexOutput<VLangLanguage> {
576        let mut state = State::new_with_cache(source, 0, cache);
577
578        while let Some(_ch) = state.peek() {
579            if self.skip_whitespace(&mut state) {
580                continue;
581            }
582
583            if self.lex_newline(&mut state) {
584                continue;
585            }
586
587            if self.lex_comment(&mut state) {
588                continue;
589            }
590
591            if self.lex_string(&mut state) {
592                continue;
593            }
594
595            if self.lex_number(&mut state) {
596                continue;
597            }
598
599            if self.lex_identifier_or_keyword(&mut state) {
600                continue;
601            }
602
603            if self.lex_operator(&mut state) {
604                continue;
605            }
606
607            // If no rules match, skip the current character and mark it as an error
608            let start_pos = state.get_position();
609            if let Some(ch) = state.peek() {
610                state.advance(ch.len_utf8());
611                state.add_token(VLangTokenType::Error, start_pos, state.get_position());
612            }
613        }
614
615        state.add_eof();
616        state.finish_with_cache(Ok(()), cache)
617    }
618}