Skip to main content

oak_d/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::DLanguage, lexer::token_type::DTokenType};
5use oak_core::{Lexer, LexerCache, LexerState, TextEdit, lexer::LexOutput, source::Source};
6
/// Crate-internal shorthand for `LexerState` specialized to the D language.
pub(crate) type State<'a, S> = LexerState<'a, S, DLanguage>;
8
/// Lexer implementation for D programming language
///
/// Borrows its language configuration for `'config`; deriving `Clone` only
/// copies the reference, so cloning a lexer is cheap.
#[derive(Clone)]
pub struct DLexer<'config> {
    /// Language configuration supplied to [`DLexer::new`].
    config: &'config DLanguage,
}
14
15impl<'config> Lexer<DLanguage> for DLexer<'config> {
16    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<DLanguage>) -> LexOutput<DLanguage> {
17        let mut state = LexerState::new(source);
18        let result = self.run(&mut state);
19        if result.is_ok() {
20            state.add_eof()
21        }
22        state.finish_with_cache(result, cache)
23    }
24}
25
26impl<'config> DLexer<'config> {
27    /// Creates a new DLexer with the given configuration.
28    pub fn new(config: &'config DLanguage) -> Self {
29        Self { config }
30    }
31
32    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), oak_core::OakError> {
33        while state.not_at_end() {
34            let start_pos = state.get_position();
35
36            // Try various lexical rules
37            if self.skip_whitespace(state) {
38                continue;
39            }
40
41            if self.lex_newline(state) {
42                continue;
43            }
44
45            if self.lex_line_comment(state) {
46                continue;
47            }
48
49            if self.lex_block_comment(state) {
50                continue;
51            }
52
53            if self.lex_nested_comment(state) {
54                continue;
55            }
56
57            if self.lex_identifier_or_keyword(state) {
58                continue;
59            }
60
61            if self.lex_number(state) {
62                continue;
63            }
64
65            if self.lex_string(state) {
66                continue;
67            }
68
69            if self.lex_character(state) {
70                continue;
71            }
72
73            if self.lex_operator(state) {
74                continue;
75            }
76
77            if self.lex_delimiter(state) {
78                continue;
79            }
80
81            // If no rules matched, add error token and force advancement to prevent infinite loops
82            state.advance_if_dead_lock(start_pos);
83            if state.get_position() > start_pos {
84                state.add_token(DTokenType::Error, start_pos, state.get_position())
85            }
86        }
87        Ok(())
88    }
89
90    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
91        if let Some(ch) = state.peek() {
92            if ch.is_whitespace() && ch != '\n' && ch != '\r' {
93                let start_pos = state.get_position();
94                while let Some(ch) = state.peek() {
95                    if !ch.is_whitespace() || ch == '\n' || ch == '\r' {
96                        break;
97                    }
98                    state.advance(ch.len_utf8())
99                }
100                state.add_token(DTokenType::Whitespace, start_pos, state.get_position());
101                return true;
102            }
103        }
104        false
105    }
106
107    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
108        if let Some(ch) = state.peek() {
109            if ch == '\n' || ch == '\r' {
110                let start_pos = state.get_position();
111                if ch == '\r' {
112                    state.advance(1);
113                    if state.peek() == Some('\n') {
114                        state.advance(1)
115                    }
116                }
117                else {
118                    state.advance(1)
119                }
120                state.add_token(DTokenType::Newline, start_pos, state.get_position());
121                return true;
122            }
123        }
124        false
125    }
126
    /// Lexes an identifier (`[Alphabetic_][Alphanumeric_]*`, Unicode-aware via
    /// `char::is_alphabetic`/`is_alphanumeric`) and classifies it: reserved
    /// words and built-in type names map to their dedicated token kinds,
    /// anything else becomes `Identifier`.
    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        if let Some(ch) = state.peek() {
            if ch.is_alphabetic() || ch == '_' {
                let start_pos = state.get_position();
                while let Some(ch) = state.peek() {
                    if ch.is_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
                }
                let end_pos = state.get_position();
                let text = state.get_text_in((start_pos..end_pos).into());

                // Keyword table: exact-match lookup on the consumed text.
                // NOTE(review): "safe"/"trusted"/"system"/"nogc"/"property"/
                // "disable" are @-attributes in D; here the bare words are
                // keyworded — confirm the '@' is tokenized separately upstream.
                let kind = match text.as_ref() {
                    "module" => DTokenType::ModuleKeyword,
                    "import" => DTokenType::ImportKeyword,
                    "public" => DTokenType::PublicKeyword,
                    "private" => DTokenType::PrivateKeyword,
                    "protected" => DTokenType::ProtectedKeyword,
                    "package" => DTokenType::PackageKeyword,
                    "export" => DTokenType::ExportKeyword,
                    "static" => DTokenType::StaticKeyword,
                    "final" => DTokenType::FinalKeyword,
                    "abstract" => DTokenType::AbstractKeyword,
                    "override" => DTokenType::OverrideKeyword,
                    "synchronized" => DTokenType::SynchronizedKeyword,
                    "const" => DTokenType::ConstKeyword,
                    "immutable" => DTokenType::ImmutableKeyword,
                    "inout" => DTokenType::InoutKeyword,
                    "shared" => DTokenType::SharedKeyword,
                    "class" => DTokenType::ClassKeyword,
                    "struct" => DTokenType::StructKeyword,
                    "interface" => DTokenType::InterfaceKeyword,
                    "union" => DTokenType::UnionKeyword,
                    "enum" => DTokenType::EnumKeyword,
                    "function" => DTokenType::FunctionKeyword,
                    "delegate" => DTokenType::DelegateKeyword,
                    "if" => DTokenType::IfKeyword,
                    "else" => DTokenType::ElseKeyword,
                    "while" => DTokenType::WhileKeyword,
                    "for" => DTokenType::ForKeyword,
                    "foreach" => DTokenType::ForeachKeyword,
                    "do" => DTokenType::DoKeyword,
                    "switch" => DTokenType::SwitchKeyword,
                    "case" => DTokenType::CaseKeyword,
                    "default" => DTokenType::DefaultKeyword,
                    "break" => DTokenType::BreakKeyword,
                    "continue" => DTokenType::ContinueKeyword,
                    "return" => DTokenType::ReturnKeyword,
                    "goto" => DTokenType::GotoKeyword,
                    "try" => DTokenType::TryKeyword,
                    "catch" => DTokenType::CatchKeyword,
                    "finally" => DTokenType::FinallyKeyword,
                    "throw" => DTokenType::ThrowKeyword,
                    "scope" => DTokenType::ScopeKeyword,
                    "with" => DTokenType::WithKeyword,
                    "asm" => DTokenType::AsmKeyword,
                    "mixin" => DTokenType::MixinKeyword,
                    "template" => DTokenType::TemplateKeyword,
                    "alias" => DTokenType::AliasKeyword,
                    "typeof" => DTokenType::TypeofKeyword,
                    "typeid" => DTokenType::TypeidKeyword,
                    "is" => DTokenType::IsKeyword,
                    "in" => DTokenType::InKeyword,
                    "out" => DTokenType::OutKeyword,
                    "ref" => DTokenType::RefKeyword,
                    "lazy" => DTokenType::LazyKeyword,
                    "auto" => DTokenType::AutoKeyword,
                    "extern" => DTokenType::ExternKeyword,
                    "align" => DTokenType::AlignKeyword,
                    "pragma" => DTokenType::PragmaKeyword,
                    "debug" => DTokenType::DebugKeyword,
                    "version" => DTokenType::VersionKeyword,
                    "unittest" => DTokenType::UnitTestKeyword,
                    "invariant" => DTokenType::InvariantKeyword,
                    "body" => DTokenType::BodyKeyword,
                    "new" => DTokenType::NewKeyword,
                    "delete" => DTokenType::DeleteKeyword,
                    "this" => DTokenType::ThisKeyword,
                    "super" => DTokenType::SuperKeyword,
                    "null" => DTokenType::NullKeyword,
                    "true" => DTokenType::TrueKeyword,
                    "false" => DTokenType::FalseKeyword,
                    "cast" => DTokenType::CastKeyword,
                    // Built-in type names.
                    "void" => DTokenType::VoidType,
                    "bool" => DTokenType::BoolType,
                    "byte" => DTokenType::ByteType,
                    "ubyte" => DTokenType::UbyteType,
                    "short" => DTokenType::ShortType,
                    "ushort" => DTokenType::UshortType,
                    "int" => DTokenType::IntType,
                    "uint" => DTokenType::UintType,
                    "long" => DTokenType::LongType,
                    "ulong" => DTokenType::UlongType,
                    "cent" => DTokenType::CentType,
                    "ucent" => DTokenType::UcentType,
                    "float" => DTokenType::FloatType,
                    "double" => DTokenType::DoubleType,
                    "real" => DTokenType::RealType,
                    "ifloat" => DTokenType::IfloatType,
                    "idouble" => DTokenType::IdoubleType,
                    "ireal" => DTokenType::IrealType,
                    "cfloat" => DTokenType::CfloatType,
                    "cdouble" => DTokenType::CdoubleType,
                    "creal" => DTokenType::CrealType,
                    "char" => DTokenType::CharType,
                    "wchar" => DTokenType::WcharType,
                    "dchar" => DTokenType::DcharType,
                    "string" => DTokenType::StringType,
                    "wstring" => DTokenType::WstringType,
                    "dstring" => DTokenType::DstringType,
                    "typedef" => DTokenType::TypedefKeyword,
                    "pure" => DTokenType::PureKeyword,
                    "nothrow" => DTokenType::NothrowKeyword,
                    "safe" => DTokenType::SafeKeyword,
                    "trusted" => DTokenType::TrustedKeyword,
                    "system" => DTokenType::SystemKeyword,
                    "nogc" => DTokenType::NogcKeyword,
                    "property" => DTokenType::PropertyKeyword,
                    "disable" => DTokenType::DisableKeyword,
                    "deprecated" => DTokenType::DeprecatedKeyword,
                    _ => DTokenType::Identifier,
                };

                state.add_token(kind, start_pos, end_pos);
                return true;
            }
        }
        false
    }
254
255    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
256        if let Some(ch) = state.peek() {
257            if ch.is_ascii_digit() {
258                let start_pos = state.get_position();
259
260                // Handle numbers
261                while let Some(ch) = state.peek() {
262                    if ch.is_ascii_digit() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
263                }
264
265                // Check for decimal point
266                if let Some('.') = state.peek() {
267                    state.advance(1);
268                    while let Some(ch) = state.peek() {
269                        if ch.is_ascii_digit() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
270                    }
271                }
272
273                // Check for exponent
274                if let Some(ch) = state.peek() {
275                    if ch == 'e' || ch == 'E' {
276                        state.advance(1);
277                        if let Some(ch) = state.peek() {
278                            if ch == '+' || ch == '-' {
279                                state.advance(1)
280                            }
281                        }
282                        while let Some(ch) = state.peek() {
283                            if ch.is_ascii_digit() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
284                        }
285                    }
286                }
287
288                // Check for suffix
289                if let Some(ch) = state.peek() {
290                    if ch == 'f' || ch == 'F' || ch == 'L' || ch == 'u' || ch == 'U' {
291                        state.advance(1)
292                    }
293                }
294
295                state.add_token(DTokenType::IntegerLiteral, start_pos, state.get_position());
296                return true;
297            }
298        }
299        false
300    }
301
302    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
303        if let Some(ch) = state.peek() {
304            if ch == '"' || ch == '\'' {
305                let start_pos = state.get_position();
306                let quote = ch;
307                state.advance(1); // consume opening quote
308
309                while let Some(ch) = state.peek() {
310                    if ch == quote {
311                        state.advance(1); // consume closing quote
312                        break;
313                    }
314                    else if ch == '\\' {
315                        state.advance(1); // consume backslash
316                        if state.peek().is_some() {
317                            state.advance(1); // consume escaped character
318                        }
319                    }
320                    else {
321                        state.advance(ch.len_utf8())
322                    }
323                }
324
325                state.add_token(DTokenType::StringLiteral, start_pos, state.get_position());
326                return true;
327            }
328        }
329        false
330    }
331
332    fn lex_character<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
333        if let Some('\'') = state.peek() {
334            let start_pos = state.get_position();
335            state.advance(1); // consume opening quote
336
337            if let Some(ch) = state.peek() {
338                if ch == '\\' {
339                    state.advance(1); // consume backslash
340                    if state.peek().is_some() {
341                        state.advance(1); // consume escaped character
342                    }
343                }
344                else {
345                    state.advance(ch.len_utf8())
346                }
347            }
348
349            if let Some('\'') = state.peek() {
350                state.advance(1); // consume closing quote
351            }
352
353            state.add_token(DTokenType::CharLiteral, start_pos, state.get_position());
354            return true;
355        }
356        false
357    }
358
359    fn lex_line_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
360        if let Some('/') = state.peek() {
361            if let Some('/') = state.peek_next_n(1) {
362                let start_pos = state.get_position();
363                state.advance(2);
364                while let Some(ch) = state.peek() {
365                    if ch == '\n' || ch == '\r' {
366                        break;
367                    }
368                    state.advance(ch.len_utf8())
369                }
370                state.add_token(DTokenType::LineComment, start_pos, state.get_position());
371                return true;
372            }
373        }
374        false
375    }
376
377    fn lex_block_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
378        if let Some('/') = state.peek() {
379            if let Some('*') = state.peek_next_n(1) {
380                let start_pos = state.get_position();
381                state.advance(2);
382                while let Some(ch) = state.peek() {
383                    if ch == '*' {
384                        state.advance(1);
385                        if state.peek() == Some('/') {
386                            state.advance(1);
387                            break;
388                        }
389                    }
390                    else {
391                        state.advance(ch.len_utf8())
392                    }
393                }
394                state.add_token(DTokenType::BlockComment, start_pos, state.get_position());
395                return true;
396            }
397        }
398        false
399    }
400
    /// Lexes a D nesting comment `/+ ... +/`, tracking nesting depth so that
    /// inner `/+ ... +/` pairs stay balanced. An unterminated comment runs to
    /// the end of input and is still emitted as a `NestedComment` token.
    fn lex_nested_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        if let Some('/') = state.peek() {
            if let Some('+') = state.peek_next_n(1) {
                let start_pos = state.get_position();
                state.advance(2);
                // Depth 1 for the opening "/+" just consumed.
                let mut depth = 1;
                while let Some(ch) = state.peek() {
                    if ch == '/' {
                        state.advance(1);
                        // "/+" opens another nesting level; a lone '/' is
                        // plain comment text and stays consumed.
                        if state.peek() == Some('+') {
                            state.advance(1);
                            depth += 1
                        }
                    }
                    else if ch == '+' {
                        state.advance(1);
                        // "+/" closes the innermost level; the comment ends
                        // when the outermost level (depth 0) is closed.
                        if state.peek() == Some('/') {
                            state.advance(1);
                            depth -= 1;
                            if depth == 0 {
                                break;
                            }
                        }
                    }
                    else {
                        state.advance(ch.len_utf8())
                    }
                }
                state.add_token(DTokenType::NestedComment, start_pos, state.get_position());
                return true;
            }
        }
        false
    }
435
436    fn lex_operator<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
437        if let Some(ch) = state.peek() {
438            let start_pos = state.get_position();
439
440            match ch {
441                '+' => {
442                    state.advance(1);
443                    if let Some('=') = state.peek() {
444                        state.advance(1);
445                        state.add_token(DTokenType::PlusAssign, start_pos, state.get_position())
446                    }
447                    else if let Some('+') = state.peek() {
448                        state.advance(1);
449                        state.add_token(DTokenType::Increment, start_pos, state.get_position())
450                    }
451                    else {
452                        state.add_token(DTokenType::Plus, start_pos, state.get_position())
453                    }
454                    return true;
455                }
456                '-' => {
457                    state.advance(1);
458                    if let Some('=') = state.peek() {
459                        state.advance(1);
460                        state.add_token(DTokenType::MinusAssign, start_pos, state.get_position())
461                    }
462                    else if let Some('-') = state.peek() {
463                        state.advance(1);
464                        state.add_token(DTokenType::Decrement, start_pos, state.get_position())
465                    }
466                    else {
467                        state.add_token(DTokenType::Minus, start_pos, state.get_position())
468                    }
469                    return true;
470                }
471                '*' => {
472                    state.advance(1);
473                    if let Some('=') = state.peek() {
474                        state.advance(1);
475                        state.add_token(DTokenType::MultiplyAssign, start_pos, state.get_position())
476                    }
477                    else {
478                        state.add_token(DTokenType::Multiply, start_pos, state.get_position())
479                    }
480                    return true;
481                }
482                '/' => {
483                    // Handled in comment processing
484                    return false;
485                }
486                '%' => {
487                    state.advance(1);
488                    if let Some('=') = state.peek() {
489                        state.advance(1);
490                        state.add_token(DTokenType::ModuloAssign, start_pos, state.get_position())
491                    }
492                    else {
493                        state.add_token(DTokenType::Modulo, start_pos, state.get_position())
494                    }
495                    return true;
496                }
497                '&' => {
498                    state.advance(1);
499                    if let Some('&') = state.peek() {
500                        state.advance(1);
501                        state.add_token(DTokenType::LogicalAnd, start_pos, state.get_position())
502                    }
503                    else if let Some('=') = state.peek() {
504                        state.advance(1);
505                        state.add_token(DTokenType::BitwiseAndAssign, start_pos, state.get_position())
506                    }
507                    else {
508                        state.add_token(DTokenType::BitwiseAnd, start_pos, state.get_position())
509                    }
510                    return true;
511                }
512                '|' => {
513                    state.advance(1);
514                    if let Some('|') = state.peek() {
515                        state.advance(1);
516                        state.add_token(DTokenType::LogicalOr, start_pos, state.get_position())
517                    }
518                    else if let Some('=') = state.peek() {
519                        state.advance(1);
520                        state.add_token(DTokenType::BitwiseOrAssign, start_pos, state.get_position())
521                    }
522                    else {
523                        state.add_token(DTokenType::BitwiseOr, start_pos, state.get_position())
524                    }
525                    return true;
526                }
527                '^' => {
528                    state.advance(1);
529                    if let Some('=') = state.peek() {
530                        state.advance(1);
531                        state.add_token(DTokenType::BitwiseXorAssign, start_pos, state.get_position())
532                    }
533                    else {
534                        state.add_token(DTokenType::BitwiseXor, start_pos, state.get_position())
535                    }
536                    return true;
537                }
538                '~' => {
539                    state.advance(1);
540                    if let Some('=') = state.peek() {
541                        state.advance(1);
542                        state.add_token(DTokenType::ConcatenateAssign, start_pos, state.get_position())
543                    }
544                    else {
545                        state.add_token(DTokenType::BitwiseNot, start_pos, state.get_position())
546                    }
547                    return true;
548                }
549                '!' => {
550                    state.advance(1);
551                    if let Some('=') = state.peek() {
552                        state.advance(1);
553                        state.add_token(DTokenType::NotEqual, start_pos, state.get_position())
554                    }
555                    else {
556                        state.add_token(DTokenType::Not, start_pos, state.get_position())
557                    }
558                    return true;
559                }
560                '<' => {
561                    state.advance(1);
562                    if let Some('<') = state.peek() {
563                        state.advance(1);
564                        if let Some('=') = state.peek() {
565                            state.advance(1);
566                            state.add_token(DTokenType::LeftShiftAssign, start_pos, state.get_position())
567                        }
568                        else {
569                            state.add_token(DTokenType::LeftShift, start_pos, state.get_position())
570                        }
571                    }
572                    else if let Some('=') = state.peek() {
573                        state.advance(1);
574                        state.add_token(DTokenType::LessEqual, start_pos, state.get_position())
575                    }
576                    else {
577                        state.add_token(DTokenType::Less, start_pos, state.get_position())
578                    }
579                    return true;
580                }
581                '>' => {
582                    state.advance(1);
583                    if let Some('>') = state.peek() {
584                        state.advance(1);
585                        if let Some('=') = state.peek() {
586                            state.advance(1);
587                            state.add_token(DTokenType::RightShiftAssign, start_pos, state.get_position())
588                        }
589                        else {
590                            state.add_token(DTokenType::RightShift, start_pos, state.get_position())
591                        }
592                    }
593                    else if let Some('=') = state.peek() {
594                        state.advance(1);
595                        state.add_token(DTokenType::GreaterEqual, start_pos, state.get_position())
596                    }
597                    else {
598                        state.add_token(DTokenType::Greater, start_pos, state.get_position())
599                    }
600                    return true;
601                }
602                '=' => {
603                    state.advance(1);
604                    if let Some('=') = state.peek() {
605                        state.advance(1);
606                        state.add_token(DTokenType::Equal, start_pos, state.get_position())
607                    }
608                    else {
609                        state.add_token(DTokenType::Assign, start_pos, state.get_position())
610                    }
611                    return true;
612                }
613                _ => false,
614            }
615        }
616        else {
617            false
618        }
619    }
620
621    fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
622        if let Some(ch) = state.peek() {
623            let start_pos = state.get_position();
624
625            match ch {
626                '(' => {
627                    state.advance(1);
628                    state.add_token(DTokenType::LeftParen, start_pos, state.get_position());
629                    return true;
630                }
631                ')' => {
632                    state.advance(1);
633                    state.add_token(DTokenType::RightParen, start_pos, state.get_position());
634                    return true;
635                }
636                '[' => {
637                    state.advance(1);
638                    state.add_token(DTokenType::LeftBracket, start_pos, state.get_position());
639                    return true;
640                }
641                ']' => {
642                    state.advance(1);
643                    state.add_token(DTokenType::RightBracket, start_pos, state.get_position());
644                    return true;
645                }
646                '{' => {
647                    state.advance(1);
648                    state.add_token(DTokenType::LeftBrace, start_pos, state.get_position());
649                    return true;
650                }
651                '}' => {
652                    state.advance(1);
653                    state.add_token(DTokenType::RightBrace, start_pos, state.get_position());
654                    return true;
655                }
656                ';' => {
657                    state.advance(1);
658                    state.add_token(DTokenType::Semicolon, start_pos, state.get_position());
659                    return true;
660                }
661                ',' => {
662                    state.advance(1);
663                    state.add_token(DTokenType::Comma, start_pos, state.get_position());
664                    return true;
665                }
666                '.' => {
667                    state.advance(1);
668                    state.add_token(DTokenType::Dot, start_pos, state.get_position());
669                    return true;
670                }
671                ':' => {
672                    state.advance(1);
673                    state.add_token(DTokenType::Colon, start_pos, state.get_position());
674                    return true;
675                }
676                '?' => {
677                    state.advance(1);
678                    state.add_token(DTokenType::Question, start_pos, state.get_position());
679                    return true;
680                }
681                _ => false,
682            }
683        }
684        else {
685            false
686        }
687    }
688}