oak_d/lexer/
mod.rs

1use crate::{kind::DSyntaxKind, language::DLanguage};
2use oak_core::{Lexer, LexerCache, LexerState, TextEdit, lexer::LexOutput, source::Source};
3
4type State<'a, S> = LexerState<'a, S, DLanguage>;
5
6/// Lexer implementation for D programming language
7#[derive(Clone)]
8pub struct DLexer<'config> {
9    _config: &'config DLanguage,
10}
11
12impl<'config> Lexer<DLanguage> for DLexer<'config> {
13    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<DLanguage>) -> LexOutput<DLanguage> {
14        let mut state = LexerState::new(source);
15        let result = self.run(&mut state);
16        if result.is_ok() {
17            state.add_eof();
18        }
19        state.finish_with_cache(result, cache)
20    }
21}
22
23impl<'config> DLexer<'config> {
24    pub fn new(config: &'config DLanguage) -> Self {
25        Self { _config: config }
26    }
27
28    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), oak_core::OakError> {
29        while state.not_at_end() {
30            let start_pos = state.get_position();
31
32            // 尝试各种词法规则
33            if self.skip_whitespace(state) {
34                continue;
35            }
36
37            if self.lex_newline(state) {
38                continue;
39            }
40
41            if self.lex_line_comment(state) {
42                continue;
43            }
44
45            if self.lex_block_comment(state) {
46                continue;
47            }
48
49            if self.lex_nested_comment(state) {
50                continue;
51            }
52
53            if self.lex_identifier_or_keyword(state) {
54                continue;
55            }
56
57            if self.lex_number(state) {
58                continue;
59            }
60
61            if self.lex_string(state) {
62                continue;
63            }
64
65            if self.lex_character(state) {
66                continue;
67            }
68
69            if self.lex_operator(state) {
70                continue;
71            }
72
73            if self.lex_delimiter(state) {
74                continue;
75            }
76
77            // 如果没有匹配任何规则,添加错误token并强行推进,防止死循环
78            state.advance_if_dead_lock(start_pos);
79            if state.get_position() > start_pos {
80                state.add_token(DSyntaxKind::Error, start_pos, state.get_position());
81            }
82        }
83        Ok(())
84    }
85
86    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
87        if let Some(ch) = state.peek() {
88            if ch.is_whitespace() && ch != '\n' && ch != '\r' {
89                let start_pos = state.get_position();
90                while let Some(ch) = state.peek() {
91                    if !ch.is_whitespace() || ch == '\n' || ch == '\r' {
92                        break;
93                    }
94                    state.advance(ch.len_utf8());
95                }
96                state.add_token(DSyntaxKind::Whitespace, start_pos, state.get_position());
97                return true;
98            }
99        }
100        false
101    }
102
103    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
104        if let Some(ch) = state.peek() {
105            if ch == '\n' || ch == '\r' {
106                let start_pos = state.get_position();
107                if ch == '\r' {
108                    state.advance(1);
109                    if state.peek() == Some('\n') {
110                        state.advance(1);
111                    }
112                }
113                else {
114                    state.advance(1);
115                }
116                state.add_token(DSyntaxKind::Newline, start_pos, state.get_position());
117                return true;
118            }
119        }
120        false
121    }
122
123    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
124        if let Some(ch) = state.peek() {
125            if ch.is_alphabetic() || ch == '_' {
126                let start_pos = state.get_position();
127                while let Some(ch) = state.peek() {
128                    if ch.is_alphanumeric() || ch == '_' {
129                        state.advance(ch.len_utf8());
130                    }
131                    else {
132                        break;
133                    }
134                }
135                let end_pos = state.get_position();
136                let text = state.get_text_in((start_pos..end_pos).into());
137
138                let kind = match text.as_ref() {
139                    "module" => DSyntaxKind::ModuleKeyword,
140                    "import" => DSyntaxKind::ImportKeyword,
141                    "public" => DSyntaxKind::PublicKeyword,
142                    "private" => DSyntaxKind::PrivateKeyword,
143                    "protected" => DSyntaxKind::ProtectedKeyword,
144                    "package" => DSyntaxKind::PackageKeyword,
145                    "export" => DSyntaxKind::ExportKeyword,
146                    "static" => DSyntaxKind::StaticKeyword,
147                    "final" => DSyntaxKind::FinalKeyword,
148                    "abstract" => DSyntaxKind::AbstractKeyword,
149                    "override" => DSyntaxKind::OverrideKeyword,
150                    "synchronized" => DSyntaxKind::SynchronizedKeyword,
151                    "const" => DSyntaxKind::ConstKeyword,
152                    "immutable" => DSyntaxKind::ImmutableKeyword,
153                    "inout" => DSyntaxKind::InoutKeyword,
154                    "shared" => DSyntaxKind::SharedKeyword,
155                    "class" => DSyntaxKind::ClassKeyword,
156                    "struct" => DSyntaxKind::StructKeyword,
157                    "interface" => DSyntaxKind::InterfaceKeyword,
158                    "union" => DSyntaxKind::UnionKeyword,
159                    "enum" => DSyntaxKind::EnumKeyword,
160                    "function" => DSyntaxKind::FunctionKeyword,
161                    "delegate" => DSyntaxKind::DelegateKeyword,
162                    "if" => DSyntaxKind::IfKeyword,
163                    "else" => DSyntaxKind::ElseKeyword,
164                    "while" => DSyntaxKind::WhileKeyword,
165                    "for" => DSyntaxKind::ForKeyword,
166                    "foreach" => DSyntaxKind::ForeachKeyword,
167                    "do" => DSyntaxKind::DoKeyword,
168                    "switch" => DSyntaxKind::SwitchKeyword,
169                    "case" => DSyntaxKind::CaseKeyword,
170                    "default" => DSyntaxKind::DefaultKeyword,
171                    "break" => DSyntaxKind::BreakKeyword,
172                    "continue" => DSyntaxKind::ContinueKeyword,
173                    "return" => DSyntaxKind::ReturnKeyword,
174                    "goto" => DSyntaxKind::GotoKeyword,
175                    "try" => DSyntaxKind::TryKeyword,
176                    "catch" => DSyntaxKind::CatchKeyword,
177                    "finally" => DSyntaxKind::FinallyKeyword,
178                    "throw" => DSyntaxKind::ThrowKeyword,
179                    "scope" => DSyntaxKind::ScopeKeyword,
180                    "with" => DSyntaxKind::WithKeyword,
181                    "asm" => DSyntaxKind::AsmKeyword,
182                    "mixin" => DSyntaxKind::MixinKeyword,
183                    "template" => DSyntaxKind::TemplateKeyword,
184                    "alias" => DSyntaxKind::AliasKeyword,
185                    "typeof" => DSyntaxKind::TypeofKeyword,
186                    "typeid" => DSyntaxKind::TypeidKeyword,
187                    "is" => DSyntaxKind::IsKeyword,
188                    "in" => DSyntaxKind::InKeyword,
189                    "out" => DSyntaxKind::OutKeyword,
190                    "ref" => DSyntaxKind::RefKeyword,
191                    "lazy" => DSyntaxKind::LazyKeyword,
192                    "auto" => DSyntaxKind::AutoKeyword,
193                    "extern" => DSyntaxKind::ExternKeyword,
194                    "align" => DSyntaxKind::AlignKeyword,
195                    "pragma" => DSyntaxKind::PragmaKeyword,
196                    "debug" => DSyntaxKind::DebugKeyword,
197                    "version" => DSyntaxKind::VersionKeyword,
198                    "unittest" => DSyntaxKind::UnitTestKeyword,
199                    "invariant" => DSyntaxKind::InvariantKeyword,
200                    "body" => DSyntaxKind::BodyKeyword,
201                    "new" => DSyntaxKind::NewKeyword,
202                    "delete" => DSyntaxKind::DeleteKeyword,
203                    "this" => DSyntaxKind::ThisKeyword,
204                    "super" => DSyntaxKind::SuperKeyword,
205                    "null" => DSyntaxKind::NullKeyword,
206                    "true" => DSyntaxKind::TrueKeyword,
207                    "false" => DSyntaxKind::FalseKeyword,
208                    "cast" => DSyntaxKind::CastKeyword,
209                    "void" => DSyntaxKind::VoidType,
210                    "bool" => DSyntaxKind::BoolType,
211                    "byte" => DSyntaxKind::ByteType,
212                    "ubyte" => DSyntaxKind::UbyteType,
213                    "short" => DSyntaxKind::ShortType,
214                    "ushort" => DSyntaxKind::UshortType,
215                    "int" => DSyntaxKind::IntType,
216                    "uint" => DSyntaxKind::UintType,
217                    "long" => DSyntaxKind::LongType,
218                    "ulong" => DSyntaxKind::UlongType,
219                    "cent" => DSyntaxKind::CentType,
220                    "ucent" => DSyntaxKind::UcentType,
221                    "float" => DSyntaxKind::FloatType,
222                    "double" => DSyntaxKind::DoubleType,
223                    "real" => DSyntaxKind::RealType,
224                    "ifloat" => DSyntaxKind::IfloatType,
225                    "idouble" => DSyntaxKind::IdoubleType,
226                    "ireal" => DSyntaxKind::IrealType,
227                    "cfloat" => DSyntaxKind::CfloatType,
228                    "cdouble" => DSyntaxKind::CdoubleType,
229                    "creal" => DSyntaxKind::CrealType,
230                    "char" => DSyntaxKind::CharType,
231                    "wchar" => DSyntaxKind::WcharType,
232                    "dchar" => DSyntaxKind::DcharType,
233                    "string" => DSyntaxKind::StringType,
234                    "wstring" => DSyntaxKind::WstringType,
235                    "dstring" => DSyntaxKind::DstringType,
236                    "typedef" => DSyntaxKind::TypedefKeyword,
237                    "pure" => DSyntaxKind::PureKeyword,
238                    "nothrow" => DSyntaxKind::NothrowKeyword,
239                    "safe" => DSyntaxKind::SafeKeyword,
240                    "trusted" => DSyntaxKind::TrustedKeyword,
241                    "system" => DSyntaxKind::SystemKeyword,
242                    "nogc" => DSyntaxKind::NogcKeyword,
243                    "property" => DSyntaxKind::PropertyKeyword,
244                    "disable" => DSyntaxKind::DisableKeyword,
245                    "deprecated" => DSyntaxKind::DeprecatedKeyword,
246                    _ => DSyntaxKind::Identifier,
247                };
248
249                state.add_token(kind, start_pos, end_pos);
250                return true;
251            }
252        }
253        false
254    }
255
256    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
257        if let Some(ch) = state.peek() {
258            if ch.is_ascii_digit() {
259                let start_pos = state.get_position();
260
261                // 处理数字
262                while let Some(ch) = state.peek() {
263                    if ch.is_ascii_digit() || ch == '_' {
264                        state.advance(ch.len_utf8());
265                    }
266                    else {
267                        break;
268                    }
269                }
270
271                // 检查小数点
272                if let Some('.') = state.peek() {
273                    state.advance(1);
274                    while let Some(ch) = state.peek() {
275                        if ch.is_ascii_digit() || ch == '_' {
276                            state.advance(ch.len_utf8());
277                        }
278                        else {
279                            break;
280                        }
281                    }
282                }
283
284                // 检查指数
285                if let Some(ch) = state.peek() {
286                    if ch == 'e' || ch == 'E' {
287                        state.advance(1);
288                        if let Some(ch) = state.peek() {
289                            if ch == '+' || ch == '-' {
290                                state.advance(1);
291                            }
292                        }
293                        while let Some(ch) = state.peek() {
294                            if ch.is_ascii_digit() || ch == '_' {
295                                state.advance(ch.len_utf8());
296                            }
297                            else {
298                                break;
299                            }
300                        }
301                    }
302                }
303
304                // 检查后缀
305                if let Some(ch) = state.peek() {
306                    if ch == 'f' || ch == 'F' || ch == 'L' || ch == 'u' || ch == 'U' {
307                        state.advance(1);
308                    }
309                }
310
311                state.add_token(DSyntaxKind::IntegerLiteral, start_pos, state.get_position());
312                return true;
313            }
314        }
315        false
316    }
317
318    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
319        if let Some(ch) = state.peek() {
320            if ch == '"' || ch == '\'' {
321                let start_pos = state.get_position();
322                let quote = ch;
323                state.advance(1); // consume opening quote
324
325                while let Some(ch) = state.peek() {
326                    if ch == quote {
327                        state.advance(1); // consume closing quote
328                        break;
329                    }
330                    else if ch == '\\' {
331                        state.advance(1); // consume backslash
332                        if state.peek().is_some() {
333                            state.advance(1); // consume escaped character
334                        }
335                    }
336                    else {
337                        state.advance(ch.len_utf8());
338                    }
339                }
340
341                state.add_token(DSyntaxKind::StringLiteral, start_pos, state.get_position());
342                return true;
343            }
344        }
345        false
346    }
347
348    fn lex_character<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
349        if let Some('\'') = state.peek() {
350            let start_pos = state.get_position();
351            state.advance(1); // consume opening quote
352
353            if let Some(ch) = state.peek() {
354                if ch == '\\' {
355                    state.advance(1); // consume backslash
356                    if state.peek().is_some() {
357                        state.advance(1); // consume escaped character
358                    }
359                }
360                else {
361                    state.advance(ch.len_utf8());
362                }
363            }
364
365            if let Some('\'') = state.peek() {
366                state.advance(1); // consume closing quote
367            }
368
369            state.add_token(DSyntaxKind::CharLiteral, start_pos, state.get_position());
370            return true;
371        }
372        false
373    }
374
375    fn lex_line_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
376        if let Some('/') = state.peek() {
377            if let Some('/') = state.peek_next_n(1) {
378                let start_pos = state.get_position();
379                state.advance(2);
380                while let Some(ch) = state.peek() {
381                    if ch == '\n' || ch == '\r' {
382                        break;
383                    }
384                    state.advance(ch.len_utf8());
385                }
386                state.add_token(DSyntaxKind::LineComment, start_pos, state.get_position());
387                return true;
388            }
389        }
390        false
391    }
392
393    fn lex_block_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
394        if let Some('/') = state.peek() {
395            if let Some('*') = state.peek_next_n(1) {
396                let start_pos = state.get_position();
397                state.advance(2);
398                while let Some(ch) = state.peek() {
399                    if ch == '*' {
400                        state.advance(1);
401                        if state.peek() == Some('/') {
402                            state.advance(1);
403                            break;
404                        }
405                    }
406                    else {
407                        state.advance(ch.len_utf8());
408                    }
409                }
410                state.add_token(DSyntaxKind::BlockComment, start_pos, state.get_position());
411                return true;
412            }
413        }
414        false
415    }
416
417    fn lex_nested_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
418        if let Some('/') = state.peek() {
419            if let Some('+') = state.peek_next_n(1) {
420                let start_pos = state.get_position();
421                state.advance(2);
422                let mut depth = 1;
423                while let Some(ch) = state.peek() {
424                    if ch == '/' {
425                        state.advance(1);
426                        if state.peek() == Some('+') {
427                            state.advance(1);
428                            depth += 1;
429                        }
430                    }
431                    else if ch == '+' {
432                        state.advance(1);
433                        if state.peek() == Some('/') {
434                            state.advance(1);
435                            depth -= 1;
436                            if depth == 0 {
437                                break;
438                            }
439                        }
440                    }
441                    else {
442                        state.advance(ch.len_utf8());
443                    }
444                }
445                state.add_token(DSyntaxKind::NestedComment, start_pos, state.get_position());
446                return true;
447            }
448        }
449        false
450    }
451
452    fn lex_operator<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
453        if let Some(ch) = state.peek() {
454            let start_pos = state.get_position();
455
456            match ch {
457                '+' => {
458                    state.advance(1);
459                    if let Some('=') = state.peek() {
460                        state.advance(1);
461                        state.add_token(DSyntaxKind::PlusAssign, start_pos, state.get_position());
462                    }
463                    else if let Some('+') = state.peek() {
464                        state.advance(1);
465                        state.add_token(DSyntaxKind::Increment, start_pos, state.get_position());
466                    }
467                    else {
468                        state.add_token(DSyntaxKind::Plus, start_pos, state.get_position());
469                    }
470                    return true;
471                }
472                '-' => {
473                    state.advance(1);
474                    if let Some('=') = state.peek() {
475                        state.advance(1);
476                        state.add_token(DSyntaxKind::MinusAssign, start_pos, state.get_position());
477                    }
478                    else if let Some('-') = state.peek() {
479                        state.advance(1);
480                        state.add_token(DSyntaxKind::Decrement, start_pos, state.get_position());
481                    }
482                    else {
483                        state.add_token(DSyntaxKind::Minus, start_pos, state.get_position());
484                    }
485                    return true;
486                }
487                '*' => {
488                    state.advance(1);
489                    if let Some('=') = state.peek() {
490                        state.advance(1);
491                        state.add_token(DSyntaxKind::MultiplyAssign, start_pos, state.get_position());
492                    }
493                    else {
494                        state.add_token(DSyntaxKind::Multiply, start_pos, state.get_position());
495                    }
496                    return true;
497                }
498                '/' => {
499                    // 已在注释处理中处理
500                    return false;
501                }
502                '%' => {
503                    state.advance(1);
504                    if let Some('=') = state.peek() {
505                        state.advance(1);
506                        state.add_token(DSyntaxKind::ModuloAssign, start_pos, state.get_position());
507                    }
508                    else {
509                        state.add_token(DSyntaxKind::Modulo, start_pos, state.get_position());
510                    }
511                    return true;
512                }
513                '&' => {
514                    state.advance(1);
515                    if let Some('&') = state.peek() {
516                        state.advance(1);
517                        state.add_token(DSyntaxKind::LogicalAnd, start_pos, state.get_position());
518                    }
519                    else if let Some('=') = state.peek() {
520                        state.advance(1);
521                        state.add_token(DSyntaxKind::BitwiseAndAssign, start_pos, state.get_position());
522                    }
523                    else {
524                        state.add_token(DSyntaxKind::BitwiseAnd, start_pos, state.get_position());
525                    }
526                    return true;
527                }
528                '|' => {
529                    state.advance(1);
530                    if let Some('|') = state.peek() {
531                        state.advance(1);
532                        state.add_token(DSyntaxKind::LogicalOr, start_pos, state.get_position());
533                    }
534                    else if let Some('=') = state.peek() {
535                        state.advance(1);
536                        state.add_token(DSyntaxKind::BitwiseOrAssign, start_pos, state.get_position());
537                    }
538                    else {
539                        state.add_token(DSyntaxKind::BitwiseOr, start_pos, state.get_position());
540                    }
541                    return true;
542                }
543                '^' => {
544                    state.advance(1);
545                    if let Some('=') = state.peek() {
546                        state.advance(1);
547                        state.add_token(DSyntaxKind::BitwiseXorAssign, start_pos, state.get_position());
548                    }
549                    else {
550                        state.add_token(DSyntaxKind::BitwiseXor, start_pos, state.get_position());
551                    }
552                    return true;
553                }
554                '~' => {
555                    state.advance(1);
556                    if let Some('=') = state.peek() {
557                        state.advance(1);
558                        state.add_token(DSyntaxKind::ConcatenateAssign, start_pos, state.get_position());
559                    }
560                    else {
561                        state.add_token(DSyntaxKind::BitwiseNot, start_pos, state.get_position());
562                    }
563                    return true;
564                }
565                '!' => {
566                    state.advance(1);
567                    if let Some('=') = state.peek() {
568                        state.advance(1);
569                        state.add_token(DSyntaxKind::NotEqual, start_pos, state.get_position());
570                    }
571                    else {
572                        state.add_token(DSyntaxKind::Not, start_pos, state.get_position());
573                    }
574                    return true;
575                }
576                '<' => {
577                    state.advance(1);
578                    if let Some('<') = state.peek() {
579                        state.advance(1);
580                        if let Some('=') = state.peek() {
581                            state.advance(1);
582                            state.add_token(DSyntaxKind::LeftShiftAssign, start_pos, state.get_position());
583                        }
584                        else {
585                            state.add_token(DSyntaxKind::LeftShift, start_pos, state.get_position());
586                        }
587                    }
588                    else if let Some('=') = state.peek() {
589                        state.advance(1);
590                        state.add_token(DSyntaxKind::LessEqual, start_pos, state.get_position());
591                    }
592                    else {
593                        state.add_token(DSyntaxKind::Less, start_pos, state.get_position());
594                    }
595                    return true;
596                }
597                '>' => {
598                    state.advance(1);
599                    if let Some('>') = state.peek() {
600                        state.advance(1);
601                        if let Some('=') = state.peek() {
602                            state.advance(1);
603                            state.add_token(DSyntaxKind::RightShiftAssign, start_pos, state.get_position());
604                        }
605                        else {
606                            state.add_token(DSyntaxKind::RightShift, start_pos, state.get_position());
607                        }
608                    }
609                    else if let Some('=') = state.peek() {
610                        state.advance(1);
611                        state.add_token(DSyntaxKind::GreaterEqual, start_pos, state.get_position());
612                    }
613                    else {
614                        state.add_token(DSyntaxKind::Greater, start_pos, state.get_position());
615                    }
616                    return true;
617                }
618                '=' => {
619                    state.advance(1);
620                    if let Some('=') = state.peek() {
621                        state.advance(1);
622                        state.add_token(DSyntaxKind::Equal, start_pos, state.get_position());
623                    }
624                    else {
625                        state.add_token(DSyntaxKind::Assign, start_pos, state.get_position());
626                    }
627                    return true;
628                }
629                _ => false,
630            }
631        }
632        else {
633            false
634        }
635    }
636
637    fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
638        if let Some(ch) = state.peek() {
639            let start_pos = state.get_position();
640
641            match ch {
642                '(' => {
643                    state.advance(1);
644                    state.add_token(DSyntaxKind::LeftParen, start_pos, state.get_position());
645                    return true;
646                }
647                ')' => {
648                    state.advance(1);
649                    state.add_token(DSyntaxKind::RightParen, start_pos, state.get_position());
650                    return true;
651                }
652                '[' => {
653                    state.advance(1);
654                    state.add_token(DSyntaxKind::LeftBracket, start_pos, state.get_position());
655                    return true;
656                }
657                ']' => {
658                    state.advance(1);
659                    state.add_token(DSyntaxKind::RightBracket, start_pos, state.get_position());
660                    return true;
661                }
662                '{' => {
663                    state.advance(1);
664                    state.add_token(DSyntaxKind::LeftBrace, start_pos, state.get_position());
665                    return true;
666                }
667                '}' => {
668                    state.advance(1);
669                    state.add_token(DSyntaxKind::RightBrace, start_pos, state.get_position());
670                    return true;
671                }
672                ';' => {
673                    state.advance(1);
674                    state.add_token(DSyntaxKind::Semicolon, start_pos, state.get_position());
675                    return true;
676                }
677                ',' => {
678                    state.advance(1);
679                    state.add_token(DSyntaxKind::Comma, start_pos, state.get_position());
680                    return true;
681                }
682                '.' => {
683                    state.advance(1);
684                    state.add_token(DSyntaxKind::Dot, start_pos, state.get_position());
685                    return true;
686                }
687                ':' => {
688                    state.advance(1);
689                    state.add_token(DSyntaxKind::Colon, start_pos, state.get_position());
690                    return true;
691                }
692                '?' => {
693                    state.advance(1);
694                    state.add_token(DSyntaxKind::Question, start_pos, state.get_position());
695                    return true;
696                }
697                _ => false,
698            }
699        }
700        else {
701            false
702        }
703    }
704}