oak_csharp/lexer/
mod.rs

1use crate::{CSharpSyntaxKind, language::CSharpLanguage};
2use oak_core::{IncrementalCache, Lexer, LexerState, lexer::LexOutput, source::Source};
3
/// Shorthand for the shared lexer state specialised to the C# language; `S` is the source type.
type State<S> = LexerState<S, CSharpLanguage>;
5
/// Lexer for C# source text.
///
/// The lexer is a stateless unit type: all scanning state lives in the
/// `LexerState` created for each run, so values of this type are free to copy.
#[derive(Debug, Clone, Copy, Default)]
pub struct CSharpLexer;
7
8impl CSharpLexer {
9    pub fn new(_config: &CSharpLanguage) -> Self {
10        Self
11    }
12
13    /// 跳过空白字符
14    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
15        let start_pos = state.get_position();
16
17        while let Some(ch) = state.peek() {
18            if ch == ' ' || ch == '\t' {
19                state.advance(ch.len_utf8());
20            }
21            else {
22                break;
23            }
24        }
25
26        if state.get_position() > start_pos {
27            state.add_token(CSharpSyntaxKind::Whitespace, start_pos, state.get_position());
28            true
29        }
30        else {
31            false
32        }
33    }
34
35    /// 处理换行
36    fn lex_newline<S: Source>(&self, state: &mut State<S>) -> bool {
37        let start_pos = state.get_position();
38
39        if let Some('\n') = state.peek() {
40            state.advance(1);
41            state.add_token(CSharpSyntaxKind::Newline, start_pos, state.get_position());
42            true
43        }
44        else if let Some('\r') = state.peek() {
45            state.advance(1);
46            if let Some('\n') = state.peek() {
47                state.advance(1);
48            }
49            state.add_token(CSharpSyntaxKind::Newline, start_pos, state.get_position());
50            true
51        }
52        else {
53            false
54        }
55    }
56
57    /// 处理注释
58    fn lex_comment<S: Source>(&self, state: &mut State<S>) -> bool {
59        let start_pos = state.get_position();
60
61        if let Some('/') = state.peek() {
62            state.advance(1);
63            if let Some('/') = state.peek() {
64                // 单行注释
65                state.advance(1);
66                while let Some(ch) = state.peek() {
67                    if ch == '\n' || ch == '\r' {
68                        break;
69                    }
70                    state.advance(ch.len_utf8());
71                }
72                state.add_token(CSharpSyntaxKind::Comment, start_pos, state.get_position());
73                return true;
74            }
75            else if let Some('*') = state.peek() {
76                // 多行注释
77                state.advance(1);
78                while let Some(ch) = state.peek() {
79                    if ch == '*' {
80                        state.advance(1);
81                        if let Some('/') = state.peek() {
82                            state.advance(1);
83                            break;
84                        }
85                    }
86                    else {
87                        state.advance(ch.len_utf8());
88                    }
89                }
90                state.add_token(CSharpSyntaxKind::Comment, start_pos, state.get_position());
91                return true;
92            }
93            else {
94                // 回退,这不是注释
95                state.set_position(start_pos);
96                return false;
97            }
98        }
99        false
100    }
101
102    /// 处理字符串字面量
103    fn lex_string<S: Source>(&self, state: &mut State<S>) -> bool {
104        let start_pos = state.get_position();
105
106        if let Some('"') = state.peek() {
107            state.advance(1);
108            while let Some(ch) = state.peek() {
109                if ch == '"' {
110                    state.advance(1);
111                    break;
112                }
113                else if ch == '\\' {
114                    state.advance(1);
115                    if let Some(_) = state.peek() {
116                        state.advance(1);
117                    }
118                }
119                else {
120                    state.advance(ch.len_utf8());
121                }
122            }
123            state.add_token(CSharpSyntaxKind::StringLiteral, start_pos, state.get_position());
124            true
125        }
126        else if let Some('\'') = state.peek() {
127            // 字符字面量
128            state.advance(1);
129            while let Some(ch) = state.peek() {
130                if ch == '\'' {
131                    state.advance(1);
132                    break;
133                }
134                else if ch == '\\' {
135                    state.advance(1);
136                    if let Some(_) = state.peek() {
137                        state.advance(1);
138                    }
139                }
140                else {
141                    state.advance(ch.len_utf8());
142                }
143            }
144            state.add_token(CSharpSyntaxKind::CharLiteral, start_pos, state.get_position());
145            true
146        }
147        else {
148            false
149        }
150    }
151
152    /// 处理数字字面量
153    fn lex_number<S: Source>(&self, state: &mut State<S>) -> bool {
154        let start_pos = state.get_position();
155
156        if let Some(ch) = state.peek() {
157            if ch.is_ascii_digit() {
158                state.advance(ch.len_utf8());
159
160                while let Some(ch) = state.peek() {
161                    if ch.is_ascii_digit() || ch == '.' || ch == '_' {
162                        state.advance(ch.len_utf8());
163                    }
164                    else {
165                        break;
166                    }
167                }
168
169                // 处理后缀 (f, d, m, l, ul, etc.)
170                if let Some(ch) = state.peek() {
171                    if ch.is_ascii_alphabetic() {
172                        state.advance(ch.len_utf8());
173                        if let Some(ch2) = state.peek() {
174                            if ch2.is_ascii_alphabetic() {
175                                state.advance(ch2.len_utf8());
176                            }
177                        }
178                    }
179                }
180
181                state.add_token(CSharpSyntaxKind::NumberLiteral, start_pos, state.get_position());
182                true
183            }
184            else {
185                false
186            }
187        }
188        else {
189            false
190        }
191    }
192
193    /// 处理关键字或标识符
194    fn lex_keyword_or_identifier<S: Source>(&self, state: &mut State<S>) -> bool {
195        let start_pos = state.get_position();
196
197        if let Some(ch) = state.peek() {
198            if ch.is_ascii_alphabetic() || ch == '_' || ch == '@' {
199                state.advance(ch.len_utf8());
200
201                while let Some(ch) = state.peek() {
202                    if ch.is_ascii_alphanumeric() || ch == '_' {
203                        state.advance(ch.len_utf8());
204                    }
205                    else {
206                        break;
207                    }
208                }
209
210                let text = state.get_text_in((start_pos..state.get_position()).into());
211                let token_kind = match text {
212                    // C# 关键字
213                    "abstract" => CSharpSyntaxKind::Abstract,
214                    "as" => CSharpSyntaxKind::As,
215                    "base" => CSharpSyntaxKind::Base,
216                    "bool" => CSharpSyntaxKind::Bool,
217                    "break" => CSharpSyntaxKind::Break,
218                    "byte" => CSharpSyntaxKind::Byte,
219                    "case" => CSharpSyntaxKind::Case,
220                    "catch" => CSharpSyntaxKind::Catch,
221                    "char" => CSharpSyntaxKind::Char,
222                    "checked" => CSharpSyntaxKind::Checked,
223                    "class" => CSharpSyntaxKind::Class,
224                    "const" => CSharpSyntaxKind::Const,
225                    "continue" => CSharpSyntaxKind::Continue,
226                    "decimal" => CSharpSyntaxKind::Decimal,
227                    "default" => CSharpSyntaxKind::Default,
228                    "delegate" => CSharpSyntaxKind::Delegate,
229                    "do" => CSharpSyntaxKind::Do,
230                    "double" => CSharpSyntaxKind::Double,
231                    "else" => CSharpSyntaxKind::Else,
232                    "enum" => CSharpSyntaxKind::Enum,
233                    "event" => CSharpSyntaxKind::Event,
234                    "explicit" => CSharpSyntaxKind::Explicit,
235                    "extern" => CSharpSyntaxKind::Extern,
236                    "false" => CSharpSyntaxKind::False,
237                    "finally" => CSharpSyntaxKind::Finally,
238                    "fixed" => CSharpSyntaxKind::Fixed,
239                    "float" => CSharpSyntaxKind::Float,
240                    "for" => CSharpSyntaxKind::For,
241                    "foreach" => CSharpSyntaxKind::Foreach,
242                    "goto" => CSharpSyntaxKind::Goto,
243                    "if" => CSharpSyntaxKind::If,
244                    "implicit" => CSharpSyntaxKind::Implicit,
245                    "in" => CSharpSyntaxKind::In,
246                    "int" => CSharpSyntaxKind::Int,
247                    "interface" => CSharpSyntaxKind::Interface,
248                    "internal" => CSharpSyntaxKind::Internal,
249                    "is" => CSharpSyntaxKind::Is,
250                    "lock" => CSharpSyntaxKind::Lock,
251                    "long" => CSharpSyntaxKind::Long,
252                    "namespace" => CSharpSyntaxKind::Namespace,
253                    "new" => CSharpSyntaxKind::New,
254                    "null" => CSharpSyntaxKind::Null,
255                    "object" => CSharpSyntaxKind::Object,
256                    "operator" => CSharpSyntaxKind::Operator,
257                    "out" => CSharpSyntaxKind::Out,
258                    "override" => CSharpSyntaxKind::Override,
259                    "params" => CSharpSyntaxKind::Params,
260                    "private" => CSharpSyntaxKind::Private,
261                    "protected" => CSharpSyntaxKind::Protected,
262                    "public" => CSharpSyntaxKind::Public,
263                    "readonly" => CSharpSyntaxKind::Readonly,
264                    "ref" => CSharpSyntaxKind::Ref,
265                    "return" => CSharpSyntaxKind::Return,
266                    "sbyte" => CSharpSyntaxKind::Sbyte,
267                    "sealed" => CSharpSyntaxKind::Sealed,
268                    "short" => CSharpSyntaxKind::Short,
269                    "sizeof" => CSharpSyntaxKind::Sizeof,
270                    "stackalloc" => CSharpSyntaxKind::Stackalloc,
271                    "static" => CSharpSyntaxKind::Static,
272                    "string" => CSharpSyntaxKind::String,
273                    "struct" => CSharpSyntaxKind::Struct,
274                    "switch" => CSharpSyntaxKind::Switch,
275                    "this" => CSharpSyntaxKind::This,
276                    "throw" => CSharpSyntaxKind::Throw,
277                    "true" => CSharpSyntaxKind::True,
278                    "try" => CSharpSyntaxKind::Try,
279                    "typeof" => CSharpSyntaxKind::Typeof,
280                    "uint" => CSharpSyntaxKind::Uint,
281                    "ulong" => CSharpSyntaxKind::Ulong,
282                    "unchecked" => CSharpSyntaxKind::Unchecked,
283                    "unsafe" => CSharpSyntaxKind::Unsafe,
284                    "ushort" => CSharpSyntaxKind::Ushort,
285                    "using" => CSharpSyntaxKind::Using,
286                    "virtual" => CSharpSyntaxKind::Virtual,
287                    "void" => CSharpSyntaxKind::Void,
288                    "volatile" => CSharpSyntaxKind::Volatile,
289                    "while" => CSharpSyntaxKind::While,
290                    _ => CSharpSyntaxKind::Identifier,
291                };
292
293                state.add_token(token_kind, start_pos, state.get_position());
294                true
295            }
296            else {
297                false
298            }
299        }
300        else {
301            false
302        }
303    }
304
305    /// 处理操作符
306    fn lex_operator<S: Source>(&self, state: &mut State<S>) -> bool {
307        let start_pos = state.get_position();
308
309        if let Some(ch) = state.peek() {
310            let token_kind = match ch {
311                '+' => {
312                    state.advance(1);
313                    if let Some('=') = state.peek() {
314                        state.advance(1);
315                        CSharpSyntaxKind::PlusAssign
316                    }
317                    else if let Some('+') = state.peek() {
318                        state.advance(1);
319                        CSharpSyntaxKind::Increment
320                    }
321                    else {
322                        CSharpSyntaxKind::Plus
323                    }
324                }
325                '-' => {
326                    state.advance(1);
327                    if let Some('=') = state.peek() {
328                        state.advance(1);
329                        CSharpSyntaxKind::MinusAssign
330                    }
331                    else if let Some('-') = state.peek() {
332                        state.advance(1);
333                        CSharpSyntaxKind::Decrement
334                    }
335                    else {
336                        CSharpSyntaxKind::Minus
337                    }
338                }
339                '*' => {
340                    state.advance(1);
341                    if let Some('=') = state.peek() {
342                        state.advance(1);
343                        CSharpSyntaxKind::StarAssign
344                    }
345                    else {
346                        CSharpSyntaxKind::Star
347                    }
348                }
349                '/' => {
350                    // 这里不处理注释,因为已经在 lex_comment 中处理了
351                    state.advance(1);
352                    if let Some('=') = state.peek() {
353                        state.advance(1);
354                        CSharpSyntaxKind::SlashAssign
355                    }
356                    else {
357                        CSharpSyntaxKind::Slash
358                    }
359                }
360                '%' => {
361                    state.advance(1);
362                    if let Some('=') = state.peek() {
363                        state.advance(1);
364                        CSharpSyntaxKind::PercentAssign
365                    }
366                    else {
367                        CSharpSyntaxKind::Percent
368                    }
369                }
370                '=' => {
371                    state.advance(1);
372                    if let Some('=') = state.peek() {
373                        state.advance(1);
374                        CSharpSyntaxKind::Equal
375                    }
376                    else {
377                        CSharpSyntaxKind::Assign
378                    }
379                }
380                '!' => {
381                    state.advance(1);
382                    if let Some('=') = state.peek() {
383                        state.advance(1);
384                        CSharpSyntaxKind::NotEqual
385                    }
386                    else {
387                        CSharpSyntaxKind::LogicalNot
388                    }
389                }
390                '<' => {
391                    state.advance(1);
392                    if let Some('=') = state.peek() {
393                        state.advance(1);
394                        CSharpSyntaxKind::LessEqual
395                    }
396                    else if let Some('<') = state.peek() {
397                        state.advance(1);
398                        CSharpSyntaxKind::LeftShift
399                    }
400                    else {
401                        CSharpSyntaxKind::Less
402                    }
403                }
404                '>' => {
405                    state.advance(1);
406                    if let Some('=') = state.peek() {
407                        state.advance(1);
408                        CSharpSyntaxKind::GreaterEqual
409                    }
410                    else if let Some('>') = state.peek() {
411                        state.advance(1);
412                        CSharpSyntaxKind::RightShift
413                    }
414                    else {
415                        CSharpSyntaxKind::Greater
416                    }
417                }
418                '&' => {
419                    state.advance(1);
420                    if let Some('&') = state.peek() {
421                        state.advance(1);
422                        CSharpSyntaxKind::LogicalAnd
423                    }
424                    else {
425                        CSharpSyntaxKind::Ampersand
426                    }
427                }
428                '|' => {
429                    state.advance(1);
430                    if let Some('|') = state.peek() {
431                        state.advance(1);
432                        CSharpSyntaxKind::LogicalOr
433                    }
434                    else {
435                        CSharpSyntaxKind::Pipe
436                    }
437                }
438                '^' => {
439                    state.advance(1);
440                    CSharpSyntaxKind::Caret
441                }
442                '~' => {
443                    state.advance(1);
444                    CSharpSyntaxKind::Tilde
445                }
446                _ => return false,
447            };
448
449            state.add_token(token_kind, start_pos, state.get_position());
450            true
451        }
452        else {
453            false
454        }
455    }
456
457    /// 处理分隔符
458    fn lex_delimiter<S: Source>(&self, state: &mut State<S>) -> bool {
459        let start_pos = state.get_position();
460
461        if let Some(ch) = state.peek() {
462            let token_kind = match ch {
463                '(' => CSharpSyntaxKind::LeftParen,
464                ')' => CSharpSyntaxKind::RightParen,
465                '[' => CSharpSyntaxKind::LeftBracket,
466                ']' => CSharpSyntaxKind::RightBracket,
467                '{' => CSharpSyntaxKind::LeftBrace,
468                '}' => CSharpSyntaxKind::RightBrace,
469                ';' => CSharpSyntaxKind::Semicolon,
470                ',' => CSharpSyntaxKind::Comma,
471                '.' => CSharpSyntaxKind::Dot,
472                ':' => CSharpSyntaxKind::Colon,
473                '?' => CSharpSyntaxKind::Question,
474                _ => return false,
475            };
476
477            state.advance(ch.len_utf8());
478            state.add_token(token_kind, start_pos, state.get_position());
479            true
480        }
481        else {
482            false
483        }
484    }
485}
486
487impl Lexer<CSharpLanguage> for CSharpLexer {
488    fn lex_incremental(
489        &self,
490        source: impl Source,
491        _changed: usize,
492        _cache: IncrementalCache<CSharpLanguage>,
493    ) -> LexOutput<CSharpLanguage> {
494        let mut state = LexerState::new_with_cache(source, _changed, _cache);
495
496        while state.not_at_end() {
497            if self.skip_whitespace(&mut state) {
498                continue;
499            }
500
501            if self.lex_newline(&mut state) {
502                continue;
503            }
504
505            if self.lex_comment(&mut state) {
506                continue;
507            }
508
509            if self.lex_string(&mut state) {
510                continue;
511            }
512
513            if self.lex_number(&mut state) {
514                continue;
515            }
516
517            if self.lex_keyword_or_identifier(&mut state) {
518                continue;
519            }
520
521            if self.lex_operator(&mut state) {
522                continue;
523            }
524
525            if self.lex_delimiter(&mut state) {
526                continue;
527            }
528
529            // 如果没有匹配到任何模式,处理错误字符并前进
530            let start_pos = state.get_position();
531            if let Some(ch) = state.peek() {
532                state.advance(ch.len_utf8());
533                state.add_token(CSharpSyntaxKind::Error, start_pos, state.get_position());
534            }
535            else {
536                break;
537            }
538        }
539
540        state.finish(Ok(()))
541    }
542}