oak_csharp/lexer/
mod.rs

1use crate::language::CSharpLanguage;
2pub mod token_type;
3use oak_core::{
4    Lexer, LexerCache, LexerState,
5    lexer::LexOutput,
6    source::{Source, TextEdit},
7};
8pub use token_type::CSharpTokenType;
9
/// Shorthand for the core `LexerState` instantiated with the C# language type.
type State<'a, S> = LexerState<'a, S, CSharpLanguage>;
11
/// Lexer that splits C# source text into `CSharpTokenType` tokens.
pub struct CSharpLexer<'config> {
    /// Borrowed language configuration. None of the lexing methods in this
    /// file read it; it is only stored by `new`.
    _config: &'config CSharpLanguage,
}
15
16impl<'config> CSharpLexer<'config> {
17    pub fn new(config: &'config CSharpLanguage) -> Self {
18        Self { _config: config }
19    }
20
21    /// 跳过空白字符
22    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
23        let start_pos = state.get_position();
24
25        while let Some(ch) = state.peek() {
26            if ch == ' ' || ch == '\t' {
27                state.advance(ch.len_utf8());
28            }
29            else {
30                break;
31            }
32        }
33
34        if state.get_position() > start_pos {
35            state.add_token(CSharpTokenType::Whitespace, start_pos, state.get_position());
36            true
37        }
38        else {
39            false
40        }
41    }
42
43    /// 处理换行
44    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
45        let start_pos = state.get_position();
46
47        if let Some('\n') = state.peek() {
48            state.advance(1);
49            state.add_token(CSharpTokenType::Newline, start_pos, state.get_position());
50            true
51        }
52        else if let Some('\r') = state.peek() {
53            state.advance(1);
54            if let Some('\n') = state.peek() {
55                state.advance(1);
56            }
57            state.add_token(CSharpTokenType::Newline, start_pos, state.get_position());
58            true
59        }
60        else {
61            false
62        }
63    }
64
65    /// 处理注释
66    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
67        let start_pos = state.get_position();
68
69        if let Some('/') = state.peek() {
70            state.advance(1);
71            if let Some('/') = state.peek() {
72                // 单行注释
73                state.advance(1);
74                while let Some(ch) = state.peek() {
75                    if ch == '\n' || ch == '\r' {
76                        break;
77                    }
78                    state.advance(ch.len_utf8());
79                }
80                state.add_token(CSharpTokenType::Comment, start_pos, state.get_position());
81                return true;
82            }
83            else if let Some('*') = state.peek() {
84                // 多行注释
85                state.advance(1);
86                while let Some(ch) = state.peek() {
87                    if ch == '*' {
88                        state.advance(1);
89                        if let Some('/') = state.peek() {
90                            state.advance(1);
91                            break;
92                        }
93                    }
94                    else {
95                        state.advance(ch.len_utf8());
96                    }
97                }
98                state.add_token(CSharpTokenType::Comment, start_pos, state.get_position());
99                return true;
100            }
101            else {
102                // 回退,这不是注释
103                state.set_position(start_pos);
104                return false;
105            }
106        }
107        false
108    }
109
110    /// 处理字符串字面量
111    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
112        let start_pos = state.get_position();
113
114        if let Some('"') = state.peek() {
115            state.advance(1);
116            while let Some(ch) = state.peek() {
117                if ch == '"' {
118                    state.advance(1);
119                    break;
120                }
121                else if ch == '\\' {
122                    state.advance(1);
123                    if let Some(_) = state.peek() {
124                        state.advance(1);
125                    }
126                }
127                else {
128                    state.advance(ch.len_utf8());
129                }
130            }
131            state.add_token(CSharpTokenType::StringLiteral, start_pos, state.get_position());
132            true
133        }
134        else if let Some('\'') = state.peek() {
135            // 字符字面量
136            state.advance(1);
137            while let Some(ch) = state.peek() {
138                if ch == '\'' {
139                    state.advance(1);
140                    break;
141                }
142                else if ch == '\\' {
143                    state.advance(1);
144                    if let Some(_) = state.peek() {
145                        state.advance(1);
146                    }
147                }
148                else {
149                    state.advance(ch.len_utf8());
150                }
151            }
152            state.add_token(CSharpTokenType::CharLiteral, start_pos, state.get_position());
153            true
154        }
155        else {
156            false
157        }
158    }
159
160    /// 处理数字字面量
161    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
162        let start_pos = state.get_position();
163
164        if let Some(ch) = state.peek() {
165            if ch.is_ascii_digit() {
166                state.advance(ch.len_utf8());
167
168                while let Some(ch) = state.peek() {
169                    if ch.is_ascii_digit() || ch == '.' || ch == '_' {
170                        state.advance(ch.len_utf8());
171                    }
172                    else {
173                        break;
174                    }
175                }
176
177                // 处理后缀 (f, d, m, l, ul, etc.)
178                if let Some(ch) = state.peek() {
179                    if ch.is_ascii_alphabetic() {
180                        state.advance(ch.len_utf8());
181                        if let Some(ch2) = state.peek() {
182                            if ch2.is_ascii_alphabetic() {
183                                state.advance(ch2.len_utf8());
184                            }
185                        }
186                    }
187                }
188
189                state.add_token(CSharpTokenType::NumberLiteral, start_pos, state.get_position());
190                true
191            }
192            else {
193                false
194            }
195        }
196        else {
197            false
198        }
199    }
200
201    /// 处理关键字或标识符
202    fn lex_keyword_or_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
203        let start_pos = state.get_position();
204
205        if let Some(ch) = state.peek() {
206            if ch.is_ascii_alphabetic() || ch == '_' || ch == '@' {
207                state.advance(ch.len_utf8());
208
209                while let Some(ch) = state.peek() {
210                    if ch.is_ascii_alphanumeric() || ch == '_' {
211                        state.advance(ch.len_utf8());
212                    }
213                    else {
214                        break;
215                    }
216                }
217
218                let text = state.get_text_in((start_pos..state.get_position()).into());
219                let token_kind = match text.as_ref() {
220                    // C# 关键字
221                    "abstract" => CSharpTokenType::Abstract,
222                    "as" => CSharpTokenType::As,
223                    "base" => CSharpTokenType::Base,
224                    "bool" => CSharpTokenType::Bool,
225                    "break" => CSharpTokenType::Break,
226                    "byte" => CSharpTokenType::Byte,
227                    "case" => CSharpTokenType::Case,
228                    "catch" => CSharpTokenType::Catch,
229                    "char" => CSharpTokenType::Char,
230                    "checked" => CSharpTokenType::Checked,
231                    "class" => CSharpTokenType::Class,
232                    "const" => CSharpTokenType::Const,
233                    "continue" => CSharpTokenType::Continue,
234                    "decimal" => CSharpTokenType::Decimal,
235                    "default" => CSharpTokenType::Default,
236                    "delegate" => CSharpTokenType::Delegate,
237                    "do" => CSharpTokenType::Do,
238                    "double" => CSharpTokenType::Double,
239                    "else" => CSharpTokenType::Else,
240                    "enum" => CSharpTokenType::Enum,
241                    "event" => CSharpTokenType::Event,
242                    "explicit" => CSharpTokenType::Explicit,
243                    "extern" => CSharpTokenType::Extern,
244                    "false" => CSharpTokenType::False,
245                    "finally" => CSharpTokenType::Finally,
246                    "fixed" => CSharpTokenType::Fixed,
247                    "float" => CSharpTokenType::Float,
248                    "for" => CSharpTokenType::For,
249                    "foreach" => CSharpTokenType::Foreach,
250                    "goto" => CSharpTokenType::Goto,
251                    "if" => CSharpTokenType::If,
252                    "implicit" => CSharpTokenType::Implicit,
253                    "in" => CSharpTokenType::In,
254                    "int" => CSharpTokenType::Int,
255                    "interface" => CSharpTokenType::Interface,
256                    "internal" => CSharpTokenType::Internal,
257                    "is" => CSharpTokenType::Is,
258                    "lock" => CSharpTokenType::Lock,
259                    "long" => CSharpTokenType::Long,
260                    "namespace" => CSharpTokenType::Namespace,
261                    "new" => CSharpTokenType::New,
262                    "null" => CSharpTokenType::Null,
263                    "object" => CSharpTokenType::Object,
264                    "operator" => CSharpTokenType::Operator,
265                    "out" => CSharpTokenType::Out,
266                    "override" => CSharpTokenType::Override,
267                    "params" => CSharpTokenType::Params,
268                    "private" => CSharpTokenType::Private,
269                    "protected" => CSharpTokenType::Protected,
270                    "public" => CSharpTokenType::Public,
271                    "readonly" => CSharpTokenType::Readonly,
272                    "ref" => CSharpTokenType::Ref,
273                    "return" => CSharpTokenType::Return,
274                    "sbyte" => CSharpTokenType::Sbyte,
275                    "sealed" => CSharpTokenType::Sealed,
276                    "short" => CSharpTokenType::Short,
277                    "sizeof" => CSharpTokenType::Sizeof,
278                    "stackalloc" => CSharpTokenType::Stackalloc,
279                    "static" => CSharpTokenType::Static,
280                    "string" => CSharpTokenType::String,
281                    "struct" => CSharpTokenType::Struct,
282                    "switch" => CSharpTokenType::Switch,
283                    "this" => CSharpTokenType::This,
284                    "throw" => CSharpTokenType::Throw,
285                    "true" => CSharpTokenType::True,
286                    "try" => CSharpTokenType::Try,
287                    "typeof" => CSharpTokenType::Typeof,
288                    "uint" => CSharpTokenType::Uint,
289                    "ulong" => CSharpTokenType::Ulong,
290                    "unchecked" => CSharpTokenType::Unchecked,
291                    "unsafe" => CSharpTokenType::Unsafe,
292                    "ushort" => CSharpTokenType::Ushort,
293                    "using" => CSharpTokenType::Using,
294                    "virtual" => CSharpTokenType::Virtual,
295                    "void" => CSharpTokenType::Void,
296                    "volatile" => CSharpTokenType::Volatile,
297                    "while" => CSharpTokenType::While,
298                    _ => CSharpTokenType::Identifier,
299                };
300
301                state.add_token(token_kind, start_pos, state.get_position());
302                true
303            }
304            else {
305                false
306            }
307        }
308        else {
309            false
310        }
311    }
312
313    /// 处理操作符
314    fn lex_operator<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
315        let start_pos = state.get_position();
316
317        if let Some(ch) = state.peek() {
318            let token_kind = match ch {
319                '+' => {
320                    state.advance(1);
321                    if let Some('=') = state.peek() {
322                        state.advance(1);
323                        CSharpTokenType::PlusAssign
324                    }
325                    else if let Some('+') = state.peek() {
326                        state.advance(1);
327                        CSharpTokenType::Increment
328                    }
329                    else {
330                        CSharpTokenType::Plus
331                    }
332                }
333                '-' => {
334                    state.advance(1);
335                    if let Some('=') = state.peek() {
336                        state.advance(1);
337                        CSharpTokenType::MinusAssign
338                    }
339                    else if let Some('-') = state.peek() {
340                        state.advance(1);
341                        CSharpTokenType::Decrement
342                    }
343                    else {
344                        CSharpTokenType::Minus
345                    }
346                }
347                '*' => {
348                    state.advance(1);
349                    if let Some('=') = state.peek() {
350                        state.advance(1);
351                        CSharpTokenType::StarAssign
352                    }
353                    else {
354                        CSharpTokenType::Star
355                    }
356                }
357                '/' => {
358                    // 这里不处理注释,因为已经在 lex_comment 中处理了
359                    state.advance(1);
360                    if let Some('=') = state.peek() {
361                        state.advance(1);
362                        CSharpTokenType::SlashAssign
363                    }
364                    else {
365                        CSharpTokenType::Slash
366                    }
367                }
368                '%' => {
369                    state.advance(1);
370                    if let Some('=') = state.peek() {
371                        state.advance(1);
372                        CSharpTokenType::PercentAssign
373                    }
374                    else {
375                        CSharpTokenType::Percent
376                    }
377                }
378                '=' => {
379                    state.advance(1);
380                    if let Some('=') = state.peek() {
381                        state.advance(1);
382                        CSharpTokenType::Equal
383                    }
384                    else {
385                        CSharpTokenType::Assign
386                    }
387                }
388                '!' => {
389                    state.advance(1);
390                    if let Some('=') = state.peek() {
391                        state.advance(1);
392                        CSharpTokenType::NotEqual
393                    }
394                    else {
395                        CSharpTokenType::LogicalNot
396                    }
397                }
398                '<' => {
399                    state.advance(1);
400                    if let Some('=') = state.peek() {
401                        state.advance(1);
402                        CSharpTokenType::LessEqual
403                    }
404                    else if let Some('<') = state.peek() {
405                        state.advance(1);
406                        CSharpTokenType::LeftShift
407                    }
408                    else {
409                        CSharpTokenType::Less
410                    }
411                }
412                '>' => {
413                    state.advance(1);
414                    if let Some('=') = state.peek() {
415                        state.advance(1);
416                        CSharpTokenType::GreaterEqual
417                    }
418                    else if let Some('>') = state.peek() {
419                        state.advance(1);
420                        CSharpTokenType::RightShift
421                    }
422                    else {
423                        CSharpTokenType::Greater
424                    }
425                }
426                '&' => {
427                    state.advance(1);
428                    if let Some('&') = state.peek() {
429                        state.advance(1);
430                        CSharpTokenType::LogicalAnd
431                    }
432                    else {
433                        CSharpTokenType::Ampersand
434                    }
435                }
436                '|' => {
437                    state.advance(1);
438                    if let Some('|') = state.peek() {
439                        state.advance(1);
440                        CSharpTokenType::LogicalOr
441                    }
442                    else {
443                        CSharpTokenType::Pipe
444                    }
445                }
446                '^' => {
447                    state.advance(1);
448                    CSharpTokenType::Caret
449                }
450                '~' => {
451                    state.advance(1);
452                    CSharpTokenType::Tilde
453                }
454                _ => return false,
455            };
456
457            state.add_token(token_kind, start_pos, state.get_position());
458            true
459        }
460        else {
461            false
462        }
463    }
464
465    /// 处理分隔符
466    fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
467        let start_pos = state.get_position();
468
469        if let Some(ch) = state.peek() {
470            let token_kind = match ch {
471                '(' => CSharpTokenType::LeftParen,
472                ')' => CSharpTokenType::RightParen,
473                '[' => CSharpTokenType::LeftBracket,
474                ']' => CSharpTokenType::RightBracket,
475                '{' => CSharpTokenType::LeftBrace,
476                '}' => CSharpTokenType::RightBrace,
477                ';' => CSharpTokenType::Semicolon,
478                ',' => CSharpTokenType::Comma,
479                '.' => CSharpTokenType::Dot,
480                ':' => CSharpTokenType::Colon,
481                '?' => CSharpTokenType::Question,
482                _ => return false,
483            };
484
485            state.advance(ch.len_utf8());
486            state.add_token(token_kind, start_pos, state.get_position());
487            true
488        }
489        else {
490            false
491        }
492    }
493
494    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), oak_core::OakError> {
495        while state.not_at_end() {
496            let safe_point = state.get_position();
497
498            if self.skip_whitespace(state) {
499                continue;
500            }
501
502            if self.lex_newline(state) {
503                continue;
504            }
505
506            if self.lex_comment(state) {
507                continue;
508            }
509
510            if self.lex_string(state) {
511                continue;
512            }
513
514            if self.lex_number(state) {
515                continue;
516            }
517
518            if self.lex_keyword_or_identifier(state) {
519                continue;
520            }
521
522            if self.lex_operator(state) {
523                continue;
524            }
525
526            if self.lex_delimiter(state) {
527                continue;
528            }
529
530            // 如果没有匹配到任何模式,处理错误字符并前进
531            let start_pos = state.get_position();
532            if let Some(ch) = state.peek() {
533                state.advance(ch.len_utf8());
534                state.add_token(CSharpTokenType::Error, start_pos, state.get_position());
535            }
536
537            state.advance_if_dead_lock(safe_point);
538        }
539        Ok(())
540    }
541}
542
543impl<'config> Lexer<CSharpLanguage> for CSharpLexer<'config> {
544    fn lex<'a, S: Source + ?Sized>(&self, text: &S, _edits: &[TextEdit], mut cache: &'a mut impl LexerCache<CSharpLanguage>) -> LexOutput<CSharpLanguage> {
545        let mut state = LexerState::new(text);
546        let result = self.run(&mut state);
547        if result.is_ok() {
548            state.add_eof();
549        }
550        state.finish_with_cache(result, &mut cache)
551    }
552}