Skip to main content

oak_csharp/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2use crate::language::CSharpLanguage;
3pub mod token_type;
4use oak_core::{
5    Lexer, LexerCache, LexerState,
6    lexer::LexOutput,
7    source::{Source, TextEdit},
8};
9pub use token_type::CSharpTokenType;
10
11type State<'a, S> = LexerState<'a, S, CSharpLanguage>;
12
13pub struct CSharpLexer<'config> {
14    _config: &'config CSharpLanguage,
15}
16
17impl<'config> CSharpLexer<'config> {
18    pub fn new(config: &'config CSharpLanguage) -> Self {
19        Self { _config: config }
20    }
21
22    /// 跳过空白字符
23    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
24        let start_pos = state.get_position();
25
26        while let Some(ch) = state.peek() {
27            if ch == ' ' || ch == '\t' {
28                state.advance(ch.len_utf8());
29            }
30            else {
31                break;
32            }
33        }
34
35        if state.get_position() > start_pos {
36            state.add_token(CSharpTokenType::Whitespace, start_pos, state.get_position());
37            true
38        }
39        else {
40            false
41        }
42    }
43
44    /// 处理换行
45    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
46        let start_pos = state.get_position();
47
48        if let Some('\n') = state.peek() {
49            state.advance(1);
50            state.add_token(CSharpTokenType::Newline, start_pos, state.get_position());
51            true
52        }
53        else if let Some('\r') = state.peek() {
54            state.advance(1);
55            if let Some('\n') = state.peek() {
56                state.advance(1);
57            }
58            state.add_token(CSharpTokenType::Newline, start_pos, state.get_position());
59            true
60        }
61        else {
62            false
63        }
64    }
65
66    /// 处理注释
67    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
68        let start_pos = state.get_position();
69
70        if let Some('/') = state.peek() {
71            state.advance(1);
72            if let Some('/') = state.peek() {
73                // 单行注释
74                state.advance(1);
75                while let Some(ch) = state.peek() {
76                    if ch == '\n' || ch == '\r' {
77                        break;
78                    }
79                    state.advance(ch.len_utf8());
80                }
81                state.add_token(CSharpTokenType::Comment, start_pos, state.get_position());
82                return true;
83            }
84            else if let Some('*') = state.peek() {
85                // 多行注释
86                state.advance(1);
87                while let Some(ch) = state.peek() {
88                    if ch == '*' {
89                        state.advance(1);
90                        if let Some('/') = state.peek() {
91                            state.advance(1);
92                            break;
93                        }
94                    }
95                    else {
96                        state.advance(ch.len_utf8());
97                    }
98                }
99                state.add_token(CSharpTokenType::Comment, start_pos, state.get_position());
100                return true;
101            }
102            else {
103                // 回退,这不是注释
104                state.set_position(start_pos);
105                return false;
106            }
107        }
108        false
109    }
110
111    /// 处理字符串字面量
112    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
113        let start_pos = state.get_position();
114
115        if let Some('"') = state.peek() {
116            state.advance(1);
117            while let Some(ch) = state.peek() {
118                if ch == '"' {
119                    state.advance(1);
120                    break;
121                }
122                else if ch == '\\' {
123                    state.advance(1);
124                    if let Some(_) = state.peek() {
125                        state.advance(1)
126                    }
127                }
128                else {
129                    state.advance(ch.len_utf8())
130                }
131            }
132            state.add_token(CSharpTokenType::StringLiteral, start_pos, state.get_position());
133            true
134        }
135        else if let Some('\'') = state.peek() {
136            // 字符字面量
137            state.advance(1);
138            while let Some(ch) = state.peek() {
139                if ch == '\'' {
140                    state.advance(1);
141                    break;
142                }
143                else if ch == '\\' {
144                    state.advance(1);
145                    if let Some(_) = state.peek() {
146                        state.advance(1)
147                    }
148                }
149                else {
150                    state.advance(ch.len_utf8())
151                }
152            }
153            state.add_token(CSharpTokenType::CharLiteral, start_pos, state.get_position());
154            true
155        }
156        else {
157            false
158        }
159    }
160
161    /// 处理数字字面量
162    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
163        let start_pos = state.get_position();
164
165        if let Some(ch) = state.peek() {
166            if ch.is_ascii_digit() {
167                state.advance(ch.len_utf8());
168
169                while let Some(ch) = state.peek() {
170                    if ch.is_ascii_digit() || ch == '.' || ch == '_' { state.advance(ch.len_utf8()) } else { break }
171                }
172
173                // 处理后缀 (f, d, m, l, ul, etc.)
174                if let Some(ch) = state.peek() {
175                    if ch.is_ascii_alphabetic() {
176                        state.advance(ch.len_utf8());
177                        if let Some(ch2) = state.peek() {
178                            if ch2.is_ascii_alphabetic() {
179                                state.advance(ch2.len_utf8())
180                            }
181                        }
182                    }
183                }
184
185                state.add_token(CSharpTokenType::NumberLiteral, start_pos, state.get_position());
186                true
187            }
188            else {
189                false
190            }
191        }
192        else {
193            false
194        }
195    }
196
197    /// 处理关键字或标识符
198    fn lex_keyword_or_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
199        let start_pos = state.get_position();
200
201        if let Some(ch) = state.peek() {
202            if ch.is_ascii_alphabetic() || ch == '_' || ch == '@' {
203                state.advance(ch.len_utf8());
204
205                while let Some(ch) = state.peek() {
206                    if ch.is_ascii_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
207                }
208
209                let text = state.get_text_in((start_pos..state.get_position()).into());
210                let token_kind = match text.as_ref() {
211                    // C# 关键字
212                    "abstract" => CSharpTokenType::Abstract,
213                    "as" => CSharpTokenType::As,
214                    "async" => CSharpTokenType::AsyncKeyword,
215                    "await" => CSharpTokenType::AwaitKeyword,
216                    "base" => CSharpTokenType::Base,
217                    "bool" => CSharpTokenType::Bool,
218                    "break" => CSharpTokenType::Break,
219                    "byte" => CSharpTokenType::Byte,
220                    "case" => CSharpTokenType::Case,
221                    "catch" => CSharpTokenType::Catch,
222                    "char" => CSharpTokenType::Char,
223                    "checked" => CSharpTokenType::Checked,
224                    "class" => CSharpTokenType::Class,
225                    "const" => CSharpTokenType::Const,
226                    "continue" => CSharpTokenType::Continue,
227                    "decimal" => CSharpTokenType::Decimal,
228                    "default" => CSharpTokenType::Default,
229                    "delegate" => CSharpTokenType::Delegate,
230                    "do" => CSharpTokenType::Do,
231                    "double" => CSharpTokenType::Double,
232                    "else" => CSharpTokenType::Else,
233                    "enum" => CSharpTokenType::Enum,
234                    "event" => CSharpTokenType::Event,
235                    "explicit" => CSharpTokenType::Explicit,
236                    "extern" => CSharpTokenType::Extern,
237                    "false" => CSharpTokenType::False,
238                    "finally" => CSharpTokenType::Finally,
239                    "fixed" => CSharpTokenType::Fixed,
240                    "float" => CSharpTokenType::Float,
241                    "for" => CSharpTokenType::For,
242                    "foreach" => CSharpTokenType::Foreach,
243                    "goto" => CSharpTokenType::Goto,
244                    "if" => CSharpTokenType::If,
245                    "implicit" => CSharpTokenType::Implicit,
246                    "in" => CSharpTokenType::In,
247                    "int" => CSharpTokenType::Int,
248                    "interface" => CSharpTokenType::Interface,
249                    "internal" => CSharpTokenType::Internal,
250                    "is" => CSharpTokenType::Is,
251                    "lock" => CSharpTokenType::Lock,
252                    "long" => CSharpTokenType::Long,
253                    "namespace" => CSharpTokenType::Namespace,
254                    "new" => CSharpTokenType::New,
255                    "null" => CSharpTokenType::Null,
256                    "object" => CSharpTokenType::Object,
257                    "operator" => CSharpTokenType::Operator,
258                    "out" => CSharpTokenType::Out,
259                    "override" => CSharpTokenType::Override,
260                    "params" => CSharpTokenType::Params,
261                    "private" => CSharpTokenType::Private,
262                    "protected" => CSharpTokenType::Protected,
263                    "public" => CSharpTokenType::Public,
264                    "readonly" => CSharpTokenType::Readonly,
265                    "record" => CSharpTokenType::Record,
266                    "ref" => CSharpTokenType::Ref,
267                    "return" => CSharpTokenType::Return,
268                    "sbyte" => CSharpTokenType::Sbyte,
269                    "sealed" => CSharpTokenType::Sealed,
270                    "short" => CSharpTokenType::Short,
271                    "sizeof" => CSharpTokenType::Sizeof,
272                    "stackalloc" => CSharpTokenType::Stackalloc,
273                    "static" => CSharpTokenType::Static,
274                    "string" => CSharpTokenType::String,
275                    "struct" => CSharpTokenType::Struct,
276                    "switch" => CSharpTokenType::Switch,
277                    "this" => CSharpTokenType::This,
278                    "throw" => CSharpTokenType::Throw,
279                    "true" => CSharpTokenType::True,
280                    "try" => CSharpTokenType::Try,
281                    "typeof" => CSharpTokenType::Typeof,
282                    "uint" => CSharpTokenType::Uint,
283                    "ulong" => CSharpTokenType::Ulong,
284                    "unchecked" => CSharpTokenType::Unchecked,
285                    "unsafe" => CSharpTokenType::Unsafe,
286                    "ushort" => CSharpTokenType::Ushort,
287                    "using" => CSharpTokenType::Using,
288                    "virtual" => CSharpTokenType::Virtual,
289                    "void" => CSharpTokenType::Void,
290                    "volatile" => CSharpTokenType::Volatile,
291                    "while" => CSharpTokenType::While,
292                    _ => CSharpTokenType::Identifier,
293                };
294
295                state.add_token(token_kind, start_pos, state.get_position());
296                true
297            }
298            else {
299                false
300            }
301        }
302        else {
303            false
304        }
305    }
306
307    /// 处理操作符
308    fn lex_operator<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
309        let start_pos = state.get_position();
310
311        if let Some(ch) = state.peek() {
312            let token_kind = match ch {
313                '+' => {
314                    state.advance(1);
315                    if let Some('=') = state.peek() {
316                        state.advance(1);
317                        CSharpTokenType::PlusAssign
318                    }
319                    else if let Some('+') = state.peek() {
320                        state.advance(1);
321                        CSharpTokenType::Increment
322                    }
323                    else {
324                        CSharpTokenType::Plus
325                    }
326                }
327                '-' => {
328                    state.advance(1);
329                    if let Some('=') = state.peek() {
330                        state.advance(1);
331                        CSharpTokenType::MinusAssign
332                    }
333                    else if let Some('-') = state.peek() {
334                        state.advance(1);
335                        CSharpTokenType::Decrement
336                    }
337                    else {
338                        CSharpTokenType::Minus
339                    }
340                }
341                '*' => {
342                    state.advance(1);
343                    if let Some('=') = state.peek() {
344                        state.advance(1);
345                        CSharpTokenType::StarAssign
346                    }
347                    else {
348                        CSharpTokenType::Star
349                    }
350                }
351                '/' => {
352                    // 这里不处理注释,因为已经在 lex_comment 中处理了
353                    state.advance(1);
354                    if let Some('=') = state.peek() {
355                        state.advance(1);
356                        CSharpTokenType::SlashAssign
357                    }
358                    else {
359                        CSharpTokenType::Slash
360                    }
361                }
362                '%' => {
363                    state.advance(1);
364                    if let Some('=') = state.peek() {
365                        state.advance(1);
366                        CSharpTokenType::PercentAssign
367                    }
368                    else {
369                        CSharpTokenType::Percent
370                    }
371                }
372                '=' => {
373                    state.advance(1);
374                    if let Some('=') = state.peek() {
375                        state.advance(1);
376                        CSharpTokenType::Equal
377                    }
378                    else {
379                        CSharpTokenType::Assign
380                    }
381                }
382                '!' => {
383                    state.advance(1);
384                    if let Some('=') = state.peek() {
385                        state.advance(1);
386                        CSharpTokenType::NotEqual
387                    }
388                    else {
389                        CSharpTokenType::LogicalNot
390                    }
391                }
392                '<' => {
393                    state.advance(1);
394                    if let Some('=') = state.peek() {
395                        state.advance(1);
396                        CSharpTokenType::LessEqual
397                    }
398                    else if let Some('<') = state.peek() {
399                        state.advance(1);
400                        CSharpTokenType::LeftShift
401                    }
402                    else {
403                        CSharpTokenType::Less
404                    }
405                }
406                '>' => {
407                    state.advance(1);
408                    if let Some('=') = state.peek() {
409                        state.advance(1);
410                        CSharpTokenType::GreaterEqual
411                    }
412                    else if let Some('>') = state.peek() {
413                        state.advance(1);
414                        CSharpTokenType::RightShift
415                    }
416                    else {
417                        CSharpTokenType::Greater
418                    }
419                }
420                '&' => {
421                    state.advance(1);
422                    if let Some('&') = state.peek() {
423                        state.advance(1);
424                        CSharpTokenType::LogicalAnd
425                    }
426                    else {
427                        CSharpTokenType::Ampersand
428                    }
429                }
430                '|' => {
431                    state.advance(1);
432                    if let Some('|') = state.peek() {
433                        state.advance(1);
434                        CSharpTokenType::LogicalOr
435                    }
436                    else {
437                        CSharpTokenType::Pipe
438                    }
439                }
440                '^' => {
441                    state.advance(1);
442                    CSharpTokenType::Caret
443                }
444                '~' => {
445                    state.advance(1);
446                    CSharpTokenType::Tilde
447                }
448                _ => return false,
449            };
450
451            state.add_token(token_kind, start_pos, state.get_position());
452            true
453        }
454        else {
455            false
456        }
457    }
458
459    /// 处理分隔符
460    fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
461        let start_pos = state.get_position();
462
463        if let Some(ch) = state.peek() {
464            let token_kind = match ch {
465                '(' => CSharpTokenType::LeftParen,
466                ')' => CSharpTokenType::RightParen,
467                '[' => CSharpTokenType::LeftBracket,
468                ']' => CSharpTokenType::RightBracket,
469                '{' => CSharpTokenType::LeftBrace,
470                '}' => CSharpTokenType::RightBrace,
471                ';' => CSharpTokenType::Semicolon,
472                ',' => CSharpTokenType::Comma,
473                '.' => CSharpTokenType::Dot,
474                ':' => CSharpTokenType::Colon,
475                '?' => CSharpTokenType::Question,
476                _ => return false,
477            };
478
479            state.advance(ch.len_utf8());
480            state.add_token(token_kind, start_pos, state.get_position());
481            true
482        }
483        else {
484            false
485        }
486    }
487
488    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), oak_core::OakError> {
489        while state.not_at_end() {
490            let safe_point = state.get_position();
491
492            if self.skip_whitespace(state) {
493                continue;
494            }
495
496            if self.lex_newline(state) {
497                continue;
498            }
499
500            if self.lex_comment(state) {
501                continue;
502            }
503
504            if self.lex_string(state) {
505                continue;
506            }
507
508            if self.lex_number(state) {
509                continue;
510            }
511
512            if self.lex_keyword_or_identifier(state) {
513                continue;
514            }
515
516            if self.lex_operator(state) {
517                continue;
518            }
519
520            if self.lex_delimiter(state) {
521                continue;
522            }
523
524            // 如果没有匹配到任何模式,处理错误字符并前进
525            let start_pos = state.get_position();
526            if let Some(ch) = state.peek() {
527                state.advance(ch.len_utf8());
528                state.add_token(CSharpTokenType::Error, start_pos, state.get_position());
529            }
530
531            state.advance_if_dead_lock(safe_point)
532        }
533        Ok(())
534    }
535}
536
537impl<'config> Lexer<CSharpLanguage> for CSharpLexer<'config> {
538    fn lex<'a, S: Source + ?Sized>(&self, text: &S, _edits: &[TextEdit], mut cache: &'a mut impl LexerCache<CSharpLanguage>) -> LexOutput<CSharpLanguage> {
539        let mut state = LexerState::new(text);
540        let result = self.run(&mut state);
541        if result.is_ok() {
542            state.add_eof();
543        }
544        state.finish_with_cache(result, &mut cache)
545    }
546}