Skip to main content

oak_csharp/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2
3use crate::language::CSharpLanguage;
4
5/// Token types and definitions for the C# lexer.
6pub mod token_type;
7
8use oak_core::{
9    Lexer, LexerCache, LexerState,
10    lexer::LexOutput,
11    source::{Source, TextEdit},
12};
13pub use token_type::CSharpTokenType;
14
15pub(crate) type State<'a, S> = LexerState<'a, S, CSharpLanguage>;
16
17/// A lexer for the C# language.
18pub struct CSharpLexer<'config> {
19    config: &'config CSharpLanguage,
20}
21
22impl<'config> CSharpLexer<'config> {
23    /// Creates a new C# lexer.
24    pub fn new(config: &'config CSharpLanguage) -> Self {
25        Self { config }
26    }
27
28    /// Skips whitespace characters (spaces and tabs).
29    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
30        let start_pos = state.get_position();
31
32        while let Some(ch) = state.peek() {
33            if ch == ' ' || ch == '\t' {
34                state.advance(ch.len_utf8());
35            }
36            else {
37                break;
38            }
39        }
40
41        if state.get_position() > start_pos {
42            state.add_token(CSharpTokenType::Whitespace, start_pos, state.get_position());
43            true
44        }
45        else {
46            false
47        }
48    }
49
50    /// Lexes a newline character (LF or CRLF).
51    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
52        let start_pos = state.get_position();
53
54        if let Some('\n') = state.peek() {
55            state.advance(1);
56            state.add_token(CSharpTokenType::Newline, start_pos, state.get_position());
57            true
58        }
59        else if let Some('\r') = state.peek() {
60            state.advance(1);
61            if let Some('\n') = state.peek() {
62                state.advance(1);
63            }
64            state.add_token(CSharpTokenType::Newline, start_pos, state.get_position());
65            true
66        }
67        else {
68            false
69        }
70    }
71
72    /// Lexes a comment (single-line `//` or multi-line `/* ... */`).
73    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
74        let start_pos = state.get_position();
75
76        if let Some('/') = state.peek() {
77            state.advance(1);
78            if let Some('/') = state.peek() {
79                // Single-line comment
80                state.advance(1);
81                while let Some(ch) = state.peek() {
82                    if ch == '\n' || ch == '\r' {
83                        break;
84                    }
85                    state.advance(ch.len_utf8());
86                }
87                state.add_token(CSharpTokenType::Comment, start_pos, state.get_position());
88                return true;
89            }
90            else if let Some('*') = state.peek() {
91                // Multi-line comment
92                state.advance(1);
93                while let Some(ch) = state.peek() {
94                    if ch == '*' {
95                        state.advance(1);
96                        if let Some('/') = state.peek() {
97                            state.advance(1);
98                            break;
99                        }
100                    }
101                    else {
102                        state.advance(ch.len_utf8());
103                    }
104                }
105                state.add_token(CSharpTokenType::Comment, start_pos, state.get_position());
106                return true;
107            }
108            else {
109                // Backtrack, not a comment
110                state.set_position(start_pos);
111                return false;
112            }
113        }
114        false
115    }
116
117    /// Lexes a string literal (`"..."`) or character literal (`'...'`).
118    /// Handles basic escape sequences.
119    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
120        let start_pos = state.get_position();
121
122        if let Some('"') = state.peek() {
123            state.advance(1);
124            while let Some(ch) = state.peek() {
125                if ch == '"' {
126                    state.advance(1);
127                    break;
128                }
129                else if ch == '\\' {
130                    state.advance(1);
131                    if let Some(_) = state.peek() {
132                        state.advance(1)
133                    }
134                }
135                else {
136                    state.advance(ch.len_utf8())
137                }
138            }
139            state.add_token(CSharpTokenType::StringLiteral, start_pos, state.get_position());
140            true
141        }
142        else if let Some('\'') = state.peek() {
143            // Char literal
144            state.advance(1);
145            while let Some(ch) = state.peek() {
146                if ch == '\'' {
147                    state.advance(1);
148                    break;
149                }
150                else if ch == '\\' {
151                    state.advance(1);
152                    if let Some(_) = state.peek() {
153                        state.advance(1)
154                    }
155                }
156                else {
157                    state.advance(ch.len_utf8())
158                }
159            }
160            state.add_token(CSharpTokenType::CharLiteral, start_pos, state.get_position());
161            true
162        }
163        else {
164            false
165        }
166    }
167
168    /// Lexes a number literal.
169    ///
170    /// Supports:
171    /// - Decimal integers (`123`)
172    /// - Floating-point numbers (`123.45`, `1.2e3`)
173    /// - Underscore separators (`1_000_000`)
174    /// - Type suffixes (`f`, `d`, `m`, `l`, `ul`, etc.)
175    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
176        let start_pos = state.get_position();
177
178        if let Some(ch) = state.peek() {
179            if ch.is_ascii_digit() {
180                state.advance(ch.len_utf8());
181
182                while let Some(ch) = state.peek() {
183                    if ch.is_ascii_digit() || ch == '.' || ch == '_' { state.advance(ch.len_utf8()) } else { break }
184                }
185
186                // Handle suffixes (f, d, m, l, ul, etc.)
187                if let Some(ch) = state.peek() {
188                    if ch.is_ascii_alphabetic() {
189                        state.advance(ch.len_utf8());
190                        if let Some(ch2) = state.peek() {
191                            if ch2.is_ascii_alphabetic() {
192                                state.advance(ch2.len_utf8())
193                            }
194                        }
195                    }
196                }
197
198                state.add_token(CSharpTokenType::NumberLiteral, start_pos, state.get_position());
199                true
200            }
201            else {
202                false
203            }
204        }
205        else {
206            false
207        }
208    }
209
210    /// Lexes a keyword or identifier.
211    ///
212    /// Identifiers can start with a letter, underscore, or `@` (for verbatim identifiers).
213    /// Subsequent characters can be letters, digits, or underscores.
214    ///
215    /// Keywords are matched against the standard C# keyword list. If a match is found,
216    /// the specific keyword token is returned; otherwise, it is treated as an identifier.
217    fn lex_keyword_or_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
218        let start_pos = state.get_position();
219
220        if let Some(ch) = state.peek() {
221            if ch.is_ascii_alphabetic() || ch == '_' || ch == '@' {
222                state.advance(ch.len_utf8());
223
224                while let Some(ch) = state.peek() {
225                    if ch.is_ascii_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
226                }
227
228                let text = state.get_text_in((start_pos..state.get_position()).into());
229                let token_kind = match text.as_ref() {
230                    // C# Keywords
231                    "abstract" => CSharpTokenType::Abstract,
232                    "as" => CSharpTokenType::As,
233                    "async" => CSharpTokenType::AsyncKeyword,
234                    "await" => CSharpTokenType::AwaitKeyword,
235                    "base" => CSharpTokenType::Base,
236                    "bool" => CSharpTokenType::Bool,
237                    "break" => CSharpTokenType::Break,
238                    "byte" => CSharpTokenType::Byte,
239                    "case" => CSharpTokenType::Case,
240                    "catch" => CSharpTokenType::Catch,
241                    "char" => CSharpTokenType::Char,
242                    "checked" => CSharpTokenType::Checked,
243                    "class" => CSharpTokenType::Class,
244                    "const" => CSharpTokenType::Const,
245                    "continue" => CSharpTokenType::Continue,
246                    "decimal" => CSharpTokenType::Decimal,
247                    "default" => CSharpTokenType::Default,
248                    "delegate" => CSharpTokenType::Delegate,
249                    "do" => CSharpTokenType::Do,
250                    "double" => CSharpTokenType::Double,
251                    "else" => CSharpTokenType::Else,
252                    "enum" => CSharpTokenType::Enum,
253                    "event" => CSharpTokenType::Event,
254                    "explicit" => CSharpTokenType::Explicit,
255                    "extern" => CSharpTokenType::Extern,
256                    "false" => CSharpTokenType::False,
257                    "finally" => CSharpTokenType::Finally,
258                    "fixed" => CSharpTokenType::Fixed,
259                    "float" => CSharpTokenType::Float,
260                    "for" => CSharpTokenType::For,
261                    "foreach" => CSharpTokenType::Foreach,
262                    "goto" => CSharpTokenType::Goto,
263                    "if" => CSharpTokenType::If,
264                    "implicit" => CSharpTokenType::Implicit,
265                    "in" => CSharpTokenType::In,
266                    "int" => CSharpTokenType::Int,
267                    "interface" => CSharpTokenType::Interface,
268                    "internal" => CSharpTokenType::Internal,
269                    "is" => CSharpTokenType::Is,
270                    "lock" => CSharpTokenType::Lock,
271                    "long" => CSharpTokenType::Long,
272                    "namespace" => CSharpTokenType::Namespace,
273                    "new" => CSharpTokenType::New,
274                    "null" => CSharpTokenType::Null,
275                    "object" => CSharpTokenType::Object,
276                    "operator" => CSharpTokenType::Operator,
277                    "out" => CSharpTokenType::Out,
278                    "override" => CSharpTokenType::Override,
279                    "params" => CSharpTokenType::Params,
280                    "private" => CSharpTokenType::Private,
281                    "protected" => CSharpTokenType::Protected,
282                    "public" => CSharpTokenType::Public,
283                    "readonly" => CSharpTokenType::Readonly,
284                    "record" => CSharpTokenType::Record,
285                    "ref" => CSharpTokenType::Ref,
286                    "return" => CSharpTokenType::Return,
287                    "sbyte" => CSharpTokenType::Sbyte,
288                    "sealed" => CSharpTokenType::Sealed,
289                    "short" => CSharpTokenType::Short,
290                    "sizeof" => CSharpTokenType::Sizeof,
291                    "stackalloc" => CSharpTokenType::Stackalloc,
292                    "static" => CSharpTokenType::Static,
293                    "string" => CSharpTokenType::String,
294                    "struct" => CSharpTokenType::Struct,
295                    "switch" => CSharpTokenType::Switch,
296                    "this" => CSharpTokenType::This,
297                    "throw" => CSharpTokenType::Throw,
298                    "true" => CSharpTokenType::True,
299                    "try" => CSharpTokenType::Try,
300                    "typeof" => CSharpTokenType::Typeof,
301                    "uint" => CSharpTokenType::Uint,
302                    "ulong" => CSharpTokenType::Ulong,
303                    "unchecked" => CSharpTokenType::Unchecked,
304                    "unsafe" => CSharpTokenType::Unsafe,
305                    "ushort" => CSharpTokenType::Ushort,
306                    "using" => CSharpTokenType::Using,
307                    "virtual" => CSharpTokenType::Virtual,
308                    "void" => CSharpTokenType::Void,
309                    "volatile" => CSharpTokenType::Volatile,
310                    "while" => CSharpTokenType::While,
311                    _ => CSharpTokenType::Identifier,
312                };
313
314                state.add_token(token_kind, start_pos, state.get_position());
315                true
316            }
317            else {
318                false
319            }
320        }
321        else {
322            false
323        }
324    }
325
326    /// Lexes an operator.
327    ///
328    /// Handles single-character and multi-character operators, including:
329    /// - Arithmetic: `+`, `-`, `*`, `/`, `%`
330    /// - Assignment: `=`, `+=`, `-=`, `*=`, `/=`, `%=`
331    /// - Increment/Decrement: `++`, `--`
332    /// - Comparison: `==`, `!=`, `<`, `<=`, `>`, `>=`
333    /// - Logical: `&&`, `||`, `!`
334    /// - Bitwise: `&`, `|`, `^`, `~`, `<<`, `>>`
335    fn lex_operator<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
336        let start_pos = state.get_position();
337
338        if let Some(ch) = state.peek() {
339            let token_kind = match ch {
340                '+' => {
341                    state.advance(1);
342                    if let Some('=') = state.peek() {
343                        state.advance(1);
344                        CSharpTokenType::PlusAssign
345                    }
346                    else if let Some('+') = state.peek() {
347                        state.advance(1);
348                        CSharpTokenType::Increment
349                    }
350                    else {
351                        CSharpTokenType::Plus
352                    }
353                }
354                '-' => {
355                    state.advance(1);
356                    if let Some('=') = state.peek() {
357                        state.advance(1);
358                        CSharpTokenType::MinusAssign
359                    }
360                    else if let Some('-') = state.peek() {
361                        state.advance(1);
362                        CSharpTokenType::Decrement
363                    }
364                    else {
365                        CSharpTokenType::Minus
366                    }
367                }
368                '*' => {
369                    state.advance(1);
370                    if let Some('=') = state.peek() {
371                        state.advance(1);
372                        CSharpTokenType::StarAssign
373                    }
374                    else {
375                        CSharpTokenType::Star
376                    }
377                }
378                '/' => {
379                    // Comments are handled in lex_comment
380                    state.advance(1);
381                    if let Some('=') = state.peek() {
382                        state.advance(1);
383                        CSharpTokenType::SlashAssign
384                    }
385                    else {
386                        CSharpTokenType::Slash
387                    }
388                }
389                '%' => {
390                    state.advance(1);
391                    if let Some('=') = state.peek() {
392                        state.advance(1);
393                        CSharpTokenType::PercentAssign
394                    }
395                    else {
396                        CSharpTokenType::Percent
397                    }
398                }
399                '=' => {
400                    state.advance(1);
401                    if let Some('=') = state.peek() {
402                        state.advance(1);
403                        CSharpTokenType::Equal
404                    }
405                    else {
406                        CSharpTokenType::Assign
407                    }
408                }
409                '!' => {
410                    state.advance(1);
411                    if let Some('=') = state.peek() {
412                        state.advance(1);
413                        CSharpTokenType::NotEqual
414                    }
415                    else {
416                        CSharpTokenType::LogicalNot
417                    }
418                }
419                '<' => {
420                    state.advance(1);
421                    if let Some('=') = state.peek() {
422                        state.advance(1);
423                        CSharpTokenType::LessEqual
424                    }
425                    else if let Some('<') = state.peek() {
426                        state.advance(1);
427                        CSharpTokenType::LeftShift
428                    }
429                    else {
430                        CSharpTokenType::Less
431                    }
432                }
433                '>' => {
434                    state.advance(1);
435                    if let Some('=') = state.peek() {
436                        state.advance(1);
437                        CSharpTokenType::GreaterEqual
438                    }
439                    else if let Some('>') = state.peek() {
440                        state.advance(1);
441                        CSharpTokenType::RightShift
442                    }
443                    else {
444                        CSharpTokenType::Greater
445                    }
446                }
447                '&' => {
448                    state.advance(1);
449                    if let Some('&') = state.peek() {
450                        state.advance(1);
451                        CSharpTokenType::LogicalAnd
452                    }
453                    else {
454                        CSharpTokenType::Ampersand
455                    }
456                }
457                '|' => {
458                    state.advance(1);
459                    if let Some('|') = state.peek() {
460                        state.advance(1);
461                        CSharpTokenType::LogicalOr
462                    }
463                    else {
464                        CSharpTokenType::Pipe
465                    }
466                }
467                '^' => {
468                    state.advance(1);
469                    CSharpTokenType::Caret
470                }
471                '~' => {
472                    state.advance(1);
473                    CSharpTokenType::Tilde
474                }
475                _ => return false,
476            };
477
478            state.add_token(token_kind, start_pos, state.get_position());
479            true
480        }
481        else {
482            false
483        }
484    }
485
486    /// Lexes a delimiter.
487    ///
488    /// Handles structural characters such as:
489    /// - Parentheses: `(`, `)`
490    /// - Brackets: `[`, `]`
491    /// - Braces: `{`, `}`
492    /// - Punctuation: `;`, `,`, `.`, `:`, `?`
493    fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
494        let start_pos = state.get_position();
495
496        if let Some(ch) = state.peek() {
497            let token_kind = match ch {
498                '(' => CSharpTokenType::LeftParen,
499                ')' => CSharpTokenType::RightParen,
500                '[' => CSharpTokenType::LeftBracket,
501                ']' => CSharpTokenType::RightBracket,
502                '{' => CSharpTokenType::LeftBrace,
503                '}' => CSharpTokenType::RightBrace,
504                ';' => CSharpTokenType::Semicolon,
505                ',' => CSharpTokenType::Comma,
506                '.' => CSharpTokenType::Dot,
507                ':' => CSharpTokenType::Colon,
508                '?' => CSharpTokenType::Question,
509                _ => return false,
510            };
511
512            state.advance(ch.len_utf8());
513            state.add_token(token_kind, start_pos, state.get_position());
514            true
515        }
516        else {
517            false
518        }
519    }
520
521    /// Runs the lexer on the input state.
522    ///
523    /// This method performs the main lexing loop, attempting to match various
524    /// token types (whitespace, comments, literals, keywords, etc.) until the
525    /// end of the input is reached.
526    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), oak_core::OakError> {
527        while state.not_at_end() {
528            let safe_point = state.get_position();
529
530            if self.skip_whitespace(state) {
531                continue;
532            }
533
534            if self.lex_newline(state) {
535                continue;
536            }
537
538            if self.lex_comment(state) {
539                continue;
540            }
541
542            if self.lex_string(state) {
543                continue;
544            }
545
546            if self.lex_number(state) {
547                continue;
548            }
549
550            if self.lex_keyword_or_identifier(state) {
551                continue;
552            }
553
554            if self.lex_operator(state) {
555                continue;
556            }
557
558            if self.lex_delimiter(state) {
559                continue;
560            }
561
562            // If no pattern matches, handle the error character and advance
563            let start_pos = state.get_position();
564            if let Some(ch) = state.peek() {
565                state.advance(ch.len_utf8());
566                state.add_token(CSharpTokenType::Error, start_pos, state.get_position());
567            }
568
569            state.advance_if_dead_lock(safe_point)
570        }
571        Ok(())
572    }
573}
574
575impl<'config> Lexer<CSharpLanguage> for CSharpLexer<'config> {
576    fn lex<'a, S: Source + ?Sized>(&self, text: &S, _edits: &[TextEdit], mut cache: &'a mut impl LexerCache<CSharpLanguage>) -> LexOutput<CSharpLanguage> {
577        let mut state = LexerState::new(text);
578        let result = self.run(&mut state);
579        if result.is_ok() {
580            state.add_eof();
581        }
582        state.finish_with_cache(result, &mut cache)
583    }
584}