// Source file: oak_csharp/lexer/mod.rs

1#![doc = include_str!("readme.md")]
2//! Lexer implementation for the C# language.
3
4use crate::language::CSharpLanguage;
5
6/// Token types and definitions for the C# lexer.
7pub mod token_type;
8
9use oak_core::{
10    Lexer, LexerCache, LexerState,
11    lexer::LexOutput,
12    source::{Source, TextEdit},
13};
14pub use token_type::CSharpTokenType;
15
16pub(crate) type State<'a, S> = LexerState<'a, S, CSharpLanguage>;
17
18/// A lexer for the C# language.
19pub struct CSharpLexer<'config> {
20    config: &'config CSharpLanguage,
21}
22
23impl<'config> CSharpLexer<'config> {
24    /// Creates a new C# lexer.
25    pub fn new(config: &'config CSharpLanguage) -> Self {
26        Self { config }
27    }
28
29    /// Skips whitespace characters (spaces and tabs).
30    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
31        let start_pos = state.get_position();
32
33        while let Some(ch) = state.peek() {
34            if ch == ' ' || ch == '\t' {
35                state.advance(ch.len_utf8());
36            }
37            else {
38                break;
39            }
40        }
41
42        if state.get_position() > start_pos {
43            state.add_token(CSharpTokenType::Whitespace, start_pos, state.get_position());
44            true
45        }
46        else {
47            false
48        }
49    }
50
51    /// Lexes a newline character (LF or CRLF).
52    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
53        let start_pos = state.get_position();
54
55        if let Some('\n') = state.peek() {
56            state.advance(1);
57            state.add_token(CSharpTokenType::Newline, start_pos, state.get_position());
58            true
59        }
60        else if let Some('\r') = state.peek() {
61            state.advance(1);
62            if let Some('\n') = state.peek() {
63                state.advance(1);
64            }
65            state.add_token(CSharpTokenType::Newline, start_pos, state.get_position());
66            true
67        }
68        else {
69            false
70        }
71    }
72
73    /// Lexes a comment (single-line `//` or multi-line `/* ... */`).
74    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
75        let start_pos = state.get_position();
76
77        if let Some('/') = state.peek() {
78            state.advance(1);
79            if let Some('/') = state.peek() {
80                // Single-line comment
81                state.advance(1);
82                while let Some(ch) = state.peek() {
83                    if ch == '\n' || ch == '\r' {
84                        break;
85                    }
86                    state.advance(ch.len_utf8());
87                }
88                state.add_token(CSharpTokenType::Comment, start_pos, state.get_position());
89                return true;
90            }
91            else if let Some('*') = state.peek() {
92                // Multi-line comment
93                state.advance(1);
94                while let Some(ch) = state.peek() {
95                    if ch == '*' {
96                        state.advance(1);
97                        if let Some('/') = state.peek() {
98                            state.advance(1);
99                            break;
100                        }
101                    }
102                    else {
103                        state.advance(ch.len_utf8());
104                    }
105                }
106                state.add_token(CSharpTokenType::Comment, start_pos, state.get_position());
107                return true;
108            }
109            else {
110                // Backtrack, not a comment
111                state.set_position(start_pos);
112                return false;
113            }
114        }
115        false
116    }
117
118    /// Lexes a string literal (`"..."`) or character literal (`'...'`).
119    /// Handles basic escape sequences.
120    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
121        let start_pos = state.get_position();
122
123        if let Some('"') = state.peek() {
124            state.advance(1);
125            while let Some(ch) = state.peek() {
126                if ch == '"' {
127                    state.advance(1);
128                    break;
129                }
130                else if ch == '\\' {
131                    state.advance(1);
132                    if let Some(_) = state.peek() {
133                        state.advance(1)
134                    }
135                }
136                else {
137                    state.advance(ch.len_utf8())
138                }
139            }
140            state.add_token(CSharpTokenType::StringLiteral, start_pos, state.get_position());
141            true
142        }
143        else if let Some('\'') = state.peek() {
144            // Char literal
145            state.advance(1);
146            while let Some(ch) = state.peek() {
147                if ch == '\'' {
148                    state.advance(1);
149                    break;
150                }
151                else if ch == '\\' {
152                    state.advance(1);
153                    if let Some(_) = state.peek() {
154                        state.advance(1)
155                    }
156                }
157                else {
158                    state.advance(ch.len_utf8())
159                }
160            }
161            state.add_token(CSharpTokenType::CharLiteral, start_pos, state.get_position());
162            true
163        }
164        else {
165            false
166        }
167    }
168
169    /// Lexes a number literal.
170    ///
171    /// Supports:
172    /// - Decimal integers (`123`)
173    /// - Floating-point numbers (`123.45`, `1.2e3`)
174    /// - Underscore separators (`1_000_000`)
175    /// - Type suffixes (`f`, `d`, `m`, `l`, `ul`, etc.)
176    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
177        let start_pos = state.get_position();
178
179        if let Some(ch) = state.peek() {
180            if ch.is_ascii_digit() {
181                state.advance(ch.len_utf8());
182
183                while let Some(ch) = state.peek() {
184                    if ch.is_ascii_digit() || ch == '.' || ch == '_' { state.advance(ch.len_utf8()) } else { break }
185                }
186
187                // Handle suffixes (f, d, m, l, ul, etc.)
188                if let Some(ch) = state.peek() {
189                    if ch.is_ascii_alphabetic() {
190                        state.advance(ch.len_utf8());
191                        if let Some(ch2) = state.peek() {
192                            if ch2.is_ascii_alphabetic() {
193                                state.advance(ch2.len_utf8())
194                            }
195                        }
196                    }
197                }
198
199                state.add_token(CSharpTokenType::NumberLiteral, start_pos, state.get_position());
200                true
201            }
202            else {
203                false
204            }
205        }
206        else {
207            false
208        }
209    }
210
211    /// Lexes a keyword or identifier.
212    ///
213    /// Identifiers can start with a letter, underscore, or `@` (for verbatim identifiers).
214    /// Subsequent characters can be letters, digits, or underscores.
215    ///
216    /// Keywords are matched against the standard C# keyword list. If a match is found,
217    /// the specific keyword token is returned; otherwise, it is treated as an identifier.
218    fn lex_keyword_or_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
219        let start_pos = state.get_position();
220
221        if let Some(ch) = state.peek() {
222            if ch.is_ascii_alphabetic() || ch == '_' || ch == '@' {
223                state.advance(ch.len_utf8());
224
225                while let Some(ch) = state.peek() {
226                    if ch.is_ascii_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
227                }
228
229                let text = state.get_text_in((start_pos..state.get_position()).into());
230                let token_kind = match text.as_ref() {
231                    // C# Keywords
232                    "abstract" => CSharpTokenType::Abstract,
233                    "as" => CSharpTokenType::As,
234                    "async" => CSharpTokenType::AsyncKeyword,
235                    "await" => CSharpTokenType::AwaitKeyword,
236                    "base" => CSharpTokenType::Base,
237                    "bool" => CSharpTokenType::Bool,
238                    "break" => CSharpTokenType::Break,
239                    "byte" => CSharpTokenType::Byte,
240                    "case" => CSharpTokenType::Case,
241                    "catch" => CSharpTokenType::Catch,
242                    "char" => CSharpTokenType::Char,
243                    "checked" => CSharpTokenType::Checked,
244                    "class" => CSharpTokenType::Class,
245                    "const" => CSharpTokenType::Const,
246                    "continue" => CSharpTokenType::Continue,
247                    "decimal" => CSharpTokenType::Decimal,
248                    "default" => CSharpTokenType::Default,
249                    "delegate" => CSharpTokenType::Delegate,
250                    "do" => CSharpTokenType::Do,
251                    "double" => CSharpTokenType::Double,
252                    "else" => CSharpTokenType::Else,
253                    "enum" => CSharpTokenType::Enum,
254                    "event" => CSharpTokenType::Event,
255                    "explicit" => CSharpTokenType::Explicit,
256                    "extern" => CSharpTokenType::Extern,
257                    "false" => CSharpTokenType::False,
258                    "finally" => CSharpTokenType::Finally,
259                    "fixed" => CSharpTokenType::Fixed,
260                    "float" => CSharpTokenType::Float,
261                    "for" => CSharpTokenType::For,
262                    "foreach" => CSharpTokenType::Foreach,
263                    "goto" => CSharpTokenType::Goto,
264                    "if" => CSharpTokenType::If,
265                    "implicit" => CSharpTokenType::Implicit,
266                    "in" => CSharpTokenType::In,
267                    "int" => CSharpTokenType::Int,
268                    "interface" => CSharpTokenType::Interface,
269                    "internal" => CSharpTokenType::Internal,
270                    "is" => CSharpTokenType::Is,
271                    "lock" => CSharpTokenType::Lock,
272                    "long" => CSharpTokenType::Long,
273                    "namespace" => CSharpTokenType::Namespace,
274                    "new" => CSharpTokenType::New,
275                    "null" => CSharpTokenType::Null,
276                    "object" => CSharpTokenType::Object,
277                    "operator" => CSharpTokenType::Operator,
278                    "out" => CSharpTokenType::Out,
279                    "override" => CSharpTokenType::Override,
280                    "params" => CSharpTokenType::Params,
281                    "private" => CSharpTokenType::Private,
282                    "protected" => CSharpTokenType::Protected,
283                    "public" => CSharpTokenType::Public,
284                    "readonly" => CSharpTokenType::Readonly,
285                    "record" => CSharpTokenType::Record,
286                    "ref" => CSharpTokenType::Ref,
287                    "return" => CSharpTokenType::Return,
288                    "sbyte" => CSharpTokenType::Sbyte,
289                    "sealed" => CSharpTokenType::Sealed,
290                    "short" => CSharpTokenType::Short,
291                    "sizeof" => CSharpTokenType::Sizeof,
292                    "stackalloc" => CSharpTokenType::Stackalloc,
293                    "static" => CSharpTokenType::Static,
294                    "string" => CSharpTokenType::String,
295                    "struct" => CSharpTokenType::Struct,
296                    "switch" => CSharpTokenType::Switch,
297                    "this" => CSharpTokenType::This,
298                    "throw" => CSharpTokenType::Throw,
299                    "true" => CSharpTokenType::True,
300                    "try" => CSharpTokenType::Try,
301                    "typeof" => CSharpTokenType::Typeof,
302                    "uint" => CSharpTokenType::Uint,
303                    "ulong" => CSharpTokenType::Ulong,
304                    "unchecked" => CSharpTokenType::Unchecked,
305                    "unsafe" => CSharpTokenType::Unsafe,
306                    "ushort" => CSharpTokenType::Ushort,
307                    "using" => CSharpTokenType::Using,
308                    "virtual" => CSharpTokenType::Virtual,
309                    "void" => CSharpTokenType::Void,
310                    "volatile" => CSharpTokenType::Volatile,
311                    "while" => CSharpTokenType::While,
312                    _ => CSharpTokenType::Identifier,
313                };
314
315                state.add_token(token_kind, start_pos, state.get_position());
316                true
317            }
318            else {
319                false
320            }
321        }
322        else {
323            false
324        }
325    }
326
327    /// Lexes an operator.
328    ///
329    /// Handles single-character and multi-character operators, including:
330    /// - Arithmetic: `+`, `-`, `*`, `/`, `%`
331    /// - Assignment: `=`, `+=`, `-=`, `*=`, `/=`, `%=`
332    /// - Increment/Decrement: `++`, `--`
333    /// - Comparison: `==`, `!=`, `<`, `<=`, `>`, `>=`
334    /// - Logical: `&&`, `||`, `!`
335    /// - Bitwise: `&`, `|`, `^`, `~`, `<<`, `>>`
336    fn lex_operator<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
337        let start_pos = state.get_position();
338
339        if let Some(ch) = state.peek() {
340            let token_kind = match ch {
341                '+' => {
342                    state.advance(1);
343                    if let Some('=') = state.peek() {
344                        state.advance(1);
345                        CSharpTokenType::PlusAssign
346                    }
347                    else if let Some('+') = state.peek() {
348                        state.advance(1);
349                        CSharpTokenType::Increment
350                    }
351                    else {
352                        CSharpTokenType::Plus
353                    }
354                }
355                '-' => {
356                    state.advance(1);
357                    if let Some('=') = state.peek() {
358                        state.advance(1);
359                        CSharpTokenType::MinusAssign
360                    }
361                    else if let Some('-') = state.peek() {
362                        state.advance(1);
363                        CSharpTokenType::Decrement
364                    }
365                    else {
366                        CSharpTokenType::Minus
367                    }
368                }
369                '*' => {
370                    state.advance(1);
371                    if let Some('=') = state.peek() {
372                        state.advance(1);
373                        CSharpTokenType::StarAssign
374                    }
375                    else {
376                        CSharpTokenType::Star
377                    }
378                }
379                '/' => {
380                    // Comments are handled in lex_comment
381                    state.advance(1);
382                    if let Some('=') = state.peek() {
383                        state.advance(1);
384                        CSharpTokenType::SlashAssign
385                    }
386                    else {
387                        CSharpTokenType::Slash
388                    }
389                }
390                '%' => {
391                    state.advance(1);
392                    if let Some('=') = state.peek() {
393                        state.advance(1);
394                        CSharpTokenType::PercentAssign
395                    }
396                    else {
397                        CSharpTokenType::Percent
398                    }
399                }
400                '=' => {
401                    state.advance(1);
402                    if let Some('=') = state.peek() {
403                        state.advance(1);
404                        CSharpTokenType::Equal
405                    }
406                    else {
407                        CSharpTokenType::Assign
408                    }
409                }
410                '!' => {
411                    state.advance(1);
412                    if let Some('=') = state.peek() {
413                        state.advance(1);
414                        CSharpTokenType::NotEqual
415                    }
416                    else {
417                        CSharpTokenType::LogicalNot
418                    }
419                }
420                '<' => {
421                    state.advance(1);
422                    if let Some('=') = state.peek() {
423                        state.advance(1);
424                        CSharpTokenType::LessEqual
425                    }
426                    else if let Some('<') = state.peek() {
427                        state.advance(1);
428                        CSharpTokenType::LeftShift
429                    }
430                    else {
431                        CSharpTokenType::Less
432                    }
433                }
434                '>' => {
435                    state.advance(1);
436                    if let Some('=') = state.peek() {
437                        state.advance(1);
438                        CSharpTokenType::GreaterEqual
439                    }
440                    else if let Some('>') = state.peek() {
441                        state.advance(1);
442                        CSharpTokenType::RightShift
443                    }
444                    else {
445                        CSharpTokenType::Greater
446                    }
447                }
448                '&' => {
449                    state.advance(1);
450                    if let Some('&') = state.peek() {
451                        state.advance(1);
452                        CSharpTokenType::LogicalAnd
453                    }
454                    else {
455                        CSharpTokenType::Ampersand
456                    }
457                }
458                '|' => {
459                    state.advance(1);
460                    if let Some('|') = state.peek() {
461                        state.advance(1);
462                        CSharpTokenType::LogicalOr
463                    }
464                    else {
465                        CSharpTokenType::Pipe
466                    }
467                }
468                '^' => {
469                    state.advance(1);
470                    CSharpTokenType::Caret
471                }
472                '~' => {
473                    state.advance(1);
474                    CSharpTokenType::Tilde
475                }
476                _ => return false,
477            };
478
479            state.add_token(token_kind, start_pos, state.get_position());
480            true
481        }
482        else {
483            false
484        }
485    }
486
487    /// Lexes a delimiter.
488    ///
489    /// Handles structural characters such as:
490    /// - Parentheses: `(`, `)`
491    /// - Brackets: `[`, `]`
492    /// - Braces: `{`, `}`
493    /// - Punctuation: `;`, `,`, `.`, `:`, `?`
494    fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
495        let start_pos = state.get_position();
496
497        if let Some(ch) = state.peek() {
498            let token_kind = match ch {
499                '(' => CSharpTokenType::LeftParen,
500                ')' => CSharpTokenType::RightParen,
501                '[' => CSharpTokenType::LeftBracket,
502                ']' => CSharpTokenType::RightBracket,
503                '{' => CSharpTokenType::LeftBrace,
504                '}' => CSharpTokenType::RightBrace,
505                ';' => CSharpTokenType::Semicolon,
506                ',' => CSharpTokenType::Comma,
507                '.' => CSharpTokenType::Dot,
508                ':' => CSharpTokenType::Colon,
509                '?' => CSharpTokenType::Question,
510                _ => return false,
511            };
512
513            state.advance(ch.len_utf8());
514            state.add_token(token_kind, start_pos, state.get_position());
515            true
516        }
517        else {
518            false
519        }
520    }
521
522    /// Runs the lexer on the input state.
523    ///
524    /// This method performs the main lexing loop, attempting to match various
525    /// token types (whitespace, comments, literals, keywords, etc.) until the
526    /// end of the input is reached.
527    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), oak_core::OakError> {
528        while state.not_at_end() {
529            let safe_point = state.get_position();
530
531            if self.skip_whitespace(state) {
532                continue;
533            }
534
535            if self.lex_newline(state) {
536                continue;
537            }
538
539            if self.lex_comment(state) {
540                continue;
541            }
542
543            if self.lex_string(state) {
544                continue;
545            }
546
547            if self.lex_number(state) {
548                continue;
549            }
550
551            if self.lex_keyword_or_identifier(state) {
552                continue;
553            }
554
555            if self.lex_operator(state) {
556                continue;
557            }
558
559            if self.lex_delimiter(state) {
560                continue;
561            }
562
563            // If no pattern matches, handle the error character and advance
564            let start_pos = state.get_position();
565            if let Some(ch) = state.peek() {
566                state.advance(ch.len_utf8());
567                state.add_token(CSharpTokenType::Error, start_pos, state.get_position());
568            }
569
570            state.advance_if_dead_lock(safe_point)
571        }
572        Ok(())
573    }
574}
575
576impl<'config> Lexer<CSharpLanguage> for CSharpLexer<'config> {
577    fn lex<'a, S: Source + ?Sized>(&self, text: &S, _edits: &[TextEdit], mut cache: &'a mut impl LexerCache<CSharpLanguage>) -> LexOutput<CSharpLanguage> {
578        let mut state = LexerState::new(text);
579        let result = self.run(&mut state);
580        if result.is_ok() {
581            state.add_eof();
582        }
583        state.finish_with_cache(result, &mut cache)
584    }
585}