Skip to main content

oak_cpp/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2/// Token type definition.
3pub mod token_type;
4pub use token_type::CppTokenType;
5
6use crate::language::CppLanguage;
7use oak_core::{Lexer, LexerCache, LexerState, TextEdit, lexer::LexOutput, source::Source};
8
/// Lexer state specialised to [`CppLanguage`]; every lexing rule in this module takes it by mutable reference.
pub(crate) type State<'a, S> = LexerState<'a, S, CppLanguage>;
10
/// Lexer for the C++ language.
///
/// Borrows its [`CppLanguage`] configuration for the lifetime `'config`.
pub struct CppLexer<'config> {
    /// Language configuration passed to [`CppLexer::new`].
    config: &'config CppLanguage,
}
15
/// Type alias for a C lexer.
///
/// This is the same type as [`CppLexer`]; the alias introduces no separate implementation.
pub type CLexer<'config> = CppLexer<'config>;
18
19impl<'config> CppLexer<'config> {
20    /// Creates a new `CppLexer` with the given configuration.
21    pub fn new(config: &'config CppLanguage) -> Self {
22        Self { config }
23    }
24
25    /// Skips whitespace characters.
26    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
27        let start_pos = state.get_position();
28
29        while let Some(ch) = state.peek() {
30            if ch == ' ' || ch == '\t' { state.advance(ch.len_utf8()) } else { break }
31        }
32
33        if state.get_position() > start_pos {
34            state.add_token(CppTokenType::Whitespace, start_pos, state.get_position());
35            true
36        }
37        else {
38            false
39        }
40    }
41
42    /// Lexes a newline sequence.
43    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
44        let start_pos = state.get_position();
45
46        if let Some('\n') = state.peek() {
47            state.advance(1);
48            state.add_token(CppTokenType::Newline, start_pos, state.get_position());
49            true
50        }
51        else if let Some('\r') = state.peek() {
52            state.advance(1);
53            if let Some('\n') = state.peek() {
54                state.advance(1)
55            }
56            state.add_token(CppTokenType::Newline, start_pos, state.get_position());
57            true
58        }
59        else {
60            false
61        }
62    }
63
64    /// Lexes a comment (single-line or multi-line).
65    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
66        let start_pos = state.get_position();
67
68        if let Some('/') = state.peek() {
69            if let Some('/') = state.peek_next_n(1) {
70                // Single-line comment
71                state.advance(2);
72                while let Some(ch) = state.peek() {
73                    if ch == '\n' || ch == '\r' {
74                        break;
75                    }
76                    state.advance(ch.len_utf8())
77                }
78                state.add_token(CppTokenType::Comment, start_pos, state.get_position());
79                true
80            }
81            else if let Some('*') = state.peek_next_n(1) {
82                // Multi-line comment
83                state.advance(2);
84                while let Some(ch) = state.peek() {
85                    if ch == '*' && state.peek_next_n(1) == Some('/') {
86                        state.advance(2);
87                        break;
88                    }
89                    state.advance(ch.len_utf8())
90                }
91                state.add_token(CppTokenType::Comment, start_pos, state.get_position());
92                true
93            }
94            else {
95                false
96            }
97        }
98        else {
99            false
100        }
101    }
102
103    /// Lexes a string literal.
104    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
105        let start_pos = state.get_position();
106
107        if let Some('"') = state.peek() {
108            state.advance(1);
109
110            let mut escaped = false;
111            while let Some(ch) = state.peek() {
112                if escaped {
113                    escaped = false;
114                    state.advance(ch.len_utf8());
115                    continue;
116                }
117
118                if ch == '\\' {
119                    escaped = true;
120                    state.advance(1);
121                    continue;
122                }
123
124                if ch == '"' {
125                    state.advance(1);
126                    break;
127                }
128
129                if ch == '\n' || ch == '\r' {
130                    break; // Unclosed string
131                }
132
133                state.advance(ch.len_utf8())
134            }
135
136            state.add_token(CppTokenType::StringLiteral, start_pos, state.get_position());
137            true
138        }
139        else {
140            false
141        }
142    }
143
144    /// Lexes a character literal.
145    fn lex_character<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
146        let start_pos = state.get_position();
147
148        if let Some('\'') = state.peek() {
149            state.advance(1);
150
151            let mut escaped = false;
152            while let Some(ch) = state.peek() {
153                if escaped {
154                    escaped = false;
155                    state.advance(ch.len_utf8());
156                    continue;
157                }
158
159                if ch == '\\' {
160                    escaped = true;
161                    state.advance(1);
162                    continue;
163                }
164
165                if ch == '\'' {
166                    state.advance(1);
167                    break;
168                }
169
170                if ch == '\n' || ch == '\r' {
171                    break; // Unclosed character
172                }
173
174                state.advance(ch.len_utf8())
175            }
176
177            state.add_token(CppTokenType::CharacterLiteral, start_pos, state.get_position());
178            true
179        }
180        else {
181            false
182        }
183    }
184
185    /// Lexes a numeric literal.
186    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
187        let start_pos = state.get_position();
188
189        if let Some(ch) = state.peek() {
190            if ch.is_ascii_digit() || (ch == '.' && state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit())) {
191                let mut is_float = false;
192
193                // Handle hex, octal, binary
194                if ch == '0' {
195                    if let Some(next_ch) = state.peek_next_n(1) {
196                        if next_ch == 'x' || next_ch == 'X' {
197                            // Hexadecimal
198                            state.advance(2);
199                            while let Some(ch) = state.peek() {
200                                if ch.is_ascii_hexdigit() { state.advance(1) } else { break }
201                            }
202                        }
203                        else if next_ch == 'b' || next_ch == 'B' {
204                            // Binary
205                            state.advance(2);
206                            while let Some(ch) = state.peek() {
207                                if ch == '0' || ch == '1' { state.advance(1) } else { break }
208                            }
209                        }
210                        else if next_ch.is_ascii_digit() {
211                            // Octal
212                            while let Some(ch) = state.peek() {
213                                if ch.is_ascii_digit() { state.advance(1) } else { break }
214                            }
215                        }
216                        else {
217                            state.advance(1); // just '0'
218                        }
219                    }
220                    else {
221                        state.advance(1); // just '0'
222                    }
223                }
224                else {
225                    // Decimal integer part
226                    while let Some(ch) = state.peek() {
227                        if ch.is_ascii_digit() { state.advance(1) } else { break }
228                    }
229                }
230
231                // Check for decimal point
232                if let Some('.') = state.peek() {
233                    if let Some(next_ch) = state.peek_next_n(1) {
234                        if next_ch.is_ascii_digit() {
235                            is_float = true;
236                            state.advance(1); // consume '.'
237                            while let Some(ch) = state.peek() {
238                                if ch.is_ascii_digit() { state.advance(1) } else { break }
239                            }
240                        }
241                    }
242                }
243
244                // Check for scientific notation
245                if let Some(ch) = state.peek() {
246                    if ch == 'e' || ch == 'E' {
247                        is_float = true;
248                        state.advance(1);
249                        if let Some(sign) = state.peek() {
250                            if sign == '+' || sign == '-' {
251                                state.advance(1)
252                            }
253                        }
254                        while let Some(ch) = state.peek() {
255                            if ch.is_ascii_digit() { state.advance(1) } else { break }
256                        }
257                    }
258                }
259
260                // Check for suffix
261                while let Some(ch) = state.peek() {
262                    if ch.is_ascii_alphabetic() { state.advance(1) } else { break }
263                }
264
265                let token_kind = if is_float { CppTokenType::FloatLiteral } else { CppTokenType::IntegerLiteral };
266                state.add_token(token_kind, start_pos, state.get_position());
267                true
268            }
269            else {
270                false
271            }
272        }
273        else {
274            false
275        }
276    }
277
278    /// Lexes a keyword or identifier.
279    fn lex_keyword_or_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
280        let start_pos = state.get_position();
281
282        if let Some(ch) = state.peek() {
283            if ch.is_ascii_alphabetic() || ch == '_' {
284                while let Some(ch) = state.peek() {
285                    if ch.is_ascii_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
286                }
287
288                let text = state.get_text_in((start_pos..state.get_position()).into());
289                let token_kind = match text.as_ref() {
290                    // C++ Keywords
291                    "alignas" | "alignof" | "and" | "and_eq" | "asm" | "atomic_cancel" | "atomic_commit" | "atomic_noexcept" | "auto" | "bitand" | "bitor" | "bool" | "break" | "case" | "catch" | "char" | "char8_t" | "char16_t" | "char32_t" | "class"
292                    | "compl" | "concept" | "const" | "consteval" | "constexpr" | "constinit" | "const_cast" | "continue" | "co_await" | "co_return" | "co_yield" | "decltype" | "default" | "delete" | "do" | "double" | "dynamic_cast" | "else" | "enum"
293                    | "explicit" | "export" | "extern" | "float" | "for" | "friend" | "goto" | "if" | "inline" | "int" | "long" | "mutable" | "namespace" | "new" | "noexcept" | "not" | "not_eq" | "nullptr" | "operator" | "or" | "or_eq" | "private"
294                    | "protected" | "public" | "reflexpr" | "register" | "reinterpret_cast" | "requires" | "return" | "short" | "signed" | "sizeof" | "static" | "static_assert" | "static_cast" | "struct" | "switch" | "synchronized" | "template"
295                    | "this" | "thread_local" | "throw" | "try" | "typedef" | "typeid" | "typename" | "union" | "unsigned" | "using" | "virtual" | "void" | "volatile" | "wchar_t" | "while" | "xor" | "xor_eq" => CppTokenType::Keyword,
296                    "true" | "false" => CppTokenType::BooleanLiteral,
297                    _ => CppTokenType::Identifier,
298                };
299
300                state.add_token(token_kind, start_pos, state.get_position());
301                true
302            }
303            else {
304                false
305            }
306        }
307        else {
308            false
309        }
310    }
311
312    /// Lexes an operator.
313    fn lex_operator<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
314        let start_pos = state.get_position();
315
316        if let Some(ch) = state.peek() {
317            let (token_kind, advance_count) = match ch {
318                '+' => {
319                    if let Some('+') = state.peek_next_n(1) {
320                        (CppTokenType::Increment, 2)
321                    }
322                    else if let Some('=') = state.peek_next_n(1) {
323                        (CppTokenType::PlusAssign, 2)
324                    }
325                    else {
326                        (CppTokenType::Plus, 1)
327                    }
328                }
329                '-' => {
330                    if let Some('-') = state.peek_next_n(1) {
331                        (CppTokenType::Decrement, 2)
332                    }
333                    else if let Some('=') = state.peek_next_n(1) {
334                        (CppTokenType::MinusAssign, 2)
335                    }
336                    else if let Some('>') = state.peek_next_n(1) {
337                        (CppTokenType::Arrow, 2)
338                    }
339                    else {
340                        (CppTokenType::Minus, 1)
341                    }
342                }
343                '*' => {
344                    if let Some('=') = state.peek_next_n(1) {
345                        (CppTokenType::StarAssign, 2)
346                    }
347                    else {
348                        (CppTokenType::Star, 1)
349                    }
350                }
351                '/' => {
352                    if let Some('=') = state.peek_next_n(1) {
353                        (CppTokenType::SlashAssign, 2)
354                    }
355                    else {
356                        (CppTokenType::Slash, 1)
357                    }
358                }
359                '%' => {
360                    if let Some('=') = state.peek_next_n(1) {
361                        (CppTokenType::PercentAssign, 2)
362                    }
363                    else {
364                        (CppTokenType::Percent, 1)
365                    }
366                }
367                '=' => {
368                    if let Some('=') = state.peek_next_n(1) {
369                        (CppTokenType::Equal, 2)
370                    }
371                    else {
372                        (CppTokenType::Assign, 1)
373                    }
374                }
375                '!' => {
376                    if let Some('=') = state.peek_next_n(1) {
377                        (CppTokenType::NotEqual, 2)
378                    }
379                    else {
380                        (CppTokenType::LogicalNot, 1)
381                    }
382                }
383                '<' => {
384                    if let Some('<') = state.peek_next_n(1) {
385                        if let Some('=') = state.peek_next_n(2) { (CppTokenType::LeftShiftAssign, 3) } else { (CppTokenType::LeftShift, 2) }
386                    }
387                    else if let Some('=') = state.peek_next_n(1) {
388                        (CppTokenType::LessEqual, 2)
389                    }
390                    else {
391                        (CppTokenType::Less, 1)
392                    }
393                }
394                '>' => {
395                    if let Some('>') = state.peek_next_n(1) {
396                        if let Some('=') = state.peek_next_n(2) { (CppTokenType::RightShiftAssign, 3) } else { (CppTokenType::RightShift, 2) }
397                    }
398                    else if let Some('=') = state.peek_next_n(1) {
399                        (CppTokenType::GreaterEqual, 2)
400                    }
401                    else {
402                        (CppTokenType::Greater, 1)
403                    }
404                }
405                '&' => {
406                    if let Some('&') = state.peek_next_n(1) {
407                        (CppTokenType::LogicalAnd, 2)
408                    }
409                    else if let Some('=') = state.peek_next_n(1) {
410                        (CppTokenType::AndAssign, 2)
411                    }
412                    else {
413                        (CppTokenType::BitAnd, 1)
414                    }
415                }
416                '|' => {
417                    if let Some('|') = state.peek_next_n(1) {
418                        (CppTokenType::LogicalOr, 2)
419                    }
420                    else if let Some('=') = state.peek_next_n(1) {
421                        (CppTokenType::OrAssign, 2)
422                    }
423                    else {
424                        (CppTokenType::BitOr, 1)
425                    }
426                }
427                '^' => {
428                    if let Some('=') = state.peek_next_n(1) {
429                        (CppTokenType::XorAssign, 2)
430                    }
431                    else {
432                        (CppTokenType::BitXor, 1)
433                    }
434                }
435                '~' => (CppTokenType::BitNot, 1),
436                '?' => (CppTokenType::Question, 1),
437                ':' => {
438                    if let Some(':') = state.peek_next_n(1) {
439                        (CppTokenType::Scope, 2)
440                    }
441                    else {
442                        (CppTokenType::Colon, 1)
443                    }
444                }
445                '.' => (CppTokenType::Dot, 1),
446                _ => return false,
447            };
448
449            state.advance(advance_count);
450            state.add_token(token_kind, start_pos, state.get_position());
451            true
452        }
453        else {
454            false
455        }
456    }
457
458    /// Lexes a delimiter.
459    fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
460        let start_pos = state.get_position();
461
462        if let Some(ch) = state.peek() {
463            let token_kind = match ch {
464                '(' => CppTokenType::LeftParen,
465                ')' => CppTokenType::RightParen,
466                '[' => CppTokenType::LeftBracket,
467                ']' => CppTokenType::RightBracket,
468                '{' => CppTokenType::LeftBrace,
469                '}' => CppTokenType::RightBrace,
470                ',' => CppTokenType::Comma,
471                ';' => CppTokenType::Semicolon,
472                _ => return false,
473            };
474
475            state.advance(1);
476            state.add_token(token_kind, start_pos, state.get_position());
477            true
478        }
479        else {
480            false
481        }
482    }
483
484    /// Lexes a preprocessor directive.
485    fn lex_preprocessor<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
486        let start_pos = state.get_position();
487
488        if let Some('#') = state.peek() {
489            // Read until end of line
490            while let Some(ch) = state.peek() {
491                if ch == '\n' || ch == '\r' {
492                    break;
493                }
494                state.advance(ch.len_utf8())
495            }
496
497            state.add_token(CppTokenType::Preprocessor, start_pos, state.get_position());
498            true
499        }
500        else {
501            false
502        }
503    }
504}
505
506impl<'config> Lexer<CppLanguage> for CppLexer<'config> {
507    /// Tokenizes the input source text.
508    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<CppLanguage>) -> LexOutput<CppLanguage> {
509        let mut state = LexerState::new(source);
510        let result = self.run(&mut state);
511        state.finish_with_cache(result, cache)
512    }
513}
514
515impl<'config> CppLexer<'config> {
516    /// Main lexer loop that tokenizes the source text.
517    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), oak_core::OakError> {
518        while state.not_at_end() {
519            // Try various lexing rules
520            if self.skip_whitespace(state) {
521                continue;
522            }
523
524            if self.lex_newline(state) {
525                continue;
526            }
527
528            if self.lex_comment(state) {
529                continue;
530            }
531
532            if self.lex_string(state) {
533                continue;
534            }
535
536            if self.lex_character(state) {
537                continue;
538            }
539
540            if self.lex_number(state) {
541                continue;
542            }
543
544            if self.lex_keyword_or_identifier(state) {
545                continue;
546            }
547
548            if self.lex_preprocessor(state) {
549                continue;
550            }
551
552            if self.lex_operator(state) {
553                continue;
554            }
555
556            if self.lex_delimiter(state) {
557                continue;
558            }
559
560            // If no rules match, skip the current character and mark as error
561            let start = state.get_position();
562            if let Some(ch) = state.peek() {
563                state.advance(ch.len_utf8());
564                state.add_token(CppTokenType::Error, start, state.get_position())
565            }
566        }
567        Ok(())
568    }
569}