Skip to main content

oak_cpp/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2pub mod token_type;
3pub use token_type::CppTokenType;
4
5use crate::language::CppLanguage;
6use oak_core::{Lexer, LexerCache, LexerState, TextEdit, lexer::LexOutput, source::Source};
7
/// Shorthand for the `oak_core` [`LexerState`] specialised to [`CppLanguage`].
///
/// `'a` is the lifetime parameter forwarded to `LexerState` and `S` is the source type.
type State<'a, S> = LexerState<'a, S, CppLanguage>;
9
/// Lexer for the C++ language.
///
/// Holds a borrowed [`CppLanguage`] configuration for the lifetime `'config`.
pub struct CppLexer<'config> {
    /// Language configuration passed to `CppLexer::new`. None of the lexing routines
    /// visible in this file read it; the leading underscore marks it as intentionally unused.
    _config: &'config CppLanguage,
}
14
/// Type alias for a C lexer.
///
/// This is exactly the same type as [`CppLexer`]; the alias itself adds no C-specific behaviour.
pub type CLexer<'config> = CppLexer<'config>;
17
18impl<'config> CppLexer<'config> {
19    /// Creates a new `CppLexer` with the given configuration.
20    pub fn new(config: &'config CppLanguage) -> Self {
21        Self { _config: config }
22    }
23
24    /// Skips whitespace characters.
25    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
26        let start_pos = state.get_position();
27
28        while let Some(ch) = state.peek() {
29            if ch == ' ' || ch == '\t' { state.advance(ch.len_utf8()) } else { break }
30        }
31
32        if state.get_position() > start_pos {
33            state.add_token(CppTokenType::Whitespace, start_pos, state.get_position());
34            true
35        }
36        else {
37            false
38        }
39    }
40
41    /// Lexes a newline sequence.
42    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
43        let start_pos = state.get_position();
44
45        if let Some('\n') = state.peek() {
46            state.advance(1);
47            state.add_token(CppTokenType::Newline, start_pos, state.get_position());
48            true
49        }
50        else if let Some('\r') = state.peek() {
51            state.advance(1);
52            if let Some('\n') = state.peek() {
53                state.advance(1)
54            }
55            state.add_token(CppTokenType::Newline, start_pos, state.get_position());
56            true
57        }
58        else {
59            false
60        }
61    }
62
63    /// Lexes a comment (single-line or multi-line).
64    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
65        let start_pos = state.get_position();
66
67        if let Some('/') = state.peek() {
68            if let Some('/') = state.peek_next_n(1) {
69                // Single-line comment
70                state.advance(2);
71                while let Some(ch) = state.peek() {
72                    if ch == '\n' || ch == '\r' {
73                        break;
74                    }
75                    state.advance(ch.len_utf8())
76                }
77                state.add_token(CppTokenType::Comment, start_pos, state.get_position());
78                true
79            }
80            else if let Some('*') = state.peek_next_n(1) {
81                // Multi-line comment
82                state.advance(2);
83                while let Some(ch) = state.peek() {
84                    if ch == '*' && state.peek_next_n(1) == Some('/') {
85                        state.advance(2);
86                        break;
87                    }
88                    state.advance(ch.len_utf8())
89                }
90                state.add_token(CppTokenType::Comment, start_pos, state.get_position());
91                true
92            }
93            else {
94                false
95            }
96        }
97        else {
98            false
99        }
100    }
101
102    /// Lexes a string literal.
103    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
104        let start_pos = state.get_position();
105
106        if let Some('"') = state.peek() {
107            state.advance(1);
108
109            let mut escaped = false;
110            while let Some(ch) = state.peek() {
111                if escaped {
112                    escaped = false;
113                    state.advance(ch.len_utf8());
114                    continue;
115                }
116
117                if ch == '\\' {
118                    escaped = true;
119                    state.advance(1);
120                    continue;
121                }
122
123                if ch == '"' {
124                    state.advance(1);
125                    break;
126                }
127
128                if ch == '\n' || ch == '\r' {
129                    break; // Unclosed string
130                }
131
132                state.advance(ch.len_utf8())
133            }
134
135            state.add_token(CppTokenType::StringLiteral, start_pos, state.get_position());
136            true
137        }
138        else {
139            false
140        }
141    }
142
143    /// Lexes a character literal.
144    fn lex_character<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
145        let start_pos = state.get_position();
146
147        if let Some('\'') = state.peek() {
148            state.advance(1);
149
150            let mut escaped = false;
151            while let Some(ch) = state.peek() {
152                if escaped {
153                    escaped = false;
154                    state.advance(ch.len_utf8());
155                    continue;
156                }
157
158                if ch == '\\' {
159                    escaped = true;
160                    state.advance(1);
161                    continue;
162                }
163
164                if ch == '\'' {
165                    state.advance(1);
166                    break;
167                }
168
169                if ch == '\n' || ch == '\r' {
170                    break; // Unclosed character
171                }
172
173                state.advance(ch.len_utf8())
174            }
175
176            state.add_token(CppTokenType::CharacterLiteral, start_pos, state.get_position());
177            true
178        }
179        else {
180            false
181        }
182    }
183
184    /// Lexes a numeric literal.
185    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
186        let start_pos = state.get_position();
187
188        if let Some(ch) = state.peek() {
189            if ch.is_ascii_digit() || (ch == '.' && state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit())) {
190                let mut is_float = false;
191
192                // Handle hex, octal, binary
193                if ch == '0' {
194                    if let Some(next_ch) = state.peek_next_n(1) {
195                        if next_ch == 'x' || next_ch == 'X' {
196                            // Hexadecimal
197                            state.advance(2);
198                            while let Some(ch) = state.peek() {
199                                if ch.is_ascii_hexdigit() { state.advance(1) } else { break }
200                            }
201                        }
202                        else if next_ch == 'b' || next_ch == 'B' {
203                            // Binary
204                            state.advance(2);
205                            while let Some(ch) = state.peek() {
206                                if ch == '0' || ch == '1' { state.advance(1) } else { break }
207                            }
208                        }
209                        else if next_ch.is_ascii_digit() {
210                            // Octal
211                            while let Some(ch) = state.peek() {
212                                if ch.is_ascii_digit() { state.advance(1) } else { break }
213                            }
214                        }
215                        else {
216                            state.advance(1); // just '0'
217                        }
218                    }
219                    else {
220                        state.advance(1); // just '0'
221                    }
222                }
223                else {
224                    // Decimal integer part
225                    while let Some(ch) = state.peek() {
226                        if ch.is_ascii_digit() { state.advance(1) } else { break }
227                    }
228                }
229
230                // Check for decimal point
231                if let Some('.') = state.peek() {
232                    if let Some(next_ch) = state.peek_next_n(1) {
233                        if next_ch.is_ascii_digit() {
234                            is_float = true;
235                            state.advance(1); // consume '.'
236                            while let Some(ch) = state.peek() {
237                                if ch.is_ascii_digit() { state.advance(1) } else { break }
238                            }
239                        }
240                    }
241                }
242
243                // Check for scientific notation
244                if let Some(ch) = state.peek() {
245                    if ch == 'e' || ch == 'E' {
246                        is_float = true;
247                        state.advance(1);
248                        if let Some(sign) = state.peek() {
249                            if sign == '+' || sign == '-' {
250                                state.advance(1)
251                            }
252                        }
253                        while let Some(ch) = state.peek() {
254                            if ch.is_ascii_digit() { state.advance(1) } else { break }
255                        }
256                    }
257                }
258
259                // Check for suffix
260                while let Some(ch) = state.peek() {
261                    if ch.is_ascii_alphabetic() { state.advance(1) } else { break }
262                }
263
264                let token_kind = if is_float { CppTokenType::FloatLiteral } else { CppTokenType::IntegerLiteral };
265                state.add_token(token_kind, start_pos, state.get_position());
266                true
267            }
268            else {
269                false
270            }
271        }
272        else {
273            false
274        }
275    }
276
277    /// Lexes a keyword or identifier.
278    fn lex_keyword_or_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
279        let start_pos = state.get_position();
280
281        if let Some(ch) = state.peek() {
282            if ch.is_ascii_alphabetic() || ch == '_' {
283                while let Some(ch) = state.peek() {
284                    if ch.is_ascii_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
285                }
286
287                let text = state.get_text_in((start_pos..state.get_position()).into());
288                let token_kind = match text.as_ref() {
289                    // C++ Keywords
290                    "alignas" | "alignof" | "and" | "and_eq" | "asm" | "atomic_cancel" | "atomic_commit" | "atomic_noexcept" | "auto" | "bitand" | "bitor" | "bool" | "break" | "case" | "catch" | "char" | "char8_t" | "char16_t" | "char32_t" | "class"
291                    | "compl" | "concept" | "const" | "consteval" | "constexpr" | "constinit" | "const_cast" | "continue" | "co_await" | "co_return" | "co_yield" | "decltype" | "default" | "delete" | "do" | "double" | "dynamic_cast" | "else" | "enum"
292                    | "explicit" | "export" | "extern" | "float" | "for" | "friend" | "goto" | "if" | "inline" | "int" | "long" | "mutable" | "namespace" | "new" | "noexcept" | "not" | "not_eq" | "nullptr" | "operator" | "or" | "or_eq" | "private"
293                    | "protected" | "public" | "reflexpr" | "register" | "reinterpret_cast" | "requires" | "return" | "short" | "signed" | "sizeof" | "static" | "static_assert" | "static_cast" | "struct" | "switch" | "synchronized" | "template"
294                    | "this" | "thread_local" | "throw" | "try" | "typedef" | "typeid" | "typename" | "union" | "unsigned" | "using" | "virtual" | "void" | "volatile" | "wchar_t" | "while" | "xor" | "xor_eq" => CppTokenType::Keyword,
295                    "true" | "false" => CppTokenType::BooleanLiteral,
296                    _ => CppTokenType::Identifier,
297                };
298
299                state.add_token(token_kind, start_pos, state.get_position());
300                true
301            }
302            else {
303                false
304            }
305        }
306        else {
307            false
308        }
309    }
310
311    /// Lexes an operator.
312    fn lex_operator<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
313        let start_pos = state.get_position();
314
315        if let Some(ch) = state.peek() {
316            let (token_kind, advance_count) = match ch {
317                '+' => {
318                    if let Some('+') = state.peek_next_n(1) {
319                        (CppTokenType::Increment, 2)
320                    }
321                    else if let Some('=') = state.peek_next_n(1) {
322                        (CppTokenType::PlusAssign, 2)
323                    }
324                    else {
325                        (CppTokenType::Plus, 1)
326                    }
327                }
328                '-' => {
329                    if let Some('-') = state.peek_next_n(1) {
330                        (CppTokenType::Decrement, 2)
331                    }
332                    else if let Some('=') = state.peek_next_n(1) {
333                        (CppTokenType::MinusAssign, 2)
334                    }
335                    else if let Some('>') = state.peek_next_n(1) {
336                        (CppTokenType::Arrow, 2)
337                    }
338                    else {
339                        (CppTokenType::Minus, 1)
340                    }
341                }
342                '*' => {
343                    if let Some('=') = state.peek_next_n(1) {
344                        (CppTokenType::StarAssign, 2)
345                    }
346                    else {
347                        (CppTokenType::Star, 1)
348                    }
349                }
350                '/' => {
351                    if let Some('=') = state.peek_next_n(1) {
352                        (CppTokenType::SlashAssign, 2)
353                    }
354                    else {
355                        (CppTokenType::Slash, 1)
356                    }
357                }
358                '%' => {
359                    if let Some('=') = state.peek_next_n(1) {
360                        (CppTokenType::PercentAssign, 2)
361                    }
362                    else {
363                        (CppTokenType::Percent, 1)
364                    }
365                }
366                '=' => {
367                    if let Some('=') = state.peek_next_n(1) {
368                        (CppTokenType::Equal, 2)
369                    }
370                    else {
371                        (CppTokenType::Assign, 1)
372                    }
373                }
374                '!' => {
375                    if let Some('=') = state.peek_next_n(1) {
376                        (CppTokenType::NotEqual, 2)
377                    }
378                    else {
379                        (CppTokenType::LogicalNot, 1)
380                    }
381                }
382                '<' => {
383                    if let Some('<') = state.peek_next_n(1) {
384                        if let Some('=') = state.peek_next_n(2) { (CppTokenType::LeftShiftAssign, 3) } else { (CppTokenType::LeftShift, 2) }
385                    }
386                    else if let Some('=') = state.peek_next_n(1) {
387                        (CppTokenType::LessEqual, 2)
388                    }
389                    else {
390                        (CppTokenType::Less, 1)
391                    }
392                }
393                '>' => {
394                    if let Some('>') = state.peek_next_n(1) {
395                        if let Some('=') = state.peek_next_n(2) { (CppTokenType::RightShiftAssign, 3) } else { (CppTokenType::RightShift, 2) }
396                    }
397                    else if let Some('=') = state.peek_next_n(1) {
398                        (CppTokenType::GreaterEqual, 2)
399                    }
400                    else {
401                        (CppTokenType::Greater, 1)
402                    }
403                }
404                '&' => {
405                    if let Some('&') = state.peek_next_n(1) {
406                        (CppTokenType::LogicalAnd, 2)
407                    }
408                    else if let Some('=') = state.peek_next_n(1) {
409                        (CppTokenType::AndAssign, 2)
410                    }
411                    else {
412                        (CppTokenType::BitAnd, 1)
413                    }
414                }
415                '|' => {
416                    if let Some('|') = state.peek_next_n(1) {
417                        (CppTokenType::LogicalOr, 2)
418                    }
419                    else if let Some('=') = state.peek_next_n(1) {
420                        (CppTokenType::OrAssign, 2)
421                    }
422                    else {
423                        (CppTokenType::BitOr, 1)
424                    }
425                }
426                '^' => {
427                    if let Some('=') = state.peek_next_n(1) {
428                        (CppTokenType::XorAssign, 2)
429                    }
430                    else {
431                        (CppTokenType::BitXor, 1)
432                    }
433                }
434                '~' => (CppTokenType::BitNot, 1),
435                '?' => (CppTokenType::Question, 1),
436                ':' => {
437                    if let Some(':') = state.peek_next_n(1) {
438                        (CppTokenType::Scope, 2)
439                    }
440                    else {
441                        (CppTokenType::Colon, 1)
442                    }
443                }
444                '.' => (CppTokenType::Dot, 1),
445                _ => return false,
446            };
447
448            state.advance(advance_count);
449            state.add_token(token_kind, start_pos, state.get_position());
450            true
451        }
452        else {
453            false
454        }
455    }
456
457    /// Lexes a delimiter.
458    fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
459        let start_pos = state.get_position();
460
461        if let Some(ch) = state.peek() {
462            let token_kind = match ch {
463                '(' => CppTokenType::LeftParen,
464                ')' => CppTokenType::RightParen,
465                '[' => CppTokenType::LeftBracket,
466                ']' => CppTokenType::RightBracket,
467                '{' => CppTokenType::LeftBrace,
468                '}' => CppTokenType::RightBrace,
469                ',' => CppTokenType::Comma,
470                ';' => CppTokenType::Semicolon,
471                _ => return false,
472            };
473
474            state.advance(1);
475            state.add_token(token_kind, start_pos, state.get_position());
476            true
477        }
478        else {
479            false
480        }
481    }
482
483    /// Lexes a preprocessor directive.
484    fn lex_preprocessor<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
485        let start_pos = state.get_position();
486
487        if let Some('#') = state.peek() {
488            // Read until end of line
489            while let Some(ch) = state.peek() {
490                if ch == '\n' || ch == '\r' {
491                    break;
492                }
493                state.advance(ch.len_utf8())
494            }
495
496            state.add_token(CppTokenType::Preprocessor, start_pos, state.get_position());
497            true
498        }
499        else {
500            false
501        }
502    }
503}
504
505impl<'config> Lexer<CppLanguage> for CppLexer<'config> {
506    /// Tokenizes the input source text.
507    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<CppLanguage>) -> LexOutput<CppLanguage> {
508        let mut state = LexerState::new(source);
509        let result = self.run(&mut state);
510        state.finish_with_cache(result, cache)
511    }
512}
513
514impl<'config> CppLexer<'config> {
515    /// Main lexer loop that tokenizes the source text.
516    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), oak_core::OakError> {
517        while state.not_at_end() {
518            // Try various lexing rules
519            if self.skip_whitespace(state) {
520                continue;
521            }
522
523            if self.lex_newline(state) {
524                continue;
525            }
526
527            if self.lex_comment(state) {
528                continue;
529            }
530
531            if self.lex_string(state) {
532                continue;
533            }
534
535            if self.lex_character(state) {
536                continue;
537            }
538
539            if self.lex_number(state) {
540                continue;
541            }
542
543            if self.lex_keyword_or_identifier(state) {
544                continue;
545            }
546
547            if self.lex_preprocessor(state) {
548                continue;
549            }
550
551            if self.lex_operator(state) {
552                continue;
553            }
554
555            if self.lex_delimiter(state) {
556                continue;
557            }
558
559            // 如果都不匹配,跳过当前字符并记录错误
560            let start = state.get_position();
561            if let Some(ch) = state.peek() {
562                state.advance(ch.len_utf8());
563                state.add_token(CppTokenType::Error, start, state.get_position())
564            }
565        }
566        Ok(())
567    }
568}