oak_cpp/lexer/
mod.rs

1mod token_type;
2pub use token_type::CppTokenType;
3
4use crate::language::CppLanguage;
5use oak_core::{Lexer, LexerCache, LexerState, TextEdit, lexer::LexOutput, source::Source};
6
7type State<'a, S> = LexerState<'a, S, CppLanguage>;
8
9pub struct CppLexer<'config> {
10    _config: &'config CppLanguage,
11}
12
13/// C 词法分析器类型别名
14pub type CLexer<'config> = CppLexer<'config>;
15
16impl<'config> CppLexer<'config> {
17    pub fn new(config: &'config CppLanguage) -> Self {
18        Self { _config: config }
19    }
20
21    /// 跳过空白字符
22    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
23        let start_pos = state.get_position();
24
25        while let Some(ch) = state.peek() {
26            if ch == ' ' || ch == '\t' {
27                state.advance(ch.len_utf8());
28            }
29            else {
30                break;
31            }
32        }
33
34        if state.get_position() > start_pos {
35            state.add_token(CppTokenType::Whitespace, start_pos, state.get_position());
36            true
37        }
38        else {
39            false
40        }
41    }
42
43    /// 处理换行
44    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
45        let start_pos = state.get_position();
46
47        if let Some('\n') = state.peek() {
48            state.advance(1);
49            state.add_token(CppTokenType::Newline, start_pos, state.get_position());
50            true
51        }
52        else if let Some('\r') = state.peek() {
53            state.advance(1);
54            if let Some('\n') = state.peek() {
55                state.advance(1);
56            }
57            state.add_token(CppTokenType::Newline, start_pos, state.get_position());
58            true
59        }
60        else {
61            false
62        }
63    }
64
65    /// 处理注释
66    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
67        let start_pos = state.get_position();
68
69        if let Some('/') = state.peek() {
70            if let Some('/') = state.peek_next_n(1) {
71                // 单行注释
72                state.advance(2);
73                while let Some(ch) = state.peek() {
74                    if ch == '\n' || ch == '\r' {
75                        break;
76                    }
77                    state.advance(ch.len_utf8());
78                }
79                state.add_token(CppTokenType::Comment, start_pos, state.get_position());
80                true
81            }
82            else if let Some('*') = state.peek_next_n(1) {
83                // 多行注释
84                state.advance(2);
85                while let Some(ch) = state.peek() {
86                    if ch == '*' && state.peek_next_n(1) == Some('/') {
87                        state.advance(2);
88                        break;
89                    }
90                    state.advance(ch.len_utf8());
91                }
92                state.add_token(CppTokenType::Comment, start_pos, state.get_position());
93                true
94            }
95            else {
96                false
97            }
98        }
99        else {
100            false
101        }
102    }
103
104    /// 处理字符串字面量
105    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
106        let start_pos = state.get_position();
107
108        if let Some('"') = state.peek() {
109            state.advance(1);
110
111            let mut escaped = false;
112            while let Some(ch) = state.peek() {
113                if escaped {
114                    escaped = false;
115                    state.advance(ch.len_utf8());
116                    continue;
117                }
118
119                if ch == '\\' {
120                    escaped = true;
121                    state.advance(1);
122                    continue;
123                }
124
125                if ch == '"' {
126                    state.advance(1);
127                    break;
128                }
129
130                if ch == '\n' || ch == '\r' {
131                    break; // 未闭合的字符
132                }
133
134                state.advance(ch.len_utf8());
135            }
136
137            state.add_token(CppTokenType::StringLiteral, start_pos, state.get_position());
138            true
139        }
140        else {
141            false
142        }
143    }
144
145    /// 处理字符字面量
146    fn lex_character<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
147        let start_pos = state.get_position();
148
149        if let Some('\'') = state.peek() {
150            state.advance(1);
151
152            let mut escaped = false;
153            while let Some(ch) = state.peek() {
154                if escaped {
155                    escaped = false;
156                    state.advance(ch.len_utf8());
157                    continue;
158                }
159
160                if ch == '\\' {
161                    escaped = true;
162                    state.advance(1);
163                    continue;
164                }
165
166                if ch == '\'' {
167                    state.advance(1);
168                    break;
169                }
170
171                if ch == '\n' || ch == '\r' {
172                    break; // 未闭合的字符
173                }
174
175                state.advance(ch.len_utf8());
176            }
177
178            state.add_token(CppTokenType::CharacterLiteral, start_pos, state.get_position());
179            true
180        }
181        else {
182            false
183        }
184    }
185
186    /// 处理数字字面量
187    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
188        let start_pos = state.get_position();
189
190        if let Some(ch) = state.peek() {
191            if ch.is_ascii_digit() || (ch == '.' && state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit())) {
192                let mut is_float = false;
193
194                // 处理十六进制、八进制、二进制
195                if ch == '0' {
196                    if let Some(next_ch) = state.peek_next_n(1) {
197                        if next_ch == 'x' || next_ch == 'X' {
198                            // 十六进制
199                            state.advance(2);
200                            while let Some(ch) = state.peek() {
201                                if ch.is_ascii_hexdigit() {
202                                    state.advance(1);
203                                }
204                                else {
205                                    break;
206                                }
207                            }
208                        }
209                        else if next_ch == 'b' || next_ch == 'B' {
210                            // 二进
211                            state.advance(2);
212                            while let Some(ch) = state.peek() {
213                                if ch == '0' || ch == '1' {
214                                    state.advance(1);
215                                }
216                                else {
217                                    break;
218                                }
219                            }
220                        }
221                        else if next_ch.is_ascii_digit() {
222                            // 八进
223                            while let Some(ch) = state.peek() {
224                                if ch.is_ascii_digit() {
225                                    state.advance(1);
226                                }
227                                else {
228                                    break;
229                                }
230                            }
231                        }
232                        else {
233                            state.advance(1); // 只是 '0'
234                        }
235                    }
236                    else {
237                        state.advance(1); // 只是 '0'
238                    }
239                }
240                else {
241                    // 十进制整数部
242                    while let Some(ch) = state.peek() {
243                        if ch.is_ascii_digit() {
244                            state.advance(1);
245                        }
246                        else {
247                            break;
248                        }
249                    }
250                }
251
252                // 检查小数点
253                if let Some('.') = state.peek() {
254                    if let Some(next_ch) = state.peek_next_n(1) {
255                        if next_ch.is_ascii_digit() {
256                            is_float = true;
257                            state.advance(1); // 消费小数
258                            while let Some(ch) = state.peek() {
259                                if ch.is_ascii_digit() {
260                                    state.advance(1);
261                                }
262                                else {
263                                    break;
264                                }
265                            }
266                        }
267                    }
268                }
269
270                // 检查科学记数法
271                if let Some(ch) = state.peek() {
272                    if ch == 'e' || ch == 'E' {
273                        is_float = true;
274                        state.advance(1);
275                        if let Some(sign) = state.peek() {
276                            if sign == '+' || sign == '-' {
277                                state.advance(1);
278                            }
279                        }
280                        while let Some(ch) = state.peek() {
281                            if ch.is_ascii_digit() {
282                                state.advance(1);
283                            }
284                            else {
285                                break;
286                            }
287                        }
288                    }
289                }
290
291                // 检查后缀
292                while let Some(ch) = state.peek() {
293                    if ch.is_ascii_alphabetic() {
294                        state.advance(1);
295                    }
296                    else {
297                        break;
298                    }
299                }
300
301                let token_kind = if is_float { CppTokenType::FloatLiteral } else { CppTokenType::IntegerLiteral };
302                state.add_token(token_kind, start_pos, state.get_position());
303                true
304            }
305            else {
306                false
307            }
308        }
309        else {
310            false
311        }
312    }
313
314    /// 处理关键字或标识符
315    fn lex_keyword_or_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
316        let start_pos = state.get_position();
317
318        if let Some(ch) = state.peek() {
319            if ch.is_ascii_alphabetic() || ch == '_' {
320                while let Some(ch) = state.peek() {
321                    if ch.is_ascii_alphanumeric() || ch == '_' {
322                        state.advance(ch.len_utf8());
323                    }
324                    else {
325                        break;
326                    }
327                }
328
329                let text = state.get_text_in((start_pos..state.get_position()).into());
330                let token_kind = match text.as_ref() {
331                    // C++ 关键
332                    "alignas" | "alignof" | "and" | "and_eq" | "asm" | "atomic_cancel" | "atomic_commit" | "atomic_noexcept" | "auto" | "bitand" | "bitor" | "bool" | "break" | "case" | "catch" | "char" | "char8_t" | "char16_t" | "char32_t" | "class"
333                    | "compl" | "concept" | "const" | "consteval" | "constexpr" | "constinit" | "const_cast" | "continue" | "co_await" | "co_return" | "co_yield" | "decltype" | "default" | "delete" | "do" | "double" | "dynamic_cast" | "else" | "enum"
334                    | "explicit" | "export" | "extern" | "float" | "for" | "friend" | "goto" | "if" | "inline" | "int" | "long" | "mutable" | "namespace" | "new" | "noexcept" | "not" | "not_eq" | "nullptr" | "operator" | "or" | "or_eq" | "private"
335                    | "protected" | "public" | "reflexpr" | "register" | "reinterpret_cast" | "requires" | "return" | "short" | "signed" | "sizeof" | "static" | "static_assert" | "static_cast" | "struct" | "switch" | "synchronized" | "template"
336                    | "this" | "thread_local" | "throw" | "try" | "typedef" | "typeid" | "typename" | "union" | "unsigned" | "using" | "virtual" | "void" | "volatile" | "wchar_t" | "while" | "xor" | "xor_eq" => CppTokenType::Keyword,
337                    "true" | "false" => CppTokenType::BooleanLiteral,
338                    _ => CppTokenType::Identifier,
339                };
340
341                state.add_token(token_kind, start_pos, state.get_position());
342                true
343            }
344            else {
345                false
346            }
347        }
348        else {
349            false
350        }
351    }
352
353    /// 处理操作符
354    fn lex_operator<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
355        let start_pos = state.get_position();
356
357        if let Some(ch) = state.peek() {
358            let (token_kind, advance_count) = match ch {
359                '+' => {
360                    if let Some('+') = state.peek_next_n(1) {
361                        (CppTokenType::Increment, 2)
362                    }
363                    else if let Some('=') = state.peek_next_n(1) {
364                        (CppTokenType::PlusAssign, 2)
365                    }
366                    else {
367                        (CppTokenType::Plus, 1)
368                    }
369                }
370                '-' => {
371                    if let Some('-') = state.peek_next_n(1) {
372                        (CppTokenType::Decrement, 2)
373                    }
374                    else if let Some('=') = state.peek_next_n(1) {
375                        (CppTokenType::MinusAssign, 2)
376                    }
377                    else if let Some('>') = state.peek_next_n(1) {
378                        (CppTokenType::Arrow, 2)
379                    }
380                    else {
381                        (CppTokenType::Minus, 1)
382                    }
383                }
384                '*' => {
385                    if let Some('=') = state.peek_next_n(1) {
386                        (CppTokenType::StarAssign, 2)
387                    }
388                    else {
389                        (CppTokenType::Star, 1)
390                    }
391                }
392                '/' => {
393                    if let Some('=') = state.peek_next_n(1) {
394                        (CppTokenType::SlashAssign, 2)
395                    }
396                    else {
397                        (CppTokenType::Slash, 1)
398                    }
399                }
400                '%' => {
401                    if let Some('=') = state.peek_next_n(1) {
402                        (CppTokenType::PercentAssign, 2)
403                    }
404                    else {
405                        (CppTokenType::Percent, 1)
406                    }
407                }
408                '=' => {
409                    if let Some('=') = state.peek_next_n(1) {
410                        (CppTokenType::Equal, 2)
411                    }
412                    else {
413                        (CppTokenType::Assign, 1)
414                    }
415                }
416                '!' => {
417                    if let Some('=') = state.peek_next_n(1) {
418                        (CppTokenType::NotEqual, 2)
419                    }
420                    else {
421                        (CppTokenType::LogicalNot, 1)
422                    }
423                }
424                '<' => {
425                    if let Some('<') = state.peek_next_n(1) {
426                        if let Some('=') = state.peek_next_n(2) { (CppTokenType::LeftShiftAssign, 3) } else { (CppTokenType::LeftShift, 2) }
427                    }
428                    else if let Some('=') = state.peek_next_n(1) {
429                        (CppTokenType::LessEqual, 2)
430                    }
431                    else {
432                        (CppTokenType::Less, 1)
433                    }
434                }
435                '>' => {
436                    if let Some('>') = state.peek_next_n(1) {
437                        if let Some('=') = state.peek_next_n(2) { (CppTokenType::RightShiftAssign, 3) } else { (CppTokenType::RightShift, 2) }
438                    }
439                    else if let Some('=') = state.peek_next_n(1) {
440                        (CppTokenType::GreaterEqual, 2)
441                    }
442                    else {
443                        (CppTokenType::Greater, 1)
444                    }
445                }
446                '&' => {
447                    if let Some('&') = state.peek_next_n(1) {
448                        (CppTokenType::LogicalAnd, 2)
449                    }
450                    else if let Some('=') = state.peek_next_n(1) {
451                        (CppTokenType::AndAssign, 2)
452                    }
453                    else {
454                        (CppTokenType::BitAnd, 1)
455                    }
456                }
457                '|' => {
458                    if let Some('|') = state.peek_next_n(1) {
459                        (CppTokenType::LogicalOr, 2)
460                    }
461                    else if let Some('=') = state.peek_next_n(1) {
462                        (CppTokenType::OrAssign, 2)
463                    }
464                    else {
465                        (CppTokenType::BitOr, 1)
466                    }
467                }
468                '^' => {
469                    if let Some('=') = state.peek_next_n(1) {
470                        (CppTokenType::XorAssign, 2)
471                    }
472                    else {
473                        (CppTokenType::BitXor, 1)
474                    }
475                }
476                '~' => (CppTokenType::BitNot, 1),
477                '?' => (CppTokenType::Question, 1),
478                ':' => {
479                    if let Some(':') = state.peek_next_n(1) {
480                        (CppTokenType::Scope, 2)
481                    }
482                    else {
483                        (CppTokenType::Colon, 1)
484                    }
485                }
486                '.' => (CppTokenType::Dot, 1),
487                _ => return false,
488            };
489
490            state.advance(advance_count);
491            state.add_token(token_kind, start_pos, state.get_position());
492            true
493        }
494        else {
495            false
496        }
497    }
498
499    /// 处理分隔符
500    fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
501        let start_pos = state.get_position();
502
503        if let Some(ch) = state.peek() {
504            let token_kind = match ch {
505                '(' => CppTokenType::LeftParen,
506                ')' => CppTokenType::RightParen,
507                '[' => CppTokenType::LeftBracket,
508                ']' => CppTokenType::RightBracket,
509                '{' => CppTokenType::LeftBrace,
510                '}' => CppTokenType::RightBrace,
511                ',' => CppTokenType::Comma,
512                ';' => CppTokenType::Semicolon,
513                _ => return false,
514            };
515
516            state.advance(1);
517            state.add_token(token_kind, start_pos, state.get_position());
518            true
519        }
520        else {
521            false
522        }
523    }
524
525    /// 处理预处理指令
526    fn lex_preprocessor<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
527        let start_pos = state.get_position();
528
529        if let Some('#') = state.peek() {
530            // 读取到行
531            while let Some(ch) = state.peek() {
532                if ch == '\n' || ch == '\r' {
533                    break;
534                }
535                state.advance(ch.len_utf8());
536            }
537
538            state.add_token(CppTokenType::Preprocessor, start_pos, state.get_position());
539            true
540        }
541        else {
542            false
543        }
544    }
545}
546
547impl<'config> Lexer<CppLanguage> for CppLexer<'config> {
548    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<CppLanguage>) -> LexOutput<CppLanguage> {
549        let mut state = LexerState::new(source);
550        let result = self.run(&mut state);
551        state.finish_with_cache(result, cache)
552    }
553}
554
555impl<'config> CppLexer<'config> {
556    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), oak_core::OakError> {
557        while state.not_at_end() {
558            // 尝试各种词法规则
559            if self.skip_whitespace(state) {
560                continue;
561            }
562
563            if self.lex_newline(state) {
564                continue;
565            }
566
567            if self.lex_comment(state) {
568                continue;
569            }
570
571            if self.lex_string(state) {
572                continue;
573            }
574
575            if self.lex_character(state) {
576                continue;
577            }
578
579            if self.lex_number(state) {
580                continue;
581            }
582
583            if self.lex_keyword_or_identifier(state) {
584                continue;
585            }
586
587            if self.lex_preprocessor(state) {
588                continue;
589            }
590
591            if self.lex_operator(state) {
592                continue;
593            }
594
595            if self.lex_delimiter(state) {
596                continue;
597            }
598
599            // 如果都不匹配,跳过当前字符并记录错误
600            let start = state.get_position();
601            if let Some(ch) = state.peek() {
602                state.advance(ch.len_utf8());
603                state.add_token(CppTokenType::Error, start, state.get_position());
604            }
605        }
606        Ok(())
607    }
608}