
oak_c/lexer/mod.rs
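//! A hand-written, single-pass lexer for C.
//!
//! `CLexer::run` walks the source once, trying each token class in a fixed
//! order (whitespace, comments, newlines, string and character literals,
//! numbers, keywords/identifiers, operators, preprocessor directives, plain
//! text) and falls back to an `Error` token for anything it cannot classify.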

pub mod token_type;

pub use token_type::CTokenType;

use crate::language::CLanguage;
use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
use serde::Serialize;
use std::sync::LazyLock;

type State<'a, S> = LexerState<'a, S, CLanguage>;

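/// A hand-written lexer for C source text, parameterized by the
/// [`CLanguage`] configuration it borrows.
///
/// A minimal usage sketch (not taken from this crate's tests): it assumes
/// `CLanguage` implements `Default`, that `str` implements [`Source`], and
/// that a `LexerCache<CLanguage>` value named `cache` is already in scope.
/// All three are assumptions about the surrounding crates; check
/// `crate::language` and `oak_core` for the real constructors.
///
/// ```ignore
/// let language = CLanguage::default();
/// let lexer = CLexer::new(&language);
/// let output = lexer.lex("int main(void) { return 0; }", &[], &mut cache);
/// ```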
#[derive(Clone, Copy, Debug, Serialize)]
pub struct CLexer<'config> {
    config: &'config CLanguage,
}

impl<'config> Lexer<CLanguage> for CLexer<'config> {
    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<CLanguage>) -> LexOutput<CLanguage> {
        let mut state = State::new_with_cache(source, 0, cache);
        let result = self.run(&mut state);
        if result.is_ok() {
            state.add_eof();
        }
        state.finish_with_cache(result, cache)
    }
}

impl<'config> CLexer<'config> {
    pub fn new(config: &'config CLanguage) -> Self {
        Self { config }
    }

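    /// Main lexing loop: tries each token class in a fixed order and, when
    /// nothing matches, emits a single `Error` token for the offending
    /// character. The `advance_if_dead_lock(safe_point)` call at the bottom
    /// of the loop acts as a safety net against looping without consuming
    /// any input.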
    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
        while state.not_at_end() {
            let safe_point = state.get_position();
            if self.skip_whitespace(state) {
                continue;
            }
            if self.skip_comment(state) {
                continue;
            }
            if self.lex_newline(state) {
                continue;
            }
            if self.lex_string(state) {
                continue;
            }
            if self.lex_char(state) {
                continue;
            }
            if self.lex_number(state) {
                continue;
            }
            if self.lex_keyword_or_identifier(state) {
                continue;
            }
            if self.lex_operator_or_delimiter(state) {
                continue;
            }
            if self.lex_preprocessor(state) {
                continue;
            }
            if self.lex_text(state) {
                continue;
            }
            else {
                let start = state.get_position();
                if let Some(ch) = state.peek() {
                    state.advance(ch.len_utf8());
                    state.add_token(CTokenType::Error, start, state.get_position());
                }
            }
            state.advance_if_dead_lock(safe_point);
        }
        Ok(())
    }

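    /// Consumes a run of horizontal whitespace into one `Whitespace` token.
    /// Newlines are excluded here; they are handled by `lex_newline`.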
    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();
        let mut count = 0;

        while let Some(ch) = state.peek() {
            if ch.is_whitespace() && ch != '\n' && ch != '\r' {
                state.advance(ch.len_utf8());
                count += 1;
            }
            else {
                break;
            }
        }

        if count > 0 {
            state.add_token(CTokenType::Whitespace, start, state.get_position());
            true
        }
        else {
            false
        }
    }

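    /// Consumes a `//` line comment or a `/* ... */` block comment as a
    /// single `Comment` token. An unterminated block comment still produces
    /// a token that runs to the end of the input.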
    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();

        if state.consume_if_starts_with("//") {
            while let Some(ch) = state.peek() {
                if ch == '\n' || ch == '\r' {
                    break;
                }
                state.advance(ch.len_utf8());
            }
            state.add_token(CTokenType::Comment, start, state.get_position());
            return true;
        }
        else if state.consume_if_starts_with("/*") {
            while state.not_at_end() {
                if state.consume_if_starts_with("*/") {
                    break;
                }
                if let Some(ch) = state.peek() {
                    state.advance(ch.len_utf8());
                }
                else {
                    break;
                }
            }
            state.add_token(CTokenType::Comment, start, state.get_position());
            return true;
        }
        false
    }

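    /// Emits one `Whitespace` token per line break, treating `\n`, `\r` and
    /// `\r\n` each as a single break.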
    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();

        if let Some(ch) = state.peek() {
            if ch == '\n' {
                state.advance(1);
                state.add_token(CTokenType::Whitespace, start, state.get_position());
                return true;
            }
            else if ch == '\r' {
                state.advance(1);
                if state.peek() == Some('\n') {
                    state.advance(1);
                }
                state.add_token(CTokenType::Whitespace, start, state.get_position());
                return true;
            }
        }
        false
    }

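    /// Consumes a double-quoted string literal, skipping backslash escapes
    /// so an escaped quote does not terminate the literal. An unterminated
    /// string runs to the end of the input.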
    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();

        if let Some('"') = state.peek() {
            state.advance(1);
            while let Some(ch) = state.peek() {
                if ch == '"' {
                    state.advance(1);
                    break;
                }
                else if ch == '\\' {
                    state.advance(1);
                    if let Some(escaped) = state.peek() {
                        state.advance(escaped.len_utf8());
                    }
                }
                else {
                    state.advance(ch.len_utf8());
                }
            }
            state.add_token(CTokenType::StringLiteral, start, state.get_position());
            return true;
        }
        false
    }

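    /// Consumes a single-quoted character literal with the same escape
    /// handling as `lex_string`.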
    fn lex_char<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();

        if let Some('\'') = state.peek() {
            state.advance(1);
            while let Some(ch) = state.peek() {
                if ch == '\'' {
                    state.advance(1);
                    break;
                }
                else if ch == '\\' {
                    state.advance(1);
                    if let Some(escaped) = state.peek() {
                        state.advance(escaped.len_utf8());
                    }
                }
                else {
                    state.advance(ch.len_utf8());
                }
            }
            state.add_token(CTokenType::CharLiteral, start, state.get_position());
            return true;
        }
        false
    }

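    /// Consumes a numeric literal. The scan is deliberately permissive
    /// (digits, letters, `.` and exponent signs are all accepted) and the
    /// token is classified as `FloatLiteral` or `IntegerLiteral` afterwards
    /// from the text that was consumed.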
    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();

        if let Some(ch) = state.peek() {
            if ch.is_ascii_digit() {
                state.advance(1);
                let mut prev = ch;
                while let Some(ch) = state.peek() {
                    // Accept '+'/'-' only directly after an exponent marker (e.g. `1e+5`,
                    // `0x1p-3`), so that expressions like `1+2` are not swallowed whole.
                    let is_exponent_sign = (ch == '+' || ch == '-') && matches!(prev, 'e' | 'E' | 'p' | 'P');
                    if ch.is_ascii_alphanumeric() || ch == '.' || is_exponent_sign {
                        state.advance(ch.len_utf8());
                        prev = ch;
                    }
                    else {
                        break;
                    }
                }

                let text = state.get_text_in((start..state.get_position()).into());
                // Hexadecimal literals spell their exponent with 'p'/'P', so an 'e'/'E'
                // hex digit (as in `0xFE`) must not force a float classification.
                let is_hex = text.starts_with("0x") || text.starts_with("0X");
                let has_exponent = if is_hex { text.contains('p') || text.contains('P') } else { text.contains('e') || text.contains('E') };
                let kind = if text.contains('.') || has_exponent { CTokenType::FloatLiteral } else { CTokenType::IntegerLiteral };
                state.add_token(kind, start, state.get_position());
                return true;
            }
        }
        false
    }

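    /// Consumes an identifier (`[A-Za-z_][A-Za-z0-9_]*`) and maps it to the
    /// corresponding keyword token when it appears in `C_KEYWORDS`; the
    /// table and the `match` below are expected to stay in sync.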
    fn lex_keyword_or_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();

        if let Some(ch) = state.peek() {
            if ch.is_ascii_alphabetic() || ch == '_' {
                state.advance(ch.len_utf8());
                while let Some(ch) = state.peek() {
                    if ch.is_ascii_alphanumeric() || ch == '_' {
                        state.advance(ch.len_utf8());
                    }
                    else {
                        break;
                    }
                }

                let text = state.get_text_in((start..state.get_position()).into());
                let kind = if C_KEYWORDS.contains(&&*text) {
                    match &*text {
                        "auto" => CTokenType::Auto,
                        "register" => CTokenType::Register,
                        "static" => CTokenType::Static,
                        "extern" => CTokenType::Extern,
                        "typedef" => CTokenType::Typedef,
                        "void" => CTokenType::Void,
                        "char" => CTokenType::Char,
                        "short" => CTokenType::Short,
                        "int" => CTokenType::Int,
                        "long" => CTokenType::Long,
                        "float" => CTokenType::Float,
                        "double" => CTokenType::Double,
                        "signed" => CTokenType::Signed,
                        "unsigned" => CTokenType::Unsigned,
                        "struct" => CTokenType::Struct,
                        "union" => CTokenType::Union,
                        "enum" => CTokenType::Enum,
                        "const" => CTokenType::Const,
                        "volatile" => CTokenType::Volatile,
                        "restrict" => CTokenType::Restrict,
                        "if" => CTokenType::If,
                        "else" => CTokenType::Else,
                        "switch" => CTokenType::Switch,
                        "case" => CTokenType::Case,
                        "default" => CTokenType::Default,
                        "for" => CTokenType::For,
                        "while" => CTokenType::While,
                        "do" => CTokenType::Do,
                        "break" => CTokenType::Break,
                        "continue" => CTokenType::Continue,
                        "goto" => CTokenType::Goto,
                        "return" => CTokenType::Return,
                        "sizeof" => CTokenType::Sizeof,
                        "inline" => CTokenType::Inline,
                        "_Bool" => CTokenType::Bool,
                        "_Complex" => CTokenType::Complex,
                        "_Imaginary" => CTokenType::Imaginary,
                        "_Alignas" => CTokenType::Alignas,
                        "_Alignof" => CTokenType::Alignof,
                        "_Atomic" => CTokenType::Atomic,
                        "_Static_assert" => CTokenType::StaticAssert,
                        "_Thread_local" => CTokenType::ThreadLocal,
                        "_Generic" => CTokenType::Generic,
                        "_Noreturn" => CTokenType::Noreturn,
                        _ => CTokenType::Identifier,
                    }
                }
                else {
                    CTokenType::Identifier
                };
                state.add_token(kind, start, state.get_position());
                return true;
            }
        }
        false
    }

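    /// Consumes operators and delimiters using longest match first:
    /// three-character operators, then two-character operators, then single
    /// characters.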
    fn lex_operator_or_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();

        if let Some(ch) = state.peek() {
            let three_char = if let Some(next_ch) = state.peek_next_n(1) { if let Some(third_ch) = state.peek_next_n(2) { Some(format!("{}{}{}", ch, next_ch, third_ch)) } else { None } } else { None };

            let two_char = if let Some(next_ch) = state.peek_next_n(1) { format!("{}{}", ch, next_ch) } else { String::new() };

            // Check three-character operators first (longest match).
            if let Some(ref three) = three_char {
                if let Some(&kind) = C_THREE_CHAR_OPERATORS.get(three.as_str()) {
                    state.advance(3);
                    state.add_token(kind, start, state.get_position());
                    return true;
                }
            }

            // Then check two-character operators.
            if let Some(&kind) = C_TWO_CHAR_OPERATORS.get(two_char.as_str()) {
                state.advance(2);
                state.add_token(kind, start, state.get_position());
                return true;
            }

            // Finally, single-character operators and delimiters.
            let kind = match ch {
                '(' => CTokenType::LeftParen,
                ')' => CTokenType::RightParen,
                '[' => CTokenType::LeftBracket,
                ']' => CTokenType::RightBracket,
                '{' => CTokenType::LeftBrace,
                '}' => CTokenType::RightBrace,
                ',' => CTokenType::Comma,
                ';' => CTokenType::Semicolon,
                ':' => CTokenType::Colon,
                '.' => CTokenType::Dot,
                '?' => CTokenType::Question,
                '+' => CTokenType::Plus,
                '-' => CTokenType::Minus,
                '*' => CTokenType::Star,
                '/' => CTokenType::Slash,
                '%' => CTokenType::Percent,
                '=' => CTokenType::Assign,
                '<' => CTokenType::Less,
                '>' => CTokenType::Greater,
                '!' => CTokenType::LogicalNot,
                '&' => CTokenType::BitAnd,
                '|' => CTokenType::BitOr,
                '^' => CTokenType::BitXor,
                '~' => CTokenType::BitNot,
                _ => return false,
            };
            state.advance(1);
            state.add_token(kind, start, state.get_position());
            return true;
        }
        false
    }

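    /// Consumes a `#` directive up to the end of the line as one
    /// `PreprocessorDirective` token. Backslash line continuations are not
    /// followed; continuation lines are lexed as ordinary tokens.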
    fn lex_preprocessor<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();

        if state.consume_if_starts_with("#") {
            while let Some(ch) = state.peek() {
                if ch == '\n' || ch == '\r' {
                    break;
                }
                state.advance(ch.len_utf8());
            }
            state.add_token(CTokenType::PreprocessorDirective, start, state.get_position());
            return true;
        }
        false
    }

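    /// Fallback for printable characters not covered by any other rule
    /// (for example `@`, `$` or non-ASCII text); each such character becomes
    /// its own `Text` token.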
    fn lex_text<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();

        if let Some(ch) = state.peek() {
            if !ch.is_whitespace() && !ch.is_ascii_alphanumeric() && !"()[]{},.;:?+-*/%=<>!&|^~#\"'_".contains(ch) {
                state.advance(ch.len_utf8());
                state.add_token(CTokenType::Text, start, state.get_position());
                return true;
            }
        }
        false
    }
}

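/// The keywords recognized by `lex_keyword_or_identifier`: the C89 set plus
/// the C99/C11 additions (`restrict`, `inline` and the `_`-prefixed
/// keywords).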
static C_KEYWORDS: LazyLock<&[&str]> = LazyLock::new(|| {
    &[
        "auto",
        "register",
        "static",
        "extern",
        "typedef",
        "void",
        "char",
        "short",
        "int",
        "long",
        "float",
        "double",
        "signed",
        "unsigned",
        "struct",
        "union",
        "enum",
        "const",
        "volatile",
        "restrict",
        "if",
        "else",
        "switch",
        "case",
        "default",
        "for",
        "while",
        "do",
        "break",
        "continue",
        "goto",
        "return",
        "sizeof",
        "inline",
        "_Bool",
        "_Complex",
        "_Imaginary",
        "_Alignas",
        "_Alignof",
        "_Atomic",
        "_Static_assert",
        "_Thread_local",
        "_Generic",
        "_Noreturn",
    ]
});

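/// Two-character operators, consulted by `lex_operator_or_delimiter` after
/// the three-character table fails to match.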
static C_TWO_CHAR_OPERATORS: LazyLock<std::collections::HashMap<&str, CTokenType>> = LazyLock::new(|| {
    let mut map = std::collections::HashMap::new();
    map.insert("+=", CTokenType::PlusAssign);
    map.insert("-=", CTokenType::MinusAssign);
    map.insert("*=", CTokenType::StarAssign);
    map.insert("/=", CTokenType::SlashAssign);
    map.insert("%=", CTokenType::PercentAssign);
    map.insert("==", CTokenType::Equal);
    map.insert("!=", CTokenType::NotEqual);
    map.insert("<=", CTokenType::LessEqual);
    map.insert(">=", CTokenType::GreaterEqual);
    map.insert("&&", CTokenType::LogicalAnd);
    map.insert("||", CTokenType::LogicalOr);
    map.insert("<<", CTokenType::LeftShift);
    map.insert(">>", CTokenType::RightShift);
    map.insert("&=", CTokenType::AndAssign);
    map.insert("|=", CTokenType::OrAssign);
    map.insert("^=", CTokenType::XorAssign);
    map.insert("++", CTokenType::Increment);
    map.insert("--", CTokenType::Decrement);
    map.insert("->", CTokenType::Arrow);
    map
});

static C_THREE_CHAR_OPERATORS: LazyLock<std::collections::HashMap<&str, CTokenType>> = LazyLock::new(|| {
    let mut map = std::collections::HashMap::new();
    map.insert("<<=", CTokenType::LeftShiftAssign);
    map.insert(">>=", CTokenType::RightShiftAssign);
    map
});