Skip to main content

oak_c/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4pub use token_type::CTokenType;
5
6use crate::language::CLanguage;
7use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
8#[cfg(feature = "serde")]
9use serde::Serialize;
10use std::sync::LazyLock;
11
12type State<'a, S> = LexerState<'a, S, CLanguage>;
13
/// Lexer for the C language.
///
/// Stateless apart from the borrowed language configuration; all mutable
/// lexing state lives in a [`LexerState`] created per [`Lexer::lex`] call.
#[cfg_attr(feature = "serde", derive(Serialize))]
#[derive(Clone, Copy, Debug)]
pub struct CLexer<'config> {
    /// Language configuration.
    // NOTE(review): `config` is stored but never read by any method in this
    // file — confirm whether it is reserved for future use.
    config: &'config CLanguage,
}
21
impl<'config> Lexer<CLanguage> for CLexer<'config> {
    /// Tokenizes the source code into a stream of C tokens.
    ///
    /// `_edits` is currently ignored: the whole source is re-lexed from
    /// offset 0 on every call rather than lexed incrementally.
    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<CLanguage>) -> LexOutput<CLanguage> {
        let mut state = State::new_with_cache(source, 0, cache);
        let result = self.run(&mut state);
        // `run` only ever returns `Ok(())` today; the guard keeps an error
        // result from gaining a trailing EOF token if that ever changes.
        if result.is_ok() {
            state.add_eof()
        }
        state.finish_with_cache(result, cache)
    }
}
33
34impl<'config> CLexer<'config> {
    /// Creates a new `CLexer` with the given language configuration.
    ///
    /// The configuration is only borrowed, so the lexer is `Copy` and cheap
    /// to pass around.
    pub fn new(config: &'config CLanguage) -> Self {
        Self { config }
    }
39
40    /// Runs the lexer on the current state until the end of the source.
41    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
42        while state.not_at_end() {
43            let safe_point = state.get_position();
44            if self.skip_whitespace(state) {
45                continue;
46            }
47            if self.skip_comment(state) {
48                continue;
49            }
50            if self.lex_newline(state) {
51                continue;
52            }
53            if self.lex_string(state) {
54                continue;
55            }
56            if self.lex_char(state) {
57                continue;
58            }
59            if self.lex_number(state) {
60                continue;
61            }
62            if self.lex_keyword_or_identifier(state) {
63                continue;
64            }
65            if self.lex_operator_or_delimiter(state) {
66                continue;
67            }
68            if self.lex_preprocessor(state) {
69                continue;
70            }
71            if self.lex_text(state) {
72                continue;
73            }
74            else {
75                let start = state.get_position();
76                if let Some(ch) = state.peek() {
77                    state.advance(ch.len_utf8());
78                    state.add_token(CTokenType::Error, start, state.get_position())
79                }
80            }
81            state.advance_if_dead_lock(safe_point)
82        }
83        Ok(())
84    }
85
86    /// Skips whitespace characters (except newlines).
87    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
88        let start = state.get_position();
89        let mut count = 0;
90
91        while let Some(ch) = state.peek() {
92            if ch.is_whitespace() && ch != '\n' && ch != '\r' {
93                state.advance(ch.len_utf8());
94                count += 1
95            }
96            else {
97                break;
98            }
99        }
100
101        if count > 0 {
102            state.add_token(CTokenType::Whitespace, start, state.get_position());
103            true
104        }
105        else {
106            false
107        }
108    }
109
110    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
111        let start = state.get_position();
112
113        if state.consume_if_starts_with("//") {
114            while let Some(ch) = state.peek() {
115                if ch == '\n' || ch == '\r' {
116                    break;
117                }
118                state.advance(ch.len_utf8())
119            }
120            state.add_token(CTokenType::Comment, start, state.get_position());
121            return true;
122        }
123        else if state.consume_if_starts_with("/*") {
124            while state.not_at_end() {
125                if state.consume_if_starts_with("*/") {
126                    break;
127                }
128                if let Some(ch) = state.peek() { state.advance(ch.len_utf8()) } else { break }
129            }
130            state.add_token(CTokenType::Comment, start, state.get_position());
131            return true;
132        }
133        false
134    }
135
136    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
137        let start = state.get_position();
138
139        if let Some(ch) = state.peek() {
140            if ch == '\n' {
141                state.advance(1);
142                state.add_token(CTokenType::Whitespace, start, state.get_position());
143                return true;
144            }
145            else if ch == '\r' {
146                state.advance(1);
147                if state.peek() == Some('\n') {
148                    state.advance(1)
149                }
150                state.add_token(CTokenType::Whitespace, start, state.get_position());
151                return true;
152            }
153        }
154        false
155    }
156
157    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
158        let start = state.get_position();
159
160        if let Some('"') = state.peek() {
161            state.advance(1);
162            while let Some(ch) = state.peek() {
163                if ch == '"' {
164                    state.advance(1);
165                    break;
166                }
167                else if ch == '\\' {
168                    state.advance(1);
169                    if let Some(escaped) = state.peek() {
170                        state.advance(escaped.len_utf8())
171                    }
172                }
173                else {
174                    state.advance(ch.len_utf8())
175                }
176            }
177            state.add_token(CTokenType::StringLiteral, start, state.get_position());
178            return true;
179        }
180        false
181    }
182
183    fn lex_char<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
184        let start = state.get_position();
185
186        if let Some('\'') = state.peek() {
187            state.advance(1);
188            while let Some(ch) = state.peek() {
189                if ch == '\'' {
190                    state.advance(1);
191                    break;
192                }
193                else if ch == '\\' {
194                    state.advance(1);
195                    if let Some(escaped) = state.peek() {
196                        state.advance(escaped.len_utf8())
197                    }
198                }
199                else {
200                    state.advance(ch.len_utf8())
201                }
202            }
203            state.add_token(CTokenType::CharLiteral, start, state.get_position());
204            return true;
205        }
206        false
207    }
208
209    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
210        let start = state.get_position();
211
212        if let Some(ch) = state.peek() {
213            if ch.is_ascii_digit() {
214                state.advance(1);
215                while let Some(ch) = state.peek() {
216                    if ch.is_ascii_alphanumeric() || ch == '.' || ch == 'e' || ch == 'E' || ch == '+' || ch == '-' { state.advance(ch.len_utf8()) } else { break }
217                }
218
219                let text = state.get_text_in((start..state.get_position()).into());
220                let kind = if text.contains('.') || text.contains('e') || text.contains('E') { CTokenType::FloatLiteral } else { CTokenType::IntegerLiteral };
221                state.add_token(kind, start, state.get_position());
222                return true;
223            }
224        }
225        false
226    }
227
228    fn lex_keyword_or_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
229        let start = state.get_position();
230
231        if let Some(ch) = state.peek() {
232            if ch.is_ascii_alphabetic() || ch == '_' {
233                state.advance(ch.len_utf8());
234                while let Some(ch) = state.peek() {
235                    if ch.is_ascii_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
236                }
237
238                let text = state.get_text_in((start..state.get_position()).into());
239                let kind = if C_KEYWORDS.contains(&&*text) {
240                    match &*text {
241                        "auto" => CTokenType::Auto,
242                        "register" => CTokenType::Register,
243                        "static" => CTokenType::Static,
244                        "extern" => CTokenType::Extern,
245                        "typedef" => CTokenType::Typedef,
246                        "void" => CTokenType::Void,
247                        "char" => CTokenType::Char,
248                        "short" => CTokenType::Short,
249                        "int" => CTokenType::Int,
250                        "long" => CTokenType::Long,
251                        "float" => CTokenType::Float,
252                        "double" => CTokenType::Double,
253                        "signed" => CTokenType::Signed,
254                        "unsigned" => CTokenType::Unsigned,
255                        "struct" => CTokenType::Struct,
256                        "union" => CTokenType::Union,
257                        "enum" => CTokenType::Enum,
258                        "const" => CTokenType::Const,
259                        "volatile" => CTokenType::Volatile,
260                        "restrict" => CTokenType::Restrict,
261                        "if" => CTokenType::If,
262                        "else" => CTokenType::Else,
263                        "switch" => CTokenType::Switch,
264                        "case" => CTokenType::Case,
265                        "default" => CTokenType::Default,
266                        "for" => CTokenType::For,
267                        "while" => CTokenType::While,
268                        "do" => CTokenType::Do,
269                        "break" => CTokenType::Break,
270                        "continue" => CTokenType::Continue,
271                        "goto" => CTokenType::Goto,
272                        "return" => CTokenType::Return,
273                        "sizeof" => CTokenType::Sizeof,
274                        "inline" => CTokenType::Inline,
275                        "_Bool" => CTokenType::Bool,
276                        "_Complex" => CTokenType::Complex,
277                        "_Imaginary" => CTokenType::Imaginary,
278                        "_Alignas" => CTokenType::Alignas,
279                        "_Alignof" => CTokenType::Alignof,
280                        "_Atomic" => CTokenType::Atomic,
281                        "_Static_assert" => CTokenType::StaticAssert,
282                        "_Thread_local" => CTokenType::ThreadLocal,
283                        "_Generic" => CTokenType::Generic,
284                        "_Noreturn" => CTokenType::Noreturn,
285                        _ => CTokenType::Identifier,
286                    }
287                }
288                else {
289                    CTokenType::Identifier
290                };
291                state.add_token(kind, start, state.get_position());
292                return true;
293            }
294        }
295        false
296    }
297
298    fn lex_operator_or_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
299        let start = state.get_position();
300
301        if let Some(ch) = state.peek() {
302            let three_char = if let Some(next_ch) = state.peek_next_n(1) { if let Some(third_ch) = state.peek_next_n(2) { Some(format!("{}{}{}", ch, next_ch, third_ch)) } else { None } } else { None };
303
304            let two_char = if let Some(next_ch) = state.peek_next_n(1) { format!("{}{}", ch, next_ch) } else { String::new() };
305
306            // 检查三字符操作符
307            if let Some(ref three) = three_char {
308                if let Some(&kind) = C_THREE_CHAR_OPERATORS.get(three.as_str()) {
309                    state.advance(3);
310                    state.add_token(kind, start, state.get_position());
311                    return true;
312                }
313            }
314
315            // 检查双字符操作符
316            if let Some(&kind) = C_TWO_CHAR_OPERATORS.get(two_char.as_str()) {
317                state.advance(2);
318                state.add_token(kind, start, state.get_position());
319                return true;
320            }
321
322            // 检查单字符操作符 and 分隔符
323            let kind = match ch {
324                '(' => CTokenType::LeftParen,
325                ')' => CTokenType::RightParen,
326                '[' => CTokenType::LeftBracket,
327                ']' => CTokenType::RightBracket,
328                '{' => CTokenType::LeftBrace,
329                '}' => CTokenType::RightBrace,
330                ',' => CTokenType::Comma,
331                ';' => CTokenType::Semicolon,
332                ':' => CTokenType::Colon,
333                '.' => CTokenType::Dot,
334                '?' => CTokenType::Question,
335                '+' => CTokenType::Plus,
336                '-' => CTokenType::Minus,
337                '*' => CTokenType::Star,
338                '/' => CTokenType::Slash,
339                '%' => CTokenType::Percent,
340                '=' => CTokenType::Assign,
341                '<' => CTokenType::Less,
342                '>' => CTokenType::Greater,
343                '!' => CTokenType::LogicalNot,
344                '&' => CTokenType::BitAnd,
345                '|' => CTokenType::BitOr,
346                '^' => CTokenType::BitXor,
347                '~' => CTokenType::BitNot,
348                _ => return false,
349            };
350            state.advance(1);
351            state.add_token(kind, start, state.get_position());
352            return true;
353        }
354        false
355    }
356
357    fn lex_preprocessor<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
358        let start = state.get_position();
359
360        if state.consume_if_starts_with("#") {
361            while let Some(ch) = state.peek() {
362                if ch == '\n' || ch == '\r' {
363                    break;
364                }
365                state.advance(ch.len_utf8())
366            }
367            state.add_token(CTokenType::PreprocessorDirective, start, state.get_position());
368            return true;
369        }
370        false
371    }
372
373    fn lex_text<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
374        let start = state.get_position();
375
376        if let Some(ch) = state.peek() {
377            if !ch.is_whitespace() && !ch.is_ascii_alphanumeric() && !"()[]{},.;:?+-*/%=<>!&|^~#\"'_".contains(ch) {
378                state.advance(ch.len_utf8());
379                state.add_token(CTokenType::Text, start, state.get_position());
380                return true;
381            }
382        }
383        false
384    }
385}
386
/// The C keyword list (C89/C99/C11, including the `_`-prefixed standard
/// keywords).
///
/// A plain `static` slice: the data is `'static` already, so the previous
/// `LazyLock` wrapper added a lazy-init check for no benefit. Call sites
/// using `C_KEYWORDS.contains(&&*text)` are unaffected.
static C_KEYWORDS: &[&str] = &[
    "auto",
    "register",
    "static",
    "extern",
    "typedef",
    "void",
    "char",
    "short",
    "int",
    "long",
    "float",
    "double",
    "signed",
    "unsigned",
    "struct",
    "union",
    "enum",
    "const",
    "volatile",
    "restrict",
    "if",
    "else",
    "switch",
    "case",
    "default",
    "for",
    "while",
    "do",
    "break",
    "continue",
    "goto",
    "return",
    "sizeof",
    "inline",
    "_Bool",
    "_Complex",
    "_Imaginary",
    "_Alignas",
    "_Alignof",
    "_Atomic",
    "_Static_assert",
    "_Thread_local",
    "_Generic",
    "_Noreturn",
];
435
436static C_TWO_CHAR_OPERATORS: LazyLock<std::collections::HashMap<&str, CTokenType>> = LazyLock::new(|| {
437    let mut map = std::collections::HashMap::new();
438    map.insert("+=", CTokenType::PlusAssign);
439    map.insert("-=", CTokenType::MinusAssign);
440    map.insert("*=", CTokenType::StarAssign);
441    map.insert("/=", CTokenType::SlashAssign);
442    map.insert("%=", CTokenType::PercentAssign);
443    map.insert("==", CTokenType::Equal);
444    map.insert("!=", CTokenType::NotEqual);
445    map.insert("<=", CTokenType::LessEqual);
446    map.insert(">=", CTokenType::GreaterEqual);
447    map.insert("&&", CTokenType::LogicalAnd);
448    map.insert("||", CTokenType::LogicalOr);
449    map.insert("<<", CTokenType::LeftShift);
450    map.insert(">>", CTokenType::RightShift);
451    map.insert("&=", CTokenType::AndAssign);
452    map.insert("|=", CTokenType::OrAssign);
453    map.insert("^=", CTokenType::XorAssign);
454    map.insert("++", CTokenType::Increment);
455    map.insert("--", CTokenType::Decrement);
456    map.insert("->", CTokenType::Arrow);
457    map
458});
459
460static C_THREE_CHAR_OPERATORS: LazyLock<std::collections::HashMap<&str, CTokenType>> = LazyLock::new(|| {
461    let mut map = std::collections::HashMap::new();
462    map.insert("<<=", CTokenType::LeftShiftAssign);
463    map.insert(">>=", CTokenType::RightShiftAssign);
464    map
465});