// oak_c/lexer/mod.rs
1#![doc = include_str!("readme.md")]
2/// Token types for the C language.
3pub mod token_type;
4
5pub use token_type::CTokenType;
6
7use crate::language::CLanguage;
8use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
9#[cfg(feature = "serde")]
10use serde::Serialize;
11use std::sync::LazyLock;
12
13pub(crate) type State<'a, S> = LexerState<'a, S, CLanguage>;
14
15/// Lexer for the C language.
16#[cfg_attr(feature = "serde", derive(Serialize))]
17#[derive(Clone, Copy, Debug)]
18pub struct CLexer<'config> {
19    /// Language configuration.
20    config: &'config CLanguage,
21}
22
23impl<'config> Lexer<CLanguage> for CLexer<'config> {
24    /// Tokenizes the source code into a stream of C tokens.
25    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<CLanguage>) -> LexOutput<CLanguage> {
26        let mut state = State::new_with_cache(source, 0, cache);
27        let result = self.run(&mut state);
28        if result.is_ok() {
29            state.add_eof()
30        }
31        state.finish_with_cache(result, cache)
32    }
33}
34
35impl<'config> CLexer<'config> {
36    /// Creates a new `CLexer` with the given language configuration.
37    pub fn new(config: &'config CLanguage) -> Self {
38        Self { config }
39    }
40
41    /// Runs the lexer on the current state until the end of the source.
42    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
43        while state.not_at_end() {
44            let safe_point = state.get_position();
45            if self.skip_whitespace(state) {
46                continue;
47            }
48            if self.skip_comment(state) {
49                continue;
50            }
51            if self.lex_newline(state) {
52                continue;
53            }
54            if self.lex_string(state) {
55                continue;
56            }
57            if self.lex_char(state) {
58                continue;
59            }
60            if self.lex_number(state) {
61                continue;
62            }
63            if self.lex_keyword_or_identifier(state) {
64                continue;
65            }
66            if self.lex_operator_or_delimiter(state) {
67                continue;
68            }
69            if self.lex_preprocessor(state) {
70                continue;
71            }
72            if self.lex_text(state) {
73                continue;
74            }
75            else {
76                let start = state.get_position();
77                if let Some(ch) = state.peek() {
78                    state.advance(ch.len_utf8());
79                    state.add_token(CTokenType::Error, start, state.get_position())
80                }
81            }
82            state.advance_if_dead_lock(safe_point)
83        }
84        Ok(())
85    }
86
87    /// Skips whitespace characters (except newlines).
88    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
89        let start = state.get_position();
90        let mut count = 0;
91
92        while let Some(ch) = state.peek() {
93            if ch.is_whitespace() && ch != '\n' && ch != '\r' {
94                state.advance(ch.len_utf8());
95                count += 1
96            }
97            else {
98                break;
99            }
100        }
101
102        if count > 0 {
103            state.add_token(CTokenType::Whitespace, start, state.get_position());
104            true
105        }
106        else {
107            false
108        }
109    }
110
111    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
112        let start = state.get_position();
113
114        if state.consume_if_starts_with("//") {
115            while let Some(ch) = state.peek() {
116                if ch == '\n' || ch == '\r' {
117                    break;
118                }
119                state.advance(ch.len_utf8())
120            }
121            state.add_token(CTokenType::LineComment, start, state.get_position());
122            return true;
123        }
124        else if state.consume_if_starts_with("/*") {
125            while state.not_at_end() {
126                if state.consume_if_starts_with("*/") {
127                    break;
128                }
129                if let Some(ch) = state.peek() { state.advance(ch.len_utf8()) } else { break }
130            }
131            state.add_token(CTokenType::BlockComment, start, state.get_position());
132            return true;
133        }
134        false
135    }
136
137    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
138        let start = state.get_position();
139
140        if let Some(ch) = state.peek() {
141            if ch == '\n' {
142                state.advance(1);
143                state.add_token(CTokenType::Whitespace, start, state.get_position());
144                return true;
145            }
146            else if ch == '\r' {
147                state.advance(1);
148                if state.peek() == Some('\n') {
149                    state.advance(1)
150                }
151                state.add_token(CTokenType::Whitespace, start, state.get_position());
152                return true;
153            }
154        }
155        false
156    }
157
158    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
159        let start = state.get_position();
160
161        if let Some('"') = state.peek() {
162            state.advance(1);
163            while let Some(ch) = state.peek() {
164                if ch == '"' {
165                    state.advance(1);
166                    break;
167                }
168                else if ch == '\\' {
169                    state.advance(1);
170                    if let Some(escaped) = state.peek() {
171                        state.advance(escaped.len_utf8())
172                    }
173                }
174                else {
175                    state.advance(ch.len_utf8())
176                }
177            }
178            state.add_token(CTokenType::StringLiteral, start, state.get_position());
179            return true;
180        }
181        false
182    }
183
184    fn lex_char<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
185        let start = state.get_position();
186
187        if let Some('\'') = state.peek() {
188            state.advance(1);
189            while let Some(ch) = state.peek() {
190                if ch == '\'' {
191                    state.advance(1);
192                    break;
193                }
194                else if ch == '\\' {
195                    state.advance(1);
196                    if let Some(escaped) = state.peek() {
197                        state.advance(escaped.len_utf8())
198                    }
199                }
200                else {
201                    state.advance(ch.len_utf8())
202                }
203            }
204            state.add_token(CTokenType::CharConstant, start, state.get_position());
205            return true;
206        }
207        false
208    }
209
210    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
211        let start = state.get_position();
212
213        if let Some(ch) = state.peek() {
214            if ch.is_ascii_digit() {
215                state.advance(1);
216                while let Some(ch) = state.peek() {
217                    if ch.is_ascii_alphanumeric() || ch == '.' || ch == 'e' || ch == 'E' || ch == '+' || ch == '-' { state.advance(ch.len_utf8()) } else { break }
218                }
219
220                let text = state.get_text_in((start..state.get_position()).into());
221                let kind = if text.contains('.') || text.contains('e') || text.contains('E') { CTokenType::FloatConstant } else { CTokenType::IntConstant };
222                state.add_token(kind, start, state.get_position());
223                return true;
224            }
225        }
226        false
227    }
228
229    fn lex_keyword_or_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
230        let start = state.get_position();
231
232        if let Some(ch) = state.peek() {
233            if ch.is_ascii_alphabetic() || ch == '_' {
234                state.advance(ch.len_utf8());
235                while let Some(ch) = state.peek() {
236                    if ch.is_ascii_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
237                }
238
239                let text = state.get_text_in((start..state.get_position()).into());
240                let kind = if C_KEYWORDS.contains(&&*text) {
241                    match &*text {
242                        "auto" => CTokenType::Auto,
243                        "register" => CTokenType::Register,
244                        "static" => CTokenType::Static,
245                        "extern" => CTokenType::Extern,
246                        "typedef" => CTokenType::Typedef,
247                        "void" => CTokenType::Void,
248                        "char" => CTokenType::Char,
249                        "short" => CTokenType::Short,
250                        "int" => CTokenType::Int,
251                        "long" => CTokenType::Long,
252                        "float" => CTokenType::Float,
253                        "double" => CTokenType::Double,
254                        "signed" => CTokenType::Signed,
255                        "unsigned" => CTokenType::Unsigned,
256                        "struct" => CTokenType::Struct,
257                        "union" => CTokenType::Union,
258                        "enum" => CTokenType::Enum,
259                        "const" => CTokenType::Const,
260                        "volatile" => CTokenType::Volatile,
261                        "restrict" => CTokenType::Restrict,
262                        "if" => CTokenType::If,
263                        "else" => CTokenType::Else,
264                        "switch" => CTokenType::Switch,
265                        "case" => CTokenType::Case,
266                        "default" => CTokenType::Default,
267                        "for" => CTokenType::For,
268                        "while" => CTokenType::While,
269                        "do" => CTokenType::Do,
270                        "break" => CTokenType::Break,
271                        "continue" => CTokenType::Continue,
272                        "goto" => CTokenType::Goto,
273                        "return" => CTokenType::Return,
274                        "sizeof" => CTokenType::Sizeof,
275                        "inline" => CTokenType::Inline,
276                        "_Bool" => CTokenType::Bool,
277                        "_Complex" => CTokenType::Complex,
278                        "_Imaginary" => CTokenType::Imaginary,
279                        "_Alignas" => CTokenType::Alignas,
280                        "_Alignof" => CTokenType::Alignof,
281                        "_Atomic" => CTokenType::Atomic,
282                        "_Static_assert" => CTokenType::StaticAssert,
283                        "_Thread_local" => CTokenType::ThreadLocal,
284                        "_Generic" => CTokenType::Generic,
285                        "_Noreturn" => CTokenType::Noreturn,
286                        _ => CTokenType::Identifier,
287                    }
288                }
289                else {
290                    CTokenType::Identifier
291                };
292                state.add_token(kind, start, state.get_position());
293                return true;
294            }
295        }
296        false
297    }
298
299    fn lex_operator_or_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
300        let start = state.get_position();
301
302        if let Some(ch) = state.peek() {
303            let three_char = if let Some(next_ch) = state.peek_next_n(1) { if let Some(third_ch) = state.peek_next_n(2) { Some(format!("{}{}{}", ch, next_ch, third_ch)) } else { None } } else { None };
304
305            let two_char = if let Some(next_ch) = state.peek_next_n(1) { format!("{}{}", ch, next_ch) } else { String::new() };
306
307            // Check three-character operators
308            if let Some(ref three) = three_char {
309                if let Some(&kind) = C_THREE_CHAR_OPERATORS.get(three.as_str()) {
310                    state.advance(3);
311                    state.add_token(kind, start, state.get_position());
312                    return true;
313                }
314            }
315
316            // Check two-character operators
317            if let Some(&kind) = C_TWO_CHAR_OPERATORS.get(two_char.as_str()) {
318                state.advance(2);
319                state.add_token(kind, start, state.get_position());
320                return true;
321            }
322
323            // Check single-character operators and delimiters
324            let kind = match ch {
325                '(' => CTokenType::LeftParen,
326                ')' => CTokenType::RightParen,
327                '[' => CTokenType::LeftBracket,
328                ']' => CTokenType::RightBracket,
329                '{' => CTokenType::LeftBrace,
330                '}' => CTokenType::RightBrace,
331                ',' => CTokenType::Comma,
332                ';' => CTokenType::Semicolon,
333                ':' => CTokenType::Colon,
334                '.' => CTokenType::Dot,
335                '?' => CTokenType::Question,
336                '+' => CTokenType::Plus,
337                '-' => CTokenType::Minus,
338                '*' => CTokenType::Star,
339                '/' => CTokenType::Slash,
340                '%' => CTokenType::Percent,
341                '=' => CTokenType::Assign,
342                '<' => CTokenType::Less,
343                '>' => CTokenType::Greater,
344                '!' => CTokenType::LogicalNot,
345                '&' => CTokenType::BitAnd,
346                '|' => CTokenType::BitOr,
347                '^' => CTokenType::BitXor,
348                '~' => CTokenType::BitNot,
349                _ => return false,
350            };
351            state.advance(1);
352            state.add_token(kind, start, state.get_position());
353            return true;
354        }
355        false
356    }
357
358    fn lex_preprocessor<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
359        let start = state.get_position();
360
361        if state.consume_if_starts_with("#") {
362            while let Some(ch) = state.peek() {
363                if ch == '\n' || ch == '\r' {
364                    break;
365                }
366                state.advance(ch.len_utf8())
367            }
368            state.add_token(CTokenType::Preprocessor, start, state.get_position());
369            return true;
370        }
371        false
372    }
373
374    fn lex_text<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
375        let start = state.get_position();
376
377        if let Some(ch) = state.peek() {
378            if !ch.is_whitespace() && !ch.is_ascii_alphanumeric() && !"()[]{},.;:?+-*/%=<>!&|^~#\"'_".contains(ch) {
379                state.advance(ch.len_utf8());
380                state.add_token(CTokenType::Text, start, state.get_position());
381                return true;
382            }
383        }
384        false
385    }
386}
387
388static C_KEYWORDS: LazyLock<&[&str]> = LazyLock::new(|| {
389    &[
390        "auto",
391        "register",
392        "static",
393        "extern",
394        "typedef",
395        "void",
396        "char",
397        "short",
398        "int",
399        "long",
400        "float",
401        "double",
402        "signed",
403        "unsigned",
404        "struct",
405        "union",
406        "enum",
407        "const",
408        "volatile",
409        "restrict",
410        "if",
411        "else",
412        "switch",
413        "case",
414        "default",
415        "for",
416        "while",
417        "do",
418        "break",
419        "continue",
420        "goto",
421        "return",
422        "sizeof",
423        "inline",
424        "_Bool",
425        "_Complex",
426        "_Imaginary",
427        "_Alignas",
428        "_Alignof",
429        "_Atomic",
430        "_Static_assert",
431        "_Thread_local",
432        "_Generic",
433        "_Noreturn",
434    ]
435});
436
437static C_TWO_CHAR_OPERATORS: LazyLock<std::collections::HashMap<&str, CTokenType>> = LazyLock::new(|| {
438    let mut map = std::collections::HashMap::new();
439    map.insert("+=", CTokenType::PlusAssign);
440    map.insert("-=", CTokenType::MinusAssign);
441    map.insert("*=", CTokenType::StarAssign);
442    map.insert("/=", CTokenType::SlashAssign);
443    map.insert("%=", CTokenType::PercentAssign);
444    map.insert("==", CTokenType::Equal);
445    map.insert("!=", CTokenType::NotEqual);
446    map.insert("<=", CTokenType::LessEqual);
447    map.insert(">=", CTokenType::GreaterEqual);
448    map.insert("&&", CTokenType::LogicalAnd);
449    map.insert("||", CTokenType::LogicalOr);
450    map.insert("<<", CTokenType::LeftShift);
451    map.insert(">>", CTokenType::RightShift);
452    map.insert("&=", CTokenType::AndAssign);
453    map.insert("|=", CTokenType::OrAssign);
454    map.insert("^=", CTokenType::XorAssign);
455    map.insert("++", CTokenType::Increment);
456    map.insert("--", CTokenType::Decrement);
457    map.insert("->", CTokenType::Arrow);
458    map
459});
460
461static C_THREE_CHAR_OPERATORS: LazyLock<std::collections::HashMap<&str, CTokenType>> = LazyLock::new(|| {
462    let mut map = std::collections::HashMap::new();
463    map.insert("<<=", CTokenType::LeftShiftAssign);
464    map.insert(">>=", CTokenType::RightShiftAssign);
465    map
466});