// oak_c/lexer/mod.rs

1#![doc = include_str!("readme.md")]
2/// Token types for the C language.
3pub mod token_type;
4
5pub use token_type::CTokenType;
6
7use crate::language::CLanguage;
8use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
9use std::sync::LazyLock;
10
/// Crate-internal alias for the shared `LexerState`, specialized to `CLanguage`.
pub(crate) type State<'a, S> = LexerState<'a, S, CLanguage>;
12
/// Lexer for the C language.
///
/// Holds only a borrowed reference to the language configuration, so it is
/// `Copy` and cheap to pass around. The configuration lives for `'config`.
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[derive(Clone, Copy, Debug)]
pub struct CLexer<'config> {
    /// Language configuration (currently stored but not consulted by the
    /// sub-lexers in this module — NOTE(review): confirm intended use).
    config: &'config CLanguage,
}
20
21impl<'config> Lexer<CLanguage> for CLexer<'config> {
22    /// Tokenizes the source code into a stream of C tokens.
23    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<CLanguage>) -> LexOutput<CLanguage> {
24        let mut state = State::new_with_cache(source, 0, cache);
25        let result = self.run(&mut state);
26        if result.is_ok() {
27            state.add_eof()
28        }
29        state.finish_with_cache(result, cache)
30    }
31}
32
33impl<'config> CLexer<'config> {
34    /// Creates a new `CLexer` with the given language configuration.
35    pub fn new(config: &'config CLanguage) -> Self {
36        Self { config }
37    }
38
39    /// Runs the lexer on the current state until the end of the source.
40    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
41        while state.not_at_end() {
42            let safe_point = state.get_position();
43            if self.skip_whitespace(state) {
44                continue;
45            }
46            if self.skip_comment(state) {
47                continue;
48            }
49            if self.lex_newline(state) {
50                continue;
51            }
52            if self.lex_string(state) {
53                continue;
54            }
55            if self.lex_char(state) {
56                continue;
57            }
58            if self.lex_number(state) {
59                continue;
60            }
61            if self.lex_keyword_or_identifier(state) {
62                continue;
63            }
64            if self.lex_operator_or_delimiter(state) {
65                continue;
66            }
67            if self.lex_preprocessor(state) {
68                continue;
69            }
70            if self.lex_text(state) {
71                continue;
72            }
73            else {
74                let start = state.get_position();
75                if let Some(ch) = state.peek() {
76                    state.advance(ch.len_utf8());
77                    state.add_token(CTokenType::Error, start, state.get_position())
78                }
79            }
80            state.advance_if_dead_lock(safe_point)
81        }
82        Ok(())
83    }
84
85    /// Skips whitespace characters (except newlines).
86    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
87        let start = state.get_position();
88        let mut count = 0;
89
90        while let Some(ch) = state.peek() {
91            if ch.is_whitespace() && ch != '\n' && ch != '\r' {
92                state.advance(ch.len_utf8());
93                count += 1
94            }
95            else {
96                break;
97            }
98        }
99
100        if count > 0 {
101            state.add_token(CTokenType::Whitespace, start, state.get_position());
102            true
103        }
104        else {
105            false
106        }
107    }
108
109    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
110        let start = state.get_position();
111
112        if state.consume_if_starts_with("//") {
113            while let Some(ch) = state.peek() {
114                if ch == '\n' || ch == '\r' {
115                    break;
116                }
117                state.advance(ch.len_utf8())
118            }
119            state.add_token(CTokenType::LineComment, start, state.get_position());
120            return true;
121        }
122        else if state.consume_if_starts_with("/*") {
123            while state.not_at_end() {
124                if state.consume_if_starts_with("*/") {
125                    break;
126                }
127                if let Some(ch) = state.peek() { state.advance(ch.len_utf8()) } else { break }
128            }
129            state.add_token(CTokenType::BlockComment, start, state.get_position());
130            return true;
131        }
132        false
133    }
134
135    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
136        let start = state.get_position();
137
138        if let Some(ch) = state.peek() {
139            if ch == '\n' {
140                state.advance(1);
141                state.add_token(CTokenType::Whitespace, start, state.get_position());
142                return true;
143            }
144            else if ch == '\r' {
145                state.advance(1);
146                if state.peek() == Some('\n') {
147                    state.advance(1)
148                }
149                state.add_token(CTokenType::Whitespace, start, state.get_position());
150                return true;
151            }
152        }
153        false
154    }
155
156    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
157        let start = state.get_position();
158
159        if let Some('"') = state.peek() {
160            state.advance(1);
161            while let Some(ch) = state.peek() {
162                if ch == '"' {
163                    state.advance(1);
164                    break;
165                }
166                else if ch == '\\' {
167                    state.advance(1);
168                    if let Some(escaped) = state.peek() {
169                        state.advance(escaped.len_utf8())
170                    }
171                }
172                else {
173                    state.advance(ch.len_utf8())
174                }
175            }
176            state.add_token(CTokenType::StringLiteral, start, state.get_position());
177            return true;
178        }
179        false
180    }
181
182    fn lex_char<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
183        let start = state.get_position();
184
185        if let Some('\'') = state.peek() {
186            state.advance(1);
187            while let Some(ch) = state.peek() {
188                if ch == '\'' {
189                    state.advance(1);
190                    break;
191                }
192                else if ch == '\\' {
193                    state.advance(1);
194                    if let Some(escaped) = state.peek() {
195                        state.advance(escaped.len_utf8())
196                    }
197                }
198                else {
199                    state.advance(ch.len_utf8())
200                }
201            }
202            state.add_token(CTokenType::CharConstant, start, state.get_position());
203            return true;
204        }
205        false
206    }
207
208    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
209        let start = state.get_position();
210
211        if let Some(ch) = state.peek() {
212            if ch.is_ascii_digit() {
213                state.advance(1);
214                while let Some(ch) = state.peek() {
215                    if ch.is_ascii_alphanumeric() || ch == '.' || ch == 'e' || ch == 'E' || ch == '+' || ch == '-' { state.advance(ch.len_utf8()) } else { break }
216                }
217
218                let text = state.get_text_in((start..state.get_position()).into());
219                let kind = if text.contains('.') || text.contains('e') || text.contains('E') { CTokenType::FloatConstant } else { CTokenType::IntConstant };
220                state.add_token(kind, start, state.get_position());
221                return true;
222            }
223        }
224        false
225    }
226
227    fn lex_keyword_or_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
228        let start = state.get_position();
229
230        if let Some(ch) = state.peek() {
231            if ch.is_ascii_alphabetic() || ch == '_' {
232                state.advance(ch.len_utf8());
233                while let Some(ch) = state.peek() {
234                    if ch.is_ascii_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
235                }
236
237                let text = state.get_text_in((start..state.get_position()).into());
238                let kind = if C_KEYWORDS.contains(&&*text) {
239                    match &*text {
240                        "auto" => CTokenType::Auto,
241                        "register" => CTokenType::Register,
242                        "static" => CTokenType::Static,
243                        "extern" => CTokenType::Extern,
244                        "typedef" => CTokenType::Typedef,
245                        "void" => CTokenType::Void,
246                        "char" => CTokenType::Char,
247                        "short" => CTokenType::Short,
248                        "int" => CTokenType::Int,
249                        "long" => CTokenType::Long,
250                        "float" => CTokenType::Float,
251                        "double" => CTokenType::Double,
252                        "signed" => CTokenType::Signed,
253                        "unsigned" => CTokenType::Unsigned,
254                        "struct" => CTokenType::Struct,
255                        "union" => CTokenType::Union,
256                        "enum" => CTokenType::Enum,
257                        "const" => CTokenType::Const,
258                        "volatile" => CTokenType::Volatile,
259                        "restrict" => CTokenType::Restrict,
260                        "if" => CTokenType::If,
261                        "else" => CTokenType::Else,
262                        "switch" => CTokenType::Switch,
263                        "case" => CTokenType::Case,
264                        "default" => CTokenType::Default,
265                        "for" => CTokenType::For,
266                        "while" => CTokenType::While,
267                        "do" => CTokenType::Do,
268                        "break" => CTokenType::Break,
269                        "continue" => CTokenType::Continue,
270                        "goto" => CTokenType::Goto,
271                        "return" => CTokenType::Return,
272                        "sizeof" => CTokenType::Sizeof,
273                        "inline" => CTokenType::Inline,
274                        "_Bool" => CTokenType::Bool,
275                        "_Complex" => CTokenType::Complex,
276                        "_Imaginary" => CTokenType::Imaginary,
277                        "_Alignas" => CTokenType::Alignas,
278                        "_Alignof" => CTokenType::Alignof,
279                        "_Atomic" => CTokenType::Atomic,
280                        "_Static_assert" => CTokenType::StaticAssert,
281                        "_Thread_local" => CTokenType::ThreadLocal,
282                        "_Generic" => CTokenType::Generic,
283                        "_Noreturn" => CTokenType::Noreturn,
284                        _ => CTokenType::Identifier,
285                    }
286                }
287                else {
288                    CTokenType::Identifier
289                };
290                state.add_token(kind, start, state.get_position());
291                return true;
292            }
293        }
294        false
295    }
296
297    fn lex_operator_or_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
298        let start = state.get_position();
299
300        if let Some(ch) = state.peek() {
301            let three_char = if let Some(next_ch) = state.peek_next_n(1) { if let Some(third_ch) = state.peek_next_n(2) { Some(format!("{}{}{}", ch, next_ch, third_ch)) } else { None } } else { None };
302
303            let two_char = if let Some(next_ch) = state.peek_next_n(1) { format!("{}{}", ch, next_ch) } else { String::new() };
304
305            // Check three-character operators
306            if let Some(ref three) = three_char {
307                if let Some(&kind) = C_THREE_CHAR_OPERATORS.get(three.as_str()) {
308                    state.advance(3);
309                    state.add_token(kind, start, state.get_position());
310                    return true;
311                }
312            }
313
314            // Check two-character operators
315            if let Some(&kind) = C_TWO_CHAR_OPERATORS.get(two_char.as_str()) {
316                state.advance(2);
317                state.add_token(kind, start, state.get_position());
318                return true;
319            }
320
321            // Check single-character operators and delimiters
322            let kind = match ch {
323                '(' => CTokenType::LeftParen,
324                ')' => CTokenType::RightParen,
325                '[' => CTokenType::LeftBracket,
326                ']' => CTokenType::RightBracket,
327                '{' => CTokenType::LeftBrace,
328                '}' => CTokenType::RightBrace,
329                ',' => CTokenType::Comma,
330                ';' => CTokenType::Semicolon,
331                ':' => CTokenType::Colon,
332                '.' => CTokenType::Dot,
333                '?' => CTokenType::Question,
334                '+' => CTokenType::Plus,
335                '-' => CTokenType::Minus,
336                '*' => CTokenType::Star,
337                '/' => CTokenType::Slash,
338                '%' => CTokenType::Percent,
339                '=' => CTokenType::Assign,
340                '<' => CTokenType::Less,
341                '>' => CTokenType::Greater,
342                '!' => CTokenType::LogicalNot,
343                '&' => CTokenType::BitAnd,
344                '|' => CTokenType::BitOr,
345                '^' => CTokenType::BitXor,
346                '~' => CTokenType::BitNot,
347                _ => return false,
348            };
349            state.advance(1);
350            state.add_token(kind, start, state.get_position());
351            return true;
352        }
353        false
354    }
355
356    fn lex_preprocessor<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
357        let start = state.get_position();
358
359        if state.consume_if_starts_with("#") {
360            while let Some(ch) = state.peek() {
361                if ch == '\n' || ch == '\r' {
362                    break;
363                }
364                state.advance(ch.len_utf8())
365            }
366            state.add_token(CTokenType::Preprocessor, start, state.get_position());
367            return true;
368        }
369        false
370    }
371
372    fn lex_text<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
373        let start = state.get_position();
374
375        if let Some(ch) = state.peek() {
376            if !ch.is_whitespace() && !ch.is_ascii_alphanumeric() && !"()[]{},.;:?+-*/%=<>!&|^~#\"'_".contains(ch) {
377                state.advance(ch.len_utf8());
378                state.add_token(CTokenType::Text, start, state.get_position());
379                return true;
380            }
381        }
382        false
383    }
384}
385
/// The C keywords recognized by the lexer (C89 through C11, including the
/// `_Bool`/`_Generic` family).
///
/// This is a compile-time constant slice, so no `LazyLock` is needed: the
/// previous lazy wrapper added an initialization check and an extra
/// indirection on every keyword lookup for data that never changes.
static C_KEYWORDS: &[&str] = &[
    "auto",
    "register",
    "static",
    "extern",
    "typedef",
    "void",
    "char",
    "short",
    "int",
    "long",
    "float",
    "double",
    "signed",
    "unsigned",
    "struct",
    "union",
    "enum",
    "const",
    "volatile",
    "restrict",
    "if",
    "else",
    "switch",
    "case",
    "default",
    "for",
    "while",
    "do",
    "break",
    "continue",
    "goto",
    "return",
    "sizeof",
    "inline",
    "_Bool",
    "_Complex",
    "_Imaginary",
    "_Alignas",
    "_Alignof",
    "_Atomic",
    "_Static_assert",
    "_Thread_local",
    "_Generic",
    "_Noreturn",
];
434
435static C_TWO_CHAR_OPERATORS: LazyLock<std::collections::HashMap<&str, CTokenType>> = LazyLock::new(|| {
436    let mut map = std::collections::HashMap::new();
437    map.insert("+=", CTokenType::PlusAssign);
438    map.insert("-=", CTokenType::MinusAssign);
439    map.insert("*=", CTokenType::StarAssign);
440    map.insert("/=", CTokenType::SlashAssign);
441    map.insert("%=", CTokenType::PercentAssign);
442    map.insert("==", CTokenType::Equal);
443    map.insert("!=", CTokenType::NotEqual);
444    map.insert("<=", CTokenType::LessEqual);
445    map.insert(">=", CTokenType::GreaterEqual);
446    map.insert("&&", CTokenType::LogicalAnd);
447    map.insert("||", CTokenType::LogicalOr);
448    map.insert("<<", CTokenType::LeftShift);
449    map.insert(">>", CTokenType::RightShift);
450    map.insert("&=", CTokenType::AndAssign);
451    map.insert("|=", CTokenType::OrAssign);
452    map.insert("^=", CTokenType::XorAssign);
453    map.insert("++", CTokenType::Increment);
454    map.insert("--", CTokenType::Decrement);
455    map.insert("->", CTokenType::Arrow);
456    map
457});
458
459static C_THREE_CHAR_OPERATORS: LazyLock<std::collections::HashMap<&str, CTokenType>> = LazyLock::new(|| {
460    let mut map = std::collections::HashMap::new();
461    map.insert("<<=", CTokenType::LeftShiftAssign);
462    map.insert(">>=", CTokenType::RightShiftAssign);
463    map
464});