1pub mod token_type;
2
3pub use token_type::CTokenType;
4
5use crate::language::CLanguage;
6use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
7use serde::Serialize;
8use std::sync::LazyLock;
9
/// Shorthand for the oak-core lexer state specialized to the C language.
type State<'a, S> = LexerState<'a, S, CLanguage>;
11
/// Hand-written lexer for C source code.
///
/// Holds only a borrow of the language configuration; `config` is not read by
/// any lexing routine visible in this file — presumably kept for future
/// configurability (TODO confirm).
#[derive(Clone, Copy, Debug, Serialize)]
pub struct CLexer<'config> {
    config: &'config CLanguage,
}
16
17impl<'config> Lexer<CLanguage> for CLexer<'config> {
18 fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<CLanguage>) -> LexOutput<CLanguage> {
19 let mut state = State::new_with_cache(source, 0, cache);
20 let result = self.run(&mut state);
21 if result.is_ok() {
22 state.add_eof();
23 }
24 state.finish_with_cache(result, cache)
25 }
26}
27
28impl<'config> CLexer<'config> {
    /// Creates a lexer borrowing the given language configuration.
    pub fn new(config: &'config CLanguage) -> Self {
        Self { config }
    }
32
33 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
34 while state.not_at_end() {
35 let safe_point = state.get_position();
36 if self.skip_whitespace(state) {
37 continue;
38 }
39 if self.skip_comment(state) {
40 continue;
41 }
42 if self.lex_newline(state) {
43 continue;
44 }
45 if self.lex_string(state) {
46 continue;
47 }
48 if self.lex_char(state) {
49 continue;
50 }
51 if self.lex_number(state) {
52 continue;
53 }
54 if self.lex_keyword_or_identifier(state) {
55 continue;
56 }
57 if self.lex_operator_or_delimiter(state) {
58 continue;
59 }
60 if self.lex_preprocessor(state) {
61 continue;
62 }
63 if self.lex_text(state) {
64 continue;
65 }
66 else {
67 let start = state.get_position();
68 if let Some(ch) = state.peek() {
69 state.advance(ch.len_utf8());
70 state.add_token(CTokenType::Error, start, state.get_position());
71 }
72 }
73 state.advance_if_dead_lock(safe_point);
74 }
75 Ok(())
76 }
77
78 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
79 let start = state.get_position();
80 let mut count = 0;
81
82 while let Some(ch) = state.peek() {
83 if ch.is_whitespace() && ch != '\n' && ch != '\r' {
84 state.advance(ch.len_utf8());
85 count += 1;
86 }
87 else {
88 break;
89 }
90 }
91
92 if count > 0 {
93 state.add_token(CTokenType::Whitespace, start, state.get_position());
94 true
95 }
96 else {
97 false
98 }
99 }
100
101 fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
102 let start = state.get_position();
103
104 if state.consume_if_starts_with("//") {
105 while let Some(ch) = state.peek() {
106 if ch == '\n' || ch == '\r' {
107 break;
108 }
109 state.advance(ch.len_utf8());
110 }
111 state.add_token(CTokenType::Comment, start, state.get_position());
112 return true;
113 }
114 else if state.consume_if_starts_with("/*") {
115 while state.not_at_end() {
116 if state.consume_if_starts_with("*/") {
117 break;
118 }
119 if let Some(ch) = state.peek() {
120 state.advance(ch.len_utf8());
121 }
122 else {
123 break;
124 }
125 }
126 state.add_token(CTokenType::Comment, start, state.get_position());
127 return true;
128 }
129 false
130 }
131
132 fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
133 let start = state.get_position();
134
135 if let Some(ch) = state.peek() {
136 if ch == '\n' {
137 state.advance(1);
138 state.add_token(CTokenType::Whitespace, start, state.get_position());
139 return true;
140 }
141 else if ch == '\r' {
142 state.advance(1);
143 if state.peek() == Some('\n') {
144 state.advance(1);
145 }
146 state.add_token(CTokenType::Whitespace, start, state.get_position());
147 return true;
148 }
149 }
150 false
151 }
152
153 fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
154 let start = state.get_position();
155
156 if let Some('"') = state.peek() {
157 state.advance(1);
158 while let Some(ch) = state.peek() {
159 if ch == '"' {
160 state.advance(1);
161 break;
162 }
163 else if ch == '\\' {
164 state.advance(1);
165 if let Some(escaped) = state.peek() {
166 state.advance(escaped.len_utf8());
167 }
168 }
169 else {
170 state.advance(ch.len_utf8());
171 }
172 }
173 state.add_token(CTokenType::StringLiteral, start, state.get_position());
174 return true;
175 }
176 false
177 }
178
179 fn lex_char<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
180 let start = state.get_position();
181
182 if let Some('\'') = state.peek() {
183 state.advance(1);
184 while let Some(ch) = state.peek() {
185 if ch == '\'' {
186 state.advance(1);
187 break;
188 }
189 else if ch == '\\' {
190 state.advance(1);
191 if let Some(escaped) = state.peek() {
192 state.advance(escaped.len_utf8());
193 }
194 }
195 else {
196 state.advance(ch.len_utf8());
197 }
198 }
199 state.add_token(CTokenType::CharLiteral, start, state.get_position());
200 return true;
201 }
202 false
203 }
204
205 fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
206 let start = state.get_position();
207
208 if let Some(ch) = state.peek() {
209 if ch.is_ascii_digit() {
210 state.advance(1);
211 while let Some(ch) = state.peek() {
212 if ch.is_ascii_alphanumeric() || ch == '.' || ch == 'e' || ch == 'E' || ch == '+' || ch == '-' {
213 state.advance(ch.len_utf8());
214 }
215 else {
216 break;
217 }
218 }
219
220 let text = state.get_text_in((start..state.get_position()).into());
221 let kind = if text.contains('.') || text.contains('e') || text.contains('E') { CTokenType::FloatLiteral } else { CTokenType::IntegerLiteral };
222 state.add_token(kind, start, state.get_position());
223 return true;
224 }
225 }
226 false
227 }
228
229 fn lex_keyword_or_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
230 let start = state.get_position();
231
232 if let Some(ch) = state.peek() {
233 if ch.is_ascii_alphabetic() || ch == '_' {
234 state.advance(ch.len_utf8());
235 while let Some(ch) = state.peek() {
236 if ch.is_ascii_alphanumeric() || ch == '_' {
237 state.advance(ch.len_utf8());
238 }
239 else {
240 break;
241 }
242 }
243
244 let text = state.get_text_in((start..state.get_position()).into());
245 let kind = if C_KEYWORDS.contains(&&*text) {
246 match &*text {
247 "auto" => CTokenType::Auto,
248 "register" => CTokenType::Register,
249 "static" => CTokenType::Static,
250 "extern" => CTokenType::Extern,
251 "typedef" => CTokenType::Typedef,
252 "void" => CTokenType::Void,
253 "char" => CTokenType::Char,
254 "short" => CTokenType::Short,
255 "int" => CTokenType::Int,
256 "long" => CTokenType::Long,
257 "float" => CTokenType::Float,
258 "double" => CTokenType::Double,
259 "signed" => CTokenType::Signed,
260 "unsigned" => CTokenType::Unsigned,
261 "struct" => CTokenType::Struct,
262 "union" => CTokenType::Union,
263 "enum" => CTokenType::Enum,
264 "const" => CTokenType::Const,
265 "volatile" => CTokenType::Volatile,
266 "restrict" => CTokenType::Restrict,
267 "if" => CTokenType::If,
268 "else" => CTokenType::Else,
269 "switch" => CTokenType::Switch,
270 "case" => CTokenType::Case,
271 "default" => CTokenType::Default,
272 "for" => CTokenType::For,
273 "while" => CTokenType::While,
274 "do" => CTokenType::Do,
275 "break" => CTokenType::Break,
276 "continue" => CTokenType::Continue,
277 "goto" => CTokenType::Goto,
278 "return" => CTokenType::Return,
279 "sizeof" => CTokenType::Sizeof,
280 "inline" => CTokenType::Inline,
281 "_Bool" => CTokenType::Bool,
282 "_Complex" => CTokenType::Complex,
283 "_Imaginary" => CTokenType::Imaginary,
284 "_Alignas" => CTokenType::Alignas,
285 "_Alignof" => CTokenType::Alignof,
286 "_Atomic" => CTokenType::Atomic,
287 "_Static_assert" => CTokenType::StaticAssert,
288 "_Thread_local" => CTokenType::ThreadLocal,
289 "_Generic" => CTokenType::Generic,
290 "_Noreturn" => CTokenType::Noreturn,
291 _ => CTokenType::Identifier,
292 }
293 }
294 else {
295 CTokenType::Identifier
296 };
297 state.add_token(kind, start, state.get_position());
298 return true;
299 }
300 }
301 false
302 }
303
304 fn lex_operator_or_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
305 let start = state.get_position();
306
307 if let Some(ch) = state.peek() {
308 let three_char = if let Some(next_ch) = state.peek_next_n(1) { if let Some(third_ch) = state.peek_next_n(2) { Some(format!("{}{}{}", ch, next_ch, third_ch)) } else { None } } else { None };
309
310 let two_char = if let Some(next_ch) = state.peek_next_n(1) { format!("{}{}", ch, next_ch) } else { String::new() };
311
312 if let Some(ref three) = three_char {
314 if let Some(&kind) = C_THREE_CHAR_OPERATORS.get(three.as_str()) {
315 state.advance(3);
316 state.add_token(kind, start, state.get_position());
317 return true;
318 }
319 }
320
321 if let Some(&kind) = C_TWO_CHAR_OPERATORS.get(two_char.as_str()) {
323 state.advance(2);
324 state.add_token(kind, start, state.get_position());
325 return true;
326 }
327
328 let kind = match ch {
330 '(' => CTokenType::LeftParen,
331 ')' => CTokenType::RightParen,
332 '[' => CTokenType::LeftBracket,
333 ']' => CTokenType::RightBracket,
334 '{' => CTokenType::LeftBrace,
335 '}' => CTokenType::RightBrace,
336 ',' => CTokenType::Comma,
337 ';' => CTokenType::Semicolon,
338 ':' => CTokenType::Colon,
339 '.' => CTokenType::Dot,
340 '?' => CTokenType::Question,
341 '+' => CTokenType::Plus,
342 '-' => CTokenType::Minus,
343 '*' => CTokenType::Star,
344 '/' => CTokenType::Slash,
345 '%' => CTokenType::Percent,
346 '=' => CTokenType::Assign,
347 '<' => CTokenType::Less,
348 '>' => CTokenType::Greater,
349 '!' => CTokenType::LogicalNot,
350 '&' => CTokenType::BitAnd,
351 '|' => CTokenType::BitOr,
352 '^' => CTokenType::BitXor,
353 '~' => CTokenType::BitNot,
354 _ => return false,
355 };
356 state.advance(1);
357 state.add_token(kind, start, state.get_position());
358 return true;
359 }
360 false
361 }
362
363 fn lex_preprocessor<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
364 let start = state.get_position();
365
366 if state.consume_if_starts_with("#") {
367 while let Some(ch) = state.peek() {
368 if ch == '\n' || ch == '\r' {
369 break;
370 }
371 state.advance(ch.len_utf8());
372 }
373 state.add_token(CTokenType::PreprocessorDirective, start, state.get_position());
374 return true;
375 }
376 false
377 }
378
379 fn lex_text<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
380 let start = state.get_position();
381
382 if let Some(ch) = state.peek() {
383 if !ch.is_whitespace() && !ch.is_ascii_alphanumeric() && !"()[]{},.;:?+-*/%=<>!&|^~#\"'_".contains(ch) {
384 state.advance(ch.len_utf8());
385 state.add_token(CTokenType::Text, start, state.get_position());
386 return true;
387 }
388 }
389 false
390 }
391}
392
/// All C keywords recognized by the lexer: C89 storage classes, type
/// specifiers and statements through the C11 `_Xxx` keywords.
///
/// Fix: this table is a compile-time constant, so the former
/// `LazyLock<&[&str]>` wrapper added lazy-initialization overhead and an
/// extra level of indirection for nothing; a plain `static` slice is
/// equivalent at every use site and costs nothing at runtime.
static C_KEYWORDS: &[&str] = &[
    // Storage classes and qualifiers.
    "auto", "register", "static", "extern", "typedef", "const", "volatile", "restrict", "inline",
    // Type specifiers.
    "void", "char", "short", "int", "long", "float", "double", "signed", "unsigned",
    "struct", "union", "enum",
    // Statements and expressions.
    "if", "else", "switch", "case", "default", "for", "while", "do",
    "break", "continue", "goto", "return", "sizeof",
    // C99/C11 underscore keywords.
    "_Bool", "_Complex", "_Imaginary", "_Alignas", "_Alignof", "_Atomic",
    "_Static_assert", "_Thread_local", "_Generic", "_Noreturn",
];
441
442static C_TWO_CHAR_OPERATORS: LazyLock<std::collections::HashMap<&str, CTokenType>> = LazyLock::new(|| {
443 let mut map = std::collections::HashMap::new();
444 map.insert("+=", CTokenType::PlusAssign);
445 map.insert("-=", CTokenType::MinusAssign);
446 map.insert("*=", CTokenType::StarAssign);
447 map.insert("/=", CTokenType::SlashAssign);
448 map.insert("%=", CTokenType::PercentAssign);
449 map.insert("==", CTokenType::Equal);
450 map.insert("!=", CTokenType::NotEqual);
451 map.insert("<=", CTokenType::LessEqual);
452 map.insert(">=", CTokenType::GreaterEqual);
453 map.insert("&&", CTokenType::LogicalAnd);
454 map.insert("||", CTokenType::LogicalOr);
455 map.insert("<<", CTokenType::LeftShift);
456 map.insert(">>", CTokenType::RightShift);
457 map.insert("&=", CTokenType::AndAssign);
458 map.insert("|=", CTokenType::OrAssign);
459 map.insert("^=", CTokenType::XorAssign);
460 map.insert("++", CTokenType::Increment);
461 map.insert("--", CTokenType::Decrement);
462 map.insert("->", CTokenType::Arrow);
463 map
464});
465
466static C_THREE_CHAR_OPERATORS: LazyLock<std::collections::HashMap<&str, CTokenType>> = LazyLock::new(|| {
467 let mut map = std::collections::HashMap::new();
468 map.insert("<<=", CTokenType::LeftShiftAssign);
469 map.insert(">>=", CTokenType::RightShiftAssign);
470 map
471});