1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4pub use token_type::CTokenType;
5
6use crate::language::CLanguage;
7use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
8#[cfg(feature = "serde")]
9use serde::Serialize;
10use std::sync::LazyLock;
11
/// Shorthand for the oak-core lexer state specialized to the C language.
type State<'a, S> = LexerState<'a, S, CLanguage>;
13
/// A hand-written lexer for C source text.
///
/// Holds a borrow of the [`CLanguage`] configuration for the lifetime
/// `'config`; the lexer itself is `Copy` and stateless between runs.
#[cfg_attr(feature = "serde", derive(Serialize))]
#[derive(Clone, Copy, Debug)]
pub struct CLexer<'config> {
    // NOTE(review): no lexing method visible in this file reads `config`;
    // it appears to be kept for future configuration-driven behavior — confirm.
    config: &'config CLanguage,
}
21
22impl<'config> Lexer<CLanguage> for CLexer<'config> {
23 fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<CLanguage>) -> LexOutput<CLanguage> {
25 let mut state = State::new_with_cache(source, 0, cache);
26 let result = self.run(&mut state);
27 if result.is_ok() {
28 state.add_eof()
29 }
30 state.finish_with_cache(result, cache)
31 }
32}
33
34impl<'config> CLexer<'config> {
35 pub fn new(config: &'config CLanguage) -> Self {
37 Self { config }
38 }
39
40 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
42 while state.not_at_end() {
43 let safe_point = state.get_position();
44 if self.skip_whitespace(state) {
45 continue;
46 }
47 if self.skip_comment(state) {
48 continue;
49 }
50 if self.lex_newline(state) {
51 continue;
52 }
53 if self.lex_string(state) {
54 continue;
55 }
56 if self.lex_char(state) {
57 continue;
58 }
59 if self.lex_number(state) {
60 continue;
61 }
62 if self.lex_keyword_or_identifier(state) {
63 continue;
64 }
65 if self.lex_operator_or_delimiter(state) {
66 continue;
67 }
68 if self.lex_preprocessor(state) {
69 continue;
70 }
71 if self.lex_text(state) {
72 continue;
73 }
74 else {
75 let start = state.get_position();
76 if let Some(ch) = state.peek() {
77 state.advance(ch.len_utf8());
78 state.add_token(CTokenType::Error, start, state.get_position())
79 }
80 }
81 state.advance_if_dead_lock(safe_point)
82 }
83 Ok(())
84 }
85
86 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
88 let start = state.get_position();
89 let mut count = 0;
90
91 while let Some(ch) = state.peek() {
92 if ch.is_whitespace() && ch != '\n' && ch != '\r' {
93 state.advance(ch.len_utf8());
94 count += 1
95 }
96 else {
97 break;
98 }
99 }
100
101 if count > 0 {
102 state.add_token(CTokenType::Whitespace, start, state.get_position());
103 true
104 }
105 else {
106 false
107 }
108 }
109
110 fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
111 let start = state.get_position();
112
113 if state.consume_if_starts_with("//") {
114 while let Some(ch) = state.peek() {
115 if ch == '\n' || ch == '\r' {
116 break;
117 }
118 state.advance(ch.len_utf8())
119 }
120 state.add_token(CTokenType::Comment, start, state.get_position());
121 return true;
122 }
123 else if state.consume_if_starts_with("/*") {
124 while state.not_at_end() {
125 if state.consume_if_starts_with("*/") {
126 break;
127 }
128 if let Some(ch) = state.peek() { state.advance(ch.len_utf8()) } else { break }
129 }
130 state.add_token(CTokenType::Comment, start, state.get_position());
131 return true;
132 }
133 false
134 }
135
136 fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
137 let start = state.get_position();
138
139 if let Some(ch) = state.peek() {
140 if ch == '\n' {
141 state.advance(1);
142 state.add_token(CTokenType::Whitespace, start, state.get_position());
143 return true;
144 }
145 else if ch == '\r' {
146 state.advance(1);
147 if state.peek() == Some('\n') {
148 state.advance(1)
149 }
150 state.add_token(CTokenType::Whitespace, start, state.get_position());
151 return true;
152 }
153 }
154 false
155 }
156
157 fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
158 let start = state.get_position();
159
160 if let Some('"') = state.peek() {
161 state.advance(1);
162 while let Some(ch) = state.peek() {
163 if ch == '"' {
164 state.advance(1);
165 break;
166 }
167 else if ch == '\\' {
168 state.advance(1);
169 if let Some(escaped) = state.peek() {
170 state.advance(escaped.len_utf8())
171 }
172 }
173 else {
174 state.advance(ch.len_utf8())
175 }
176 }
177 state.add_token(CTokenType::StringLiteral, start, state.get_position());
178 return true;
179 }
180 false
181 }
182
183 fn lex_char<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
184 let start = state.get_position();
185
186 if let Some('\'') = state.peek() {
187 state.advance(1);
188 while let Some(ch) = state.peek() {
189 if ch == '\'' {
190 state.advance(1);
191 break;
192 }
193 else if ch == '\\' {
194 state.advance(1);
195 if let Some(escaped) = state.peek() {
196 state.advance(escaped.len_utf8())
197 }
198 }
199 else {
200 state.advance(ch.len_utf8())
201 }
202 }
203 state.add_token(CTokenType::CharLiteral, start, state.get_position());
204 return true;
205 }
206 false
207 }
208
209 fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
210 let start = state.get_position();
211
212 if let Some(ch) = state.peek() {
213 if ch.is_ascii_digit() {
214 state.advance(1);
215 while let Some(ch) = state.peek() {
216 if ch.is_ascii_alphanumeric() || ch == '.' || ch == 'e' || ch == 'E' || ch == '+' || ch == '-' { state.advance(ch.len_utf8()) } else { break }
217 }
218
219 let text = state.get_text_in((start..state.get_position()).into());
220 let kind = if text.contains('.') || text.contains('e') || text.contains('E') { CTokenType::FloatLiteral } else { CTokenType::IntegerLiteral };
221 state.add_token(kind, start, state.get_position());
222 return true;
223 }
224 }
225 false
226 }
227
228 fn lex_keyword_or_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
229 let start = state.get_position();
230
231 if let Some(ch) = state.peek() {
232 if ch.is_ascii_alphabetic() || ch == '_' {
233 state.advance(ch.len_utf8());
234 while let Some(ch) = state.peek() {
235 if ch.is_ascii_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
236 }
237
238 let text = state.get_text_in((start..state.get_position()).into());
239 let kind = if C_KEYWORDS.contains(&&*text) {
240 match &*text {
241 "auto" => CTokenType::Auto,
242 "register" => CTokenType::Register,
243 "static" => CTokenType::Static,
244 "extern" => CTokenType::Extern,
245 "typedef" => CTokenType::Typedef,
246 "void" => CTokenType::Void,
247 "char" => CTokenType::Char,
248 "short" => CTokenType::Short,
249 "int" => CTokenType::Int,
250 "long" => CTokenType::Long,
251 "float" => CTokenType::Float,
252 "double" => CTokenType::Double,
253 "signed" => CTokenType::Signed,
254 "unsigned" => CTokenType::Unsigned,
255 "struct" => CTokenType::Struct,
256 "union" => CTokenType::Union,
257 "enum" => CTokenType::Enum,
258 "const" => CTokenType::Const,
259 "volatile" => CTokenType::Volatile,
260 "restrict" => CTokenType::Restrict,
261 "if" => CTokenType::If,
262 "else" => CTokenType::Else,
263 "switch" => CTokenType::Switch,
264 "case" => CTokenType::Case,
265 "default" => CTokenType::Default,
266 "for" => CTokenType::For,
267 "while" => CTokenType::While,
268 "do" => CTokenType::Do,
269 "break" => CTokenType::Break,
270 "continue" => CTokenType::Continue,
271 "goto" => CTokenType::Goto,
272 "return" => CTokenType::Return,
273 "sizeof" => CTokenType::Sizeof,
274 "inline" => CTokenType::Inline,
275 "_Bool" => CTokenType::Bool,
276 "_Complex" => CTokenType::Complex,
277 "_Imaginary" => CTokenType::Imaginary,
278 "_Alignas" => CTokenType::Alignas,
279 "_Alignof" => CTokenType::Alignof,
280 "_Atomic" => CTokenType::Atomic,
281 "_Static_assert" => CTokenType::StaticAssert,
282 "_Thread_local" => CTokenType::ThreadLocal,
283 "_Generic" => CTokenType::Generic,
284 "_Noreturn" => CTokenType::Noreturn,
285 _ => CTokenType::Identifier,
286 }
287 }
288 else {
289 CTokenType::Identifier
290 };
291 state.add_token(kind, start, state.get_position());
292 return true;
293 }
294 }
295 false
296 }
297
298 fn lex_operator_or_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
299 let start = state.get_position();
300
301 if let Some(ch) = state.peek() {
302 let three_char = if let Some(next_ch) = state.peek_next_n(1) { if let Some(third_ch) = state.peek_next_n(2) { Some(format!("{}{}{}", ch, next_ch, third_ch)) } else { None } } else { None };
303
304 let two_char = if let Some(next_ch) = state.peek_next_n(1) { format!("{}{}", ch, next_ch) } else { String::new() };
305
306 if let Some(ref three) = three_char {
308 if let Some(&kind) = C_THREE_CHAR_OPERATORS.get(three.as_str()) {
309 state.advance(3);
310 state.add_token(kind, start, state.get_position());
311 return true;
312 }
313 }
314
315 if let Some(&kind) = C_TWO_CHAR_OPERATORS.get(two_char.as_str()) {
317 state.advance(2);
318 state.add_token(kind, start, state.get_position());
319 return true;
320 }
321
322 let kind = match ch {
324 '(' => CTokenType::LeftParen,
325 ')' => CTokenType::RightParen,
326 '[' => CTokenType::LeftBracket,
327 ']' => CTokenType::RightBracket,
328 '{' => CTokenType::LeftBrace,
329 '}' => CTokenType::RightBrace,
330 ',' => CTokenType::Comma,
331 ';' => CTokenType::Semicolon,
332 ':' => CTokenType::Colon,
333 '.' => CTokenType::Dot,
334 '?' => CTokenType::Question,
335 '+' => CTokenType::Plus,
336 '-' => CTokenType::Minus,
337 '*' => CTokenType::Star,
338 '/' => CTokenType::Slash,
339 '%' => CTokenType::Percent,
340 '=' => CTokenType::Assign,
341 '<' => CTokenType::Less,
342 '>' => CTokenType::Greater,
343 '!' => CTokenType::LogicalNot,
344 '&' => CTokenType::BitAnd,
345 '|' => CTokenType::BitOr,
346 '^' => CTokenType::BitXor,
347 '~' => CTokenType::BitNot,
348 _ => return false,
349 };
350 state.advance(1);
351 state.add_token(kind, start, state.get_position());
352 return true;
353 }
354 false
355 }
356
357 fn lex_preprocessor<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
358 let start = state.get_position();
359
360 if state.consume_if_starts_with("#") {
361 while let Some(ch) = state.peek() {
362 if ch == '\n' || ch == '\r' {
363 break;
364 }
365 state.advance(ch.len_utf8())
366 }
367 state.add_token(CTokenType::PreprocessorDirective, start, state.get_position());
368 return true;
369 }
370 false
371 }
372
373 fn lex_text<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
374 let start = state.get_position();
375
376 if let Some(ch) = state.peek() {
377 if !ch.is_whitespace() && !ch.is_ascii_alphanumeric() && !"()[]{},.;:?+-*/%=<>!&|^~#\"'_".contains(ch) {
378 state.advance(ch.len_utf8());
379 state.add_token(CTokenType::Text, start, state.get_position());
380 return true;
381 }
382 }
383 false
384 }
385}
386
/// The 44 C11 keywords, including the `_`-prefixed standard keywords.
///
/// A plain `static` slice: every entry is a string literal, so the previous
/// `LazyLock` wrapper added lazy-initialization overhead for no benefit.
/// Call sites using `C_KEYWORDS.contains(&&*text)` compile unchanged.
/// Must stay in sync with the keyword `match` in
/// `CLexer::lex_keyword_or_identifier`.
static C_KEYWORDS: &[&str] = &[
    "auto",
    "register",
    "static",
    "extern",
    "typedef",
    "void",
    "char",
    "short",
    "int",
    "long",
    "float",
    "double",
    "signed",
    "unsigned",
    "struct",
    "union",
    "enum",
    "const",
    "volatile",
    "restrict",
    "if",
    "else",
    "switch",
    "case",
    "default",
    "for",
    "while",
    "do",
    "break",
    "continue",
    "goto",
    "return",
    "sizeof",
    "inline",
    "_Bool",
    "_Complex",
    "_Imaginary",
    "_Alignas",
    "_Alignof",
    "_Atomic",
    "_Static_assert",
    "_Thread_local",
    "_Generic",
    "_Noreturn",
];
435
436static C_TWO_CHAR_OPERATORS: LazyLock<std::collections::HashMap<&str, CTokenType>> = LazyLock::new(|| {
437 let mut map = std::collections::HashMap::new();
438 map.insert("+=", CTokenType::PlusAssign);
439 map.insert("-=", CTokenType::MinusAssign);
440 map.insert("*=", CTokenType::StarAssign);
441 map.insert("/=", CTokenType::SlashAssign);
442 map.insert("%=", CTokenType::PercentAssign);
443 map.insert("==", CTokenType::Equal);
444 map.insert("!=", CTokenType::NotEqual);
445 map.insert("<=", CTokenType::LessEqual);
446 map.insert(">=", CTokenType::GreaterEqual);
447 map.insert("&&", CTokenType::LogicalAnd);
448 map.insert("||", CTokenType::LogicalOr);
449 map.insert("<<", CTokenType::LeftShift);
450 map.insert(">>", CTokenType::RightShift);
451 map.insert("&=", CTokenType::AndAssign);
452 map.insert("|=", CTokenType::OrAssign);
453 map.insert("^=", CTokenType::XorAssign);
454 map.insert("++", CTokenType::Increment);
455 map.insert("--", CTokenType::Decrement);
456 map.insert("->", CTokenType::Arrow);
457 map
458});
459
460static C_THREE_CHAR_OPERATORS: LazyLock<std::collections::HashMap<&str, CTokenType>> = LazyLock::new(|| {
461 let mut map = std::collections::HashMap::new();
462 map.insert("<<=", CTokenType::LeftShiftAssign);
463 map.insert(">>=", CTokenType::RightShiftAssign);
464 map
465});