1#![doc = include_str!("readme.md")]
2pub mod token_type;
4
5pub use token_type::CTokenType;
6
7use crate::language::CLanguage;
8use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
9#[cfg(feature = "serde")]
10use serde::Serialize;
11use std::sync::LazyLock;
12
/// Crate-local shorthand for the generic lexer state specialized to C.
pub(crate) type State<'a, S> = LexerState<'a, S, CLanguage>;
14
/// Tokenizer for C source code.
///
/// Holds a borrowed [`CLanguage`] configuration for the lifetime of the
/// lexer; construct via [`CLexer::new`].
#[cfg_attr(feature = "serde", derive(Serialize))]
#[derive(Clone, Copy, Debug)]
pub struct CLexer<'config> {
    // NOTE(review): stored but not read by any lexing method in this file —
    // presumably reserved for dialect/extension switches; confirm intended use.
    config: &'config CLanguage,
}
22
23impl<'config> Lexer<CLanguage> for CLexer<'config> {
24 fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<CLanguage>) -> LexOutput<CLanguage> {
26 let mut state = State::new_with_cache(source, 0, cache);
27 let result = self.run(&mut state);
28 if result.is_ok() {
29 state.add_eof()
30 }
31 state.finish_with_cache(result, cache)
32 }
33}
34
35impl<'config> CLexer<'config> {
    /// Creates a lexer that tokenizes according to `config`.
    pub fn new(config: &'config CLanguage) -> Self {
        Self { config }
    }
40
41 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
43 while state.not_at_end() {
44 let safe_point = state.get_position();
45 if self.skip_whitespace(state) {
46 continue;
47 }
48 if self.skip_comment(state) {
49 continue;
50 }
51 if self.lex_newline(state) {
52 continue;
53 }
54 if self.lex_string(state) {
55 continue;
56 }
57 if self.lex_char(state) {
58 continue;
59 }
60 if self.lex_number(state) {
61 continue;
62 }
63 if self.lex_keyword_or_identifier(state) {
64 continue;
65 }
66 if self.lex_operator_or_delimiter(state) {
67 continue;
68 }
69 if self.lex_preprocessor(state) {
70 continue;
71 }
72 if self.lex_text(state) {
73 continue;
74 }
75 else {
76 let start = state.get_position();
77 if let Some(ch) = state.peek() {
78 state.advance(ch.len_utf8());
79 state.add_token(CTokenType::Error, start, state.get_position())
80 }
81 }
82 state.advance_if_dead_lock(safe_point)
83 }
84 Ok(())
85 }
86
87 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
89 let start = state.get_position();
90 let mut count = 0;
91
92 while let Some(ch) = state.peek() {
93 if ch.is_whitespace() && ch != '\n' && ch != '\r' {
94 state.advance(ch.len_utf8());
95 count += 1
96 }
97 else {
98 break;
99 }
100 }
101
102 if count > 0 {
103 state.add_token(CTokenType::Whitespace, start, state.get_position());
104 true
105 }
106 else {
107 false
108 }
109 }
110
111 fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
112 let start = state.get_position();
113
114 if state.consume_if_starts_with("//") {
115 while let Some(ch) = state.peek() {
116 if ch == '\n' || ch == '\r' {
117 break;
118 }
119 state.advance(ch.len_utf8())
120 }
121 state.add_token(CTokenType::LineComment, start, state.get_position());
122 return true;
123 }
124 else if state.consume_if_starts_with("/*") {
125 while state.not_at_end() {
126 if state.consume_if_starts_with("*/") {
127 break;
128 }
129 if let Some(ch) = state.peek() { state.advance(ch.len_utf8()) } else { break }
130 }
131 state.add_token(CTokenType::BlockComment, start, state.get_position());
132 return true;
133 }
134 false
135 }
136
137 fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
138 let start = state.get_position();
139
140 if let Some(ch) = state.peek() {
141 if ch == '\n' {
142 state.advance(1);
143 state.add_token(CTokenType::Whitespace, start, state.get_position());
144 return true;
145 }
146 else if ch == '\r' {
147 state.advance(1);
148 if state.peek() == Some('\n') {
149 state.advance(1)
150 }
151 state.add_token(CTokenType::Whitespace, start, state.get_position());
152 return true;
153 }
154 }
155 false
156 }
157
158 fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
159 let start = state.get_position();
160
161 if let Some('"') = state.peek() {
162 state.advance(1);
163 while let Some(ch) = state.peek() {
164 if ch == '"' {
165 state.advance(1);
166 break;
167 }
168 else if ch == '\\' {
169 state.advance(1);
170 if let Some(escaped) = state.peek() {
171 state.advance(escaped.len_utf8())
172 }
173 }
174 else {
175 state.advance(ch.len_utf8())
176 }
177 }
178 state.add_token(CTokenType::StringLiteral, start, state.get_position());
179 return true;
180 }
181 false
182 }
183
184 fn lex_char<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
185 let start = state.get_position();
186
187 if let Some('\'') = state.peek() {
188 state.advance(1);
189 while let Some(ch) = state.peek() {
190 if ch == '\'' {
191 state.advance(1);
192 break;
193 }
194 else if ch == '\\' {
195 state.advance(1);
196 if let Some(escaped) = state.peek() {
197 state.advance(escaped.len_utf8())
198 }
199 }
200 else {
201 state.advance(ch.len_utf8())
202 }
203 }
204 state.add_token(CTokenType::CharConstant, start, state.get_position());
205 return true;
206 }
207 false
208 }
209
210 fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
211 let start = state.get_position();
212
213 if let Some(ch) = state.peek() {
214 if ch.is_ascii_digit() {
215 state.advance(1);
216 while let Some(ch) = state.peek() {
217 if ch.is_ascii_alphanumeric() || ch == '.' || ch == 'e' || ch == 'E' || ch == '+' || ch == '-' { state.advance(ch.len_utf8()) } else { break }
218 }
219
220 let text = state.get_text_in((start..state.get_position()).into());
221 let kind = if text.contains('.') || text.contains('e') || text.contains('E') { CTokenType::FloatConstant } else { CTokenType::IntConstant };
222 state.add_token(kind, start, state.get_position());
223 return true;
224 }
225 }
226 false
227 }
228
229 fn lex_keyword_or_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
230 let start = state.get_position();
231
232 if let Some(ch) = state.peek() {
233 if ch.is_ascii_alphabetic() || ch == '_' {
234 state.advance(ch.len_utf8());
235 while let Some(ch) = state.peek() {
236 if ch.is_ascii_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
237 }
238
239 let text = state.get_text_in((start..state.get_position()).into());
240 let kind = if C_KEYWORDS.contains(&&*text) {
241 match &*text {
242 "auto" => CTokenType::Auto,
243 "register" => CTokenType::Register,
244 "static" => CTokenType::Static,
245 "extern" => CTokenType::Extern,
246 "typedef" => CTokenType::Typedef,
247 "void" => CTokenType::Void,
248 "char" => CTokenType::Char,
249 "short" => CTokenType::Short,
250 "int" => CTokenType::Int,
251 "long" => CTokenType::Long,
252 "float" => CTokenType::Float,
253 "double" => CTokenType::Double,
254 "signed" => CTokenType::Signed,
255 "unsigned" => CTokenType::Unsigned,
256 "struct" => CTokenType::Struct,
257 "union" => CTokenType::Union,
258 "enum" => CTokenType::Enum,
259 "const" => CTokenType::Const,
260 "volatile" => CTokenType::Volatile,
261 "restrict" => CTokenType::Restrict,
262 "if" => CTokenType::If,
263 "else" => CTokenType::Else,
264 "switch" => CTokenType::Switch,
265 "case" => CTokenType::Case,
266 "default" => CTokenType::Default,
267 "for" => CTokenType::For,
268 "while" => CTokenType::While,
269 "do" => CTokenType::Do,
270 "break" => CTokenType::Break,
271 "continue" => CTokenType::Continue,
272 "goto" => CTokenType::Goto,
273 "return" => CTokenType::Return,
274 "sizeof" => CTokenType::Sizeof,
275 "inline" => CTokenType::Inline,
276 "_Bool" => CTokenType::Bool,
277 "_Complex" => CTokenType::Complex,
278 "_Imaginary" => CTokenType::Imaginary,
279 "_Alignas" => CTokenType::Alignas,
280 "_Alignof" => CTokenType::Alignof,
281 "_Atomic" => CTokenType::Atomic,
282 "_Static_assert" => CTokenType::StaticAssert,
283 "_Thread_local" => CTokenType::ThreadLocal,
284 "_Generic" => CTokenType::Generic,
285 "_Noreturn" => CTokenType::Noreturn,
286 _ => CTokenType::Identifier,
287 }
288 }
289 else {
290 CTokenType::Identifier
291 };
292 state.add_token(kind, start, state.get_position());
293 return true;
294 }
295 }
296 false
297 }
298
299 fn lex_operator_or_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
300 let start = state.get_position();
301
302 if let Some(ch) = state.peek() {
303 let three_char = if let Some(next_ch) = state.peek_next_n(1) { if let Some(third_ch) = state.peek_next_n(2) { Some(format!("{}{}{}", ch, next_ch, third_ch)) } else { None } } else { None };
304
305 let two_char = if let Some(next_ch) = state.peek_next_n(1) { format!("{}{}", ch, next_ch) } else { String::new() };
306
307 if let Some(ref three) = three_char {
309 if let Some(&kind) = C_THREE_CHAR_OPERATORS.get(three.as_str()) {
310 state.advance(3);
311 state.add_token(kind, start, state.get_position());
312 return true;
313 }
314 }
315
316 if let Some(&kind) = C_TWO_CHAR_OPERATORS.get(two_char.as_str()) {
318 state.advance(2);
319 state.add_token(kind, start, state.get_position());
320 return true;
321 }
322
323 let kind = match ch {
325 '(' => CTokenType::LeftParen,
326 ')' => CTokenType::RightParen,
327 '[' => CTokenType::LeftBracket,
328 ']' => CTokenType::RightBracket,
329 '{' => CTokenType::LeftBrace,
330 '}' => CTokenType::RightBrace,
331 ',' => CTokenType::Comma,
332 ';' => CTokenType::Semicolon,
333 ':' => CTokenType::Colon,
334 '.' => CTokenType::Dot,
335 '?' => CTokenType::Question,
336 '+' => CTokenType::Plus,
337 '-' => CTokenType::Minus,
338 '*' => CTokenType::Star,
339 '/' => CTokenType::Slash,
340 '%' => CTokenType::Percent,
341 '=' => CTokenType::Assign,
342 '<' => CTokenType::Less,
343 '>' => CTokenType::Greater,
344 '!' => CTokenType::LogicalNot,
345 '&' => CTokenType::BitAnd,
346 '|' => CTokenType::BitOr,
347 '^' => CTokenType::BitXor,
348 '~' => CTokenType::BitNot,
349 _ => return false,
350 };
351 state.advance(1);
352 state.add_token(kind, start, state.get_position());
353 return true;
354 }
355 false
356 }
357
358 fn lex_preprocessor<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
359 let start = state.get_position();
360
361 if state.consume_if_starts_with("#") {
362 while let Some(ch) = state.peek() {
363 if ch == '\n' || ch == '\r' {
364 break;
365 }
366 state.advance(ch.len_utf8())
367 }
368 state.add_token(CTokenType::Preprocessor, start, state.get_position());
369 return true;
370 }
371 false
372 }
373
374 fn lex_text<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
375 let start = state.get_position();
376
377 if let Some(ch) = state.peek() {
378 if !ch.is_whitespace() && !ch.is_ascii_alphanumeric() && !"()[]{},.;:?+-*/%=<>!&|^~#\"'_".contains(ch) {
379 state.advance(ch.len_utf8());
380 state.add_token(CTokenType::Text, start, state.get_position());
381 return true;
382 }
383 }
384 false
385 }
386}
387
/// All C keywords recognized by the lexer (C89/C99 plus the C11 `_`-prefixed
/// keywords). The data is fully `'static`, so a plain `static` slice works —
/// the previous `LazyLock` wrapper added lazy-init and deref overhead for
/// nothing.
static C_KEYWORDS: &[&str] = &[
    "auto",
    "register",
    "static",
    "extern",
    "typedef",
    "void",
    "char",
    "short",
    "int",
    "long",
    "float",
    "double",
    "signed",
    "unsigned",
    "struct",
    "union",
    "enum",
    "const",
    "volatile",
    "restrict",
    "if",
    "else",
    "switch",
    "case",
    "default",
    "for",
    "while",
    "do",
    "break",
    "continue",
    "goto",
    "return",
    "sizeof",
    "inline",
    "_Bool",
    "_Complex",
    "_Imaginary",
    "_Alignas",
    "_Alignof",
    "_Atomic",
    "_Static_assert",
    "_Thread_local",
    "_Generic",
    "_Noreturn",
];
436
437static C_TWO_CHAR_OPERATORS: LazyLock<std::collections::HashMap<&str, CTokenType>> = LazyLock::new(|| {
438 let mut map = std::collections::HashMap::new();
439 map.insert("+=", CTokenType::PlusAssign);
440 map.insert("-=", CTokenType::MinusAssign);
441 map.insert("*=", CTokenType::StarAssign);
442 map.insert("/=", CTokenType::SlashAssign);
443 map.insert("%=", CTokenType::PercentAssign);
444 map.insert("==", CTokenType::Equal);
445 map.insert("!=", CTokenType::NotEqual);
446 map.insert("<=", CTokenType::LessEqual);
447 map.insert(">=", CTokenType::GreaterEqual);
448 map.insert("&&", CTokenType::LogicalAnd);
449 map.insert("||", CTokenType::LogicalOr);
450 map.insert("<<", CTokenType::LeftShift);
451 map.insert(">>", CTokenType::RightShift);
452 map.insert("&=", CTokenType::AndAssign);
453 map.insert("|=", CTokenType::OrAssign);
454 map.insert("^=", CTokenType::XorAssign);
455 map.insert("++", CTokenType::Increment);
456 map.insert("--", CTokenType::Decrement);
457 map.insert("->", CTokenType::Arrow);
458 map
459});
460
461static C_THREE_CHAR_OPERATORS: LazyLock<std::collections::HashMap<&str, CTokenType>> = LazyLock::new(|| {
462 let mut map = std::collections::HashMap::new();
463 map.insert("<<=", CTokenType::LeftShiftAssign);
464 map.insert(">>=", CTokenType::RightShiftAssign);
465 map
466});