1#![doc = include_str!("readme.md")]
2pub mod token_type;
4
5pub use token_type::CTokenType;
6
7use crate::language::CLanguage;
8use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
9use std::sync::LazyLock;
10
/// Shorthand for the framework lexer state specialized to the C language.
pub(crate) type State<'a, S> = LexerState<'a, S, CLanguage>;
12
/// Lexer for the C language.
///
/// Borrows the language configuration for `'config`; `Serialize` is derived
/// only when the `serde` feature is enabled.
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[derive(Clone, Copy, Debug)]
pub struct CLexer<'config> {
    // Language configuration. NOTE(review): stored but not read by any lexing
    // rule in this file — confirm whether other modules rely on it.
    config: &'config CLanguage,
}
20
impl<'config> Lexer<CLanguage> for CLexer<'config> {
    /// Tokenizes `source` from position 0 and returns the full token stream.
    ///
    /// `_edits` is part of the incremental-lexing interface but is unused
    /// here: this lexer always performs a full re-lex. An EOF token is
    /// appended only when `run` succeeds, so error results surface unchanged
    /// through `finish_with_cache`.
    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<CLanguage>) -> LexOutput<CLanguage> {
        let mut state = State::new_with_cache(source, 0, cache);
        let result = self.run(&mut state);
        if result.is_ok() {
            state.add_eof()
        }
        state.finish_with_cache(result, cache)
    }
}
32
33impl<'config> CLexer<'config> {
34 pub fn new(config: &'config CLanguage) -> Self {
36 Self { config }
37 }
38
39 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
41 while state.not_at_end() {
42 let safe_point = state.get_position();
43 if self.skip_whitespace(state) {
44 continue;
45 }
46 if self.skip_comment(state) {
47 continue;
48 }
49 if self.lex_newline(state) {
50 continue;
51 }
52 if self.lex_string(state) {
53 continue;
54 }
55 if self.lex_char(state) {
56 continue;
57 }
58 if self.lex_number(state) {
59 continue;
60 }
61 if self.lex_keyword_or_identifier(state) {
62 continue;
63 }
64 if self.lex_operator_or_delimiter(state) {
65 continue;
66 }
67 if self.lex_preprocessor(state) {
68 continue;
69 }
70 if self.lex_text(state) {
71 continue;
72 }
73 else {
74 let start = state.get_position();
75 if let Some(ch) = state.peek() {
76 state.advance(ch.len_utf8());
77 state.add_token(CTokenType::Error, start, state.get_position())
78 }
79 }
80 state.advance_if_dead_lock(safe_point)
81 }
82 Ok(())
83 }
84
85 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
87 let start = state.get_position();
88 let mut count = 0;
89
90 while let Some(ch) = state.peek() {
91 if ch.is_whitespace() && ch != '\n' && ch != '\r' {
92 state.advance(ch.len_utf8());
93 count += 1
94 }
95 else {
96 break;
97 }
98 }
99
100 if count > 0 {
101 state.add_token(CTokenType::Whitespace, start, state.get_position());
102 true
103 }
104 else {
105 false
106 }
107 }
108
109 fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
110 let start = state.get_position();
111
112 if state.consume_if_starts_with("//") {
113 while let Some(ch) = state.peek() {
114 if ch == '\n' || ch == '\r' {
115 break;
116 }
117 state.advance(ch.len_utf8())
118 }
119 state.add_token(CTokenType::LineComment, start, state.get_position());
120 return true;
121 }
122 else if state.consume_if_starts_with("/*") {
123 while state.not_at_end() {
124 if state.consume_if_starts_with("*/") {
125 break;
126 }
127 if let Some(ch) = state.peek() { state.advance(ch.len_utf8()) } else { break }
128 }
129 state.add_token(CTokenType::BlockComment, start, state.get_position());
130 return true;
131 }
132 false
133 }
134
135 fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
136 let start = state.get_position();
137
138 if let Some(ch) = state.peek() {
139 if ch == '\n' {
140 state.advance(1);
141 state.add_token(CTokenType::Whitespace, start, state.get_position());
142 return true;
143 }
144 else if ch == '\r' {
145 state.advance(1);
146 if state.peek() == Some('\n') {
147 state.advance(1)
148 }
149 state.add_token(CTokenType::Whitespace, start, state.get_position());
150 return true;
151 }
152 }
153 false
154 }
155
156 fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
157 let start = state.get_position();
158
159 if let Some('"') = state.peek() {
160 state.advance(1);
161 while let Some(ch) = state.peek() {
162 if ch == '"' {
163 state.advance(1);
164 break;
165 }
166 else if ch == '\\' {
167 state.advance(1);
168 if let Some(escaped) = state.peek() {
169 state.advance(escaped.len_utf8())
170 }
171 }
172 else {
173 state.advance(ch.len_utf8())
174 }
175 }
176 state.add_token(CTokenType::StringLiteral, start, state.get_position());
177 return true;
178 }
179 false
180 }
181
182 fn lex_char<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
183 let start = state.get_position();
184
185 if let Some('\'') = state.peek() {
186 state.advance(1);
187 while let Some(ch) = state.peek() {
188 if ch == '\'' {
189 state.advance(1);
190 break;
191 }
192 else if ch == '\\' {
193 state.advance(1);
194 if let Some(escaped) = state.peek() {
195 state.advance(escaped.len_utf8())
196 }
197 }
198 else {
199 state.advance(ch.len_utf8())
200 }
201 }
202 state.add_token(CTokenType::CharConstant, start, state.get_position());
203 return true;
204 }
205 false
206 }
207
208 fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
209 let start = state.get_position();
210
211 if let Some(ch) = state.peek() {
212 if ch.is_ascii_digit() {
213 state.advance(1);
214 while let Some(ch) = state.peek() {
215 if ch.is_ascii_alphanumeric() || ch == '.' || ch == 'e' || ch == 'E' || ch == '+' || ch == '-' { state.advance(ch.len_utf8()) } else { break }
216 }
217
218 let text = state.get_text_in((start..state.get_position()).into());
219 let kind = if text.contains('.') || text.contains('e') || text.contains('E') { CTokenType::FloatConstant } else { CTokenType::IntConstant };
220 state.add_token(kind, start, state.get_position());
221 return true;
222 }
223 }
224 false
225 }
226
227 fn lex_keyword_or_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
228 let start = state.get_position();
229
230 if let Some(ch) = state.peek() {
231 if ch.is_ascii_alphabetic() || ch == '_' {
232 state.advance(ch.len_utf8());
233 while let Some(ch) = state.peek() {
234 if ch.is_ascii_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
235 }
236
237 let text = state.get_text_in((start..state.get_position()).into());
238 let kind = if C_KEYWORDS.contains(&&*text) {
239 match &*text {
240 "auto" => CTokenType::Auto,
241 "register" => CTokenType::Register,
242 "static" => CTokenType::Static,
243 "extern" => CTokenType::Extern,
244 "typedef" => CTokenType::Typedef,
245 "void" => CTokenType::Void,
246 "char" => CTokenType::Char,
247 "short" => CTokenType::Short,
248 "int" => CTokenType::Int,
249 "long" => CTokenType::Long,
250 "float" => CTokenType::Float,
251 "double" => CTokenType::Double,
252 "signed" => CTokenType::Signed,
253 "unsigned" => CTokenType::Unsigned,
254 "struct" => CTokenType::Struct,
255 "union" => CTokenType::Union,
256 "enum" => CTokenType::Enum,
257 "const" => CTokenType::Const,
258 "volatile" => CTokenType::Volatile,
259 "restrict" => CTokenType::Restrict,
260 "if" => CTokenType::If,
261 "else" => CTokenType::Else,
262 "switch" => CTokenType::Switch,
263 "case" => CTokenType::Case,
264 "default" => CTokenType::Default,
265 "for" => CTokenType::For,
266 "while" => CTokenType::While,
267 "do" => CTokenType::Do,
268 "break" => CTokenType::Break,
269 "continue" => CTokenType::Continue,
270 "goto" => CTokenType::Goto,
271 "return" => CTokenType::Return,
272 "sizeof" => CTokenType::Sizeof,
273 "inline" => CTokenType::Inline,
274 "_Bool" => CTokenType::Bool,
275 "_Complex" => CTokenType::Complex,
276 "_Imaginary" => CTokenType::Imaginary,
277 "_Alignas" => CTokenType::Alignas,
278 "_Alignof" => CTokenType::Alignof,
279 "_Atomic" => CTokenType::Atomic,
280 "_Static_assert" => CTokenType::StaticAssert,
281 "_Thread_local" => CTokenType::ThreadLocal,
282 "_Generic" => CTokenType::Generic,
283 "_Noreturn" => CTokenType::Noreturn,
284 _ => CTokenType::Identifier,
285 }
286 }
287 else {
288 CTokenType::Identifier
289 };
290 state.add_token(kind, start, state.get_position());
291 return true;
292 }
293 }
294 false
295 }
296
297 fn lex_operator_or_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
298 let start = state.get_position();
299
300 if let Some(ch) = state.peek() {
301 let three_char = if let Some(next_ch) = state.peek_next_n(1) { if let Some(third_ch) = state.peek_next_n(2) { Some(format!("{}{}{}", ch, next_ch, third_ch)) } else { None } } else { None };
302
303 let two_char = if let Some(next_ch) = state.peek_next_n(1) { format!("{}{}", ch, next_ch) } else { String::new() };
304
305 if let Some(ref three) = three_char {
307 if let Some(&kind) = C_THREE_CHAR_OPERATORS.get(three.as_str()) {
308 state.advance(3);
309 state.add_token(kind, start, state.get_position());
310 return true;
311 }
312 }
313
314 if let Some(&kind) = C_TWO_CHAR_OPERATORS.get(two_char.as_str()) {
316 state.advance(2);
317 state.add_token(kind, start, state.get_position());
318 return true;
319 }
320
321 let kind = match ch {
323 '(' => CTokenType::LeftParen,
324 ')' => CTokenType::RightParen,
325 '[' => CTokenType::LeftBracket,
326 ']' => CTokenType::RightBracket,
327 '{' => CTokenType::LeftBrace,
328 '}' => CTokenType::RightBrace,
329 ',' => CTokenType::Comma,
330 ';' => CTokenType::Semicolon,
331 ':' => CTokenType::Colon,
332 '.' => CTokenType::Dot,
333 '?' => CTokenType::Question,
334 '+' => CTokenType::Plus,
335 '-' => CTokenType::Minus,
336 '*' => CTokenType::Star,
337 '/' => CTokenType::Slash,
338 '%' => CTokenType::Percent,
339 '=' => CTokenType::Assign,
340 '<' => CTokenType::Less,
341 '>' => CTokenType::Greater,
342 '!' => CTokenType::LogicalNot,
343 '&' => CTokenType::BitAnd,
344 '|' => CTokenType::BitOr,
345 '^' => CTokenType::BitXor,
346 '~' => CTokenType::BitNot,
347 _ => return false,
348 };
349 state.advance(1);
350 state.add_token(kind, start, state.get_position());
351 return true;
352 }
353 false
354 }
355
356 fn lex_preprocessor<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
357 let start = state.get_position();
358
359 if state.consume_if_starts_with("#") {
360 while let Some(ch) = state.peek() {
361 if ch == '\n' || ch == '\r' {
362 break;
363 }
364 state.advance(ch.len_utf8())
365 }
366 state.add_token(CTokenType::Preprocessor, start, state.get_position());
367 return true;
368 }
369 false
370 }
371
372 fn lex_text<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
373 let start = state.get_position();
374
375 if let Some(ch) = state.peek() {
376 if !ch.is_whitespace() && !ch.is_ascii_alphanumeric() && !"()[]{},.;:?+-*/%=<>!&|^~#\"'_".contains(ch) {
377 state.advance(ch.len_utf8());
378 state.add_token(CTokenType::Text, start, state.get_position());
379 return true;
380 }
381 }
382 false
383 }
384}
385
/// All 44 C11 keywords recognized by the lexer.
///
/// A plain `static` slice: the list is fully constant, so the lazy
/// initialization previously routed through `LazyLock` bought nothing.
/// Call sites using `.contains(&&*text)` are unaffected.
static C_KEYWORDS: &[&str] = &[
    "auto",
    "register",
    "static",
    "extern",
    "typedef",
    "void",
    "char",
    "short",
    "int",
    "long",
    "float",
    "double",
    "signed",
    "unsigned",
    "struct",
    "union",
    "enum",
    "const",
    "volatile",
    "restrict",
    "if",
    "else",
    "switch",
    "case",
    "default",
    "for",
    "while",
    "do",
    "break",
    "continue",
    "goto",
    "return",
    "sizeof",
    "inline",
    "_Bool",
    "_Complex",
    "_Imaginary",
    "_Alignas",
    "_Alignof",
    "_Atomic",
    "_Static_assert",
    "_Thread_local",
    "_Generic",
    "_Noreturn",
];
434
435static C_TWO_CHAR_OPERATORS: LazyLock<std::collections::HashMap<&str, CTokenType>> = LazyLock::new(|| {
436 let mut map = std::collections::HashMap::new();
437 map.insert("+=", CTokenType::PlusAssign);
438 map.insert("-=", CTokenType::MinusAssign);
439 map.insert("*=", CTokenType::StarAssign);
440 map.insert("/=", CTokenType::SlashAssign);
441 map.insert("%=", CTokenType::PercentAssign);
442 map.insert("==", CTokenType::Equal);
443 map.insert("!=", CTokenType::NotEqual);
444 map.insert("<=", CTokenType::LessEqual);
445 map.insert(">=", CTokenType::GreaterEqual);
446 map.insert("&&", CTokenType::LogicalAnd);
447 map.insert("||", CTokenType::LogicalOr);
448 map.insert("<<", CTokenType::LeftShift);
449 map.insert(">>", CTokenType::RightShift);
450 map.insert("&=", CTokenType::AndAssign);
451 map.insert("|=", CTokenType::OrAssign);
452 map.insert("^=", CTokenType::XorAssign);
453 map.insert("++", CTokenType::Increment);
454 map.insert("--", CTokenType::Decrement);
455 map.insert("->", CTokenType::Arrow);
456 map
457});
458
459static C_THREE_CHAR_OPERATORS: LazyLock<std::collections::HashMap<&str, CTokenType>> = LazyLock::new(|| {
460 let mut map = std::collections::HashMap::new();
461 map.insert("<<=", CTokenType::LeftShiftAssign);
462 map.insert(">>=", CTokenType::RightShiftAssign);
463 map
464});