1use crate::{kind::CSyntaxKind, language::CLanguage};
2use oak_core::{IncrementalCache, Lexer, LexerState, OakError, lexer::LexOutput, source::Source};
3use serde::Serialize;
4use std::sync::LazyLock;
5
6type State<S> = LexerState<S, CLanguage>;
7
8#[derive(Clone, Copy, Debug, Serialize)]
9pub struct CLexer<'config> {
10 config: &'config CLanguage,
11}
12
13impl<'config> CLexer<'config> {
14 pub fn new(config: &'config CLanguage) -> Self {
15 Self { config }
16 }
17
18 fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
19 while state.not_at_end() {
20 if self.skip_whitespace(state) {
21 continue;
22 }
23 if self.skip_comment(state) {
24 continue;
25 }
26 if self.lex_newline(state) {
27 continue;
28 }
29 if self.lex_string(state) {
30 continue;
31 }
32 if self.lex_char(state) {
33 continue;
34 }
35 if self.lex_number(state) {
36 continue;
37 }
38 if self.lex_keyword_or_identifier(state) {
39 continue;
40 }
41 if self.lex_operator_or_delimiter(state) {
42 continue;
43 }
44 if self.lex_preprocessor(state) {
45 continue;
46 }
47 if self.lex_text(state) {
48 continue;
49 }
50 else {
51 state.advance(1);
53 }
54 }
55 Ok(())
56 }
57
58 fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
59 let start = state.get_position();
60 let mut count = 0;
61
62 while let Some(ch) = state.current() {
63 if ch.is_whitespace() && ch != '\n' && ch != '\r' {
64 state.advance(1);
65 count += 1;
66 }
67 else {
68 break;
69 }
70 }
71
72 if count > 0 {
73 state.add_token(CSyntaxKind::Whitespace, start, state.get_position());
74 true
75 }
76 else {
77 false
78 }
79 }
80
81 fn skip_comment<S: Source>(&self, state: &mut State<S>) -> bool {
82 let start = state.get_position();
83
84 if let Some('/') = state.current() {
85 if let Some('/') = state.peek() {
86 state.advance(2);
88 while let Some(ch) = state.current() {
89 if ch == '\n' || ch == '\r' {
90 break;
91 }
92 state.advance(1);
93 }
94 state.add_token(CSyntaxKind::Comment, start, state.get_position());
95 return true;
96 }
97 else if let Some('*') = state.peek() {
98 state.advance(2);
100 while let Some(ch) = state.current() {
101 if ch == '*' && state.peek() == Some('/') {
102 state.advance(2);
103 break;
104 }
105 state.advance(1);
106 }
107 state.add_token(CSyntaxKind::Comment, start, state.get_position());
108 return true;
109 }
110 }
111 false
112 }
113
114 fn lex_newline<S: Source>(&self, state: &mut State<S>) -> bool {
115 let start = state.get_position();
116
117 if let Some(ch) = state.current() {
118 if ch == '\n' {
119 state.advance(1);
120 state.add_token(CSyntaxKind::Whitespace, start, state.get_position());
121 return true;
122 }
123 else if ch == '\r' {
124 state.advance(1);
125 if state.current() == Some('\n') {
126 state.advance(1);
127 }
128 state.add_token(CSyntaxKind::Whitespace, start, state.get_position());
129 return true;
130 }
131 }
132 false
133 }
134
135 fn lex_string<S: Source>(&self, state: &mut State<S>) -> bool {
136 let start = state.get_position();
137
138 if let Some('"') = state.current() {
139 state.advance(1);
140 while let Some(ch) = state.current() {
141 if ch == '"' {
142 state.advance(1);
143 break;
144 }
145 else if ch == '\\' {
146 state.advance(1);
147 if state.current().is_some() {
148 state.advance(1);
149 }
150 }
151 else {
152 state.advance(1);
153 }
154 }
155 state.add_token(CSyntaxKind::StringLiteral, start, state.get_position());
156 return true;
157 }
158 false
159 }
160
161 fn lex_char<S: Source>(&self, state: &mut State<S>) -> bool {
162 let start = state.get_position();
163
164 if let Some('\'') = state.current() {
165 state.advance(1);
166 while let Some(ch) = state.current() {
167 if ch == '\'' {
168 state.advance(1);
169 break;
170 }
171 else if ch == '\\' {
172 state.advance(1);
173 if state.current().is_some() {
174 state.advance(1);
175 }
176 }
177 else {
178 state.advance(1);
179 }
180 }
181 state.add_token(CSyntaxKind::CharLiteral, start, state.get_position());
182 return true;
183 }
184 false
185 }
186
187 fn lex_number<S: Source>(&self, state: &mut State<S>) -> bool {
188 let start = state.get_position();
189
190 if let Some(ch) = state.current() {
191 if ch.is_ascii_digit() {
192 state.advance(1);
193 while let Some(ch) = state.current() {
194 if ch.is_ascii_alphanumeric() || ch == '.' || ch == 'e' || ch == 'E' || ch == '+' || ch == '-' {
195 state.advance(1);
196 }
197 else {
198 break;
199 }
200 }
201
202 let text = state.get_text_in((start..state.get_position()).into());
203 let kind = if text.contains('.') || text.contains('e') || text.contains('E') {
204 CSyntaxKind::FloatLiteral
205 }
206 else {
207 CSyntaxKind::IntegerLiteral
208 };
209 state.add_token(kind, start, state.get_position());
210 return true;
211 }
212 }
213 false
214 }
215
216 fn lex_keyword_or_identifier<S: Source>(&self, state: &mut State<S>) -> bool {
217 let start = state.get_position();
218
219 if let Some(ch) = state.current() {
220 if ch.is_ascii_alphabetic() || ch == '_' {
221 state.advance(1);
222 while let Some(ch) = state.current() {
223 if ch.is_ascii_alphanumeric() || ch == '_' {
224 state.advance(1);
225 }
226 else {
227 break;
228 }
229 }
230
231 let text = state.get_text_in((start..state.get_position()).into());
232 let kind = if C_KEYWORDS.contains(&text) {
233 match text {
234 "auto" => CSyntaxKind::Auto,
235 "register" => CSyntaxKind::Register,
236 "static" => CSyntaxKind::Static,
237 "extern" => CSyntaxKind::Extern,
238 "typedef" => CSyntaxKind::Typedef,
239 "void" => CSyntaxKind::Void,
240 "char" => CSyntaxKind::Char,
241 "short" => CSyntaxKind::Short,
242 "int" => CSyntaxKind::Int,
243 "long" => CSyntaxKind::Long,
244 "float" => CSyntaxKind::Float,
245 "double" => CSyntaxKind::Double,
246 "signed" => CSyntaxKind::Signed,
247 "unsigned" => CSyntaxKind::Unsigned,
248 "struct" => CSyntaxKind::Struct,
249 "union" => CSyntaxKind::Union,
250 "enum" => CSyntaxKind::Enum,
251 "const" => CSyntaxKind::Const,
252 "volatile" => CSyntaxKind::Volatile,
253 "restrict" => CSyntaxKind::Restrict,
254 "if" => CSyntaxKind::If,
255 "else" => CSyntaxKind::Else,
256 "switch" => CSyntaxKind::Switch,
257 "case" => CSyntaxKind::Case,
258 "default" => CSyntaxKind::Default,
259 "for" => CSyntaxKind::For,
260 "while" => CSyntaxKind::While,
261 "do" => CSyntaxKind::Do,
262 "break" => CSyntaxKind::Break,
263 "continue" => CSyntaxKind::Continue,
264 "goto" => CSyntaxKind::Goto,
265 "return" => CSyntaxKind::Return,
266 "sizeof" => CSyntaxKind::Sizeof,
267 "inline" => CSyntaxKind::Inline,
268 "_Bool" => CSyntaxKind::Bool,
269 "_Complex" => CSyntaxKind::Complex,
270 "_Imaginary" => CSyntaxKind::Imaginary,
271 "_Alignas" => CSyntaxKind::Alignas,
272 "_Alignof" => CSyntaxKind::Alignof,
273 "_Atomic" => CSyntaxKind::Atomic,
274 "_Static_assert" => CSyntaxKind::StaticAssert,
275 "_Thread_local" => CSyntaxKind::ThreadLocal,
276 "_Generic" => CSyntaxKind::Generic,
277 "_Noreturn" => CSyntaxKind::Noreturn,
278 _ => CSyntaxKind::Identifier,
279 }
280 }
281 else {
282 CSyntaxKind::Identifier
283 };
284 state.add_token(kind, start, state.get_position());
285 return true;
286 }
287 }
288 false
289 }
290
291 fn lex_operator_or_delimiter<S: Source>(&self, state: &mut State<S>) -> bool {
292 let start = state.get_position();
293
294 if let Some(ch) = state.current() {
295 let three_char = if let Some(next_ch) = state.peek_next_n(1) {
296 if let Some(third_ch) = state.peek_next_n(2) { Some(format!("{}{}{}", ch, next_ch, third_ch)) } else { None }
297 }
298 else {
299 None
300 };
301
302 let two_char = if let Some(next_ch) = state.peek_next_n(1) { format!("{}{}", ch, next_ch) } else { String::new() };
303
304 if let Some(ref three) = three_char {
306 if let Some(&kind) = C_THREE_CHAR_OPERATORS.get(three.as_str()) {
307 state.advance(3);
308 state.add_token(kind, start, state.get_position());
309 return true;
310 }
311 }
312
313 if let Some(&kind) = C_TWO_CHAR_OPERATORS.get(two_char.as_str()) {
315 state.advance(2);
316 state.add_token(kind, start, state.get_position());
317 return true;
318 }
319
320 let kind = match ch {
322 '(' => CSyntaxKind::LeftParen,
323 ')' => CSyntaxKind::RightParen,
324 '[' => CSyntaxKind::LeftBracket,
325 ']' => CSyntaxKind::RightBracket,
326 '{' => CSyntaxKind::LeftBrace,
327 '}' => CSyntaxKind::RightBrace,
328 ',' => CSyntaxKind::Comma,
329 ';' => CSyntaxKind::Semicolon,
330 ':' => CSyntaxKind::Colon,
331 '.' => CSyntaxKind::Dot,
332 '?' => CSyntaxKind::Question,
333 '+' => CSyntaxKind::Plus,
334 '-' => CSyntaxKind::Minus,
335 '*' => CSyntaxKind::Star,
336 '/' => CSyntaxKind::Slash,
337 '%' => CSyntaxKind::Percent,
338 '=' => CSyntaxKind::Assign,
339 '<' => CSyntaxKind::Less,
340 '>' => CSyntaxKind::Greater,
341 '!' => CSyntaxKind::LogicalNot,
342 '&' => CSyntaxKind::BitAnd,
343 '|' => CSyntaxKind::BitOr,
344 '^' => CSyntaxKind::BitXor,
345 '~' => CSyntaxKind::BitNot,
346 _ => return false,
347 };
348 state.advance(1);
349 state.add_token(kind, start, state.get_position());
350 return true;
351 }
352 false
353 }
354
355 fn lex_preprocessor<S: Source>(&self, state: &mut State<S>) -> bool {
356 let start = state.get_position();
357
358 if let Some('#') = state.current() {
359 state.advance(1);
360 while let Some(ch) = state.current() {
361 if ch == '\n' || ch == '\r' {
362 break;
363 }
364 state.advance(1);
365 }
366 state.add_token(CSyntaxKind::PreprocessorDirective, start, state.get_position());
367 return true;
368 }
369 false
370 }
371
372 fn lex_text<S: Source>(&self, state: &mut State<S>) -> bool {
373 let start = state.get_position();
374
375 if let Some(ch) = state.current() {
376 if !ch.is_whitespace() && !ch.is_ascii_alphanumeric() && !"()[]{},.;:?+-*/%=<>!&|^~#\"'_".contains(ch) {
377 state.advance(1);
378 state.add_token(CSyntaxKind::Text, start, state.get_position());
379 return true;
380 }
381 }
382 false
383 }
384}
385
386impl<'config> Lexer<CLanguage> for CLexer<'config> {
387 fn lex_incremental(
388 &self,
389 source: impl Source,
390 _changed: usize,
391 _cache: IncrementalCache<CLanguage>,
392 ) -> LexOutput<CLanguage> {
393 let mut state = LexerState::new_with_cache(source, _changed, _cache);
394 let result = self.run(&mut state);
395 state.finish(result)
396 }
397}
398
/// Every word the lexer treats as a C keyword.
///
/// This is a plain constant slice: the data is known at compile time, so no
/// lazy initialisation is needed. `C_KEYWORDS.contains(&word)` works on it
/// directly.
static C_KEYWORDS: &[&str] = &[
    // Storage classes and typedef.
    "auto",
    "register",
    "static",
    "extern",
    "typedef",
    // Type specifiers.
    "void",
    "char",
    "short",
    "int",
    "long",
    "float",
    "double",
    "signed",
    "unsigned",
    "struct",
    "union",
    "enum",
    // Type qualifiers.
    "const",
    "volatile",
    "restrict",
    // Control flow.
    "if",
    "else",
    "switch",
    "case",
    "default",
    "for",
    "while",
    "do",
    "break",
    "continue",
    "goto",
    "return",
    // Remaining keywords.
    "sizeof",
    "inline",
    "_Bool",
    "_Complex",
    "_Imaginary",
    "_Alignas",
    "_Alignof",
    "_Atomic",
    "_Static_assert",
    "_Thread_local",
    "_Generic",
    "_Noreturn",
];
447
448static C_TWO_CHAR_OPERATORS: LazyLock<std::collections::HashMap<&str, CSyntaxKind>> = LazyLock::new(|| {
449 let mut map = std::collections::HashMap::new();
450 map.insert("+=", CSyntaxKind::PlusAssign);
451 map.insert("-=", CSyntaxKind::MinusAssign);
452 map.insert("*=", CSyntaxKind::StarAssign);
453 map.insert("/=", CSyntaxKind::SlashAssign);
454 map.insert("%=", CSyntaxKind::PercentAssign);
455 map.insert("==", CSyntaxKind::Equal);
456 map.insert("!=", CSyntaxKind::NotEqual);
457 map.insert("<=", CSyntaxKind::LessEqual);
458 map.insert(">=", CSyntaxKind::GreaterEqual);
459 map.insert("&&", CSyntaxKind::LogicalAnd);
460 map.insert("||", CSyntaxKind::LogicalOr);
461 map.insert("<<", CSyntaxKind::LeftShift);
462 map.insert(">>", CSyntaxKind::RightShift);
463 map.insert("&=", CSyntaxKind::AndAssign);
464 map.insert("|=", CSyntaxKind::OrAssign);
465 map.insert("^=", CSyntaxKind::XorAssign);
466 map.insert("++", CSyntaxKind::Increment);
467 map.insert("--", CSyntaxKind::Decrement);
468 map.insert("->", CSyntaxKind::Arrow);
469 map
470});
471
472static C_THREE_CHAR_OPERATORS: LazyLock<std::collections::HashMap<&str, CSyntaxKind>> = LazyLock::new(|| {
473 let mut map = std::collections::HashMap::new();
474 map.insert("<<=", CSyntaxKind::LeftShiftAssign);
475 map.insert(">>=", CSyntaxKind::RightShiftAssign);
476 map
477});