1mod token_type;
2pub use token_type::CppTokenType;
3
4use crate::language::CppLanguage;
5use oak_core::{Lexer, LexerCache, LexerState, TextEdit, lexer::LexOutput, source::Source};
6
7type State<'a, S> = LexerState<'a, S, CppLanguage>;
8
/// Lexer that splits C++ source text into [`CppTokenType`] tokens.
///
/// The lexer only borrows its language configuration, so it is tied to the
/// `'config` lifetime of that borrow.
pub struct CppLexer<'config> {
    /// Borrowed language configuration. None of the lexing routines in this
    /// file read it, hence the leading underscore.
    _config: &'config CppLanguage,
}
12
/// Alias that exposes [`CppLexer`] under the name `CLexer`.
// NOTE(review): presumably kept so C sources can be lexed with the same implementation — confirm with callers.
pub type CLexer<'config> = CppLexer<'config>;
15
16impl<'config> CppLexer<'config> {
17 pub fn new(config: &'config CppLanguage) -> Self {
18 Self { _config: config }
19 }
20
21 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
23 let start_pos = state.get_position();
24
25 while let Some(ch) = state.peek() {
26 if ch == ' ' || ch == '\t' {
27 state.advance(ch.len_utf8());
28 }
29 else {
30 break;
31 }
32 }
33
34 if state.get_position() > start_pos {
35 state.add_token(CppTokenType::Whitespace, start_pos, state.get_position());
36 true
37 }
38 else {
39 false
40 }
41 }
42
43 fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
45 let start_pos = state.get_position();
46
47 if let Some('\n') = state.peek() {
48 state.advance(1);
49 state.add_token(CppTokenType::Newline, start_pos, state.get_position());
50 true
51 }
52 else if let Some('\r') = state.peek() {
53 state.advance(1);
54 if let Some('\n') = state.peek() {
55 state.advance(1);
56 }
57 state.add_token(CppTokenType::Newline, start_pos, state.get_position());
58 true
59 }
60 else {
61 false
62 }
63 }
64
65 fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
67 let start_pos = state.get_position();
68
69 if let Some('/') = state.peek() {
70 if let Some('/') = state.peek_next_n(1) {
71 state.advance(2);
73 while let Some(ch) = state.peek() {
74 if ch == '\n' || ch == '\r' {
75 break;
76 }
77 state.advance(ch.len_utf8());
78 }
79 state.add_token(CppTokenType::Comment, start_pos, state.get_position());
80 true
81 }
82 else if let Some('*') = state.peek_next_n(1) {
83 state.advance(2);
85 while let Some(ch) = state.peek() {
86 if ch == '*' && state.peek_next_n(1) == Some('/') {
87 state.advance(2);
88 break;
89 }
90 state.advance(ch.len_utf8());
91 }
92 state.add_token(CppTokenType::Comment, start_pos, state.get_position());
93 true
94 }
95 else {
96 false
97 }
98 }
99 else {
100 false
101 }
102 }
103
104 fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
106 let start_pos = state.get_position();
107
108 if let Some('"') = state.peek() {
109 state.advance(1);
110
111 let mut escaped = false;
112 while let Some(ch) = state.peek() {
113 if escaped {
114 escaped = false;
115 state.advance(ch.len_utf8());
116 continue;
117 }
118
119 if ch == '\\' {
120 escaped = true;
121 state.advance(1);
122 continue;
123 }
124
125 if ch == '"' {
126 state.advance(1);
127 break;
128 }
129
130 if ch == '\n' || ch == '\r' {
131 break; }
133
134 state.advance(ch.len_utf8());
135 }
136
137 state.add_token(CppTokenType::StringLiteral, start_pos, state.get_position());
138 true
139 }
140 else {
141 false
142 }
143 }
144
145 fn lex_character<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
147 let start_pos = state.get_position();
148
149 if let Some('\'') = state.peek() {
150 state.advance(1);
151
152 let mut escaped = false;
153 while let Some(ch) = state.peek() {
154 if escaped {
155 escaped = false;
156 state.advance(ch.len_utf8());
157 continue;
158 }
159
160 if ch == '\\' {
161 escaped = true;
162 state.advance(1);
163 continue;
164 }
165
166 if ch == '\'' {
167 state.advance(1);
168 break;
169 }
170
171 if ch == '\n' || ch == '\r' {
172 break; }
174
175 state.advance(ch.len_utf8());
176 }
177
178 state.add_token(CppTokenType::CharacterLiteral, start_pos, state.get_position());
179 true
180 }
181 else {
182 false
183 }
184 }
185
186 fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
188 let start_pos = state.get_position();
189
190 if let Some(ch) = state.peek() {
191 if ch.is_ascii_digit() || (ch == '.' && state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit())) {
192 let mut is_float = false;
193
194 if ch == '0' {
196 if let Some(next_ch) = state.peek_next_n(1) {
197 if next_ch == 'x' || next_ch == 'X' {
198 state.advance(2);
200 while let Some(ch) = state.peek() {
201 if ch.is_ascii_hexdigit() {
202 state.advance(1);
203 }
204 else {
205 break;
206 }
207 }
208 }
209 else if next_ch == 'b' || next_ch == 'B' {
210 state.advance(2);
212 while let Some(ch) = state.peek() {
213 if ch == '0' || ch == '1' {
214 state.advance(1);
215 }
216 else {
217 break;
218 }
219 }
220 }
221 else if next_ch.is_ascii_digit() {
222 while let Some(ch) = state.peek() {
224 if ch.is_ascii_digit() {
225 state.advance(1);
226 }
227 else {
228 break;
229 }
230 }
231 }
232 else {
233 state.advance(1); }
235 }
236 else {
237 state.advance(1); }
239 }
240 else {
241 while let Some(ch) = state.peek() {
243 if ch.is_ascii_digit() {
244 state.advance(1);
245 }
246 else {
247 break;
248 }
249 }
250 }
251
252 if let Some('.') = state.peek() {
254 if let Some(next_ch) = state.peek_next_n(1) {
255 if next_ch.is_ascii_digit() {
256 is_float = true;
257 state.advance(1); while let Some(ch) = state.peek() {
259 if ch.is_ascii_digit() {
260 state.advance(1);
261 }
262 else {
263 break;
264 }
265 }
266 }
267 }
268 }
269
270 if let Some(ch) = state.peek() {
272 if ch == 'e' || ch == 'E' {
273 is_float = true;
274 state.advance(1);
275 if let Some(sign) = state.peek() {
276 if sign == '+' || sign == '-' {
277 state.advance(1);
278 }
279 }
280 while let Some(ch) = state.peek() {
281 if ch.is_ascii_digit() {
282 state.advance(1);
283 }
284 else {
285 break;
286 }
287 }
288 }
289 }
290
291 while let Some(ch) = state.peek() {
293 if ch.is_ascii_alphabetic() {
294 state.advance(1);
295 }
296 else {
297 break;
298 }
299 }
300
301 let token_kind = if is_float { CppTokenType::FloatLiteral } else { CppTokenType::IntegerLiteral };
302 state.add_token(token_kind, start_pos, state.get_position());
303 true
304 }
305 else {
306 false
307 }
308 }
309 else {
310 false
311 }
312 }
313
314 fn lex_keyword_or_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
316 let start_pos = state.get_position();
317
318 if let Some(ch) = state.peek() {
319 if ch.is_ascii_alphabetic() || ch == '_' {
320 while let Some(ch) = state.peek() {
321 if ch.is_ascii_alphanumeric() || ch == '_' {
322 state.advance(ch.len_utf8());
323 }
324 else {
325 break;
326 }
327 }
328
329 let text = state.get_text_in((start_pos..state.get_position()).into());
330 let token_kind = match text.as_ref() {
331 "alignas" | "alignof" | "and" | "and_eq" | "asm" | "atomic_cancel" | "atomic_commit" | "atomic_noexcept" | "auto" | "bitand" | "bitor" | "bool" | "break" | "case" | "catch" | "char" | "char8_t" | "char16_t" | "char32_t" | "class"
333 | "compl" | "concept" | "const" | "consteval" | "constexpr" | "constinit" | "const_cast" | "continue" | "co_await" | "co_return" | "co_yield" | "decltype" | "default" | "delete" | "do" | "double" | "dynamic_cast" | "else" | "enum"
334 | "explicit" | "export" | "extern" | "float" | "for" | "friend" | "goto" | "if" | "inline" | "int" | "long" | "mutable" | "namespace" | "new" | "noexcept" | "not" | "not_eq" | "nullptr" | "operator" | "or" | "or_eq" | "private"
335 | "protected" | "public" | "reflexpr" | "register" | "reinterpret_cast" | "requires" | "return" | "short" | "signed" | "sizeof" | "static" | "static_assert" | "static_cast" | "struct" | "switch" | "synchronized" | "template"
336 | "this" | "thread_local" | "throw" | "try" | "typedef" | "typeid" | "typename" | "union" | "unsigned" | "using" | "virtual" | "void" | "volatile" | "wchar_t" | "while" | "xor" | "xor_eq" => CppTokenType::Keyword,
337 "true" | "false" => CppTokenType::BooleanLiteral,
338 _ => CppTokenType::Identifier,
339 };
340
341 state.add_token(token_kind, start_pos, state.get_position());
342 true
343 }
344 else {
345 false
346 }
347 }
348 else {
349 false
350 }
351 }
352
353 fn lex_operator<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
355 let start_pos = state.get_position();
356
357 if let Some(ch) = state.peek() {
358 let (token_kind, advance_count) = match ch {
359 '+' => {
360 if let Some('+') = state.peek_next_n(1) {
361 (CppTokenType::Increment, 2)
362 }
363 else if let Some('=') = state.peek_next_n(1) {
364 (CppTokenType::PlusAssign, 2)
365 }
366 else {
367 (CppTokenType::Plus, 1)
368 }
369 }
370 '-' => {
371 if let Some('-') = state.peek_next_n(1) {
372 (CppTokenType::Decrement, 2)
373 }
374 else if let Some('=') = state.peek_next_n(1) {
375 (CppTokenType::MinusAssign, 2)
376 }
377 else if let Some('>') = state.peek_next_n(1) {
378 (CppTokenType::Arrow, 2)
379 }
380 else {
381 (CppTokenType::Minus, 1)
382 }
383 }
384 '*' => {
385 if let Some('=') = state.peek_next_n(1) {
386 (CppTokenType::StarAssign, 2)
387 }
388 else {
389 (CppTokenType::Star, 1)
390 }
391 }
392 '/' => {
393 if let Some('=') = state.peek_next_n(1) {
394 (CppTokenType::SlashAssign, 2)
395 }
396 else {
397 (CppTokenType::Slash, 1)
398 }
399 }
400 '%' => {
401 if let Some('=') = state.peek_next_n(1) {
402 (CppTokenType::PercentAssign, 2)
403 }
404 else {
405 (CppTokenType::Percent, 1)
406 }
407 }
408 '=' => {
409 if let Some('=') = state.peek_next_n(1) {
410 (CppTokenType::Equal, 2)
411 }
412 else {
413 (CppTokenType::Assign, 1)
414 }
415 }
416 '!' => {
417 if let Some('=') = state.peek_next_n(1) {
418 (CppTokenType::NotEqual, 2)
419 }
420 else {
421 (CppTokenType::LogicalNot, 1)
422 }
423 }
424 '<' => {
425 if let Some('<') = state.peek_next_n(1) {
426 if let Some('=') = state.peek_next_n(2) { (CppTokenType::LeftShiftAssign, 3) } else { (CppTokenType::LeftShift, 2) }
427 }
428 else if let Some('=') = state.peek_next_n(1) {
429 (CppTokenType::LessEqual, 2)
430 }
431 else {
432 (CppTokenType::Less, 1)
433 }
434 }
435 '>' => {
436 if let Some('>') = state.peek_next_n(1) {
437 if let Some('=') = state.peek_next_n(2) { (CppTokenType::RightShiftAssign, 3) } else { (CppTokenType::RightShift, 2) }
438 }
439 else if let Some('=') = state.peek_next_n(1) {
440 (CppTokenType::GreaterEqual, 2)
441 }
442 else {
443 (CppTokenType::Greater, 1)
444 }
445 }
446 '&' => {
447 if let Some('&') = state.peek_next_n(1) {
448 (CppTokenType::LogicalAnd, 2)
449 }
450 else if let Some('=') = state.peek_next_n(1) {
451 (CppTokenType::AndAssign, 2)
452 }
453 else {
454 (CppTokenType::BitAnd, 1)
455 }
456 }
457 '|' => {
458 if let Some('|') = state.peek_next_n(1) {
459 (CppTokenType::LogicalOr, 2)
460 }
461 else if let Some('=') = state.peek_next_n(1) {
462 (CppTokenType::OrAssign, 2)
463 }
464 else {
465 (CppTokenType::BitOr, 1)
466 }
467 }
468 '^' => {
469 if let Some('=') = state.peek_next_n(1) {
470 (CppTokenType::XorAssign, 2)
471 }
472 else {
473 (CppTokenType::BitXor, 1)
474 }
475 }
476 '~' => (CppTokenType::BitNot, 1),
477 '?' => (CppTokenType::Question, 1),
478 ':' => {
479 if let Some(':') = state.peek_next_n(1) {
480 (CppTokenType::Scope, 2)
481 }
482 else {
483 (CppTokenType::Colon, 1)
484 }
485 }
486 '.' => (CppTokenType::Dot, 1),
487 _ => return false,
488 };
489
490 state.advance(advance_count);
491 state.add_token(token_kind, start_pos, state.get_position());
492 true
493 }
494 else {
495 false
496 }
497 }
498
499 fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
501 let start_pos = state.get_position();
502
503 if let Some(ch) = state.peek() {
504 let token_kind = match ch {
505 '(' => CppTokenType::LeftParen,
506 ')' => CppTokenType::RightParen,
507 '[' => CppTokenType::LeftBracket,
508 ']' => CppTokenType::RightBracket,
509 '{' => CppTokenType::LeftBrace,
510 '}' => CppTokenType::RightBrace,
511 ',' => CppTokenType::Comma,
512 ';' => CppTokenType::Semicolon,
513 _ => return false,
514 };
515
516 state.advance(1);
517 state.add_token(token_kind, start_pos, state.get_position());
518 true
519 }
520 else {
521 false
522 }
523 }
524
525 fn lex_preprocessor<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
527 let start_pos = state.get_position();
528
529 if let Some('#') = state.peek() {
530 while let Some(ch) = state.peek() {
532 if ch == '\n' || ch == '\r' {
533 break;
534 }
535 state.advance(ch.len_utf8());
536 }
537
538 state.add_token(CppTokenType::Preprocessor, start_pos, state.get_position());
539 true
540 }
541 else {
542 false
543 }
544 }
545}
546
547impl<'config> Lexer<CppLanguage> for CppLexer<'config> {
548 fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<CppLanguage>) -> LexOutput<CppLanguage> {
549 let mut state = LexerState::new(source);
550 let result = self.run(&mut state);
551 state.finish_with_cache(result, cache)
552 }
553}
554
555impl<'config> CppLexer<'config> {
556 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), oak_core::OakError> {
557 while state.not_at_end() {
558 if self.skip_whitespace(state) {
560 continue;
561 }
562
563 if self.lex_newline(state) {
564 continue;
565 }
566
567 if self.lex_comment(state) {
568 continue;
569 }
570
571 if self.lex_string(state) {
572 continue;
573 }
574
575 if self.lex_character(state) {
576 continue;
577 }
578
579 if self.lex_number(state) {
580 continue;
581 }
582
583 if self.lex_keyword_or_identifier(state) {
584 continue;
585 }
586
587 if self.lex_preprocessor(state) {
588 continue;
589 }
590
591 if self.lex_operator(state) {
592 continue;
593 }
594
595 if self.lex_delimiter(state) {
596 continue;
597 }
598
599 let start = state.get_position();
601 if let Some(ch) = state.peek() {
602 state.advance(ch.len_utf8());
603 state.add_token(CppTokenType::Error, start, state.get_position());
604 }
605 }
606 Ok(())
607 }
608}