1#![doc = include_str!("readme.md")]
2pub mod token_type;
3pub use token_type::CppTokenType;
4
5use crate::language::CppLanguage;
6use oak_core::{Lexer, LexerCache, LexerState, TextEdit, lexer::LexOutput, source::Source};
7
8type State<'a, S> = LexerState<'a, S, CppLanguage>;
9
10pub struct CppLexer<'config> {
12 _config: &'config CppLanguage,
13}
14
15pub type CLexer<'config> = CppLexer<'config>;
17
18impl<'config> CppLexer<'config> {
19 pub fn new(config: &'config CppLanguage) -> Self {
21 Self { _config: config }
22 }
23
24 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
26 let start_pos = state.get_position();
27
28 while let Some(ch) = state.peek() {
29 if ch == ' ' || ch == '\t' { state.advance(ch.len_utf8()) } else { break }
30 }
31
32 if state.get_position() > start_pos {
33 state.add_token(CppTokenType::Whitespace, start_pos, state.get_position());
34 true
35 }
36 else {
37 false
38 }
39 }
40
41 fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
43 let start_pos = state.get_position();
44
45 if let Some('\n') = state.peek() {
46 state.advance(1);
47 state.add_token(CppTokenType::Newline, start_pos, state.get_position());
48 true
49 }
50 else if let Some('\r') = state.peek() {
51 state.advance(1);
52 if let Some('\n') = state.peek() {
53 state.advance(1)
54 }
55 state.add_token(CppTokenType::Newline, start_pos, state.get_position());
56 true
57 }
58 else {
59 false
60 }
61 }
62
63 fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
65 let start_pos = state.get_position();
66
67 if let Some('/') = state.peek() {
68 if let Some('/') = state.peek_next_n(1) {
69 state.advance(2);
71 while let Some(ch) = state.peek() {
72 if ch == '\n' || ch == '\r' {
73 break;
74 }
75 state.advance(ch.len_utf8())
76 }
77 state.add_token(CppTokenType::Comment, start_pos, state.get_position());
78 true
79 }
80 else if let Some('*') = state.peek_next_n(1) {
81 state.advance(2);
83 while let Some(ch) = state.peek() {
84 if ch == '*' && state.peek_next_n(1) == Some('/') {
85 state.advance(2);
86 break;
87 }
88 state.advance(ch.len_utf8())
89 }
90 state.add_token(CppTokenType::Comment, start_pos, state.get_position());
91 true
92 }
93 else {
94 false
95 }
96 }
97 else {
98 false
99 }
100 }
101
102 fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
104 let start_pos = state.get_position();
105
106 if let Some('"') = state.peek() {
107 state.advance(1);
108
109 let mut escaped = false;
110 while let Some(ch) = state.peek() {
111 if escaped {
112 escaped = false;
113 state.advance(ch.len_utf8());
114 continue;
115 }
116
117 if ch == '\\' {
118 escaped = true;
119 state.advance(1);
120 continue;
121 }
122
123 if ch == '"' {
124 state.advance(1);
125 break;
126 }
127
128 if ch == '\n' || ch == '\r' {
129 break; }
131
132 state.advance(ch.len_utf8())
133 }
134
135 state.add_token(CppTokenType::StringLiteral, start_pos, state.get_position());
136 true
137 }
138 else {
139 false
140 }
141 }
142
143 fn lex_character<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
145 let start_pos = state.get_position();
146
147 if let Some('\'') = state.peek() {
148 state.advance(1);
149
150 let mut escaped = false;
151 while let Some(ch) = state.peek() {
152 if escaped {
153 escaped = false;
154 state.advance(ch.len_utf8());
155 continue;
156 }
157
158 if ch == '\\' {
159 escaped = true;
160 state.advance(1);
161 continue;
162 }
163
164 if ch == '\'' {
165 state.advance(1);
166 break;
167 }
168
169 if ch == '\n' || ch == '\r' {
170 break; }
172
173 state.advance(ch.len_utf8())
174 }
175
176 state.add_token(CppTokenType::CharacterLiteral, start_pos, state.get_position());
177 true
178 }
179 else {
180 false
181 }
182 }
183
184 fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
186 let start_pos = state.get_position();
187
188 if let Some(ch) = state.peek() {
189 if ch.is_ascii_digit() || (ch == '.' && state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit())) {
190 let mut is_float = false;
191
192 if ch == '0' {
194 if let Some(next_ch) = state.peek_next_n(1) {
195 if next_ch == 'x' || next_ch == 'X' {
196 state.advance(2);
198 while let Some(ch) = state.peek() {
199 if ch.is_ascii_hexdigit() { state.advance(1) } else { break }
200 }
201 }
202 else if next_ch == 'b' || next_ch == 'B' {
203 state.advance(2);
205 while let Some(ch) = state.peek() {
206 if ch == '0' || ch == '1' { state.advance(1) } else { break }
207 }
208 }
209 else if next_ch.is_ascii_digit() {
210 while let Some(ch) = state.peek() {
212 if ch.is_ascii_digit() { state.advance(1) } else { break }
213 }
214 }
215 else {
216 state.advance(1); }
218 }
219 else {
220 state.advance(1); }
222 }
223 else {
224 while let Some(ch) = state.peek() {
226 if ch.is_ascii_digit() { state.advance(1) } else { break }
227 }
228 }
229
230 if let Some('.') = state.peek() {
232 if let Some(next_ch) = state.peek_next_n(1) {
233 if next_ch.is_ascii_digit() {
234 is_float = true;
235 state.advance(1); while let Some(ch) = state.peek() {
237 if ch.is_ascii_digit() { state.advance(1) } else { break }
238 }
239 }
240 }
241 }
242
243 if let Some(ch) = state.peek() {
245 if ch == 'e' || ch == 'E' {
246 is_float = true;
247 state.advance(1);
248 if let Some(sign) = state.peek() {
249 if sign == '+' || sign == '-' {
250 state.advance(1)
251 }
252 }
253 while let Some(ch) = state.peek() {
254 if ch.is_ascii_digit() { state.advance(1) } else { break }
255 }
256 }
257 }
258
259 while let Some(ch) = state.peek() {
261 if ch.is_ascii_alphabetic() { state.advance(1) } else { break }
262 }
263
264 let token_kind = if is_float { CppTokenType::FloatLiteral } else { CppTokenType::IntegerLiteral };
265 state.add_token(token_kind, start_pos, state.get_position());
266 true
267 }
268 else {
269 false
270 }
271 }
272 else {
273 false
274 }
275 }
276
277 fn lex_keyword_or_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
279 let start_pos = state.get_position();
280
281 if let Some(ch) = state.peek() {
282 if ch.is_ascii_alphabetic() || ch == '_' {
283 while let Some(ch) = state.peek() {
284 if ch.is_ascii_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
285 }
286
287 let text = state.get_text_in((start_pos..state.get_position()).into());
288 let token_kind = match text.as_ref() {
289 "alignas" | "alignof" | "and" | "and_eq" | "asm" | "atomic_cancel" | "atomic_commit" | "atomic_noexcept" | "auto" | "bitand" | "bitor" | "bool" | "break" | "case" | "catch" | "char" | "char8_t" | "char16_t" | "char32_t" | "class"
291 | "compl" | "concept" | "const" | "consteval" | "constexpr" | "constinit" | "const_cast" | "continue" | "co_await" | "co_return" | "co_yield" | "decltype" | "default" | "delete" | "do" | "double" | "dynamic_cast" | "else" | "enum"
292 | "explicit" | "export" | "extern" | "float" | "for" | "friend" | "goto" | "if" | "inline" | "int" | "long" | "mutable" | "namespace" | "new" | "noexcept" | "not" | "not_eq" | "nullptr" | "operator" | "or" | "or_eq" | "private"
293 | "protected" | "public" | "reflexpr" | "register" | "reinterpret_cast" | "requires" | "return" | "short" | "signed" | "sizeof" | "static" | "static_assert" | "static_cast" | "struct" | "switch" | "synchronized" | "template"
294 | "this" | "thread_local" | "throw" | "try" | "typedef" | "typeid" | "typename" | "union" | "unsigned" | "using" | "virtual" | "void" | "volatile" | "wchar_t" | "while" | "xor" | "xor_eq" => CppTokenType::Keyword,
295 "true" | "false" => CppTokenType::BooleanLiteral,
296 _ => CppTokenType::Identifier,
297 };
298
299 state.add_token(token_kind, start_pos, state.get_position());
300 true
301 }
302 else {
303 false
304 }
305 }
306 else {
307 false
308 }
309 }
310
311 fn lex_operator<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
313 let start_pos = state.get_position();
314
315 if let Some(ch) = state.peek() {
316 let (token_kind, advance_count) = match ch {
317 '+' => {
318 if let Some('+') = state.peek_next_n(1) {
319 (CppTokenType::Increment, 2)
320 }
321 else if let Some('=') = state.peek_next_n(1) {
322 (CppTokenType::PlusAssign, 2)
323 }
324 else {
325 (CppTokenType::Plus, 1)
326 }
327 }
328 '-' => {
329 if let Some('-') = state.peek_next_n(1) {
330 (CppTokenType::Decrement, 2)
331 }
332 else if let Some('=') = state.peek_next_n(1) {
333 (CppTokenType::MinusAssign, 2)
334 }
335 else if let Some('>') = state.peek_next_n(1) {
336 (CppTokenType::Arrow, 2)
337 }
338 else {
339 (CppTokenType::Minus, 1)
340 }
341 }
342 '*' => {
343 if let Some('=') = state.peek_next_n(1) {
344 (CppTokenType::StarAssign, 2)
345 }
346 else {
347 (CppTokenType::Star, 1)
348 }
349 }
350 '/' => {
351 if let Some('=') = state.peek_next_n(1) {
352 (CppTokenType::SlashAssign, 2)
353 }
354 else {
355 (CppTokenType::Slash, 1)
356 }
357 }
358 '%' => {
359 if let Some('=') = state.peek_next_n(1) {
360 (CppTokenType::PercentAssign, 2)
361 }
362 else {
363 (CppTokenType::Percent, 1)
364 }
365 }
366 '=' => {
367 if let Some('=') = state.peek_next_n(1) {
368 (CppTokenType::Equal, 2)
369 }
370 else {
371 (CppTokenType::Assign, 1)
372 }
373 }
374 '!' => {
375 if let Some('=') = state.peek_next_n(1) {
376 (CppTokenType::NotEqual, 2)
377 }
378 else {
379 (CppTokenType::LogicalNot, 1)
380 }
381 }
382 '<' => {
383 if let Some('<') = state.peek_next_n(1) {
384 if let Some('=') = state.peek_next_n(2) { (CppTokenType::LeftShiftAssign, 3) } else { (CppTokenType::LeftShift, 2) }
385 }
386 else if let Some('=') = state.peek_next_n(1) {
387 (CppTokenType::LessEqual, 2)
388 }
389 else {
390 (CppTokenType::Less, 1)
391 }
392 }
393 '>' => {
394 if let Some('>') = state.peek_next_n(1) {
395 if let Some('=') = state.peek_next_n(2) { (CppTokenType::RightShiftAssign, 3) } else { (CppTokenType::RightShift, 2) }
396 }
397 else if let Some('=') = state.peek_next_n(1) {
398 (CppTokenType::GreaterEqual, 2)
399 }
400 else {
401 (CppTokenType::Greater, 1)
402 }
403 }
404 '&' => {
405 if let Some('&') = state.peek_next_n(1) {
406 (CppTokenType::LogicalAnd, 2)
407 }
408 else if let Some('=') = state.peek_next_n(1) {
409 (CppTokenType::AndAssign, 2)
410 }
411 else {
412 (CppTokenType::BitAnd, 1)
413 }
414 }
415 '|' => {
416 if let Some('|') = state.peek_next_n(1) {
417 (CppTokenType::LogicalOr, 2)
418 }
419 else if let Some('=') = state.peek_next_n(1) {
420 (CppTokenType::OrAssign, 2)
421 }
422 else {
423 (CppTokenType::BitOr, 1)
424 }
425 }
426 '^' => {
427 if let Some('=') = state.peek_next_n(1) {
428 (CppTokenType::XorAssign, 2)
429 }
430 else {
431 (CppTokenType::BitXor, 1)
432 }
433 }
434 '~' => (CppTokenType::BitNot, 1),
435 '?' => (CppTokenType::Question, 1),
436 ':' => {
437 if let Some(':') = state.peek_next_n(1) {
438 (CppTokenType::Scope, 2)
439 }
440 else {
441 (CppTokenType::Colon, 1)
442 }
443 }
444 '.' => (CppTokenType::Dot, 1),
445 _ => return false,
446 };
447
448 state.advance(advance_count);
449 state.add_token(token_kind, start_pos, state.get_position());
450 true
451 }
452 else {
453 false
454 }
455 }
456
457 fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
459 let start_pos = state.get_position();
460
461 if let Some(ch) = state.peek() {
462 let token_kind = match ch {
463 '(' => CppTokenType::LeftParen,
464 ')' => CppTokenType::RightParen,
465 '[' => CppTokenType::LeftBracket,
466 ']' => CppTokenType::RightBracket,
467 '{' => CppTokenType::LeftBrace,
468 '}' => CppTokenType::RightBrace,
469 ',' => CppTokenType::Comma,
470 ';' => CppTokenType::Semicolon,
471 _ => return false,
472 };
473
474 state.advance(1);
475 state.add_token(token_kind, start_pos, state.get_position());
476 true
477 }
478 else {
479 false
480 }
481 }
482
483 fn lex_preprocessor<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
485 let start_pos = state.get_position();
486
487 if let Some('#') = state.peek() {
488 while let Some(ch) = state.peek() {
490 if ch == '\n' || ch == '\r' {
491 break;
492 }
493 state.advance(ch.len_utf8())
494 }
495
496 state.add_token(CppTokenType::Preprocessor, start_pos, state.get_position());
497 true
498 }
499 else {
500 false
501 }
502 }
503}
504
505impl<'config> Lexer<CppLanguage> for CppLexer<'config> {
506 fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<CppLanguage>) -> LexOutput<CppLanguage> {
508 let mut state = LexerState::new(source);
509 let result = self.run(&mut state);
510 state.finish_with_cache(result, cache)
511 }
512}
513
514impl<'config> CppLexer<'config> {
515 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), oak_core::OakError> {
517 while state.not_at_end() {
518 if self.skip_whitespace(state) {
520 continue;
521 }
522
523 if self.lex_newline(state) {
524 continue;
525 }
526
527 if self.lex_comment(state) {
528 continue;
529 }
530
531 if self.lex_string(state) {
532 continue;
533 }
534
535 if self.lex_character(state) {
536 continue;
537 }
538
539 if self.lex_number(state) {
540 continue;
541 }
542
543 if self.lex_keyword_or_identifier(state) {
544 continue;
545 }
546
547 if self.lex_preprocessor(state) {
548 continue;
549 }
550
551 if self.lex_operator(state) {
552 continue;
553 }
554
555 if self.lex_delimiter(state) {
556 continue;
557 }
558
559 let start = state.get_position();
561 if let Some(ch) = state.peek() {
562 state.advance(ch.len_utf8());
563 state.add_token(CppTokenType::Error, start, state.get_position())
564 }
565 }
566 Ok(())
567 }
568}