1#![doc = include_str!("readme.md")]
2pub mod token_type;
4pub use token_type::CppTokenType;
5
6use crate::language::CppLanguage;
7use oak_core::{Lexer, LexerCache, LexerState, TextEdit, lexer::LexOutput, source::Source};
8
/// Shorthand for the `oak_core` lexer state specialised to [`CppLanguage`].
///
/// `'a` is the lifetime of the borrowed source and `S` is the source type.
pub(crate) type State<'a, S> = LexerState<'a, S, CppLanguage>;
10
/// Lexer that turns C++ source text into a stream of [`CppTokenType`] tokens.
///
/// The lexer borrows its language configuration for the lifetime `'config`.
pub struct CppLexer<'config> {
    /// Language configuration supplied to [`CppLexer::new`].
    // NOTE(review): none of the lexing methods visible in this file read this field.
    config: &'config CppLanguage,
}
15
/// Alias for [`CppLexer`].
// NOTE(review): presumably exists so C sources can reuse the C++ lexer under a C-flavoured name — confirm.
pub type CLexer<'config> = CppLexer<'config>;
18
19impl<'config> CppLexer<'config> {
20 pub fn new(config: &'config CppLanguage) -> Self {
22 Self { config }
23 }
24
25 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
27 let start_pos = state.get_position();
28
29 while let Some(ch) = state.peek() {
30 if ch == ' ' || ch == '\t' { state.advance(ch.len_utf8()) } else { break }
31 }
32
33 if state.get_position() > start_pos {
34 state.add_token(CppTokenType::Whitespace, start_pos, state.get_position());
35 true
36 }
37 else {
38 false
39 }
40 }
41
42 fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
44 let start_pos = state.get_position();
45
46 if let Some('\n') = state.peek() {
47 state.advance(1);
48 state.add_token(CppTokenType::Newline, start_pos, state.get_position());
49 true
50 }
51 else if let Some('\r') = state.peek() {
52 state.advance(1);
53 if let Some('\n') = state.peek() {
54 state.advance(1)
55 }
56 state.add_token(CppTokenType::Newline, start_pos, state.get_position());
57 true
58 }
59 else {
60 false
61 }
62 }
63
64 fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
66 let start_pos = state.get_position();
67
68 if let Some('/') = state.peek() {
69 if let Some('/') = state.peek_next_n(1) {
70 state.advance(2);
72 while let Some(ch) = state.peek() {
73 if ch == '\n' || ch == '\r' {
74 break;
75 }
76 state.advance(ch.len_utf8())
77 }
78 state.add_token(CppTokenType::Comment, start_pos, state.get_position());
79 true
80 }
81 else if let Some('*') = state.peek_next_n(1) {
82 state.advance(2);
84 while let Some(ch) = state.peek() {
85 if ch == '*' && state.peek_next_n(1) == Some('/') {
86 state.advance(2);
87 break;
88 }
89 state.advance(ch.len_utf8())
90 }
91 state.add_token(CppTokenType::Comment, start_pos, state.get_position());
92 true
93 }
94 else {
95 false
96 }
97 }
98 else {
99 false
100 }
101 }
102
103 fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
105 let start_pos = state.get_position();
106
107 if let Some('"') = state.peek() {
108 state.advance(1);
109
110 let mut escaped = false;
111 while let Some(ch) = state.peek() {
112 if escaped {
113 escaped = false;
114 state.advance(ch.len_utf8());
115 continue;
116 }
117
118 if ch == '\\' {
119 escaped = true;
120 state.advance(1);
121 continue;
122 }
123
124 if ch == '"' {
125 state.advance(1);
126 break;
127 }
128
129 if ch == '\n' || ch == '\r' {
130 break; }
132
133 state.advance(ch.len_utf8())
134 }
135
136 state.add_token(CppTokenType::StringLiteral, start_pos, state.get_position());
137 true
138 }
139 else {
140 false
141 }
142 }
143
144 fn lex_character<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
146 let start_pos = state.get_position();
147
148 if let Some('\'') = state.peek() {
149 state.advance(1);
150
151 let mut escaped = false;
152 while let Some(ch) = state.peek() {
153 if escaped {
154 escaped = false;
155 state.advance(ch.len_utf8());
156 continue;
157 }
158
159 if ch == '\\' {
160 escaped = true;
161 state.advance(1);
162 continue;
163 }
164
165 if ch == '\'' {
166 state.advance(1);
167 break;
168 }
169
170 if ch == '\n' || ch == '\r' {
171 break; }
173
174 state.advance(ch.len_utf8())
175 }
176
177 state.add_token(CppTokenType::CharacterLiteral, start_pos, state.get_position());
178 true
179 }
180 else {
181 false
182 }
183 }
184
185 fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
187 let start_pos = state.get_position();
188
189 if let Some(ch) = state.peek() {
190 if ch.is_ascii_digit() || (ch == '.' && state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit())) {
191 let mut is_float = false;
192
193 if ch == '0' {
195 if let Some(next_ch) = state.peek_next_n(1) {
196 if next_ch == 'x' || next_ch == 'X' {
197 state.advance(2);
199 while let Some(ch) = state.peek() {
200 if ch.is_ascii_hexdigit() { state.advance(1) } else { break }
201 }
202 }
203 else if next_ch == 'b' || next_ch == 'B' {
204 state.advance(2);
206 while let Some(ch) = state.peek() {
207 if ch == '0' || ch == '1' { state.advance(1) } else { break }
208 }
209 }
210 else if next_ch.is_ascii_digit() {
211 while let Some(ch) = state.peek() {
213 if ch.is_ascii_digit() { state.advance(1) } else { break }
214 }
215 }
216 else {
217 state.advance(1); }
219 }
220 else {
221 state.advance(1); }
223 }
224 else {
225 while let Some(ch) = state.peek() {
227 if ch.is_ascii_digit() { state.advance(1) } else { break }
228 }
229 }
230
231 if let Some('.') = state.peek() {
233 if let Some(next_ch) = state.peek_next_n(1) {
234 if next_ch.is_ascii_digit() {
235 is_float = true;
236 state.advance(1); while let Some(ch) = state.peek() {
238 if ch.is_ascii_digit() { state.advance(1) } else { break }
239 }
240 }
241 }
242 }
243
244 if let Some(ch) = state.peek() {
246 if ch == 'e' || ch == 'E' {
247 is_float = true;
248 state.advance(1);
249 if let Some(sign) = state.peek() {
250 if sign == '+' || sign == '-' {
251 state.advance(1)
252 }
253 }
254 while let Some(ch) = state.peek() {
255 if ch.is_ascii_digit() { state.advance(1) } else { break }
256 }
257 }
258 }
259
260 while let Some(ch) = state.peek() {
262 if ch.is_ascii_alphabetic() { state.advance(1) } else { break }
263 }
264
265 let token_kind = if is_float { CppTokenType::FloatLiteral } else { CppTokenType::IntegerLiteral };
266 state.add_token(token_kind, start_pos, state.get_position());
267 true
268 }
269 else {
270 false
271 }
272 }
273 else {
274 false
275 }
276 }
277
278 fn lex_keyword_or_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
280 let start_pos = state.get_position();
281
282 if let Some(ch) = state.peek() {
283 if ch.is_ascii_alphabetic() || ch == '_' {
284 while let Some(ch) = state.peek() {
285 if ch.is_ascii_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
286 }
287
288 let text = state.get_text_in((start_pos..state.get_position()).into());
289 let token_kind = match text.as_ref() {
290 "alignas" | "alignof" | "and" | "and_eq" | "asm" | "atomic_cancel" | "atomic_commit" | "atomic_noexcept" | "auto" | "bitand" | "bitor" | "bool" | "break" | "case" | "catch" | "char" | "char8_t" | "char16_t" | "char32_t" | "class"
292 | "compl" | "concept" | "const" | "consteval" | "constexpr" | "constinit" | "const_cast" | "continue" | "co_await" | "co_return" | "co_yield" | "decltype" | "default" | "delete" | "do" | "double" | "dynamic_cast" | "else" | "enum"
293 | "explicit" | "export" | "extern" | "float" | "for" | "friend" | "goto" | "if" | "inline" | "int" | "long" | "mutable" | "namespace" | "new" | "noexcept" | "not" | "not_eq" | "nullptr" | "operator" | "or" | "or_eq" | "private"
294 | "protected" | "public" | "reflexpr" | "register" | "reinterpret_cast" | "requires" | "return" | "short" | "signed" | "sizeof" | "static" | "static_assert" | "static_cast" | "struct" | "switch" | "synchronized" | "template"
295 | "this" | "thread_local" | "throw" | "try" | "typedef" | "typeid" | "typename" | "union" | "unsigned" | "using" | "virtual" | "void" | "volatile" | "wchar_t" | "while" | "xor" | "xor_eq" => CppTokenType::Keyword,
296 "true" | "false" => CppTokenType::BooleanLiteral,
297 _ => CppTokenType::Identifier,
298 };
299
300 state.add_token(token_kind, start_pos, state.get_position());
301 true
302 }
303 else {
304 false
305 }
306 }
307 else {
308 false
309 }
310 }
311
312 fn lex_operator<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
314 let start_pos = state.get_position();
315
316 if let Some(ch) = state.peek() {
317 let (token_kind, advance_count) = match ch {
318 '+' => {
319 if let Some('+') = state.peek_next_n(1) {
320 (CppTokenType::Increment, 2)
321 }
322 else if let Some('=') = state.peek_next_n(1) {
323 (CppTokenType::PlusAssign, 2)
324 }
325 else {
326 (CppTokenType::Plus, 1)
327 }
328 }
329 '-' => {
330 if let Some('-') = state.peek_next_n(1) {
331 (CppTokenType::Decrement, 2)
332 }
333 else if let Some('=') = state.peek_next_n(1) {
334 (CppTokenType::MinusAssign, 2)
335 }
336 else if let Some('>') = state.peek_next_n(1) {
337 (CppTokenType::Arrow, 2)
338 }
339 else {
340 (CppTokenType::Minus, 1)
341 }
342 }
343 '*' => {
344 if let Some('=') = state.peek_next_n(1) {
345 (CppTokenType::StarAssign, 2)
346 }
347 else {
348 (CppTokenType::Star, 1)
349 }
350 }
351 '/' => {
352 if let Some('=') = state.peek_next_n(1) {
353 (CppTokenType::SlashAssign, 2)
354 }
355 else {
356 (CppTokenType::Slash, 1)
357 }
358 }
359 '%' => {
360 if let Some('=') = state.peek_next_n(1) {
361 (CppTokenType::PercentAssign, 2)
362 }
363 else {
364 (CppTokenType::Percent, 1)
365 }
366 }
367 '=' => {
368 if let Some('=') = state.peek_next_n(1) {
369 (CppTokenType::Equal, 2)
370 }
371 else {
372 (CppTokenType::Assign, 1)
373 }
374 }
375 '!' => {
376 if let Some('=') = state.peek_next_n(1) {
377 (CppTokenType::NotEqual, 2)
378 }
379 else {
380 (CppTokenType::LogicalNot, 1)
381 }
382 }
383 '<' => {
384 if let Some('<') = state.peek_next_n(1) {
385 if let Some('=') = state.peek_next_n(2) { (CppTokenType::LeftShiftAssign, 3) } else { (CppTokenType::LeftShift, 2) }
386 }
387 else if let Some('=') = state.peek_next_n(1) {
388 (CppTokenType::LessEqual, 2)
389 }
390 else {
391 (CppTokenType::Less, 1)
392 }
393 }
394 '>' => {
395 if let Some('>') = state.peek_next_n(1) {
396 if let Some('=') = state.peek_next_n(2) { (CppTokenType::RightShiftAssign, 3) } else { (CppTokenType::RightShift, 2) }
397 }
398 else if let Some('=') = state.peek_next_n(1) {
399 (CppTokenType::GreaterEqual, 2)
400 }
401 else {
402 (CppTokenType::Greater, 1)
403 }
404 }
405 '&' => {
406 if let Some('&') = state.peek_next_n(1) {
407 (CppTokenType::LogicalAnd, 2)
408 }
409 else if let Some('=') = state.peek_next_n(1) {
410 (CppTokenType::AndAssign, 2)
411 }
412 else {
413 (CppTokenType::BitAnd, 1)
414 }
415 }
416 '|' => {
417 if let Some('|') = state.peek_next_n(1) {
418 (CppTokenType::LogicalOr, 2)
419 }
420 else if let Some('=') = state.peek_next_n(1) {
421 (CppTokenType::OrAssign, 2)
422 }
423 else {
424 (CppTokenType::BitOr, 1)
425 }
426 }
427 '^' => {
428 if let Some('=') = state.peek_next_n(1) {
429 (CppTokenType::XorAssign, 2)
430 }
431 else {
432 (CppTokenType::BitXor, 1)
433 }
434 }
435 '~' => (CppTokenType::BitNot, 1),
436 '?' => (CppTokenType::Question, 1),
437 ':' => {
438 if let Some(':') = state.peek_next_n(1) {
439 (CppTokenType::Scope, 2)
440 }
441 else {
442 (CppTokenType::Colon, 1)
443 }
444 }
445 '.' => (CppTokenType::Dot, 1),
446 _ => return false,
447 };
448
449 state.advance(advance_count);
450 state.add_token(token_kind, start_pos, state.get_position());
451 true
452 }
453 else {
454 false
455 }
456 }
457
458 fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
460 let start_pos = state.get_position();
461
462 if let Some(ch) = state.peek() {
463 let token_kind = match ch {
464 '(' => CppTokenType::LeftParen,
465 ')' => CppTokenType::RightParen,
466 '[' => CppTokenType::LeftBracket,
467 ']' => CppTokenType::RightBracket,
468 '{' => CppTokenType::LeftBrace,
469 '}' => CppTokenType::RightBrace,
470 ',' => CppTokenType::Comma,
471 ';' => CppTokenType::Semicolon,
472 _ => return false,
473 };
474
475 state.advance(1);
476 state.add_token(token_kind, start_pos, state.get_position());
477 true
478 }
479 else {
480 false
481 }
482 }
483
484 fn lex_preprocessor<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
486 let start_pos = state.get_position();
487
488 if let Some('#') = state.peek() {
489 while let Some(ch) = state.peek() {
491 if ch == '\n' || ch == '\r' {
492 break;
493 }
494 state.advance(ch.len_utf8())
495 }
496
497 state.add_token(CppTokenType::Preprocessor, start_pos, state.get_position());
498 true
499 }
500 else {
501 false
502 }
503 }
504}
505
506impl<'config> Lexer<CppLanguage> for CppLexer<'config> {
507 fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<CppLanguage>) -> LexOutput<CppLanguage> {
509 let mut state = LexerState::new(source);
510 let result = self.run(&mut state);
511 state.finish_with_cache(result, cache)
512 }
513}
514
515impl<'config> CppLexer<'config> {
516 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), oak_core::OakError> {
518 while state.not_at_end() {
519 if self.skip_whitespace(state) {
521 continue;
522 }
523
524 if self.lex_newline(state) {
525 continue;
526 }
527
528 if self.lex_comment(state) {
529 continue;
530 }
531
532 if self.lex_string(state) {
533 continue;
534 }
535
536 if self.lex_character(state) {
537 continue;
538 }
539
540 if self.lex_number(state) {
541 continue;
542 }
543
544 if self.lex_keyword_or_identifier(state) {
545 continue;
546 }
547
548 if self.lex_preprocessor(state) {
549 continue;
550 }
551
552 if self.lex_operator(state) {
553 continue;
554 }
555
556 if self.lex_delimiter(state) {
557 continue;
558 }
559
560 let start = state.get_position();
562 if let Some(ch) = state.peek() {
563 state.advance(ch.len_utf8());
564 state.add_token(CppTokenType::Error, start, state.get_position())
565 }
566 }
567 Ok(())
568 }
569}