1use text_scanner::{ext::CScannerExt, Scanner};
2
3use crate::{impl_lexer_from_scanner, ScanToken, ScannerExt, TokenSpan};
4
/// Words that the lexer reports as [`CppToken::Keyword`] instead of
/// [`CppToken::Ident`].
///
/// Besides the regular keywords, the table also holds the alternative
/// operator spellings (`and`, `bitor`, `not_eq`, ...). Lookup is a plain
/// linear `contains`, so the order of entries carries no meaning; rows are
/// grouped by initial letter purely for readability.
#[rustfmt::skip]
const KEYWORDS: [&str; 97] = [
    // a
    "alignas", "alignof", "and", "and_eq", "asm",
    "atomic_cancel", "atomic_commit", "atomic_noexcept", "auto",
    // b
    "bitand", "bitor", "bool", "break",
    // c
    "case", "catch", "char", "char8_t", "char16_t", "char32_t",
    "class", "compl", "concept", "const", "consteval", "constexpr",
    "constinit", "const_cast", "continue", "co_await", "co_return", "co_yield",
    // d
    "decltype", "default", "delete", "do", "double", "dynamic_cast",
    // e
    "else", "enum", "explicit", "export", "extern",
    // f
    "false", "float", "for", "friend",
    // g
    "goto",
    // i
    "if", "inline", "int",
    // l
    "long",
    // m
    "mutable",
    // n
    "namespace", "new", "noexcept", "not", "not_eq", "nullptr",
    // o
    "operator", "or", "or_eq",
    // p
    "private", "protected", "public",
    // r
    "reflexpr", "register", "reinterpret_cast", "requires", "return",
    // s
    "short", "signed", "sizeof", "static", "static_assert", "static_cast",
    "struct", "switch", "synchronized",
    // t
    "template", "this", "thread_local", "throw", "true", "try",
    "typedef", "typeid", "typename",
    // u
    "union", "unsigned", "using",
    // v
    "virtual", "void", "volatile",
    // w
    "wchar_t", "while",
    // x
    "xor", "xor_eq",
];
22
/// Kind of a token produced by lexing C++ source text.
///
/// The variant only classifies the token; the matched text is carried by
/// the span that accompanies it.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CppToken {
    /// A non-empty run of whitespace.
    Space,
    /// A line comment, as matched by the scanner's `scan_c_line_comment`.
    LineComment,
    /// A block comment, as matched by the scanner's `scan_c_block_comment`.
    BlockComment,
    /// An identifier that is not listed in the keyword table.
    Ident,
    /// An identifier that is listed in the keyword table.
    Keyword,
    /// A character literal.
    Char,
    /// A string literal.
    String,
    /// An integer literal (hexadecimal, octal or decimal).
    Int,
    /// A floating-point literal.
    Float,
    /// One of the bracket characters `{`, `}`, `[`, `]`, `(`, `)`.
    Delim,
    /// An operator or other punctuation, e.g. `+=`, `->`, `;`.
    Punct,
    /// A single character that no other rule recognized.
    Unknown,
}
43
44impl ScanToken for CppToken {
45 fn scan_token<'text>(scanner: &mut Scanner<'text>) -> Option<(Self, TokenSpan<'text>)> {
46 let (r, _s) = scanner.skip_whitespace();
47 if !r.is_empty() {
48 return Some((Self::Space, scanner.span(r)));
49 }
50
51 if let Ok((r, _s)) = scanner.scan_c_line_comment() {
52 return Some((Self::LineComment, scanner.span(r)));
53 } else if let Ok((r, _s)) = scanner.scan_c_block_comment() {
54 return Some((Self::BlockComment, scanner.span(r)));
55 }
56
57 if let Ok((r, ident)) = scanner.scan_c_identifier() {
58 let tok = if KEYWORDS.contains(&ident) {
59 Self::Keyword
60 } else {
61 Self::Ident
62 };
63 return Some((tok, scanner.span(r)));
64 }
65
66 if let Ok((r, _s)) = scanner.scan_c_char() {
67 return Some((Self::Char, scanner.span(r)));
68 } else if let Ok((r, _s)) = scanner.scan_c_string() {
69 return Some((Self::String, scanner.span(r)));
70 }
71
72 if let Ok((r, _s)) = scanner.scan_c_float() {
73 return Some((Self::Float, scanner.span(r)));
74 } else if let Ok((r, _s)) = scanner
75 .scan_c_int_hex()
76 .or_else(|_| scanner.scan_c_int_oct())
77 .or_else(|_| scanner.scan_c_int_dec())
78 {
79 return Some((Self::Int, scanner.span(r)));
80 }
81
82 if let Ok((r, _c)) = scanner.accept_char_any(&['{', '}', '[', ']', '(', ')']) {
83 return Some((Self::Delim, scanner.span(r)));
84 }
85
86 let res = scanner.scan_with(|scanner| {
88 let (r, c) = scanner.next()?;
89 match c {
90 '=' => {
91 _ = scanner.accept_char_any(&['=', '>']);
92 }
93 '+' => {
94 _ = scanner.accept_char_any(&['+', '=']);
95 }
96 '-' => {
97 let res = scanner.accept_char_any(&['-', '=']);
98 if res.is_err() && scanner.accept_char('>').is_ok() {
99 let _ = scanner.accept_char('*');
100 }
101 }
102 '*' | '/' | '%' | '^' | '!' => {
103 _ = scanner.accept_char('=');
104 }
105 '&' => {
106 _ = scanner.accept_char_any(&['&', '=']);
107 }
108 '|' => {
109 _ = scanner.accept_char_any(&['|', '=']);
110 }
111 '<' => {
112 let res1 = scanner.accept_char('<');
113 let res2 = scanner.accept_char('=');
114 if res1.is_ok() && res2.is_ok() {
115 _ = scanner.accept_char('>');
116 }
117 }
118 '>' => {
119 _ = scanner.accept_char('>');
120 _ = scanner.accept_char('=');
121 }
122 '.' => {
123 let res = scanner.accept_char('*');
124 if res.is_err() {
125 _ = scanner.scan_with(|scanner| {
126 scanner.accept_char('.')?;
127 scanner.accept_char('.')?;
128 Ok(())
129 });
130 }
131 }
132 '#' => {
133 _ = scanner.accept_char('#');
134 }
135 ',' | ';' | ':' | '?' | '~' => {}
136 _ => return Err(scanner.ranged_text(r)),
137 }
138 Ok(())
139 });
140 if let Ok((r, _s)) = res {
141 return Some((Self::Punct, scanner.span(r)));
142 }
143
144 let (r, _c) = scanner.next().ok()?;
145 Some((Self::Unknown, scanner.span(r)))
146 }
147}
148
149#[derive(Clone, Debug)]
155pub struct CppLexer<'text> {
156 scanner: Scanner<'text>,
157}
158
159impl<'text> CppLexer<'text> {
160 #[inline]
161 pub fn new(text: &'text str) -> Self {
162 Self {
163 scanner: Scanner::new(text),
164 }
165 }
166}
167
// Generates the lexer trait impls for `CppLexer` from its `scanner` field,
// producing `CppToken`s. NOTE(review): presumably this includes the iterator
// impl the test module relies on - confirm in the macro definition.
impl_lexer_from_scanner!('text, CppLexer<'text>, CppToken, scanner);
169
#[cfg(test)]
mod tests {
    use super::*;

    /// Concatenating the text of every span the lexer yields must rebuild
    /// the input exactly, i.e. no character is dropped or duplicated.
    #[test]
    fn test_cpp_lexer_spans() {
        let input = include_str!("../../../text-scanner/src/ext/rust.rs");

        let mut rebuilt = String::with_capacity(input.len());
        for (_, span) in CppLexer::new(input) {
            rebuilt.push_str(span.as_str());
        }

        assert_eq!(input, rebuilt);
    }
}