any_lexer/lexers/
cpp.rs

1use text_scanner::{ext::CScannerExt, Scanner};
2
3use crate::{impl_lexer_from_scanner, ScanToken, ScannerExt, TokenSpan};
4
// Reference: https://en.cppreference.com/w/cpp/keyword
//
// Identifiers found in this table are reported as keywords rather than plain
// identifiers. Rows are grouped by initial letter purely for readability.
#[rustfmt::skip]
const KEYWORDS: [&str; 97] = [
    // a
    "alignas", "alignof", "and", "and_eq", "asm", "atomic_cancel", "atomic_commit",
    "atomic_noexcept", "auto",
    // b
    "bitand", "bitor", "bool", "break",
    // c
    "case", "catch", "char", "char8_t", "char16_t", "char32_t", "class", "compl", "concept",
    "const", "consteval", "constexpr", "constinit", "const_cast", "continue", "co_await",
    "co_return", "co_yield",
    // d
    "decltype", "default", "delete", "do", "double", "dynamic_cast",
    // e
    "else", "enum", "explicit", "export", "extern",
    // f
    "false", "float", "for", "friend",
    // g
    "goto",
    // i
    "if", "inline", "int",
    // l
    "long",
    // m
    "mutable",
    // n
    "namespace", "new", "noexcept", "not", "not_eq", "nullptr",
    // o
    "operator", "or", "or_eq",
    // p
    "private", "protected", "public",
    // r
    "reflexpr", "register", "reinterpret_cast", "requires", "return",
    // s
    "short", "signed", "sizeof", "static", "static_assert", "static_cast", "struct", "switch",
    "synchronized",
    // t
    "template", "this", "thread_local", "throw", "true", "try", "typedef", "typeid", "typename",
    // u
    "union", "unsigned", "using",
    // v
    "virtual", "void", "volatile",
    // w
    "wchar_t", "while",
    // x
    "xor", "xor_eq",
];
22
/// Kind of token produced when lexing C++ source text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CppToken {
    /// A run of whitespace.
    Space,
    /// A C-style line comment.
    LineComment,
    /// A C-style block comment.
    BlockComment,
    /// An identifier that is not listed in the keyword table.
    Ident,
    /// An identifier that is listed in the keyword table.
    Keyword,
    /// A character literal.
    Char,
    /// A string literal.
    String,
    /// An integer literal (hexadecimal, octal or decimal).
    Int,
    /// A floating-point literal.
    Float,
    /// One of the bracketing characters `{`, `}`, `[`, `]`, `(` or `)`.
    Delim,
    /// An operator or other punctuator, e.g. `+=`, `->` or `;`.
    Punct,
    /// Given valid C++ code, this variant should never be encountered. If
    /// it is encountered, then check whether an issue has already been
    /// submitted, otherwise please [submit an issue].
    ///
    /// [submit an issue]: https://github.com/vallentin/colorblast/issues
    Unknown,
}
43
44impl ScanToken for CppToken {
45    fn scan_token<'text>(scanner: &mut Scanner<'text>) -> Option<(Self, TokenSpan<'text>)> {
46        let (r, _s) = scanner.skip_whitespace();
47        if !r.is_empty() {
48            return Some((Self::Space, scanner.span(r)));
49        }
50
51        if let Ok((r, _s)) = scanner.scan_c_line_comment() {
52            return Some((Self::LineComment, scanner.span(r)));
53        } else if let Ok((r, _s)) = scanner.scan_c_block_comment() {
54            return Some((Self::BlockComment, scanner.span(r)));
55        }
56
57        if let Ok((r, ident)) = scanner.scan_c_identifier() {
58            let tok = if KEYWORDS.contains(&ident) {
59                Self::Keyword
60            } else {
61                Self::Ident
62            };
63            return Some((tok, scanner.span(r)));
64        }
65
66        if let Ok((r, _s)) = scanner.scan_c_char() {
67            return Some((Self::Char, scanner.span(r)));
68        } else if let Ok((r, _s)) = scanner.scan_c_string() {
69            return Some((Self::String, scanner.span(r)));
70        }
71
72        if let Ok((r, _s)) = scanner.scan_c_float() {
73            return Some((Self::Float, scanner.span(r)));
74        } else if let Ok((r, _s)) = scanner
75            .scan_c_int_hex()
76            .or_else(|_| scanner.scan_c_int_oct())
77            .or_else(|_| scanner.scan_c_int_dec())
78        {
79            return Some((Self::Int, scanner.span(r)));
80        }
81
82        if let Ok((r, _c)) = scanner.accept_char_any(&['{', '}', '[', ']', '(', ')']) {
83            return Some((Self::Delim, scanner.span(r)));
84        }
85
86        // Reference: https://en.cppreference.com/w/cpp/language/punctuators
87        let res = scanner.scan_with(|scanner| {
88            let (r, c) = scanner.next()?;
89            match c {
90                '=' => {
91                    _ = scanner.accept_char_any(&['=', '>']);
92                }
93                '+' => {
94                    _ = scanner.accept_char_any(&['+', '=']);
95                }
96                '-' => {
97                    let res = scanner.accept_char_any(&['-', '=']);
98                    if res.is_err() && scanner.accept_char('>').is_ok() {
99                        let _ = scanner.accept_char('*');
100                    }
101                }
102                '*' | '/' | '%' | '^' | '!' => {
103                    _ = scanner.accept_char('=');
104                }
105                '&' => {
106                    _ = scanner.accept_char_any(&['&', '=']);
107                }
108                '|' => {
109                    _ = scanner.accept_char_any(&['|', '=']);
110                }
111                '<' => {
112                    let res1 = scanner.accept_char('<');
113                    let res2 = scanner.accept_char('=');
114                    if res1.is_ok() && res2.is_ok() {
115                        _ = scanner.accept_char('>');
116                    }
117                }
118                '>' => {
119                    _ = scanner.accept_char('>');
120                    _ = scanner.accept_char('=');
121                }
122                '.' => {
123                    let res = scanner.accept_char('*');
124                    if res.is_err() {
125                        _ = scanner.scan_with(|scanner| {
126                            scanner.accept_char('.')?;
127                            scanner.accept_char('.')?;
128                            Ok(())
129                        });
130                    }
131                }
132                '#' => {
133                    _ = scanner.accept_char('#');
134                }
135                ',' | ';' | ':' | '?' | '~' => {}
136                _ => return Err(scanner.ranged_text(r)),
137            }
138            Ok(())
139        });
140        if let Ok((r, _s)) = res {
141            return Some((Self::Punct, scanner.span(r)));
142        }
143
144        let (r, _c) = scanner.next().ok()?;
145        Some((Self::Unknown, scanner.span(r)))
146    }
147}
148
149/// C++ lexer producing [`CppToken`]s.
150///
151/// **Note:** Cloning `CppLexer` is essentially a copy, as it just contains
152/// a `&str` and a `usize` for its `cursor`. However, `Copy` is not
153/// implemented, to avoid accidentally copying immutable `CppLexer`s.
154#[derive(Clone, Debug)]
155pub struct CppLexer<'text> {
156    scanner: Scanner<'text>,
157}
158
159impl<'text> CppLexer<'text> {
160    #[inline]
161    pub fn new(text: &'text str) -> Self {
162        Self {
163            scanner: Scanner::new(text),
164        }
165    }
166}
167
168impl_lexer_from_scanner!('text, CppLexer<'text>, CppToken, scanner);
169
#[cfg(test)]
mod tests {
    use super::*;

    /// Concatenating every span the lexer yields must reproduce the input
    /// exactly, i.e. no character is dropped or duplicated.
    #[test]
    fn test_cpp_lexer_spans() {
        // The fixture is deliberately Rust source rather than C++: only span
        // coverage is checked here, not how the tokens are classified.
        let input = include_str!("../../../text-scanner/src/ext/rust.rs");

        let mut rebuilt = String::with_capacity(input.len());
        for (_, span) in CppLexer::new(input) {
            rebuilt.push_str(span.as_str());
        }

        assert_eq!(input, rebuilt);
    }
}