any_lexer/lexers/
c.rs

1use text_scanner::{ext::CScannerExt, Scanner};
2
3use crate::{impl_lexer_from_scanner, ScanToken, ScannerExt, TokenSpan};
4
/// Identifier spellings that `CToken::scan_token` reports as [`CToken::Keyword`]
/// instead of [`CToken::Ident`].
///
/// Entries are laid out by initial letter, with the underscore-prefixed
/// spellings last.
///
/// Reference: https://learn.microsoft.com/en-us/cpp/c-language/c-keywords?view=msvc-170#standard-c-keywords
#[rustfmt::skip]
const KEYWORDS: [&str; 46] = [
    "alignas", "alignof", "auto",
    "break",
    "case", "char", "const", "continue",
    "default", "do", "double",
    "else", "enum", "extern",
    "float", "for",
    "goto",
    "if", "inline", "int",
    "long",
    "register", "restrict", "return",
    "short", "signed", "sizeof", "static", "struct", "switch",
    "typedef",
    "union", "unsigned",
    "void", "volatile",
    "while",
    "_Alignas", "_Alignof", "_Atomic", "_Bool", "_Complex", "_Generic",
    "_Imaginary", "_Noreturn", "_Static_assert", "_Thread_local",
];
16
/// Microsoft-specific spellings that `CToken::scan_token` also reports as
/// [`CToken::Keyword`].
///
/// Reference: https://learn.microsoft.com/en-us/cpp/c-language/c-keywords?view=msvc-170#microsoft-specific-c-keywords
#[rustfmt::skip]
const KEYWORDS_MS: [&str; 21] = [
    // Spellings carrying the `__` prefix.
    "__asm", "__based", "__cdecl", "__declspec",
    "__except", "__fastcall", "__finally", "__inline",
    "__int16", "__int32", "__int64", "__int8",
    "__leave", "__restrict", "__stdcall", "__try",
    // Spellings without a prefix. The lexer matches on the identifier text
    // alone, so an ordinary identifier spelled like one of these is also
    // classified as a keyword.
    "dllexport", "dllimport", "naked", "static_assert", "thread",
];
24
/// Kind of token produced by the C lexer.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CToken {
    /// A non-empty run of whitespace.
    Space,
    /// A line comment.
    LineComment,
    /// A block comment.
    BlockComment,
    /// An identifier that is not listed in either keyword table.
    Ident,
    /// An identifier listed in `KEYWORDS` or `KEYWORDS_MS`.
    Keyword,
    /// A character literal.
    Char,
    /// A string literal.
    String,
    /// A hexadecimal, octal or decimal integer literal.
    Int,
    /// A floating-point literal.
    Float,
    /// One of the bracket characters `{`, `}`, `[`, `]`, `(` or `)`.
    Delim,
    /// An operator or other punctuation, e.g. `+=`, `<<=`, `...`, `;`.
    Punct,
    /// A single character that matched none of the other token kinds.
    ///
    /// Given valid C code, this variant should never be encountered. If it
    /// is encountered, then check whether an issue has already been
    /// submitted, otherwise please [submit an issue].
    ///
    /// [submit an issue]: https://github.com/vallentin/colorblast/issues
    Unknown,
}
45
46impl ScanToken for CToken {
47    fn scan_token<'text>(scanner: &mut Scanner<'text>) -> Option<(Self, TokenSpan<'text>)> {
48        let (r, _s) = scanner.skip_whitespace();
49        if !r.is_empty() {
50            return Some((Self::Space, scanner.span(r)));
51        }
52
53        if let Ok((r, _s)) = scanner.scan_c_line_comment() {
54            return Some((Self::LineComment, scanner.span(r)));
55        } else if let Ok((r, _s)) = scanner.scan_c_block_comment() {
56            return Some((Self::BlockComment, scanner.span(r)));
57        }
58
59        if let Ok((r, ident)) = scanner.scan_c_identifier() {
60            let tok = if KEYWORDS.contains(&ident) || KEYWORDS_MS.contains(&ident) {
61                Self::Keyword
62            } else {
63                Self::Ident
64            };
65            return Some((tok, scanner.span(r)));
66        }
67
68        if let Ok((r, _s)) = scanner.scan_c_char() {
69            return Some((Self::Char, scanner.span(r)));
70        } else if let Ok((r, _s)) = scanner.scan_c_string() {
71            return Some((Self::String, scanner.span(r)));
72        }
73
74        if let Ok((r, _s)) = scanner.scan_c_float() {
75            return Some((Self::Float, scanner.span(r)));
76        } else if let Ok((r, _s)) = scanner
77            .scan_c_int_hex()
78            .or_else(|_| scanner.scan_c_int_oct())
79            .or_else(|_| scanner.scan_c_int_dec())
80        {
81            return Some((Self::Int, scanner.span(r)));
82        }
83
84        if let Ok((r, _c)) = scanner.accept_char_any(&['{', '}', '[', ']', '(', ')']) {
85            return Some((Self::Delim, scanner.span(r)));
86        }
87
88        // Reference: https://learn.microsoft.com/en-us/cpp/c-language/c-operators?view=msvc-170
89        let res = scanner.scan_with(|scanner| {
90            let (r, c) = scanner.next()?;
91            match c {
92                '=' => {
93                    _ = scanner.accept_char_any(&['=', '>']);
94                }
95                '+' => {
96                    _ = scanner.accept_char_any(&['+', '=']);
97                }
98                '-' => {
99                    _ = scanner.accept_char_any(&['-', '=']);
100                }
101                '*' | '/' | '%' | '^' | '!' => {
102                    _ = scanner.accept_char('=');
103                }
104                '&' => {
105                    _ = scanner.accept_char_any(&['&', '=']);
106                }
107                '|' => {
108                    _ = scanner.accept_char_any(&['|', '=']);
109                }
110                '<' => {
111                    _ = scanner.accept_char('<');
112                    _ = scanner.accept_char('=');
113                }
114                '>' => {
115                    _ = scanner.accept_char('>');
116                    _ = scanner.accept_char('=');
117                }
118                '.' => {
119                    _ = scanner.scan_with(|scanner| {
120                        scanner.accept_char('.')?;
121                        scanner.accept_char('.')?;
122                        Ok(())
123                    });
124                }
125                '#' => {
126                    _ = scanner.accept_char('#');
127                }
128                ',' | ';' | ':' | '?' | '~' => {}
129                _ => return Err(scanner.ranged_text(r)),
130            }
131            Ok(())
132        });
133        if let Ok((r, _s)) = res {
134            return Some((Self::Punct, scanner.span(r)));
135        }
136
137        let (r, _c) = scanner.next().ok()?;
138        Some((Self::Unknown, scanner.span(r)))
139    }
140}
141
142/// C lexer producing [`CToken`]s.
143///
144/// **Note:** Cloning `CLexer` is essentially a copy, as it just contains
145/// a `&str` and a `usize` for its `cursor`. However, `Copy` is not
146/// implemented, to avoid accidentally copying immutable `CLexer`s.
147#[derive(Clone, Debug)]
148pub struct CLexer<'text> {
149    scanner: Scanner<'text>,
150}
151
152impl<'text> CLexer<'text> {
153    #[inline]
154    pub fn new(text: &'text str) -> Self {
155        Self {
156            scanner: Scanner::new(text),
157        }
158    }
159}
160
// NOTE(review): presumably generates the lexer/iteration impls for `CLexer`,
// producing `CToken`s from its `scanner` field (the test below iterates a
// `CLexer` as `(token, span)` pairs) — confirm against the macro definition.
impl_lexer_from_scanner!('text, CLexer<'text>, CToken, scanner);
162
#[cfg(test)]
mod tests {
    use super::*;

    /// Joining the spans of all tokens, in order, must reproduce the input
    /// exactly, i.e. the lexer neither drops nor duplicates any text.
    #[test]
    fn test_c_lexer_spans() {
        // The input is intentionally Rust code rather than C: only span
        // coverage is checked here, not how the text is classified.
        let input = include_str!("../../../text-scanner/src/ext/rust.rs");

        let mut output = String::with_capacity(input.len());
        for (_tok, span) in CLexer::new(input) {
            output += span.as_str();
        }

        assert_eq!(input, output);
    }
}