// any_lexer/lexers/rust.rs

1use text_scanner::{ext::RustScannerExt, Scanner};
2
3use crate::{impl_lexer_from_scanner, ScanToken, ScannerExt, TokenSpan};
4
/// Words that `RustToken::scan_token` classifies as `RustToken::Keyword`
/// rather than `RustToken::Ident`.
///
/// Lookup is an exact, case-sensitive string comparison, which is why both
/// `self` and `Self` are listed.
#[rustfmt::skip]
const KEYWORDS: [&str; 53] = [
    // Alphabetical block: strict keywords, plus the contextual
    // `macro_rules` and `union`.
    "as", "break", "const", "continue", "crate", "else", "enum", "extern",
    "false", "fn", "for", "if", "impl", "in", "let", "loop", "macro_rules",
    "match", "mod", "move", "mut", "pub", "ref", "return", "self", "Self",
    "static", "struct", "super", "trait", "true", "type", "union", "unsafe",
    "use", "where", "while",
    // Keywords added in the 2018 edition.
    "async", "await", "dyn",
    // Reserved for future use.
    "abstract", "become", "box", "do", "final", "macro", "override", "priv",
    "typeof", "unsized", "virtual", "yield", "try",
];
15
/// Kinds of tokens produced when lexing Rust source text.
#[derive(PartialEq, Eq, Clone, Copy, Debug)]
pub enum RustToken {
    /// A run of whitespace.
    Space,
    /// A line comment.
    LineComment,
    /// A block comment.
    BlockComment,
    /// An identifier (including raw identifiers) that is not a keyword.
    Ident,
    /// An identifier that matches one of the known Rust keywords.
    Keyword,
    /// A lifetime, i.e. a `'` followed by an identifier.
    Lifetime,
    /// A character literal.
    Char,
    /// A string literal.
    String,
    /// A raw string literal.
    RawString,
    /// An integer literal (hexadecimal, octal, binary or decimal).
    Int,
    /// A floating-point literal.
    Float,
    /// One of the delimiters `{`, `}`, `[`, `]`, `(` or `)`.
    Delim,
    /// An operator or other punctuation, e.g. `=>`, `::`, `..=` or `;`.
    Punct,
    /// Given valid Rust code, this variant should never be encountered. If
    /// it is encountered, then check if an issue has already been submitted,
    /// otherwise please [submit an issue].
    ///
    /// [submit an issue]: https://github.com/vallentin/colorblast/issues
    Unknown,
}
38
39impl ScanToken for RustToken {
40    fn scan_token<'text>(scanner: &mut Scanner<'text>) -> Option<(Self, TokenSpan<'text>)> {
41        let (r, _s) = scanner.skip_whitespace();
42        if !r.is_empty() {
43            return Some((Self::Space, scanner.span(r)));
44        }
45
46        if let Ok((r, _s)) = scanner.scan_rust_line_comment() {
47            return Some((Self::LineComment, scanner.span(r)));
48        } else if let Ok((r, _s)) = scanner.scan_rust_block_comment() {
49            return Some((Self::BlockComment, scanner.span(r)));
50        }
51
52        if let Ok((r, _s)) = scanner
53            .scan_rust_raw_string()
54            .or_else(|_| scanner.scan_rust_string())
55        {
56            return Some((Self::String, scanner.span(r)));
57        }
58
59        if let Ok((r, ident)) = scanner
60            .scan_rust_raw_identifier()
61            .or_else(|_| scanner.scan_rust_identifier())
62        {
63            let tok = if KEYWORDS.contains(&ident) {
64                Self::Keyword
65            } else {
66                Self::Ident
67            };
68            return Some((tok, scanner.span(r)));
69        }
70
71        if let Ok((_r, '\'')) = scanner.peek() {
72            if let Ok((r, _s)) = scanner.scan_rust_char() {
73                return Some((Self::Char, scanner.span(r)));
74            }
75
76            let res = scanner.scan_with(|scanner| {
77                scanner.accept_char('\'')?;
78                scanner.scan_rust_identifier()?;
79                Ok(())
80            });
81            if let Ok((r, _s)) = res {
82                return Some((Self::Lifetime, scanner.span(r)));
83            }
84
85            let (r, _c) = scanner.next().ok()?;
86            return Some((Self::Unknown, scanner.span(r)));
87        }
88
89        if let Ok((r, _s)) = scanner.scan_rust_float() {
90            return Some((Self::Float, scanner.span(r)));
91        } else if let Ok((r, _s)) = scanner
92            .scan_rust_int_hex()
93            .or_else(|_| scanner.scan_rust_int_oct())
94            .or_else(|_| scanner.scan_rust_int_bin())
95            .or_else(|_| scanner.scan_rust_int_dec())
96        {
97            return Some((Self::Int, scanner.span(r)));
98        }
99
100        if let Ok((r, _c)) = scanner.accept_char_any(&['{', '}', '[', ']', '(', ')']) {
101            return Some((Self::Delim, scanner.span(r)));
102        }
103
104        let res = scanner.scan_with(|scanner| {
105            let (r, c) = scanner.next()?;
106            match c {
107                '=' => {
108                    _ = scanner.accept_char_any(&['=', '>']);
109                }
110                '-' => {
111                    _ = scanner.accept_char_any(&['=', '>']);
112                }
113                '+' | '*' | '/' | '%' | '^' | '!' => {
114                    _ = scanner.accept_char('=');
115                }
116                '&' => {
117                    _ = scanner.accept_char_any(&['&', '=']);
118                }
119                '|' => {
120                    _ = scanner.accept_char_any(&['|', '=']);
121                }
122                '<' => {
123                    _ = scanner.accept_char('<');
124                    _ = scanner.accept_char('=');
125                }
126                '>' => {
127                    _ = scanner.accept_char('>');
128                    _ = scanner.accept_char('=');
129                }
130                '.' => {
131                    if scanner.accept_char('.').is_ok() {
132                        _ = scanner.accept_char_any(&['.', '=']);
133                    }
134                }
135                ':' => {
136                    _ = scanner.accept_char(':');
137                }
138                '@' | '_' | ',' | ';' | '#' | '$' | '?' | '~' => {}
139                _ => return Err(scanner.ranged_text(r)),
140            }
141            Ok(())
142        });
143        if let Ok((r, _s)) = res {
144            return Some((Self::Punct, scanner.span(r)));
145        }
146
147        let (r, _c) = scanner.next().ok()?;
148        Some((Self::Unknown, scanner.span(r)))
149    }
150}
151
152/// Rust lexer producing [`RustToken`]s.
153///
154/// **Note:** Cloning `RustLexer` is essentially a copy, as it just contains
155/// a `&str` and a `usize` for its `cursor`. However, `Copy` is not
156/// implemented, to avoid accidentally copying immutable `RustLexer`s.
157#[derive(Clone, Debug)]
158pub struct RustLexer<'text> {
159    scanner: Scanner<'text>,
160}
161
162impl<'text> RustLexer<'text> {
163    #[inline]
164    pub fn new(text: &'text str) -> Self {
165        Self {
166            scanner: Scanner::new(text),
167        }
168    }
169}
170
171impl_lexer_from_scanner!('text, RustLexer<'text>, RustToken, scanner);
172
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexing must be lossless: concatenating the spans of all tokens, in
    /// order, has to reproduce the input text exactly.
    #[test]
    fn test_rust_lexer_spans() {
        let input = include_str!("../../../text-scanner/src/ext/rust.rs");
        // The expected output is the input itself, so its length is the
        // exact capacity needed.
        let mut output = String::with_capacity(input.len());

        for (_tok, span) in RustLexer::new(input) {
            output.push_str(span.as_str());
        }

        assert_eq!(input, output);
    }
}