any_lexer/lexers/python.rs

use text_scanner::ext::{PythonScannerExt, PythonStrExt};
use text_scanner::Scanner;

use crate::{impl_lexer_from_scanner, ScanToken, ScannerExt, TokenSpan};

#[derive(PartialEq, Eq, Clone, Copy, Debug)]
pub enum PythonToken {
    Space,
    LineComment,
    ExplicitLineJoiner,
    Ident,
    Keyword,
    SoftKeyword,
    ShortString,
    LongString,
    ShortBytes,
    LongBytes,
    Int,
    Float,
    Delim,
    Punct,
    /// Given valid Python code, this variant should never be encountered.
    /// If it is encountered, check whether an issue has already been
    /// submitted, otherwise please [submit an issue].
    ///
    /// [submit an issue]: https://github.com/vallentin/colorblast/issues
    Unknown,
}

impl ScanToken for PythonToken {
    fn scan_token<'text>(scanner: &mut Scanner<'text>) -> Option<(Self, TokenSpan<'text>)> {
        let (r, _s) = scanner.skip_whitespace();
        if !r.is_empty() {
            return Some((Self::Space, scanner.span(r)));
        }

        if let Ok((r, _s)) = scanner.scan_python_line_comment() {
            return Some((Self::LineComment, scanner.span(r)));
        }

        if let Ok((r, ident)) = scanner.scan_python_identifier() {
            let tok = if ident.is_python_keyword() {
                Self::Keyword
            } else if ident.is_python_soft_keyword() {
                Self::SoftKeyword
            } else {
                Self::Ident
            };
            return Some((tok, scanner.span(r)));
        }

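        // Long string/bytes literals are tried before short ones; otherwise a
        // long-string opener such as `"""` would be consumed as an empty short
        // string `""` followed by a stray `"`.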
        if let Ok((r, _s)) = scanner.scan_python_long_string() {
            return Some((Self::LongString, scanner.span(r)));
        } else if let Ok((r, _s)) = scanner.scan_python_short_string() {
            return Some((Self::ShortString, scanner.span(r)));
        } else if let Ok((r, _s)) = scanner.scan_python_long_bytes() {
            return Some((Self::LongBytes, scanner.span(r)));
        } else if let Ok((r, _s)) = scanner.scan_python_short_bytes() {
            return Some((Self::ShortBytes, scanner.span(r)));
        }

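        // Floats are tried before ints, so that e.g. `1.5` lexes as a single
        // `Float` token instead of an `Int` followed by `.5`.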
        if let Ok((r, _s)) = scanner.scan_python_float() {
            return Some((Self::Float, scanner.span(r)));
        } else if let Ok((r, _s)) = scanner
            .scan_python_int_hex()
            .or_else(|_| scanner.scan_python_int_oct())
            .or_else(|_| scanner.scan_python_int_dec())
        {
            return Some((Self::Int, scanner.span(r)));
        }

        if let Ok((r, _c)) = scanner.scan_python_delimiter() {
            return Some((Self::Delim, scanner.span(r)));
        } else if let Ok((r, _c)) = scanner.scan_python_operator() {
            return Some((Self::Punct, scanner.span(r)));
        }

        if let Ok((r, _s)) = scanner.scan_python_explicit_line_joiner() {
            return Some((Self::ExplicitLineJoiner, scanner.span(r)));
        }

        let (r, _c) = scanner.next().ok()?;
        Some((Self::Unknown, scanner.span(r)))
    }
}

/// Python lexer producing [`PythonToken`]s.
///
/// **Note:** Cloning `PythonLexer` is essentially a copy, as it just contains
/// a `&str` and a `usize` for its `cursor`. However, `Copy` is not
/// implemented, to avoid accidentally copying immutable `PythonLexer`s.
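///
/// # Example
///
/// A minimal usage sketch; the `any_lexer::PythonLexer` import path is an
/// assumption about how this type is re-exported, so the example is not
/// compiled:
///
/// ```ignore
/// use any_lexer::PythonLexer;
///
/// let lexer = PythonLexer::new("x = 42  # the answer");
/// for (tok, span) in lexer {
///     println!("{:?}: {:?}", tok, span.as_str());
/// }
/// ```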
#[derive(Clone, Debug)]
pub struct PythonLexer<'text> {
    scanner: Scanner<'text>,
}

impl<'text> PythonLexer<'text> {
    #[inline]
    pub fn new(text: &'text str) -> Self {
        Self {
            scanner: Scanner::new(text),
        }
    }
}

impl_lexer_from_scanner!('text, PythonLexer<'text>, PythonToken, scanner);

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_python_lexer_spans() {
        // This intentionally uses Rust code as input, as it only tests
        // that PythonLexer yields spans covering every input character
        let input = include_str!("../../../text-scanner/src/ext/rust.rs");
        let mut output = String::new();

        let lexer = PythonLexer::new(input);
        for (_tok, span) in lexer {
            output.push_str(span.as_str());
        }

        assert_eq!(input, output);
    }
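
    // Additional sanity check on actual Python input: a sketch that relies
    // only on what is already stated above, namely that spans losslessly
    // cover the input and that `Unknown` is never produced for valid code.
    #[test]
    fn test_python_lexer_python_input() {
        let input = "def add(a, b):\n    # sum two values\n    return a + b\n\nx = add(1, 2.5)\ns = \"hello\"\n";

        let mut output = String::new();
        let mut saw_unknown = false;

        let lexer = PythonLexer::new(input);
        for (tok, span) in lexer {
            saw_unknown |= tok == PythonToken::Unknown;
            output.push_str(span.as_str());
        }

        // Concatenating all spans must reproduce the input exactly.
        assert_eq!(input, output);
        // Valid Python code is documented to never produce `Unknown`.
        assert!(!saw_unknown);
    }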
}