rustpython_parser/
token.rs

1//! Token type for Python source code created by the lexer and consumed by the parser.
2//!
3//! This module defines the tokens that the lexer recognizes. The tokens are
4//! loosely based on the token definitions found in the [CPython source].
5//!
6//! [CPython source]: https://github.com/python/cpython/blob/dfc2e065a2e71011017077e549cd2f9bf4944c54/Include/internal/pycore_token.h
7use crate::ast::bigint::BigInt;
8use crate::{text_size::TextSize, Mode};
9use std::fmt;
10
/// The set of tokens that Python source code can be tokenized into.
///
/// Literal and identifier variants carry their value as fields; punctuation,
/// operator and keyword variants are plain unit variants.
#[derive(Clone, Debug, PartialEq, is_macro::Is)]
pub enum Tok {
    /// Token value for a name, commonly known as an identifier.
    Name {
        /// The name value.
        name: String,
    },
    /// Token value for an integer.
    Int {
        /// The integer value.
        value: BigInt,
    },
    /// Token value for a floating point number.
    Float {
        /// The float value.
        value: f64,
    },
    /// Token value for a complex number.
    Complex {
        /// The real part of the complex number.
        real: f64,
        /// The imaginary part of the complex number.
        imag: f64,
    },
    /// Token value for a string.
    String {
        /// The string value.
        value: String,
        /// The kind of string.
        kind: StringKind,
        /// Whether the string is triple quoted.
        triple_quoted: bool,
    },
    /// Token value for a comment. These are filtered out of the token stream prior to parsing.
    ///
    /// Only available when the `full-lexer` feature is enabled.
    #[cfg(feature = "full-lexer")]
    Comment(String),
    /// Token value for a newline.
    Newline,
    /// Token value for a newline that is not a logical line break. These are filtered out of
    /// the token stream prior to parsing.
    ///
    /// Only available when the `full-lexer` feature is enabled.
    #[cfg(feature = "full-lexer")]
    NonLogicalNewline,
    /// Token value for an indent.
    Indent,
    /// Token value for a dedent.
    Dedent,
    /// Token value for the end of the file.
    EndOfFile,
    /// Token value for a left parenthesis `(`.
    Lpar,
    /// Token value for a right parenthesis `)`.
    Rpar,
    /// Token value for a left square bracket `[`.
    Lsqb,
    /// Token value for a right square bracket `]`.
    Rsqb,
    /// Token value for a colon `:`.
    Colon,
    /// Token value for a comma `,`.
    Comma,
    /// Token value for a semicolon `;`.
    Semi,
    /// Token value for plus `+`.
    Plus,
    /// Token value for minus `-`.
    Minus,
    /// Token value for star `*`.
    Star,
    /// Token value for slash `/`.
    Slash,
    /// Token value for vertical bar `|`.
    Vbar,
    /// Token value for ampersand `&`.
    Amper,
    /// Token value for less than `<`.
    Less,
    /// Token value for greater than `>`.
    Greater,
    /// Token value for equal `=`.
    Equal,
    /// Token value for dot `.`.
    Dot,
    /// Token value for percent `%`.
    Percent,
    /// Token value for left brace `{`.
    Lbrace,
    /// Token value for right brace `}`.
    Rbrace,
    /// Token value for double equal `==`.
    EqEqual,
    /// Token value for not equal `!=`.
    NotEqual,
    /// Token value for less than or equal `<=`.
    LessEqual,
    /// Token value for greater than or equal `>=`.
    GreaterEqual,
    /// Token value for tilde `~`.
    Tilde,
    /// Token value for caret `^`.
    CircumFlex,
    /// Token value for left shift `<<`.
    LeftShift,
    /// Token value for right shift `>>`.
    RightShift,
    /// Token value for double star `**`.
    DoubleStar,
    /// Token value for double star equal `**=`.
    DoubleStarEqual,
    /// Token value for plus equal `+=`.
    PlusEqual,
    /// Token value for minus equal `-=`.
    MinusEqual,
    /// Token value for star equal `*=`.
    StarEqual,
    /// Token value for slash equal `/=`.
    SlashEqual,
    /// Token value for percent equal `%=`.
    PercentEqual,
    /// Token value for ampersand equal `&=`.
    AmperEqual,
    /// Token value for vertical bar equal `|=`.
    VbarEqual,
    /// Token value for caret equal `^=`.
    CircumflexEqual,
    /// Token value for left shift equal `<<=`.
    LeftShiftEqual,
    /// Token value for right shift equal `>>=`.
    RightShiftEqual,
    /// Token value for double slash `//`.
    DoubleSlash,
    /// Token value for double slash equal `//=`.
    DoubleSlashEqual,
    /// Token value for colon equal `:=`.
    ColonEqual,
    /// Token value for at `@`.
    At,
    /// Token value for at equal `@=`.
    AtEqual,
    /// Token value for arrow `->`.
    Rarrow,
    /// Token value for ellipsis `...`.
    Ellipsis,

    // Keywords. The three constants come first, followed by the remaining
    // keywords in (mostly) alphabetical order.
    /// Token value for the keyword `False`.
    False,
    /// Token value for the keyword `None`.
    None,
    /// Token value for the keyword `True`.
    True,

    /// Token value for the keyword `and`.
    And,
    /// Token value for the keyword `as`.
    As,
    /// Token value for the keyword `assert`.
    Assert,
    /// Token value for the keyword `async`.
    Async,
    /// Token value for the keyword `await`.
    Await,
    /// Token value for the keyword `break`.
    Break,
    /// Token value for the keyword `class`.
    Class,
    /// Token value for the keyword `continue`.
    Continue,
    /// Token value for the keyword `def`.
    Def,
    /// Token value for the keyword `del`.
    Del,
    /// Token value for the keyword `elif`.
    Elif,
    /// Token value for the keyword `else`.
    Else,
    /// Token value for the keyword `except`.
    Except,
    /// Token value for the keyword `finally`.
    Finally,
    /// Token value for the keyword `for`.
    For,
    /// Token value for the keyword `from`.
    From,
    /// Token value for the keyword `global`.
    Global,
    /// Token value for the keyword `if`.
    If,
    /// Token value for the keyword `import`.
    Import,
    /// Token value for the keyword `in`.
    In,
    /// Token value for the keyword `is`.
    Is,
    /// Token value for the keyword `lambda`.
    Lambda,
    /// Token value for the keyword `nonlocal`.
    Nonlocal,
    /// Token value for the keyword `not`.
    Not,
    /// Token value for the keyword `or`.
    Or,
    /// Token value for the keyword `pass`.
    Pass,
    /// Token value for the keyword `raise`.
    Raise,
    /// Token value for the keyword `return`.
    Return,
    /// Token value for the keyword `try`.
    Try,
    /// Token value for the keyword `while`.
    While,
    /// Token value for the keyword `match`.
    Match,
    /// Token value for the keyword `type`.
    Type,
    /// Token value for the keyword `case`.
    Case,
    /// Token value for the keyword `with`.
    With,
    /// Token value for the keyword `yield`.
    Yield,

    // RustPython specific: start markers, one per `Mode`, produced by
    // `Tok::start_marker`.
    /// Start marker returned by [`Tok::start_marker`] for [`Mode::Module`].
    StartModule,
    /// Start marker returned by [`Tok::start_marker`] for [`Mode::Interactive`].
    StartInteractive,
    /// Start marker returned by [`Tok::start_marker`] for [`Mode::Expression`].
    StartExpression,
}
201
202impl Tok {
203    pub fn start_marker(mode: Mode) -> Self {
204        match mode {
205            Mode::Module => Tok::StartModule,
206            Mode::Interactive => Tok::StartInteractive,
207            Mode::Expression => Tok::StartExpression,
208        }
209    }
210}
211
212impl fmt::Display for Tok {
213    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
214        use Tok::*;
215        match self {
216            Name { name } => write!(f, "'{name}'"),
217            Int { value } => write!(f, "'{value}'"),
218            Float { value } => write!(f, "'{value}'"),
219            Complex { real, imag } => write!(f, "{real}j{imag}"),
220            String {
221                value,
222                kind,
223                triple_quoted,
224            } => {
225                let quotes = "\"".repeat(if *triple_quoted { 3 } else { 1 });
226                write!(f, "{kind}{quotes}{value}{quotes}")
227            }
228            Newline => f.write_str("Newline"),
229            #[cfg(feature = "full-lexer")]
230            NonLogicalNewline => f.write_str("NonLogicalNewline"),
231            Indent => f.write_str("Indent"),
232            Dedent => f.write_str("Dedent"),
233            StartModule => f.write_str("StartProgram"),
234            StartInteractive => f.write_str("StartInteractive"),
235            StartExpression => f.write_str("StartExpression"),
236            EndOfFile => f.write_str("EOF"),
237            Lpar => f.write_str("'('"),
238            Rpar => f.write_str("')'"),
239            Lsqb => f.write_str("'['"),
240            Rsqb => f.write_str("']'"),
241            Colon => f.write_str("':'"),
242            Comma => f.write_str("','"),
243            #[cfg(feature = "full-lexer")]
244            Comment(value) => f.write_str(value),
245            Semi => f.write_str("';'"),
246            Plus => f.write_str("'+'"),
247            Minus => f.write_str("'-'"),
248            Star => f.write_str("'*'"),
249            Slash => f.write_str("'/'"),
250            Vbar => f.write_str("'|'"),
251            Amper => f.write_str("'&'"),
252            Less => f.write_str("'<'"),
253            Greater => f.write_str("'>'"),
254            Equal => f.write_str("'='"),
255            Dot => f.write_str("'.'"),
256            Percent => f.write_str("'%'"),
257            Lbrace => f.write_str("'{'"),
258            Rbrace => f.write_str("'}'"),
259            EqEqual => f.write_str("'=='"),
260            NotEqual => f.write_str("'!='"),
261            LessEqual => f.write_str("'<='"),
262            GreaterEqual => f.write_str("'>='"),
263            Tilde => f.write_str("'~'"),
264            CircumFlex => f.write_str("'^'"),
265            LeftShift => f.write_str("'<<'"),
266            RightShift => f.write_str("'>>'"),
267            DoubleStar => f.write_str("'**'"),
268            DoubleStarEqual => f.write_str("'**='"),
269            PlusEqual => f.write_str("'+='"),
270            MinusEqual => f.write_str("'-='"),
271            StarEqual => f.write_str("'*='"),
272            SlashEqual => f.write_str("'/='"),
273            PercentEqual => f.write_str("'%='"),
274            AmperEqual => f.write_str("'&='"),
275            VbarEqual => f.write_str("'|='"),
276            CircumflexEqual => f.write_str("'^='"),
277            LeftShiftEqual => f.write_str("'<<='"),
278            RightShiftEqual => f.write_str("'>>='"),
279            DoubleSlash => f.write_str("'//'"),
280            DoubleSlashEqual => f.write_str("'//='"),
281            At => f.write_str("'@'"),
282            AtEqual => f.write_str("'@='"),
283            Rarrow => f.write_str("'->'"),
284            Ellipsis => f.write_str("'...'"),
285            False => f.write_str("'False'"),
286            None => f.write_str("'None'"),
287            True => f.write_str("'True'"),
288            And => f.write_str("'and'"),
289            As => f.write_str("'as'"),
290            Assert => f.write_str("'assert'"),
291            Async => f.write_str("'async'"),
292            Await => f.write_str("'await'"),
293            Break => f.write_str("'break'"),
294            Class => f.write_str("'class'"),
295            Continue => f.write_str("'continue'"),
296            Def => f.write_str("'def'"),
297            Del => f.write_str("'del'"),
298            Elif => f.write_str("'elif'"),
299            Else => f.write_str("'else'"),
300            Except => f.write_str("'except'"),
301            Finally => f.write_str("'finally'"),
302            For => f.write_str("'for'"),
303            From => f.write_str("'from'"),
304            Global => f.write_str("'global'"),
305            If => f.write_str("'if'"),
306            Import => f.write_str("'import'"),
307            In => f.write_str("'in'"),
308            Is => f.write_str("'is'"),
309            Lambda => f.write_str("'lambda'"),
310            Nonlocal => f.write_str("'nonlocal'"),
311            Not => f.write_str("'not'"),
312            Or => f.write_str("'or'"),
313            Pass => f.write_str("'pass'"),
314            Raise => f.write_str("'raise'"),
315            Return => f.write_str("'return'"),
316            Try => f.write_str("'try'"),
317            While => f.write_str("'while'"),
318            Match => f.write_str("'match'"),
319            Type => f.write_str("'type'"),
320            Case => f.write_str("'case'"),
321            With => f.write_str("'with'"),
322            Yield => f.write_str("'yield'"),
323            ColonEqual => f.write_str("':='"),
324        }
325    }
326}
327
/// The kind of string literal as described in the [String and Bytes literals]
/// section of the Python reference.
///
/// The kind is determined by the literal's prefix; prefix letters are
/// accepted in either case.
///
/// [String and Bytes literals]: https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
#[derive(PartialEq, Eq, Debug, Clone, Hash, Copy)] // TODO: is_macro::Is
pub enum StringKind {
    /// A normal string literal with no prefix.
    String,
    /// An f-string literal, with an `f` or `F` prefix.
    FString,
    /// A byte string literal, with a `b` or `B` prefix.
    Bytes,
    /// A raw string literal, with an `r` or `R` prefix.
    RawString,
    /// A raw f-string literal, with a two-letter prefix made of `r` and `f`
    /// in either order and either case (`rf`, `fr`, `Rf`, `fR`, ...).
    RawFString,
    /// A raw byte string literal, with a two-letter prefix made of `r` and `b`
    /// in either order and either case (`rb`, `br`, `Rb`, `bR`, ...).
    RawBytes,
    /// A unicode string literal, with a `u` or `U` prefix.
    Unicode,
}
349
350impl TryFrom<char> for StringKind {
351    type Error = String;
352
353    fn try_from(ch: char) -> Result<Self, String> {
354        match ch {
355            'r' | 'R' => Ok(StringKind::RawString),
356            'f' | 'F' => Ok(StringKind::FString),
357            'u' | 'U' => Ok(StringKind::Unicode),
358            'b' | 'B' => Ok(StringKind::Bytes),
359            c => Err(format!("Unexpected string prefix: {c}")),
360        }
361    }
362}
363
364impl TryFrom<[char; 2]> for StringKind {
365    type Error = String;
366
367    fn try_from(chars: [char; 2]) -> Result<Self, String> {
368        match chars {
369            ['r' | 'R', 'f' | 'F'] => Ok(StringKind::RawFString),
370            ['f' | 'F', 'r' | 'R'] => Ok(StringKind::RawFString),
371            ['r' | 'R', 'b' | 'B'] => Ok(StringKind::RawBytes),
372            ['b' | 'B', 'r' | 'R'] => Ok(StringKind::RawBytes),
373            [c1, c2] => Err(format!("Unexpected string prefix: {c1}{c2}")),
374        }
375    }
376}
377
378impl fmt::Display for StringKind {
379    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
380        use StringKind::*;
381        match self {
382            String => f.write_str(""),
383            FString => f.write_str("f"),
384            Bytes => f.write_str("b"),
385            RawString => f.write_str("r"),
386            RawFString => f.write_str("rf"),
387            RawBytes => f.write_str("rb"),
388            Unicode => f.write_str("u"),
389        }
390    }
391}
392
393impl StringKind {
394    /// Returns true if the string is a raw string, i,e one of
395    /// [`StringKind::RawString`] or [`StringKind::RawFString`] or [`StringKind::RawBytes`].
396    pub fn is_raw(&self) -> bool {
397        use StringKind::{RawBytes, RawFString, RawString};
398        matches!(self, RawString | RawFString | RawBytes)
399    }
400
401    /// Returns true if the string is an f-string, i,e one of
402    /// [`StringKind::FString`] or [`StringKind::RawFString`].
403    pub fn is_any_fstring(&self) -> bool {
404        use StringKind::{FString, RawFString};
405        matches!(self, FString | RawFString)
406    }
407
408    /// Returns true if the string is a byte string, i,e one of
409    /// [`StringKind::Bytes`] or [`StringKind::RawBytes`].
410    pub fn is_any_bytes(&self) -> bool {
411        use StringKind::{Bytes, RawBytes};
412        matches!(self, Bytes | RawBytes)
413    }
414
415    /// Returns true if the string is a unicode string, i,e [`StringKind::Unicode`].
416    pub fn is_unicode(&self) -> bool {
417        matches!(self, StringKind::Unicode)
418    }
419
420    /// Returns the number of characters in the prefix.
421    pub fn prefix_len(&self) -> TextSize {
422        use StringKind::*;
423        let len = match self {
424            String => 0,
425            RawString | FString | Unicode | Bytes => 1,
426            RawFString | RawBytes => 2,
427        };
428        len.into()
429    }
430}