libcst_native/tokenizer/core/mod.rs

1// This implementation is Copyright (c) Meta Platforms, Inc. and affiliates.
2//
3// CPython 3.10.0a5 and the original C code this is based on is
4// Copyright (c) 2001-2021 Python Software Foundation; All Rights Reserved
5//
6// Portions of this module (f-string splitting) are based on parso's tokenize.py, which is also PSF
7// licensed.
8
9/// A port of CPython's tokenizer.c to Rust, with the following significant modifications:
10///
11/// - PEP 263 (encoding detection) support isn't implemented. We depend on other code to do this for
12///   us right now, and expect that the input is utf-8 by the time we see it.
13///
14/// - Removed support for tokenizing from a file handle without reading the whole file in at once.
15///   This significantly complicates parsing and memory is cheap, so we require that the whole file
16///   is read in and converted to a unicode string before tokenization can begin.
17///
18/// - Removed support for the interactive interpreter parsing mode.
19///
20/// - Tweaked the `translate_newlines` functionality and moved most of it into TextPosition. `\r`
21///   characters are no longer removed from the input buffer, so strings may contain `\r` characters
22///   that should be normalized prior to being interpreted.
23///
24/// - Added support for tracking more detailed position information via TextPosition. As a
25///   consequence, consuming and then backing up a character (`tok_nextc`/`tok_backup`) is more
26///   expensive, and we prefer to call `TextPosition::peek()` instead.
27///
28/// - Removed support for tokenizing type comments.
29///
30/// - Reduced the number of different supported token types to match what parso's tokenizer yields.
31///
32/// - Uses some regular expressions. Regular expressions are a good fit for a tokenizer, but we
33///   don't use them everywhere because we can't generate error messages that are as good with
34///   them.
35///
36/// - Added support for breaking apart f-strings into multiple tokens, matching Parso's tokenizer
37///   behavior. CPython instead runs the parser recursively to parse f-strings.
38///
39/// Also, in general, the code is less tightly optimized. The CPython implementation is heavily
40/// optimized in ways that wouldn't translate well to Rust (e.g. it parses the input utf-8 buffer
41/// as raw bytes instead of unicode codepoints).
42///
43/// The implementation should still be faster than any pure-Python implementation, and most
44/// optimizations (e.g. avoiding string copies when slicing) carry over to Rust very well.
45///
46/// Planned (not yet implemented) features:
47///
48/// - Add more feature flags to more closely match the behavior of older versions of Python 3.x.
49///
50/// - Support for a Python 2 mode that tokenizes Python 2.7 code and fails on certain new Python 3
51///   syntax that wasn't supported in 2.7.
52///
53/// - Maybe add back support for tokenizing type comments?
54///
55/// This implementation is tailored to LibCST's needs. If you're looking for a more general-purpose
56/// pure-Rust Python parser, consider using [RustPython's parser][].
57///
58/// [RustPython's parser]: https://crates.io/crates/rustpython-parser
59mod string_types;
60
61use regex::Regex;
62use std::cell::RefCell;
63use std::cmp::Ordering;
64use std::convert::TryInto;
65use std::fmt::Debug;
66use std::fmt::Formatter;
67use std::rc::Rc;
68
69use crate::tokenizer::core::string_types::FTStringType;
70use crate::tokenizer::{
71    core::string_types::{FTStringNode, StringQuoteChar, StringQuoteSize},
72    operators::OPERATOR_RE,
73    text_position::{TextPosition, TextPositionSnapshot},
74    whitespace_parser::State as WhitespaceState,
75};
76
77/// The maximum number of indentation levels at any given point in time. CPython's tokenizer.c caps
78/// this to avoid the complexity of allocating a dynamic array. Since we're using a Vec the cap
79/// isn't strictly necessary, but we keep it to maintain compatibility.
80const MAX_INDENT: usize = 100;
81
82// MAX_CHAR should be std::char::MAX once assoc_char_consts is stabilized.
83// https://github.com/rust-lang/rust/issues/71763
84const MAX_CHAR: char = '\u{10ffff}';
85
86thread_local! {
87    static SPACE_TAB_FORMFEED_RE: Regex = Regex::new(r"\A[ \f\t]+").expect("regex");
88    static ANY_NON_NEWLINE_RE: Regex = Regex::new(r"\A[^\r\n]+").expect("regex");
89    static STRING_PREFIX_RE: Regex =
90        Regex::new(r"\A(?i)(u|[bf]r|r[bft]|r|b|f|t)").expect("regex");
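    // For reference, the prefixes accepted by the alternation above (case-insensitively) are:
    // u, br, fr, rb, rf, rt, r, b, f, and t.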
91    static POTENTIAL_IDENTIFIER_TAIL_RE: Regex =
92        Regex::new(r"\A([a-zA-Z0-9_]|[^\x00-\x7f])+").expect("regex");
93    static DECIMAL_DOT_DIGIT_RE: Regex = Regex::new(r"\A\.[0-9]").expect("regex");
94    static DECIMAL_TAIL_RE: Regex =
95        Regex::new(r"\A[0-9](_?[0-9])*").expect("regex");
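    // For example, DECIMAL_TAIL_RE matches "42" and "1_000" but stops before a doubled or
    // trailing underscore; callers then peek for a leftover '_' and report it as an error.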
96    static HEXADECIMAL_TAIL_RE: Regex =
97        Regex::new(r"\A(_?[0-9a-fA-F])+").expect("regex");
98    static OCTAL_TAIL_RE: Regex = Regex::new(r"\A(_?[0-7])+").expect("regex");
99    static BINARY_TAIL_RE: Regex = Regex::new(r"\A(_?[01])+").expect("regex");
100
101    /// Used to verify identifiers when there's a non-ascii character in them.
102    // This changes across unicode revisions. We'd need to ship our own unicode tables to 100% match a
103    // given Python version's behavior.
104    static UNICODE_IDENTIFIER_RE: Regex =
105        Regex::new(r"\A[\p{XID_Start}_]\p{XID_Continue}*\z").expect("regex");
106}
107
108#[derive(Debug, Eq, PartialEq, Copy, Clone)]
109pub enum TokType {
110    String,
111    Name,
112    Number,
113    Op,
114    Newline,
115    Indent,
116    Dedent,
117    Async,
118    Await,
119    FStringStart,
120    FStringString,
121    FStringEnd,
122    TStringStart,
123    TStringString,
124    TStringEnd,
125    EndMarker,
126}
127
128#[derive(Debug, thiserror::Error, Eq, PartialEq)]
129pub enum TokError<'t> {
130    #[error("inconsistent mixing of tabs and spaces")]
131    TabSpace,
132    #[error("too many indentation levels")]
133    TooDeep,
134    #[error("no matching outer block for dedent")]
135    Dedent,
136    #[error("unexpected characters after a line continuation")]
137    LineContinuation,
138    #[error("unexpected end of file after a line continuation")]
139    LineContinuationEof,
140    #[error("{0:?} is not a valid identifier")]
141    BadIdentifier(&'t str),
142    #[error("invalid decimal literal")]
143    BadDecimal,
144    #[error(
145        "{}{}",
146        "leading zeros in decimal integer literals are not permitted; use an 0o prefix for octal ",
147        "integers"
148    )]
149    BadDecimalLeadingZeros,
150    #[error("invalid hexadecimal literal")]
151    BadHexadecimal,
152    #[error("invalid octal literal")]
153    BadOctal,
154    #[error("invalid digit {0:?} in octal literal")]
155    BadOctalDigit(char),
156    #[error("invalid binary literal")]
157    BadBinary,
158    #[error("invalid digit {0:?} in binary literal")]
159    BadBinaryDigit(char),
160    #[error("unterminated string literal")]
161    UnterminatedString,
162    #[error("unterminated triple-quoted string literal")]
163    UnterminatedTripleQuotedString,
164    #[error("unmatched {0:?}")]
165    UnmatchedClosingParen(char),
166    #[error("Closing parenthesis {1:?} does not match opening parenthesis {0:?}")]
167    MismatchedClosingParen(char, char),
168    #[error("Closing parenthesis {1:?} does not match opening parenthesis {0:?} on line {2:}")]
169    MismatchedClosingParenOnLine(char, char, usize),
170    #[error("{0:?} is not a valid character in this position")]
171    BadCharacter(char),
172}
173
174// Clone is used for async_hacks, which needs to speculatively look ahead one token.
175#[derive(Clone)]
176pub struct TokState<'t> {
177    /// The full program's source code (similar to `tok->str` or `tok->buf` in the CPython source
178    /// code). We don't support reading the file line-by-line from a file handle like CPython does,
179    /// so this is the whole program pre-converted to utf-8.
180    pub text_pos: TextPosition<'t>,
181    /// Start of the most recently returned token.
182    pub start_pos: TextPositionSnapshot,
183    /// True after we've encountered an error or there's no more text to process.
184    done: bool,
185    /// How many spaces a tab counts as (always 8)
186    tab_size: usize,
187    /// How many spaces a tab counts as in alt_indent_stack (always 1)
188    alt_tab_size: usize,
189    /// Stack of indentation levels where a tab is counted as 8 characters, used for tracking
190    /// dedents. Length is current indentation level. Should never have more than MAX_INDENT
191    /// entries.
192    indent_stack: Vec<usize>,
193    /// Used to check that tabs and spaces are not mixed.
194    alt_indent_stack: Vec<usize>,
195    /// Beginning of line. True if at the beginning of a new line.
196    at_bol: bool,
197    /// The number of bytes at the beginning of the line, as measured by consume_bol_whitespace.
198    /// Used by libcst to capture (and then validate and parse) the indentation.
199    pub bol_width: usize,
200    /// Set by `consume_bol_whitespace`, true if the current line is blank.
201    blank_line: bool,
202    /// Pending indents (if > 0) or dedents (if < 0). Used when multiple tokens need to be produced
203    /// at once.
204    pending_indents: i32,
205    /// Length is the `() [] {}` parenthesis nesting level. Used to allow free continuations inside
206    /// them. Stack entries are used to verify that each closing parenthesis matches its opening
207    /// parenthesis. Each entry is a (character, line number) tuple.
208    paren_stack: Vec<(char, usize)>,
209    /// Whether we're in a continuation line.
210    cont_line: bool,
211
212    /// True if async/await aren't always keywords.
213    async_hacks: bool,
214    /// True if tokens are inside an 'async def' body.
215    async_def: bool,
216    /// Indentation level of the outermost 'async def'.
217    async_def_indent: usize,
218    /// True if the outermost 'async def' had at least one NEWLINE token after it.
219    async_def_nl: bool,
220
221    /// Splits f-strings into multiple tokens instead of a STRING token if true.
222    ///
223    /// CPython doesn't directly split f-strings in the tokenizer (and therefore doesn't support
224    /// this option). Instead, when the parser encounters an f-string, it recursively re-runs the
225    /// tokenizer and parser.
226    ///
227    /// Supporting this at the tokenizer-level is pretty nasty and adds a lot of complexity.
228    /// Eventually, we should probably support this at the parser-level instead.
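    ///
    /// As an illustrative sketch of the resulting stream (derived from this implementation, not
    /// from CPython): with this option enabled, `f"a{b}c"` is emitted as roughly
    /// `FStringStart FStringString Op Name Op FStringString FStringEnd` rather than as a single
    /// `String` token.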
229    split_ftstring: bool,
230    ftstring_stack: Vec<FTStringNode>,
231
232    missing_nl_before_eof: bool,
233}
234
235pub struct TokConfig {
236    /// Used in Python 3.5 and 3.6. If enabled, async/await are sometimes keywords and sometimes
237    /// identifiers, depending on whether they're used in the context of an async function. This
238    /// breaks async comprehensions outside of async functions.
239    pub async_hacks: bool,
240    pub split_ftstring: bool,
241    // Not currently supported:
242    // type_comments: bool,
243}
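// A minimal usage sketch (illustrative; the expected token types are derived from the state
// machine below rather than from any external spec):
//
//     let config = TokConfig { async_hacks: false, split_ftstring: true };
//     let mut toks = TokState::new("x = 1\n", &config);
//     assert_eq!(toks.next(), Some(Ok(TokType::Name)));   // `x`
//     assert_eq!(toks.next(), Some(Ok(TokType::Op)));      // `=`
//     assert_eq!(toks.next(), Some(Ok(TokType::Number)));  // `1`
//     assert_eq!(toks.next(), Some(Ok(TokType::Newline)));
//     assert_eq!(toks.next(), Some(Ok(TokType::EndMarker)));
//     assert_eq!(toks.next(), None);
//
// `TokenIterator` (defined at the bottom of this file) wraps the same state machine and yields
// full `Token` values with positions and whitespace handles instead of bare `TokType`s.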
244
245fn is_digit<C: Into<Option<char>>>(ch: C) -> bool {
246    matches!(ch.into(), Some('0'..='9'))
247}
248
249#[derive(Debug)]
250enum NumberState {
251    StartDigit,
252    Fraction,
253    Exponent,
254    Imaginary,
255}
256
257impl<'t> TokState<'t> {
258    pub fn new(text: &'t str, config: &TokConfig) -> Self {
259        let text_pos = TextPosition::new(text);
260        let start_pos = (&text_pos).into();
261        Self {
262            text_pos,
263            start_pos,
264            done: false,
265            tab_size: 8,
266            alt_tab_size: 1,
267            indent_stack: Vec::new(),
268            alt_indent_stack: Vec::new(),
269            at_bol: true,
270            bol_width: 0,
271            blank_line: false,
272            pending_indents: 0,
273            paren_stack: Vec::new(),
274            cont_line: false,
275            async_hacks: config.async_hacks,
276            async_def: false,
277            async_def_indent: 0,
278            async_def_nl: false,
279            split_ftstring: config.split_ftstring,
280            ftstring_stack: Vec::new(),
281            missing_nl_before_eof: text.is_empty() || text.as_bytes()[text.len() - 1] != b'\n',
282        }
283    }
284
285    pub fn is_parenthesized(&self) -> bool {
286        !self.paren_stack.is_empty()
287    }
288
289    /// Implementation of `next()`, wrapped by next() to allow for easier error handling. Roughly
290    /// equivalent to `tok_get` in the C source code.
291    fn next_inner(&mut self) -> Result<TokType, TokError<'t>> {
292        if self.split_ftstring {
293            if let Some(tos) = self.ftstring_stack.last() {
294                if !tos.is_in_expr() {
295                    self.start_pos = (&self.text_pos).into();
296                    let is_in_format_spec = tos.is_in_format_spec();
297                    let is_raw_string = tos.is_raw_string;
298                    if let Some(tok) =
299                        self.maybe_consume_ftstring_string(is_in_format_spec, is_raw_string)?
300                    {
301                        return Ok(tok);
302                    }
303                    if let Some(tok) = self.maybe_consume_ftstring_end() {
304                        return Ok(tok);
305                    }
306                }
307            }
308        }
309
310        // This never produces a token on its own, but it may set blank_line and it may set
311        // pending_indents.
312        self.consume_bol_whitespace()?;
313
314        // Return pending indents/dedents
315        if let Some(t) = self.process_pending_indents() {
316            self.start_pos = (&self.text_pos).into();
317            return Ok(t);
318        }
319
320        self.maybe_close_async_def();
321
322        'again: loop {
323            // Skip spaces
324            SPACE_TAB_FORMFEED_RE.with(|v| self.text_pos.consume(v));
325
326            // Skip comment, unless it's a type comment
327            if self.text_pos.peek() == Some('#') {
328                ANY_NON_NEWLINE_RE.with(|v| self.text_pos.consume(v));
329                // type_comment is not supported
330            }
331
332            // Set start of current token
333            self.start_pos = (&self.text_pos).into();
334
335            return match self.text_pos.peek() {
336                // Check for EOF now
337                None => {
338                    if self.missing_nl_before_eof && !self.blank_line {
339                        self.at_bol = true;
340                        self.missing_nl_before_eof = false;
341                        Ok(TokType::Newline)
342                    } else {
343                        let hanging_indents = self.indent_stack.len() as i32;
344                        if self.pending_indents == 0 && hanging_indents != 0 {
345                            // We've reached EOF but there are still pending indents not
346                            // accounted for. Flush them out.
347                            self.pending_indents = -hanging_indents;
348                            self.indent_stack.clear();
349                            self.alt_indent_stack.clear();
350                            self.missing_nl_before_eof = false;
351                        }
352                        if let Some(t) = self.process_pending_indents() {
353                            Ok(t)
354                        } else {
355                            Ok(TokType::EndMarker)
356                        }
357                    }
358                }
359
360                // Identifier (most frequent token!)
361                Some('a'..='z') | Some('A'..='Z') | Some('_') | Some('\u{80}'..=MAX_CHAR) => {
362                    self.consume_identifier_or_prefixed_string()
363                }
364
365                // Newline
366                Some('\n') => {
367                    self.text_pos.next();
368                    self.at_bol = true;
369                    if self.split_ftstring
370                        && self
371                            .ftstring_stack
372                            .last()
373                            .map(|node| node.allow_multiline())
374                            == Some(false)
375                    {
376                        Err(TokError::UnterminatedString)
377                    } else if self.blank_line || !self.paren_stack.is_empty() {
378                        // this newline doesn't count
379                        // recurse (basically `goto nextline`)
380                        self.next_inner()
381                    } else {
382                        self.cont_line = false;
383                        if self.async_def {
384                            self.async_def_nl = true;
385                        }
386                        Ok(TokType::Newline)
387                    }
388                }
389
390                // Ellipsis
391                Some('.') if self.text_pos.consume("...") => {
392                    return Ok(TokType::Op);
393                }
394
395                // Number starting with period
396                Some('.') if DECIMAL_DOT_DIGIT_RE.with(|r| self.text_pos.matches(r)) => {
397                    self.consume_number(NumberState::Fraction)
398                }
399
400                // Dot
401                Some('.') => {
402                    self.text_pos.next();
403                    Ok(TokType::Op)
404                }
405
406                // Number
407                Some('0'..='9') => self.consume_number(NumberState::StartDigit),
408
409                // String
410                Some('\'') | Some('"') => self.consume_string(),
411
412                // Line continuation
413                Some('\\') => {
414                    self.text_pos.next();
415                    if let Some('\n') = self.text_pos.next() {
416                        if self.text_pos.peek() == None {
417                            Err(TokError::LineContinuationEof)
418                        } else {
419                            self.cont_line = true;
420                            // Read next line
421                            continue 'again;
422                        }
423                    } else {
424                        Err(TokError::LineContinuation)
425                    }
426                }
427
428                Some(ch @ '(') | Some(ch @ '[') | Some(ch @ '{') => {
429                    self.text_pos.next();
430                    if let Some(tos) = self.ftstring_stack.last_mut() {
431                        tos.open_parentheses();
432                    }
433                    self.paren_stack.push((ch, self.text_pos.line_number()));
434                    Ok(TokType::Op)
435                }
436
437                Some(closing @ ')') | Some(closing @ ']') | Some(closing @ '}') => {
438                    self.text_pos.next();
439                    if let Some(tos) = self.ftstring_stack.last_mut() {
440                        tos.close_parentheses();
441                    }
442                    if let Some((opening, line_number)) = self.paren_stack.pop() {
443                        match (opening, closing) {
444                            ('(', ')') | ('[', ']') | ('{', '}') => Ok(TokType::Op),
445                            _ => {
446                                if line_number != self.text_pos.line_number() {
447                                    Err(TokError::MismatchedClosingParenOnLine(
448                                        opening,
449                                        closing,
450                                        line_number,
451                                    ))
452                                } else {
453                                    Err(TokError::MismatchedClosingParen(opening, closing))
454                                }
455                            }
456                        }
457                    } else {
458                        Err(TokError::UnmatchedClosingParen(closing))
459                    }
460                }
461
462                Some(':')
463                    if self
464                        .ftstring_stack
465                        .last()
466                        .map(|tos| tos.parentheses_count - tos.format_spec_count == 1)
467                        .unwrap_or(false) =>
468                {
469                    // N.B. This may capture the walrus operator and pass it to the formatter.
470                    // That's intentional. PEP 572 says: "Assignment expressions inside of f-strings
471                    // require parentheses."
472                    //
473                    // >>> f'{x:=10}'    # Valid, passes '=10' to formatter
474                    let tos = self
475                        .ftstring_stack
476                        .last_mut()
477                        .expect("ftstring_stack is not empty");
478                    tos.format_spec_count += 1;
479                    self.text_pos.next();
480                    Ok(TokType::Op)
481                }
482
483                // Operator
484                Some(_) if OPERATOR_RE.with(|r| self.text_pos.consume(r)) => Ok(TokType::Op),
485
486                // Bad character
487                // If nothing works, fall back to this error. CPython returns an OP in this case,
488                // and then just relies on the parser to generate a generic syntax error.
489                Some(ch) => Err(TokError::BadCharacter(ch)),
490            };
491        }
492    }
493
494    /// Consumes the whitespace (and comments) at the beginning of the line. May emit an error. Will
495    /// mutate `pending_indents`, so you must check `pending_indents` after calling this.
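    ///
    /// As a rough illustration of the downstream effect (derived from this implementation):
    /// tokenizing `"if x:\n    y\nz\n"` yields
    /// `Name Name Op Newline Indent Name Newline Dedent Name Newline EndMarker`, where the
    /// `Indent`/`Dedent` pair comes from the column changes measured here.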
496    fn consume_bol_whitespace(&mut self) -> Result<(), TokError<'t>> {
497        self.blank_line = false;
498        if !self.at_bol {
499            return Ok(());
500        }
501
502        let mut col = 0; // column where tab counts as 8 characters
503        let mut altcol = 0; // column where tab counts as 1 character
504        self.at_bol = false;
505        self.bol_width = 0;
506
507        // consume space, tab, and formfeed characters
508        loop {
509            match self.text_pos.peek() {
510                Some(' ') => {
511                    col += 1;
512                    altcol += 1;
513                    self.bol_width += 1;
514                    self.text_pos.next();
515                }
516                Some('\t') => {
517                    // Increment both col and altcol using different tab sizes. Tabs snap to the
518                    // next multiple of self.tab_size.
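                    // For example, with tab_size == 8 a tab at col 3 advances col to 8, and a
                    // tab at col 8 advances it to 16.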
519                    col = (col / self.tab_size + 1) * self.tab_size;
520                    // altcol will later be used for detecting mixed tabs and spaces.
521                    altcol = (altcol / self.alt_tab_size + 1) * self.alt_tab_size;
522                    self.bol_width += 1;
523                    self.text_pos.next();
524                }
525                // Control-L (formfeed) for emacs users
526                Some('\x0c') => {
527                    col = 0;
528                    altcol = 0;
529                    self.bol_width += 1;
530                    self.text_pos.next();
531                }
532                _ => {
533                    break;
534                }
535            }
536        }
537
538        // Lines with only whitespace and/or comments and/or a line continuation
539        // character shouldn't affect the indentation and are not passed to the parser
540        // as NEWLINE tokens.
541        self.blank_line = matches!(
542            self.text_pos.peek(),
543            Some('#') | Some('\n') | Some('\\') | None
544        );
545
546        if self.blank_line || !self.paren_stack.is_empty() {
547            return Ok(());
548        }
549
550        let prev_col = self.indent_stack.last().unwrap_or(&0);
551        match col.cmp(prev_col) {
552            Ordering::Equal => {
553                // No change
554                if altcol != *self.alt_indent_stack.last().unwrap_or(&0) {
555                    return Err(TokError::TabSpace);
556                }
557            }
558            Ordering::Greater => {
559                // col > prev_col
560                // Indent -- always one
561                if self.indent_stack.len() + 1 >= MAX_INDENT {
562                    return Err(TokError::TooDeep);
563                }
564                // col > prev_col, therefore altcol > prev_altcol, unless there's badly mixed tabs
565                // and spaces
566                if altcol <= *self.alt_indent_stack.last().unwrap_or(&0) {
567                    return Err(TokError::TabSpace);
568                }
569                // only emit indents if we're not at EOF
570                if self.text_pos.peek().is_some() {
571                    self.pending_indents += 1;
572                    self.indent_stack.push(col);
573                    self.alt_indent_stack.push(altcol);
574                }
575            }
576            Ordering::Less => {
577                // col < prev_col
578                // Dedent -- any number, must be consistent
579                while matches!(self.indent_stack.last(), Some(&ind_cols) if col < ind_cols) {
580                    self.pending_indents -= 1;
581                    self.indent_stack.pop();
582                    self.alt_indent_stack.pop();
583                }
584                if col != *self.indent_stack.last().unwrap_or(&0) {
585                    return Err(TokError::Dedent);
586                }
587                if altcol != *self.alt_indent_stack.last().unwrap_or(&0) {
588                    return Err(TokError::TabSpace);
589                }
590            }
591        }
592
593        Ok(())
594    }
595
596    fn process_pending_indents(&mut self) -> Option<TokType> {
597        if self.pending_indents != 0 {
598            if self.pending_indents < 0 {
599                self.pending_indents += 1;
600                Some(TokType::Dedent)
601            } else {
602                self.pending_indents -= 1;
603                Some(TokType::Indent)
604            }
605        } else {
606            None
607        }
608    }
609
610    fn maybe_close_async_def(&mut self) {
611        // Check if we are closing an async function
612        if self.async_def
613            && !self.blank_line
614            // (This is irrelevant to the rust implementation which doesn't support type_comments
615            // yet, but the comment is preserved for posterity)
616            // Due to some implementation artifacts of type comments, a TYPE_COMMENT at the start of
617            // a function won't set an indentation level and it will produce a NEWLINE after it. To
618            // avoid spuriously ending an async function due to this, wait until we have some
619            // non-newline char in front of us.
620            // && self.text_pos.peek() == Some('\n')
621            && self.paren_stack.is_empty()
622            // There was a NEWLINE after ASYNC DEF, so we're past the signature.
623            && self.async_def_nl
624            // Current indentation level is less than where the async function was defined
625            && self.async_def_indent >= self.indent_stack.len()
626        {
627            self.async_def = false;
628            self.async_def_indent = 0;
629            self.async_def_nl = false;
630        }
631    }
632
633    fn consume_identifier_or_prefixed_string(&mut self) -> Result<TokType, TokError<'t>> {
634        // Process the various legal combinations of b"", r"", u"", f"", and t"".
635        if STRING_PREFIX_RE.with(|r| self.text_pos.consume(r)) {
636            if let Some('"') | Some('\'') = self.text_pos.peek() {
637                // We found a string, not an identifier. Bail!
638                if self.split_ftstring {
639                    let res = match self
640                        .text_pos
641                        .slice_from_start_pos(&self.start_pos)
642                        .chars()
643                        .find(|c| matches!(c, 'f' | 'F' | 't' | 'T'))
644                    {
645                        Some('f' | 'F') => Some(FTStringType::FString),
646                        Some('t' | 'T') => Some(FTStringType::TString),
647                        _ => None,
648                    };
649                    if let Some(str_type) = res {
650                        // Consume the prefix and return the start token
651                        return self.consume_prefixed_string_start(str_type);
652                    }
653                }
654                return self.consume_string();
655            }
656        } else {
657            // the next character must be a potential identifier start, aka `[a-zA-Z_]|[^\x00-\x7f]`
658            let first_ch = self.text_pos.next();
659            debug_assert!(matches!(
660                first_ch,
661                Some('a'..='z') | Some('A'..='Z') | Some('_') | Some('\u{80}'..=MAX_CHAR)
662            ));
663        }
664        POTENTIAL_IDENTIFIER_TAIL_RE.with(|r| self.text_pos.consume(r));
665        let identifier_str = self.text_pos.slice_from_start_pos(&self.start_pos);
666        if !verify_identifier(identifier_str) {
667            // TODO: async/await
668            return Err(TokError::BadIdentifier(identifier_str));
669        }
670
671        let allow_async = !self.async_hacks || self.async_def;
672        match (identifier_str, allow_async) {
673            ("async", true) => Ok(TokType::Async),
674            ("await", true) => Ok(TokType::Await),
675            ("async", false) => {
676                // The current token is 'async' and async_hacks is enabled.
677                // Look ahead one token to see if that is 'def'.
678                // This clone is expensive, but modern code doesn't need async_hacks.
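                // For example, with async_hacks enabled outside any async function,
                // `async def f(): ...` yields Async for `async`, while `async = 1` yields Name.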
679                let mut lookahead_state = self.clone();
680                if lookahead_state.next_inner() == Ok(TokType::Name)
681                    && lookahead_state
682                        .text_pos
683                        .slice_from_start_pos(&lookahead_state.start_pos)
684                        == "def"
685                {
686                    self.async_def = true;
687                    self.async_def_indent = self.indent_stack.len();
688                    Ok(TokType::Async)
689                } else {
690                    Ok(TokType::Name)
691                }
692            }
693            _ => Ok(TokType::Name),
694        }
695    }
696
697    fn consume_number(&mut self, state: NumberState) -> Result<TokType, TokError<'t>> {
698        // This is organized as a state machine. The match could also be rewritten into multiple
699        // functions, but this is closer to how the C code is written (with gotos).
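        // As a worked example (illustrative): for the literal `1.5e3j` we enter at StartDigit,
        // consume "1", see '.' and recurse into Fraction, consume "5", see 'e' and recurse into
        // Exponent, consume "3", see 'j' and recurse into Imaginary, then return TokType::Number.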
700        match state {
701            NumberState::StartDigit => {
702                let start_digit_ch = self.text_pos.peek();
703                debug_assert!(is_digit(start_digit_ch));
704
705                if start_digit_ch == Some('0') {
706                    self.text_pos.next();
707                    match self.text_pos.peek() {
708                        Some('x') | Some('X') => {
709                            self.text_pos.next();
710                            if !HEXADECIMAL_TAIL_RE.with(|r| self.text_pos.consume(r))
711                                || self.text_pos.peek() == Some('_')
712                            {
713                                Err(TokError::BadHexadecimal)
714                            } else {
715                                Ok(TokType::Number)
716                            }
717                        }
718                        Some('o') | Some('O') => {
719                            self.text_pos.next();
720                            if !OCTAL_TAIL_RE.with(|r| self.text_pos.consume(r))
721                                || self.text_pos.peek() == Some('_')
722                            {
723                                return Err(TokError::BadOctal);
724                            }
725                            if let Some(next_ch) = self.text_pos.peek() {
726                                if is_digit(next_ch) {
727                                    return Err(TokError::BadOctalDigit(next_ch));
728                                }
729                            }
730                            Ok(TokType::Number)
731                        }
732                        Some('b') | Some('B') => {
733                            self.text_pos.next();
734                            if !BINARY_TAIL_RE.with(|r| self.text_pos.consume(r))
735                                || self.text_pos.peek() == Some('_')
736                            {
737                                return Err(TokError::BadBinary);
738                            }
739                            if let Some(next_ch) = self.text_pos.peek() {
740                                if is_digit(next_ch) {
741                                    return Err(TokError::BadBinaryDigit(next_ch));
742                                }
743                            }
744                            Ok(TokType::Number)
745                        }
746                        _ => {
747                            let mut nonzero = false;
748                            // Maybe old-style octal. In any case, allow '0' as a literal
749                            loop {
750                                if self.text_pos.peek() == Some('_') {
751                                    self.text_pos.next();
752                                    if !is_digit(self.text_pos.peek()) {
753                                        return Err(TokError::BadDecimal);
754                                    }
755                                }
756                                if self.text_pos.peek() != Some('0') {
757                                    break;
758                                }
759                                self.text_pos.next();
760                            }
761                            if is_digit(self.text_pos.peek()) {
762                                nonzero = true;
763                                self.consume_decimal_tail()?;
764                            }
765                            if self.text_pos.peek() == Some('.') {
766                                self.consume_number(NumberState::Fraction)
767                            } else if let Some('e') | Some('E') = self.text_pos.peek() {
768                                self.consume_number(NumberState::Exponent)
769                            } else if let Some('j') | Some('J') = self.text_pos.peek() {
770                                self.consume_number(NumberState::Imaginary)
771                            } else if nonzero {
772                                Err(TokError::BadDecimalLeadingZeros)
773                            } else {
774                                Ok(TokType::Number)
775                            }
776                        }
777                    }
778                } else {
779                    self.consume_decimal_tail()?;
780                    if self.text_pos.peek() == Some('.') {
781                        self.consume_number(NumberState::Fraction)
782                    } else if let Some('e') | Some('E') = self.text_pos.peek() {
783                        self.consume_number(NumberState::Exponent)
784                    } else if let Some('j') | Some('J') = self.text_pos.peek() {
785                        self.consume_number(NumberState::Imaginary)
786                    } else {
787                        Ok(TokType::Number)
788                    }
789                }
790            }
791            NumberState::Fraction => {
792                let dot_ch = self.text_pos.next();
793                debug_assert!(dot_ch == Some('.'));
794
795                if is_digit(self.text_pos.peek()) {
796                    self.consume_decimal_tail()?;
797                }
798                if let Some('e') | Some('E') = self.text_pos.peek() {
799                    self.consume_number(NumberState::Exponent)
800                } else if let Some('j') | Some('J') = self.text_pos.peek() {
801                    self.consume_number(NumberState::Imaginary)
802                } else {
803                    Ok(TokType::Number)
804                }
805            }
806            NumberState::Exponent => {
807                let e_ch = self.text_pos.next();
808                debug_assert!(matches!(e_ch, Some('e') | Some('E')));
809
810                if let Some('+') | Some('-') = self.text_pos.peek() {
811                    self.text_pos.next();
812                    if !is_digit(self.text_pos.peek()) {
813                        return Err(TokError::BadDecimal);
814                    }
815                } else if !is_digit(self.text_pos.peek()) {
816                    // Don't consume the 'e'. It could be part of an identifier after this number.
817                    self.text_pos.backup_no_newline();
818                    return Ok(TokType::Number);
819                }
820                self.consume_decimal_tail()?;
821                if let Some('j') | Some('J') = self.text_pos.peek() {
822                    self.consume_number(NumberState::Imaginary)
823                } else {
824                    Ok(TokType::Number)
825                }
826            }
827            NumberState::Imaginary => {
828                let j_ch = self.text_pos.next();
829                debug_assert!(matches!(j_ch, Some('j') | Some('J')));
830
831                Ok(TokType::Number)
832            }
833        }
834    }
835
836    /// Processes a decimal tail. This is the bit after the dot or after an E in a float.
837    fn consume_decimal_tail(&mut self) -> Result<(), TokError<'t>> {
838        let result = DECIMAL_TAIL_RE.with(|r| self.text_pos.consume(r));
839        // Assumption: If we've been called, the first character is a digit, so we must have a
840        // regex match.
841        debug_assert!(result, "consume_decimal_tail was called on a non-digit char");
842        if self.text_pos.peek() == Some('_') {
843            Err(TokError::BadDecimal)
844        } else {
845            Ok(())
846        }
847    }
848
849    fn consume_open_quote(&mut self) -> (StringQuoteChar, StringQuoteSize) {
850        let quote_char: StringQuoteChar = self
851            .text_pos
852            .peek()
853            .try_into()
854            .expect("the next character must be a quote when calling consume_open_quote");
855        let triple_quote_pattern = quote_char.triple_str();
856        let quote_size = if self.text_pos.consume(triple_quote_pattern) {
857            StringQuoteSize::Triple
858        } else {
859            self.text_pos.next(); // consume the single character instead
860            StringQuoteSize::Single
861        };
862        (quote_char, quote_size)
863    }
864
865    fn consume_string(&mut self) -> Result<TokType, TokError<'t>> {
866        // Assumption: The opening quote has not been consumed. Leading characters (b, r, f, etc)
867        // have been consumed.
868        let (quote_char, quote_size) = self.consume_open_quote();
869        let quote_raw = quote_char.into();
870
871        let mut end_quote_size: usize = 0;
872        let quote_usize: usize = quote_size.into();
873        while end_quote_size != quote_usize {
874            match (self.text_pos.next(), quote_size) {
875                (None, StringQuoteSize::Triple) => {
876                    return Err(TokError::UnterminatedTripleQuotedString);
877                }
878                (None, StringQuoteSize::Single) | (Some('\n'), StringQuoteSize::Single) => {
879                    return Err(TokError::UnterminatedString);
880                }
881                (ch @ Some('\''), _) | (ch @ Some('"'), _) if ch == Some(quote_raw) => {
882                    end_quote_size += 1;
883                }
884                (Some(ch), _) => {
885                    end_quote_size = 0;
886                    if ch == '\\' {
887                        // skip escaped char
888                        self.text_pos.next();
889                    }
890                }
891            }
892        }
893
894        Ok(TokType::String)
895    }
896
897    fn consume_prefixed_string_start(
898        &mut self,
899        str_type: FTStringType,
900    ) -> Result<TokType, TokError<'t>> {
901        // Consumes everything after the (f|t) but before the actual string.
902        let (quote_char, quote_size) = self.consume_open_quote();
903        let is_raw_string = self
904            .text_pos
905            .slice_from_start_pos(&self.start_pos)
906            .contains(&['r', 'R'][..]);
907        self.ftstring_stack.push(FTStringNode::new(
908            quote_char,
909            quote_size,
910            is_raw_string,
911            str_type.clone(),
912        ));
913
914        match str_type {
915            FTStringType::FString => Ok(TokType::FStringStart),
916            FTStringType::TString => Ok(TokType::TStringStart),
917        }
918    }
919
920    fn maybe_consume_ftstring_string(
921        &mut self,
922        is_in_format_spec: bool,
923        is_raw_string: bool,
924    ) -> Result<Option<TokType>, TokError<'t>> {
925        let allow_multiline = self
926            .ftstring_stack
927            .last()
928            .map(|node| node.allow_multiline())
929            == Some(true);
930        let str_type = self
931            .ftstring_stack
932            .last()
933            .map(|node| node.string_type.clone());
934        let mut in_named_unicode: bool = false;
935        let mut ok_result = Ok(None); // value to return if we reach the end and don't error out
936        'outer: loop {
937            match (self.text_pos.peek(), allow_multiline) {
938                (None, true) => {
939                    return Err(TokError::UnterminatedTripleQuotedString);
940                }
941                (None, false) | (Some('\n'), false) => {
942                    return Err(TokError::UnterminatedString);
943                }
944                (ch @ Some('\''), _) | (ch @ Some('"'), _) => {
945                    // see if this actually terminates the most recent fstring
946                    if let Some(node) = self.ftstring_stack.last() {
947                        if ch == Some(node.quote_char.into()) {
948                            match node.quote_size {
949                                StringQuoteSize::Single => {
950                                    break 'outer;
951                                }
952                                StringQuoteSize::Triple => {
953                                    if self.text_pos.matches(node.quote_char.triple_str()) {
954                                        break 'outer;
955                                    }
956                                }
957                            }
958                        }
959                    }
960                    self.text_pos.next();
961                }
962                (Some('\\'), _) if !is_raw_string => {
963                    self.text_pos.next();
964                    if is_in_format_spec {
965                        if let Some('{') | Some('}') = self.text_pos.peek() {
966                            // don't consume { or } because we want those to be interpreted as OP
967                            // tokens
968                        } else {
969                            // skip escaped char (e.g. \', \", or newline/line continuation)
970                            self.text_pos.next();
971                        }
972                    } else if let Some(
973                        '\n'
974                        | '\\'
975                        | '\''
976                        | '"'
977                        | 'a'
978                        | 'b'
979                        | 'f'
980                        | 'n'
981                        | 'r'
982                        | 't'
983                        | 'v'
984                        | 'x'
985                        | '0'..='9'
986                        | 'N'
987                        | 'u'
988                        | 'U',
989                    ) = self.text_pos.peek()
990                    {
991                        // skip escaped char
992                        let next_ch = self.text_pos.next();
993                        // check if this is a \N sequence
994                        if let Some('N') = next_ch {
995                            // swallow the next open curly brace if it exists
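                            // (e.g. in f"\N{BULLET}" the brace pair belongs to the escape
                            // sequence, not to an interpolated expression)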
996                            if let Some('{') = self.text_pos.peek() {
997                                in_named_unicode = true;
998                                self.text_pos.next();
999                            }
1000                        }
1001                    }
1002                }
1003                (Some('\\'), _) if is_raw_string => {
1004                    self.text_pos.next();
1005                    // skip escaped end-of-string marker or backslash
1006                    if let Some('"' | '\'' | '\\') = self.text_pos.peek() {
1007                        self.text_pos.next();
1008                    }
1009                }
1010                (Some('{'), _) => {
1011                    if is_in_format_spec {
1012                        // don't actually consume the {, and generate an OP for it instead
1013                        break 'outer;
1014                    }
1015                    let consumed_double = self.text_pos.consume("{{");
1016                    if !consumed_double {
1017                        break 'outer;
1018                    }
1019                }
1020                (Some('}'), _) => {
1021                    if in_named_unicode {
1022                        in_named_unicode = false;
1023                        self.text_pos.next();
1024                    } else if is_in_format_spec {
1025                        // don't actually consume the }, and generate an OP for it instead
1026                        break 'outer;
1027                    } else if !self.text_pos.consume("}}") {
1028                        return Err(TokError::UnmatchedClosingParen('}'));
1029                    }
1030                }
1031                _ => {
1032                    self.text_pos.next();
1033                }
1034            }
1035            ok_result = match str_type {
1036                Some(FTStringType::FString) => Ok(Some(TokType::FStringString)),
1037                Some(FTStringType::TString) => Ok(Some(TokType::TStringString)),
1038                None => unreachable!("We should always have a string type"),
1039            };
1040        }
1041        ok_result
1042    }
1043
1044    fn maybe_consume_ftstring_end(&mut self) -> Option<TokType> {
1045        let ch = self.text_pos.peek();
1046        if let Some(node) = self.ftstring_stack.last() {
1047            if ch == Some(node.quote_char.into()) {
1048                if node.quote_size == StringQuoteSize::Triple {
1049                    self.text_pos.consume(node.quote_char.triple_str());
1050                } else {
1051                    self.text_pos.next(); // already matched
1052                }
1053                let tok_type = match node.string_type {
1054                    FTStringType::FString => TokType::FStringEnd,
1055                    FTStringType::TString => TokType::TStringEnd,
1056                };
1057                self.ftstring_stack.pop();
1058                return Some(tok_type);
1059            }
1060        }
1061        None
1062    }
1063}
1064
1065impl<'t> Iterator for TokState<'t> {
1066    type Item = Result<TokType, TokError<'t>>;
1067
1068    /// Returns the next token type.
1069    fn next(&mut self) -> Option<Result<TokType, TokError<'t>>> {
1070        // This implementation wraps `next_inner`, which does the actual work.
1071        if self.done {
1072            None
1073        } else {
1074            match self.next_inner() {
1075                Err(err) => {
1076                    self.done = true;
1077                    Some(Err(err))
1078                }
1079                Ok(TokType::EndMarker) => {
1080                    self.done = true;
1081                    Some(Ok(TokType::EndMarker))
1082                }
1083                Ok(t) => Some(Ok(t)),
1084            }
1085        }
1086    }
1087}
1088
1089/// Returns true if the given string is a valid Python 3.x identifier. Follows [PEP 3131][].
1090///
1091/// [PEP 3131]: https://www.python.org/dev/peps/pep-3131/
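///
/// For example (illustrative): `verify_identifier("λ")` is true, while `verify_identifier("x🙂")`
/// is false because the emoji is not in XID_Continue. Pure-ASCII input short-circuits to true; the
/// tokenizer is assumed to have already validated those characters.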
1092fn verify_identifier(name: &str) -> bool {
1093    // TODO: If `name` is non-ascii, must first normalize name to NFKC.
1094    // Common case: If the entire string is ascii, we can avoid the more expensive regex check,
1095    // since the tokenizer already validates ascii characters before calling us.
1096    name.is_ascii() || UNICODE_IDENTIFIER_RE.with(|r| r.is_match(name))
1097}
1098
1099#[derive(Clone)]
1100pub struct Token<'a> {
1101    pub r#type: TokType,
1102    pub string: &'a str,
1103    pub start_pos: TextPositionSnapshot,
1104    pub end_pos: TextPositionSnapshot,
1105    pub whitespace_before: Rc<RefCell<WhitespaceState<'a>>>,
1106    pub whitespace_after: Rc<RefCell<WhitespaceState<'a>>>,
1107    pub relative_indent: Option<&'a str>,
1108}
1109
1110impl<'a> Debug for Token<'a> {
1111    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
1112        write!(
1113            f,
1114            "Token({:?}, {}, start={:?}, end={:?}, relative_indent={:?}, ws_before={:?}, ws_after={:?})",
1115            self.r#type, self.string, self.start_pos, self.end_pos, self.relative_indent, self.whitespace_before, self.whitespace_after
1116        )
1117    }
1118}
1119
1120// Dummy Eq implementation. We never compare Tokens like this
1121impl<'a> PartialEq for Token<'a> {
1122    fn eq(&self, _other: &Self) -> bool {
1123        true
1124    }
1125}
1126
1127impl<'a> Eq for Token<'a> {}
1128
1129pub struct TokenIterator<'a> {
1130    previous_whitespace: Option<Rc<RefCell<WhitespaceState<'a>>>>,
1131    core_state: TokState<'a>,
1132    absolute_indents: Vec<&'a str>,
1133}
1134
1135impl<'a> TokenIterator<'a> {
1136    pub fn new(module_text: &'a str, config: &TokConfig) -> Self {
1137        Self {
1138            previous_whitespace: None,
1139            absolute_indents: vec![],
1140            core_state: TokState::new(module_text, config),
1141        }
1142    }
1143}
1144
1145impl<'a> Iterator for TokenIterator<'a> {
1146    type Item = Result<Token<'a>, TokError<'a>>;
1147
1148    fn next(&mut self) -> Option<Self::Item> {
1149        let next = self.core_state.next();
1150        next.as_ref()?;
1151        Some((|| {
1152            let tok_type = next.unwrap()?;
1153            let relative_indent = match tok_type {
1154                TokType::Indent => {
1155                    let end_idx = self.core_state.text_pos.byte_idx();
1156                    let start_idx = end_idx - self.core_state.bol_width;
1157                    let absolute_indent = &self.core_state.text_pos.text()[start_idx..end_idx];
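                    // The relative_indent is whatever the new absolute indent adds on top of the
                    // enclosing block's indent. For example, stepping from an absolute indent of
                    // 4 spaces to 8 spaces yields a relative_indent of the trailing 4 spaces; if
                    // the new indent doesn't start with the old one (e.g. tabs swapped for
                    // spaces), strip_prefix fails and we report an error below.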
1158                    let relative_indent =
1159                        if let Some(prev_absolute_indent) = self.absolute_indents.last() {
1160                            if let Some(ri) = absolute_indent.strip_prefix(prev_absolute_indent) {
1161                                ri
1162                            } else {
1163                                // TODO: return the correct exception type, improve error message
1164                                return Err(TokError::Dedent);
1165                            }
1166                        } else {
1167                            // there's no previous indent, absolute_indent is relative_indent
1168                            absolute_indent
1169                        };
1170                    self.absolute_indents.push(absolute_indent);
1171                    // HACKY: mutate and fixup the previous whitespace state
1172                    if let Some(ws) = self.previous_whitespace.as_mut() {
1173                        ws.borrow_mut().absolute_indent = absolute_indent;
1174                    }
1175                    Some(relative_indent)
1176                }
1177                TokType::Dedent => {
1178                    self.absolute_indents.pop();
1179                    // HACKY: mutate and fixup the previous whitespace state
1180                    if let Some(ws) = self.previous_whitespace.as_mut() {
1181                        ws.borrow_mut().absolute_indent =
1182                            self.absolute_indents.last().unwrap_or(&"");
1183                    }
1184                    None
1185                }
1186                _ => None,
1187            };
1188            let text_pos = &self.core_state.text_pos;
1189            let whitespace_before = self.previous_whitespace.clone().unwrap_or_default();
1190            let whitespace_after = match tok_type {
1191                TokType::Indent | TokType::Dedent | TokType::EndMarker => whitespace_before.clone(),
1192                _ => Rc::new(RefCell::new(WhitespaceState {
1193                    line: text_pos.line_number(),
1194                    column: text_pos.char_column_number(),
1195                    column_byte: text_pos.byte_column_number(),
1196                    byte_offset: text_pos.byte_idx(),
1197                    absolute_indent: self.absolute_indents.last().unwrap_or(&""),
1198                    is_parenthesized: self.core_state.is_parenthesized(),
1199                })),
1200            };
1201            self.previous_whitespace = Some(whitespace_after.clone());
1202
1203            Ok(Token {
1204                r#type: tok_type,
1205                string: text_pos.slice_from_start_pos(&self.core_state.start_pos),
1206                start_pos: self.core_state.start_pos.clone(),
1207                end_pos: text_pos.into(),
1208                whitespace_after: whitespace_after.clone(),
1209                whitespace_before: whitespace_before.clone(),
1210                relative_indent,
1211            })
1212        })())
1213    }
1214}